Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  35
-rw-r--r--  mm/Makefile          |   1
-rw-r--r--  mm/backing-dev.c     |  27
-rw-r--r--  mm/bootmem.c         |  32
-rw-r--r--  mm/filemap.c         |  66
-rw-r--r--  mm/highmem.c         |  17
-rw-r--r--  mm/hugetlb.c         | 551
-rw-r--r--  mm/hwpoison-inject.c | 113
-rw-r--r--  mm/internal.h        |  35
-rw-r--r--  mm/kmemleak.c        | 197
-rw-r--r--  mm/ksm.c             | 962
-rw-r--r--  mm/madvise.c         |  21
-rw-r--r--  mm/memcontrol.c      | 448
-rw-r--r--  mm/memory-failure.c  | 598
-rw-r--r--  mm/memory.c          |  49
-rw-r--r--  mm/memory_hotplug.c  |  40
-rw-r--r--  mm/mempolicy.c       |  82
-rw-r--r--  mm/migrate.c         | 135
-rw-r--r--  mm/mincore.c         |  37
-rw-r--r--  mm/mlock.c           |  45
-rw-r--r--  mm/mmap.c            | 136
-rw-r--r--  mm/mremap.c          | 241
-rw-r--r--  mm/nommu.c           |  39
-rw-r--r--  mm/oom_kill.c        | 103
-rw-r--r--  mm/page-writeback.c  |  15
-rw-r--r--  mm/page_alloc.c      | 126
-rw-r--r--  mm/page_io.c         |  17
-rw-r--r--  mm/pagewalk.c        |  32
-rw-r--r--  mm/percpu.c          | 162
-rw-r--r--  mm/readahead.c       |  12
-rw-r--r--  mm/rmap.c            | 354
-rw-r--r--  mm/shmem.c           |  84
-rw-r--r--  mm/shmem_acl.c       | 171
-rw-r--r--  mm/slab.c            | 142
-rw-r--r--  mm/slub.c            |  24
-rw-r--r--  mm/swapfile.c        | 862
-rw-r--r--  mm/truncate.c        |   8
-rw-r--r--  mm/vmalloc.c         |  55
-rw-r--r--  mm/vmscan.c          | 335
-rw-r--r--  mm/vmstat.c          |   3
40 files changed, 4220 insertions(+), 2192 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index edd300aca173..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -67,7 +67,7 @@ config DISCONTIGMEM
67 67
68config SPARSEMEM 68config SPARSEMEM
69 def_bool y 69 def_bool y
70 depends on SPARSEMEM_MANUAL 70 depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
71 71
72config FLATMEM 72config FLATMEM
73 def_bool y 73 def_bool y
@@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP
128config MEMORY_HOTPLUG 128config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH || S390) 132 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
133
134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
136 133
137config MEMORY_HOTPLUG_SPARSE 134config MEMORY_HOTPLUG_SPARSE
138 def_bool y 135 def_bool y
@@ -161,11 +158,13 @@ config PAGEFLAGS_EXTENDED
161# Default to 4 for wider testing, though 8 might be more appropriate. 158# Default to 4 for wider testing, though 8 might be more appropriate.
162# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. 159# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
163# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. 160# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
161# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
164# 162#
165config SPLIT_PTLOCK_CPUS 163config SPLIT_PTLOCK_CPUS
166 int 164 int
167 default "4096" if ARM && !CPU_CACHE_VIPT 165 default "999999" if ARM && !CPU_CACHE_VIPT
168 default "4096" if PARISC && !PA20 166 default "999999" if PARISC && !PA20
167 default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
169 default "4" 168 default "4"
170 169
171# 170#
@@ -203,14 +202,6 @@ config VIRT_TO_BUS
203 def_bool y 202 def_bool y
204 depends on !ARCH_NO_VIRT_TO_BUS 203 depends on !ARCH_NO_VIRT_TO_BUS
205 204
206config HAVE_MLOCK
207 bool
208 default y if MMU=y
209
210config HAVE_MLOCKED_PAGE_BIT
211 bool
212 default y if HAVE_MLOCK=y
213
214config MMU_NOTIFIER 205config MMU_NOTIFIER
215 bool 206 bool
216 207
@@ -221,13 +212,16 @@ config KSM
221 Enable Kernel Samepage Merging: KSM periodically scans those areas 212 Enable Kernel Samepage Merging: KSM periodically scans those areas
222 of an application's address space that an app has advised may be 213 of an application's address space that an app has advised may be
223 mergeable. When it finds pages of identical content, it replaces 214 mergeable. When it finds pages of identical content, it replaces
224 the many instances by a single resident page with that content, so 215 the many instances by a single page with that content, so
225 saving memory until one or another app needs to modify the content. 216 saving memory until one or another app needs to modify the content.
226 Recommended for use with KVM, or with other duplicative applications. 217 Recommended for use with KVM, or with other duplicative applications.
227 See Documentation/vm/ksm.txt for more information. 218 See Documentation/vm/ksm.txt for more information: KSM is inactive
219 until a program has madvised that an area is MADV_MERGEABLE, and
220 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
228 221
229config DEFAULT_MMAP_MIN_ADDR 222config DEFAULT_MMAP_MIN_ADDR
230 int "Low address space to protect from user allocation" 223 int "Low address space to protect from user allocation"
224 depends on MMU
231 default 4096 225 default 4096
232 help 226 help
233 This is the portion of low virtual memory which should be protected 227 This is the portion of low virtual memory which should be protected
@@ -258,8 +252,9 @@ config MEMORY_FAILURE
258 special hardware support and typically ECC memory. 252 special hardware support and typically ECC memory.
259 253
260config HWPOISON_INJECT 254config HWPOISON_INJECT
261 tristate "Poison pages injector" 255 tristate "HWPoison pages injector"
262 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
257 select PROC_PAGE_MONITOR
263 258
264config NOMMU_INITIAL_TRIM_EXCESS 259config NOMMU_INITIAL_TRIM_EXCESS
265 int "Turn on mmap() excess space trimming before booting" 260 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Makefile b/mm/Makefile
index 82131d0f8d85..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
22obj-$(CONFIG_NUMA) += mempolicy.o 22obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o 27obj-$(CONFIG_KSM) += ksm.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3d3accb1f800..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
92 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
93 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
94 "BackgroundThresh: %8lu kB\n" 94 "BackgroundThresh: %8lu kB\n"
95 "WriteBack threads:%8lu\n" 95 "WritebackThreads: %8lu\n"
96 "b_dirty: %8lu\n" 96 "b_dirty: %8lu\n"
97 "b_io: %8lu\n" 97 "b_io: %8lu\n"
98 "b_more_io: %8lu\n" 98 "b_more_io: %8lu\n"
@@ -604,15 +604,36 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
604 604
605 /* 605 /*
606 * Finally, kill the kernel threads. We don't need to be RCU 606 * Finally, kill the kernel threads. We don't need to be RCU
607 * safe anymore, since the bdi is gone from visibility. 607 * safe anymore, since the bdi is gone from visibility. Force
608 * unfreeze of the thread before calling kthread_stop(), otherwise
609 * it would never exet if it is currently stuck in the refrigerator.
608 */ 610 */
609 list_for_each_entry(wb, &bdi->wb_list, list) 611 list_for_each_entry(wb, &bdi->wb_list, list) {
612 thaw_process(wb->task);
610 kthread_stop(wb->task); 613 kthread_stop(wb->task);
614 }
615}
616
617/*
618 * This bdi is going away now, make sure that no super_blocks point to it
619 */
620static void bdi_prune_sb(struct backing_dev_info *bdi)
621{
622 struct super_block *sb;
623
624 spin_lock(&sb_lock);
625 list_for_each_entry(sb, &super_blocks, s_list) {
626 if (sb->s_bdi == bdi)
627 sb->s_bdi = NULL;
628 }
629 spin_unlock(&sb_lock);
611} 630}
612 631
613void bdi_unregister(struct backing_dev_info *bdi) 632void bdi_unregister(struct backing_dev_info *bdi)
614{ 633{
615 if (bdi->dev) { 634 if (bdi->dev) {
635 bdi_prune_sb(bdi);
636
616 if (!bdi_cap_flush_forker(bdi)) 637 if (!bdi_cap_flush_forker(bdi))
617 bdi_wb_shutdown(bdi); 638 bdi_wb_shutdown(bdi);
618 bdi_debug_unregister(bdi); 639 bdi_debug_unregister(bdi);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..7d1486875e1c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
144} 144}
145 145
146/*
147 * free_bootmem_late - free bootmem pages directly to page allocator
148 * @addr: starting address of the range
149 * @size: size of the range in bytes
150 *
151 * This is only useful when the bootmem allocator has already been torn
152 * down, but we are still initializing the system. Pages are given directly
153 * to the page allocator, no bootmem metadata is updated because it is gone.
154 */
155void __init free_bootmem_late(unsigned long addr, unsigned long size)
156{
157 unsigned long cursor, end;
158
159 kmemleak_free_part(__va(addr), size);
160
161 cursor = PFN_UP(addr);
162 end = PFN_DOWN(addr + size);
163
164 for (; cursor < end; cursor++) {
165 __free_pages_bootmem(pfn_to_page(cursor), 0);
166 totalram_pages++;
167 }
168}
169
146static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 170static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
147{ 171{
148 int aligned; 172 int aligned;
@@ -408,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
408 return mark_bootmem(start, end, 1, flags); 432 return mark_bootmem(start, end, 1, flags);
409} 433}
410 434
411static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, 435static unsigned long __init align_idx(struct bootmem_data *bdata,
412 unsigned long step) 436 unsigned long idx, unsigned long step)
413{ 437{
414 unsigned long base = bdata->node_min_pfn; 438 unsigned long base = bdata->node_min_pfn;
415 439
@@ -421,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
421 return ALIGN(base + idx, step) - base; 445 return ALIGN(base + idx, step) - base;
422} 446}
423 447
424static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, 448static unsigned long __init align_off(struct bootmem_data *bdata,
425 unsigned long align) 449 unsigned long off, unsigned long align)
426{ 450{
427 unsigned long base = PFN_PHYS(bdata->node_min_pfn); 451 unsigned long base = PFN_PHYS(bdata->node_min_pfn);
428 452
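The new free_bootmem_late() above has no caller in this hunk; the following is a hedged, hypothetical sketch of the kind of late arch init code it is meant for, assuming the prototype is exported via <linux/bootmem.h>. The function arch_release_fw_scratch() and the two variables are invented for illustration.

#include <linux/bootmem.h>
#include <linux/init.h>

/* Assumed to have been reserved by earlier boot code. */
static unsigned long __initdata fw_scratch_phys;
static unsigned long __initdata fw_scratch_size;

/*
 * Called after free_all_bootmem(): the bootmem metadata is already gone,
 * so the pages go straight to the buddy allocator via free_bootmem_late().
 */
void __init arch_release_fw_scratch(void)
{
	if (fw_scratch_size)
		free_bootmem_late(fw_scratch_phys, fw_scratch_size);
}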
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..96ac6b0eb6cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping)
260EXPORT_SYMBOL(filemap_flush); 260EXPORT_SYMBOL(filemap_flush);
261 261
262/** 262/**
263 * wait_on_page_writeback_range - wait for writeback to complete 263 * filemap_fdatawait_range - wait for writeback to complete
264 * @mapping: target address_space 264 * @mapping: address space structure to wait for
265 * @start: beginning page index 265 * @start_byte: offset in bytes where the range starts
266 * @end: ending page index 266 * @end_byte: offset in bytes where the range ends (inclusive)
267 * 267 *
268 * Wait for writeback to complete against pages indexed by start->end 268 * Walk the list of under-writeback pages of the given address space
269 * inclusive 269 * in the given range and wait for all of them.
270 */ 270 */
271int wait_on_page_writeback_range(struct address_space *mapping, 271int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
272 pgoff_t start, pgoff_t end) 272 loff_t end_byte)
273{ 273{
274 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
275 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
274 struct pagevec pvec; 276 struct pagevec pvec;
275 int nr_pages; 277 int nr_pages;
276 int ret = 0; 278 int ret = 0;
277 pgoff_t index;
278 279
279 if (end < start) 280 if (end_byte < start_byte)
280 return 0; 281 return 0;
281 282
282 pagevec_init(&pvec, 0); 283 pagevec_init(&pvec, 0);
283 index = start;
284 while ((index <= end) && 284 while ((index <= end) &&
285 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 285 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
286 PAGECACHE_TAG_WRITEBACK, 286 PAGECACHE_TAG_WRITEBACK,
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping,
310 310
311 return ret; 311 return ret;
312} 312}
313
314/**
315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
316 * @mapping: address space structure to wait for
317 * @start: offset in bytes where the range starts
318 * @end: offset in bytes where the range ends (inclusive)
319 *
320 * Walk the list of under-writeback pages of the given address space
321 * in the given range and wait for all of them.
322 *
323 * This is just a simple wrapper so that callers don't have to convert offsets
324 * to page indexes themselves
325 */
326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
327 loff_t end)
328{
329 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
330 end >> PAGE_CACHE_SHIFT);
331}
332EXPORT_SYMBOL(filemap_fdatawait_range); 313EXPORT_SYMBOL(filemap_fdatawait_range);
333 314
334/** 315/**
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping)
345 if (i_size == 0) 326 if (i_size == 0)
346 return 0; 327 return 0;
347 328
348 return wait_on_page_writeback_range(mapping, 0, 329 return filemap_fdatawait_range(mapping, 0, i_size - 1);
349 (i_size - 1) >> PAGE_CACHE_SHIFT);
350} 330}
351EXPORT_SYMBOL(filemap_fdatawait); 331EXPORT_SYMBOL(filemap_fdatawait);
352 332
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
393 WB_SYNC_ALL); 373 WB_SYNC_ALL);
394 /* See comment of filemap_write_and_wait() */ 374 /* See comment of filemap_write_and_wait() */
395 if (err != -EIO) { 375 if (err != -EIO) {
396 int err2 = wait_on_page_writeback_range(mapping, 376 int err2 = filemap_fdatawait_range(mapping,
397 lstart >> PAGE_CACHE_SHIFT, 377 lstart, lend);
398 lend >> PAGE_CACHE_SHIFT);
399 if (!err) 378 if (!err)
400 err = err2; 379 err = err2;
401 } 380 }
@@ -1844,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1844 1823
1845/* 1824/*
1846 * Copy as much as we can into the page and return the number of bytes which 1825 * Copy as much as we can into the page and return the number of bytes which
1847 * were sucessfully copied. If a fault is encountered then return the number of 1826 * were successfully copied. If a fault is encountered then return the number of
1848 * bytes which were copied. 1827 * bytes which were copied.
1849 */ 1828 */
1850size_t iov_iter_copy_from_user_atomic(struct page *page, 1829size_t iov_iter_copy_from_user_atomic(struct page *page,
@@ -2261,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2261 size_t count, ssize_t written) 2240 size_t count, ssize_t written)
2262{ 2241{
2263 struct file *file = iocb->ki_filp; 2242 struct file *file = iocb->ki_filp;
2264 struct address_space *mapping = file->f_mapping;
2265 ssize_t status; 2243 ssize_t status;
2266 struct iov_iter i; 2244 struct iov_iter i;
2267 2245
@@ -2273,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2273 *ppos = pos + status; 2251 *ppos = pos + status;
2274 } 2252 }
2275 2253
2276 /*
2277 * If we get here for O_DIRECT writes then we must have fallen through
2278 * to buffered writes (block instantiation inside i_size). So we sync
2279 * the file data here, to try to honour O_DIRECT expectations.
2280 */
2281 if (unlikely(file->f_flags & O_DIRECT) && written)
2282 status = filemap_write_and_wait_range(mapping,
2283 pos, pos + written - 1);
2284
2285 return written ? written : status; 2254 return written ? written : status;
2286} 2255}
2287EXPORT_SYMBOL(generic_file_buffered_write); 2256EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2380,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2380 * semantics. 2349 * semantics.
2381 */ 2350 */
2382 endbyte = pos + written_buffered - written - 1; 2351 endbyte = pos + written_buffered - written - 1;
2383 err = do_sync_mapping_range(file->f_mapping, pos, endbyte, 2352 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2384 SYNC_FILE_RANGE_WAIT_BEFORE|
2385 SYNC_FILE_RANGE_WRITE|
2386 SYNC_FILE_RANGE_WAIT_AFTER);
2387 if (err == 0) { 2353 if (err == 0) {
2388 written = written_buffered; 2354 written = written_buffered;
2389 invalidate_mapping_pages(mapping, 2355 invalidate_mapping_pages(mapping,
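With filemap_fdatawait_range() now taking byte offsets, callers no longer convert to page indexes themselves. A short sketch (not from the patch) of how filesystem code might flush and wait on a byte range using the renamed helper together with the existing filemap_fdatawrite_range():

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Write back [pos, pos + count - 1] and wait for the writeback to finish. */
static int example_sync_byte_range(struct file *file, loff_t pos, size_t count)
{
	struct address_space *mapping = file->f_mapping;
	loff_t end = pos + count - 1;
	int err;

	err = filemap_fdatawrite_range(mapping, pos, end);
	if (err == 0)
		err = filemap_fdatawait_range(mapping, pos, end);
	return err;
}

This mirrors what filemap_write_and_wait_range() does internally, minus the -EIO special case visible in the hunk above.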
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
426 426
427void debug_kmap_atomic(enum km_type type) 427void debug_kmap_atomic(enum km_type type)
428{ 428{
429 static unsigned warn_count = 10; 429 static int warn_count = 10;
430 430
431 if (unlikely(warn_count == 0)) 431 if (unlikely(warn_count < 0))
432 return; 432 return;
433 433
434 if (unlikely(in_interrupt())) { 434 if (unlikely(in_interrupt())) {
435 if (in_irq()) { 435 if (in_nmi()) {
436 if (type != KM_NMI && type != KM_NMI_PTE) {
437 WARN_ON(1);
438 warn_count--;
439 }
440 } else if (in_irq()) {
436 if (type != KM_IRQ0 && type != KM_IRQ1 && 441 if (type != KM_IRQ0 && type != KM_IRQ1 &&
437 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && 442 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
438 type != KM_BOUNCE_READ) { 443 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
439 WARN_ON(1); 444 WARN_ON(1);
440 warn_count--; 445 warn_count--;
441 } 446 }
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
452 } 457 }
453 458
454 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || 459 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
455 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { 460 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
461 type == KM_IRQ_PTE || type == KM_NMI ||
462 type == KM_NMI_PTE ) {
456 if (!irqs_disabled()) { 463 if (!irqs_disabled()) {
457 WARN_ON(1); 464 WARN_ON(1);
458 warn_count--; 465 warn_count--;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@
24#include <asm/io.h> 24#include <asm/io.h>
25 25
26#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
27#include <linux/node.h>
27#include "internal.h" 28#include "internal.h"
28 29
29const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 30const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
622} 623}
623 624
624/* 625/*
625 * Use a helper variable to find the next node and then 626 * common helper functions for hstate_next_node_to_{alloc|free}.
626 * copy it back to next_nid_to_alloc afterwards: 627 * We may have allocated or freed a huge page based on a different
627 * otherwise there's a window in which a racer might 628 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 629 * be outside of *nodes_allowed. Ensure that we use an allowed
629 * But we don't need to use a spin_lock here: it really 630 * node for alloc or free.
630 * doesn't matter if occasionally a racer chooses the
631 * same nid as we do. Move nid forward in the mask even
632 * if we just successfully allocated a hugepage so that
633 * the next caller gets hugepages on the next node.
634 */ 631 */
635static int hstate_next_node_to_alloc(struct hstate *h) 632static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
636{ 633{
637 int next_nid; 634 nid = next_node(nid, *nodes_allowed);
638 next_nid = next_node(h->next_nid_to_alloc, node_online_map); 635 if (nid == MAX_NUMNODES)
639 if (next_nid == MAX_NUMNODES) 636 nid = first_node(*nodes_allowed);
640 next_nid = first_node(node_online_map); 637 VM_BUG_ON(nid >= MAX_NUMNODES);
641 h->next_nid_to_alloc = next_nid; 638
642 return next_nid; 639 return nid;
640}
641
642static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
643{
644 if (!node_isset(nid, *nodes_allowed))
645 nid = next_node_allowed(nid, nodes_allowed);
646 return nid;
647}
648
649/*
650 * returns the previously saved node ["this node"] from which to
651 * allocate a persistent huge page for the pool and advance the
652 * next node from which to allocate, handling wrap at end of node
653 * mask.
654 */
655static int hstate_next_node_to_alloc(struct hstate *h,
656 nodemask_t *nodes_allowed)
657{
658 int nid;
659
660 VM_BUG_ON(!nodes_allowed);
661
662 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
663 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
664
665 return nid;
643} 666}
644 667
645static int alloc_fresh_huge_page(struct hstate *h) 668static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
646{ 669{
647 struct page *page; 670 struct page *page;
648 int start_nid; 671 int start_nid;
649 int next_nid; 672 int next_nid;
650 int ret = 0; 673 int ret = 0;
651 674
652 start_nid = h->next_nid_to_alloc; 675 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
653 next_nid = start_nid; 676 next_nid = start_nid;
654 677
655 do { 678 do {
656 page = alloc_fresh_huge_page_node(h, next_nid); 679 page = alloc_fresh_huge_page_node(h, next_nid);
657 if (page) 680 if (page) {
658 ret = 1; 681 ret = 1;
659 next_nid = hstate_next_node_to_alloc(h); 682 break;
660 } while (!page && next_nid != start_nid); 683 }
684 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
685 } while (next_nid != start_nid);
661 686
662 if (ret) 687 if (ret)
663 count_vm_event(HTLB_BUDDY_PGALLOC); 688 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
668} 693}
669 694
670/* 695/*
671 * helper for free_pool_huge_page() - find next node 696 * helper for free_pool_huge_page() - return the previously saved
672 * from which to free a huge page 697 * node ["this node"] from which to free a huge page. Advance the
698 * next node id whether or not we find a free huge page to free so
699 * that the next attempt to free addresses the next node.
673 */ 700 */
674static int hstate_next_node_to_free(struct hstate *h) 701static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
675{ 702{
676 int next_nid; 703 int nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map); 704
678 if (next_nid == MAX_NUMNODES) 705 VM_BUG_ON(!nodes_allowed);
679 next_nid = first_node(node_online_map); 706
680 h->next_nid_to_free = next_nid; 707 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
681 return next_nid; 708 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
709
710 return nid;
682} 711}
683 712
684/* 713/*
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h)
687 * balanced over allowed nodes. 716 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked. 717 * Called with hugetlb_lock locked.
689 */ 718 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus) 719static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
720 bool acct_surplus)
691{ 721{
692 int start_nid; 722 int start_nid;
693 int next_nid; 723 int next_nid;
694 int ret = 0; 724 int ret = 0;
695 725
696 start_nid = h->next_nid_to_free; 726 start_nid = hstate_next_node_to_free(h, nodes_allowed);
697 next_nid = start_nid; 727 next_nid = start_nid;
698 728
699 do { 729 do {
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
715 } 745 }
716 update_and_free_page(h, page); 746 update_and_free_page(h, page);
717 ret = 1; 747 ret = 1;
748 break;
718 } 749 }
719 next_nid = hstate_next_node_to_free(h); 750 next_nid = hstate_next_node_to_free(h, nodes_allowed);
720 } while (!ret && next_nid != start_nid); 751 } while (next_nid != start_nid);
721 752
722 return ret; 753 return ret;
723} 754}
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h,
911 942
912 /* 943 /*
913 * We want to release as many surplus pages as possible, spread 944 * We want to release as many surplus pages as possible, spread
914 * evenly across all nodes. Iterate across all nodes until we 945 * evenly across all nodes with memory. Iterate across these nodes
915 * can no longer free unreserved surplus pages. This occurs when 946 * until we can no longer free unreserved surplus pages. This occurs
916 * the nodes with surplus pages have no free pages. 947 * when the nodes with surplus pages have no free pages.
917 * free_pool_huge_page() will balance the the frees across the 948 * free_pool_huge_page() will balance the the freed pages across the
918 * on-line nodes for us and will handle the hstate accounting. 949 * on-line nodes with memory and will handle the hstate accounting.
919 */ 950 */
920 while (nr_pages--) { 951 while (nr_pages--) {
921 if (!free_pool_huge_page(h, 1)) 952 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
922 break; 953 break;
923 } 954 }
924} 955}
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1022int __weak alloc_bootmem_huge_page(struct hstate *h) 1053int __weak alloc_bootmem_huge_page(struct hstate *h)
1023{ 1054{
1024 struct huge_bootmem_page *m; 1055 struct huge_bootmem_page *m;
1025 int nr_nodes = nodes_weight(node_online_map); 1056 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
1026 1057
1027 while (nr_nodes) { 1058 while (nr_nodes) {
1028 void *addr; 1059 void *addr;
1029 1060
1030 addr = __alloc_bootmem_node_nopanic( 1061 addr = __alloc_bootmem_node_nopanic(
1031 NODE_DATA(h->next_nid_to_alloc), 1062 NODE_DATA(hstate_next_node_to_alloc(h,
1063 &node_states[N_HIGH_MEMORY])),
1032 huge_page_size(h), huge_page_size(h), 0); 1064 huge_page_size(h), huge_page_size(h), 0);
1033 1065
1034 hstate_next_node_to_alloc(h);
1035 if (addr) { 1066 if (addr) {
1036 /* 1067 /*
1037 * Use the beginning of the huge page to store the 1068 * Use the beginning of the huge page to store the
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1084 if (h->order >= MAX_ORDER) { 1115 if (h->order >= MAX_ORDER) {
1085 if (!alloc_bootmem_huge_page(h)) 1116 if (!alloc_bootmem_huge_page(h))
1086 break; 1117 break;
1087 } else if (!alloc_fresh_huge_page(h)) 1118 } else if (!alloc_fresh_huge_page(h,
1119 &node_states[N_HIGH_MEMORY]))
1088 break; 1120 break;
1089 } 1121 }
1090 h->max_huge_pages = i; 1122 h->max_huge_pages = i;
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void)
1126} 1158}
1127 1159
1128#ifdef CONFIG_HIGHMEM 1160#ifdef CONFIG_HIGHMEM
1129static void try_to_free_low(struct hstate *h, unsigned long count) 1161static void try_to_free_low(struct hstate *h, unsigned long count,
1162 nodemask_t *nodes_allowed)
1130{ 1163{
1131 int i; 1164 int i;
1132 1165
1133 if (h->order >= MAX_ORDER) 1166 if (h->order >= MAX_ORDER)
1134 return; 1167 return;
1135 1168
1136 for (i = 0; i < MAX_NUMNODES; ++i) { 1169 for_each_node_mask(i, *nodes_allowed) {
1137 struct page *page, *next; 1170 struct page *page, *next;
1138 struct list_head *freel = &h->hugepage_freelists[i]; 1171 struct list_head *freel = &h->hugepage_freelists[i];
1139 list_for_each_entry_safe(page, next, freel, lru) { 1172 list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
1149 } 1182 }
1150} 1183}
1151#else 1184#else
1152static inline void try_to_free_low(struct hstate *h, unsigned long count) 1185static inline void try_to_free_low(struct hstate *h, unsigned long count,
1186 nodemask_t *nodes_allowed)
1153{ 1187{
1154} 1188}
1155#endif 1189#endif
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1159 * balanced by operating on them in a round-robin fashion. 1193 * balanced by operating on them in a round-robin fashion.
1160 * Returns 1 if an adjustment was made. 1194 * Returns 1 if an adjustment was made.
1161 */ 1195 */
1162static int adjust_pool_surplus(struct hstate *h, int delta) 1196static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1197 int delta)
1163{ 1198{
1164 int start_nid, next_nid; 1199 int start_nid, next_nid;
1165 int ret = 0; 1200 int ret = 0;
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1167 VM_BUG_ON(delta != -1 && delta != 1); 1202 VM_BUG_ON(delta != -1 && delta != 1);
1168 1203
1169 if (delta < 0) 1204 if (delta < 0)
1170 start_nid = h->next_nid_to_alloc; 1205 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
1171 else 1206 else
1172 start_nid = h->next_nid_to_free; 1207 start_nid = hstate_next_node_to_free(h, nodes_allowed);
1173 next_nid = start_nid; 1208 next_nid = start_nid;
1174 1209
1175 do { 1210 do {
1176 int nid = next_nid; 1211 int nid = next_nid;
1177 if (delta < 0) { 1212 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /* 1213 /*
1180 * To shrink on this node, there must be a surplus page 1214 * To shrink on this node, there must be a surplus page
1181 */ 1215 */
1182 if (!h->surplus_huge_pages_node[nid]) 1216 if (!h->surplus_huge_pages_node[nid]) {
1217 next_nid = hstate_next_node_to_alloc(h,
1218 nodes_allowed);
1183 continue; 1219 continue;
1220 }
1184 } 1221 }
1185 if (delta > 0) { 1222 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /* 1223 /*
1188 * Surplus cannot exceed the total number of pages 1224 * Surplus cannot exceed the total number of pages
1189 */ 1225 */
1190 if (h->surplus_huge_pages_node[nid] >= 1226 if (h->surplus_huge_pages_node[nid] >=
1191 h->nr_huge_pages_node[nid]) 1227 h->nr_huge_pages_node[nid]) {
1228 next_nid = hstate_next_node_to_free(h,
1229 nodes_allowed);
1192 continue; 1230 continue;
1231 }
1193 } 1232 }
1194 1233
1195 h->surplus_huge_pages += delta; 1234 h->surplus_huge_pages += delta;
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1202} 1241}
1203 1242
1204#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1243#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1205static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1244static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1245 nodemask_t *nodes_allowed)
1206{ 1246{
1207 unsigned long min_count, ret; 1247 unsigned long min_count, ret;
1208 1248
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1222 */ 1262 */
1223 spin_lock(&hugetlb_lock); 1263 spin_lock(&hugetlb_lock);
1224 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1264 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1225 if (!adjust_pool_surplus(h, -1)) 1265 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1226 break; 1266 break;
1227 } 1267 }
1228 1268
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1233 * and reducing the surplus. 1273 * and reducing the surplus.
1234 */ 1274 */
1235 spin_unlock(&hugetlb_lock); 1275 spin_unlock(&hugetlb_lock);
1236 ret = alloc_fresh_huge_page(h); 1276 ret = alloc_fresh_huge_page(h, nodes_allowed);
1237 spin_lock(&hugetlb_lock); 1277 spin_lock(&hugetlb_lock);
1238 if (!ret) 1278 if (!ret)
1239 goto out; 1279 goto out;
1240 1280
1281 /* Bail for signals. Probably ctrl-c from user */
1282 if (signal_pending(current))
1283 goto out;
1241 } 1284 }
1242 1285
1243 /* 1286 /*
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1257 */ 1300 */
1258 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1301 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1259 min_count = max(count, min_count); 1302 min_count = max(count, min_count);
1260 try_to_free_low(h, min_count); 1303 try_to_free_low(h, min_count, nodes_allowed);
1261 while (min_count < persistent_huge_pages(h)) { 1304 while (min_count < persistent_huge_pages(h)) {
1262 if (!free_pool_huge_page(h, 0)) 1305 if (!free_pool_huge_page(h, nodes_allowed, 0))
1263 break; 1306 break;
1264 } 1307 }
1265 while (count < persistent_huge_pages(h)) { 1308 while (count < persistent_huge_pages(h)) {
1266 if (!adjust_pool_surplus(h, 1)) 1309 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1267 break; 1310 break;
1268 } 1311 }
1269out: 1312out:
@@ -1282,43 +1325,117 @@ out:
1282static struct kobject *hugepages_kobj; 1325static struct kobject *hugepages_kobj;
1283static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1326static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1284 1327
1285static struct hstate *kobj_to_hstate(struct kobject *kobj) 1328static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1329
1330static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1286{ 1331{
1287 int i; 1332 int i;
1333
1288 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1334 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1289 if (hstate_kobjs[i] == kobj) 1335 if (hstate_kobjs[i] == kobj) {
1336 if (nidp)
1337 *nidp = NUMA_NO_NODE;
1290 return &hstates[i]; 1338 return &hstates[i];
1291 BUG(); 1339 }
1292 return NULL; 1340
1341 return kobj_to_node_hstate(kobj, nidp);
1293} 1342}
1294 1343
1295static ssize_t nr_hugepages_show(struct kobject *kobj, 1344static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1296 struct kobj_attribute *attr, char *buf) 1345 struct kobj_attribute *attr, char *buf)
1297{ 1346{
1298 struct hstate *h = kobj_to_hstate(kobj); 1347 struct hstate *h;
1299 return sprintf(buf, "%lu\n", h->nr_huge_pages); 1348 unsigned long nr_huge_pages;
1349 int nid;
1350
1351 h = kobj_to_hstate(kobj, &nid);
1352 if (nid == NUMA_NO_NODE)
1353 nr_huge_pages = h->nr_huge_pages;
1354 else
1355 nr_huge_pages = h->nr_huge_pages_node[nid];
1356
1357 return sprintf(buf, "%lu\n", nr_huge_pages);
1300} 1358}
1301static ssize_t nr_hugepages_store(struct kobject *kobj, 1359static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1302 struct kobj_attribute *attr, const char *buf, size_t count) 1360 struct kobject *kobj, struct kobj_attribute *attr,
1361 const char *buf, size_t len)
1303{ 1362{
1304 int err; 1363 int err;
1305 unsigned long input; 1364 int nid;
1306 struct hstate *h = kobj_to_hstate(kobj); 1365 unsigned long count;
1366 struct hstate *h;
1367 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1307 1368
1308 err = strict_strtoul(buf, 10, &input); 1369 err = strict_strtoul(buf, 10, &count);
1309 if (err) 1370 if (err)
1310 return 0; 1371 return 0;
1311 1372
1312 h->max_huge_pages = set_max_huge_pages(h, input); 1373 h = kobj_to_hstate(kobj, &nid);
1374 if (nid == NUMA_NO_NODE) {
1375 /*
1376 * global hstate attribute
1377 */
1378 if (!(obey_mempolicy &&
1379 init_nodemask_of_mempolicy(nodes_allowed))) {
1380 NODEMASK_FREE(nodes_allowed);
1381 nodes_allowed = &node_states[N_HIGH_MEMORY];
1382 }
1383 } else if (nodes_allowed) {
1384 /*
1385 * per node hstate attribute: adjust count to global,
1386 * but restrict alloc/free to the specified node.
1387 */
1388 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1389 init_nodemask_of_node(nodes_allowed, nid);
1390 } else
1391 nodes_allowed = &node_states[N_HIGH_MEMORY];
1392
1393 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1313 1394
1314 return count; 1395 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1396 NODEMASK_FREE(nodes_allowed);
1397
1398 return len;
1399}
1400
1401static ssize_t nr_hugepages_show(struct kobject *kobj,
1402 struct kobj_attribute *attr, char *buf)
1403{
1404 return nr_hugepages_show_common(kobj, attr, buf);
1405}
1406
1407static ssize_t nr_hugepages_store(struct kobject *kobj,
1408 struct kobj_attribute *attr, const char *buf, size_t len)
1409{
1410 return nr_hugepages_store_common(false, kobj, attr, buf, len);
1315} 1411}
1316HSTATE_ATTR(nr_hugepages); 1412HSTATE_ATTR(nr_hugepages);
1317 1413
1414#ifdef CONFIG_NUMA
1415
1416/*
1417 * hstate attribute for optionally mempolicy-based constraint on persistent
1418 * huge page alloc/free.
1419 */
1420static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1421 struct kobj_attribute *attr, char *buf)
1422{
1423 return nr_hugepages_show_common(kobj, attr, buf);
1424}
1425
1426static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1427 struct kobj_attribute *attr, const char *buf, size_t len)
1428{
1429 return nr_hugepages_store_common(true, kobj, attr, buf, len);
1430}
1431HSTATE_ATTR(nr_hugepages_mempolicy);
1432#endif
1433
1434
1318static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1435static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1319 struct kobj_attribute *attr, char *buf) 1436 struct kobj_attribute *attr, char *buf)
1320{ 1437{
1321 struct hstate *h = kobj_to_hstate(kobj); 1438 struct hstate *h = kobj_to_hstate(kobj, NULL);
1322 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1439 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1323} 1440}
1324static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1441static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1326{ 1443{
1327 int err; 1444 int err;
1328 unsigned long input; 1445 unsigned long input;
1329 struct hstate *h = kobj_to_hstate(kobj); 1446 struct hstate *h = kobj_to_hstate(kobj, NULL);
1330 1447
1331 err = strict_strtoul(buf, 10, &input); 1448 err = strict_strtoul(buf, 10, &input);
1332 if (err) 1449 if (err)
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
1343static ssize_t free_hugepages_show(struct kobject *kobj, 1460static ssize_t free_hugepages_show(struct kobject *kobj,
1344 struct kobj_attribute *attr, char *buf) 1461 struct kobj_attribute *attr, char *buf)
1345{ 1462{
1346 struct hstate *h = kobj_to_hstate(kobj); 1463 struct hstate *h;
1347 return sprintf(buf, "%lu\n", h->free_huge_pages); 1464 unsigned long free_huge_pages;
1465 int nid;
1466
1467 h = kobj_to_hstate(kobj, &nid);
1468 if (nid == NUMA_NO_NODE)
1469 free_huge_pages = h->free_huge_pages;
1470 else
1471 free_huge_pages = h->free_huge_pages_node[nid];
1472
1473 return sprintf(buf, "%lu\n", free_huge_pages);
1348} 1474}
1349HSTATE_ATTR_RO(free_hugepages); 1475HSTATE_ATTR_RO(free_hugepages);
1350 1476
1351static ssize_t resv_hugepages_show(struct kobject *kobj, 1477static ssize_t resv_hugepages_show(struct kobject *kobj,
1352 struct kobj_attribute *attr, char *buf) 1478 struct kobj_attribute *attr, char *buf)
1353{ 1479{
1354 struct hstate *h = kobj_to_hstate(kobj); 1480 struct hstate *h = kobj_to_hstate(kobj, NULL);
1355 return sprintf(buf, "%lu\n", h->resv_huge_pages); 1481 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1356} 1482}
1357HSTATE_ATTR_RO(resv_hugepages); 1483HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages);
1359static ssize_t surplus_hugepages_show(struct kobject *kobj, 1485static ssize_t surplus_hugepages_show(struct kobject *kobj,
1360 struct kobj_attribute *attr, char *buf) 1486 struct kobj_attribute *attr, char *buf)
1361{ 1487{
1362 struct hstate *h = kobj_to_hstate(kobj); 1488 struct hstate *h;
1363 return sprintf(buf, "%lu\n", h->surplus_huge_pages); 1489 unsigned long surplus_huge_pages;
1490 int nid;
1491
1492 h = kobj_to_hstate(kobj, &nid);
1493 if (nid == NUMA_NO_NODE)
1494 surplus_huge_pages = h->surplus_huge_pages;
1495 else
1496 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1497
1498 return sprintf(buf, "%lu\n", surplus_huge_pages);
1364} 1499}
1365HSTATE_ATTR_RO(surplus_hugepages); 1500HSTATE_ATTR_RO(surplus_hugepages);
1366 1501
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = {
1370 &free_hugepages_attr.attr, 1505 &free_hugepages_attr.attr,
1371 &resv_hugepages_attr.attr, 1506 &resv_hugepages_attr.attr,
1372 &surplus_hugepages_attr.attr, 1507 &surplus_hugepages_attr.attr,
1508#ifdef CONFIG_NUMA
1509 &nr_hugepages_mempolicy_attr.attr,
1510#endif
1373 NULL, 1511 NULL,
1374}; 1512};
1375 1513
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = {
1377 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1378}; 1516};
1379 1517
1380static int __init hugetlb_sysfs_add_hstate(struct hstate *h) 1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
1519 struct kobject *parent,
1520 struct kobject **hstate_kobjs,
1521 struct attribute_group *hstate_attr_group)
1381{ 1522{
1382 int retval; 1523 int retval;
1524 int hi = h - hstates;
1383 1525
1384 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, 1526 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1385 hugepages_kobj); 1527 if (!hstate_kobjs[hi])
1386 if (!hstate_kobjs[h - hstates])
1387 return -ENOMEM; 1528 return -ENOMEM;
1388 1529
1389 retval = sysfs_create_group(hstate_kobjs[h - hstates], 1530 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1390 &hstate_attr_group);
1391 if (retval) 1531 if (retval)
1392 kobject_put(hstate_kobjs[h - hstates]); 1532 kobject_put(hstate_kobjs[hi]);
1393 1533
1394 return retval; 1534 return retval;
1395} 1535}
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
1404 return; 1544 return;
1405 1545
1406 for_each_hstate(h) { 1546 for_each_hstate(h) {
1407 err = hugetlb_sysfs_add_hstate(h); 1547 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1548 hstate_kobjs, &hstate_attr_group);
1408 if (err) 1549 if (err)
1409 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1550 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1410 h->name); 1551 h->name);
1411 } 1552 }
1412} 1553}
1413 1554
1555#ifdef CONFIG_NUMA
1556
1557/*
1558 * node_hstate/s - associate per node hstate attributes, via their kobjects,
1559 * with node sysdevs in node_devices[] using a parallel array. The array
1560 * index of a node sysdev or _hstate == node id.
1561 * This is here to avoid any static dependency of the node sysdev driver, in
1562 * the base kernel, on the hugetlb module.
1563 */
1564struct node_hstate {
1565 struct kobject *hugepages_kobj;
1566 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1567};
1568struct node_hstate node_hstates[MAX_NUMNODES];
1569
1570/*
1571 * A subset of global hstate attributes for node sysdevs
1572 */
1573static struct attribute *per_node_hstate_attrs[] = {
1574 &nr_hugepages_attr.attr,
1575 &free_hugepages_attr.attr,
1576 &surplus_hugepages_attr.attr,
1577 NULL,
1578};
1579
1580static struct attribute_group per_node_hstate_attr_group = {
1581 .attrs = per_node_hstate_attrs,
1582};
1583
1584/*
1585 * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
1586 * Returns node id via non-NULL nidp.
1587 */
1588static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1589{
1590 int nid;
1591
1592 for (nid = 0; nid < nr_node_ids; nid++) {
1593 struct node_hstate *nhs = &node_hstates[nid];
1594 int i;
1595 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1596 if (nhs->hstate_kobjs[i] == kobj) {
1597 if (nidp)
1598 *nidp = nid;
1599 return &hstates[i];
1600 }
1601 }
1602
1603 BUG();
1604 return NULL;
1605}
1606
1607/*
1608 * Unregister hstate attributes from a single node sysdev.
1609 * No-op if no hstate attributes attached.
1610 */
1611void hugetlb_unregister_node(struct node *node)
1612{
1613 struct hstate *h;
1614 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1615
1616 if (!nhs->hugepages_kobj)
1617 return; /* no hstate attributes */
1618
1619 for_each_hstate(h)
1620 if (nhs->hstate_kobjs[h - hstates]) {
1621 kobject_put(nhs->hstate_kobjs[h - hstates]);
1622 nhs->hstate_kobjs[h - hstates] = NULL;
1623 }
1624
1625 kobject_put(nhs->hugepages_kobj);
1626 nhs->hugepages_kobj = NULL;
1627}
1628
1629/*
1630 * hugetlb module exit: unregister hstate attributes from node sysdevs
1631 * that have them.
1632 */
1633static void hugetlb_unregister_all_nodes(void)
1634{
1635 int nid;
1636
1637 /*
1638 * disable node sysdev registrations.
1639 */
1640 register_hugetlbfs_with_node(NULL, NULL);
1641
1642 /*
1643 * remove hstate attributes from any nodes that have them.
1644 */
1645 for (nid = 0; nid < nr_node_ids; nid++)
1646 hugetlb_unregister_node(&node_devices[nid]);
1647}
1648
1649/*
1650 * Register hstate attributes for a single node sysdev.
1651 * No-op if attributes already registered.
1652 */
1653void hugetlb_register_node(struct node *node)
1654{
1655 struct hstate *h;
1656 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1657 int err;
1658
1659 if (nhs->hugepages_kobj)
1660 return; /* already allocated */
1661
1662 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1663 &node->sysdev.kobj);
1664 if (!nhs->hugepages_kobj)
1665 return;
1666
1667 for_each_hstate(h) {
1668 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1669 nhs->hstate_kobjs,
1670 &per_node_hstate_attr_group);
1671 if (err) {
1672 printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1673 " for node %d\n",
1674 h->name, node->sysdev.id);
1675 hugetlb_unregister_node(node);
1676 break;
1677 }
1678 }
1679}
1680
1681/*
1682 * hugetlb init time: register hstate attributes for all registered node
1683 * sysdevs of nodes that have memory. All on-line nodes should have
1684 * registered their associated sysdev by this time.
1685 */
1686static void hugetlb_register_all_nodes(void)
1687{
1688 int nid;
1689
1690 for_each_node_state(nid, N_HIGH_MEMORY) {
1691 struct node *node = &node_devices[nid];
1692 if (node->sysdev.id == nid)
1693 hugetlb_register_node(node);
1694 }
1695
1696 /*
1697 * Let the node sysdev driver know we're here so it can
1698 * [un]register hstate attributes on node hotplug.
1699 */
1700 register_hugetlbfs_with_node(hugetlb_register_node,
1701 hugetlb_unregister_node);
1702}
1703#else /* !CONFIG_NUMA */
1704
1705static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1706{
1707 BUG();
1708 if (nidp)
1709 *nidp = -1;
1710 return NULL;
1711}
1712
1713static void hugetlb_unregister_all_nodes(void) { }
1714
1715static void hugetlb_register_all_nodes(void) { }
1716
1717#endif
1718
1414static void __exit hugetlb_exit(void) 1719static void __exit hugetlb_exit(void)
1415{ 1720{
1416 struct hstate *h; 1721 struct hstate *h;
1417 1722
1723 hugetlb_unregister_all_nodes();
1724
1418 for_each_hstate(h) { 1725 for_each_hstate(h) {
1419 kobject_put(hstate_kobjs[h - hstates]); 1726 kobject_put(hstate_kobjs[h - hstates]);
1420 } 1727 }
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
1449 1756
1450 hugetlb_sysfs_init(); 1757 hugetlb_sysfs_init();
1451 1758
1759 hugetlb_register_all_nodes();
1760
1452 return 0; 1761 return 0;
1453} 1762}
1454module_init(hugetlb_init); 1763module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
1472 h->free_huge_pages = 0; 1781 h->free_huge_pages = 0;
1473 for (i = 0; i < MAX_NUMNODES; ++i) 1782 for (i = 0; i < MAX_NUMNODES; ++i)
1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1783 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1475 h->next_nid_to_alloc = first_node(node_online_map); 1784 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1476 h->next_nid_to_free = first_node(node_online_map); 1785 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1786 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1478 huge_page_size(h)/1024); 1787 huge_page_size(h)/1024);
1479 1788
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1536} 1845}
1537 1846
1538#ifdef CONFIG_SYSCTL 1847#ifdef CONFIG_SYSCTL
1539int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1848static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1540 void __user *buffer, 1849 struct ctl_table *table, int write,
1541 size_t *length, loff_t *ppos) 1850 void __user *buffer, size_t *length, loff_t *ppos)
1542{ 1851{
1543 struct hstate *h = &default_hstate; 1852 struct hstate *h = &default_hstate;
1544 unsigned long tmp; 1853 unsigned long tmp;
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1550 table->maxlen = sizeof(unsigned long); 1859 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1860 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1861
1553 if (write) 1862 if (write) {
1554 h->max_huge_pages = set_max_huge_pages(h, tmp); 1863 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
1864 GFP_KERNEL | __GFP_NORETRY);
1865 if (!(obey_mempolicy &&
1866 init_nodemask_of_mempolicy(nodes_allowed))) {
1867 NODEMASK_FREE(nodes_allowed);
1868 nodes_allowed = &node_states[N_HIGH_MEMORY];
1869 }
1870 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
1871
1872 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1873 NODEMASK_FREE(nodes_allowed);
1874 }
1555 1875
1556 return 0; 1876 return 0;
1557} 1877}
1558 1878
1879int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1880 void __user *buffer, size_t *length, loff_t *ppos)
1881{
1882
1883 return hugetlb_sysctl_handler_common(false, table, write,
1884 buffer, length, ppos);
1885}
1886
1887#ifdef CONFIG_NUMA
1888int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
1889 void __user *buffer, size_t *length, loff_t *ppos)
1890{
1891 return hugetlb_sysctl_handler_common(true, table, write,
1892 buffer, length, ppos);
1893}
1894#endif /* CONFIG_NUMA */
1895
1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1896int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1560 void __user *buffer, 1897 void __user *buffer,
1561 size_t *length, loff_t *ppos) 1898 size_t *length, loff_t *ppos)
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1903 + (vma->vm_pgoff >> PAGE_SHIFT); 2240 + (vma->vm_pgoff >> PAGE_SHIFT);
1904 mapping = (struct address_space *)page_private(page); 2241 mapping = (struct address_space *)page_private(page);
1905 2242
2243 /*
2244 * Take the mapping lock for the duration of the table walk. As
2245 * this mapping should be shared between all the VMAs,
2246 * __unmap_hugepage_range() is called as the lock is already held
2247 */
2248 spin_lock(&mapping->i_mmap_lock);
1906 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2249 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1907 /* Do not unmap the current VMA */ 2250 /* Do not unmap the current VMA */
1908 if (iter_vma == vma) 2251 if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1916 * from the time of fork. This would look like data corruption 2259 * from the time of fork. This would look like data corruption
1917 */ 2260 */
1918 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2261 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1919 unmap_hugepage_range(iter_vma, 2262 __unmap_hugepage_range(iter_vma,
1920 address, address + huge_page_size(h), 2263 address, address + huge_page_size(h),
1921 page); 2264 page);
1922 } 2265 }
2266 spin_unlock(&mapping->i_mmap_lock);
1923 2267
1924 return 1; 2268 return 1;
1925} 2269}
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
1959 outside_reserve = 1; 2303 outside_reserve = 1;
1960 2304
1961 page_cache_get(old_page); 2305 page_cache_get(old_page);
2306
2307 /* Drop page_table_lock as buddy allocator may be called */
2308 spin_unlock(&mm->page_table_lock);
1962 new_page = alloc_huge_page(vma, address, outside_reserve); 2309 new_page = alloc_huge_page(vma, address, outside_reserve);
1963 2310
1964 if (IS_ERR(new_page)) { 2311 if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
1976 if (unmap_ref_private(mm, vma, old_page, address)) { 2323 if (unmap_ref_private(mm, vma, old_page, address)) {
1977 BUG_ON(page_count(old_page) != 1); 2324 BUG_ON(page_count(old_page) != 1);
1978 BUG_ON(huge_pte_none(pte)); 2325 BUG_ON(huge_pte_none(pte));
2326 spin_lock(&mm->page_table_lock);
1979 goto retry_avoidcopy; 2327 goto retry_avoidcopy;
1980 } 2328 }
1981 WARN_ON_ONCE(1); 2329 WARN_ON_ONCE(1);
1982 } 2330 }
1983 2331
2332 /* Caller expects lock to be held */
2333 spin_lock(&mm->page_table_lock);
1984 return -PTR_ERR(new_page); 2334 return -PTR_ERR(new_page);
1985 } 2335 }
1986 2336
1987 spin_unlock(&mm->page_table_lock);
1988 copy_huge_page(new_page, old_page, address, vma); 2337 copy_huge_page(new_page, old_page, address, vma);
1989 __SetPageUptodate(new_page); 2338 __SetPageUptodate(new_page);
1990 spin_lock(&mm->page_table_lock);
1991 2339
2340 /*
2341 * Retake the page_table_lock to check for racing updates
2342 * before the page tables are altered
2343 */
2344 spin_lock(&mm->page_table_lock);
1992 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2345 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1993 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2346 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1994 /* Break COW */ 2347 /* Break COW */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/pagemap.h>
8#include "internal.h"
6 9
7static struct dentry *hwpoison_dir, *corrupt_pfn; 10static struct dentry *hwpoison_dir;
8 11
9static int hwpoison_inject(void *data, u64 val) 12static int hwpoison_inject(void *data, u64 val)
10{ 13{
14 unsigned long pfn = val;
15 struct page *p;
16 int err;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return -EPERM;
20
21 if (!hwpoison_filter_enable)
22 goto inject;
23 if (!pfn_valid(pfn))
24 return -ENXIO;
25
26 p = pfn_to_page(pfn);
27 /*
28 * This implies unable to support free buddy pages.
29 */
30 if (!get_page_unless_zero(p))
31 return 0;
32
33 if (!PageLRU(p))
34 shake_page(p, 0);
35 /*
36 * This implies unable to support non-LRU pages.
37 */
38 if (!PageLRU(p))
39 return 0;
40
41 /*
42 * do a racy check with elevated page count, to make sure PG_hwpoison
43 * will only be set for the targeted owner (or on a free page).
44 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock.
46 */
47 lock_page(p);
48 err = hwpoison_filter(p);
49 unlock_page(p);
50 if (err)
51 return 0;
52
53inject:
54 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
55 return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
56}
57
58static int hwpoison_unpoison(void *data, u64 val)
59{
11 if (!capable(CAP_SYS_ADMIN)) 60 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM; 61 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 62
14 return __memory_failure(val, 18, 0); 63 return unpoison_memory(val);
15} 64}
16 65
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 66DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
67DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68
19static void pfn_inject_exit(void) 69static void pfn_inject_exit(void)
20{ 70{
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
24 74
25static int pfn_inject_init(void) 75static int pfn_inject_init(void)
26{ 76{
77 struct dentry *dentry;
78
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL) 80 if (hwpoison_dir == NULL)
29 return -ENOMEM; 81 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 82
83 /*
84 * Note that the below poison/unpoison interfaces do not involve
85 * hardware status change, hence do not require hardware support.
86 * They are mainly for testing hwpoison in software level.
87 */
88 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops); 89 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) { 90 if (!dentry)
33 pfn_inject_exit(); 91 goto fail;
34 return -ENOMEM; 92
35 } 93 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
94 NULL, &unpoison_fops);
95 if (!dentry)
96 goto fail;
97
98 dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
99 hwpoison_dir, &hwpoison_filter_enable);
100 if (!dentry)
101 goto fail;
102
103 dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
104 hwpoison_dir, &hwpoison_filter_dev_major);
105 if (!dentry)
106 goto fail;
107
108 dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
109 hwpoison_dir, &hwpoison_filter_dev_minor);
110 if (!dentry)
111 goto fail;
112
113 dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
114 hwpoison_dir, &hwpoison_filter_flags_mask);
115 if (!dentry)
116 goto fail;
117
118 dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
119 hwpoison_dir, &hwpoison_filter_flags_value);
120 if (!dentry)
121 goto fail;
122
123#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
124 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
125 hwpoison_dir, &hwpoison_filter_memcg);
126 if (!dentry)
127 goto fail;
128#endif
129
36 return 0; 130 return 0;
131fail:
132 pfn_inject_exit();
133 return -ENOMEM;
37} 134}
38 135
39module_init(pfn_inject_init); 136module_init(pfn_inject_init);
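As a usage illustration (not part of the patch): once debugfs is mounted at /sys/kernel/debug, the corrupt-pfn and unpoison-pfn files created above can be driven from user space by a task with CAP_SYS_ADMIN. The stand-alone sketch below is mine; the paths come from the debugfs files above, while the program structure and error handling are assumptions.

/* hwpoison-demo.c: poison a pfn via debugfs, then undo it.
 * Sketch only: assumes debugfs is mounted at /sys/kernel/debug and
 * that the caller has CAP_SYS_ADMIN, as hwpoison_inject() requires. */
#include <stdio.h>
#include <stdlib.h>

static int write_pfn(const char *path, unsigned long pfn)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%lu\n", pfn);	/* parsed by the "%lli" simple attribute */
	return fclose(f);
}

int main(int argc, char **argv)
{
	unsigned long pfn;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoul(argv[1], NULL, 0);

	write_pfn("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn);
	write_pfn("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn);
	return 0;
}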
diff --git a/mm/internal.h b/mm/internal.h
index 22ec8d2b0fb8..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
50 */ 50 */
51extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
53#ifdef CONFIG_MEMORY_FAILURE
54extern bool is_free_buddy_page(struct page *page);
55#endif
53 56
54 57
55/* 58/*
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page)
63 return page_private(page); 66 return page_private(page);
64} 67}
65 68
66#ifdef CONFIG_HAVE_MLOCK 69#ifdef CONFIG_MMU
67extern long mlock_vma_pages_range(struct vm_area_struct *vma, 70extern long mlock_vma_pages_range(struct vm_area_struct *vma,
68 unsigned long start, unsigned long end); 71 unsigned long start, unsigned long end);
69extern void munlock_vma_pages_range(struct vm_area_struct *vma, 72extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
72{ 75{
73 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 76 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
74} 77}
75#endif
76 78
77/* 79/*
78 * unevictable_migrate_page() called only from migrate_page_copy() to
79 * migrate unevictable flag to new page.
80 * Note that the old page has been isolated from the LRU lists at this
81 * point so we don't need to worry about LRU statistics.
82 */
83static inline void unevictable_migrate_page(struct page *new, struct page *old)
84{
85 if (TestClearPageUnevictable(old))
86 SetPageUnevictable(new);
87}
88
89#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
90/*
91 * Called only in fault path via page_evictable() for a new page 80 * Called only in fault path via page_evictable() for a new page
92 * to determine if it's being mapped into a LOCKED vma. 81 * to determine if it's being mapped into a LOCKED vma.
93 * If so, mark page as mlocked. 82 * If so, mark page as mlocked.
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
107} 96}
108 97
109/* 98/*
110 * must be called with vma's mmap_sem held for read, and page locked. 99 * must be called with vma's mmap_sem held for read or write, and page locked.
111 */ 100 */
112extern void mlock_vma_page(struct page *page); 101extern void mlock_vma_page(struct page *page);
102extern void munlock_vma_page(struct page *page);
113 103
114/* 104/*
115 * Clear the page's PageMlocked(). This can be useful in a situation where 105 * Clear the page's PageMlocked(). This can be useful in a situation where
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
144 } 134 }
145} 135}
146 136
147#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 137#else /* !CONFIG_MMU */
148static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
149{ 139{
150 return 0; 140 return 0;
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { }
153static inline void mlock_vma_page(struct page *page) { } 143static inline void mlock_vma_page(struct page *page) { }
154static inline void mlock_migrate_page(struct page *new, struct page *old) { } 144static inline void mlock_migrate_page(struct page *new, struct page *old) { }
155 145
156#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#endif /* !CONFIG_MMU */
157 147
158/* 148/*
159 * Return the mem_map entry representing the 'offset' subpage within 149 * Return the mem_map entry representing the 'offset' subpage within
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
260#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
261#define ZONE_RECLAIM_SUCCESS 1 251#define ZONE_RECLAIM_SUCCESS 1
262#endif 252#endif
253
254extern int hwpoison_filter(struct page *p);
255
256extern u32 hwpoison_filter_dev_major;
257extern u32 hwpoison_filter_dev_minor;
258extern u64 hwpoison_filter_flags_mask;
259extern u64 hwpoison_filter_flags_value;
260extern u64 hwpoison_filter_memcg;
261extern u32 hwpoison_filter_enable;
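The externs above back the corrupt-filter-* debugfs knobs created earlier in this patch. As a rough, user-space illustration of the mask/value style of filtering they suggest (the helper name and the example bit position are assumptions, not kernel code):

/* Illustrative only: a candidate page passes the flags filter when its
 * flag bits, restricted to the configured mask, equal the configured
 * value. With mask == 0 every page matches, i.e. the filter is off. */
#include <stdint.h>
#include <stdio.h>

static int flags_filter_match(uint64_t page_flags, uint64_t mask, uint64_t value)
{
	return (page_flags & mask) == value;
}

int main(void)
{
	uint64_t lru = 1ULL << 5;	/* pretend bit 5 is PG_lru */

	printf("%d\n", flags_filter_match(lru, lru, lru));	/* 1: LRU pages only */
	printf("%d\n", flags_filter_match(0, lru, lru));	/* 0: filtered out */
	return 0;
}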
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 4ea4510e2996..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h> 95#include <linux/workqueue.h>
96#include <linux/crc32.h>
96 97
97#include <asm/sections.h> 98#include <asm/sections.h>
98#include <asm/processor.h> 99#include <asm/processor.h>
@@ -108,7 +109,6 @@
108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 109#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 110#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 111#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ 112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
113 113
114#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
@@ -119,8 +119,8 @@
119/* scanning area inside a memory block */ 119/* scanning area inside a memory block */
120struct kmemleak_scan_area { 120struct kmemleak_scan_area {
121 struct hlist_node node; 121 struct hlist_node node;
122 unsigned long offset; 122 unsigned long start;
123 size_t length; 123 size_t size;
124}; 124};
125 125
126#define KMEMLEAK_GREY 0 126#define KMEMLEAK_GREY 0
@@ -149,6 +149,8 @@ struct kmemleak_object {
149 int min_count; 149 int min_count;
150 /* the total number of pointers found pointing to this object */ 150 /* the total number of pointers found pointing to this object */
151 int count; 151 int count;
152 /* checksum for detecting modified objects */
153 u32 checksum;
152 /* memory ranges to be scanned inside an object (empty for all) */ 154 /* memory ranges to be scanned inside an object (empty for all) */
153 struct hlist_head area_list; 155 struct hlist_head area_list;
154 unsigned long trace[MAX_TRACE]; 156 unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
164#define OBJECT_REPORTED (1 << 1) 166#define OBJECT_REPORTED (1 << 1)
165/* flag set to not scan the object */ 167/* flag set to not scan the object */
166#define OBJECT_NO_SCAN (1 << 2) 168#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169 169
170/* number of bytes to print per line; must be 16 or 32 */ 170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16 171#define HEX_ROW_SIZE 16
@@ -241,8 +241,6 @@ struct early_log {
241 const void *ptr; /* allocated/freed memory block */ 241 const void *ptr; /* allocated/freed memory block */
242 size_t size; /* memory block size */ 242 size_t size; /* memory block size */
243 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
244 unsigned long offset; /* scan area offset */
245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */ 244 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */ 245 unsigned int trace_len; /* stack trace length */
248}; 246};
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object)
323 object->count >= object->min_count; 321 object->count >= object->min_count;
324} 322}
325 323
326static bool color_black(const struct kmemleak_object *object)
327{
328 return object->min_count == KMEMLEAK_BLACK;
329}
330
331/* 324/*
332 * Objects are considered unreferenced only if their color is white, they have 325 * Objects are considered unreferenced only if their color is white, they have
333 * not been deleted and have a minimum age to avoid false positives caused by 326 * not been deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object)
335 */ 328 */
336static bool unreferenced_object(struct kmemleak_object *object) 329static bool unreferenced_object(struct kmemleak_object *object)
337{ 330{
338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 331 return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
339 time_before_eq(object->jiffies + jiffies_min_age, 332 time_before_eq(object->jiffies + jiffies_min_age,
340 jiffies_last_scan); 333 jiffies_last_scan);
341} 334}
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq,
348 struct kmemleak_object *object) 341 struct kmemleak_object *object)
349{ 342{
350 int i; 343 int i;
344 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
351 345
352 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", 346 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
353 object->pointer, object->size); 347 object->pointer, object->size);
354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 348 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
355 object->comm, object->pid, object->jiffies); 349 object->comm, object->pid, object->jiffies,
350 msecs_age / 1000, msecs_age % 1000);
356 hex_dump_object(seq, object); 351 hex_dump_object(seq, object);
357 seq_printf(seq, " backtrace:\n"); 352 seq_printf(seq, " backtrace:\n");
358 353
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object)
381 pr_notice(" min_count = %d\n", object->min_count); 376 pr_notice(" min_count = %d\n", object->min_count);
382 pr_notice(" count = %d\n", object->count); 377 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags); 378 pr_notice(" flags = 0x%lx\n", object->flags);
379 pr_notice(" checksum = %d\n", object->checksum);
384 pr_notice(" backtrace:\n"); 380 pr_notice(" backtrace:\n");
385 print_stack_trace(&trace, 4); 381 print_stack_trace(&trace, 4);
386} 382}
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
522 INIT_HLIST_HEAD(&object->area_list); 518 INIT_HLIST_HEAD(&object->area_list);
523 spin_lock_init(&object->lock); 519 spin_lock_init(&object->lock);
524 atomic_set(&object->use_count, 1); 520 atomic_set(&object->use_count, 1);
525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW; 521 object->flags = OBJECT_ALLOCATED;
526 object->pointer = ptr; 522 object->pointer = ptr;
527 object->size = size; 523 object->size = size;
528 object->min_count = min_count; 524 object->min_count = min_count;
529 object->count = -1; /* no color initially */ 525 object->count = 0; /* white color initially */
530 object->jiffies = jiffies; 526 object->jiffies = jiffies;
527 object->checksum = 0;
531 528
532 /* task information */ 529 /* task information */
533 if (in_irq()) { 530 if (in_irq()) {
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr)
720 * Add a scanning area to the object. If at least one such area is added, 717 * Add a scanning area to the object. If at least one such area is added,
721 * kmemleak will only scan these ranges rather than the whole memory block. 718 * kmemleak will only scan these ranges rather than the whole memory block.
722 */ 719 */
723static void add_scan_area(unsigned long ptr, unsigned long offset, 720static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
724 size_t length, gfp_t gfp)
725{ 721{
726 unsigned long flags; 722 unsigned long flags;
727 struct kmemleak_object *object; 723 struct kmemleak_object *object;
728 struct kmemleak_scan_area *area; 724 struct kmemleak_scan_area *area;
729 725
730 object = find_and_get_object(ptr, 0); 726 object = find_and_get_object(ptr, 1);
731 if (!object) { 727 if (!object) {
732 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", 728 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
733 ptr); 729 ptr);
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
741 } 737 }
742 738
743 spin_lock_irqsave(&object->lock, flags); 739 spin_lock_irqsave(&object->lock, flags);
744 if (offset + length > object->size) { 740 if (ptr + size > object->pointer + object->size) {
745 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); 741 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
746 dump_object_info(object); 742 dump_object_info(object);
747 kmem_cache_free(scan_area_cache, area); 743 kmem_cache_free(scan_area_cache, area);
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
749 } 745 }
750 746
751 INIT_HLIST_NODE(&area->node); 747 INIT_HLIST_NODE(&area->node);
752 area->offset = offset; 748 area->start = ptr;
753 area->length = length; 749 area->size = size;
754 750
755 hlist_add_head(&area->node, &object->area_list); 751 hlist_add_head(&area->node, &object->area_list);
756out_unlock: 752out_unlock:
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr)
786 * processed later once kmemleak is fully initialized. 782 * processed later once kmemleak is fully initialized.
787 */ 783 */
788static void __init log_early(int op_type, const void *ptr, size_t size, 784static void __init log_early(int op_type, const void *ptr, size_t size,
789 int min_count, unsigned long offset, size_t length) 785 int min_count)
790{ 786{
791 unsigned long flags; 787 unsigned long flags;
792 struct early_log *log; 788 struct early_log *log;
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
808 log->ptr = ptr; 804 log->ptr = ptr;
809 log->size = size; 805 log->size = size;
810 log->min_count = min_count; 806 log->min_count = min_count;
811 log->offset = offset;
812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC) 807 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace); 808 log->trace_len = __save_stack_trace(log->trace);
815 crt_early_log++; 809 crt_early_log++;
@@ -833,12 +827,15 @@ static void early_alloc(struct early_log *log)
833 */ 827 */
834 rcu_read_lock(); 828 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size, 829 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL); 830 log->min_count, GFP_ATOMIC);
831 if (!object)
832 goto out;
837 spin_lock_irqsave(&object->lock, flags); 833 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++) 834 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i]; 835 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len; 836 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags); 837 spin_unlock_irqrestore(&object->lock, flags);
838out:
842 rcu_read_unlock(); 839 rcu_read_unlock();
843} 840}
844 841
@@ -855,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
855 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 852 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
856 create_object((unsigned long)ptr, size, min_count, gfp); 853 create_object((unsigned long)ptr, size, min_count, gfp);
857 else if (atomic_read(&kmemleak_early_log)) 854 else if (atomic_read(&kmemleak_early_log))
858 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); 855 log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
859} 856}
860EXPORT_SYMBOL_GPL(kmemleak_alloc); 857EXPORT_SYMBOL_GPL(kmemleak_alloc);
861 858
@@ -870,7 +867,7 @@ void __ref kmemleak_free(const void *ptr)
870 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 867 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
871 delete_object_full((unsigned long)ptr); 868 delete_object_full((unsigned long)ptr);
872 else if (atomic_read(&kmemleak_early_log)) 869 else if (atomic_read(&kmemleak_early_log))
873 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 870 log_early(KMEMLEAK_FREE, ptr, 0, 0);
874} 871}
875EXPORT_SYMBOL_GPL(kmemleak_free); 872EXPORT_SYMBOL_GPL(kmemleak_free);
876 873
@@ -885,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
885 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 882 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
886 delete_object_part((unsigned long)ptr, size); 883 delete_object_part((unsigned long)ptr, size);
887 else if (atomic_read(&kmemleak_early_log)) 884 else if (atomic_read(&kmemleak_early_log))
888 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); 885 log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
889} 886}
890EXPORT_SYMBOL_GPL(kmemleak_free_part); 887EXPORT_SYMBOL_GPL(kmemleak_free_part);
891 888
@@ -900,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr)
900 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 897 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
901 make_gray_object((unsigned long)ptr); 898 make_gray_object((unsigned long)ptr);
902 else if (atomic_read(&kmemleak_early_log)) 899 else if (atomic_read(&kmemleak_early_log))
903 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); 900 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
904} 901}
905EXPORT_SYMBOL(kmemleak_not_leak); 902EXPORT_SYMBOL(kmemleak_not_leak);
906 903
@@ -916,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr)
916 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 913 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
917 make_black_object((unsigned long)ptr); 914 make_black_object((unsigned long)ptr);
918 else if (atomic_read(&kmemleak_early_log)) 915 else if (atomic_read(&kmemleak_early_log))
919 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); 916 log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
920} 917}
921EXPORT_SYMBOL(kmemleak_ignore); 918EXPORT_SYMBOL(kmemleak_ignore);
922 919
923/* 920/*
924 * Limit the range to be scanned in an allocated memory block. 921 * Limit the range to be scanned in an allocated memory block.
925 */ 922 */
926void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, 923void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
927 size_t length, gfp_t gfp)
928{ 924{
929 pr_debug("%s(0x%p)\n", __func__, ptr); 925 pr_debug("%s(0x%p)\n", __func__, ptr);
930 926
931 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 927 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
932 add_scan_area((unsigned long)ptr, offset, length, gfp); 928 add_scan_area((unsigned long)ptr, size, gfp);
933 else if (atomic_read(&kmemleak_early_log)) 929 else if (atomic_read(&kmemleak_early_log))
934 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); 930 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
935} 931}
936EXPORT_SYMBOL(kmemleak_scan_area); 932EXPORT_SYMBOL(kmemleak_scan_area);
937 933
@@ -945,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr)
945 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 941 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
946 object_no_scan((unsigned long)ptr); 942 object_no_scan((unsigned long)ptr);
947 else if (atomic_read(&kmemleak_early_log)) 943 else if (atomic_read(&kmemleak_early_log))
948 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); 944 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
949} 945}
950EXPORT_SYMBOL(kmemleak_no_scan); 946EXPORT_SYMBOL(kmemleak_no_scan);
951 947
952/* 948/*
949 * Update an object's checksum and return true if it was modified.
950 */
951static bool update_checksum(struct kmemleak_object *object)
952{
953 u32 old_csum = object->checksum;
954
955 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
956 return false;
957
958 object->checksum = crc32(0, (void *)object->pointer, object->size);
959 return object->checksum != old_csum;
960}
961
962/*
953 * Memory scanning is a long process and it needs to be interruptible. This 963 * Memory scanning is a long process and it needs to be interruptible. This
954 * function checks whether such interrupt condition occurred. 964 * function checks whether such interrupt condition occurred.
955 */ 965 */
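The update_checksum() helper added above lets kmemleak notice objects whose contents were modified between scans by comparing CRCs. A stand-alone user-space model of the same comparison, using zlib's crc32() in place of the kernel's and an illustrative stand-in struct (build with -lz):

#include <stdint.h>
#include <stdio.h>
#include <zlib.h>

struct tracked {
	unsigned char *ptr;
	size_t size;
	uint32_t checksum;
};

/* Recompute the CRC over the tracked block; report whether it changed. */
static int update_checksum(struct tracked *obj)
{
	uint32_t old = obj->checksum;

	obj->checksum = crc32(0, obj->ptr, obj->size);
	return obj->checksum != old;
}

int main(void)
{
	unsigned char buf[16] = "hello";
	struct tracked t = { buf, sizeof(buf), 0 };

	update_checksum(&t);		/* record the initial CRC */
	buf[0] = 'H';			/* simulate a stored pointer changing */
	printf("modified: %d\n", update_checksum(&t));	/* prints 1 */
	return 0;
}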
@@ -1028,11 +1038,14 @@ static void scan_block(void *_start, void *_end,
1028 * added to the gray_list. 1038 * added to the gray_list.
1029 */ 1039 */
1030 object->count++; 1040 object->count++;
1031 if (color_gray(object)) 1041 if (color_gray(object)) {
1032 list_add_tail(&object->gray_list, &gray_list); 1042 list_add_tail(&object->gray_list, &gray_list);
1033 else 1043 spin_unlock_irqrestore(&object->lock, flags);
1034 put_object(object); 1044 continue;
1045 }
1046
1035 spin_unlock_irqrestore(&object->lock, flags); 1047 spin_unlock_irqrestore(&object->lock, flags);
1048 put_object(object);
1036 } 1049 }
1037} 1050}
1038 1051
@@ -1047,8 +1060,8 @@ static void scan_object(struct kmemleak_object *object)
1047 unsigned long flags; 1060 unsigned long flags;
1048 1061
1049 /* 1062 /*
1050 * Once the object->lock is aquired, the corresponding memory block 1063 * Once the object->lock is acquired, the corresponding memory block
1051 * cannot be freed (the same lock is aquired in delete_object). 1064 * cannot be freed (the same lock is acquired in delete_object).
1052 */ 1065 */
1053 spin_lock_irqsave(&object->lock, flags); 1066 spin_lock_irqsave(&object->lock, flags);
1054 if (object->flags & OBJECT_NO_SCAN) 1067 if (object->flags & OBJECT_NO_SCAN)
@@ -1072,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object)
1072 } 1085 }
1073 } else 1086 } else
1074 hlist_for_each_entry(area, elem, &object->area_list, node) 1087 hlist_for_each_entry(area, elem, &object->area_list, node)
1075 scan_block((void *)(object->pointer + area->offset), 1088 scan_block((void *)area->start,
1076 (void *)(object->pointer + area->offset 1089 (void *)(area->start + area->size),
1077 + area->length), object, 0); 1090 object, 0);
1078out: 1091out:
1079 spin_unlock_irqrestore(&object->lock, flags); 1092 spin_unlock_irqrestore(&object->lock, flags);
1080} 1093}
1081 1094
1082/* 1095/*
1096 * Scan the objects already referenced (gray objects). More objects will be
1097 * referenced and, if there are no memory leaks, all the objects are scanned.
1098 */
1099static void scan_gray_list(void)
1100{
1101 struct kmemleak_object *object, *tmp;
1102
1103 /*
1104 * The list traversal is safe for both tail additions and removals
1105 * from inside the loop. The kmemleak objects cannot be freed from
1106 * outside the loop because their use_count was incremented.
1107 */
1108 object = list_entry(gray_list.next, typeof(*object), gray_list);
1109 while (&object->gray_list != &gray_list) {
1110 cond_resched();
1111
1112 /* may add new objects to the list */
1113 if (!scan_should_stop())
1114 scan_object(object);
1115
1116 tmp = list_entry(object->gray_list.next, typeof(*object),
1117 gray_list);
1118
1119 /* remove the object from the list and release it */
1120 list_del(&object->gray_list);
1121 put_object(object);
1122
1123 object = tmp;
1124 }
1125 WARN_ON(!list_empty(&gray_list));
1126}
1127
1128/*
1083 * Scan data sections and all the referenced memory blocks allocated via the 1129 * Scan data sections and all the referenced memory blocks allocated via the
1084 * kernel's standard allocators. This function must be called with the 1130 * kernel's standard allocators. This function must be called with the
1085 * scan_mutex held. 1131 * scan_mutex held.
@@ -1087,10 +1133,9 @@ out:
1087static void kmemleak_scan(void) 1133static void kmemleak_scan(void)
1088{ 1134{
1089 unsigned long flags; 1135 unsigned long flags;
1090 struct kmemleak_object *object, *tmp; 1136 struct kmemleak_object *object;
1091 int i; 1137 int i;
1092 int new_leaks = 0; 1138 int new_leaks = 0;
1093 int gray_list_pass = 0;
1094 1139
1095 jiffies_last_scan = jiffies; 1140 jiffies_last_scan = jiffies;
1096 1141
@@ -1111,7 +1156,6 @@ static void kmemleak_scan(void)
1111#endif 1156#endif
1112 /* reset the reference count (whiten the object) */ 1157 /* reset the reference count (whiten the object) */
1113 object->count = 0; 1158 object->count = 0;
1114 object->flags &= ~OBJECT_NEW;
1115 if (color_gray(object) && get_object(object)) 1159 if (color_gray(object) && get_object(object))
1116 list_add_tail(&object->gray_list, &gray_list); 1160 list_add_tail(&object->gray_list, &gray_list);
1117 1161
@@ -1169,62 +1213,36 @@ static void kmemleak_scan(void)
1169 1213
1170 /* 1214 /*
1171 * Scan the objects already referenced from the sections scanned 1215 * Scan the objects already referenced from the sections scanned
1172 * above. More objects will be referenced and, if there are no memory 1216 * above.
1173 * leaks, all the objects will be scanned. The list traversal is safe
1174 * for both tail additions and removals from inside the loop. The
1175 * kmemleak objects cannot be freed from outside the loop because their
1176 * use_count was increased.
1177 */ 1217 */
1178repeat: 1218 scan_gray_list();
1179 object = list_entry(gray_list.next, typeof(*object), gray_list);
1180 while (&object->gray_list != &gray_list) {
1181 cond_resched();
1182
1183 /* may add new objects to the list */
1184 if (!scan_should_stop())
1185 scan_object(object);
1186
1187 tmp = list_entry(object->gray_list.next, typeof(*object),
1188 gray_list);
1189
1190 /* remove the object from the list and release it */
1191 list_del(&object->gray_list);
1192 put_object(object);
1193
1194 object = tmp;
1195 }
1196
1197 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1198 goto scan_end;
1199 1219
1200 /* 1220 /*
1201 * Check for new objects allocated during this scanning and add them 1221 * Check for new or unreferenced objects modified since the previous
1202 * to the gray list. 1222 * scan and color them gray until the next scan.
1203 */ 1223 */
1204 rcu_read_lock(); 1224 rcu_read_lock();
1205 list_for_each_entry_rcu(object, &object_list, object_list) { 1225 list_for_each_entry_rcu(object, &object_list, object_list) {
1206 spin_lock_irqsave(&object->lock, flags); 1226 spin_lock_irqsave(&object->lock, flags);
1207 if ((object->flags & OBJECT_NEW) && !color_black(object) && 1227 if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
1208 get_object(object)) { 1228 && update_checksum(object) && get_object(object)) {
1209 object->flags &= ~OBJECT_NEW; 1229 /* color it gray temporarily */
1230 object->count = object->min_count;
1210 list_add_tail(&object->gray_list, &gray_list); 1231 list_add_tail(&object->gray_list, &gray_list);
1211 } 1232 }
1212 spin_unlock_irqrestore(&object->lock, flags); 1233 spin_unlock_irqrestore(&object->lock, flags);
1213 } 1234 }
1214 rcu_read_unlock(); 1235 rcu_read_unlock();
1215 1236
1216 if (!list_empty(&gray_list)) 1237 /*
1217 goto repeat; 1238 * Re-scan the gray list for modified unreferenced objects.
1218 1239 */
1219scan_end: 1240 scan_gray_list();
1220 WARN_ON(!list_empty(&gray_list));
1221 1241
1222 /* 1242 /*
1223 * If scanning was stopped or new objects were being allocated at a 1243 * If scanning was stopped do not report any new unreferenced objects.
1224 * higher rate than gray list scanning, do not report any new
1225 * unreferenced objects.
1226 */ 1244 */
1227 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) 1245 if (scan_should_stop())
1228 return; 1246 return;
1229 1247
1230 /* 1248 /*
@@ -1639,8 +1657,7 @@ void __init kmemleak_init(void)
1639 kmemleak_ignore(log->ptr); 1657 kmemleak_ignore(log->ptr);
1640 break; 1658 break;
1641 case KMEMLEAK_SCAN_AREA: 1659 case KMEMLEAK_SCAN_AREA:
1642 kmemleak_scan_area(log->ptr, log->offset, log->length, 1660 kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
1643 GFP_KERNEL);
1644 break; 1661 break;
1645 case KMEMLEAK_NO_SCAN: 1662 case KMEMLEAK_NO_SCAN:
1646 kmemleak_no_scan(log->ptr); 1663 kmemleak_no_scan(log->ptr);
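A design note on the scanning rework above: scan_gray_list() consumes the gray list as a worklist, so scanning one object may append newly referenced objects and the pass ends only when the list drains; the bounded GRAY_LIST_PASSES loop is dropped because modified objects are instead caught by their checksum and re-queued. A minimal stand-alone model of that worklist pattern (list type and ids are illustrative, not kmemleak's):

#include <stdio.h>
#include <stdlib.h>

struct item {
	int id;
	struct item *next;
};

static struct item *head, **tail = &head;

static void enqueue(int id)
{
	struct item *it = malloc(sizeof(*it));

	it->id = id;
	it->next = NULL;
	*tail = it;
	tail = &it->next;
}

int main(void)
{
	enqueue(1);			/* the initially gray objects */

	while (head) {
		struct item *it = head;

		if (it->id == 1) {	/* "scanning" 1 discovers two more */
			enqueue(2);
			enqueue(3);
		}
		printf("scanned %d\n", it->id);

		head = it->next;	/* drop it from the worklist */
		if (!head)
			tail = &head;
		free(it);
	}
	return 0;
}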
diff --git a/mm/ksm.c b/mm/ksm.c
index f7edac356f46..56a0da1f9979 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,11 +29,13 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rbtree.h> 31#include <linux/rbtree.h>
32#include <linux/memory.h>
32#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/ksm.h> 35#include <linux/ksm.h>
35 36
36#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
38#include "internal.h"
37 39
38/* 40/*
39 * A few notes about the KSM scanning process, 41 * A few notes about the KSM scanning process,
@@ -79,13 +81,13 @@
79 * struct mm_slot - ksm information per mm that is being scanned 81 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list 82 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head 83 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items 84 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
83 * @mm: the mm that this information is valid for 85 * @mm: the mm that this information is valid for
84 */ 86 */
85struct mm_slot { 87struct mm_slot {
86 struct hlist_node link; 88 struct hlist_node link;
87 struct list_head mm_list; 89 struct list_head mm_list;
88 struct list_head rmap_list; 90 struct rmap_item *rmap_list;
89 struct mm_struct *mm; 91 struct mm_struct *mm;
90}; 92};
91 93
@@ -93,7 +95,7 @@ struct mm_slot {
93 * struct ksm_scan - cursor for scanning 95 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning 96 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned 97 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list 98 * @rmap_list: link to the next rmap to be scanned in the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node) 99 * @seqnr: count of completed full scans (needed when removing unstable node)
98 * 100 *
99 * There is only the one ksm_scan instance of this cursor structure. 101 * There is only the one ksm_scan instance of this cursor structure.
@@ -101,37 +103,51 @@ struct mm_slot {
101struct ksm_scan { 103struct ksm_scan {
102 struct mm_slot *mm_slot; 104 struct mm_slot *mm_slot;
103 unsigned long address; 105 unsigned long address;
104 struct rmap_item *rmap_item; 106 struct rmap_item **rmap_list;
105 unsigned long seqnr; 107 unsigned long seqnr;
106}; 108};
107 109
108/** 110/**
111 * struct stable_node - node of the stable rbtree
112 * @node: rb node of this ksm page in the stable tree
113 * @hlist: hlist head of rmap_items using this ksm page
114 * @kpfn: page frame number of this ksm page
115 */
116struct stable_node {
117 struct rb_node node;
118 struct hlist_head hlist;
119 unsigned long kpfn;
120};
121
122/**
109 * struct rmap_item - reverse mapping item for virtual addresses 123 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm) 124 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
125 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
111 * @mm: the memory structure this rmap_item is pointing into 126 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 127 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address 128 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree 129 * @node: rb node of this rmap_item in the unstable tree
115 * @next: next rmap_item hanging off the same node of the stable tree 130 * @head: pointer to stable_node heading this list in the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree 131 * @hlist: link into hlist of rmap_items hanging off that stable_node
117 */ 132 */
118struct rmap_item { 133struct rmap_item {
119 struct list_head link; 134 struct rmap_item *rmap_list;
135 struct anon_vma *anon_vma; /* when stable */
120 struct mm_struct *mm; 136 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */ 137 unsigned long address; /* + low bits used for flags below */
138 unsigned int oldchecksum; /* when unstable */
122 union { 139 union {
123 unsigned int oldchecksum; /* when unstable */ 140 struct rb_node node; /* when node of unstable tree */
124 struct rmap_item *next; /* when stable */ 141 struct { /* when listed from stable tree */
125 }; 142 struct stable_node *head;
126 union { 143 struct hlist_node hlist;
127 struct rb_node node; /* when tree node */ 144 };
128 struct rmap_item *prev; /* in stable list */
129 }; 145 };
130}; 146};
131 147
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ 148#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ 149#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ 150#define STABLE_FLAG 0x200 /* is listed from the stable tree */
135 151
136/* The stable and unstable tree heads */ 152/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT; 153static struct rb_root root_stable_tree = RB_ROOT;
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = {
148}; 164};
149 165
150static struct kmem_cache *rmap_item_cache; 166static struct kmem_cache *rmap_item_cache;
167static struct kmem_cache *stable_node_cache;
151static struct kmem_cache *mm_slot_cache; 168static struct kmem_cache *mm_slot_cache;
152 169
153/* The number of nodes in the stable tree */ 170/* The number of nodes in the stable tree */
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared;
162/* The number of rmap_items in use: to calculate pages_volatile */ 179/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items; 180static unsigned long ksm_rmap_items;
164 181
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */ 182/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100; 183static unsigned int ksm_thread_pages_to_scan = 100;
170 184
@@ -184,24 +198,25 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
184 sizeof(struct __struct), __alignof__(struct __struct),\ 198 sizeof(struct __struct), __alignof__(struct __struct),\
185 (__flags), NULL) 199 (__flags), NULL)
186 200
187static void __init ksm_init_max_kernel_pages(void)
188{
189 ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
190}
191
192static int __init ksm_slab_init(void) 201static int __init ksm_slab_init(void)
193{ 202{
194 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); 203 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
195 if (!rmap_item_cache) 204 if (!rmap_item_cache)
196 goto out; 205 goto out;
197 206
207 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
208 if (!stable_node_cache)
209 goto out_free1;
210
198 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); 211 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
199 if (!mm_slot_cache) 212 if (!mm_slot_cache)
200 goto out_free; 213 goto out_free2;
201 214
202 return 0; 215 return 0;
203 216
204out_free: 217out_free2:
218 kmem_cache_destroy(stable_node_cache);
219out_free1:
205 kmem_cache_destroy(rmap_item_cache); 220 kmem_cache_destroy(rmap_item_cache);
206out: 221out:
207 return -ENOMEM; 222 return -ENOMEM;
@@ -210,6 +225,7 @@ out:
210static void __init ksm_slab_free(void) 225static void __init ksm_slab_free(void)
211{ 226{
212 kmem_cache_destroy(mm_slot_cache); 227 kmem_cache_destroy(mm_slot_cache);
228 kmem_cache_destroy(stable_node_cache);
213 kmem_cache_destroy(rmap_item_cache); 229 kmem_cache_destroy(rmap_item_cache);
214 mm_slot_cache = NULL; 230 mm_slot_cache = NULL;
215} 231}
@@ -231,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
231 kmem_cache_free(rmap_item_cache, rmap_item); 247 kmem_cache_free(rmap_item_cache, rmap_item);
232} 248}
233 249
250static inline struct stable_node *alloc_stable_node(void)
251{
252 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
253}
254
255static inline void free_stable_node(struct stable_node *stable_node)
256{
257 kmem_cache_free(stable_node_cache, stable_node);
258}
259
234static inline struct mm_slot *alloc_mm_slot(void) 260static inline struct mm_slot *alloc_mm_slot(void)
235{ 261{
236 if (!mm_slot_cache) /* initialization failed */ 262 if (!mm_slot_cache) /* initialization failed */
@@ -280,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
280 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
281 % MM_SLOTS_HASH_HEADS]; 307 % MM_SLOTS_HASH_HEADS];
282 mm_slot->mm = mm; 308 mm_slot->mm = mm;
283 INIT_LIST_HEAD(&mm_slot->rmap_list);
284 hlist_add_head(&mm_slot->link, bucket); 309 hlist_add_head(&mm_slot->link, bucket);
285} 310}
286 311
@@ -289,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
289 return rmap_item->address & STABLE_FLAG; 314 return rmap_item->address & STABLE_FLAG;
290} 315}
291 316
317static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma)
319{
320 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->ksm_refcount);
322}
323
324static void drop_anon_vma(struct rmap_item *rmap_item)
325{
326 struct anon_vma *anon_vma = rmap_item->anon_vma;
327
328 if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334}
335
292/* 336/*
293 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 337 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
294 * page tables after it has passed through ksm_exit() - which, if necessary, 338 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -361,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
361 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 405 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
362} 406}
363 407
364static void break_cow(struct mm_struct *mm, unsigned long addr) 408static void break_cow(struct rmap_item *rmap_item)
365{ 409{
410 struct mm_struct *mm = rmap_item->mm;
411 unsigned long addr = rmap_item->address;
366 struct vm_area_struct *vma; 412 struct vm_area_struct *vma;
367 413
414 /*
415 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma.
417 */
418 drop_anon_vma(rmap_item);
419
368 down_read(&mm->mmap_sem); 420 down_read(&mm->mmap_sem);
369 if (ksm_test_exit(mm)) 421 if (ksm_test_exit(mm))
370 goto out; 422 goto out;
@@ -408,21 +460,77 @@ out: page = NULL;
408 return page; 460 return page;
409} 461}
410 462
463static void remove_node_from_stable_tree(struct stable_node *stable_node)
464{
465 struct rmap_item *rmap_item;
466 struct hlist_node *hlist;
467
468 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
469 if (rmap_item->hlist.next)
470 ksm_pages_sharing--;
471 else
472 ksm_pages_shared--;
473 drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK;
475 cond_resched();
476 }
477
478 rb_erase(&stable_node->node, &root_stable_tree);
479 free_stable_node(stable_node);
480}
481
411/* 482/*
412 * get_ksm_page: checks if the page at the virtual address in rmap_item 483 * get_ksm_page: checks if the page indicated by the stable node
413 * is still PageKsm, in which case we can trust the content of the page, 484 * is still its ksm page, despite having held no reference to it.
414 * and it returns the gotten page; but NULL if the page has been zapped. 485 * In which case we can trust the content of the page, and it
486 * returns the gotten page; but if the page has now been zapped,
487 * remove the stale node from the stable tree and return NULL.
488 *
489 * You would expect the stable_node to hold a reference to the ksm page.
490 * But if it increments the page's count, swapping out has to wait for
491 * ksmd to come around again before it can free the page, which may take
492 * seconds or even minutes: much too unresponsive. So instead we use a
493 * "keyhole reference": access to the ksm page from the stable node peeps
494 * out through its keyhole to see if that page still holds the right key,
495 * pointing back to this stable node. This relies on freeing a PageAnon
496 * page to reset its page->mapping to NULL, and relies on no other use of
497 * a page to put something that might look like our key in page->mapping.
498 *
499 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
500 * but this is different - made simpler by ksm_thread_mutex being held, but
501 * interesting for assuming that no other use of the struct page could ever
502 * put our expected_mapping into page->mapping (or a field of the union which
503 * coincides with page->mapping). The RCU calls are not for KSM at all, but
504 * to keep the page_count protocol described with page_cache_get_speculative.
505 *
506 * Note: it is possible that get_ksm_page() will return NULL one moment,
507 * then page the next, if the page is in between page_freeze_refs() and
508 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
509 * is on its way to being freed; but it is an anomaly to bear in mind.
415 */ 510 */
416static struct page *get_ksm_page(struct rmap_item *rmap_item) 511static struct page *get_ksm_page(struct stable_node *stable_node)
417{ 512{
418 struct page *page; 513 struct page *page;
419 514 void *expected_mapping;
420 page = get_mergeable_page(rmap_item); 515
421 if (page && !PageKsm(page)) { 516 page = pfn_to_page(stable_node->kpfn);
517 expected_mapping = (void *)stable_node +
518 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
519 rcu_read_lock();
520 if (page->mapping != expected_mapping)
521 goto stale;
522 if (!get_page_unless_zero(page))
523 goto stale;
524 if (page->mapping != expected_mapping) {
422 put_page(page); 525 put_page(page);
423 page = NULL; 526 goto stale;
424 } 527 }
528 rcu_read_unlock();
425 return page; 529 return page;
530stale:
531 rcu_read_unlock();
532 remove_node_from_stable_tree(stable_node);
533 return NULL;
426} 534}
427 535
428/* 536/*
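The long get_ksm_page() comment above describes a "keyhole reference": peek at the expected key, take a reference only while the count is non-zero, then re-check the key now that the object cannot go away. A much simplified user-space analogue using C11 atomics is sketched below; it leaves out the RCU protection and memory ordering a real implementation needs, and every name in it is illustrative rather than kernel code.

#include <stdatomic.h>
#include <stddef.h>

struct object {
	atomic_int refcount;	/* 0 means the object is being freed */
	void *mapping;		/* the "key" the holder expects to find */
};

/* Take a reference only if the count has not already dropped to zero. */
static int get_unless_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0)
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return 1;
	return 0;
}

static struct object *get_object_keyhole(struct object *obj, void *expected_key)
{
	if (obj->mapping != expected_key)	/* cheap peek before referencing */
		return NULL;
	if (!get_unless_zero(&obj->refcount))
		return NULL;			/* already on its way to being freed */
	if (obj->mapping != expected_key) {	/* re-check now that we hold a ref */
		atomic_fetch_sub(&obj->refcount, 1);
		return NULL;
	}
	return obj;
}

int main(void)
{
	struct object o;

	atomic_init(&o.refcount, 1);
	o.mapping = &o;
	return get_object_keyhole(&o, &o) ? 0 : 1;
}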
@@ -431,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item)
431 */ 539 */
432static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) 540static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
433{ 541{
434 if (in_stable_tree(rmap_item)) { 542 if (rmap_item->address & STABLE_FLAG) {
435 struct rmap_item *next_item = rmap_item->next; 543 struct stable_node *stable_node;
436 544 struct page *page;
437 if (rmap_item->address & NODE_FLAG) {
438 if (next_item) {
439 rb_replace_node(&rmap_item->node,
440 &next_item->node,
441 &root_stable_tree);
442 next_item->address |= NODE_FLAG;
443 ksm_pages_sharing--;
444 } else {
445 rb_erase(&rmap_item->node, &root_stable_tree);
446 ksm_pages_shared--;
447 }
448 } else {
449 struct rmap_item *prev_item = rmap_item->prev;
450 545
451 BUG_ON(prev_item->next != rmap_item); 546 stable_node = rmap_item->head;
452 prev_item->next = next_item; 547 page = get_ksm_page(stable_node);
453 if (next_item) { 548 if (!page)
454 BUG_ON(next_item->prev != rmap_item); 549 goto out;
455 next_item->prev = rmap_item->prev; 550
456 } 551 lock_page(page);
552 hlist_del(&rmap_item->hlist);
553 unlock_page(page);
554 put_page(page);
555
556 if (stable_node->hlist.first)
457 ksm_pages_sharing--; 557 ksm_pages_sharing--;
458 } 558 else
559 ksm_pages_shared--;
459 560
460 rmap_item->next = NULL; 561 drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK;
461 563
462 } else if (rmap_item->address & NODE_FLAG) { 564 } else if (rmap_item->address & UNSTABLE_FLAG) {
463 unsigned char age; 565 unsigned char age;
464 /* 566 /*
465 * Usually ksmd can and must skip the rb_erase, because 567 * Usually ksmd can and must skip the rb_erase, because
@@ -472,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
472 BUG_ON(age > 1); 574 BUG_ON(age > 1);
473 if (!age) 575 if (!age)
474 rb_erase(&rmap_item->node, &root_unstable_tree); 576 rb_erase(&rmap_item->node, &root_unstable_tree);
577
475 ksm_pages_unshared--; 578 ksm_pages_unshared--;
579 rmap_item->address &= PAGE_MASK;
476 } 580 }
477 581out:
478 rmap_item->address &= PAGE_MASK;
479
480 cond_resched(); /* we're called from many long loops */ 582 cond_resched(); /* we're called from many long loops */
481} 583}
482 584
483static void remove_trailing_rmap_items(struct mm_slot *mm_slot, 585static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
484 struct list_head *cur) 586 struct rmap_item **rmap_list)
485{ 587{
486 struct rmap_item *rmap_item; 588 while (*rmap_list) {
487 589 struct rmap_item *rmap_item = *rmap_list;
488 while (cur != &mm_slot->rmap_list) { 590 *rmap_list = rmap_item->rmap_list;
489 rmap_item = list_entry(cur, struct rmap_item, link);
490 cur = cur->next;
491 remove_rmap_item_from_tree(rmap_item); 591 remove_rmap_item_from_tree(rmap_item);
492 list_del(&rmap_item->link);
493 free_rmap_item(rmap_item); 592 free_rmap_item(rmap_item);
494 } 593 }
495} 594}
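remove_trailing_rmap_items() above walks the new singly-linked rmap_list through a pointer-to-pointer, so unlinking needs no special case for the head and the caller can hand in any point along the chain. A generic stand-alone sketch of that idiom (node type and values are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* Unlink and free every node reachable from *link, leaving *link NULL. */
static void remove_trailing(struct node **link)
{
	while (*link) {
		struct node *n = *link;

		*link = n->next;	/* unlink before freeing */
		printf("freeing %d\n", n->val);
		free(n);
	}
}

int main(void)
{
	struct node *head = NULL, **tail = &head;
	int i;

	for (i = 1; i <= 3; i++) {	/* build 1 -> 2 -> 3 */
		struct node *n = malloc(sizeof(*n));

		n->val = i;
		n->next = NULL;
		*tail = n;
		tail = &n->next;
	}

	remove_trailing(&head->next);	/* drop everything after the first node */
	remove_trailing(&head);		/* then drop the rest */
	return 0;
}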
@@ -555,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void)
555 goto error; 654 goto error;
556 } 655 }
557 656
558 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); 657 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
559 658
560 spin_lock(&ksm_mmlist_lock); 659 spin_lock(&ksm_mmlist_lock);
561 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 660 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -651,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
651 * Check that no O_DIRECT or similar I/O is in progress on the 750 * Check that no O_DIRECT or similar I/O is in progress on the
652 * page 751 * page
653 */ 752 */
654 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { 753 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
655 set_pte_at_notify(mm, addr, ptep, entry); 754 set_pte_at_notify(mm, addr, ptep, entry);
656 goto out_unlock; 755 goto out_unlock;
657 } 756 }
@@ -669,15 +768,15 @@ out:
669 768
670/** 769/**
671 * replace_page - replace page in vma by new ksm page 770 * replace_page - replace page in vma by new ksm page
672 * @vma: vma that holds the pte pointing to oldpage 771 * @vma: vma that holds the pte pointing to page
673 * @oldpage: the page we are replacing by newpage 772 * @page: the page we are replacing by kpage
674 * @newpage: the ksm page we replace oldpage by 773 * @kpage: the ksm page we replace page by
675 * @orig_pte: the original value of the pte 774 * @orig_pte: the original value of the pte
676 * 775 *
677 * Returns 0 on success, -EFAULT on failure. 776 * Returns 0 on success, -EFAULT on failure.
678 */ 777 */
679static int replace_page(struct vm_area_struct *vma, struct page *oldpage, 778static int replace_page(struct vm_area_struct *vma, struct page *page,
680 struct page *newpage, pte_t orig_pte) 779 struct page *kpage, pte_t orig_pte)
681{ 780{
682 struct mm_struct *mm = vma->vm_mm; 781 struct mm_struct *mm = vma->vm_mm;
683 pgd_t *pgd; 782 pgd_t *pgd;
@@ -686,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
686 pte_t *ptep; 785 pte_t *ptep;
687 spinlock_t *ptl; 786 spinlock_t *ptl;
688 unsigned long addr; 787 unsigned long addr;
689 pgprot_t prot;
690 int err = -EFAULT; 788 int err = -EFAULT;
691 789
692 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); 790 addr = page_address_in_vma(page, vma);
693
694 addr = page_address_in_vma(oldpage, vma);
695 if (addr == -EFAULT) 791 if (addr == -EFAULT)
696 goto out; 792 goto out;
697 793
@@ -713,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
713 goto out; 809 goto out;
714 } 810 }
715 811
716 get_page(newpage); 812 get_page(kpage);
717 page_add_ksm_rmap(newpage); 813 page_add_anon_rmap(kpage, vma, addr);
718 814
719 flush_cache_page(vma, addr, pte_pfn(*ptep)); 815 flush_cache_page(vma, addr, pte_pfn(*ptep));
720 ptep_clear_flush(vma, addr, ptep); 816 ptep_clear_flush(vma, addr, ptep);
721 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
722 818
723 page_remove_rmap(oldpage); 819 page_remove_rmap(page);
724 put_page(oldpage); 820 put_page(page);
725 821
726 pte_unmap_unlock(ptep, ptl); 822 pte_unmap_unlock(ptep, ptl);
727 err = 0; 823 err = 0;
@@ -731,32 +827,27 @@ out:
731 827
732/* 828/*
733 * try_to_merge_one_page - take two pages and merge them into one 829 * try_to_merge_one_page - take two pages and merge them into one
734 * @vma: the vma that hold the pte pointing into oldpage 830 * @vma: the vma that holds the pte pointing to page
735 * @oldpage: the page that we want to replace with newpage 831 * @page: the PageAnon page that we want to replace with kpage
736 * @newpage: the page that we want to map instead of oldpage 832 * @kpage: the PageKsm page that we want to map instead of page,
737 * 833 * or NULL the first time when we want to use page as kpage.
738 * Note:
739 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
740 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
741 * 834 *
742 * This function returns 0 if the pages were merged, -EFAULT otherwise. 835 * This function returns 0 if the pages were merged, -EFAULT otherwise.
743 */ 836 */
744static int try_to_merge_one_page(struct vm_area_struct *vma, 837static int try_to_merge_one_page(struct vm_area_struct *vma,
745 struct page *oldpage, 838 struct page *page, struct page *kpage)
746 struct page *newpage)
747{ 839{
748 pte_t orig_pte = __pte(0); 840 pte_t orig_pte = __pte(0);
749 int err = -EFAULT; 841 int err = -EFAULT;
750 842
843 if (page == kpage) /* ksm page forked */
844 return 0;
845
751 if (!(vma->vm_flags & VM_MERGEABLE)) 846 if (!(vma->vm_flags & VM_MERGEABLE))
752 goto out; 847 goto out;
753 848 if (!PageAnon(page))
754 if (!PageAnon(oldpage))
755 goto out; 849 goto out;
756 850
757 get_page(newpage);
758 get_page(oldpage);
759
760 /* 851 /*
761 * We need the page lock to read a stable PageSwapCache in 852 * We need the page lock to read a stable PageSwapCache in
762 * write_protect_page(). We use trylock_page() instead of 853 * write_protect_page(). We use trylock_page() instead of
@@ -764,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
764 * prefer to continue scanning and merging different pages, 855 * prefer to continue scanning and merging different pages,
765 * then come back to this page when it is unlocked. 856 * then come back to this page when it is unlocked.
766 */ 857 */
767 if (!trylock_page(oldpage)) 858 if (!trylock_page(page))
768 goto out_putpage; 859 goto out;
769 /* 860 /*
770 * If this anonymous page is mapped only here, its pte may need 861 * If this anonymous page is mapped only here, its pte may need
771 * to be write-protected. If it's mapped elsewhere, all of its 862 * to be write-protected. If it's mapped elsewhere, all of its
772 * ptes are necessarily already write-protected. But in either 863 * ptes are necessarily already write-protected. But in either
773 * case, we need to lock and check page_count is not raised. 864 * case, we need to lock and check page_count is not raised.
774 */ 865 */
775 if (write_protect_page(vma, oldpage, &orig_pte)) { 866 if (write_protect_page(vma, page, &orig_pte) == 0) {
776 unlock_page(oldpage); 867 if (!kpage) {
777 goto out_putpage; 868 /*
869 * While we hold page lock, upgrade page from
870 * PageAnon+anon_vma to PageKsm+NULL stable_node:
871 * stable_tree_insert() will update stable_node.
872 */
873 set_page_stable_node(page, NULL);
874 mark_page_accessed(page);
875 err = 0;
876 } else if (pages_identical(page, kpage))
877 err = replace_page(vma, page, kpage, orig_pte);
778 } 878 }
779 unlock_page(oldpage);
780 879
781 if (pages_identical(oldpage, newpage)) 880 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
782 err = replace_page(vma, oldpage, newpage, orig_pte); 881 munlock_vma_page(page);
882 if (!PageMlocked(kpage)) {
883 unlock_page(page);
884 lock_page(kpage);
885 mlock_vma_page(kpage);
886 page = kpage; /* for final unlock */
887 }
888 }
783 889
784out_putpage: 890 unlock_page(page);
785 put_page(oldpage);
786 put_page(newpage);
787out: 891out:
788 return err; 892 return err;
789} 893}
@@ -791,26 +895,31 @@ out:
791/* 895/*
792 * try_to_merge_with_ksm_page - like try_to_merge_two_pages, 896 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
793 * but no new kernel page is allocated: kpage must already be a ksm page. 897 * but no new kernel page is allocated: kpage must already be a ksm page.
898 *
899 * This function returns 0 if the pages were merged, -EFAULT otherwise.
794 */ 900 */
795static int try_to_merge_with_ksm_page(struct mm_struct *mm1, 901static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
796 unsigned long addr1, 902 struct page *page, struct page *kpage)
797 struct page *page1,
798 struct page *kpage)
799{ 903{
904 struct mm_struct *mm = rmap_item->mm;
800 struct vm_area_struct *vma; 905 struct vm_area_struct *vma;
801 int err = -EFAULT; 906 int err = -EFAULT;
802 907
803 down_read(&mm1->mmap_sem); 908 down_read(&mm->mmap_sem);
804 if (ksm_test_exit(mm1)) 909 if (ksm_test_exit(mm))
910 goto out;
911 vma = find_vma(mm, rmap_item->address);
912 if (!vma || vma->vm_start > rmap_item->address)
805 goto out; 913 goto out;
806 914
807 vma = find_vma(mm1, addr1); 915 err = try_to_merge_one_page(vma, page, kpage);
808 if (!vma || vma->vm_start > addr1) 916 if (err)
809 goto out; 917 goto out;
810 918
811 err = try_to_merge_one_page(vma, page1, kpage); 919 /* Must get reference to anon_vma while still holding mmap_sem */
920 hold_anon_vma(rmap_item, vma->anon_vma);
812out: 921out:
813 up_read(&mm1->mmap_sem); 922 up_read(&mm->mmap_sem);
814 return err; 923 return err;
815} 924}
816 925
@@ -818,109 +927,73 @@ out:
818 * try_to_merge_two_pages - take two identical pages and prepare them 927 * try_to_merge_two_pages - take two identical pages and prepare them
819 * to be merged into one page. 928 * to be merged into one page.
820 * 929 *
821 * This function returns 0 if we successfully mapped two identical pages 930 * This function returns the kpage if we successfully merged two identical
822 * into one page, -EFAULT otherwise. 931 * pages into one ksm page, NULL otherwise.
823 * 932 *
824 * Note that this function allocates a new kernel page: if one of the pages 933 * Note that this function upgrades page to ksm page: if one of the pages
825 * is already a ksm page, try_to_merge_with_ksm_page should be used. 934 * is already a ksm page, try_to_merge_with_ksm_page should be used.
826 */ 935 */
827static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, 936static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
828 struct page *page1, struct mm_struct *mm2, 937 struct page *page,
829 unsigned long addr2, struct page *page2) 938 struct rmap_item *tree_rmap_item,
939 struct page *tree_page)
830{ 940{
831 struct vm_area_struct *vma; 941 int err;
832 struct page *kpage;
833 int err = -EFAULT;
834
835 /*
836 * The number of nodes in the stable tree
837 * is the number of kernel pages that we hold.
838 */
839 if (ksm_max_kernel_pages &&
840 ksm_max_kernel_pages <= ksm_pages_shared)
841 return err;
842
843 kpage = alloc_page(GFP_HIGHUSER);
844 if (!kpage)
845 return err;
846
847 down_read(&mm1->mmap_sem);
848 if (ksm_test_exit(mm1)) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852 vma = find_vma(mm1, addr1);
853 if (!vma || vma->vm_start > addr1) {
854 up_read(&mm1->mmap_sem);
855 goto out;
856 }
857
858 copy_user_highpage(kpage, page1, addr1, vma);
859 err = try_to_merge_one_page(vma, page1, kpage);
860 up_read(&mm1->mmap_sem);
861 942
943 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
862 if (!err) { 944 if (!err) {
863 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); 945 err = try_to_merge_with_ksm_page(tree_rmap_item,
946 tree_page, page);
864 /* 947 /*
865 * If that fails, we have a ksm page with only one pte 948 * If that fails, we have a ksm page with only one pte
866 * pointing to it: so break it. 949 * pointing to it: so break it.
867 */ 950 */
868 if (err) 951 if (err)
869 break_cow(mm1, addr1); 952 break_cow(rmap_item);
870 } 953 }
871out: 954 return err ? NULL : page;
872 put_page(kpage);
873 return err;
874} 955}
875 956
876/* 957/*
877 * stable_tree_search - search page inside the stable tree 958 * stable_tree_search - search for page inside the stable tree
878 * @page: the page that we are searching identical pages to.
879 * @page2: pointer into identical page that we are holding inside the stable
880 * tree that we have found.
881 * @rmap_item: the reverse mapping item
882 * 959 *
883 * This function checks if there is a page inside the stable tree 960 * This function checks if there is a page inside the stable tree
884 * with identical content to the page that we are scanning right now. 961 * with identical content to the page that we are scanning right now.
885 * 962 *
 886 * This function return rmap_item pointer to the identical item if found, 963 * This function returns the page of a stable node with identical content if found,
887 * NULL otherwise. 964 * NULL otherwise.
888 */ 965 */
889static struct rmap_item *stable_tree_search(struct page *page, 966static struct page *stable_tree_search(struct page *page)
890 struct page **page2,
891 struct rmap_item *rmap_item)
892{ 967{
893 struct rb_node *node = root_stable_tree.rb_node; 968 struct rb_node *node = root_stable_tree.rb_node;
969 struct stable_node *stable_node;
970
971 stable_node = page_stable_node(page);
972 if (stable_node) { /* ksm page forked */
973 get_page(page);
974 return page;
975 }
894 976
895 while (node) { 977 while (node) {
896 struct rmap_item *tree_rmap_item, *next_rmap_item; 978 struct page *tree_page;
897 int ret; 979 int ret;
898 980
899 tree_rmap_item = rb_entry(node, struct rmap_item, node); 981 cond_resched();
900 while (tree_rmap_item) { 982 stable_node = rb_entry(node, struct stable_node, node);
901 BUG_ON(!in_stable_tree(tree_rmap_item)); 983 tree_page = get_ksm_page(stable_node);
902 cond_resched(); 984 if (!tree_page)
903 page2[0] = get_ksm_page(tree_rmap_item);
904 if (page2[0])
905 break;
906 next_rmap_item = tree_rmap_item->next;
907 remove_rmap_item_from_tree(tree_rmap_item);
908 tree_rmap_item = next_rmap_item;
909 }
910 if (!tree_rmap_item)
911 return NULL; 985 return NULL;
912 986
913 ret = memcmp_pages(page, page2[0]); 987 ret = memcmp_pages(page, tree_page);
914 988
915 if (ret < 0) { 989 if (ret < 0) {
916 put_page(page2[0]); 990 put_page(tree_page);
917 node = node->rb_left; 991 node = node->rb_left;
918 } else if (ret > 0) { 992 } else if (ret > 0) {
919 put_page(page2[0]); 993 put_page(tree_page);
920 node = node->rb_right; 994 node = node->rb_right;
921 } else { 995 } else
922 return tree_rmap_item; 996 return tree_page;
923 }
924 } 997 }
925 998
926 return NULL; 999 return NULL;
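stable_tree_search() above walks a tree keyed purely by page content: memcmp_pages() decides whether to descend left or right until an identical page is found. As a rough stand-alone illustration of that content-keyed lookup, here is a user-space sketch over a plain binary search tree of fixed-size buffers (no rbtree and no page pinning; every name is made up for the example):

#include <string.h>
#include <stddef.h>

#define BLOCK_SIZE 64

struct node {
        unsigned char data[BLOCK_SIZE];
        struct node *left, *right;
};

/* Walk the tree the way stable_tree_search() does: compare raw content and
 * descend left or right until an identical block (or nothing) is found. */
static struct node *content_search(struct node *root, const unsigned char *data)
{
        while (root) {
                int ret = memcmp(data, root->data, BLOCK_SIZE);

                if (ret < 0)
                        root = root->left;
                else if (ret > 0)
                        root = root->right;
                else
                        return root;    /* identical content found */
        }
        return NULL;
}

int main(void)
{
        struct node a = { .data = "hello" }, b = { .data = "world" };

        a.right = &b;           /* "world" sorts after "hello" in memcmp order */
        return content_search(&a, b.data) == &b ? 0 : 1;
}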
@@ -930,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page,
930 * stable_tree_insert - insert rmap_item pointing to new ksm page 1003 * stable_tree_insert - insert rmap_item pointing to new ksm page
931 * into the stable tree. 1004 * into the stable tree.
932 * 1005 *
933 * @page: the page that we are searching identical page to inside the stable 1006 * This function returns the stable tree node just allocated on success,
934 * tree. 1007 * NULL otherwise.
935 * @rmap_item: pointer to the reverse mapping item.
936 *
937 * This function returns rmap_item if success, NULL otherwise.
938 */ 1008 */
939static struct rmap_item *stable_tree_insert(struct page *page, 1009static struct stable_node *stable_tree_insert(struct page *kpage)
940 struct rmap_item *rmap_item)
941{ 1010{
942 struct rb_node **new = &root_stable_tree.rb_node; 1011 struct rb_node **new = &root_stable_tree.rb_node;
943 struct rb_node *parent = NULL; 1012 struct rb_node *parent = NULL;
1013 struct stable_node *stable_node;
944 1014
945 while (*new) { 1015 while (*new) {
946 struct rmap_item *tree_rmap_item, *next_rmap_item;
947 struct page *tree_page; 1016 struct page *tree_page;
948 int ret; 1017 int ret;
949 1018
950 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1019 cond_resched();
951 while (tree_rmap_item) { 1020 stable_node = rb_entry(*new, struct stable_node, node);
952 BUG_ON(!in_stable_tree(tree_rmap_item)); 1021 tree_page = get_ksm_page(stable_node);
953 cond_resched(); 1022 if (!tree_page)
954 tree_page = get_ksm_page(tree_rmap_item);
955 if (tree_page)
956 break;
957 next_rmap_item = tree_rmap_item->next;
958 remove_rmap_item_from_tree(tree_rmap_item);
959 tree_rmap_item = next_rmap_item;
960 }
961 if (!tree_rmap_item)
962 return NULL; 1023 return NULL;
963 1024
964 ret = memcmp_pages(page, tree_page); 1025 ret = memcmp_pages(kpage, tree_page);
965 put_page(tree_page); 1026 put_page(tree_page);
966 1027
967 parent = *new; 1028 parent = *new;
@@ -979,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page,
979 } 1040 }
980 } 1041 }
981 1042
982 rmap_item->address |= NODE_FLAG | STABLE_FLAG; 1043 stable_node = alloc_stable_node();
983 rmap_item->next = NULL; 1044 if (!stable_node)
984 rb_link_node(&rmap_item->node, parent, new); 1045 return NULL;
985 rb_insert_color(&rmap_item->node, &root_stable_tree);
986 1046
987 ksm_pages_shared++; 1047 rb_link_node(&stable_node->node, parent, new);
988 return rmap_item; 1048 rb_insert_color(&stable_node->node, &root_stable_tree);
1049
1050 INIT_HLIST_HEAD(&stable_node->hlist);
1051
1052 stable_node->kpfn = page_to_pfn(kpage);
1053 set_page_stable_node(kpage, stable_node);
1054
1055 return stable_node;
989} 1056}
990 1057
991/* 1058/*
992 * unstable_tree_search_insert - search and insert items into the unstable tree. 1059 * unstable_tree_search_insert - search for identical page,
993 * 1060 * else insert rmap_item into the unstable tree.
994 * @page: the page that we are going to search for identical page or to insert
995 * into the unstable tree
996 * @page2: pointer into identical page that was found inside the unstable tree
997 * @rmap_item: the reverse mapping item of page
998 * 1061 *
999 * This function searches for a page in the unstable tree identical to the 1062 * This function searches for a page in the unstable tree identical to the
1000 * page currently being scanned; and if no identical page is found in the 1063 * page currently being scanned; and if no identical page is found in the
@@ -1006,46 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page,
1006 * This function does both searching and inserting, because they share 1069 * This function does both searching and inserting, because they share
1007 * the same walking algorithm in an rbtree. 1070 * the same walking algorithm in an rbtree.
1008 */ 1071 */
1009static struct rmap_item *unstable_tree_search_insert(struct page *page, 1072static
1010 struct page **page2, 1073struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1011 struct rmap_item *rmap_item) 1074 struct page *page,
1075 struct page **tree_pagep)
1076
1012{ 1077{
1013 struct rb_node **new = &root_unstable_tree.rb_node; 1078 struct rb_node **new = &root_unstable_tree.rb_node;
1014 struct rb_node *parent = NULL; 1079 struct rb_node *parent = NULL;
1015 1080
1016 while (*new) { 1081 while (*new) {
1017 struct rmap_item *tree_rmap_item; 1082 struct rmap_item *tree_rmap_item;
1083 struct page *tree_page;
1018 int ret; 1084 int ret;
1019 1085
1086 cond_resched();
1020 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1087 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1021 page2[0] = get_mergeable_page(tree_rmap_item); 1088 tree_page = get_mergeable_page(tree_rmap_item);
1022 if (!page2[0]) 1089 if (!tree_page)
1023 return NULL; 1090 return NULL;
1024 1091
1025 /* 1092 /*
1026 * Don't substitute an unswappable ksm page 1093 * Don't substitute a ksm page for a forked page.
1027 * just for one good swappable forked page.
1028 */ 1094 */
1029 if (page == page2[0]) { 1095 if (page == tree_page) {
1030 put_page(page2[0]); 1096 put_page(tree_page);
1031 return NULL; 1097 return NULL;
1032 } 1098 }
1033 1099
1034 ret = memcmp_pages(page, page2[0]); 1100 ret = memcmp_pages(page, tree_page);
1035 1101
1036 parent = *new; 1102 parent = *new;
1037 if (ret < 0) { 1103 if (ret < 0) {
1038 put_page(page2[0]); 1104 put_page(tree_page);
1039 new = &parent->rb_left; 1105 new = &parent->rb_left;
1040 } else if (ret > 0) { 1106 } else if (ret > 0) {
1041 put_page(page2[0]); 1107 put_page(tree_page);
1042 new = &parent->rb_right; 1108 new = &parent->rb_right;
1043 } else { 1109 } else {
1110 *tree_pagep = tree_page;
1044 return tree_rmap_item; 1111 return tree_rmap_item;
1045 } 1112 }
1046 } 1113 }
1047 1114
1048 rmap_item->address |= NODE_FLAG; 1115 rmap_item->address |= UNSTABLE_FLAG;
1049 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1116 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1050 rb_link_node(&rmap_item->node, parent, new); 1117 rb_link_node(&rmap_item->node, parent, new);
1051 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1118 rb_insert_color(&rmap_item->node, &root_unstable_tree);
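unstable_tree_search_insert() above relies on the usual rbtree idiom of walking through a pointer-to-link (struct rb_node **new), so that when the search comes up empty the new node can be attached exactly where the walk stopped. A minimal user-space sketch of that search-or-insert walk on an ordinary binary search tree of integers (hypothetical names, and no rebalancing, unlike the kernel's rbtree):

#include <stdlib.h>

struct tnode { int key; struct tnode *left, *right; };

/* Search for key; if absent, allocate and link a new node at the exact
 * position the walk ended on, the same "**new" idiom used above, minus the
 * rb-tree recolouring. */
static struct tnode *search_insert(struct tnode **root, int key)
{
        struct tnode **new = root, *node;

        while (*new) {
                if (key < (*new)->key)
                        new = &(*new)->left;
                else if (key > (*new)->key)
                        new = &(*new)->right;
                else
                        return *new;            /* found an identical entry */
        }

        node = calloc(1, sizeof(*node));
        if (!node)
                return NULL;
        node->key = key;
        *new = node;                            /* link in place of the NULL child */
        return node;
}

int main(void)
{
        struct tnode *root = NULL;

        search_insert(&root, 42);
        search_insert(&root, 17);
        return search_insert(&root, 42)->key == 42 ? 0 : 1;
}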
@@ -1060,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
1060 * the same ksm page. 1127 * the same ksm page.
1061 */ 1128 */
1062static void stable_tree_append(struct rmap_item *rmap_item, 1129static void stable_tree_append(struct rmap_item *rmap_item,
1063 struct rmap_item *tree_rmap_item) 1130 struct stable_node *stable_node)
1064{ 1131{
1065 rmap_item->next = tree_rmap_item->next; 1132 rmap_item->head = stable_node;
1066 rmap_item->prev = tree_rmap_item;
1067
1068 if (tree_rmap_item->next)
1069 tree_rmap_item->next->prev = rmap_item;
1070
1071 tree_rmap_item->next = rmap_item;
1072 rmap_item->address |= STABLE_FLAG; 1133 rmap_item->address |= STABLE_FLAG;
1134 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
1073 1135
1074 ksm_pages_sharing++; 1136 if (rmap_item->hlist.next)
1137 ksm_pages_sharing++;
1138 else
1139 ksm_pages_shared++;
1075} 1140}
1076 1141
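stable_tree_append() above swaps the old prev/next chaining for an hlist hanging off the stable_node, and derives the two counters from list position: the first rmap_item attached to a node is accounted as pages_shared, each later one as pages_sharing. A small user-space sketch of that accounting rule over a singly linked list (illustrative only, not the kernel structures):

#include <stddef.h>
#include <stdio.h>

struct rmap { struct rmap *next; };
struct stable { struct rmap *head; };

static long pages_shared, pages_sharing;

/* Add an rmap entry at the head of the stable node's list; the first entry on
 * a node counts as a shared page, each additional one as a sharer. */
static void stable_append(struct stable *node, struct rmap *item)
{
        item->next = node->head;
        node->head = item;

        if (item->next)
                pages_sharing++;
        else
                pages_shared++;
}

int main(void)
{
        struct stable node = { NULL };
        struct rmap a = { NULL }, b = { NULL }, c = { NULL };

        stable_append(&node, &a);
        stable_append(&node, &b);
        stable_append(&node, &c);
        printf("shared=%ld sharing=%ld\n", pages_shared, pages_sharing);  /* 1 and 2 */
        return 0;
}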
1077/* 1142/*
@@ -1085,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item,
1085 */ 1150 */
1086static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) 1151static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1087{ 1152{
1088 struct page *page2[1];
1089 struct rmap_item *tree_rmap_item; 1153 struct rmap_item *tree_rmap_item;
1154 struct page *tree_page = NULL;
1155 struct stable_node *stable_node;
1156 struct page *kpage;
1090 unsigned int checksum; 1157 unsigned int checksum;
1091 int err; 1158 int err;
1092 1159
1093 if (in_stable_tree(rmap_item)) 1160 remove_rmap_item_from_tree(rmap_item);
1094 remove_rmap_item_from_tree(rmap_item);
1095 1161
1096 /* We first start with searching the page inside the stable tree */ 1162 /* We first start with searching the page inside the stable tree */
1097 tree_rmap_item = stable_tree_search(page, page2, rmap_item); 1163 kpage = stable_tree_search(page);
1098 if (tree_rmap_item) { 1164 if (kpage) {
1099 if (page == page2[0]) /* forked */ 1165 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1100 err = 0;
1101 else
1102 err = try_to_merge_with_ksm_page(rmap_item->mm,
1103 rmap_item->address,
1104 page, page2[0]);
1105 put_page(page2[0]);
1106
1107 if (!err) { 1166 if (!err) {
1108 /* 1167 /*
1109 * The page was successfully merged: 1168 * The page was successfully merged:
1110 * add its rmap_item to the stable tree. 1169 * add its rmap_item to the stable tree.
1111 */ 1170 */
1112 stable_tree_append(rmap_item, tree_rmap_item); 1171 lock_page(kpage);
1172 stable_tree_append(rmap_item, page_stable_node(kpage));
1173 unlock_page(kpage);
1113 } 1174 }
1175 put_page(kpage);
1114 return; 1176 return;
1115 } 1177 }
1116 1178
1117 /* 1179 /*
1118 * A ksm page might have got here by fork, but its other 1180 * If the hash value of the page has changed from the last time
1119 * references have already been removed from the stable tree. 1181 * we calculated it, this page is changing frequently: therefore we
1120 * Or it might be left over from a break_ksm which failed 1182 * don't want to insert it in the unstable tree, and we don't want
1121 * when the mem_cgroup had reached its limit: try again now. 1183 * to waste our time searching for something identical to it there.
1122 */
1123 if (PageKsm(page))
1124 break_cow(rmap_item->mm, rmap_item->address);
1125
1126 /*
1127 * In case the hash value of the page was changed from the last time we
1128 * have calculated it, this page to be changed frequely, therefore we
1129 * don't want to insert it to the unstable tree, and we don't want to
1130 * waste our time to search if there is something identical to it there.
1131 */ 1184 */
1132 checksum = calc_checksum(page); 1185 checksum = calc_checksum(page);
1133 if (rmap_item->oldchecksum != checksum) { 1186 if (rmap_item->oldchecksum != checksum) {
@@ -1135,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1135 return; 1188 return;
1136 } 1189 }
1137 1190
1138 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); 1191 tree_rmap_item =
1192 unstable_tree_search_insert(rmap_item, page, &tree_page);
1139 if (tree_rmap_item) { 1193 if (tree_rmap_item) {
1140 err = try_to_merge_two_pages(rmap_item->mm, 1194 kpage = try_to_merge_two_pages(rmap_item, page,
1141 rmap_item->address, page, 1195 tree_rmap_item, tree_page);
1142 tree_rmap_item->mm, 1196 put_page(tree_page);
1143 tree_rmap_item->address, page2[0]);
1144 /* 1197 /*
1145 * As soon as we merge this page, we want to remove the 1198 * As soon as we merge this page, we want to remove the
1146 * rmap_item of the page we have merged with from the unstable 1199 * rmap_item of the page we have merged with from the unstable
1147 * tree, and insert it instead as new node in the stable tree. 1200 * tree, and insert it instead as new node in the stable tree.
1148 */ 1201 */
1149 if (!err) { 1202 if (kpage) {
1150 rb_erase(&tree_rmap_item->node, &root_unstable_tree); 1203 remove_rmap_item_from_tree(tree_rmap_item);
1151 tree_rmap_item->address &= ~NODE_FLAG; 1204
1152 ksm_pages_unshared--; 1205 lock_page(kpage);
1206 stable_node = stable_tree_insert(kpage);
1207 if (stable_node) {
1208 stable_tree_append(tree_rmap_item, stable_node);
1209 stable_tree_append(rmap_item, stable_node);
1210 }
1211 unlock_page(kpage);
1153 1212
1154 /* 1213 /*
1155 * If we fail to insert the page into the stable tree, 1214 * If we fail to insert the page into the stable tree,
@@ -1157,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1157 * to a ksm page left outside the stable tree, 1216 * to a ksm page left outside the stable tree,
1158 * in which case we need to break_cow on both. 1217 * in which case we need to break_cow on both.
1159 */ 1218 */
1160 if (stable_tree_insert(page2[0], tree_rmap_item)) 1219 if (!stable_node) {
1161 stable_tree_append(rmap_item, tree_rmap_item); 1220 break_cow(tree_rmap_item);
1162 else { 1221 break_cow(rmap_item);
1163 break_cow(tree_rmap_item->mm,
1164 tree_rmap_item->address);
1165 break_cow(rmap_item->mm, rmap_item->address);
1166 } 1222 }
1167 } 1223 }
1168
1169 put_page(page2[0]);
1170 } 1224 }
1171} 1225}
1172 1226
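One detail of cmp_and_merge_page() above is easy to skim past: before going anywhere near the unstable tree it recomputes the page checksum, and if the value differs from the one recorded on the previous scan it only stores the new value and gives up, so frequently changing pages never enter the tree. A hedged user-space sketch of that throttle (the checksum and all names are placeholders, not the kernel's calc_checksum()):

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

/* Toy 32-bit checksum standing in for calc_checksum(). */
static uint32_t checksum(const unsigned char *data, size_t len)
{
        uint32_t sum = 0;

        for (size_t i = 0; i < len; i++)
                sum = sum * 31 + data[i];
        return sum;
}

struct tracked_page {
        const unsigned char *data;
        size_t len;
        uint32_t oldchecksum;
};

/* Return true only when the content was stable across two scans, i.e. when it
 * is worth the cost of searching or inserting in the unstable tree. */
static bool stable_enough_to_merge(struct tracked_page *p)
{
        uint32_t sum = checksum(p->data, p->len);

        if (p->oldchecksum != sum) {
                p->oldchecksum = sum;   /* remember, try again next scan */
                return false;
        }
        return true;
}

int main(void)
{
        static unsigned char buf[64] = "constant content";
        struct tracked_page p = { buf, sizeof(buf), 0 };

        stable_enough_to_merge(&p);                  /* first scan: just records */
        return stable_enough_to_merge(&p) ? 0 : 1;   /* second scan: stable */
}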
1173static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, 1227static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1174 struct list_head *cur, 1228 struct rmap_item **rmap_list,
1175 unsigned long addr) 1229 unsigned long addr)
1176{ 1230{
1177 struct rmap_item *rmap_item; 1231 struct rmap_item *rmap_item;
1178 1232
1179 while (cur != &mm_slot->rmap_list) { 1233 while (*rmap_list) {
1180 rmap_item = list_entry(cur, struct rmap_item, link); 1234 rmap_item = *rmap_list;
1181 if ((rmap_item->address & PAGE_MASK) == addr) { 1235 if ((rmap_item->address & PAGE_MASK) == addr)
1182 if (!in_stable_tree(rmap_item))
1183 remove_rmap_item_from_tree(rmap_item);
1184 return rmap_item; 1236 return rmap_item;
1185 }
1186 if (rmap_item->address > addr) 1237 if (rmap_item->address > addr)
1187 break; 1238 break;
1188 cur = cur->next; 1239 *rmap_list = rmap_item->rmap_list;
1189 remove_rmap_item_from_tree(rmap_item); 1240 remove_rmap_item_from_tree(rmap_item);
1190 list_del(&rmap_item->link);
1191 free_rmap_item(rmap_item); 1241 free_rmap_item(rmap_item);
1192 } 1242 }
1193 1243
@@ -1196,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1196 /* It has already been zeroed */ 1246 /* It has already been zeroed */
1197 rmap_item->mm = mm_slot->mm; 1247 rmap_item->mm = mm_slot->mm;
1198 rmap_item->address = addr; 1248 rmap_item->address = addr;
1199 list_add_tail(&rmap_item->link, cur); 1249 rmap_item->rmap_list = *rmap_list;
1250 *rmap_list = rmap_item;
1200 } 1251 }
1201 return rmap_item; 1252 return rmap_item;
1202} 1253}
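The rewritten get_next_rmap_item() above walks the per-mm list through a struct rmap_item **rmap_list cursor, which lets it unlink and free stale entries without keeping a prev pointer. The same pointer-to-pointer idiom in a stand-alone C sketch (a generic list of addresses; the names are invented):

#include <stdlib.h>

struct item { unsigned long addr; struct item *next; };

/* Advance through *listp until an item at addr is found or the list is
 * positioned past addr; every stale entry below addr is unlinked and freed on
 * the way, mirroring the **rmap_list pattern above. */
static struct item *get_or_create(struct item **listp, unsigned long addr)
{
        struct item *it;

        while (*listp) {
                it = *listp;
                if (it->addr == addr)
                        return it;
                if (it->addr > addr)
                        break;
                *listp = it->next;      /* unlink the stale entry */
                free(it);
        }

        it = calloc(1, sizeof(*it));
        if (it) {
                it->addr = addr;
                it->next = *listp;      /* splice in front of the cursor */
                *listp = it;
        }
        return it;
}

int main(void)
{
        struct item *list = NULL, **cursor = &list;
        struct item *it;

        it = get_or_create(cursor, 0x1000);
        if (!it)
                return 1;
        cursor = &it->next;     /* the scanner moves on, like ksm_scan.rmap_list */
        it = get_or_create(cursor, 0x3000);
        return (it && list->addr == 0x1000 && it->addr == 0x3000) ? 0 : 1;
}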
@@ -1221,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1221 spin_unlock(&ksm_mmlist_lock); 1272 spin_unlock(&ksm_mmlist_lock);
1222next_mm: 1273next_mm:
1223 ksm_scan.address = 0; 1274 ksm_scan.address = 0;
1224 ksm_scan.rmap_item = list_entry(&slot->rmap_list, 1275 ksm_scan.rmap_list = &slot->rmap_list;
1225 struct rmap_item, link);
1226 } 1276 }
1227 1277
1228 mm = slot->mm; 1278 mm = slot->mm;
@@ -1248,10 +1298,10 @@ next_mm:
1248 flush_anon_page(vma, *page, ksm_scan.address); 1298 flush_anon_page(vma, *page, ksm_scan.address);
1249 flush_dcache_page(*page); 1299 flush_dcache_page(*page);
1250 rmap_item = get_next_rmap_item(slot, 1300 rmap_item = get_next_rmap_item(slot,
1251 ksm_scan.rmap_item->link.next, 1301 ksm_scan.rmap_list, ksm_scan.address);
1252 ksm_scan.address);
1253 if (rmap_item) { 1302 if (rmap_item) {
1254 ksm_scan.rmap_item = rmap_item; 1303 ksm_scan.rmap_list =
1304 &rmap_item->rmap_list;
1255 ksm_scan.address += PAGE_SIZE; 1305 ksm_scan.address += PAGE_SIZE;
1256 } else 1306 } else
1257 put_page(*page); 1307 put_page(*page);
@@ -1267,14 +1317,13 @@ next_mm:
1267 1317
1268 if (ksm_test_exit(mm)) { 1318 if (ksm_test_exit(mm)) {
1269 ksm_scan.address = 0; 1319 ksm_scan.address = 0;
1270 ksm_scan.rmap_item = list_entry(&slot->rmap_list, 1320 ksm_scan.rmap_list = &slot->rmap_list;
1271 struct rmap_item, link);
1272 } 1321 }
1273 /* 1322 /*
1274 * Nuke all the rmap_items that are above this current rmap: 1323 * Nuke all the rmap_items that are above this current rmap:
1275 * because there were no VM_MERGEABLE vmas with such addresses. 1324 * because there were no VM_MERGEABLE vmas with such addresses.
1276 */ 1325 */
1277 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); 1326 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
1278 1327
1279 spin_lock(&ksm_mmlist_lock); 1328 spin_lock(&ksm_mmlist_lock);
1280 ksm_scan.mm_slot = list_entry(slot->mm_list.next, 1329 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -1327,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages)
1327 return; 1376 return;
1328 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1377 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1329 cmp_and_merge_page(page, rmap_item); 1378 cmp_and_merge_page(page, rmap_item);
1330 else if (page_mapcount(page) == 1) {
1331 /*
1332 * Replace now-unshared ksm page by ordinary page.
1333 */
1334 break_cow(rmap_item->mm, rmap_item->address);
1335 remove_rmap_item_from_tree(rmap_item);
1336 rmap_item->oldchecksum = calc_checksum(page);
1337 }
1338 put_page(page); 1379 put_page(page);
1339 } 1380 }
1340} 1381}
@@ -1379,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1379 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1420 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1380 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1421 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1381 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1422 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1382 VM_MIXEDMAP | VM_SAO)) 1423 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1383 return 0; /* just ignore the advice */ 1424 return 0; /* just ignore the advice */
1384 1425
1385 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1426 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1456,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm)
1456 spin_lock(&ksm_mmlist_lock); 1497 spin_lock(&ksm_mmlist_lock);
1457 mm_slot = get_mm_slot(mm); 1498 mm_slot = get_mm_slot(mm);
1458 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1499 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1459 if (list_empty(&mm_slot->rmap_list)) { 1500 if (!mm_slot->rmap_list) {
1460 hlist_del(&mm_slot->link); 1501 hlist_del(&mm_slot->link);
1461 list_del(&mm_slot->mm_list); 1502 list_del(&mm_slot->mm_list);
1462 easy_to_free = 1; 1503 easy_to_free = 1;
@@ -1477,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm)
1477 } 1518 }
1478} 1519}
1479 1520
1521struct page *ksm_does_need_to_copy(struct page *page,
1522 struct vm_area_struct *vma, unsigned long address)
1523{
1524 struct page *new_page;
1525
1526 unlock_page(page); /* any racers will COW it, not modify it */
1527
1528 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1529 if (new_page) {
1530 copy_user_highpage(new_page, page, address, vma);
1531
1532 SetPageDirty(new_page);
1533 __SetPageUptodate(new_page);
1534 SetPageSwapBacked(new_page);
1535 __set_page_locked(new_page);
1536
1537 if (page_evictable(new_page, vma))
1538 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1539 else
1540 add_page_to_unevictable_list(new_page);
1541 }
1542
1543 page_cache_release(page);
1544 return new_page;
1545}
1546
1547int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1548 unsigned long *vm_flags)
1549{
1550 struct stable_node *stable_node;
1551 struct rmap_item *rmap_item;
1552 struct hlist_node *hlist;
1553 unsigned int mapcount = page_mapcount(page);
1554 int referenced = 0;
1555 int search_new_forks = 0;
1556
1557 VM_BUG_ON(!PageKsm(page));
1558 VM_BUG_ON(!PageLocked(page));
1559
1560 stable_node = page_stable_node(page);
1561 if (!stable_node)
1562 return 0;
1563again:
1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1565 struct anon_vma *anon_vma = rmap_item->anon_vma;
1566 struct vm_area_struct *vma;
1567
1568 spin_lock(&anon_vma->lock);
1569 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1570 if (rmap_item->address < vma->vm_start ||
1571 rmap_item->address >= vma->vm_end)
1572 continue;
1573 /*
1574 * Initially we examine only the vma which covers this
1575 * rmap_item; but later, if there is still work to do,
1576 * we examine covering vmas in other mms: in case they
1577 * were forked from the original since ksmd passed.
1578 */
1579 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1580 continue;
1581
1582 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
1583 continue;
1584
1585 referenced += page_referenced_one(page, vma,
1586 rmap_item->address, &mapcount, vm_flags);
1587 if (!search_new_forks || !mapcount)
1588 break;
1589 }
1590 spin_unlock(&anon_vma->lock);
1591 if (!mapcount)
1592 goto out;
1593 }
1594 if (!search_new_forks++)
1595 goto again;
1596out:
1597 return referenced;
1598}
1599
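page_referenced_ksm() above, and the try_to_unmap_ksm() and rmap_walk_ksm() variants below, all reuse one walking trick: each rmap_item is first matched only against the mm it was scanned in, and a second pass picks up covering vmas in forked mms, stopping early once the remaining work (e.g. mapcount) runs out. A rough stand-alone sketch of that two-pass filter, reduced to the (item->mm == vma->vm_mm) == search_new_forks test (the data model is invented):

#include <stdio.h>

struct vma  { int mm; };                /* which address space the vma lives in */
struct item { int mm; };                /* mm the rmap_item was scanned in */

static int visit_all(const struct item *it,
                     const struct vma *vmas, int nvmas, int budget)
{
        int visited = 0;
        int search_new_forks = 0;

again:
        for (int i = 0; i < nvmas; i++) {
                /* Pass 0: only the vma in the original mm.
                 * Pass 1: only vmas in other (forked) mms. */
                if ((it->mm == vmas[i].mm) == search_new_forks)
                        continue;
                visited++;
                if (--budget == 0)
                        return visited;         /* like !mapcount: nothing left */
        }
        if (!search_new_forks++)
                goto again;
        return visited;
}

int main(void)
{
        struct item it = { .mm = 1 };
        struct vma vmas[] = { { 1 }, { 2 }, { 3 } };

        printf("visited %d vmas\n", visit_all(&it, vmas, 3, 10));   /* 3 */
        return 0;
}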
1600int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1601{
1602 struct stable_node *stable_node;
1603 struct hlist_node *hlist;
1604 struct rmap_item *rmap_item;
1605 int ret = SWAP_AGAIN;
1606 int search_new_forks = 0;
1607
1608 VM_BUG_ON(!PageKsm(page));
1609 VM_BUG_ON(!PageLocked(page));
1610
1611 stable_node = page_stable_node(page);
1612 if (!stable_node)
1613 return SWAP_FAIL;
1614again:
1615 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1616 struct anon_vma *anon_vma = rmap_item->anon_vma;
1617 struct vm_area_struct *vma;
1618
1619 spin_lock(&anon_vma->lock);
1620 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1621 if (rmap_item->address < vma->vm_start ||
1622 rmap_item->address >= vma->vm_end)
1623 continue;
1624 /*
1625 * Initially we examine only the vma which covers this
1626 * rmap_item; but later, if there is still work to do,
1627 * we examine covering vmas in other mms: in case they
1628 * were forked from the original since ksmd passed.
1629 */
1630 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1631 continue;
1632
1633 ret = try_to_unmap_one(page, vma,
1634 rmap_item->address, flags);
1635 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1636 spin_unlock(&anon_vma->lock);
1637 goto out;
1638 }
1639 }
1640 spin_unlock(&anon_vma->lock);
1641 }
1642 if (!search_new_forks++)
1643 goto again;
1644out:
1645 return ret;
1646}
1647
1648#ifdef CONFIG_MIGRATION
1649int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1650 struct vm_area_struct *, unsigned long, void *), void *arg)
1651{
1652 struct stable_node *stable_node;
1653 struct hlist_node *hlist;
1654 struct rmap_item *rmap_item;
1655 int ret = SWAP_AGAIN;
1656 int search_new_forks = 0;
1657
1658 VM_BUG_ON(!PageKsm(page));
1659 VM_BUG_ON(!PageLocked(page));
1660
1661 stable_node = page_stable_node(page);
1662 if (!stable_node)
1663 return ret;
1664again:
1665 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1666 struct anon_vma *anon_vma = rmap_item->anon_vma;
1667 struct vm_area_struct *vma;
1668
1669 spin_lock(&anon_vma->lock);
1670 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1671 if (rmap_item->address < vma->vm_start ||
1672 rmap_item->address >= vma->vm_end)
1673 continue;
1674 /*
1675 * Initially we examine only the vma which covers this
1676 * rmap_item; but later, if there is still work to do,
1677 * we examine covering vmas in other mms: in case they
1678 * were forked from the original since ksmd passed.
1679 */
1680 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1681 continue;
1682
1683 ret = rmap_one(page, vma, rmap_item->address, arg);
1684 if (ret != SWAP_AGAIN) {
1685 spin_unlock(&anon_vma->lock);
1686 goto out;
1687 }
1688 }
1689 spin_unlock(&anon_vma->lock);
1690 }
1691 if (!search_new_forks++)
1692 goto again;
1693out:
1694 return ret;
1695}
1696
1697void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1698{
1699 struct stable_node *stable_node;
1700
1701 VM_BUG_ON(!PageLocked(oldpage));
1702 VM_BUG_ON(!PageLocked(newpage));
1703 VM_BUG_ON(newpage->mapping != oldpage->mapping);
1704
1705 stable_node = page_stable_node(newpage);
1706 if (stable_node) {
1707 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1708 stable_node->kpfn = page_to_pfn(newpage);
1709 }
1710}
1711#endif /* CONFIG_MIGRATION */
1712
1713#ifdef CONFIG_MEMORY_HOTREMOVE
1714static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
1715 unsigned long end_pfn)
1716{
1717 struct rb_node *node;
1718
1719 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
1720 struct stable_node *stable_node;
1721
1722 stable_node = rb_entry(node, struct stable_node, node);
1723 if (stable_node->kpfn >= start_pfn &&
1724 stable_node->kpfn < end_pfn)
1725 return stable_node;
1726 }
1727 return NULL;
1728}
1729
1730static int ksm_memory_callback(struct notifier_block *self,
1731 unsigned long action, void *arg)
1732{
1733 struct memory_notify *mn = arg;
1734 struct stable_node *stable_node;
1735
1736 switch (action) {
1737 case MEM_GOING_OFFLINE:
1738 /*
1739 * Keep it very simple for now: just lock out ksmd and
1740 * MADV_UNMERGEABLE while any memory is going offline.
1741 */
1742 mutex_lock(&ksm_thread_mutex);
1743 break;
1744
1745 case MEM_OFFLINE:
1746 /*
1747 * Most of the work is done by page migration; but there might
1748 * be a few stable_nodes left over, still pointing to struct
1749 * pages which have been offlined: prune those from the tree.
1750 */
1751 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
1752 mn->start_pfn + mn->nr_pages)) != NULL)
1753 remove_node_from_stable_tree(stable_node);
1754 /* fallthrough */
1755
1756 case MEM_CANCEL_OFFLINE:
1757 mutex_unlock(&ksm_thread_mutex);
1758 break;
1759 }
1760 return NOTIFY_OK;
1761}
1762#endif /* CONFIG_MEMORY_HOTREMOVE */
1763
1480#ifdef CONFIG_SYSFS 1764#ifdef CONFIG_SYSFS
1481/* 1765/*
1482 * This all compiles without CONFIG_SYSFS, but is a waste of space. 1766 * This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -1555,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1555 /* 1839 /*
1556 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 1840 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1557 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 1841 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1558 * breaking COW to free the unswappable pages_shared (but leaves 1842 * breaking COW to free the pages_shared (but leaves mm_slots
1559 * mm_slots on the list for when ksmd may be set running again). 1843 * on the list for when ksmd may be set running again).
1560 */ 1844 */
1561 1845
1562 mutex_lock(&ksm_thread_mutex); 1846 mutex_lock(&ksm_thread_mutex);
@@ -1581,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1581} 1865}
1582KSM_ATTR(run); 1866KSM_ATTR(run);
1583 1867
1584static ssize_t max_kernel_pages_store(struct kobject *kobj,
1585 struct kobj_attribute *attr,
1586 const char *buf, size_t count)
1587{
1588 int err;
1589 unsigned long nr_pages;
1590
1591 err = strict_strtoul(buf, 10, &nr_pages);
1592 if (err)
1593 return -EINVAL;
1594
1595 ksm_max_kernel_pages = nr_pages;
1596
1597 return count;
1598}
1599
1600static ssize_t max_kernel_pages_show(struct kobject *kobj,
1601 struct kobj_attribute *attr, char *buf)
1602{
1603 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1604}
1605KSM_ATTR(max_kernel_pages);
1606
1607static ssize_t pages_shared_show(struct kobject *kobj, 1868static ssize_t pages_shared_show(struct kobject *kobj,
1608 struct kobj_attribute *attr, char *buf) 1869 struct kobj_attribute *attr, char *buf)
1609{ 1870{
@@ -1653,7 +1914,6 @@ static struct attribute *ksm_attrs[] = {
1653 &sleep_millisecs_attr.attr, 1914 &sleep_millisecs_attr.attr,
1654 &pages_to_scan_attr.attr, 1915 &pages_to_scan_attr.attr,
1655 &run_attr.attr, 1916 &run_attr.attr,
1656 &max_kernel_pages_attr.attr,
1657 &pages_shared_attr.attr, 1917 &pages_shared_attr.attr,
1658 &pages_sharing_attr.attr, 1918 &pages_sharing_attr.attr,
1659 &pages_unshared_attr.attr, 1919 &pages_unshared_attr.attr,
@@ -1673,8 +1933,6 @@ static int __init ksm_init(void)
1673 struct task_struct *ksm_thread; 1933 struct task_struct *ksm_thread;
1674 int err; 1934 int err;
1675 1935
1676 ksm_init_max_kernel_pages();
1677
1678 err = ksm_slab_init(); 1936 err = ksm_slab_init();
1679 if (err) 1937 if (err)
1680 goto out; 1938 goto out;
@@ -1697,8 +1955,18 @@ static int __init ksm_init(void)
1697 kthread_stop(ksm_thread); 1955 kthread_stop(ksm_thread);
1698 goto out_free2; 1956 goto out_free2;
1699 } 1957 }
1958#else
1959 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
1960
1700#endif /* CONFIG_SYSFS */ 1961#endif /* CONFIG_SYSFS */
1701 1962
1963#ifdef CONFIG_MEMORY_HOTREMOVE
1964 /*
1965 * Choose a high priority since the callback takes ksm_thread_mutex:
1966 * later callbacks could only be taking locks which nest within that.
1967 */
1968 hotplug_memory_notifier(ksm_memory_callback, 100);
1969#endif
1702 return 0; 1970 return 0;
1703 1971
1704out_free2: 1972out_free2:
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
12#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/ksm.h> 15#include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
222/* 223/*
223 * Error injection support for memory error handling. 224 * Error injection support for memory error handling.
224 */ 225 */
225static int madvise_hwpoison(unsigned long start, unsigned long end) 226static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226{ 227{
227 int ret = 0; 228 int ret = 0;
228 229
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
230 return -EPERM; 231 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) { 232 for (; start < end; start += PAGE_SIZE) {
232 struct page *p; 233 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1, 234 int ret = get_user_pages_fast(start, 1, 0, &p);
234 0, 0, &p, NULL);
235 if (ret != 1) 235 if (ret != 1)
236 return ret; 236 return ret;
237 if (bhv == MADV_SOFT_OFFLINE) {
238 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
239 page_to_pfn(p), start);
240 ret = soft_offline_page(p, MF_COUNT_INCREASED);
241 if (ret)
242 break;
243 continue;
244 }
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start); 246 page_to_pfn(p), start);
239 /* Ignore return value for now */ 247 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1); 248 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
241 put_page(p);
242 } 249 }
243 return ret; 250 return ret;
244} 251}
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
335 size_t len; 342 size_t len;
336 343
337#ifdef CONFIG_MEMORY_FAILURE 344#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON) 345 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
339 return madvise_hwpoison(start, start+len_in); 346 return madvise_hwpoison(behavior, start, start+len_in);
340#endif 347#endif
341 if (!madvise_behavior_valid(behavior)) 348 if (!madvise_behavior_valid(behavior))
342 return error; 349 return error;
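The madvise.c hunk above routes the new MADV_SOFT_OFFLINE behaviour, next to MADV_HWPOISON, into the error-injection path. From user space the injection looks roughly like the sketch below; it assumes CONFIG_MEMORY_FAILURE, sufficient privilege, and the madvise constants of this kernel series (defined locally in case the libc headers predate them), so treat it as illustrative rather than a test recipe:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100       /* assumed value from asm-generic/mman-common.h */
#endif
#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101   /* assumed value from asm-generic/mman-common.h */
#endif

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        void *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0xaa, page);

        /* Ask the kernel to migrate the data away and take the backing page
         * out of service; needs privilege and CONFIG_MEMORY_FAILURE. */
        if (madvise(p, page, MADV_SOFT_OFFLINE) != 0)
                perror("madvise(MADV_SOFT_OFFLINE)");
        else
                printf("page soft-offlined\n");
        return 0;
}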
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..488b644e0e8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mm_inline.h> 39#include <linux/mm_inline.h>
40#include <linux/page_cgroup.h> 40#include <linux/page_cgroup.h>
41#include <linux/cpu.h>
41#include "internal.h" 42#include "internal.h"
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
54#define do_swap_account (0) 55#define do_swap_account (0)
55#endif 56#endif
56 57
57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000) 58#define SOFTLIMIT_EVENTS_THRESH (1000)
59 59
60/* 60/*
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index {
66 */ 66 */
67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ 72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
@@ -209,7 +209,7 @@ struct mem_cgroup {
209 int prev_priority; /* for recording reclaim priority */ 209 int prev_priority; /* for recording reclaim priority */
210 210
211 /* 211 /*
212 * While reclaiming in a hiearchy, we cache the last child we 212 * While reclaiming in a hierarchy, we cache the last child we
213 * reclaimed from. 213 * reclaimed from.
214 */ 214 */
215 int last_scanned_child; 215 int last_scanned_child;
@@ -275,6 +275,7 @@ enum charge_type {
275static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
276static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
278static void drain_all_stock_async(void);
278 279
279static struct mem_cgroup_per_zone * 280static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 281mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283} 284}
284 285
286struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
287{
288 return &mem->css;
289}
290
285static struct mem_cgroup_per_zone * 291static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc) 292page_cgroup_zoneinfo(struct page_cgroup *pc)
287{ 293{
@@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
758 task_unlock(task); 764 task_unlock(task);
759 if (!curr) 765 if (!curr)
760 return 0; 766 return 0;
761 if (curr->use_hierarchy) 767 /*
 768 * We should check use_hierarchy of "mem", not "curr". Checking
 769 * use_hierarchy of "curr" here would make this function return true when
 770 * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
 771 * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
772 */
773 if (mem->use_hierarchy)
762 ret = css_is_ancestor(&curr->css, &mem->css); 774 ret = css_is_ancestor(&curr->css, &mem->css);
763 else 775 else
764 ret = (curr == mem); 776 ret = (curr == mem);
@@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1007 static char memcg_name[PATH_MAX]; 1019 static char memcg_name[PATH_MAX];
1008 int ret; 1020 int ret;
1009 1021
1010 if (!memcg) 1022 if (!memcg || !p)
1011 return; 1023 return;
1012 1024
1013 1025
@@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1137 victim = mem_cgroup_select_victim(root_mem); 1149 victim = mem_cgroup_select_victim(root_mem);
1138 if (victim == root_mem) { 1150 if (victim == root_mem) {
1139 loop++; 1151 loop++;
1152 if (loop >= 1)
1153 drain_all_stock_async();
1140 if (loop >= 2) { 1154 if (loop >= 2) {
1141 /* 1155 /*
1142 * If we have not been able to reclaim 1156 * If we have not been able to reclaim
@@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem)
1223 * Currently used to update mapped file statistics, but the routine can be 1237 * Currently used to update mapped file statistics, but the routine can be
1224 * generalized to update other statistics as well. 1238 * generalized to update other statistics as well.
1225 */ 1239 */
1226void mem_cgroup_update_mapped_file_stat(struct page *page, int val) 1240void mem_cgroup_update_file_mapped(struct page *page, int val)
1227{ 1241{
1228 struct mem_cgroup *mem; 1242 struct mem_cgroup *mem;
1229 struct mem_cgroup_stat *stat; 1243 struct mem_cgroup_stat *stat;
@@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
1231 int cpu; 1245 int cpu;
1232 struct page_cgroup *pc; 1246 struct page_cgroup *pc;
1233 1247
1234 if (!page_is_file_cache(page))
1235 return;
1236
1237 pc = lookup_page_cgroup(page); 1248 pc = lookup_page_cgroup(page);
1238 if (unlikely(!pc)) 1249 if (unlikely(!pc))
1239 return; 1250 return;
@@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
1253 stat = &mem->stat; 1264 stat = &mem->stat;
1254 cpustat = &stat->cpustat[cpu]; 1265 cpustat = &stat->cpustat[cpu];
1255 1266
1256 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); 1267 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1257done: 1268done:
1258 unlock_page_cgroup(pc); 1269 unlock_page_cgroup(pc);
1259} 1270}
1260 1271
1261/* 1272/*
 1273 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
 1274 * TODO: larger batches may be necessary on big-iron machines.
1275 */
1276#define CHARGE_SIZE (32 * PAGE_SIZE)
1277struct memcg_stock_pcp {
1278 struct mem_cgroup *cached; /* this never be root cgroup */
1279 int charge;
1280 struct work_struct work;
1281};
1282static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1283static atomic_t memcg_drain_count;
1284
1285/*
 1286 * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed
 1287 * from the local stock and true is returned. If the stock is empty, or holds
 1288 * charges from a cgroup other than the current target, false is returned and
 1289 * the stock will be refilled.
1290 */
1291static bool consume_stock(struct mem_cgroup *mem)
1292{
1293 struct memcg_stock_pcp *stock;
1294 bool ret = true;
1295
1296 stock = &get_cpu_var(memcg_stock);
1297 if (mem == stock->cached && stock->charge)
1298 stock->charge -= PAGE_SIZE;
1299 else /* need to call res_counter_charge */
1300 ret = false;
1301 put_cpu_var(memcg_stock);
1302 return ret;
1303}
1304
1305/*
 1306 * Return the charges cached in the percpu stock to the res_counter and reset it.
1307 */
1308static void drain_stock(struct memcg_stock_pcp *stock)
1309{
1310 struct mem_cgroup *old = stock->cached;
1311
1312 if (stock->charge) {
1313 res_counter_uncharge(&old->res, stock->charge);
1314 if (do_swap_account)
1315 res_counter_uncharge(&old->memsw, stock->charge);
1316 }
1317 stock->cached = NULL;
1318 stock->charge = 0;
1319}
1320
1321/*
 1322 * This must be called with preemption disabled, or by a thread
 1323 * which is pinned to the local cpu.
1324 */
1325static void drain_local_stock(struct work_struct *dummy)
1326{
1327 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1328 drain_stock(stock);
1329}
1330
1331/*
 1332 * Cache charges (val) taken from the res_counter in the local per-cpu area,
 1333 * to be consumed by consume_stock() later.
1334 */
1335static void refill_stock(struct mem_cgroup *mem, int val)
1336{
1337 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1338
1339 if (stock->cached != mem) { /* reset if necessary */
1340 drain_stock(stock);
1341 stock->cached = mem;
1342 }
1343 stock->charge += val;
1344 put_cpu_var(memcg_stock);
1345}
1346
1347/*
 1348 * Tries to drain stocked charges on other cpus. This function is asynchronous
 1349 * and just schedules a work item per cpu to drain locally on that cpu. Callers
 1350 * can expect some charges to come back to the res_counter later, but cannot
 1351 * wait for that to happen.
1352 */
1353static void drain_all_stock_async(void)
1354{
1355 int cpu;
 1356 /* This function schedules the "drain" asynchronously; the result of the
 1357 * drain is not directly handled by callers. So if someone is already
 1358 * draining, we don't need to schedule another drain. In any case, the
 1359 * WORK_STRUCT_PENDING check in queue_work_on() will catch a race, so a
 1360 * loose check is enough here.
 1361 */
1362 if (atomic_read(&memcg_drain_count))
1363 return;
1364 /* Notify other cpus that system-wide "drain" is running */
1365 atomic_inc(&memcg_drain_count);
1366 get_online_cpus();
1367 for_each_online_cpu(cpu) {
1368 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1369 schedule_work_on(cpu, &stock->work);
1370 }
1371 put_online_cpus();
1372 atomic_dec(&memcg_drain_count);
1373 /* We don't wait for flush_work */
1374}
1375
1376/* This is a synchronous drain interface. */
1377static void drain_all_stock_sync(void)
1378{
1379 /* called when force_empty is called */
1380 atomic_inc(&memcg_drain_count);
1381 schedule_on_each_cpu(drain_local_stock);
1382 atomic_dec(&memcg_drain_count);
1383}
1384
1385static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1386 unsigned long action,
1387 void *hcpu)
1388{
1389 int cpu = (unsigned long)hcpu;
1390 struct memcg_stock_pcp *stock;
1391
1392 if (action != CPU_DEAD)
1393 return NOTIFY_OK;
1394 stock = &per_cpu(memcg_stock, cpu);
1395 drain_stock(stock);
1396 return NOTIFY_OK;
1397}
1398
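The memcg_stock machinery added above is a batching trick: a charger pulls CHARGE_SIZE (32 pages) from the res_counter in one go and parks the surplus in a per-cpu stock, so most charges touch only local state, and the drain paths hand unused surplus back. Below is a hedged user-space analogue using a per-thread cache in front of a shared atomic counter (C11 atomics and thread-local storage; every name is invented, it shows the idea rather than the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE   4096
#define CHARGE_SIZE (32 * PAGE_SIZE)    /* batch taken from the global counter */

static atomic_long charged;             /* stands in for the res_counter */
static _Thread_local long stock;        /* stands in for memcg_stock_pcp.charge */

/* Charge one page: consume the local stock when possible, otherwise pull a
 * whole batch from the shared counter and keep the surplus locally. */
static bool charge_one_page(long limit)
{
        if (stock >= PAGE_SIZE) {               /* consume_stock() */
                stock -= PAGE_SIZE;
                return true;
        }
        if (atomic_fetch_add(&charged, CHARGE_SIZE) + CHARGE_SIZE > limit) {
                atomic_fetch_sub(&charged, CHARGE_SIZE);
                return false;                   /* over limit: would reclaim here */
        }
        stock += CHARGE_SIZE - PAGE_SIZE;       /* refill_stock() */
        return true;
}

/* Hand any locally cached surplus back to the shared counter (drain_stock()). */
static void drain_local(void)
{
        atomic_fetch_sub(&charged, stock);
        stock = 0;
}

int main(void)
{
        long limit = 1024 * PAGE_SIZE;

        for (int i = 0; i < 100; i++)
                charge_one_page(limit);
        drain_local();
        printf("charged %ld bytes for 100 pages\n",
               (long)atomic_load(&charged));    /* 100 * PAGE_SIZE */
        return 0;
}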
1399/*
1262 * Unlike exported interface, "oom" parameter is added. if oom==true, 1400 * Unlike exported interface, "oom" parameter is added. if oom==true,
1263 * oom-killer can be invoked. 1401 * oom-killer can be invoked.
1264 */ 1402 */
@@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1269 struct mem_cgroup *mem, *mem_over_limit; 1407 struct mem_cgroup *mem, *mem_over_limit;
1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1408 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1271 struct res_counter *fail_res; 1409 struct res_counter *fail_res;
1410 int csize = CHARGE_SIZE;
1272 1411
1273 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1412 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1274 /* Don't account this! */ 1413 /* Don't account this! */
@@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1293 return 0; 1432 return 0;
1294 1433
1295 VM_BUG_ON(css_is_removed(&mem->css)); 1434 VM_BUG_ON(css_is_removed(&mem->css));
1435 if (mem_cgroup_is_root(mem))
1436 goto done;
1296 1437
1297 while (1) { 1438 while (1) {
1298 int ret = 0; 1439 int ret = 0;
1299 unsigned long flags = 0; 1440 unsigned long flags = 0;
1300 1441
1301 if (mem_cgroup_is_root(mem)) 1442 if (consume_stock(mem))
1302 goto done; 1443 goto charged;
1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1444
1445 ret = res_counter_charge(&mem->res, csize, &fail_res);
1304 if (likely(!ret)) { 1446 if (likely(!ret)) {
1305 if (!do_swap_account) 1447 if (!do_swap_account)
1306 break; 1448 break;
1307 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1449 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1308 &fail_res);
1309 if (likely(!ret)) 1450 if (likely(!ret))
1310 break; 1451 break;
1311 /* mem+swap counter fails */ 1452 /* mem+swap counter fails */
1312 res_counter_uncharge(&mem->res, PAGE_SIZE); 1453 res_counter_uncharge(&mem->res, csize);
1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1454 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1455 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1315 memsw); 1456 memsw);
@@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1318 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1459 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1319 res); 1460 res);
1320 1461
1462 /* reduce request size and retry */
1463 if (csize > PAGE_SIZE) {
1464 csize = PAGE_SIZE;
1465 continue;
1466 }
1321 if (!(gfp_mask & __GFP_WAIT)) 1467 if (!(gfp_mask & __GFP_WAIT))
1322 goto nomem; 1468 goto nomem;
1323 1469
@@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1339 1485
1340 if (!nr_retries--) { 1486 if (!nr_retries--) {
1341 if (oom) { 1487 if (oom) {
1342 mutex_lock(&memcg_tasklist);
1343 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1488 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1344 mutex_unlock(&memcg_tasklist);
1345 record_last_oom(mem_over_limit); 1489 record_last_oom(mem_over_limit);
1346 } 1490 }
1347 goto nomem; 1491 goto nomem;
1348 } 1492 }
1349 } 1493 }
1494 if (csize > PAGE_SIZE)
1495 refill_stock(mem, csize - PAGE_SIZE);
1496charged:
1350 /* 1497 /*
1351 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 1498 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1352 * if they exceeds softlimit. 1499 * if they exceeds softlimit.
@@ -1361,6 +1508,21 @@ nomem:
1361} 1508}
1362 1509
1363/* 1510/*
 1511 * Sometimes we have to undo a charge we got by try_charge().
 1512 * This function does that: it uncharges and puts the css refcount
 1513 * taken by try_charge().
1514 */
1515static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1516{
1517 if (!mem_cgroup_is_root(mem)) {
1518 res_counter_uncharge(&mem->res, PAGE_SIZE);
1519 if (do_swap_account)
1520 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1521 }
1522 css_put(&mem->css);
1523}
1524
1525/*
1364 * A helper function to get mem_cgroup from ID. must be called under 1526 * A helper function to get mem_cgroup from ID. must be called under
1365 * rcu_read_lock(). The caller must check css_is_removed() or some if 1527 * rcu_read_lock(). The caller must check css_is_removed() or some if
1366 * it's concern. (dropping refcnt from swap can be called against removed 1528 * it's concern. (dropping refcnt from swap can be called against removed
@@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1379 return container_of(css, struct mem_cgroup, css); 1541 return container_of(css, struct mem_cgroup, css);
1380} 1542}
1381 1543
1382static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1544struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1383{ 1545{
1384 struct mem_cgroup *mem; 1546 struct mem_cgroup *mem = NULL;
1385 struct page_cgroup *pc; 1547 struct page_cgroup *pc;
1386 unsigned short id; 1548 unsigned short id;
1387 swp_entry_t ent; 1549 swp_entry_t ent;
1388 1550
1389 VM_BUG_ON(!PageLocked(page)); 1551 VM_BUG_ON(!PageLocked(page));
1390 1552
1391 if (!PageSwapCache(page))
1392 return NULL;
1393
1394 pc = lookup_page_cgroup(page); 1553 pc = lookup_page_cgroup(page);
1395 lock_page_cgroup(pc); 1554 lock_page_cgroup(pc);
1396 if (PageCgroupUsed(pc)) { 1555 if (PageCgroupUsed(pc)) {
1397 mem = pc->mem_cgroup; 1556 mem = pc->mem_cgroup;
1398 if (mem && !css_tryget(&mem->css)) 1557 if (mem && !css_tryget(&mem->css))
1399 mem = NULL; 1558 mem = NULL;
1400 } else { 1559 } else if (PageSwapCache(page)) {
1401 ent.val = page_private(page); 1560 ent.val = page_private(page);
1402 id = lookup_swap_cgroup(ent); 1561 id = lookup_swap_cgroup(ent);
1403 rcu_read_lock(); 1562 rcu_read_lock();
@@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1426 lock_page_cgroup(pc); 1585 lock_page_cgroup(pc);
1427 if (unlikely(PageCgroupUsed(pc))) { 1586 if (unlikely(PageCgroupUsed(pc))) {
1428 unlock_page_cgroup(pc); 1587 unlock_page_cgroup(pc);
1429 if (!mem_cgroup_is_root(mem)) { 1588 mem_cgroup_cancel_charge(mem);
1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
1431 if (do_swap_account)
1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1433 }
1434 css_put(&mem->css);
1435 return; 1589 return;
1436 } 1590 }
1437 1591
@@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1464} 1618}
1465 1619
1466/** 1620/**
1467 * mem_cgroup_move_account - move account of the page 1621 * __mem_cgroup_move_account - move account of the page
1468 * @pc: page_cgroup of the page. 1622 * @pc: page_cgroup of the page.
1469 * @from: mem_cgroup which the page is moved from. 1623 * @from: mem_cgroup which the page is moved from.
1470 * @to: mem_cgroup which the page is moved to. @from != @to. 1624 * @to: mem_cgroup which the page is moved to. @from != @to.
1471 * 1625 *
1472 * The caller must confirm following. 1626 * The caller must confirm following.
1473 * - page is not on LRU (isolate_page() is useful.) 1627 * - page is not on LRU (isolate_page() is useful.)
1474 * 1628 * - the pc is locked, used, and ->mem_cgroup points to @from.
1475 * returns 0 at success,
1476 * returns -EBUSY when lock is busy or "pc" is unstable.
1477 * 1629 *
1478 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1630 * This function does "uncharge" from old cgroup but doesn't do "charge" to
1479 * new cgroup. It should be done by a caller. 1631 * new cgroup. It should be done by a caller.
1480 */ 1632 */
1481 1633
1482static int mem_cgroup_move_account(struct page_cgroup *pc, 1634static void __mem_cgroup_move_account(struct page_cgroup *pc,
1483 struct mem_cgroup *from, struct mem_cgroup *to) 1635 struct mem_cgroup *from, struct mem_cgroup *to)
1484{ 1636{
1485 struct mem_cgroup_per_zone *from_mz, *to_mz;
1486 int nid, zid;
1487 int ret = -EBUSY;
1488 struct page *page; 1637 struct page *page;
1489 int cpu; 1638 int cpu;
1490 struct mem_cgroup_stat *stat; 1639 struct mem_cgroup_stat *stat;
@@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1492 1641
1493 VM_BUG_ON(from == to); 1642 VM_BUG_ON(from == to);
1494 VM_BUG_ON(PageLRU(pc->page)); 1643 VM_BUG_ON(PageLRU(pc->page));
1495 1644 VM_BUG_ON(!PageCgroupLocked(pc));
1496 nid = page_cgroup_nid(pc); 1645 VM_BUG_ON(!PageCgroupUsed(pc));
1497 zid = page_cgroup_zid(pc); 1646 VM_BUG_ON(pc->mem_cgroup != from);
1498 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
1499 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
1500
1501 if (!trylock_page_cgroup(pc))
1502 return ret;
1503
1504 if (!PageCgroupUsed(pc))
1505 goto out;
1506
1507 if (pc->mem_cgroup != from)
1508 goto out;
1509 1647
1510 if (!mem_cgroup_is_root(from)) 1648 if (!mem_cgroup_is_root(from))
1511 res_counter_uncharge(&from->res, PAGE_SIZE); 1649 res_counter_uncharge(&from->res, PAGE_SIZE);
1512 mem_cgroup_charge_statistics(from, pc, false); 1650 mem_cgroup_charge_statistics(from, pc, false);
1513 1651
1514 page = pc->page; 1652 page = pc->page;
1515 if (page_is_file_cache(page) && page_mapped(page)) { 1653 if (page_mapped(page) && !PageAnon(page)) {
1516 cpu = smp_processor_id(); 1654 cpu = smp_processor_id();
1517 /* Update mapped_file data for mem_cgroup "from" */ 1655 /* Update mapped_file data for mem_cgroup "from" */
1518 stat = &from->stat; 1656 stat = &from->stat;
1519 cpustat = &stat->cpustat[cpu]; 1657 cpustat = &stat->cpustat[cpu];
1520 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, 1658 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1521 -1); 1659 -1);
1522 1660
1523 /* Update mapped_file data for mem_cgroup "to" */ 1661 /* Update mapped_file data for mem_cgroup "to" */
1524 stat = &to->stat; 1662 stat = &to->stat;
1525 cpustat = &stat->cpustat[cpu]; 1663 cpustat = &stat->cpustat[cpu];
1526 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, 1664 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1527 1); 1665 1);
1528 } 1666 }
1529 1667
@@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1534 css_get(&to->css); 1672 css_get(&to->css);
1535 pc->mem_cgroup = to; 1673 pc->mem_cgroup = to;
1536 mem_cgroup_charge_statistics(to, pc, true); 1674 mem_cgroup_charge_statistics(to, pc, true);
1537 ret = 0;
1538out:
1539 unlock_page_cgroup(pc);
1540 /* 1675 /*
1541 * We charges against "to" which may not have any tasks. Then, "to" 1676 * We charges against "to" which may not have any tasks. Then, "to"
1542 * can be under rmdir(). But in current implementation, caller of 1677 * can be under rmdir(). But in current implementation, caller of
1543 * this function is just force_empty() and it's garanteed that 1678 * this function is just force_empty() and it's garanteed that
1544 * "to" is never removed. So, we don't check rmdir status here. 1679 * "to" is never removed. So, we don't check rmdir status here.
1545 */ 1680 */
1681}
1682
1683/*
1684 * check whether the @pc is valid for moving account and call
1685 * __mem_cgroup_move_account()
1686 */
1687static int mem_cgroup_move_account(struct page_cgroup *pc,
1688 struct mem_cgroup *from, struct mem_cgroup *to)
1689{
1690 int ret = -EINVAL;
1691 lock_page_cgroup(pc);
1692 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1693 __mem_cgroup_move_account(pc, from, to);
1694 ret = 0;
1695 }
1696 unlock_page_cgroup(pc);
1546 return ret; 1697 return ret;
1547} 1698}
1548 1699
@@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1564 if (!pcg) 1715 if (!pcg)
1565 return -EINVAL; 1716 return -EINVAL;
1566 1717
1718 ret = -EBUSY;
1719 if (!get_page_unless_zero(page))
1720 goto out;
1721 if (isolate_lru_page(page))
1722 goto put;
1567 1723
1568 parent = mem_cgroup_from_cont(pcg); 1724 parent = mem_cgroup_from_cont(pcg);
1569
1570
1571 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1725 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1572 if (ret || !parent) 1726 if (ret || !parent)
1573 return ret; 1727 goto put_back;
1574
1575 if (!get_page_unless_zero(page)) {
1576 ret = -EBUSY;
1577 goto uncharge;
1578 }
1579
1580 ret = isolate_lru_page(page);
1581
1582 if (ret)
1583 goto cancel;
1584 1728
1585 ret = mem_cgroup_move_account(pc, child, parent); 1729 ret = mem_cgroup_move_account(pc, child, parent);
1586 1730 if (!ret)
1731 css_put(&parent->css); /* drop extra refcnt by try_charge() */
1732 else
1733 mem_cgroup_cancel_charge(parent); /* does css_put */
1734put_back:
1587 putback_lru_page(page); 1735 putback_lru_page(page);
1588 if (!ret) { 1736put:
1589 put_page(page);
1590 /* drop extra refcnt by try_charge() */
1591 css_put(&parent->css);
1592 return 0;
1593 }
1594
1595cancel:
1596 put_page(page); 1737 put_page(page);
1597uncharge: 1738out:
1598 /* drop extra refcnt by try_charge() */
1599 css_put(&parent->css);
1600 /* uncharge if move fails */
1601 if (!mem_cgroup_is_root(parent)) {
1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1603 if (do_swap_account)
1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 }
1606 return ret; 1739 return ret;
1607} 1740}
1608 1741
@@ -1720,7 +1853,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1720/* 1853/*
1721 * While swap-in, try_charge -> commit or cancel, the page is locked. 1854 * While swap-in, try_charge -> commit or cancel, the page is locked.
1722 * And when try_charge() successfully returns, one refcnt to memcg without 1855 * And when try_charge() successfully returns, one refcnt to memcg without
1723 * struct page_cgroup is aquired. This refcnt will be cumsumed by 1856 * struct page_cgroup is acquired. This refcnt will be consumed by
1724 * "commit()" or removed by "cancel()" 1857 * "commit()" or removed by "cancel()"
1725 */ 1858 */
1726int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 1859int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
@@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1737 goto charge_cur_mm; 1870 goto charge_cur_mm;
1738 /* 1871 /*
1739 * A racing thread's fault, or swapoff, may have already updated 1872 * A racing thread's fault, or swapoff, may have already updated
1740 * the pte, and even removed page from swap cache: return success 1873 * the pte, and even removed page from swap cache: in those cases
1741 * to go on to do_swap_page()'s pte_same() test, which should fail. 1874 * do_swap_page()'s pte_same() test will fail; but there's also a
1875 * KSM case which does need to charge the page.
1742 */ 1876 */
1743 if (!PageSwapCache(page)) 1877 if (!PageSwapCache(page))
1744 return 0; 1878 goto charge_cur_mm;
1745 mem = try_get_mem_cgroup_from_swapcache(page); 1879 mem = try_get_mem_cgroup_from_page(page);
1746 if (!mem) 1880 if (!mem)
1747 goto charge_cur_mm; 1881 goto charge_cur_mm;
1748 *ptr = mem; 1882 *ptr = mem;
@@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1818 return; 1952 return;
1819 if (!mem) 1953 if (!mem)
1820 return; 1954 return;
1821 if (!mem_cgroup_is_root(mem)) { 1955 mem_cgroup_cancel_charge(mem);
1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1823 if (do_swap_account)
1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1825 }
1826 css_put(&mem->css);
1827} 1956}
1828 1957
1958static void
1959__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1960{
1961 struct memcg_batch_info *batch = NULL;
1962 bool uncharge_memsw = true;
1963 /* If swapout, usage of swap doesn't decrease */
1964 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1965 uncharge_memsw = false;
1966 /*
1967 * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
1968 * In those cases, all pages freed continuously can be expected to be in
1969 * the same cgroup, and we have a chance to coalesce uncharges.
1970 * But we uncharge one by one if the task is being OOM-killed (TIF_MEMDIE),
1971 * because we want the uncharge to happen as soon as possible.
1972 */
1973 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
1974 goto direct_uncharge;
1975
1976 batch = &current->memcg_batch;
1977 /*
1978 * Usually, we do css_get() when we remember a memcg pointer.
1979 * But in this case, we keep res->usage until the end of a series of
1980 * uncharges, so it's OK to ignore the memcg's refcnt.
1981 */
1982 if (!batch->memcg)
1983 batch->memcg = mem;
1984 /*
1985 * In the typical case, batch->memcg == mem. This means we can
1986 * merge a series of uncharges into one res_counter uncharge.
1987 * If not, we uncharge the res_counter one by one.
1988 */
1989 if (batch->memcg != mem)
1990 goto direct_uncharge;
1991 /* remember freed charge and uncharge it later */
1992 batch->bytes += PAGE_SIZE;
1993 if (uncharge_memsw)
1994 batch->memsw_bytes += PAGE_SIZE;
1995 return;
1996direct_uncharge:
1997 res_counter_uncharge(&mem->res, PAGE_SIZE);
1998 if (uncharge_memsw)
1999 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2000 return;
2001}
1829 2002
1830/* 2003/*
1831 * uncharge if !page_mapped(page) 2004 * uncharge if !page_mapped(page)
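The batching above keeps its per-task state in current->memcg_batch, whose definition is added to struct task_struct elsewhere in this series. A rough reconstruction from the fields used here, offered only as a sketch:

struct memcg_batch_info {
	int do_batch;			/* nesting depth of uncharge batching */
	struct mem_cgroup *memcg;	/* memcg the batched uncharges target */
	unsigned long bytes;		/* deferred res uncharge, in bytes */
	unsigned long memsw_bytes;	/* deferred mem+swap uncharge, in bytes */
};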
@@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1874 break; 2047 break;
1875 } 2048 }
1876 2049
1877 if (!mem_cgroup_is_root(mem)) { 2050 if (!mem_cgroup_is_root(mem))
1878 res_counter_uncharge(&mem->res, PAGE_SIZE); 2051 __do_uncharge(mem, ctype);
1879 if (do_swap_account &&
1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1882 }
1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2052 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1884 mem_cgroup_swap_statistics(mem, true); 2053 mem_cgroup_swap_statistics(mem, true);
1885 mem_cgroup_charge_statistics(mem, pc, false); 2054 mem_cgroup_charge_statistics(mem, pc, false);
@@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1925 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2094 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1926} 2095}
1927 2096
2097/*
2098 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
2099 * In those cases, pages are freed continuously and can be expected to be
2100 * in the same memcg. Each of those callers already limits the number of
2101 * pages freed at once, so uncharge_start/end() brackets a bounded batch.
2102 * The pair may be nested (called two or more times) in one context.
2103 */
2104
2105void mem_cgroup_uncharge_start(void)
2106{
2107 current->memcg_batch.do_batch++;
2108 /* Nesting is allowed. */
2109 if (current->memcg_batch.do_batch == 1) {
2110 current->memcg_batch.memcg = NULL;
2111 current->memcg_batch.bytes = 0;
2112 current->memcg_batch.memsw_bytes = 0;
2113 }
2114}
2115
2116void mem_cgroup_uncharge_end(void)
2117{
2118 struct memcg_batch_info *batch = &current->memcg_batch;
2119
2120 if (!batch->do_batch)
2121 return;
2122
2123 batch->do_batch--;
2124 if (batch->do_batch) /* If stacked, do nothing. */
2125 return;
2126
2127 if (!batch->memcg)
2128 return;
2129 /*
2130 * This "batch->memcg" is valid without any css_get/put etc.,
2131 * because we still hold its deferred charges until the flush below.
2132 */
2133 if (batch->bytes)
2134 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2135 if (batch->memsw_bytes)
2136 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2137 /* forget this pointer (for sanity check) */
2138 batch->memcg = NULL;
2139}
2140
1928#ifdef CONFIG_SWAP 2141#ifdef CONFIG_SWAP
1929/* 2142/*
1930 * called after __delete_from_swap_cache() and drop "page" account. 2143 * called after __delete_from_swap_cache() and drop "page" account.
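A minimal caller sketch, not part of this patch, of how a truncate- or unmap-style loop brackets its frees so the per-page uncharges coalesce; the mm/memory.c hunk later in this diff adds exactly this pairing around unmap_page_range():

static void example_release_pages(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();		/* open a (possibly nested) batch */
	for (i = 0; i < nr; i++)
		page_cache_release(pages[i]);	/* per-page uncharges accumulate */
	mem_cgroup_uncharge_end();		/* one res_counter update per batch */
}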
@@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2100 unsigned long long val) 2313 unsigned long long val)
2101{ 2314{
2102 int retry_count; 2315 int retry_count;
2103 int progress;
2104 u64 memswlimit; 2316 u64 memswlimit;
2105 int ret = 0; 2317 int ret = 0;
2106 int children = mem_cgroup_count_children(memcg); 2318 int children = mem_cgroup_count_children(memcg);
@@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2144 if (!ret) 2356 if (!ret)
2145 break; 2357 break;
2146 2358
2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, 2359 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2148 GFP_KERNEL,
2149 MEM_CGROUP_RECLAIM_SHRINK); 2360 MEM_CGROUP_RECLAIM_SHRINK);
2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2361 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2151 /* Usage is reduced ? */ 2362 /* Usage is reduced ? */
@@ -2384,6 +2595,7 @@ move_account:
2384 goto out; 2595 goto out;
2385 /* This is for making all *used* pages to be on LRU. */ 2596 /* This is for making all *used* pages to be on LRU. */
2386 lru_add_drain_all(); 2597 lru_add_drain_all();
2598 drain_all_stock_sync();
2387 ret = 0; 2599 ret = 0;
2388 for_each_node_state(node, N_HIGH_MEMORY) { 2600 for_each_node_state(node, N_HIGH_MEMORY) {
2389 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2601 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2466,7 +2678,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2466 2678
2467 cgroup_lock(); 2679 cgroup_lock();
2468 /* 2680 /*
2469 * If parent's use_hiearchy is set, we can't make any modifications 2681 * If parent's use_hierarchy is set, we can't make any modifications
2470 * in the child subtrees. If it is unset, then the change can 2682 * in the child subtrees. If it is unset, then the change can
2471 * occur, provided the current cgroup has no children. 2683 * occur, provided the current cgroup has no children.
2472 * 2684 *
@@ -2541,6 +2753,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2541 val += idx_val; 2753 val += idx_val;
2542 mem_cgroup_get_recursive_idx_stat(mem, 2754 mem_cgroup_get_recursive_idx_stat(mem,
2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2755 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2756 val += idx_val;
2544 val <<= PAGE_SHIFT; 2757 val <<= PAGE_SHIFT;
2545 } else 2758 } else
2546 val = res_counter_read_u64(&mem->memsw, name); 2759 val = res_counter_read_u64(&mem->memsw, name);
@@ -2660,7 +2873,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2660enum { 2873enum {
2661 MCS_CACHE, 2874 MCS_CACHE,
2662 MCS_RSS, 2875 MCS_RSS,
2663 MCS_MAPPED_FILE, 2876 MCS_FILE_MAPPED,
2664 MCS_PGPGIN, 2877 MCS_PGPGIN,
2665 MCS_PGPGOUT, 2878 MCS_PGPGOUT,
2666 MCS_SWAP, 2879 MCS_SWAP,
@@ -2704,8 +2917,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2704 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2917 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2705 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2918 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2706 s->stat[MCS_RSS] += val * PAGE_SIZE; 2919 s->stat[MCS_RSS] += val * PAGE_SIZE;
2707 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); 2920 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
2708 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; 2921 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2709 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2922 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2710 s->stat[MCS_PGPGIN] += val; 2923 s->stat[MCS_PGPGIN] += val;
2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2924 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3097,11 +3310,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3097 3310
3098 /* root ? */ 3311 /* root ? */
3099 if (cont->parent == NULL) { 3312 if (cont->parent == NULL) {
3313 int cpu;
3100 enable_swap_cgroup(); 3314 enable_swap_cgroup();
3101 parent = NULL; 3315 parent = NULL;
3102 root_mem_cgroup = mem; 3316 root_mem_cgroup = mem;
3103 if (mem_cgroup_soft_limit_tree_init()) 3317 if (mem_cgroup_soft_limit_tree_init())
3104 goto free_out; 3318 goto free_out;
3319 for_each_possible_cpu(cpu) {
3320 struct memcg_stock_pcp *stock =
3321 &per_cpu(memcg_stock, cpu);
3322 INIT_WORK(&stock->work, drain_local_stock);
3323 }
3324 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3105 3325
3106 } else { 3326 } else {
3107 parent = mem_cgroup_from_cont(cont->parent); 3327 parent = mem_cgroup_from_cont(cont->parent);
@@ -3170,12 +3390,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3170 struct task_struct *p, 3390 struct task_struct *p,
3171 bool threadgroup) 3391 bool threadgroup)
3172{ 3392{
3173 mutex_lock(&memcg_tasklist);
3174 /* 3393 /*
3175 * FIXME: It's better to move charges of this process from old 3394 * FIXME: It's better to move charges of this process from old
3176 * memcg to new memcg. But it's just on TODO-List now. 3395 * memcg to new memcg. But it's just on TODO-List now.
3177 */ 3396 */
3178 mutex_unlock(&memcg_tasklist);
3179} 3397}
3180 3398
3181struct cgroup_subsys mem_cgroup_subsys = { 3399struct cgroup_subsys mem_cgroup_subsys = {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 729d4b15b645..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,11 +34,16 @@
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
39#include <linux/ksm.h>
38#include <linux/rmap.h> 40#include <linux/rmap.h>
39#include <linux/pagemap.h> 41#include <linux/pagemap.h>
40#include <linux/swap.h> 42#include <linux/swap.h>
41#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
42#include "internal.h" 47#include "internal.h"
43 48
44int sysctl_memory_failure_early_kill __read_mostly = 0; 49int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -47,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
47 52
48atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
49 54
55#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
56
57u32 hwpoison_filter_enable = 0;
58u32 hwpoison_filter_dev_major = ~0U;
59u32 hwpoison_filter_dev_minor = ~0U;
60u64 hwpoison_filter_flags_mask;
61u64 hwpoison_filter_flags_value;
62EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
63EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
64EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
65EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
66EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
67
68static int hwpoison_filter_dev(struct page *p)
69{
70 struct address_space *mapping;
71 dev_t dev;
72
73 if (hwpoison_filter_dev_major == ~0U &&
74 hwpoison_filter_dev_minor == ~0U)
75 return 0;
76
77 /*
78 * page_mapping() does not accept slab page
79 */
80 if (PageSlab(p))
81 return -EINVAL;
82
83 mapping = page_mapping(p);
84 if (mapping == NULL || mapping->host == NULL)
85 return -EINVAL;
86
87 dev = mapping->host->i_sb->s_dev;
88 if (hwpoison_filter_dev_major != ~0U &&
89 hwpoison_filter_dev_major != MAJOR(dev))
90 return -EINVAL;
91 if (hwpoison_filter_dev_minor != ~0U &&
92 hwpoison_filter_dev_minor != MINOR(dev))
93 return -EINVAL;
94
95 return 0;
96}
97
98static int hwpoison_filter_flags(struct page *p)
99{
100 if (!hwpoison_filter_flags_mask)
101 return 0;
102
103 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
104 hwpoison_filter_flags_value)
105 return 0;
106 else
107 return -EINVAL;
108}
109
110/*
111 * This allows stress tests to limit test scope to a collection of tasks
112 * by putting them under some memcg. This prevents killing unrelated/important
113 * processes such as /sbin/init. Note that the target task may share clean
114 * pages with init (e.g. libc text), which is harmless. If the target task
115 * shares _dirty_ pages with another task B, the test scheme must make sure B
116 * is also included in the memcg. Finally, due to race conditions this filter
117 * can only guarantee that the page either belongs to the memcg tasks, or is
118 * a freed page.
119 */
120#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
121u64 hwpoison_filter_memcg;
122EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
123static int hwpoison_filter_task(struct page *p)
124{
125 struct mem_cgroup *mem;
126 struct cgroup_subsys_state *css;
127 unsigned long ino;
128
129 if (!hwpoison_filter_memcg)
130 return 0;
131
132 mem = try_get_mem_cgroup_from_page(p);
133 if (!mem)
134 return -EINVAL;
135
136 css = mem_cgroup_css(mem);
137 /* root_mem_cgroup has NULL dentries */
138 if (!css->cgroup->dentry)
139 return -EINVAL;
140
141 ino = css->cgroup->dentry->d_inode->i_ino;
142 css_put(css);
143
144 if (ino != hwpoison_filter_memcg)
145 return -EINVAL;
146
147 return 0;
148}
149#else
150static int hwpoison_filter_task(struct page *p) { return 0; }
151#endif
152
153int hwpoison_filter(struct page *p)
154{
155 if (!hwpoison_filter_enable)
156 return 0;
157
158 if (hwpoison_filter_dev(p))
159 return -EINVAL;
160
161 if (hwpoison_filter_flags(p))
162 return -EINVAL;
163
164 if (hwpoison_filter_task(p))
165 return -EINVAL;
166
167 return 0;
168}
169#else
170int hwpoison_filter(struct page *p)
171{
172 return 0;
173}
174#endif
175
176EXPORT_SYMBOL_GPL(hwpoison_filter);
177
50/* 178/*
51 * Send all the processes who have the page mapped an ``action optional'' 179 * Send all the processes who have the page mapped an ``action optional''
52 * signal. 180 * signal.
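A hypothetical sketch (the helper name and trapno value are illustrative) of how an injection path consults the filters above before poisoning a page, in the spirit of the mm/hwpoison-inject.c module this series adds:

static int example_inject_pfn(unsigned long pfn)
{
	struct page *p;

	if (!pfn_valid(pfn))
		return -ENXIO;
	p = pfn_to_page(pfn);
	if (hwpoison_filter(p))
		return 0;			/* filtered out: leave the page alone */
	return __memory_failure(pfn, 18, 0);	/* trapno 18 is illustrative */
}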
@@ -82,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
82} 210}
83 211
84/* 212/*
213 * When an unknown page type is encountered, drain as many buffers as possible
214 * in the hope of turning the page into an LRU or free page, which we can handle.
215 */
216void shake_page(struct page *p, int access)
217{
218 if (!PageSlab(p)) {
219 lru_add_drain_all();
220 if (PageLRU(p))
221 return;
222 drain_all_pages();
223 if (PageLRU(p) || is_free_buddy_page(p))
224 return;
225 }
226
227 /*
228 * Only call shrink_slab here (which would also
229 * shrink other caches) if the access is not potentially fatal.
230 */
231 if (access) {
232 int nr;
233 do {
234 nr = shrink_slab(1000, GFP_KERNEL, 1000);
235 if (page_count(p) == 0)
236 break;
237 } while (nr > 10);
238 }
239}
240EXPORT_SYMBOL_GPL(shake_page);
241
242/*
85 * Kill all processes that have a poisoned page mapped and then isolate 243 * Kill all processes that have a poisoned page mapped and then isolate
86 * the page. 244 * the page.
87 * 245 *
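An illustrative retry sketch, not from this patch, of how a caller could use shake_page() to nudge a non-LRU page into a handleable or free state; __memory_failure() below does a single-shot version of this:

static bool example_try_make_handleable(struct page *p)
{
	int tries;

	for (tries = 0; tries < 3; tries++) {
		if (PageLRU(p) || is_free_buddy_page(p))
			return true;
		shake_page(p, 0);	/* access may be fatal: skip shrink_slab */
	}
	return false;
}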
@@ -173,10 +331,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
173 list_for_each_entry_safe (tk, next, to_kill, nd) { 331 list_for_each_entry_safe (tk, next, to_kill, nd) {
174 if (doit) { 332 if (doit) {
175 /* 333 /*
176 * In case something went wrong with munmaping 334 * In case something went wrong with munmapping
177 * make sure the process doesn't catch the 335 * make sure the process doesn't catch the
178 * signal and then access the memory. Just kill it. 336 * signal and then access the memory. Just kill it.
179 * the signal handlers
180 */ 337 */
181 if (fail || tk->addr_valid == 0) { 338 if (fail || tk->addr_valid == 0) {
182 printk(KERN_ERR 339 printk(KERN_ERR
@@ -313,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
313 */ 470 */
314 471
315enum outcome { 472enum outcome {
316 FAILED, /* Error handling failed */ 473 IGNORED, /* Error: cannot be handled */
474 FAILED, /* Error: handling failed */
317 DELAYED, /* Will be handled later */ 475 DELAYED, /* Will be handled later */
318 IGNORED, /* Error safely ignored */
319 RECOVERED, /* Successfully recovered */ 476 RECOVERED, /* Successfully recovered */
320}; 477};
321 478
322static const char *action_name[] = { 479static const char *action_name[] = {
480 [IGNORED] = "Ignored",
323 [FAILED] = "Failed", 481 [FAILED] = "Failed",
324 [DELAYED] = "Delayed", 482 [DELAYED] = "Delayed",
325 [IGNORED] = "Ignored",
326 [RECOVERED] = "Recovered", 483 [RECOVERED] = "Recovered",
327}; 484};
328 485
329/* 486/*
330 * Error hit kernel page. 487 * XXX: It is possible that a page is isolated from LRU cache,
331 * Do nothing, try to be lucky and not touch this instead. For a few cases we 488 * and then kept in swap cache or failed to remove from page cache.
332 * could be more sophisticated. 489 * The page count will stop it from being freed by unpoison.
490 * Stress tests should be aware of this memory leak problem.
333 */ 491 */
334static int me_kernel(struct page *p, unsigned long pfn) 492static int delete_from_lru_cache(struct page *p)
335{ 493{
336 return DELAYED; 494 if (!isolate_lru_page(p)) {
495 /*
496 * Clear relevant page flags, so that the buddy system won't
497 * complain when the page is later unpoisoned and freed.
498 */
499 ClearPageActive(p);
500 ClearPageUnevictable(p);
501 /*
502 * drop the page count elevated by isolate_lru_page()
503 */
504 page_cache_release(p);
505 return 0;
506 }
507 return -EIO;
337} 508}
338 509
339/* 510/*
340 * Already poisoned page. 511 * Error hit kernel page.
512 * Do nothing, try to be lucky and not touch this instead. For a few cases we
513 * could be more sophisticated.
341 */ 514 */
342static int me_ignore(struct page *p, unsigned long pfn) 515static int me_kernel(struct page *p, unsigned long pfn)
343{ 516{
344 return IGNORED; 517 return IGNORED;
345} 518}
@@ -354,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
354} 527}
355 528
356/* 529/*
357 * Free memory
358 */
359static int me_free(struct page *p, unsigned long pfn)
360{
361 return DELAYED;
362}
363
364/*
365 * Clean (or cleaned) page cache page. 530 * Clean (or cleaned) page cache page.
366 */ 531 */
367static int me_pagecache_clean(struct page *p, unsigned long pfn) 532static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -370,8 +535,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
370 int ret = FAILED; 535 int ret = FAILED;
371 struct address_space *mapping; 536 struct address_space *mapping;
372 537
373 if (!isolate_lru_page(p)) 538 delete_from_lru_cache(p);
374 page_cache_release(p);
375 539
376 /* 540 /*
377 * For anonymous pages we're done the only reference left 541 * For anonymous pages we're done the only reference left
@@ -498,30 +662,24 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
498 */ 662 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn) 663static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{ 664{
501 int ret = FAILED;
502
503 ClearPageDirty(p); 665 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */ 666 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p); 667 ClearPageUptodate(p);
506 668
507 if (!isolate_lru_page(p)) { 669 if (!delete_from_lru_cache(p))
508 page_cache_release(p); 670 return DELAYED;
509 ret = DELAYED; 671 else
510 } 672 return FAILED;
511
512 return ret;
513} 673}
514 674
515static int me_swapcache_clean(struct page *p, unsigned long pfn) 675static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{ 676{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p); 677 delete_from_swap_cache(p);
524 return ret; 678
679 if (!delete_from_lru_cache(p))
680 return RECOVERED;
681 else
682 return FAILED;
525} 683}
526 684
527/* 685/*
@@ -564,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
564#define tail (1UL << PG_tail) 722#define tail (1UL << PG_tail)
565#define compound (1UL << PG_compound) 723#define compound (1UL << PG_compound)
566#define slab (1UL << PG_slab) 724#define slab (1UL << PG_slab)
567#define buddy (1UL << PG_buddy)
568#define reserved (1UL << PG_reserved) 725#define reserved (1UL << PG_reserved)
569 726
570static struct page_state { 727static struct page_state {
@@ -573,8 +730,11 @@ static struct page_state {
573 char *msg; 730 char *msg;
574 int (*action)(struct page *p, unsigned long pfn); 731 int (*action)(struct page *p, unsigned long pfn);
575} error_states[] = { 732} error_states[] = {
576 { reserved, reserved, "reserved kernel", me_ignore }, 733 { reserved, reserved, "reserved kernel", me_kernel },
577 { buddy, buddy, "free kernel", me_free }, 734 /*
735 * free pages are specially detected outside this table:
736 * PG_buddy pages make up only a small fraction of all free pages.
737 */
578 738
579 /* 739 /*
580 * Could in theory check if slab page is free or if we can drop 740 * Could in theory check if slab page is free or if we can drop
@@ -596,14 +756,11 @@ static struct page_state {
596 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 756 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
597 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 757 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
598 758
599#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
600 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 759 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
601 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 760 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
602#endif
603 761
604 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 762 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
605 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 763 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
606 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
607 764
608 /* 765 /*
609 * Catchall entry: must be at end. 766 * Catchall entry: must be at end.
@@ -611,38 +768,54 @@ static struct page_state {
611 { 0, 0, "unknown page state", me_unknown }, 768 { 0, 0, "unknown page state", me_unknown },
612}; 769};
613 770
771#undef dirty
772#undef sc
773#undef unevict
774#undef mlock
775#undef writeback
614#undef lru 776#undef lru
777#undef swapbacked
778#undef head
779#undef tail
780#undef compound
781#undef slab
782#undef reserved
615 783
616static void action_result(unsigned long pfn, char *msg, int result) 784static void action_result(unsigned long pfn, char *msg, int result)
617{ 785{
618 struct page *page = NULL; 786 struct page *page = pfn_to_page(pfn);
619 if (pfn_valid(pfn))
620 page = pfn_to_page(pfn);
621 787
622 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 788 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
623 pfn, 789 pfn,
624 page && PageDirty(page) ? "dirty " : "", 790 PageDirty(page) ? "dirty " : "",
625 msg, action_name[result]); 791 msg, action_name[result]);
626} 792}
627 793
628static int page_action(struct page_state *ps, struct page *p, 794static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref) 795 unsigned long pfn)
630{ 796{
631 int result; 797 int result;
798 int count;
632 799
633 result = ps->action(p, pfn); 800 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result); 801 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref) 802
803 count = page_count(p) - 1;
804 if (ps->action == me_swapcache_dirty && result == DELAYED)
805 count--;
806 if (count != 0) {
636 printk(KERN_ERR 807 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n", 808 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1); 809 pfn, ps->msg, count);
810 result = FAILED;
811 }
639 812
640 /* Could do more checks here if page looks ok */ 813 /* Could do more checks here if page looks ok */
641 /* 814 /*
642 * Could adjust zone counters here to correct for the missing page. 815 * Could adjust zone counters here to correct for the missing page.
643 */ 816 */
644 817
645 return result == RECOVERED ? 0 : -EBUSY; 818 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
646} 819}
647 820
648#define N_UNMAP_TRIES 5 821#define N_UNMAP_TRIES 5
@@ -651,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p,
651 * Do all that is necessary to remove user space mappings. Unmap 824 * Do all that is necessary to remove user space mappings. Unmap
652 * the pages and send SIGBUS to the processes if the data was dirty. 825 * the pages and send SIGBUS to the processes if the data was dirty.
653 */ 826 */
654static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 827static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
655 int trapno) 828 int trapno)
656{ 829{
657 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 830 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -661,18 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
661 int i; 834 int i;
662 int kill = 1; 835 int kill = 1;
663 836
664 if (PageReserved(p) || PageCompound(p) || PageSlab(p)) 837 if (PageReserved(p) || PageSlab(p))
665 return; 838 return SWAP_SUCCESS;
666
667 if (!PageLRU(p))
668 lru_add_drain_all();
669 839
670 /* 840 /*
671 * This check implies we don't kill processes if their pages 841 * This check implies we don't kill processes if their pages
672 * are in the swap cache early. Those are always late kills. 842 * are in the swap cache early. Those are always late kills.
673 */ 843 */
674 if (!page_mapped(p)) 844 if (!page_mapped(p))
675 return; 845 return SWAP_SUCCESS;
846
847 if (PageCompound(p) || PageKsm(p))
848 return SWAP_FAIL;
676 849
677 if (PageSwapCache(p)) { 850 if (PageSwapCache(p)) {
678 printk(KERN_ERR 851 printk(KERN_ERR
@@ -683,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
683 /* 856 /*
684 * Propagate the dirty bit from PTEs to struct page first, because we 857 * Propagate the dirty bit from PTEs to struct page first, because we
685 * need this to decide if we should kill or just drop the page. 858 * need this to decide if we should kill or just drop the page.
859 * XXX: the dirty test could be racy: set_page_dirty() may not always
860 * be called inside page lock (it's recommended but not enforced).
686 */ 861 */
687 mapping = page_mapping(p); 862 mapping = page_mapping(p);
688 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 863 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -734,9 +909,11 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
734 */ 909 */
735 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 910 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
736 ret != SWAP_SUCCESS, pfn); 911 ret != SWAP_SUCCESS, pfn);
912
913 return ret;
737} 914}
738 915
739int __memory_failure(unsigned long pfn, int trapno, int ref) 916int __memory_failure(unsigned long pfn, int trapno, int flags)
740{ 917{
741 struct page_state *ps; 918 struct page_state *ps;
742 struct page *p; 919 struct page *p;
@@ -746,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
746 panic("Memory failure from trap %d on page %lx", trapno, pfn); 923 panic("Memory failure from trap %d on page %lx", trapno, pfn);
747 924
748 if (!pfn_valid(pfn)) { 925 if (!pfn_valid(pfn)) {
749 action_result(pfn, "memory outside kernel control", IGNORED); 926 printk(KERN_ERR
750 return -EIO; 927 "MCE %#lx: memory outside kernel control\n",
928 pfn);
929 return -ENXIO;
751 } 930 }
752 931
753 p = pfn_to_page(pfn); 932 p = pfn_to_page(pfn);
754 if (TestSetPageHWPoison(p)) { 933 if (TestSetPageHWPoison(p)) {
755 action_result(pfn, "already hardware poisoned", IGNORED); 934 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
756 return 0; 935 return 0;
757 } 936 }
758 937
@@ -769,9 +948,38 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
769 * In fact it's dangerous to directly bump up page count from 0, 948 * In fact it's dangerous to directly bump up page count from 0,
770 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 949 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
771 */ 950 */
772 if (!get_page_unless_zero(compound_head(p))) { 951 if (!(flags & MF_COUNT_INCREASED) &&
773 action_result(pfn, "free or high order kernel", IGNORED); 952 !get_page_unless_zero(compound_head(p))) {
774 return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 953 if (is_free_buddy_page(p)) {
954 action_result(pfn, "free buddy", DELAYED);
955 return 0;
956 } else {
957 action_result(pfn, "high order kernel", IGNORED);
958 return -EBUSY;
959 }
960 }
961
962 /*
963 * We ignore non-LRU pages for good reasons.
964 * - PG_locked is only well defined for LRU pages and a few others
965 * - to avoid races with __set_page_locked()
966 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
967 * The check (unnecessarily) ignores LRU pages being isolated and
968 * walked by the page reclaim code, however that's not a big loss.
969 */
970 if (!PageLRU(p))
971 shake_page(p, 0);
972 if (!PageLRU(p)) {
973 /*
974 * shake_page could have turned it free.
975 */
976 if (is_free_buddy_page(p)) {
977 action_result(pfn, "free buddy, 2nd try", DELAYED);
978 return 0;
979 }
980 action_result(pfn, "non LRU", IGNORED);
981 put_page(p);
982 return -EBUSY;
775 } 983 }
776 984
777 /* 985 /*
@@ -780,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
780 * and in many cases impossible, so we just avoid it here. 988 * and in many cases impossible, so we just avoid it here.
781 */ 989 */
782 lock_page_nosync(p); 990 lock_page_nosync(p);
991
992 /*
993 * unpoison always clear PG_hwpoison inside page lock
994 */
995 if (!PageHWPoison(p)) {
996 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
997 res = 0;
998 goto out;
999 }
1000 if (hwpoison_filter(p)) {
1001 if (TestClearPageHWPoison(p))
1002 atomic_long_dec(&mce_bad_pages);
1003 unlock_page(p);
1004 put_page(p);
1005 return 0;
1006 }
1007
783 wait_on_page_writeback(p); 1008 wait_on_page_writeback(p);
784 1009
785 /* 1010 /*
786 * Now take care of user space mappings. 1011 * Now take care of user space mappings.
1012 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
787 */ 1013 */
788 hwpoison_user_mappings(p, pfn, trapno); 1014 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1015 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1016 res = -EBUSY;
1017 goto out;
1018 }
789 1019
790 /* 1020 /*
791 * Torn down by someone else? 1021 * Torn down by someone else?
792 */ 1022 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1023 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED); 1024 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0; 1025 res = -EBUSY;
796 goto out; 1026 goto out;
797 } 1027 }
798 1028
799 res = -EBUSY; 1029 res = -EBUSY;
800 for (ps = error_states;; ps++) { 1030 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) { 1031 if ((p->flags & ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref); 1032 res = page_action(ps, p, pfn);
803 break; 1033 break;
804 } 1034 }
805 } 1035 }
@@ -830,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno)
830{ 1060{
831 __memory_failure(pfn, trapno, 0); 1061 __memory_failure(pfn, trapno, 0);
832} 1062}
1063
1064/**
1065 * unpoison_memory - Unpoison a previously poisoned page
1066 * @pfn: Page number of the to be unpoisoned page
1067 *
1068 * Software-unpoison a page that has been poisoned by
1069 * memory_failure() earlier.
1070 *
1071 * This is only done on the software-level, so it only works
1072 * for linux injected failures, not real hardware failures
1073 *
1074 * Returns 0 for success, otherwise -errno.
1075 */
1076int unpoison_memory(unsigned long pfn)
1077{
1078 struct page *page;
1079 struct page *p;
1080 int freeit = 0;
1081
1082 if (!pfn_valid(pfn))
1083 return -ENXIO;
1084
1085 p = pfn_to_page(pfn);
1086 page = compound_head(p);
1087
1088 if (!PageHWPoison(p)) {
1089 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1090 return 0;
1091 }
1092
1093 if (!get_page_unless_zero(page)) {
1094 if (TestClearPageHWPoison(p))
1095 atomic_long_dec(&mce_bad_pages);
1096 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1097 return 0;
1098 }
1099
1100 lock_page_nosync(page);
1101 /*
1102 * This test is racy because PG_hwpoison is set outside of the page lock.
1103 * That's acceptable because it won't trigger a kernel panic. Instead,
1104 * the PG_hwpoison page will be caught and isolated at the entrance to
1105 * the free buddy page pool.
1106 */
1107 if (TestClearPageHWPoison(p)) {
1108 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1109 atomic_long_dec(&mce_bad_pages);
1110 freeit = 1;
1111 }
1112 unlock_page(page);
1113
1114 put_page(page);
1115 if (freeit)
1116 put_page(page);
1117
1118 return 0;
1119}
1120EXPORT_SYMBOL(unpoison_memory);
1121
1122static struct page *new_page(struct page *p, unsigned long private, int **x)
1123{
1124 int nid = page_to_nid(p);
1125 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1126}
1127
1128/*
1129 * Safely get reference count of an arbitrary page.
1130 * Returns 0 for a free page, -EIO for a zero refcount page
1131 * that is not free, and 1 for any other page type.
1132 * For 1 the page is returned with increased page count, otherwise not.
1133 */
1134static int get_any_page(struct page *p, unsigned long pfn, int flags)
1135{
1136 int ret;
1137
1138 if (flags & MF_COUNT_INCREASED)
1139 return 1;
1140
1141 /*
1142 * The lock_system_sleep prevents a race with memory hotplug,
1143 * because the isolation assumes there's only a single user.
1144 * This is a big hammer; something finer-grained would be nicer.
1145 */
1146 lock_system_sleep();
1147
1148 /*
1149 * Isolate the page, so that it doesn't get reallocated if it
1150 * was free.
1151 */
1152 set_migratetype_isolate(p);
1153 if (!get_page_unless_zero(compound_head(p))) {
1154 if (is_free_buddy_page(p)) {
1155 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1156 /* Set hwpoison bit while page is still isolated */
1157 SetPageHWPoison(p);
1158 ret = 0;
1159 } else {
1160 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1161 pfn, p->flags);
1162 ret = -EIO;
1163 }
1164 } else {
1165 /* Not a free page */
1166 ret = 1;
1167 }
1168 unset_migratetype_isolate(p);
1169 unlock_system_sleep();
1170 return ret;
1171}
1172
1173/**
1174 * soft_offline_page - Soft offline a page.
1175 * @page: page to offline
1176 * @flags: flags. Same as memory_failure().
1177 *
1178 * Returns 0 on success, otherwise negated errno.
1179 *
1180 * Soft offline a page, by migration or invalidation,
1181 * without killing anything. This is for the case when
1182 * a page is not corrupted yet (so it's still valid to access),
1183 * but has had a number of corrected errors and is better taken
1184 * out.
1185 *
1186 * The actual policy on when to do that is maintained by
1187 * user space.
1188 *
1189 * This should never impact any application or cause data loss,
1190 * however it might take some time.
1191 *
1192 * This is not a 100% solution for all memory, but tries to be
1193 * ``good enough'' for the majority of memory.
1194 */
1195int soft_offline_page(struct page *page, int flags)
1196{
1197 int ret;
1198 unsigned long pfn = page_to_pfn(page);
1199
1200 ret = get_any_page(page, pfn, flags);
1201 if (ret < 0)
1202 return ret;
1203 if (ret == 0)
1204 goto done;
1205
1206 /*
1207 * Page cache page we can handle?
1208 */
1209 if (!PageLRU(page)) {
1210 /*
1211 * Try to free it.
1212 */
1213 put_page(page);
1214 shake_page(page, 1);
1215
1216 /*
1217 * Did it turn free?
1218 */
1219 ret = get_any_page(page, pfn, 0);
1220 if (ret < 0)
1221 return ret;
1222 if (ret == 0)
1223 goto done;
1224 }
1225 if (!PageLRU(page)) {
1226 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1227 pfn, page->flags);
1228 return -EIO;
1229 }
1230
1231 lock_page(page);
1232 wait_on_page_writeback(page);
1233
1234 /*
1235 * Synchronized using the page lock with memory_failure()
1236 */
1237 if (PageHWPoison(page)) {
1238 unlock_page(page);
1239 put_page(page);
1240 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1241 return -EBUSY;
1242 }
1243
1244 /*
1245 * Try to invalidate first. This should work for
1246 * non dirty unmapped page cache pages.
1247 */
1248 ret = invalidate_inode_page(page);
1249 unlock_page(page);
1250
1251 /*
1252 * Drop count because page migration doesn't like raised
1253 * counts. The page could get re-allocated, but if it becomes
1254 * LRU the isolation will just fail.
1255 * RED-PEN would be better to keep it isolated here, but we
1256 * would need to fix isolation locking first.
1257 */
1258 put_page(page);
1259 if (ret == 1) {
1260 ret = 0;
1261 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1262 goto done;
1263 }
1264
1265 /*
1266 * Simple invalidation didn't work.
1267 * Try to migrate to a new page instead. migrate.c
1268 * handles a large number of cases for us.
1269 */
1270 ret = isolate_lru_page(page);
1271 if (!ret) {
1272 LIST_HEAD(pagelist);
1273
1274 list_add(&page->lru, &pagelist);
1275 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1276 if (ret) {
1277 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1278 pfn, ret, page->flags);
1279 if (ret > 0)
1280 ret = -EIO;
1281 }
1282 } else {
1283 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1284 pfn, ret, page_count(page), page->flags);
1285 }
1286 if (ret)
1287 return ret;
1288
1289done:
1290 atomic_long_add(1, &mce_bad_pages);
1291 SetPageHWPoison(page);
1292 /* keep elevated page count for bad page */
1293 return ret;
1294}
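A minimal sketch, assuming only the interfaces added above (the helper name is illustrative), of driving the new soft-offline path from a pfn, e.g. from a corrected-error policy hook:

static int example_soft_offline_pfn(unsigned long pfn)
{
	if (!pfn_valid(pfn))
		return -ENXIO;
	/* flags == 0: let get_any_page() take its own page reference */
	return soft_offline_page(pfn_to_page(pfn), 0);
}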
diff --git a/mm/memory.c b/mm/memory.c
index 7e91b5f9f690..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
572 * covered by this vma. 572 * covered by this vma.
573 */ 573 */
574 574
575static inline void 575static inline unsigned long
576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
578 unsigned long addr, int *rss) 578 unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
586 if (!pte_file(pte)) { 586 if (!pte_file(pte)) {
587 swp_entry_t entry = pte_to_swp_entry(pte); 587 swp_entry_t entry = pte_to_swp_entry(pte);
588 588
589 swap_duplicate(entry); 589 if (swap_duplicate(entry) < 0)
590 return entry.val;
591
590 /* make sure dst_mm is on swapoff's mmlist. */ 592 /* make sure dst_mm is on swapoff's mmlist. */
591 if (unlikely(list_empty(&dst_mm->mmlist))) { 593 if (unlikely(list_empty(&dst_mm->mmlist))) {
592 spin_lock(&mmlist_lock); 594 spin_lock(&mmlist_lock);
@@ -635,16 +637,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
635 637
636out_set_pte: 638out_set_pte:
637 set_pte_at(dst_mm, addr, dst_pte, pte); 639 set_pte_at(dst_mm, addr, dst_pte, pte);
640 return 0;
638} 641}
639 642
640static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 643static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
641 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 644 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
642 unsigned long addr, unsigned long end) 645 unsigned long addr, unsigned long end)
643{ 646{
647 pte_t *orig_src_pte, *orig_dst_pte;
644 pte_t *src_pte, *dst_pte; 648 pte_t *src_pte, *dst_pte;
645 spinlock_t *src_ptl, *dst_ptl; 649 spinlock_t *src_ptl, *dst_ptl;
646 int progress = 0; 650 int progress = 0;
647 int rss[2]; 651 int rss[2];
652 swp_entry_t entry = (swp_entry_t){0};
648 653
649again: 654again:
650 rss[1] = rss[0] = 0; 655 rss[1] = rss[0] = 0;
@@ -654,6 +659,8 @@ again:
654 src_pte = pte_offset_map_nested(src_pmd, addr); 659 src_pte = pte_offset_map_nested(src_pmd, addr);
655 src_ptl = pte_lockptr(src_mm, src_pmd); 660 src_ptl = pte_lockptr(src_mm, src_pmd);
656 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 661 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
662 orig_src_pte = src_pte;
663 orig_dst_pte = dst_pte;
657 arch_enter_lazy_mmu_mode(); 664 arch_enter_lazy_mmu_mode();
658 665
659 do { 666 do {
@@ -671,16 +678,25 @@ again:
671 progress++; 678 progress++;
672 continue; 679 continue;
673 } 680 }
674 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 681 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
682 vma, addr, rss);
683 if (entry.val)
684 break;
675 progress += 8; 685 progress += 8;
676 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 686 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
677 687
678 arch_leave_lazy_mmu_mode(); 688 arch_leave_lazy_mmu_mode();
679 spin_unlock(src_ptl); 689 spin_unlock(src_ptl);
680 pte_unmap_nested(src_pte - 1); 690 pte_unmap_nested(orig_src_pte);
681 add_mm_rss(dst_mm, rss[0], rss[1]); 691 add_mm_rss(dst_mm, rss[0], rss[1]);
682 pte_unmap_unlock(dst_pte - 1, dst_ptl); 692 pte_unmap_unlock(orig_dst_pte, dst_ptl);
683 cond_resched(); 693 cond_resched();
694
695 if (entry.val) {
696 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
697 return -ENOMEM;
698 progress = 0;
699 }
684 if (addr != end) 700 if (addr != end)
685 goto again; 701 goto again;
686 return 0; 702 return 0;
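A hedged sketch of the retry contract copy_pte_range() now follows (the standalone helper is illustrative): when swap_duplicate() fails because the swap count needs a continuation page, drop the page-table locks, allocate the continuation with GFP_KERNEL, and retry:

static int example_dup_swap_entry(swp_entry_t entry)
{
	while (swap_duplicate(entry) < 0) {
		/* copy_pte_range() reaches here only after unlocking the ptes */
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;
	}
	return 0;
}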
@@ -940,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
940 details = NULL; 956 details = NULL;
941 957
942 BUG_ON(addr >= end); 958 BUG_ON(addr >= end);
959 mem_cgroup_uncharge_start();
943 tlb_start_vma(tlb, vma); 960 tlb_start_vma(tlb, vma);
944 pgd = pgd_offset(vma->vm_mm, addr); 961 pgd = pgd_offset(vma->vm_mm, addr);
945 do { 962 do {
@@ -952,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
952 zap_work, details); 969 zap_work, details);
953 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 970 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
954 tlb_end_vma(tlb, vma); 971 tlb_end_vma(tlb, vma);
972 mem_cgroup_uncharge_end();
955 973
956 return addr; 974 return addr;
957} 975}
@@ -1820,10 +1838,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1820 token = pmd_pgtable(*pmd); 1838 token = pmd_pgtable(*pmd);
1821 1839
1822 do { 1840 do {
1823 err = fn(pte, token, addr, data); 1841 err = fn(pte++, token, addr, data);
1824 if (err) 1842 if (err)
1825 break; 1843 break;
1826 } while (pte++, addr += PAGE_SIZE, addr != end); 1844 } while (addr += PAGE_SIZE, addr != end);
1827 1845
1828 arch_leave_lazy_mmu_mode(); 1846 arch_leave_lazy_mmu_mode();
1829 1847
@@ -2511,7 +2529,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2511 ret = VM_FAULT_HWPOISON; 2529 ret = VM_FAULT_HWPOISON;
2512 } else { 2530 } else {
2513 print_bad_pte(vma, address, orig_pte, NULL); 2531 print_bad_pte(vma, address, orig_pte, NULL);
2514 ret = VM_FAULT_OOM; 2532 ret = VM_FAULT_SIGBUS;
2515 } 2533 }
2516 goto out; 2534 goto out;
2517 } 2535 }
@@ -2537,14 +2555,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2537 ret = VM_FAULT_MAJOR; 2555 ret = VM_FAULT_MAJOR;
2538 count_vm_event(PGMAJFAULT); 2556 count_vm_event(PGMAJFAULT);
2539 } else if (PageHWPoison(page)) { 2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2540 ret = VM_FAULT_HWPOISON; 2562 ret = VM_FAULT_HWPOISON;
2541 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2542 goto out; 2564 goto out_release;
2543 } 2565 }
2544 2566
2545 lock_page(page); 2567 lock_page(page);
2546 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2568 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2547 2569
2570 page = ksm_might_need_to_copy(page, vma, address);
2571 if (!page) {
2572 ret = VM_FAULT_OOM;
2573 goto out;
2574 }
2575
2548 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2576 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2549 ret = VM_FAULT_OOM; 2577 ret = VM_FAULT_OOM;
2550 goto out_page; 2578 goto out_page;
@@ -2611,6 +2639,7 @@ out_nomap:
2611 pte_unmap_unlock(page_table, ptl); 2639 pte_unmap_unlock(page_table, ptl);
2612out_page: 2640out_page:
2613 unlock_page(page); 2641 unlock_page(page);
2642out_release:
2614 page_cache_release(page); 2643 page_cache_release(page);
2615 return ret; 2644 return ret;
2616} 2645}
@@ -2906,7 +2935,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2906 * Page table corrupted: show pte and kill process. 2935 * Page table corrupted: show pte and kill process.
2907 */ 2936 */
2908 print_bad_pte(vma, address, orig_pte, NULL); 2937 print_bad_pte(vma, address, orig_pte, NULL);
2909 return VM_FAULT_OOM; 2938 return VM_FAULT_SIGBUS;
2910 } 2939 }
2911 2940
2912 pgoff = pte_to_pgoff(orig_pte); 2941 pgoff = pte_to_pgoff(orig_pte);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..030ce8a5bb0e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,8 @@
26#include <linux/migrate.h> 26#include <linux/migrate.h>
27#include <linux/page-isolation.h> 27#include <linux/page-isolation.h>
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h>
30#include <linux/mm_inline.h>
29 31
30#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
31 33
@@ -70,7 +72,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
70 atomic_inc(&page->_count); 72 atomic_inc(&page->_count);
71} 73}
72 74
73void put_page_bootmem(struct page *page) 75/* reference to __meminit __free_pages_bootmem is valid
76 * so use __ref to tell modpost not to generate a warning */
77void __ref put_page_bootmem(struct page *page)
74{ 78{
75 int type; 79 int type;
76 80
@@ -447,7 +451,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
447} 451}
448#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 452#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
449 453
450static pg_data_t *hotadd_new_pgdat(int nid, u64 start) 454/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
455static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
451{ 456{
452 struct pglist_data *pgdat; 457 struct pglist_data *pgdat;
453 unsigned long zones_size[MAX_NR_ZONES] = {0}; 458 unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +489,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
484 struct resource *res; 489 struct resource *res;
485 int ret; 490 int ret;
486 491
492 lock_system_sleep();
493
487 res = register_memory_resource(start, size); 494 res = register_memory_resource(start, size);
495 ret = -EEXIST;
488 if (!res) 496 if (!res)
489 return -EEXIST; 497 goto out;
490 498
491 if (!node_online(nid)) { 499 if (!node_online(nid)) {
492 pgdat = hotadd_new_pgdat(nid, start); 500 pgdat = hotadd_new_pgdat(nid, start);
501 ret = -ENOMEM;
493 if (!pgdat) 502 if (!pgdat)
494 return -ENOMEM; 503 goto out;
495 new_pgdat = 1; 504 new_pgdat = 1;
496 } 505 }
497 506
@@ -514,7 +523,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
514 BUG_ON(ret); 523 BUG_ON(ret);
515 } 524 }
516 525
517 return ret; 526 goto out;
527
518error: 528error:
519 /* rollback pgdat allocation and others */ 529 /* rollback pgdat allocation and others */
520 if (new_pgdat) 530 if (new_pgdat)
@@ -522,6 +532,8 @@ error:
522 if (res) 532 if (res)
523 release_memory_resource(res); 533 release_memory_resource(res);
524 534
535out:
536 unlock_system_sleep();
525 return ret; 537 return ret;
526} 538}
527EXPORT_SYMBOL_GPL(add_memory); 539EXPORT_SYMBOL_GPL(add_memory);
@@ -663,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
663 if (!ret) { /* Success */ 675 if (!ret) { /* Success */
664 list_add_tail(&page->lru, &source); 676 list_add_tail(&page->lru, &source);
665 move_pages--; 677 move_pages--;
678 inc_zone_page_state(page, NR_ISOLATED_ANON +
679 page_is_file_cache(page));
680
666 } else { 681 } else {
667 /* Becasue we don't have big zone->lock. we should 682 /* Becasue we don't have big zone->lock. we should
668 check this again here. */ 683 check this again here. */
@@ -685,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
685 if (list_empty(&source)) 700 if (list_empty(&source))
686 goto out; 701 goto out;
687 /* this function returns # of failed pages */ 702 /* this function returns # of failed pages */
688 ret = migrate_pages(&source, hotremove_migrate_alloc, 0); 703 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
689 704
690out: 705out:
691 return ret; 706 return ret;
@@ -738,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
738 return offlined; 753 return offlined;
739} 754}
740 755
741int offline_pages(unsigned long start_pfn, 756static int offline_pages(unsigned long start_pfn,
742 unsigned long end_pfn, unsigned long timeout) 757 unsigned long end_pfn, unsigned long timeout)
743{ 758{
744 unsigned long pfn, nr_pages, expire; 759 unsigned long pfn, nr_pages, expire;
@@ -758,6 +773,8 @@ int offline_pages(unsigned long start_pfn,
758 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 773 if (!test_pages_in_a_zone(start_pfn, end_pfn))
759 return -EINVAL; 774 return -EINVAL;
760 775
776 lock_system_sleep();
777
761 zone = page_zone(pfn_to_page(start_pfn)); 778 zone = page_zone(pfn_to_page(start_pfn));
762 node = zone_to_nid(zone); 779 node = zone_to_nid(zone);
763 nr_pages = end_pfn - start_pfn; 780 nr_pages = end_pfn - start_pfn;
@@ -765,7 +782,7 @@ int offline_pages(unsigned long start_pfn,
765 /* set above range as isolated */ 782 /* set above range as isolated */
766 ret = start_isolate_page_range(start_pfn, end_pfn); 783 ret = start_isolate_page_range(start_pfn, end_pfn);
767 if (ret) 784 if (ret)
768 return ret; 785 goto out;
769 786
770 arg.start_pfn = start_pfn; 787 arg.start_pfn = start_pfn;
771 arg.nr_pages = nr_pages; 788 arg.nr_pages = nr_pages;
@@ -838,11 +855,16 @@ repeat:
838 855
839 setup_per_zone_wmarks(); 856 setup_per_zone_wmarks();
840 calculate_zone_inactive_ratio(zone); 857 calculate_zone_inactive_ratio(zone);
858 if (!node_present_pages(node)) {
859 node_clear_state(node, N_HIGH_MEMORY);
860 kswapd_stop(node);
861 }
841 862
842 vm_total_pages = nr_free_pagecache_pages(); 863 vm_total_pages = nr_free_pagecache_pages();
843 writeback_set_ratelimit(); 864 writeback_set_ratelimit();
844 865
845 memory_notify(MEM_OFFLINE, &arg); 866 memory_notify(MEM_OFFLINE, &arg);
867 unlock_system_sleep();
846 return 0; 868 return 0;
847 869
848failed_removal: 870failed_removal:
@@ -852,6 +874,8 @@ failed_removal:
852 /* pushback to free area */ 874 /* pushback to free area */
853 undo_isolate_page_range(start_pfn, end_pfn); 875 undo_isolate_page_range(start_pfn, end_pfn);
854 876
877out:
878 unlock_system_sleep();
855 return ret; 879 return ret;
856} 880}
857 881
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7dd9d9f80694..290fb5bf0440 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,10 +85,12 @@
85#include <linux/seq_file.h> 85#include <linux/seq_file.h>
86#include <linux/proc_fs.h> 86#include <linux/proc_fs.h>
87#include <linux/migrate.h> 87#include <linux/migrate.h>
88#include <linux/ksm.h>
88#include <linux/rmap.h> 89#include <linux/rmap.h>
89#include <linux/security.h> 90#include <linux/security.h>
90#include <linux/syscalls.h> 91#include <linux/syscalls.h>
91#include <linux/ctype.h> 92#include <linux/ctype.h>
93#include <linux/mm_inline.h>
92 94
93#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
412 if (!page) 414 if (!page)
413 continue; 415 continue;
414 /* 416 /*
415 * The check for PageReserved here is important to avoid 417 * vm_normal_page() filters out zero pages, but there might
416 * handling zero pages and other pages that may have been 418 * still be PageReserved pages to skip, perhaps in a VDSO.
417 * marked special by the system. 419 * And we cannot move PageKsm pages sensibly or safely yet.
418 *
419 * If the PageReserved would not be checked here then f.e.
420 * the location of the zero page could have an influence
421 * on MPOL_MF_STRICT, zero pages would be counted for
422 * the per node stats, and there would be useless attempts
423 * to put zero pages on the migration list.
424 */ 420 */
425 if (PageReserved(page)) 421 if (PageReserved(page) || PageKsm(page))
426 continue; 422 continue;
427 nid = page_to_nid(page); 423 nid = page_to_nid(page);
428 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 424 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
809 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 805 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
810 if (!isolate_lru_page(page)) { 806 if (!isolate_lru_page(page)) {
811 list_add_tail(&page->lru, pagelist); 807 list_add_tail(&page->lru, pagelist);
808 inc_zone_page_state(page, NR_ISOLATED_ANON +
809 page_is_file_cache(page));
812 } 810 }
813 } 811 }
814} 812}
@@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
836 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 834 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
837 835
838 if (!list_empty(&pagelist)) 836 if (!list_empty(&pagelist))
839 err = migrate_pages(&pagelist, new_node_page, dest); 837 err = migrate_pages(&pagelist, new_node_page, dest, 0);
840 838
841 return err; 839 return err;
842} 840}
@@ -1024,7 +1022,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1024 1022
1025 err = migrate_prep(); 1023 err = migrate_prep();
1026 if (err) 1024 if (err)
1027 return err; 1025 goto mpol_out;
1028 } 1026 }
1029 { 1027 {
1030 NODEMASK_SCRATCH(scratch); 1028 NODEMASK_SCRATCH(scratch);
@@ -1039,10 +1037,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1039 err = -ENOMEM; 1037 err = -ENOMEM;
1040 NODEMASK_SCRATCH_FREE(scratch); 1038 NODEMASK_SCRATCH_FREE(scratch);
1041 } 1039 }
1042 if (err) { 1040 if (err)
1043 mpol_put(new); 1041 goto mpol_out;
1044 return err; 1042
1045 }
1046 vma = check_range(mm, start, end, nmask, 1043 vma = check_range(mm, start, end, nmask,
1047 flags | MPOL_MF_INVERT, &pagelist); 1044 flags | MPOL_MF_INVERT, &pagelist);
1048 1045
@@ -1054,13 +1051,15 @@ static long do_mbind(unsigned long start, unsigned long len,
1054 1051
1055 if (!list_empty(&pagelist)) 1052 if (!list_empty(&pagelist))
1056 nr_failed = migrate_pages(&pagelist, new_vma_page, 1053 nr_failed = migrate_pages(&pagelist, new_vma_page,
1057 (unsigned long)vma); 1054 (unsigned long)vma, 0);
1058 1055
1059 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1056 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1060 err = -EIO; 1057 err = -EIO;
1061 } 1058 } else
1059 putback_lru_pages(&pagelist);
1062 1060
1063 up_write(&mm->mmap_sem); 1061 up_write(&mm->mmap_sem);
1062 mpol_out:
1064 mpol_put(new); 1063 mpol_put(new);
1065 return err; 1064 return err;
1066} 1065}
@@ -1564,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1564 } 1563 }
1565 return zl; 1564 return zl;
1566} 1565}
1566
1567/*
1568 * init_nodemask_of_mempolicy
1569 *
1570 * If the current task's mempolicy is "default" [NULL], return 'false'
1571 * to indicate default policy. Otherwise, extract the policy nodemask
1572 * for 'bind' or 'interleave' policy into the argument nodemask, or
1573 * initialize the argument nodemask to contain the single node for
1574 * 'preferred' or 'local' policy and return 'true' to indicate presence
1575 * of non-default mempolicy.
1576 *
1577 * We don't bother with reference counting the mempolicy [mpol_get/put]
1578 * because the current task is examining its own mempolicy and a task's
1579 * mempolicy is only ever changed by the task itself.
1580 *
1581 * N.B., it is the caller's responsibility to free a returned nodemask.
1582 */
1583bool init_nodemask_of_mempolicy(nodemask_t *mask)
1584{
1585 struct mempolicy *mempolicy;
1586 int nid;
1587
1588 if (!(mask && current->mempolicy))
1589 return false;
1590
1591 mempolicy = current->mempolicy;
1592 switch (mempolicy->mode) {
1593 case MPOL_PREFERRED:
1594 if (mempolicy->flags & MPOL_F_LOCAL)
1595 nid = numa_node_id();
1596 else
1597 nid = mempolicy->v.preferred_node;
1598 init_nodemask_of_node(mask, nid);
1599 break;
1600
1601 case MPOL_BIND:
1602 /* Fall through */
1603 case MPOL_INTERLEAVE:
1604 *mask = mempolicy->v.nodes;
1605 break;
1606
1607 default:
1608 BUG();
1609 }
1610
1611 return true;
1612}
1567#endif 1613#endif
1568 1614
1569/* Allocate a page in interleaved policy. 1615/* Allocate a page in interleaved policy.
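The do_mbind() cleanups above are exercised from userspace via mbind(). A sketch, not from the patch, assuming a NUMA kernel, libnuma's <numaif.h> wrapper, and that node 0 exists:

/* mbind_move.c - not from the patch: a sketch of mbind() driving do_mbind().
 * Build with: gcc mbind_move.c -lnuma   (assumes <numaif.h> from libnuma)
 * Node 0 is an illustrative target; requires a NUMA kernel.
 */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (4 * 1024 * 1024)

int main(void)
{
    unsigned long nodemask = 1UL;    /* bit 0 set: node 0 only */
    char *p;

    p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(p, 0xaa, LEN);    /* fault the pages in wherever they land */

    /* MPOL_MF_MOVE asks the kernel to migrate already-faulted pages, so this
     * walks check_range() and migrate_pages(..., 0) as patched above. */
    if (mbind(p, LEN, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
              MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
        perror("mbind");
    else
        puts("range bound to node 0 and migrated");

    munmap(p, LEN);
    return 0;
}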
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..efddbf0926b2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,6 +21,7 @@
21#include <linux/mm_inline.h> 21#include <linux/mm_inline.h>
22#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
23#include <linux/pagevec.h> 23#include <linux/pagevec.h>
24#include <linux/ksm.h>
24#include <linux/rmap.h> 25#include <linux/rmap.h>
25#include <linux/topology.h> 26#include <linux/topology.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l)
78/* 79/*
79 * Restore a potential migration pte to a working pte entry 80 * Restore a potential migration pte to a working pte entry
80 */ 81 */
81static void remove_migration_pte(struct vm_area_struct *vma, 82static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
82 struct page *old, struct page *new) 83 unsigned long addr, void *old)
83{ 84{
84 struct mm_struct *mm = vma->vm_mm; 85 struct mm_struct *mm = vma->vm_mm;
85 swp_entry_t entry; 86 swp_entry_t entry;
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma,
88 pmd_t *pmd; 89 pmd_t *pmd;
89 pte_t *ptep, pte; 90 pte_t *ptep, pte;
90 spinlock_t *ptl; 91 spinlock_t *ptl;
91 unsigned long addr = page_address_in_vma(new, vma);
92
93 if (addr == -EFAULT)
94 return;
95 92
96 pgd = pgd_offset(mm, addr); 93 pgd = pgd_offset(mm, addr);
97 if (!pgd_present(*pgd)) 94 if (!pgd_present(*pgd))
98 return; 95 goto out;
99 96
100 pud = pud_offset(pgd, addr); 97 pud = pud_offset(pgd, addr);
101 if (!pud_present(*pud)) 98 if (!pud_present(*pud))
102 return; 99 goto out;
103 100
104 pmd = pmd_offset(pud, addr); 101 pmd = pmd_offset(pud, addr);
105 if (!pmd_present(*pmd)) 102 if (!pmd_present(*pmd))
106 return; 103 goto out;
107 104
108 ptep = pte_offset_map(pmd, addr); 105 ptep = pte_offset_map(pmd, addr);
109 106
110 if (!is_swap_pte(*ptep)) { 107 if (!is_swap_pte(*ptep)) {
111 pte_unmap(ptep); 108 pte_unmap(ptep);
112 return; 109 goto out;
113 } 110 }
114 111
115 ptl = pte_lockptr(mm, pmd); 112 ptl = pte_lockptr(mm, pmd);
116 spin_lock(ptl); 113 spin_lock(ptl);
117 pte = *ptep; 114 pte = *ptep;
118 if (!is_swap_pte(pte)) 115 if (!is_swap_pte(pte))
119 goto out; 116 goto unlock;
120 117
121 entry = pte_to_swp_entry(pte); 118 entry = pte_to_swp_entry(pte);
122 119
123 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 120 if (!is_migration_entry(entry) ||
124 goto out; 121 migration_entry_to_page(entry) != old)
122 goto unlock;
125 123
126 get_page(new); 124 get_page(new);
127 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 125 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
@@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma,
137 135
138 /* No need to invalidate - it was non-present before */ 136 /* No need to invalidate - it was non-present before */
139 update_mmu_cache(vma, addr, pte); 137 update_mmu_cache(vma, addr, pte);
140 138unlock:
141out:
142 pte_unmap_unlock(ptep, ptl); 139 pte_unmap_unlock(ptep, ptl);
143} 140out:
144 141 return SWAP_AGAIN;
145/*
146 * Note that remove_file_migration_ptes will only work on regular mappings,
147 * Nonlinear mappings do not use migration entries.
148 */
149static void remove_file_migration_ptes(struct page *old, struct page *new)
150{
151 struct vm_area_struct *vma;
152 struct address_space *mapping = new->mapping;
153 struct prio_tree_iter iter;
154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
155
156 if (!mapping)
157 return;
158
159 spin_lock(&mapping->i_mmap_lock);
160
161 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
162 remove_migration_pte(vma, old, new);
163
164 spin_unlock(&mapping->i_mmap_lock);
165}
166
167/*
168 * Must hold mmap_sem lock on at least one of the vmas containing
169 * the page so that the anon_vma cannot vanish.
170 */
171static void remove_anon_migration_ptes(struct page *old, struct page *new)
172{
173 struct anon_vma *anon_vma;
174 struct vm_area_struct *vma;
175 unsigned long mapping;
176
177 mapping = (unsigned long)new->mapping;
178
179 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
180 return;
181
182 /*
183 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
184 */
185 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
186 spin_lock(&anon_vma->lock);
187
188 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
189 remove_migration_pte(vma, old, new);
190
191 spin_unlock(&anon_vma->lock);
192} 142}
193 143
194/* 144/*
@@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
197 */ 147 */
198static void remove_migration_ptes(struct page *old, struct page *new) 148static void remove_migration_ptes(struct page *old, struct page *new)
199{ 149{
200 if (PageAnon(new)) 150 rmap_walk(new, remove_migration_pte, old);
201 remove_anon_migration_ptes(old, new);
202 else
203 remove_file_migration_ptes(old, new);
204} 151}
205 152
206/* 153/*
@@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
341 if (TestClearPageActive(page)) { 288 if (TestClearPageActive(page)) {
342 VM_BUG_ON(PageUnevictable(page)); 289 VM_BUG_ON(PageUnevictable(page));
343 SetPageActive(newpage); 290 SetPageActive(newpage);
344 } else 291 } else if (TestClearPageUnevictable(page))
345 unevictable_migrate_page(newpage, page); 292 SetPageUnevictable(newpage);
346 if (PageChecked(page)) 293 if (PageChecked(page))
347 SetPageChecked(newpage); 294 SetPageChecked(newpage);
348 if (PageMappedToDisk(page)) 295 if (PageMappedToDisk(page))
@@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
361 } 308 }
362 309
363 mlock_migrate_page(newpage, page); 310 mlock_migrate_page(newpage, page);
311 ksm_migrate_page(newpage, page);
364 312
365 ClearPageSwapCache(page); 313 ClearPageSwapCache(page);
366 ClearPagePrivate(page); 314 ClearPagePrivate(page);
@@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page)
580 else 528 else
581 rc = fallback_migrate_page(mapping, newpage, page); 529 rc = fallback_migrate_page(mapping, newpage, page);
582 530
583 if (!rc) { 531 if (!rc)
584 remove_migration_ptes(page, newpage); 532 remove_migration_ptes(page, newpage);
585 } else 533 else
586 newpage->mapping = NULL; 534 newpage->mapping = NULL;
587 535
588 unlock_page(newpage); 536 unlock_page(newpage);
@@ -595,14 +543,14 @@ static int move_to_new_page(struct page *newpage, struct page *page)
595 * to the newly allocated page in newpage. 543 * to the newly allocated page in newpage.
596 */ 544 */
597static int unmap_and_move(new_page_t get_new_page, unsigned long private, 545static int unmap_and_move(new_page_t get_new_page, unsigned long private,
598 struct page *page, int force) 546 struct page *page, int force, int offlining)
599{ 547{
600 int rc = 0; 548 int rc = 0;
601 int *result = NULL; 549 int *result = NULL;
602 struct page *newpage = get_new_page(page, private, &result); 550 struct page *newpage = get_new_page(page, private, &result);
603 int rcu_locked = 0; 551 int rcu_locked = 0;
604 int charge = 0; 552 int charge = 0;
605 struct mem_cgroup *mem; 553 struct mem_cgroup *mem = NULL;
606 554
607 if (!newpage) 555 if (!newpage)
608 return -ENOMEM; 556 return -ENOMEM;
@@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
621 lock_page(page); 569 lock_page(page);
622 } 570 }
623 571
572 /*
573 * Only memory hotplug's offline_pages() caller has locked out KSM,
574 * and can safely migrate a KSM page. The other cases have skipped
575 * PageKsm along with PageReserved - but it is only now when we have
576 * the page lock that we can be certain it will not go KSM beneath us
577 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
578 * its pagecount raised, but only here do we take the page lock which
579 * serializes that).
580 */
581 if (PageKsm(page) && !offlining) {
582 rc = -EBUSY;
583 goto unlock;
584 }
585
624 /* charge against new page */ 586 /* charge against new page */
625 charge = mem_cgroup_prepare_migration(page, &mem); 587 charge = mem_cgroup_prepare_migration(page, &mem);
626 if (charge == -ENOMEM) { 588 if (charge == -ENOMEM) {
@@ -737,7 +699,7 @@ move_newpage:
737 * Return: Number of pages not migrated or error code. 699 * Return: Number of pages not migrated or error code.
738 */ 700 */
739int migrate_pages(struct list_head *from, 701int migrate_pages(struct list_head *from,
740 new_page_t get_new_page, unsigned long private) 702 new_page_t get_new_page, unsigned long private, int offlining)
741{ 703{
742 int retry = 1; 704 int retry = 1;
743 int nr_failed = 0; 705 int nr_failed = 0;
@@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from,
746 struct page *page2; 708 struct page *page2;
747 int swapwrite = current->flags & PF_SWAPWRITE; 709 int swapwrite = current->flags & PF_SWAPWRITE;
748 int rc; 710 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
756 711
757 if (!swapwrite) 712 if (!swapwrite)
758 current->flags |= PF_SWAPWRITE; 713 current->flags |= PF_SWAPWRITE;
@@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from,
764 cond_resched(); 719 cond_resched();
765 720
766 rc = unmap_and_move(get_new_page, private, 721 rc = unmap_and_move(get_new_page, private,
767 page, pass > 2); 722 page, pass > 2, offlining);
768 723
769 switch(rc) { 724 switch(rc) {
770 case -ENOMEM: 725 case -ENOMEM:
@@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
860 if (!page) 815 if (!page)
861 goto set_status; 816 goto set_status;
862 817
863 if (PageReserved(page)) /* Check for zero page */ 818 /* Use PageReserved to check for zero page */
819 if (PageReserved(page) || PageKsm(page))
864 goto put_and_set; 820 goto put_and_set;
865 821
866 pp->page = page; 822 pp->page = page;
@@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
878 goto put_and_set; 834 goto put_and_set;
879 835
880 err = isolate_lru_page(page); 836 err = isolate_lru_page(page);
881 if (!err) 837 if (!err) {
882 list_add_tail(&page->lru, &pagelist); 838 list_add_tail(&page->lru, &pagelist);
839 inc_zone_page_state(page, NR_ISOLATED_ANON +
840 page_is_file_cache(page));
841 }
883put_and_set: 842put_and_set:
884 /* 843 /*
885 * Either remove the duplicate refcount from 844 * Either remove the duplicate refcount from
@@ -894,7 +853,7 @@ set_status:
894 err = 0; 853 err = 0;
895 if (!list_empty(&pagelist)) 854 if (!list_empty(&pagelist))
896 err = migrate_pages(&pagelist, new_page_node, 855 err = migrate_pages(&pagelist, new_page_node,
897 (unsigned long)pm); 856 (unsigned long)pm, 0);
898 857
899 up_read(&mm->mmap_sem); 858 up_read(&mm->mmap_sem);
900 return err; 859 return err;
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1015 974
1016 err = -ENOENT; 975 err = -ENOENT;
1017 /* Use PageReserved to check for zero page */ 976 /* Use PageReserved to check for zero page */
1018 if (!page || PageReserved(page)) 977 if (!page || PageReserved(page) || PageKsm(page))
1019 goto set_status; 978 goto set_status;
1020 979
1021 err = page_to_nid(page); 980 err = page_to_nid(page);
@@ -1044,7 +1003,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1044 int err; 1003 int err;
1045 1004
1046 for (i = 0; i < nr_pages; i += chunk_nr) { 1005 for (i = 0; i < nr_pages; i += chunk_nr) {
1047 if (chunk_nr + i > nr_pages) 1006 if (chunk_nr > nr_pages - i)
1048 chunk_nr = nr_pages - i; 1007 chunk_nr = nr_pages - i;
1049 1008
1050 err = copy_from_user(chunk_pages, &pages[i], 1009 err = copy_from_user(chunk_pages, &pages[i],
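do_move_page_to_node_array() and do_pages_stat() above back the move_pages() syscall. A hedged userspace sketch of both modes (status query and migration), again assuming a NUMA kernel and libnuma's <numaif.h>:

/* move_pages_demo.c - not from the patch: a sketch of move_pages(), which
 * reaches do_pages_stat() (nodes == NULL) or do_move_page_to_node_array().
 * Build with: gcc move_pages_demo.c -lnuma   (assumes <numaif.h> from libnuma)
 */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NPAGES 4

int main(void)
{
    long page_size = sysconf(_SC_PAGESIZE);
    void *pages[NPAGES];
    int nodes[NPAGES], status[NPAGES];
    char *buf = malloc(NPAGES * page_size);
    long rc;
    int i;

    if (!buf)
        return 1;
    memset(buf, 1, NPAGES * page_size);    /* fault the pages in */

    for (i = 0; i < NPAGES; i++) {
        pages[i] = buf + i * page_size;
        nodes[i] = 0;                      /* illustrative target node */
    }

    /* nodes == NULL: no migration, just report each page's current node. */
    rc = move_pages(0, NPAGES, pages, NULL, status, 0);
    if (rc < 0)
        perror("move_pages(query)");
    else
        for (i = 0; i < NPAGES; i++)
            printf("page %d currently on node %d\n", i, status[i]);

    /* With a node array the kernel isolates the pages and calls
     * migrate_pages(..., 0), i.e. the non-offlining case above. */
    rc = move_pages(0, NPAGES, pages, nodes, status, MPOL_MF_MOVE);
    if (rc < 0)
        perror("move_pages(move)");
    else
        for (i = 0; i < NPAGES; i++)
            printf("page %d status after move: %d\n", i, status[i]);

    free(buf);
    return 0;
}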
diff --git a/mm/mincore.c b/mm/mincore.c
index 8cb508f84ea4..7a3436ef39eb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,6 +14,7 @@
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/swapops.h> 16#include <linux/swapops.h>
17#include <linux/hugetlb.h>
17 18
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
72 if (!vma || addr < vma->vm_start) 73 if (!vma || addr < vma->vm_start)
73 return -ENOMEM; 74 return -ENOMEM;
74 75
76#ifdef CONFIG_HUGETLB_PAGE
77 if (is_vm_hugetlb_page(vma)) {
78 struct hstate *h;
79 unsigned long nr_huge;
80 unsigned char present;
81
82 i = 0;
83 nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
84 h = hstate_vma(vma);
85 nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
86 - (addr >> huge_page_shift(h)) + 1;
87 nr_huge = min(nr_huge,
88 (vma->vm_end - addr) >> huge_page_shift(h));
89 while (1) {
90 /* hugepages are always in RAM for now,
91 * but in general this needs to be checked */
92 ptep = huge_pte_offset(current->mm,
93 addr & huge_page_mask(h));
94 present = !!(ptep &&
95 !huge_pte_none(huge_ptep_get(ptep)));
96 while (1) {
97 vec[i++] = present;
98 addr += PAGE_SIZE;
99 /* reach buffer limit */
100 if (i == nr)
101 return nr;
102 /* check hugepage border */
103 if (!((addr & ~huge_page_mask(h))
104 >> PAGE_SHIFT))
105 break;
106 }
107 }
108 return nr;
109 }
110#endif
111
75 /* 112 /*
76 * Calculate how many pages there are left in the last level of the 113 * Calculate how many pages there are left in the last level of the
77 * PTE array for our address. 114 * PTE array for our address.
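With the hugetlb branch added to do_mincore() above, mincore() reports residency for huge-page mappings too. A sketch under the assumptions that huge pages are reserved (vm.nr_hugepages > 0) and that anonymous MAP_HUGETLB is available; the fallback flag value is the asm-generic one:

/* mincore_huge.c - not from the patch: mincore() over an anonymous huge-page
 * mapping, exercising the hugetlb branch added above. Assumes reserved huge
 * pages; the MAP_HUGETLB fallback is the asm-generic value for older headers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif

#define LEN (2 * 1024 * 1024)    /* one 2MB huge page on x86; illustrative */

int main(void)
{
    long page_size = sysconf(_SC_PAGESIZE);
    size_t nvec = (LEN + page_size - 1) / page_size;
    unsigned char *vec = malloc(nvec);
    size_t i, resident = 0;
    char *p;

    if (!vec)
        return 1;
    p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap(MAP_HUGETLB)");
        return 1;
    }
    memset(p, 0, LEN);    /* fault the huge page in */

    /* vec stays at small-page granularity; the new branch marks every small
     * page that falls inside a present huge page. */
    if (mincore(p, LEN, vec) != 0) {
        perror("mincore");
        return 1;
    }
    for (i = 0; i < nvec; i++)
        resident += vec[i] & 1;
    printf("%zu of %zu pages reported resident\n", resident, nvec);

    munmap(p, LEN);
    free(vec);
    return 0;
}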
diff --git a/mm/mlock.c b/mm/mlock.c
index bd6f0e466f6c..2b8335a89400 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page)
88 } 88 }
89} 89}
90 90
91/* 91/**
92 * called from munlock()/munmap() path with page supposedly on the LRU. 92 * munlock_vma_page - munlock a vma page
93 * @page: page to be unlocked
93 * 94 *
94 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked 95 * called from munlock()/munmap() path with page supposedly on the LRU.
95 * [in try_to_munlock()] and then attempt to isolate the page. We must 96 * When we munlock a page, because the vma where we found the page is being
96 * isolate the page to keep others from messing with its unevictable 97 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
97 * and mlocked state while trying to munlock. However, we pre-clear the 98 * page locked so that we can leave it on the unevictable lru list and not
98 * mlocked state anyway as we might lose the isolation race and we might 99 * bother vmscan with it. However, to walk the page's rmap list in
99 * not get another chance to clear PageMlocked. If we successfully 100 * try_to_munlock() we must isolate the page from the LRU. If some other
100 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas 101 * task has removed the page from the LRU, we won't be able to do that.
101 * mapping the page, it will restore the PageMlocked state, unless the page 102 * So we clear the PageMlocked as we might not get another chance. If we
102 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), 103 * can't isolate the page, we leave it for putback_lru_page() and vmscan
103 * perhaps redundantly. 104 * [page_referenced()/try_to_unmap()] to deal with.
104 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
105 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
106 * either of which will restore the PageMlocked state by calling
107 * mlock_vma_page() above, if it can grab the vma's mmap sem.
108 */ 105 */
109static void munlock_vma_page(struct page *page) 106void munlock_vma_page(struct page *page)
110{ 107{
111 BUG_ON(!PageLocked(page)); 108 BUG_ON(!PageLocked(page));
112 109
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page)
117 /* 114 /*
118 * did try_to_munlock() succeed or punt? 115 * did try_to_munlock() succeed or punt?
119 */ 116 */
120 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) 117 if (ret != SWAP_MLOCK)
121 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 118 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
122 119
123 putback_lru_page(page); 120 putback_lru_page(page);
124 } else { 121 } else {
125 /* 122 /*
126 * We lost the race. let try_to_unmap() deal 123 * Some other task has removed the page from the LRU.
127 * with it. At least we get the page state and 124 * putback_lru_page() will take care of removing the
128 * mlock stats right. However, page is still on 125 * page from the unevictable list, if necessary.
129 * the noreclaim list. We'll fix that up when 126 * vmscan [page_referenced()] will move the page back
130 * the page is eventually freed or we scan the 127 * to the unevictable list if some other vma has it
131 * noreclaim list. 128 * mlocked.
132 */ 129 */
133 if (PageUnevictable(page)) 130 if (PageUnevictable(page))
134 count_vm_event(UNEVICTABLE_PGSTRANDED); 131 count_vm_event(UNEVICTABLE_PGSTRANDED);
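The rewritten munlock_vma_page() comment describes a page that stays mlocked because another vma still locks it. A userspace sketch of that situation, assuming RLIMIT_MEMLOCK allows locking one page; the temp-file path is illustrative:

/* double_mlock.c - not from the patch: one shared page mapped by two vmas,
 * both mlocked; munlocking one leaves the page mlocked via the other, which
 * is the case the munlock_vma_page() comment above describes.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page_size = sysconf(_SC_PAGESIZE);
    char path[] = "/tmp/mlock-demo-XXXXXX";    /* illustrative location */
    int fd = mkstemp(path);
    char *a, *b;

    if (fd < 0 || ftruncate(fd, page_size) != 0) {
        perror("tempfile");
        return 1;
    }
    unlink(path);

    a = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    b = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (a == MAP_FAILED || b == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    a[0] = 'x';    /* fault the shared page in */
    if (mlock(a, page_size) != 0 || mlock(b, page_size) != 0)
        perror("mlock");

    /* munlock() here runs munlock_vma_page() on the page; try_to_munlock()
     * finds the still-locked 'a' vma, so the page stays unevictable. */
    munlock(b, page_size);

    munlock(a, page_size);    /* last locker gone; page can be reclaimed again */
    munmap(a, page_size);
    munmap(b, page_size);
    close(fd);
    return 0;
}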
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b64010..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/personality.h> 21#include <linux/personality.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/ima.h>
24#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
25#include <linux/profile.h> 24#include <linux/profile.h>
26#include <linux/module.h> 25#include <linux/module.h>
@@ -932,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
932 if (!(flags & MAP_FIXED)) 931 if (!(flags & MAP_FIXED))
933 addr = round_hint_to_min(addr); 932 addr = round_hint_to_min(addr);
934 933
935 error = arch_mmap_check(addr, len, flags);
936 if (error)
937 return error;
938
939 /* Careful about overflows.. */ 934 /* Careful about overflows.. */
940 len = PAGE_ALIGN(len); 935 len = PAGE_ALIGN(len);
941 if (!len || len > TASK_SIZE) 936 if (!len)
942 return -ENOMEM; 937 return -ENOMEM;
943 938
944 /* offset overflow? */ 939 /* offset overflow? */
@@ -949,24 +944,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
949 if (mm->map_count > sysctl_max_map_count) 944 if (mm->map_count > sysctl_max_map_count)
950 return -ENOMEM; 945 return -ENOMEM;
951 946
952 if (flags & MAP_HUGETLB) {
953 struct user_struct *user = NULL;
954 if (file)
955 return -EINVAL;
956
957 /*
958 * VM_NORESERVE is used because the reservations will be
959 * taken when vm_ops->mmap() is called
960 * A dummy user value is used because we are not locking
961 * memory so no accounting is necessary
962 */
963 len = ALIGN(len, huge_page_size(&default_hstate));
964 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
965 &user, HUGETLB_ANONHUGE_INODE);
966 if (IS_ERR(file))
967 return PTR_ERR(file);
968 }
969
970 /* Obtain the address to map to. we verify (or select) it and ensure 947 /* Obtain the address to map to. we verify (or select) it and ensure
971 * that it represents a valid section of the address space. 948 * that it represents a valid section of the address space.
972 */ 949 */
@@ -1061,14 +1038,51 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1061 error = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1038 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1062 if (error) 1039 if (error)
1063 return error; 1040 return error;
1064 error = ima_file_mmap(file, prot);
1065 if (error)
1066 return error;
1067 1041
1068 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1042 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1069} 1043}
1070EXPORT_SYMBOL(do_mmap_pgoff); 1044EXPORT_SYMBOL(do_mmap_pgoff);
1071 1045
1046SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1047 unsigned long, prot, unsigned long, flags,
1048 unsigned long, fd, unsigned long, pgoff)
1049{
1050 struct file *file = NULL;
1051 unsigned long retval = -EBADF;
1052
1053 if (!(flags & MAP_ANONYMOUS)) {
1054 if (unlikely(flags & MAP_HUGETLB))
1055 return -EINVAL;
1056 file = fget(fd);
1057 if (!file)
1058 goto out;
1059 } else if (flags & MAP_HUGETLB) {
1060 struct user_struct *user = NULL;
1061 /*
1062 * VM_NORESERVE is used because the reservations will be
1063 * taken when vm_ops->mmap() is called
1064 * A dummy user value is used because we are not locking
1065 * memory so no accounting is necessary
1066 */
1067 len = ALIGN(len, huge_page_size(&default_hstate));
1068 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1069 &user, HUGETLB_ANONHUGE_INODE);
1070 if (IS_ERR(file))
1071 return PTR_ERR(file);
1072 }
1073
1074 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1075
1076 down_write(&current->mm->mmap_sem);
1077 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1078 up_write(&current->mm->mmap_sem);
1079
1080 if (file)
1081 fput(file);
1082out:
1083 return retval;
1084}
1085
1072/* 1086/*
1073 * Some shared mappings will want the pages marked read-only 1087 * Some shared mappings will want the pages marked read-only
1074 * to track write events. If so, we'll downgrade vm_page_prot 1088 * to track write events. If so, we'll downgrade vm_page_prot
@@ -1224,8 +1238,20 @@ munmap_back:
1224 goto free_vma; 1238 goto free_vma;
1225 } 1239 }
1226 1240
1227 if (vma_wants_writenotify(vma)) 1241 if (vma_wants_writenotify(vma)) {
1242 pgprot_t pprot = vma->vm_page_prot;
1243
1244 /* Can vma->vm_page_prot have changed??
1245 *
1246 * Answer: Yes, drivers may have changed it in their
1247 * f_op->mmap method.
1248 *
1249 * Ensures that vmas marked as uncached stay that way.
1250 */
1228 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1251 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1252 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1253 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1254 }
1229 1255
1230 vma_link(mm, vma, prev, rb_link, rb_parent); 1256 vma_link(mm, vma, prev, rb_link, rb_parent);
1231 file = vma->vm_file; 1257 file = vma->vm_file;
@@ -1459,6 +1485,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1459 unsigned long (*get_area)(struct file *, unsigned long, 1485 unsigned long (*get_area)(struct file *, unsigned long,
1460 unsigned long, unsigned long, unsigned long); 1486 unsigned long, unsigned long, unsigned long);
1461 1487
1488 unsigned long error = arch_mmap_check(addr, len, flags);
1489 if (error)
1490 return error;
1491
1492 /* Careful about overflows.. */
1493 if (len > TASK_SIZE)
1494 return -ENOMEM;
1495
1462 get_area = current->mm->get_unmapped_area; 1496 get_area = current->mm->get_unmapped_area;
1463 if (file && file->f_op && file->f_op->get_unmapped_area) 1497 if (file && file->f_op && file->f_op->get_unmapped_area)
1464 get_area = file->f_op->get_unmapped_area; 1498 get_area = file->f_op->get_unmapped_area;
@@ -1829,10 +1863,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1829} 1863}
1830 1864
1831/* 1865/*
1832 * Split a vma into two pieces at address 'addr', a new vma is allocated 1866 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
1833 * either for the first part or the tail. 1867 * munmap path where it doesn't make sense to fail.
1834 */ 1868 */
1835int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 1869static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1836 unsigned long addr, int new_below) 1870 unsigned long addr, int new_below)
1837{ 1871{
1838 struct mempolicy *pol; 1872 struct mempolicy *pol;
@@ -1842,9 +1876,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1842 ~(huge_page_mask(hstate_vma(vma))))) 1876 ~(huge_page_mask(hstate_vma(vma)))))
1843 return -EINVAL; 1877 return -EINVAL;
1844 1878
1845 if (mm->map_count >= sysctl_max_map_count)
1846 return -ENOMEM;
1847
1848 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1879 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1849 if (!new) 1880 if (!new)
1850 return -ENOMEM; 1881 return -ENOMEM;
@@ -1884,6 +1915,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1884 return 0; 1915 return 0;
1885} 1916}
1886 1917
1918/*
1919 * Split a vma into two pieces at address 'addr', a new vma is allocated
1920 * either for the first part or the tail.
1921 */
1922int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1923 unsigned long addr, int new_below)
1924{
1925 if (mm->map_count >= sysctl_max_map_count)
1926 return -ENOMEM;
1927
1928 return __split_vma(mm, vma, addr, new_below);
1929}
1930
1887/* Munmap is split into 2 main parts -- this part which finds 1931/* Munmap is split into 2 main parts -- this part which finds
1888 * what needs doing, and the areas themselves, which do the 1932 * what needs doing, and the areas themselves, which do the
1889 * work. This now handles partial unmappings. 1933 * work. This now handles partial unmappings.
@@ -1919,7 +1963,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1919 * places tmp vma above, and higher split_vma places tmp vma below. 1963 * places tmp vma above, and higher split_vma places tmp vma below.
1920 */ 1964 */
1921 if (start > vma->vm_start) { 1965 if (start > vma->vm_start) {
1922 int error = split_vma(mm, vma, start, 0); 1966 int error;
1967
1968 /*
1969 * Make sure that map_count on return from munmap() will
1970 * not exceed its limit; but let map_count go just above
1971 * its limit temporarily, to help free resources as expected.
1972 */
1973 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
1974 return -ENOMEM;
1975
1976 error = __split_vma(mm, vma, start, 0);
1923 if (error) 1977 if (error)
1924 return error; 1978 return error;
1925 prev = vma; 1979 prev = vma;
@@ -1928,7 +1982,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1928 /* Does it split the last one? */ 1982 /* Does it split the last one? */
1929 last = find_vma(mm, end); 1983 last = find_vma(mm, end);
1930 if (last && end > last->vm_start) { 1984 if (last && end > last->vm_start) {
1931 int error = split_vma(mm, last, end, 1); 1985 int error = __split_vma(mm, last, end, 1);
1932 if (error) 1986 if (error)
1933 return error; 1987 return error;
1934 } 1988 }
@@ -2003,20 +2057,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2003 if (!len) 2057 if (!len)
2004 return addr; 2058 return addr;
2005 2059
2006 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
2007 return -EINVAL;
2008
2009 if (is_hugepage_only_range(mm, addr, len))
2010 return -EINVAL;
2011
2012 error = security_file_mmap(NULL, 0, 0, 0, addr, 1); 2060 error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
2013 if (error) 2061 if (error)
2014 return error; 2062 return error;
2015 2063
2016 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2064 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2017 2065
2018 error = arch_mmap_check(addr, len, flags); 2066 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2019 if (error) 2067 if (error & ~PAGE_MASK)
2020 return error; 2068 return error;
2021 2069
2022 /* 2070 /*
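do_munmap() above now splits vmas through __split_vma() and only enforces sysctl_max_map_count when punching a hole would add a vma. Not from the patch, but a sketch of exactly that case, watching the vma count through /proc/self/maps:

/* punch_hole.c - not from the patch: munmap() the middle of one mapping,
 * which splits the vma around the hole via do_munmap()/__split_vma().
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static int count_vmas(void)
{
    char line[512];
    int n = 0;
    FILE *f = fopen("/proc/self/maps", "r");

    if (!f)
        return -1;
    while (fgets(line, sizeof(line), f))
        n++;
    fclose(f);
    return n;
}

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    char *p = mmap(NULL, 8 * page, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("vmas before: %d\n", count_vmas());

    /* Unmap pages 3-4 of the 8-page mapping: start > vm_start and
     * end < vm_end, so the kernel has to split the vma around the hole. */
    if (munmap(p + 3 * page, 2 * page) != 0)
        perror("munmap");

    printf("vmas after:  %d\n", count_vmas());

    munmap(p, 3 * page);
    munmap(p + 5 * page, 3 * page);
    return 0;
}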
diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff2547719..845190898d59 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma,
261 return new_addr; 261 return new_addr;
262} 262}
263 263
264static struct vm_area_struct *vma_to_resize(unsigned long addr,
265 unsigned long old_len, unsigned long new_len, unsigned long *p)
266{
267 struct mm_struct *mm = current->mm;
268 struct vm_area_struct *vma = find_vma(mm, addr);
269
270 if (!vma || vma->vm_start > addr)
271 goto Efault;
272
273 if (is_vm_hugetlb_page(vma))
274 goto Einval;
275
276 /* We can't remap across vm area boundaries */
277 if (old_len > vma->vm_end - addr)
278 goto Efault;
279
280 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
281 if (new_len > old_len)
282 goto Efault;
283 }
284
285 if (vma->vm_flags & VM_LOCKED) {
286 unsigned long locked, lock_limit;
287 locked = mm->locked_vm << PAGE_SHIFT;
288 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
289 locked += new_len - old_len;
290 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
291 goto Eagain;
292 }
293
294 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
295 goto Enomem;
296
297 if (vma->vm_flags & VM_ACCOUNT) {
298 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
299 if (security_vm_enough_memory(charged))
300 goto Efault;
301 *p = charged;
302 }
303
304 return vma;
305
306Efault: /* very odd choice for most of the cases, but... */
307 return ERR_PTR(-EFAULT);
308Einval:
309 return ERR_PTR(-EINVAL);
310Enomem:
311 return ERR_PTR(-ENOMEM);
312Eagain:
313 return ERR_PTR(-EAGAIN);
314}
315
316static unsigned long mremap_to(unsigned long addr,
317 unsigned long old_len, unsigned long new_addr,
318 unsigned long new_len)
319{
320 struct mm_struct *mm = current->mm;
321 struct vm_area_struct *vma;
322 unsigned long ret = -EINVAL;
323 unsigned long charged = 0;
324 unsigned long map_flags;
325
326 if (new_addr & ~PAGE_MASK)
327 goto out;
328
329 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
330 goto out;
331
332 /* Check if the location we're moving into overlaps the
333 * old location at all, and fail if it does.
334 */
335 if ((new_addr <= addr) && (new_addr+new_len) > addr)
336 goto out;
337
338 if ((addr <= new_addr) && (addr+old_len) > new_addr)
339 goto out;
340
341 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
342 if (ret)
343 goto out;
344
345 ret = do_munmap(mm, new_addr, new_len);
346 if (ret)
347 goto out;
348
349 if (old_len >= new_len) {
350 ret = do_munmap(mm, addr+new_len, old_len - new_len);
351 if (ret && old_len != new_len)
352 goto out;
353 old_len = new_len;
354 }
355
356 vma = vma_to_resize(addr, old_len, new_len, &charged);
357 if (IS_ERR(vma)) {
358 ret = PTR_ERR(vma);
359 goto out;
360 }
361
362 map_flags = MAP_FIXED;
363 if (vma->vm_flags & VM_MAYSHARE)
364 map_flags |= MAP_SHARED;
365
366 ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
367 ((addr - vma->vm_start) >> PAGE_SHIFT),
368 map_flags);
369 if (ret & ~PAGE_MASK)
370 goto out1;
371
372 ret = move_vma(vma, addr, old_len, new_len, new_addr);
373 if (!(ret & ~PAGE_MASK))
374 goto out;
375out1:
376 vm_unacct_memory(charged);
377
378out:
379 return ret;
380}
381
382static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
383{
384 unsigned long end = vma->vm_end + delta;
385 if (end < vma->vm_end) /* overflow */
386 return 0;
387 if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
388 return 0;
389 if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
390 0, MAP_FIXED) & ~PAGE_MASK)
391 return 0;
392 return 1;
393}
394
264/* 395/*
265 * Expand (or shrink) an existing mapping, potentially moving it at the 396 * Expand (or shrink) an existing mapping, potentially moving it at the
266 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 397 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr,
294 if (!new_len) 425 if (!new_len)
295 goto out; 426 goto out;
296 427
297 /* new_addr is only valid if MREMAP_FIXED is specified */
298 if (flags & MREMAP_FIXED) { 428 if (flags & MREMAP_FIXED) {
299 if (new_addr & ~PAGE_MASK) 429 if (flags & MREMAP_MAYMOVE)
300 goto out; 430 ret = mremap_to(addr, old_len, new_addr, new_len);
301 if (!(flags & MREMAP_MAYMOVE)) 431 goto out;
302 goto out;
303
304 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
305 goto out;
306
307 /* Check if the location we're moving into overlaps the
308 * old location at all, and fail if it does.
309 */
310 if ((new_addr <= addr) && (new_addr+new_len) > addr)
311 goto out;
312
313 if ((addr <= new_addr) && (addr+old_len) > new_addr)
314 goto out;
315
316 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
317 if (ret)
318 goto out;
319
320 ret = do_munmap(mm, new_addr, new_len);
321 if (ret)
322 goto out;
323 } 432 }
324 433
325 /* 434 /*
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr,
332 if (ret && old_len != new_len) 441 if (ret && old_len != new_len)
333 goto out; 442 goto out;
334 ret = addr; 443 ret = addr;
335 if (!(flags & MREMAP_FIXED) || (new_addr == addr)) 444 goto out;
336 goto out;
337 old_len = new_len;
338 } 445 }
339 446
340 /* 447 /*
341 * Ok, we need to grow.. or relocate. 448 * Ok, we need to grow..
342 */ 449 */
343 ret = -EFAULT; 450 vma = vma_to_resize(addr, old_len, new_len, &charged);
344 vma = find_vma(mm, addr); 451 if (IS_ERR(vma)) {
345 if (!vma || vma->vm_start > addr) 452 ret = PTR_ERR(vma);
346 goto out;
347 if (is_vm_hugetlb_page(vma)) {
348 ret = -EINVAL;
349 goto out;
350 }
351 /* We can't remap across vm area boundaries */
352 if (old_len > vma->vm_end - addr)
353 goto out;
354 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
355 if (new_len > old_len)
356 goto out;
357 }
358 if (vma->vm_flags & VM_LOCKED) {
359 unsigned long locked, lock_limit;
360 locked = mm->locked_vm << PAGE_SHIFT;
361 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
362 locked += new_len - old_len;
363 ret = -EAGAIN;
364 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
365 goto out;
366 }
367 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
368 ret = -ENOMEM;
369 goto out; 453 goto out;
370 } 454 }
371 455
372 if (vma->vm_flags & VM_ACCOUNT) {
373 charged = (new_len - old_len) >> PAGE_SHIFT;
374 if (security_vm_enough_memory(charged))
375 goto out_nc;
376 }
377
378 /* old_len exactly to the end of the area.. 456 /* old_len exactly to the end of the area..
379 * And we're not relocating the area.
380 */ 457 */
381 if (old_len == vma->vm_end - addr && 458 if (old_len == vma->vm_end - addr) {
382 !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
383 (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
384 unsigned long max_addr = TASK_SIZE;
385 if (vma->vm_next)
386 max_addr = vma->vm_next->vm_start;
387 /* can we just expand the current mapping? */ 459 /* can we just expand the current mapping? */
388 if (max_addr - addr >= new_len) { 460 if (vma_expandable(vma, new_len - old_len)) {
389 int pages = (new_len - old_len) >> PAGE_SHIFT; 461 int pages = (new_len - old_len) >> PAGE_SHIFT;
390 462
391 vma_adjust(vma, vma->vm_start, 463 vma_adjust(vma, vma->vm_start,
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr,
409 */ 481 */
410 ret = -ENOMEM; 482 ret = -ENOMEM;
411 if (flags & MREMAP_MAYMOVE) { 483 if (flags & MREMAP_MAYMOVE) {
412 if (!(flags & MREMAP_FIXED)) { 484 unsigned long map_flags = 0;
413 unsigned long map_flags = 0; 485 if (vma->vm_flags & VM_MAYSHARE)
414 if (vma->vm_flags & VM_MAYSHARE) 486 map_flags |= MAP_SHARED;
415 map_flags |= MAP_SHARED; 487
416 488 new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
417 new_addr = get_unmapped_area(vma->vm_file, 0, new_len, 489 vma->vm_pgoff +
418 vma->vm_pgoff, map_flags); 490 ((addr - vma->vm_start) >> PAGE_SHIFT),
419 if (new_addr & ~PAGE_MASK) { 491 map_flags);
420 ret = new_addr; 492 if (new_addr & ~PAGE_MASK) {
421 goto out; 493 ret = new_addr;
422 } 494 goto out;
423
424 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
425 if (ret)
426 goto out;
427 } 495 }
496
497 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
498 if (ret)
499 goto out;
428 ret = move_vma(vma, addr, old_len, new_len, new_addr); 500 ret = move_vma(vma, addr, old_len, new_len, new_addr);
429 } 501 }
430out: 502out:
431 if (ret & ~PAGE_MASK) 503 if (ret & ~PAGE_MASK)
432 vm_unacct_memory(charged); 504 vm_unacct_memory(charged);
433out_nc:
434 return ret; 505 return ret;
435} 506}
436 507
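The new mremap_to() helper carries the MREMAP_FIXED path that used to be open-coded in do_mremap(). A userspace sketch of that path, not from the patch; the destination address is simply whatever a scratch mmap() returned:

/* mremap_fixed.c - not from the patch: move a mapping to a fixed address with
 * MREMAP_MAYMOVE | MREMAP_FIXED, i.e. the mremap_to() path above.
 */
#define _GNU_SOURCE    /* for MREMAP_FIXED */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    size_t len = 4 * page;
    char *src, *dst, *moved;

    src = mmap(NULL, len, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* Reserve a destination range just to obtain an address, then free it. */
    dst = mmap(NULL, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (src == MAP_FAILED || dst == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    strcpy(src, "hello from the old address");
    munmap(dst, len);

    /* mremap_to() munmaps anything still at new_addr, validates the old vma
     * through vma_to_resize(), then calls move_vma(). */
    moved = mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
    if (moved == MAP_FAILED) {
        perror("mremap");
        return 1;
    }
    printf("moved to %p: %s\n", (void *)moved, moved);

    munmap(moved, len);
    return 0;
}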
diff --git a/mm/nommu.c b/mm/nommu.c
index 5189b5aed8c0..6f9248f89bde 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma,
1143 if (ret < rlen) 1143 if (ret < rlen)
1144 memset(base + ret, 0, rlen - ret); 1144 memset(base + ret, 0, rlen - ret);
1145 1145
1146 } else {
1147 /* if it's an anonymous mapping, then just clear it */
1148 memset(base, 0, rlen);
1149 } 1146 }
1150 1147
1151 return 0; 1148 return 0;
@@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file,
1343 goto error_just_free; 1340 goto error_just_free;
1344 add_nommu_region(region); 1341 add_nommu_region(region);
1345 1342
1343 /* clear anonymous mappings that don't ask for uninitialized data */
1344 if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1345 memset((void *)region->vm_start, 0,
1346 region->vm_end - region->vm_start);
1347
1346 /* okay... we have a mapping; now we have to register it */ 1348 /* okay... we have a mapping; now we have to register it */
1347 result = vma->vm_start; 1349 result = vma->vm_start;
1348 1350
@@ -1362,9 +1364,11 @@ share:
1362error_just_free: 1364error_just_free:
1363 up_write(&nommu_region_sem); 1365 up_write(&nommu_region_sem);
1364error: 1366error:
1365 fput(region->vm_file); 1367 if (region->vm_file)
1368 fput(region->vm_file);
1366 kmem_cache_free(vm_region_jar, region); 1369 kmem_cache_free(vm_region_jar, region);
1367 fput(vma->vm_file); 1370 if (vma->vm_file)
1371 fput(vma->vm_file);
1368 if (vma->vm_flags & VM_EXECUTABLE) 1372 if (vma->vm_flags & VM_EXECUTABLE)
1369 removed_exe_file_vma(vma->vm_mm); 1373 removed_exe_file_vma(vma->vm_mm);
1370 kmem_cache_free(vm_area_cachep, vma); 1374 kmem_cache_free(vm_area_cachep, vma);
@@ -1394,6 +1398,31 @@ error_getting_region:
1394} 1398}
1395EXPORT_SYMBOL(do_mmap_pgoff); 1399EXPORT_SYMBOL(do_mmap_pgoff);
1396 1400
1401SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1402 unsigned long, prot, unsigned long, flags,
1403 unsigned long, fd, unsigned long, pgoff)
1404{
1405 struct file *file = NULL;
1406 unsigned long retval = -EBADF;
1407
1408 if (!(flags & MAP_ANONYMOUS)) {
1409 file = fget(fd);
1410 if (!file)
1411 goto out;
1412 }
1413
1414 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1415
1416 down_write(&current->mm->mmap_sem);
1417 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1418 up_write(&current->mm->mmap_sem);
1419
1420 if (file)
1421 fput(file);
1422out:
1423 return retval;
1424}
1425
1397/* 1426/*
1398 * split a vma into two pieces at address 'addr', a new vma is allocated either 1427 * split a vma into two pieces at address 'addr', a new vma is allocated either
1399 * for the first part or the tail. 1428 * for the first part or the tail.
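The nommu change above skips zeroing only when the caller passes MAP_UNINITIALIZED and the kernel is built with CONFIG_MMAP_ALLOW_UNINITIALIZED; on MMU kernels the flag is 0 or ignored and anonymous memory stays zeroed. A heavily hedged sketch of a caller opting in; the fallback value mirrors asm-generic/mman-common.h:

/* uninit_mmap.c - not from the patch: ask a nommu kernel not to zero an
 * anonymous mapping. Only meaningful with CONFIG_MMAP_ALLOW_UNINITIALIZED;
 * elsewhere the flag is 0 (or ignored) and the mapping is zeroed as usual.
 */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0x4000000    /* asm-generic value; 0 on most builds */
#endif

int main(void)
{
    size_t len = 64 * 1024;
    unsigned char *p;

    p = mmap(NULL, len, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* On an opted-in nommu kernel the contents are whatever was previously
     * in that memory; treat them as garbage and initialize explicitly. */
    printf("first byte (uninitialized or zero): %u\n", p[0]);

    munmap(p, len);
    return 0;
}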
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..f52481b1c1e5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
196/* 196/*
197 * Determine the type of allocation constraint. 197 * Determine the type of allocation constraint.
198 */ 198 */
199static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200 gfp_t gfp_mask)
201{
202#ifdef CONFIG_NUMA 199#ifdef CONFIG_NUMA
200static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
201 gfp_t gfp_mask, nodemask_t *nodemask)
202{
203 struct zone *zone; 203 struct zone *zone;
204 struct zoneref *z; 204 struct zoneref *z;
205 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 205 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
206 nodemask_t nodes = node_states[N_HIGH_MEMORY];
207 206
208 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 207 /*
209 if (cpuset_zone_allowed_softwall(zone, gfp_mask)) 208 * We only reach here when __GFP_NOFAIL is used, so we should avoid
210 node_clear(zone_to_nid(zone), nodes); 209 * killing current. We have to kill a random task in this case.
211 else 210 * Ideally this would be CONSTRAINT_THISNODE, but there is no way to handle it yet.
212 return CONSTRAINT_CPUSET; 211 */
212 if (gfp_mask & __GFP_THISNODE)
213 return CONSTRAINT_NONE;
213 214
214 if (!nodes_empty(nodes)) 215 /*
216 * The nodemask here is a nodemask passed to alloc_pages(). Now,
217 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
218 * feature. mempolicy is the only user of the nodemask here.
219 * Check whether mempolicy's nodemask contains all N_HIGH_MEMORY nodes.
220 */
221 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
215 return CONSTRAINT_MEMORY_POLICY; 222 return CONSTRAINT_MEMORY_POLICY;
216#endif
217 223
224 /* Check this allocation failure is caused by cpuset's wall function */
225 for_each_zone_zonelist_nodemask(zone, z, zonelist,
226 high_zoneidx, nodemask)
227 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
228 return CONSTRAINT_CPUSET;
229
230 return CONSTRAINT_NONE;
231}
232#else
233static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
234 gfp_t gfp_mask, nodemask_t *nodemask)
235{
218 return CONSTRAINT_NONE; 236 return CONSTRAINT_NONE;
219} 237}
238#endif
220 239
221/* 240/*
222 * Simple selection loop. We chose the process with the highest 241 * Simple selection loop. We chose the process with the highest
@@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem)
337 } while_each_thread(g, p); 356 } while_each_thread(g, p);
338} 357}
339 358
359static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
360 struct mem_cgroup *mem)
361{
362 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
363 "oom_adj=%d\n",
364 current->comm, gfp_mask, order, current->signal->oom_adj);
365 task_lock(current);
366 cpuset_print_task_mems_allowed(current);
367 task_unlock(current);
368 dump_stack();
369 mem_cgroup_print_oom_info(mem, p);
370 show_mem();
371 if (sysctl_oom_dump_tasks)
372 dump_tasks(mem);
373}
374
375#define K(x) ((x) << (PAGE_SHIFT-10))
376
340/* 377/*
341 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO 378 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
342 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO 379 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
350 return; 387 return;
351 } 388 }
352 389
390 task_lock(p);
353 if (!p->mm) { 391 if (!p->mm) {
354 WARN_ON(1); 392 WARN_ON(1);
355 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 393 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
394 task_pid_nr(p), p->comm);
395 task_unlock(p);
356 return; 396 return;
357 } 397 }
358 398
359 if (verbose) 399 if (verbose)
360 printk(KERN_ERR "Killed process %d (%s)\n", 400 printk(KERN_ERR "Killed process %d (%s) "
361 task_pid_nr(p), p->comm); 401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
402 task_pid_nr(p), p->comm,
403 K(p->mm->total_vm),
404 K(get_mm_counter(p->mm, anon_rss)),
405 K(get_mm_counter(p->mm, file_rss)));
406 task_unlock(p);
362 407
363 /* 408 /*
364 * We give our sacrificial lamb high priority and access to 409 * We give our sacrificial lamb high priority and access to
@@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
395{ 440{
396 struct task_struct *c; 441 struct task_struct *c;
397 442
398 if (printk_ratelimit()) { 443 if (printk_ratelimit())
399 printk(KERN_WARNING "%s invoked oom-killer: " 444 dump_header(p, gfp_mask, order, mem);
400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
403 task_lock(current);
404 cpuset_print_task_mems_allowed(current);
405 task_unlock(current);
406 dump_stack();
407 mem_cgroup_print_oom_info(mem, current);
408 show_mem();
409 if (sysctl_oom_dump_tasks)
410 dump_tasks(mem);
411 }
412 445
413 /* 446 /*
414 * If the task is already exiting, don't alarm the sysadmin or kill 447 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -544,6 +577,7 @@ retry:
544 /* Found nothing?!?! Either we hang forever, or we panic. */ 577 /* Found nothing?!?! Either we hang forever, or we panic. */
545 if (!p) { 578 if (!p) {
546 read_unlock(&tasklist_lock); 579 read_unlock(&tasklist_lock);
580 dump_header(NULL, gfp_mask, order, NULL);
547 panic("Out of memory and no killable processes...\n"); 581 panic("Out of memory and no killable processes...\n");
548 } 582 }
549 583
@@ -599,7 +633,8 @@ rest_and_return:
599 * OR try to be smart about which process to kill. Note that we 633 * OR try to be smart about which process to kill. Note that we
600 * don't have to be perfect here, we just have to be good. 634 * don't have to be perfect here, we just have to be good.
601 */ 635 */
602void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 636void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
637 int order, nodemask_t *nodemask)
603{ 638{
604 unsigned long freed = 0; 639 unsigned long freed = 0;
605 enum oom_constraint constraint; 640 enum oom_constraint constraint;
@@ -609,14 +644,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
609 /* Got some memory back in the last second. */ 644 /* Got some memory back in the last second. */
610 return; 645 return;
611 646
612 if (sysctl_panic_on_oom == 2) 647 if (sysctl_panic_on_oom == 2) {
648 dump_header(NULL, gfp_mask, order, NULL);
613 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 649 panic("out of memory. Compulsory panic_on_oom is selected.\n");
650 }
614 651
615 /* 652 /*
616 * Check if there were limitations on the allocation (only relevant for 653 * Check if there were limitations on the allocation (only relevant for
617 * NUMA) that may require different handling. 654 * NUMA) that may require different handling.
618 */ 655 */
619 constraint = constrained_alloc(zonelist, gfp_mask); 656 constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
620 read_lock(&tasklist_lock); 657 read_lock(&tasklist_lock);
621 658
622 switch (constraint) { 659 switch (constraint) {
@@ -626,8 +663,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
626 break; 663 break;
627 664
628 case CONSTRAINT_NONE: 665 case CONSTRAINT_NONE:
629 if (sysctl_panic_on_oom) 666 if (sysctl_panic_on_oom) {
667 dump_header(NULL, gfp_mask, order, NULL);
630 panic("out of memory. panic_on_oom is selected\n"); 668 panic("out of memory. panic_on_oom is selected\n");
669 }
631 /* Fall-through */ 670 /* Fall-through */
632 case CONSTRAINT_CPUSET: 671 case CONSTRAINT_CPUSET:
633 __out_of_memory(gfp_mask, order); 672 __out_of_memory(gfp_mask, order);
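dump_header() above is steered by knobs that were already exposed in procfs: panic_on_oom, oom_dump_tasks and the per-task oom_adj. A small sketch, not from the patch, that reads them and nudges the current task's badness, assuming enough privilege (CAP_SYS_RESOURCE) to lower oom_adj:

/* oom_knobs.c - not from the patch: inspect the OOM-killer knobs used by the
 * code above and lower this process's badness via the old-style oom_adj file.
 */
#include <stdio.h>

static void show(const char *path)
{
    char buf[64];
    FILE *f = fopen(path, "r");

    if (f && fgets(buf, sizeof(buf), f))
        printf("%-28s = %s", path, buf);
    if (f)
        fclose(f);
}

int main(void)
{
    FILE *f;

    show("/proc/sys/vm/panic_on_oom");     /* 2 => panic, after dump_header() */
    show("/proc/sys/vm/oom_dump_tasks");   /* 1 => dump_tasks() in the header */
    show("/proc/self/oom_adj");

    /* A negative oom_adj lowers badness(); -17 (OOM_DISABLE) exempts us.
     * Lowering it requires CAP_SYS_RESOURCE. */
    f = fopen("/proc/self/oom_adj", "w");
    if (f) {
        fprintf(f, "-5\n");
        fclose(f);
    }
    show("/proc/self/oom_adj");
    return 0;
}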
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a3b14090b1fb..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -566,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping,
566 if (pages_written >= write_chunk) 566 if (pages_written >= write_chunk)
567 break; /* We've done our duty */ 567 break; /* We've done our duty */
568 568
569 schedule_timeout_interruptible(pause); 569 __set_current_state(TASK_INTERRUPTIBLE);
570 io_schedule_timeout(pause);
570 571
571 /* 572 /*
572 * Increase the delay for each loop, up to our previous 573 * Increase the delay for each loop, up to our previous
@@ -820,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
820 struct writeback_control *wbc, writepage_t writepage, 821 struct writeback_control *wbc, writepage_t writepage,
821 void *data) 822 void *data)
822{ 823{
823 struct backing_dev_info *bdi = mapping->backing_dev_info;
824 int ret = 0; 824 int ret = 0;
825 int done = 0; 825 int done = 0;
826 struct pagevec pvec; 826 struct pagevec pvec;
@@ -833,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
833 int range_whole = 0; 833 int range_whole = 0;
834 long nr_to_write = wbc->nr_to_write; 834 long nr_to_write = wbc->nr_to_write;
835 835
836 if (wbc->nonblocking && bdi_write_congested(bdi)) {
837 wbc->encountered_congestion = 1;
838 return 0;
839 }
840
841 pagevec_init(&pvec, 0); 836 pagevec_init(&pvec, 0);
842 if (wbc->range_cyclic) { 837 if (wbc->range_cyclic) {
843 writeback_index = mapping->writeback_index; /* prev offset */ 838 writeback_index = mapping->writeback_index; /* prev offset */
@@ -956,12 +951,6 @@ continue_unlock:
956 break; 951 break;
957 } 952 }
958 } 953 }
959
960 if (wbc->nonblocking && bdi_write_congested(bdi)) {
961 wbc->encountered_congestion = 1;
962 done = 1;
963 break;
964 }
965 } 954 }
966 pagevec_release(&pvec); 955 pagevec_release(&pvec);
967 cond_resched(); 956 cond_resched();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf720550b44d..4e9f5cc5fb59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h>
51#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
@@ -486,7 +487,6 @@ static inline void __free_one_page(struct page *page,
486 zone->free_area[order].nr_free++; 487 zone->free_area[order].nr_free++;
487} 488}
488 489
489#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
490/* 490/*
491 * free_page_mlock() -- clean up attempts to free and mlocked() page. 491 * free_page_mlock() -- clean up attempts to free and mlocked() page.
492 * Page should not be on lru, so no need to fix that up. 492 * Page should not be on lru, so no need to fix that up.
@@ -497,9 +497,6 @@ static inline void free_page_mlock(struct page *page)
497 __dec_zone_page_state(page, NR_MLOCK); 497 __dec_zone_page_state(page, NR_MLOCK);
498 __count_vm_event(UNEVICTABLE_MLOCKFREED); 498 __count_vm_event(UNEVICTABLE_MLOCKFREED);
499} 499}
500#else
501static void free_page_mlock(struct page *page) { }
502#endif
503 500
504static inline int free_pages_check(struct page *page) 501static inline int free_pages_check(struct page *page)
505{ 502{
@@ -1658,12 +1655,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1658 if (page) 1655 if (page)
1659 goto out; 1656 goto out;
1660 1657
1661 /* The OOM killer will not help higher order allocs */ 1658 if (!(gfp_mask & __GFP_NOFAIL)) {
1662 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) 1659 /* The OOM killer will not help higher order allocs */
1663 goto out; 1660 if (order > PAGE_ALLOC_COSTLY_ORDER)
1664 1661 goto out;
1662 /*
1663 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1664 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
1665 * The caller should handle page allocation failure by itself if
1666 * it specifies __GFP_THISNODE.
1667 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
1668 */
1669 if (gfp_mask & __GFP_THISNODE)
1670 goto out;
1671 }
1665 /* Exhausted what can be done so it's blamo time */ 1672 /* Exhausted what can be done so it's blamo time */
1666 out_of_memory(zonelist, gfp_mask, order); 1673 out_of_memory(zonelist, gfp_mask, order, nodemask);
1667 1674
1668out: 1675out:
1669 clear_zonelist_oom(zonelist, gfp_mask); 1676 clear_zonelist_oom(zonelist, gfp_mask);
@@ -1769,7 +1776,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1769 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1776 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1770 */ 1777 */
1771 alloc_flags &= ~ALLOC_CPUSET; 1778 alloc_flags &= ~ALLOC_CPUSET;
1772 } else if (unlikely(rt_task(p))) 1779 } else if (unlikely(rt_task(p)) && !in_interrupt())
1773 alloc_flags |= ALLOC_HARDER; 1780 alloc_flags |= ALLOC_HARDER;
1774 1781
1775 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1782 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1824,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1817 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1824 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1818 goto nopage; 1825 goto nopage;
1819 1826
1827restart:
1820 wake_all_kswapd(order, zonelist, high_zoneidx); 1828 wake_all_kswapd(order, zonelist, high_zoneidx);
1821 1829
1822restart:
1823 /* 1830 /*
1824 * OK, we're below the kswapd watermark and have kicked background 1831 * OK, we're below the kswapd watermark and have kicked background
1825 * reclaim. Now things get more complex, so set up alloc_flags according 1832 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -2183,7 +2190,7 @@ void show_free_areas(void)
2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2190 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2191 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2185 " unevictable:%lu" 2192 " unevictable:%lu"
2186 " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" 2193 " dirty:%lu writeback:%lu unstable:%lu\n"
2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2194 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2195 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2189 global_page_state(NR_ACTIVE_ANON), 2196 global_page_state(NR_ACTIVE_ANON),
@@ -2196,7 +2203,6 @@ void show_free_areas(void)
2196 global_page_state(NR_FILE_DIRTY), 2203 global_page_state(NR_FILE_DIRTY),
2197 global_page_state(NR_WRITEBACK), 2204 global_page_state(NR_WRITEBACK),
2198 global_page_state(NR_UNSTABLE_NFS), 2205 global_page_state(NR_UNSTABLE_NFS),
2199 nr_blockdev_pages(),
2200 global_page_state(NR_FREE_PAGES), 2206 global_page_state(NR_FREE_PAGES),
2201 global_page_state(NR_SLAB_RECLAIMABLE), 2207 global_page_state(NR_SLAB_RECLAIMABLE),
2202 global_page_state(NR_SLAB_UNRECLAIMABLE), 2208 global_page_state(NR_SLAB_UNRECLAIMABLE),
@@ -2396,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2396{ 2402{
2397 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2403 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2398 int ret; 2404 int ret;
2405 static DEFINE_MUTEX(zl_order_mutex);
2399 2406
2407 mutex_lock(&zl_order_mutex);
2400 if (write) 2408 if (write)
2401 strncpy(saved_string, (char*)table->data, 2409 strcpy(saved_string, (char*)table->data);
2402 NUMA_ZONELIST_ORDER_LEN);
2403 ret = proc_dostring(table, write, buffer, length, ppos); 2410 ret = proc_dostring(table, write, buffer, length, ppos);
2404 if (ret) 2411 if (ret)
2405 return ret; 2412 goto out;
2406 if (write) { 2413 if (write) {
2407 int oldval = user_zonelist_order; 2414 int oldval = user_zonelist_order;
2408 if (__parse_numa_zonelist_order((char*)table->data)) { 2415 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2415,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2415 } else if (oldval != user_zonelist_order) 2422 } else if (oldval != user_zonelist_order)
2416 build_all_zonelists(); 2423 build_all_zonelists();
2417 } 2424 }
2418 return 0; 2425out:
2426 mutex_unlock(&zl_order_mutex);
2427 return ret;
2419} 2428}
2420 2429
2421 2430
@@ -3128,7 +3137,7 @@ static int __cpuinit process_zones(int cpu)
3128 3137
3129 if (percpu_pagelist_fraction) 3138 if (percpu_pagelist_fraction)
3130 setup_pagelist_highmark(zone_pcp(zone, cpu), 3139 setup_pagelist_highmark(zone_pcp(zone, cpu),
3131 (zone->present_pages / percpu_pagelist_fraction)); 3140 (zone->present_pages / percpu_pagelist_fraction));
3132 } 3141 }
3133 3142
3134 return 0; 3143 return 0;
@@ -3574,7 +3583,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3574 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3583 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3575 * then all holes in the requested range will be accounted for. 3584 * then all holes in the requested range will be accounted for.
3576 */ 3585 */
3577static unsigned long __meminit __absent_pages_in_range(int nid, 3586unsigned long __meminit __absent_pages_in_range(int nid,
3578 unsigned long range_start_pfn, 3587 unsigned long range_start_pfn,
3579 unsigned long range_end_pfn) 3588 unsigned long range_end_pfn)
3580{ 3589{
@@ -4103,7 +4112,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
4103} 4112}
4104 4113
4105/* sort the node_map by start_pfn */ 4114/* sort the node_map by start_pfn */
4106static void __init sort_node_map(void) 4115void __init sort_node_map(void)
4107{ 4116{
4108 sort(early_node_map, (size_t)nr_nodemap_entries, 4117 sort(early_node_map, (size_t)nr_nodemap_entries,
4109 sizeof(struct node_active_region), 4118 sizeof(struct node_active_region),
@@ -5003,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5003int set_migratetype_isolate(struct page *page) 5012int set_migratetype_isolate(struct page *page)
5004{ 5013{
5005 struct zone *zone; 5014 struct zone *zone;
5006 unsigned long flags; 5015 struct page *curr_page;
5016 unsigned long flags, pfn, iter;
5017 unsigned long immobile = 0;
5018 struct memory_isolate_notify arg;
5019 int notifier_ret;
5007 int ret = -EBUSY; 5020 int ret = -EBUSY;
5008 int zone_idx; 5021 int zone_idx;
5009 5022
5010 zone = page_zone(page); 5023 zone = page_zone(page);
5011 zone_idx = zone_idx(zone); 5024 zone_idx = zone_idx(zone);
5025
5012 spin_lock_irqsave(&zone->lock, flags); 5026 spin_lock_irqsave(&zone->lock, flags);
5027 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5028 zone_idx == ZONE_MOVABLE) {
5029 ret = 0;
5030 goto out;
5031 }
5032
5033 pfn = page_to_pfn(page);
5034 arg.start_pfn = pfn;
5035 arg.nr_pages = pageblock_nr_pages;
5036 arg.pages_found = 0;
5037
5013 /* 5038 /*
5014 * In future, more migrate types will be able to be isolation target. 5039 * It may be possible to isolate a pageblock even if the
5040 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5041 * notifier chain is used by balloon drivers to return the
5042 * number of pages in a range that are held by the balloon
5043 * driver to shrink memory. If all the pages are accounted for
 5044 * by balloons, are free, or are on the LRU, isolation can continue.
 5045 * Later, for example, when the memory hotplug notifier runs, these
 5046 * pages reported as "can be isolated" should be isolated (freed)
5047 * by the balloon driver through the memory notifier chain.
5015 */ 5048 */
5016 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && 5049 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5017 zone_idx != ZONE_MOVABLE) 5050 notifier_ret = notifier_to_errno(notifier_ret);
5051 if (notifier_ret || !arg.pages_found)
5018 goto out; 5052 goto out;
5019 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5053
5020 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5054 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5021 ret = 0; 5055 if (!pfn_valid_within(iter))
5056 continue;
5057
5058 curr_page = pfn_to_page(iter);
5059 if (!page_count(curr_page) || PageLRU(curr_page))
5060 continue;
5061
5062 immobile++;
5063 }
5064
5065 if (arg.pages_found == immobile)
5066 ret = 0;
5067
5022out: 5068out:
5069 if (!ret) {
5070 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5071 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5072 }
5073
5023 spin_unlock_irqrestore(&zone->lock, flags); 5074 spin_unlock_irqrestore(&zone->lock, flags);
5024 if (!ret) 5075 if (!ret)
5025 drain_all_pages(); 5076 drain_all_pages();
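In miniature, the new acceptance test says: a pageblock that is not MIGRATE_MOVABLE may still be isolated if every page that is neither free nor on the LRU has been claimed by a balloon driver via the MEM_ISOLATE_COUNT notifier. A rough standalone model of that count-and-compare, with simplified stand-in types rather than struct page:

struct pb_page {
	int refcount;   /* 0 means the page is free */
	int on_lru;     /* nonzero means reclaim can migrate it */
};

/* Returns nonzero when isolation may proceed: every pinned page must be
 * one of the balloon_pages_found reported by the notifier chain. */
static int can_isolate_block(const struct pb_page *pages, unsigned long nr,
			     unsigned long balloon_pages_found)
{
	unsigned long immobile = 0, i;

	for (i = 0; i < nr; i++) {
		if (!pages[i].refcount || pages[i].on_lru)
			continue;               /* free or reclaimable */
		immobile++;                     /* pinned: must belong to a balloon */
	}
	return immobile == balloon_pages_found;
}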
@@ -5086,3 +5137,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5086 spin_unlock_irqrestore(&zone->lock, flags); 5137 spin_unlock_irqrestore(&zone->lock, flags);
5087} 5138}
5088#endif 5139#endif
5140
5141#ifdef CONFIG_MEMORY_FAILURE
5142bool is_free_buddy_page(struct page *page)
5143{
5144 struct zone *zone = page_zone(page);
5145 unsigned long pfn = page_to_pfn(page);
5146 unsigned long flags;
5147 int order;
5148
5149 spin_lock_irqsave(&zone->lock, flags);
5150 for (order = 0; order < MAX_ORDER; order++) {
5151 struct page *page_head = page - (pfn & ((1 << order) - 1));
5152
5153 if (PageBuddy(page_head) && page_order(page_head) >= order)
5154 break;
5155 }
5156 spin_unlock_irqrestore(&zone->lock, flags);
5157
5158 return order < MAX_ORDER;
5159}
5160#endif
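is_free_buddy_page() walks the possible buddy orders and, at each order, computes the head page of the aligned block containing the pfn, succeeding once that head is a PageBuddy of at least that order. The alignment arithmetic in isolation, as a small self-contained example:

#include <stdio.h>

/* Head pfn of the order-sized, order-aligned block containing pfn. */
static unsigned long buddy_head_pfn(unsigned long pfn, int order)
{
	return pfn & ~((1UL << order) - 1);
}

int main(void)
{
	/* pfn 0x12345 sits inside the order-3 (8-page) block starting at 0x12340 */
	printf("%#lx\n", buddy_head_pfn(0x12345UL, 3));
	return 0;
}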
diff --git a/mm/page_io.c b/mm/page_io.c
index c6f3e5071de3..a19af956ee1b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,20 +19,15 @@
19#include <linux/writeback.h> 19#include <linux/writeback.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, 22static struct bio *get_swap_bio(gfp_t gfp_flags,
23 struct page *page, bio_end_io_t end_io) 23 struct page *page, bio_end_io_t end_io)
24{ 24{
25 struct bio *bio; 25 struct bio *bio;
26 26
27 bio = bio_alloc(gfp_flags, 1); 27 bio = bio_alloc(gfp_flags, 1);
28 if (bio) { 28 if (bio) {
29 struct swap_info_struct *sis; 29 bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
30 swp_entry_t entry = { .val = index, }; 30 bio->bi_sector <<= PAGE_SHIFT - 9;
31
32 sis = get_swap_info_struct(swp_type(entry));
33 bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
34 (PAGE_SIZE >> 9);
35 bio->bi_bdev = sis->bdev;
36 bio->bi_io_vec[0].bv_page = page; 31 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 32 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0; 33 bio->bi_io_vec[0].bv_offset = 0;
@@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
102 unlock_page(page); 97 unlock_page(page);
103 goto out; 98 goto out;
104 } 99 }
105 bio = get_swap_bio(GFP_NOIO, page_private(page), page, 100 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
106 end_swap_bio_write);
107 if (bio == NULL) { 101 if (bio == NULL) {
108 set_page_dirty(page); 102 set_page_dirty(page);
109 unlock_page(page); 103 unlock_page(page);
@@ -127,8 +121,7 @@ int swap_readpage(struct page *page)
127 121
128 VM_BUG_ON(!PageLocked(page)); 122 VM_BUG_ON(!PageLocked(page));
129 VM_BUG_ON(PageUptodate(page)); 123 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 124 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
131 end_swap_bio_read);
132 if (bio == NULL) { 125 if (bio == NULL) {
133 unlock_page(page); 126 unlock_page(page);
134 ret = -ENOMEM; 127 ret = -ENOMEM;
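The shift in get_swap_bio() converts the page-granular swap slot returned by map_swap_page() into a 512-byte sector number. A one-line illustration; the 4 KiB page size (shift of 12) is an assumption for the example, not something this patch fixes:

#define EXAMPLE_PAGE_SHIFT 12   /* assume 4 KiB pages for the illustration */

static unsigned long long swap_slot_to_sector(unsigned long long slot)
{
	return slot << (EXAMPLE_PAGE_SHIFT - 9);        /* 8 sectors per page */
}
/* swap_slot_to_sector(3) == 24: the fourth page-sized slot starts at sector 24. */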
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d5878bed7841..7b47a57b6646 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,6 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/highmem.h> 2#include <linux/highmem.h>
3#include <linux/sched.h> 3#include <linux/sched.h>
4#include <linux/hugetlb.h>
4 5
5static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 6static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
6 struct mm_walk *walk) 7 struct mm_walk *walk)
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
107 pgd_t *pgd; 108 pgd_t *pgd;
108 unsigned long next; 109 unsigned long next;
109 int err = 0; 110 int err = 0;
111 struct vm_area_struct *vma;
110 112
111 if (addr >= end) 113 if (addr >= end)
112 return err; 114 return err;
@@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end,
117 pgd = pgd_offset(walk->mm, addr); 119 pgd = pgd_offset(walk->mm, addr);
118 do { 120 do {
119 next = pgd_addr_end(addr, end); 121 next = pgd_addr_end(addr, end);
122
123 /*
124 * handle hugetlb vma individually because pagetable walk for
125 * the hugetlb page is dependent on the architecture and
 126 * we can't handle it in the same manner as non-huge pages.
127 */
128 vma = find_vma(walk->mm, addr);
129#ifdef CONFIG_HUGETLB_PAGE
130 if (vma && is_vm_hugetlb_page(vma)) {
131 pte_t *pte;
132 struct hstate *hs;
133
134 if (vma->vm_end < next)
135 next = vma->vm_end;
136 hs = hstate_vma(vma);
137 pte = huge_pte_offset(walk->mm,
138 addr & huge_page_mask(hs));
139 if (pte && !huge_pte_none(huge_ptep_get(pte))
140 && walk->hugetlb_entry)
141 err = walk->hugetlb_entry(pte, addr,
142 next, walk);
143 if (err)
144 break;
145 continue;
146 }
147#endif
120 if (pgd_none_or_clear_bad(pgd)) { 148 if (pgd_none_or_clear_bad(pgd)) {
121 if (walk->pte_hole) 149 if (walk->pte_hole)
122 err = walk->pte_hole(addr, next, walk); 150 err = walk->pte_hole(addr, next, walk);
123 if (err) 151 if (err)
124 break; 152 break;
153 pgd++;
125 continue; 154 continue;
126 } 155 }
127 if (walk->pgd_entry) 156 if (walk->pgd_entry)
@@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end,
131 err = walk_pud_range(pgd, addr, next, walk); 160 err = walk_pud_range(pgd, addr, next, walk);
132 if (err) 161 if (err)
133 break; 162 break;
134 } while (pgd++, addr = next, addr != end); 163 pgd++;
164 } while (addr = next, addr != end);
135 165
136 return err; 166 return err;
137} 167}
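The hunk above special-cases hugetlb vmas: walk_page_range() clamps the range to the vma and hands one huge pte per iteration to the new hugetlb_entry callback instead of descending pgd/pud/pmd. A simplified, self-contained model of that dispatch; the types and callbacks are stand-ins, not the kernel's mm_walk interface:

struct vma_model {
	unsigned long vm_start, vm_end;
	int is_hugetlb;
};

typedef int (*range_cb)(unsigned long addr, unsigned long end);

static int walk_one_range(const struct vma_model *vma, unsigned long addr,
			  unsigned long end, range_cb on_hugetlb,
			  range_cb on_normal)
{
	if (vma && vma->is_hugetlb) {
		if (vma->vm_end < end)
			end = vma->vm_end;      /* never walk past the hugetlb vma */
		return on_hugetlb(addr, end);   /* arch-specific huge pte lookup */
	}
	return on_normal(addr, end);            /* usual pgd/pud/pmd/pte descent */
}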
diff --git a/mm/percpu.c b/mm/percpu.c
index 77c6f7994a46..626e43c99498 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -72,6 +72,7 @@
72#include <asm/cacheflush.h> 72#include <asm/cacheflush.h>
73#include <asm/sections.h> 73#include <asm/sections.h>
74#include <asm/tlbflush.h> 74#include <asm/tlbflush.h>
75#include <asm/io.h>
75 76
76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
@@ -151,7 +152,10 @@ static int pcpu_reserved_chunk_limit;
151 * 152 *
152 * During allocation, pcpu_alloc_mutex is kept locked all the time and 153 * During allocation, pcpu_alloc_mutex is kept locked all the time and
153 * pcpu_lock is grabbed and released as necessary. All actual memory 154 * pcpu_lock is grabbed and released as necessary. All actual memory
154 * allocations are done using GFP_KERNEL with pcpu_lock released. 155 * allocations are done using GFP_KERNEL with pcpu_lock released. In
 156 * general, percpu memory can't be allocated with irqs off, but
 157 * irqsave/restore are still used in the alloc path so that it can be
 158 * used from the early init path - sched_init() specifically.
155 * 159 *
156 * Free path accesses and alters only the index data structures, so it 160 * Free path accesses and alters only the index data structures, so it
157 * can be safely called from atomic context. When memory needs to be 161 * can be safely called from atomic context. When memory needs to be
@@ -350,63 +354,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
350} 354}
351 355
352/** 356/**
353 * pcpu_extend_area_map - extend area map for allocation 357 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
354 * @chunk: target chunk 358 * @chunk: chunk of interest
355 * 359 *
356 * Extend area map of @chunk so that it can accommodate an allocation. 360 * Determine whether area map of @chunk needs to be extended to
357 * A single allocation can split an area into three areas, so this 361 * accommodate a new allocation.
358 * function makes sure that @chunk->map has at least two extra slots.
359 * 362 *
360 * CONTEXT: 363 * CONTEXT:
361 * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired 364 * pcpu_lock.
362 * if area map is extended.
363 * 365 *
364 * RETURNS: 366 * RETURNS:
365 * 0 if noop, 1 if successfully extended, -errno on failure. 367 * New target map allocation length if extension is necessary, 0
368 * otherwise.
366 */ 369 */
367static int pcpu_extend_area_map(struct pcpu_chunk *chunk) 370static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
368 __releases(lock) __acquires(lock)
369{ 371{
370 int new_alloc; 372 int new_alloc;
371 int *new;
372 size_t size;
373 373
374 /* has enough? */
375 if (chunk->map_alloc >= chunk->map_used + 2) 374 if (chunk->map_alloc >= chunk->map_used + 2)
376 return 0; 375 return 0;
377 376
378 spin_unlock_irq(&pcpu_lock);
379
380 new_alloc = PCPU_DFL_MAP_ALLOC; 377 new_alloc = PCPU_DFL_MAP_ALLOC;
381 while (new_alloc < chunk->map_used + 2) 378 while (new_alloc < chunk->map_used + 2)
382 new_alloc *= 2; 379 new_alloc *= 2;
383 380
384 new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); 381 return new_alloc;
385 if (!new) { 382}
386 spin_lock_irq(&pcpu_lock); 383
384/**
385 * pcpu_extend_area_map - extend area map of a chunk
386 * @chunk: chunk of interest
387 * @new_alloc: new target allocation length of the area map
388 *
389 * Extend area map of @chunk to have @new_alloc entries.
390 *
391 * CONTEXT:
392 * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
393 *
394 * RETURNS:
395 * 0 on success, -errno on failure.
396 */
397static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
398{
399 int *old = NULL, *new = NULL;
400 size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
401 unsigned long flags;
402
403 new = pcpu_mem_alloc(new_size);
404 if (!new)
387 return -ENOMEM; 405 return -ENOMEM;
388 }
389 406
390 /* 407 /* acquire pcpu_lock and switch to new area map */
391 * Acquire pcpu_lock and switch to new area map. Only free 408 spin_lock_irqsave(&pcpu_lock, flags);
392 * could have happened inbetween, so map_used couldn't have
393 * grown.
394 */
395 spin_lock_irq(&pcpu_lock);
396 BUG_ON(new_alloc < chunk->map_used + 2);
397 409
398 size = chunk->map_alloc * sizeof(chunk->map[0]); 410 if (new_alloc <= chunk->map_alloc)
399 memcpy(new, chunk->map, size); 411 goto out_unlock;
412
413 old_size = chunk->map_alloc * sizeof(chunk->map[0]);
414 memcpy(new, chunk->map, old_size);
400 415
401 /* 416 /*
402 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is 417 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
403 * one of the first chunks and still using static map. 418 * one of the first chunks and still using static map.
404 */ 419 */
405 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) 420 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
406 pcpu_mem_free(chunk->map, size); 421 old = chunk->map;
407 422
408 chunk->map_alloc = new_alloc; 423 chunk->map_alloc = new_alloc;
409 chunk->map = new; 424 chunk->map = new;
425 new = NULL;
426
427out_unlock:
428 spin_unlock_irqrestore(&pcpu_lock, flags);
429
430 /*
431 * pcpu_mem_free() might end up calling vfree() which uses
432 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
433 */
434 pcpu_mem_free(old, old_size);
435 pcpu_mem_free(new, new_size);
436
410 return 0; 437 return 0;
411} 438}
412 439
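The reworked extension path follows a common pattern: allocate the larger buffer with the lock dropped (so GFP_KERNEL is legal), retake the lock, recheck whether the extension is still needed, and free whichever buffer lost the race outside the lock, since vfree()/kfree() must not run under pcpu_lock. A condensed sketch of that pattern with made-up names, not the percpu code itself:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct grow_table {
	spinlock_t lock;
	int *map;
	int alloc;              /* current capacity, in entries */
};

static int grow_table_to(struct grow_table *t, int needed)
{
	int *newbuf, *oldbuf = NULL;

	newbuf = kcalloc(needed, sizeof(*newbuf), GFP_KERNEL);  /* lock not held */
	if (!newbuf)
		return -ENOMEM;

	spin_lock(&t->lock);
	if (t->alloc < needed) {                /* recheck: did someone beat us? */
		if (t->alloc)
			memcpy(newbuf, t->map, t->alloc * sizeof(*newbuf));
		oldbuf = t->map;
		t->map = newbuf;
		t->alloc = needed;
		newbuf = NULL;                  /* now owned by the table */
	}
	spin_unlock(&t->lock);

	kfree(oldbuf);                          /* never free under the spinlock */
	kfree(newbuf);                          /* lost the race: discard our copy */
	return 0;
}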
@@ -1043,7 +1070,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
1043 static int warn_limit = 10; 1070 static int warn_limit = 10;
1044 struct pcpu_chunk *chunk; 1071 struct pcpu_chunk *chunk;
1045 const char *err; 1072 const char *err;
1046 int slot, off; 1073 int slot, off, new_alloc;
1074 unsigned long flags;
1047 1075
1048 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { 1076 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
1049 WARN(true, "illegal size (%zu) or align (%zu) for " 1077 WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1052,19 +1080,30 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
1052 } 1080 }
1053 1081
1054 mutex_lock(&pcpu_alloc_mutex); 1082 mutex_lock(&pcpu_alloc_mutex);
1055 spin_lock_irq(&pcpu_lock); 1083 spin_lock_irqsave(&pcpu_lock, flags);
1056 1084
1057 /* serve reserved allocations from the reserved chunk if available */ 1085 /* serve reserved allocations from the reserved chunk if available */
1058 if (reserved && pcpu_reserved_chunk) { 1086 if (reserved && pcpu_reserved_chunk) {
1059 chunk = pcpu_reserved_chunk; 1087 chunk = pcpu_reserved_chunk;
1060 if (size > chunk->contig_hint || 1088
1061 pcpu_extend_area_map(chunk) < 0) { 1089 if (size > chunk->contig_hint) {
1062 err = "failed to extend area map of reserved chunk"; 1090 err = "alloc from reserved chunk failed";
1063 goto fail_unlock; 1091 goto fail_unlock;
1064 } 1092 }
1093
1094 while ((new_alloc = pcpu_need_to_extend(chunk))) {
1095 spin_unlock_irqrestore(&pcpu_lock, flags);
1096 if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
1097 err = "failed to extend area map of reserved chunk";
1098 goto fail_unlock_mutex;
1099 }
1100 spin_lock_irqsave(&pcpu_lock, flags);
1101 }
1102
1065 off = pcpu_alloc_area(chunk, size, align); 1103 off = pcpu_alloc_area(chunk, size, align);
1066 if (off >= 0) 1104 if (off >= 0)
1067 goto area_found; 1105 goto area_found;
1106
1068 err = "alloc from reserved chunk failed"; 1107 err = "alloc from reserved chunk failed";
1069 goto fail_unlock; 1108 goto fail_unlock;
1070 } 1109 }
@@ -1076,14 +1115,20 @@ restart:
1076 if (size > chunk->contig_hint) 1115 if (size > chunk->contig_hint)
1077 continue; 1116 continue;
1078 1117
1079 switch (pcpu_extend_area_map(chunk)) { 1118 new_alloc = pcpu_need_to_extend(chunk);
1080 case 0: 1119 if (new_alloc) {
1081 break; 1120 spin_unlock_irqrestore(&pcpu_lock, flags);
1082 case 1: 1121 if (pcpu_extend_area_map(chunk,
1083 goto restart; /* pcpu_lock dropped, restart */ 1122 new_alloc) < 0) {
1084 default: 1123 err = "failed to extend area map";
1085 err = "failed to extend area map"; 1124 goto fail_unlock_mutex;
1086 goto fail_unlock; 1125 }
1126 spin_lock_irqsave(&pcpu_lock, flags);
1127 /*
1128 * pcpu_lock has been dropped, need to
1129 * restart cpu_slot list walking.
1130 */
1131 goto restart;
1087 } 1132 }
1088 1133
1089 off = pcpu_alloc_area(chunk, size, align); 1134 off = pcpu_alloc_area(chunk, size, align);
@@ -1093,7 +1138,7 @@ restart:
1093 } 1138 }
1094 1139
1095 /* hmmm... no space left, create a new chunk */ 1140 /* hmmm... no space left, create a new chunk */
1096 spin_unlock_irq(&pcpu_lock); 1141 spin_unlock_irqrestore(&pcpu_lock, flags);
1097 1142
1098 chunk = alloc_pcpu_chunk(); 1143 chunk = alloc_pcpu_chunk();
1099 if (!chunk) { 1144 if (!chunk) {
@@ -1101,16 +1146,16 @@ restart:
1101 goto fail_unlock_mutex; 1146 goto fail_unlock_mutex;
1102 } 1147 }
1103 1148
1104 spin_lock_irq(&pcpu_lock); 1149 spin_lock_irqsave(&pcpu_lock, flags);
1105 pcpu_chunk_relocate(chunk, -1); 1150 pcpu_chunk_relocate(chunk, -1);
1106 goto restart; 1151 goto restart;
1107 1152
1108area_found: 1153area_found:
1109 spin_unlock_irq(&pcpu_lock); 1154 spin_unlock_irqrestore(&pcpu_lock, flags);
1110 1155
1111 /* populate, map and clear the area */ 1156 /* populate, map and clear the area */
1112 if (pcpu_populate_chunk(chunk, off, size)) { 1157 if (pcpu_populate_chunk(chunk, off, size)) {
1113 spin_lock_irq(&pcpu_lock); 1158 spin_lock_irqsave(&pcpu_lock, flags);
1114 pcpu_free_area(chunk, off); 1159 pcpu_free_area(chunk, off);
1115 err = "failed to populate"; 1160 err = "failed to populate";
1116 goto fail_unlock; 1161 goto fail_unlock;
@@ -1122,7 +1167,7 @@ area_found:
1122 return __addr_to_pcpu_ptr(chunk->base_addr + off); 1167 return __addr_to_pcpu_ptr(chunk->base_addr + off);
1123 1168
1124fail_unlock: 1169fail_unlock:
1125 spin_unlock_irq(&pcpu_lock); 1170 spin_unlock_irqrestore(&pcpu_lock, flags);
1126fail_unlock_mutex: 1171fail_unlock_mutex:
1127 mutex_unlock(&pcpu_alloc_mutex); 1172 mutex_unlock(&pcpu_alloc_mutex);
1128 if (warn_limit) { 1173 if (warn_limit) {
@@ -1254,6 +1299,27 @@ void free_percpu(void *ptr)
1254} 1299}
1255EXPORT_SYMBOL_GPL(free_percpu); 1300EXPORT_SYMBOL_GPL(free_percpu);
1256 1301
1302/**
1303 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
1304 * @addr: the address to be converted to physical address
1305 *
1306 * Given @addr which is dereferenceable address obtained via one of
1307 * percpu access macros, this function translates it into its physical
1308 * address. The caller is responsible for ensuring @addr stays valid
1309 * until this function finishes.
1310 *
1311 * RETURNS:
1312 * The physical address for @addr.
1313 */
1314phys_addr_t per_cpu_ptr_to_phys(void *addr)
1315{
1316 if ((unsigned long)addr < VMALLOC_START ||
1317 (unsigned long)addr >= VMALLOC_END)
1318 return __pa(addr);
1319 else
1320 return page_to_phys(vmalloc_to_page(addr));
1321}
1322
1257static inline size_t pcpu_calc_fc_sizes(size_t static_size, 1323static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1258 size_t reserved_size, 1324 size_t reserved_size,
1259 ssize_t *dyn_sizep) 1325 ssize_t *dyn_sizep)
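per_cpu_ptr_to_phys() distinguishes addresses in the linear map, which __pa() can translate directly, from vmalloc-backed chunks, which need a page-table lookup via vmalloc_to_page(). A hypothetical usage sketch; the allocation and the choice of CPU 0 are made up for illustration:

#include <linux/percpu.h>

/* Translate CPU 0's slot of a dynamic percpu allocation to a physical
 * address, e.g. to hand it to hardware.  Purely illustrative. */
static phys_addr_t example_percpu_phys(void)
{
	u32 *counter = alloc_percpu(u32);
	phys_addr_t phys;

	if (!counter)
		return 0;
	phys = per_cpu_ptr_to_phys(per_cpu_ptr(counter, 0));
	free_percpu(counter);
	return phys;
}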
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..033bc135a41f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping,
547 547
548 /* do read-ahead */ 548 /* do read-ahead */
549 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 549 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
550
551#ifdef CONFIG_BLOCK
552 /*
553 * Normally the current page is !uptodate and lock_page() will be
554 * immediately called to implicitly unplug the device. However this
 555 * is not always true for RAID configurations, where data arrives
 556 * not strictly in submission order. In this case we need to
557 * explicitly kick off the IO.
558 */
559 if (PageUptodate(page))
560 blk_run_backing_dev(mapping->backing_dev_info, NULL);
561#endif
550} 562}
551EXPORT_SYMBOL_GPL(page_cache_async_readahead); 563EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index dd43373a483f..278cd277bdec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/swapops.h> 49#include <linux/swapops.h>
50#include <linux/slab.h> 50#include <linux/slab.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/ksm.h>
52#include <linux/rmap.h> 53#include <linux/rmap.h>
53#include <linux/rcupdate.h> 54#include <linux/rcupdate.h>
54#include <linux/module.h> 55#include <linux/module.h>
@@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
67 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 68 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
68} 69}
69 70
70static inline void anon_vma_free(struct anon_vma *anon_vma) 71void anon_vma_free(struct anon_vma *anon_vma)
71{ 72{
72 kmem_cache_free(anon_vma_cachep, anon_vma); 73 kmem_cache_free(anon_vma_cachep, anon_vma);
73} 74}
@@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
171 list_del(&vma->anon_vma_node); 172 list_del(&vma->anon_vma_node);
172 173
173 /* We must garbage collect the anon_vma if it's empty */ 174 /* We must garbage collect the anon_vma if it's empty */
174 empty = list_empty(&anon_vma->head); 175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
175 spin_unlock(&anon_vma->lock); 176 spin_unlock(&anon_vma->lock);
176 177
177 if (empty) 178 if (empty)
@@ -183,6 +184,7 @@ static void anon_vma_ctor(void *data)
183 struct anon_vma *anon_vma = data; 184 struct anon_vma *anon_vma = data;
184 185
185 spin_lock_init(&anon_vma->lock); 186 spin_lock_init(&anon_vma->lock);
187 ksm_refcount_init(anon_vma);
186 INIT_LIST_HEAD(&anon_vma->head); 188 INIT_LIST_HEAD(&anon_vma->head);
187} 189}
188 190
@@ -202,8 +204,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
202 unsigned long anon_mapping; 204 unsigned long anon_mapping;
203 205
204 rcu_read_lock(); 206 rcu_read_lock();
205 anon_mapping = (unsigned long) page->mapping; 207 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
206 if (!(anon_mapping & PAGE_MAPPING_ANON)) 208 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
207 goto out; 209 goto out;
208 if (!page_mapped(page)) 210 if (!page_mapped(page))
209 goto out; 211 goto out;
@@ -248,8 +250,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 250unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
249{ 251{
250 if (PageAnon(page)) { 252 if (PageAnon(page)) {
251 if ((void *)vma->anon_vma != 253 if (vma->anon_vma != page_anon_vma(page))
252 (void *)page->mapping - PAGE_MAPPING_ANON)
253 return -EFAULT; 254 return -EFAULT;
254 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 255 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
255 if (!vma->vm_file || 256 if (!vma->vm_file ||
@@ -337,21 +338,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
337 * Subfunctions of page_referenced: page_referenced_one called 338 * Subfunctions of page_referenced: page_referenced_one called
338 * repeatedly from either page_referenced_anon or page_referenced_file. 339 * repeatedly from either page_referenced_anon or page_referenced_file.
339 */ 340 */
340static int page_referenced_one(struct page *page, 341int page_referenced_one(struct page *page, struct vm_area_struct *vma,
341 struct vm_area_struct *vma, 342 unsigned long address, unsigned int *mapcount,
342 unsigned int *mapcount, 343 unsigned long *vm_flags)
343 unsigned long *vm_flags)
344{ 344{
345 struct mm_struct *mm = vma->vm_mm; 345 struct mm_struct *mm = vma->vm_mm;
346 unsigned long address;
347 pte_t *pte; 346 pte_t *pte;
348 spinlock_t *ptl; 347 spinlock_t *ptl;
349 int referenced = 0; 348 int referenced = 0;
350 349
351 address = vma_address(page, vma);
352 if (address == -EFAULT)
353 goto out;
354
355 pte = page_check_address(page, mm, address, &ptl, 0); 350 pte = page_check_address(page, mm, address, &ptl, 0);
356 if (!pte) 351 if (!pte)
357 goto out; 352 goto out;
@@ -388,9 +383,10 @@ static int page_referenced_one(struct page *page,
388out_unmap: 383out_unmap:
389 (*mapcount)--; 384 (*mapcount)--;
390 pte_unmap_unlock(pte, ptl); 385 pte_unmap_unlock(pte, ptl);
391out: 386
392 if (referenced) 387 if (referenced)
393 *vm_flags |= vma->vm_flags; 388 *vm_flags |= vma->vm_flags;
389out:
394 return referenced; 390 return referenced;
395} 391}
396 392
@@ -409,6 +405,9 @@ static int page_referenced_anon(struct page *page,
409 405
410 mapcount = page_mapcount(page); 406 mapcount = page_mapcount(page);
411 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
408 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT)
410 continue;
412 /* 411 /*
413 * If we are reclaiming on behalf of a cgroup, skip 412 * If we are reclaiming on behalf of a cgroup, skip
414 * counting on behalf of references from different 413 * counting on behalf of references from different
@@ -416,7 +415,7 @@ static int page_referenced_anon(struct page *page,
416 */ 415 */
417 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 416 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
418 continue; 417 continue;
419 referenced += page_referenced_one(page, vma, 418 referenced += page_referenced_one(page, vma, address,
420 &mapcount, vm_flags); 419 &mapcount, vm_flags);
421 if (!mapcount) 420 if (!mapcount)
422 break; 421 break;
@@ -474,6 +473,9 @@ static int page_referenced_file(struct page *page,
474 mapcount = page_mapcount(page); 473 mapcount = page_mapcount(page);
475 474
476 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 475 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
476 unsigned long address = vma_address(page, vma);
477 if (address == -EFAULT)
478 continue;
477 /* 479 /*
478 * If we are reclaiming on behalf of a cgroup, skip 480 * If we are reclaiming on behalf of a cgroup, skip
479 * counting on behalf of references from different 481 * counting on behalf of references from different
@@ -481,7 +483,7 @@ static int page_referenced_file(struct page *page,
481 */ 483 */
482 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 484 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
483 continue; 485 continue;
484 referenced += page_referenced_one(page, vma, 486 referenced += page_referenced_one(page, vma, address,
485 &mapcount, vm_flags); 487 &mapcount, vm_flags);
486 if (!mapcount) 488 if (!mapcount)
487 break; 489 break;
@@ -507,46 +509,47 @@ int page_referenced(struct page *page,
507 unsigned long *vm_flags) 509 unsigned long *vm_flags)
508{ 510{
509 int referenced = 0; 511 int referenced = 0;
512 int we_locked = 0;
510 513
511 if (TestClearPageReferenced(page)) 514 if (TestClearPageReferenced(page))
512 referenced++; 515 referenced++;
513 516
514 *vm_flags = 0; 517 *vm_flags = 0;
515 if (page_mapped(page) && page->mapping) { 518 if (page_mapped(page) && page_rmapping(page)) {
516 if (PageAnon(page)) 519 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
520 we_locked = trylock_page(page);
521 if (!we_locked) {
522 referenced++;
523 goto out;
524 }
525 }
526 if (unlikely(PageKsm(page)))
527 referenced += page_referenced_ksm(page, mem_cont,
528 vm_flags);
529 else if (PageAnon(page))
517 referenced += page_referenced_anon(page, mem_cont, 530 referenced += page_referenced_anon(page, mem_cont,
518 vm_flags); 531 vm_flags);
519 else if (is_locked) 532 else if (page->mapping)
520 referenced += page_referenced_file(page, mem_cont, 533 referenced += page_referenced_file(page, mem_cont,
521 vm_flags); 534 vm_flags);
522 else if (!trylock_page(page)) 535 if (we_locked)
523 referenced++;
524 else {
525 if (page->mapping)
526 referenced += page_referenced_file(page,
527 mem_cont, vm_flags);
528 unlock_page(page); 536 unlock_page(page);
529 }
530 } 537 }
531 538out:
532 if (page_test_and_clear_young(page)) 539 if (page_test_and_clear_young(page))
533 referenced++; 540 referenced++;
534 541
535 return referenced; 542 return referenced;
536} 543}
537 544
538static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) 545static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
546 unsigned long address)
539{ 547{
540 struct mm_struct *mm = vma->vm_mm; 548 struct mm_struct *mm = vma->vm_mm;
541 unsigned long address;
542 pte_t *pte; 549 pte_t *pte;
543 spinlock_t *ptl; 550 spinlock_t *ptl;
544 int ret = 0; 551 int ret = 0;
545 552
546 address = vma_address(page, vma);
547 if (address == -EFAULT)
548 goto out;
549
550 pte = page_check_address(page, mm, address, &ptl, 1); 553 pte = page_check_address(page, mm, address, &ptl, 1);
551 if (!pte) 554 if (!pte)
552 goto out; 555 goto out;
@@ -578,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
578 581
579 spin_lock(&mapping->i_mmap_lock); 582 spin_lock(&mapping->i_mmap_lock);
580 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 583 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
581 if (vma->vm_flags & VM_SHARED) 584 if (vma->vm_flags & VM_SHARED) {
582 ret += page_mkclean_one(page, vma); 585 unsigned long address = vma_address(page, vma);
586 if (address == -EFAULT)
587 continue;
588 ret += page_mkclean_one(page, vma, address);
589 }
583 } 590 }
584 spin_unlock(&mapping->i_mmap_lock); 591 spin_unlock(&mapping->i_mmap_lock);
585 return ret; 592 return ret;
@@ -620,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page,
620 BUG_ON(!anon_vma); 627 BUG_ON(!anon_vma);
621 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 628 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
622 page->mapping = (struct address_space *) anon_vma; 629 page->mapping = (struct address_space *) anon_vma;
623
624 page->index = linear_page_index(vma, address); 630 page->index = linear_page_index(vma, address);
625
626 /*
627 * nr_mapped state can be updated without turning off
628 * interrupts because it is not modified via interrupt.
629 */
630 __inc_zone_page_state(page, NR_ANON_PAGES);
631} 631}
632 632
633/** 633/**
@@ -665,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page,
665 * @vma: the vm area in which the mapping is added 665 * @vma: the vm area in which the mapping is added
666 * @address: the user virtual address mapped 666 * @address: the user virtual address mapped
667 * 667 *
668 * The caller needs to hold the pte lock and the page must be locked. 668 * The caller needs to hold the pte lock, and the page must be locked in
669 * the anon_vma case: to serialize mapping,index checking after setting,
670 * and to ensure that PageAnon is not being upgraded racily to PageKsm
671 * (but PageKsm is never downgraded to PageAnon).
669 */ 672 */
670void page_add_anon_rmap(struct page *page, 673void page_add_anon_rmap(struct page *page,
671 struct vm_area_struct *vma, unsigned long address) 674 struct vm_area_struct *vma, unsigned long address)
672{ 675{
676 int first = atomic_inc_and_test(&page->_mapcount);
677 if (first)
678 __inc_zone_page_state(page, NR_ANON_PAGES);
679 if (unlikely(PageKsm(page)))
680 return;
681
673 VM_BUG_ON(!PageLocked(page)); 682 VM_BUG_ON(!PageLocked(page));
674 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 683 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
675 if (atomic_inc_and_test(&page->_mapcount)) 684 if (first)
676 __page_set_anon_rmap(page, vma, address); 685 __page_set_anon_rmap(page, vma, address);
677 else 686 else
678 __page_check_anon_rmap(page, vma, address); 687 __page_check_anon_rmap(page, vma, address);
@@ -694,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page,
694 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 703 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
695 SetPageSwapBacked(page); 704 SetPageSwapBacked(page);
696 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 705 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
706 __inc_zone_page_state(page, NR_ANON_PAGES);
697 __page_set_anon_rmap(page, vma, address); 707 __page_set_anon_rmap(page, vma, address);
698 if (page_evictable(page, vma)) 708 if (page_evictable(page, vma))
699 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 709 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -711,7 +721,7 @@ void page_add_file_rmap(struct page *page)
711{ 721{
712 if (atomic_inc_and_test(&page->_mapcount)) { 722 if (atomic_inc_and_test(&page->_mapcount)) {
713 __inc_zone_page_state(page, NR_FILE_MAPPED); 723 __inc_zone_page_state(page, NR_FILE_MAPPED);
714 mem_cgroup_update_mapped_file_stat(page, 1); 724 mem_cgroup_update_file_mapped(page, 1);
715 } 725 }
716} 726}
717 727
@@ -743,8 +753,8 @@ void page_remove_rmap(struct page *page)
743 __dec_zone_page_state(page, NR_ANON_PAGES); 753 __dec_zone_page_state(page, NR_ANON_PAGES);
744 } else { 754 } else {
745 __dec_zone_page_state(page, NR_FILE_MAPPED); 755 __dec_zone_page_state(page, NR_FILE_MAPPED);
756 mem_cgroup_update_file_mapped(page, -1);
746 } 757 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /* 758 /*
749 * It would be tidy to reset the PageAnon mapping here, 759 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap 760 * but that might overwrite a racing page_add_anon_rmap
@@ -760,20 +770,15 @@ void page_remove_rmap(struct page *page)
760 * Subfunctions of try_to_unmap: try_to_unmap_one called 770 * Subfunctions of try_to_unmap: try_to_unmap_one called
761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 771 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
762 */ 772 */
763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 773int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
764 enum ttu_flags flags) 774 unsigned long address, enum ttu_flags flags)
765{ 775{
766 struct mm_struct *mm = vma->vm_mm; 776 struct mm_struct *mm = vma->vm_mm;
767 unsigned long address;
768 pte_t *pte; 777 pte_t *pte;
769 pte_t pteval; 778 pte_t pteval;
770 spinlock_t *ptl; 779 spinlock_t *ptl;
771 int ret = SWAP_AGAIN; 780 int ret = SWAP_AGAIN;
772 781
773 address = vma_address(page, vma);
774 if (address == -EFAULT)
775 goto out;
776
777 pte = page_check_address(page, mm, address, &ptl, 0); 782 pte = page_check_address(page, mm, address, &ptl, 0);
778 if (!pte) 783 if (!pte)
779 goto out; 784 goto out;
@@ -784,10 +789,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
784 * skipped over this mm) then we should reactivate it. 789 * skipped over this mm) then we should reactivate it.
785 */ 790 */
786 if (!(flags & TTU_IGNORE_MLOCK)) { 791 if (!(flags & TTU_IGNORE_MLOCK)) {
787 if (vma->vm_flags & VM_LOCKED) { 792 if (vma->vm_flags & VM_LOCKED)
788 ret = SWAP_MLOCK; 793 goto out_mlock;
794
795 if (TTU_ACTION(flags) == TTU_MUNLOCK)
789 goto out_unmap; 796 goto out_unmap;
790 }
791 } 797 }
792 if (!(flags & TTU_IGNORE_ACCESS)) { 798 if (!(flags & TTU_IGNORE_ACCESS)) {
793 if (ptep_clear_flush_young_notify(vma, address, pte)) { 799 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -822,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822 * Store the swap location in the pte. 828 * Store the swap location in the pte.
823 * See handle_pte_fault() ... 829 * See handle_pte_fault() ...
824 */ 830 */
825 swap_duplicate(entry); 831 if (swap_duplicate(entry) < 0) {
832 set_pte_at(mm, address, pte, pteval);
833 ret = SWAP_FAIL;
834 goto out_unmap;
835 }
826 if (list_empty(&mm->mmlist)) { 836 if (list_empty(&mm->mmlist)) {
827 spin_lock(&mmlist_lock); 837 spin_lock(&mmlist_lock);
828 if (list_empty(&mm->mmlist)) 838 if (list_empty(&mm->mmlist))
@@ -849,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
849 } else 859 } else
850 dec_mm_counter(mm, file_rss); 860 dec_mm_counter(mm, file_rss);
851 861
852
853 page_remove_rmap(page); 862 page_remove_rmap(page);
854 page_cache_release(page); 863 page_cache_release(page);
855 864
@@ -857,6 +866,27 @@ out_unmap:
857 pte_unmap_unlock(pte, ptl); 866 pte_unmap_unlock(pte, ptl);
858out: 867out:
859 return ret; 868 return ret;
869
870out_mlock:
871 pte_unmap_unlock(pte, ptl);
872
873
874 /*
 875 * We need mmap_sem locking; otherwise the VM_LOCKED check is racy
 876 * and gives an unstable result. Also, we can't wait here because
 877 * we now hold anon_vma->lock or mapping->i_mmap_lock.
 878 * If the trylock fails, the page remains on the evictable lru and
 879 * vmscan can later retry moving it to the unevictable lru if the
 880 * page is actually mlocked.
881 */
882 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
883 if (vma->vm_flags & VM_LOCKED) {
884 mlock_vma_page(page);
885 ret = SWAP_MLOCK;
886 }
887 up_read(&vma->vm_mm->mmap_sem);
888 }
889 return ret;
860} 890}
861 891
862/* 892/*
@@ -922,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
922 return ret; 952 return ret;
923 953
924 /* 954 /*
925 * MLOCK_PAGES => feature is configured. 955 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
926 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
927 * keep the sem while scanning the cluster for mlocking pages. 956 * keep the sem while scanning the cluster for mlocking pages.
928 */ 957 */
929 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { 958 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
930 locked_vma = (vma->vm_flags & VM_LOCKED); 959 locked_vma = (vma->vm_flags & VM_LOCKED);
931 if (!locked_vma) 960 if (!locked_vma)
932 up_read(&vma->vm_mm->mmap_sem); /* don't need it */ 961 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
@@ -976,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
976 return ret; 1005 return ret;
977} 1006}
978 1007
979/*
980 * common handling for pages mapped in VM_LOCKED vmas
981 */
982static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
983{
984 int mlocked = 0;
985
986 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
987 if (vma->vm_flags & VM_LOCKED) {
988 mlock_vma_page(page);
989 mlocked++; /* really mlocked the page */
990 }
991 up_read(&vma->vm_mm->mmap_sem);
992 }
993 return mlocked;
994}
995
996/** 1008/**
997 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1009 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
998 * rmap method 1010 * rmap method
999 * @page: the page to unmap/unlock 1011 * @page: the page to unmap/unlock
1000 * @unlock: request for unlock rather than unmap [unlikely] 1012 * @flags: action and flags
1001 * @migration: unmapping for migration - ignored if @unlock
1002 * 1013 *
1003 * Find all the mappings of a page using the mapping pointer and the vma chains 1014 * Find all the mappings of a page using the mapping pointer and the vma chains
1004 * contained in the anon_vma struct it points to. 1015 * contained in the anon_vma struct it points to.
@@ -1014,42 +1025,22 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1014{ 1025{
1015 struct anon_vma *anon_vma; 1026 struct anon_vma *anon_vma;
1016 struct vm_area_struct *vma; 1027 struct vm_area_struct *vma;
1017 unsigned int mlocked = 0;
1018 int ret = SWAP_AGAIN; 1028 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1020
1021 if (MLOCK_PAGES && unlikely(unlock))
1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1023 1029
1024 anon_vma = page_lock_anon_vma(page); 1030 anon_vma = page_lock_anon_vma(page);
1025 if (!anon_vma) 1031 if (!anon_vma)
1026 return ret; 1032 return ret;
1027 1033
1028 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1034 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1029 if (MLOCK_PAGES && unlikely(unlock)) { 1035 unsigned long address = vma_address(page, vma);
1030 if (!((vma->vm_flags & VM_LOCKED) && 1036 if (address == -EFAULT)
1031 page_mapped_in_vma(page, vma))) 1037 continue;
1032 continue; /* must visit all unlocked vmas */ 1038 ret = try_to_unmap_one(page, vma, address, flags);
1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1039 if (ret != SWAP_AGAIN || !page_mapped(page))
1034 } else { 1040 break;
1035 ret = try_to_unmap_one(page, vma, flags);
1036 if (ret == SWAP_FAIL || !page_mapped(page))
1037 break;
1038 }
1039 if (ret == SWAP_MLOCK) {
1040 mlocked = try_to_mlock_page(page, vma);
1041 if (mlocked)
1042 break; /* stop if actually mlocked page */
1043 }
1044 } 1041 }
1045 1042
1046 page_unlock_anon_vma(anon_vma); 1043 page_unlock_anon_vma(anon_vma);
1047
1048 if (mlocked)
1049 ret = SWAP_MLOCK; /* actually mlocked the page */
1050 else if (ret == SWAP_MLOCK)
1051 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1052
1053 return ret; 1044 return ret;
1054} 1045}
1055 1046
@@ -1079,48 +1070,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1079 unsigned long max_nl_cursor = 0; 1070 unsigned long max_nl_cursor = 0;
1080 unsigned long max_nl_size = 0; 1071 unsigned long max_nl_size = 0;
1081 unsigned int mapcount; 1072 unsigned int mapcount;
1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1084
1085 if (MLOCK_PAGES && unlikely(unlock))
1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1087 1073
1088 spin_lock(&mapping->i_mmap_lock); 1074 spin_lock(&mapping->i_mmap_lock);
1089 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1075 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1090 if (MLOCK_PAGES && unlikely(unlock)) { 1076 unsigned long address = vma_address(page, vma);
1091 if (!((vma->vm_flags & VM_LOCKED) && 1077 if (address == -EFAULT)
1092 page_mapped_in_vma(page, vma))) 1078 continue;
1093 continue; /* must visit all vmas */ 1079 ret = try_to_unmap_one(page, vma, address, flags);
1094 ret = SWAP_MLOCK; 1080 if (ret != SWAP_AGAIN || !page_mapped(page))
1095 } else { 1081 goto out;
1096 ret = try_to_unmap_one(page, vma, flags);
1097 if (ret == SWAP_FAIL || !page_mapped(page))
1098 goto out;
1099 }
1100 if (ret == SWAP_MLOCK) {
1101 mlocked = try_to_mlock_page(page, vma);
1102 if (mlocked)
1103 break; /* stop if actually mlocked page */
1104 }
1105 } 1082 }
1106 1083
1107 if (mlocked) 1084 if (list_empty(&mapping->i_mmap_nonlinear))
1108 goto out; 1085 goto out;
1109 1086
1110 if (list_empty(&mapping->i_mmap_nonlinear)) 1087 /*
 1088 * We don't bother to try to find the munlocked page in nonlinear vmas.
1089 * It's costly. Instead, later, page reclaim logic may call
1090 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1091 */
1092 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1111 goto out; 1093 goto out;
1112 1094
1113 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1095 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1114 shared.vm_set.list) { 1096 shared.vm_set.list) {
1115 if (MLOCK_PAGES && unlikely(unlock)) {
1116 if (!(vma->vm_flags & VM_LOCKED))
1117 continue; /* must visit all vmas */
1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1119 goto out; /* no need to look further */
1120 }
1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1123 continue;
1124 cursor = (unsigned long) vma->vm_private_data; 1097 cursor = (unsigned long) vma->vm_private_data;
1125 if (cursor > max_nl_cursor) 1098 if (cursor > max_nl_cursor)
1126 max_nl_cursor = cursor; 1099 max_nl_cursor = cursor;
@@ -1153,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1153 do { 1126 do {
1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1127 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1155 shared.vm_set.list) { 1128 shared.vm_set.list) {
1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1157 (vma->vm_flags & VM_LOCKED))
1158 continue;
1159 cursor = (unsigned long) vma->vm_private_data; 1129 cursor = (unsigned long) vma->vm_private_data;
1160 while ( cursor < max_nl_cursor && 1130 while ( cursor < max_nl_cursor &&
1161 cursor < vma->vm_end - vma->vm_start) { 1131 cursor < vma->vm_end - vma->vm_start) {
1162 ret = try_to_unmap_cluster(cursor, &mapcount, 1132 if (try_to_unmap_cluster(cursor, &mapcount,
1163 vma, page); 1133 vma, page) == SWAP_MLOCK)
1164 if (ret == SWAP_MLOCK) 1134 ret = SWAP_MLOCK;
1165 mlocked = 2; /* to return below */
1166 cursor += CLUSTER_SIZE; 1135 cursor += CLUSTER_SIZE;
1167 vma->vm_private_data = (void *) cursor; 1136 vma->vm_private_data = (void *) cursor;
1168 if ((int)mapcount <= 0) 1137 if ((int)mapcount <= 0)
@@ -1183,10 +1152,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1183 vma->vm_private_data = NULL; 1152 vma->vm_private_data = NULL;
1184out: 1153out:
1185 spin_unlock(&mapping->i_mmap_lock); 1154 spin_unlock(&mapping->i_mmap_lock);
1186 if (mlocked)
1187 ret = SWAP_MLOCK; /* actually mlocked the page */
1188 else if (ret == SWAP_MLOCK)
1189 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1190 return ret; 1155 return ret;
1191} 1156}
1192 1157
@@ -1210,7 +1175,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1210 1175
1211 BUG_ON(!PageLocked(page)); 1176 BUG_ON(!PageLocked(page));
1212 1177
1213 if (PageAnon(page)) 1178 if (unlikely(PageKsm(page)))
1179 ret = try_to_unmap_ksm(page, flags);
1180 else if (PageAnon(page))
1214 ret = try_to_unmap_anon(page, flags); 1181 ret = try_to_unmap_anon(page, flags);
1215 else 1182 else
1216 ret = try_to_unmap_file(page, flags); 1183 ret = try_to_unmap_file(page, flags);
@@ -1229,17 +1196,98 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1229 * 1196 *
1230 * Return values are: 1197 * Return values are:
1231 * 1198 *
1232 * SWAP_SUCCESS - no vma's holding page mlocked. 1199 * SWAP_AGAIN - no vma is holding page mlocked, or,
1233 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem 1200 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1201 * SWAP_FAIL - page cannot be located at present
1234 * SWAP_MLOCK - page is now mlocked. 1202 * SWAP_MLOCK - page is now mlocked.
1235 */ 1203 */
1236int try_to_munlock(struct page *page) 1204int try_to_munlock(struct page *page)
1237{ 1205{
1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1206 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1239 1207
1240 if (PageAnon(page)) 1208 if (unlikely(PageKsm(page)))
1209 return try_to_unmap_ksm(page, TTU_MUNLOCK);
1210 else if (PageAnon(page))
1241 return try_to_unmap_anon(page, TTU_MUNLOCK); 1211 return try_to_unmap_anon(page, TTU_MUNLOCK);
1242 else 1212 else
1243 return try_to_unmap_file(page, TTU_MUNLOCK); 1213 return try_to_unmap_file(page, TTU_MUNLOCK);
1244} 1214}
1245 1215
1216#ifdef CONFIG_MIGRATION
1217/*
1218 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1219 * Called by migrate.c to remove migration ptes, but might be used more later.
1220 */
1221static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1222 struct vm_area_struct *, unsigned long, void *), void *arg)
1223{
1224 struct anon_vma *anon_vma;
1225 struct vm_area_struct *vma;
1226 int ret = SWAP_AGAIN;
1227
1228 /*
1229 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
1230 * because that depends on page_mapped(); but not all its usages
1231 * are holding mmap_sem, which also gave the necessary guarantee
1232 * (that this anon_vma's slab has not already been destroyed).
1233 * This needs to be reviewed later: avoiding page_lock_anon_vma()
1234 * is risky, and currently limits the usefulness of rmap_walk().
1235 */
1236 anon_vma = page_anon_vma(page);
1237 if (!anon_vma)
1238 return ret;
1239 spin_lock(&anon_vma->lock);
1240 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1241 unsigned long address = vma_address(page, vma);
1242 if (address == -EFAULT)
1243 continue;
1244 ret = rmap_one(page, vma, address, arg);
1245 if (ret != SWAP_AGAIN)
1246 break;
1247 }
1248 spin_unlock(&anon_vma->lock);
1249 return ret;
1250}
1251
1252static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1253 struct vm_area_struct *, unsigned long, void *), void *arg)
1254{
1255 struct address_space *mapping = page->mapping;
1256 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1257 struct vm_area_struct *vma;
1258 struct prio_tree_iter iter;
1259 int ret = SWAP_AGAIN;
1260
1261 if (!mapping)
1262 return ret;
1263 spin_lock(&mapping->i_mmap_lock);
1264 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1265 unsigned long address = vma_address(page, vma);
1266 if (address == -EFAULT)
1267 continue;
1268 ret = rmap_one(page, vma, address, arg);
1269 if (ret != SWAP_AGAIN)
1270 break;
1271 }
1272 /*
1273 * No nonlinear handling: being always shared, nonlinear vmas
1274 * never contain migration ptes. Decide what to do about this
1275 * limitation to linear when we need rmap_walk() on nonlinear.
1276 */
1277 spin_unlock(&mapping->i_mmap_lock);
1278 return ret;
1279}
1280
1281int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1282 struct vm_area_struct *, unsigned long, void *), void *arg)
1283{
1284 VM_BUG_ON(!PageLocked(page));
1285
1286 if (unlikely(PageKsm(page)))
1287 return rmap_walk_ksm(page, rmap_one, arg);
1288 else if (PageAnon(page))
1289 return rmap_walk_anon(page, rmap_one, arg);
1290 else
1291 return rmap_walk_file(page, rmap_one, arg);
1292}
1293#endif /* CONFIG_MIGRATION */
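rmap_walk() dispatches on the page type (KSM, anon, file) and invokes the supplied callback once per mapping, stopping as soon as the callback returns anything other than SWAP_AGAIN. The in-tree user is remove_migration_ptes(); the callback below is hypothetical, shown only to illustrate the signature and return convention (and, like rmap_walk() itself, it would only be built under CONFIG_MIGRATION):

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/swap.h>

/* Count how many vmas still map @page; the page must be locked. */
static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	(*(int *)arg)++;
	return SWAP_AGAIN;      /* keep walking the remaining mappings */
}

static int count_mappings(struct page *page)
{
	int nr = 0;

	rmap_walk(page, count_one_mapping, &nr);
	return nr;
}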
diff --git a/mm/shmem.c b/mm/shmem.c
index 356dd99566ec..eef4ebea5158 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/ima.h>
33 32
34static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
35 34
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
42 41
43#include <linux/xattr.h> 42#include <linux/xattr.h>
44#include <linux/exportfs.h> 43#include <linux/exportfs.h>
44#include <linux/posix_acl.h>
45#include <linux/generic_acl.h> 45#include <linux/generic_acl.h>
46#include <linux/mman.h> 46#include <linux/mman.h>
47#include <linux/string.h> 47#include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
810 error = inode_setattr(inode, attr); 810 error = inode_setattr(inode, attr);
811#ifdef CONFIG_TMPFS_POSIX_ACL 811#ifdef CONFIG_TMPFS_POSIX_ACL
812 if (!error && (attr->ia_valid & ATTR_MODE)) 812 if (!error && (attr->ia_valid & ATTR_MODE))
813 error = generic_acl_chmod(inode, &shmem_acl_ops); 813 error = generic_acl_chmod(inode);
814#endif 814#endif
815 if (page) 815 if (page)
816 page_cache_release(page); 816 page_cache_release(page);
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1017 goto out; 1017 goto out;
1018 } 1018 }
1019 mutex_unlock(&shmem_swaplist_mutex); 1019 mutex_unlock(&shmem_swaplist_mutex);
1020out: return found; /* 0 or 1 or -ENOMEM */ 1020 /*
1021 * Can some race bring us here? We've been holding page lock,
1022 * so I think not; but would rather try again later than BUG()
1023 */
1024 unlock_page(page);
1025 page_cache_release(page);
1026out:
1027 return (found < 0) ? found : 0;
1021} 1028}
1022 1029
1023/* 1030/*
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1080 else 1087 else
1081 inode = NULL; 1088 inode = NULL;
1082 spin_unlock(&info->lock); 1089 spin_unlock(&info->lock);
1083 swap_duplicate(swap); 1090 swap_shmem_alloc(swap);
1084 BUG_ON(page_mapped(page)); 1091 BUG_ON(page_mapped(page));
1085 page_cache_release(page); /* pagecache ref */ 1092 page_cache_release(page); /* pagecache ref */
1086 swap_writepage(page, wbc); 1093 swap_writepage(page, wbc);
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1817 return error; 1824 return error;
1818 } 1825 }
1819 } 1826 }
1820 error = shmem_acl_init(inode, dir); 1827#ifdef CONFIG_TMPFS_POSIX_ACL
1828 error = generic_acl_init(inode, dir);
1821 if (error) { 1829 if (error) {
1822 iput(inode); 1830 iput(inode);
1823 return error; 1831 return error;
1824 } 1832 }
1833#else
1834 error = 0;
1835#endif
1825 if (dir->i_mode & S_ISGID) { 1836 if (dir->i_mode & S_ISGID) {
1826 inode->i_gid = dir->i_gid; 1837 inode->i_gid = dir->i_gid;
1827 if (S_ISDIR(mode)) 1838 if (S_ISDIR(mode))
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = {
2036 * filesystem level, though. 2047 * filesystem level, though.
2037 */ 2048 */
2038 2049
2039static size_t shmem_xattr_security_list(struct inode *inode, char *list, 2050static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
2040 size_t list_len, const char *name, 2051 size_t list_len, const char *name,
2041 size_t name_len) 2052 size_t name_len, int handler_flags)
2042{ 2053{
2043 return security_inode_listsecurity(inode, list, list_len); 2054 return security_inode_listsecurity(dentry->d_inode, list, list_len);
2044} 2055}
2045 2056
2046static int shmem_xattr_security_get(struct inode *inode, const char *name, 2057static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
2047 void *buffer, size_t size) 2058 void *buffer, size_t size, int handler_flags)
2048{ 2059{
2049 if (strcmp(name, "") == 0) 2060 if (strcmp(name, "") == 0)
2050 return -EINVAL; 2061 return -EINVAL;
2051 return xattr_getsecurity(inode, name, buffer, size); 2062 return xattr_getsecurity(dentry->d_inode, name, buffer, size);
2052} 2063}
2053 2064
2054static int shmem_xattr_security_set(struct inode *inode, const char *name, 2065static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2055 const void *value, size_t size, int flags) 2066 const void *value, size_t size, int flags, int handler_flags)
2056{ 2067{
2057 if (strcmp(name, "") == 0) 2068 if (strcmp(name, "") == 0)
2058 return -EINVAL; 2069 return -EINVAL;
2059 return security_inode_setsecurity(inode, name, value, size, flags); 2070 return security_inode_setsecurity(dentry->d_inode, name, value,
2071 size, flags);
2060} 2072}
2061 2073
2062static struct xattr_handler shmem_xattr_security_handler = { 2074static struct xattr_handler shmem_xattr_security_handler = {
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = {
2067}; 2079};
2068 2080
2069static struct xattr_handler *shmem_xattr_handlers[] = { 2081static struct xattr_handler *shmem_xattr_handlers[] = {
2070 &shmem_xattr_acl_access_handler, 2082 &generic_acl_access_handler,
2071 &shmem_xattr_acl_default_handler, 2083 &generic_acl_default_handler,
2072 &shmem_xattr_security_handler, 2084 &shmem_xattr_security_handler,
2073 NULL 2085 NULL
2074}; 2086};
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = {
2447 .getxattr = generic_getxattr, 2459 .getxattr = generic_getxattr,
2448 .listxattr = generic_listxattr, 2460 .listxattr = generic_listxattr,
2449 .removexattr = generic_removexattr, 2461 .removexattr = generic_removexattr,
2450 .check_acl = shmem_check_acl, 2462 .check_acl = generic_check_acl,
2451#endif 2463#endif
2452 2464
2453}; 2465};
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2470 .getxattr = generic_getxattr, 2482 .getxattr = generic_getxattr,
2471 .listxattr = generic_listxattr, 2483 .listxattr = generic_listxattr,
2472 .removexattr = generic_removexattr, 2484 .removexattr = generic_removexattr,
2473 .check_acl = shmem_check_acl, 2485 .check_acl = generic_check_acl,
2474#endif 2486#endif
2475}; 2487};
2476 2488
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2481 .getxattr = generic_getxattr, 2493 .getxattr = generic_getxattr,
2482 .listxattr = generic_listxattr, 2494 .listxattr = generic_listxattr,
2483 .removexattr = generic_removexattr, 2495 .removexattr = generic_removexattr,
2484 .check_acl = shmem_check_acl, 2496 .check_acl = generic_check_acl,
2485#endif 2497#endif
2486}; 2498};
2487 2499
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2619 int error; 2631 int error;
2620 struct file *file; 2632 struct file *file;
2621 struct inode *inode; 2633 struct inode *inode;
2622 struct dentry *dentry, *root; 2634 struct path path;
2635 struct dentry *root;
2623 struct qstr this; 2636 struct qstr this;
2624 2637
2625 if (IS_ERR(shm_mnt)) 2638 if (IS_ERR(shm_mnt))
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2636 this.len = strlen(name); 2649 this.len = strlen(name);
2637 this.hash = 0; /* will go */ 2650 this.hash = 0; /* will go */
2638 root = shm_mnt->mnt_root; 2651 root = shm_mnt->mnt_root;
2639 dentry = d_alloc(root, &this); 2652 path.dentry = d_alloc(root, &this);
2640 if (!dentry) 2653 if (!path.dentry)
2641 goto put_memory; 2654 goto put_memory;
2642 2655 path.mnt = mntget(shm_mnt);
2643 error = -ENFILE;
2644 file = get_empty_filp();
2645 if (!file)
2646 goto put_dentry;
2647 2656
2648 error = -ENOSPC; 2657 error = -ENOSPC;
2649 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
2650 if (!inode) 2659 if (!inode)
2651 goto close_file; 2660 goto put_dentry;
2652 2661
2653 d_instantiate(dentry, inode); 2662 d_instantiate(path.dentry, inode);
2654 inode->i_size = size; 2663 inode->i_size = size;
2655 inode->i_nlink = 0; /* It is unlinked */ 2664 inode->i_nlink = 0; /* It is unlinked */
2656 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2657 &shmem_file_operations);
2658
2659#ifndef CONFIG_MMU 2665#ifndef CONFIG_MMU
2660 error = ramfs_nommu_expand_for_mapping(inode, size); 2666 error = ramfs_nommu_expand_for_mapping(inode, size);
2661 if (error) 2667 if (error)
2662 goto close_file; 2668 goto put_dentry;
2663#endif 2669#endif
2664 ima_counts_get(file); 2670
2671 error = -ENFILE;
2672 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2673 &shmem_file_operations);
2674 if (!file)
2675 goto put_dentry;
2676
2665 return file; 2677 return file;
2666 2678
2667close_file:
2668 put_filp(file);
2669put_dentry: 2679put_dentry:
2670 dput(dentry); 2680 path_put(&path);
2671put_memory: 2681put_memory:
2672 shmem_unacct_size(flags, size); 2682 shmem_unacct_size(flags, size);
2673 return ERR_PTR(error); 2683 return ERR_PTR(error);
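
An illustrative caller of the reworked shmem_file_setup() above; the wrapper is hypothetical, while the ERR_PTR() convention is the function's own. With the switch to alloc_file(&path, ...), the returned file carries its own dentry/mnt reference, so a plain fput() releases everything.

static int shmem_setup_demo(void)
{
	struct file *file;

	/* the name is cosmetic; the inode is created unlinked */
	file = shmem_file_setup("demo", PAGE_SIZE, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* ... use file->f_mapping, mmap it, etc. ... */

	fput(file);	/* drops the path reference taken via alloc_file() */
	return 0;
}
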
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
deleted file mode 100644
index df2c87fdae50..000000000000
--- a/mm/shmem_acl.c
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(inode->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(inode->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = inode->i_acl;
49 inode->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = inode->i_default_acl;
54 inode->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Inizialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */
160int
161shmem_check_acl(struct inode *inode, int mask)
162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
164
165 if (acl) {
166 int error = posix_acl_permission(inode, acl, mask);
167 posix_acl_release(acl);
168 return error;
169 }
170 return -EAGAIN;
171}
diff --git a/mm/slab.c b/mm/slab.c
index 211b1746c63c..7451bdacaf18 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
490 490
491#endif 491#endif
492 492
493#ifdef CONFIG_KMEMTRACE 493#ifdef CONFIG_TRACING
494size_t slab_buffer_size(struct kmem_cache *cachep) 494size_t slab_buffer_size(struct kmem_cache *cachep)
495{ 495{
496 return cachep->buffer_size; 496 return cachep->buffer_size;
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = {
604 604
605#define BAD_ALIEN_MAGIC 0x01020304ul 605#define BAD_ALIEN_MAGIC 0x01020304ul
606 606
607/*
608 * chicken and egg problem: delay the per-cpu array allocation
609 * until the general caches are up.
610 */
611static enum {
612 NONE,
613 PARTIAL_AC,
614 PARTIAL_L3,
615 EARLY,
616 FULL
617} g_cpucache_up;
618
619/*
620 * used by boot code to determine if it can use slab based allocator
621 */
622int slab_is_available(void)
623{
624 return g_cpucache_up >= EARLY;
625}
626
607#ifdef CONFIG_LOCKDEP 627#ifdef CONFIG_LOCKDEP
608 628
609/* 629/*
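
The slab_is_available() test hoisted above is what early boot code consults before trusting kmalloc(); a hedged sketch of that pattern, with a hypothetical helper name (needs <linux/slab.h> and <linux/bootmem.h>):

static void *boot_or_slab_alloc(unsigned long size)
{
	/* before g_cpucache_up reaches EARLY, the slab allocator is not usable */
	if (slab_is_available())
		return kmalloc(size, GFP_KERNEL);
	return alloc_bootmem(size);
}
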
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = {
620static struct lock_class_key on_slab_l3_key; 640static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key; 641static struct lock_class_key on_slab_alc_key;
622 642
623static inline void init_lock_keys(void) 643static void init_node_lock_keys(int q)
624
625{ 644{
626 int q;
627 struct cache_sizes *s = malloc_sizes; 645 struct cache_sizes *s = malloc_sizes;
628 646
629 while (s->cs_size != ULONG_MAX) { 647 if (g_cpucache_up != FULL)
630 for_each_node(q) { 648 return;
631 struct array_cache **alc; 649
632 int r; 650 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
633 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; 651 struct array_cache **alc;
634 if (!l3 || OFF_SLAB(s->cs_cachep)) 652 struct kmem_list3 *l3;
635 continue; 653 int r;
636 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 654
637 alc = l3->alien; 655 l3 = s->cs_cachep->nodelists[q];
638 /* 656 if (!l3 || OFF_SLAB(s->cs_cachep))
639 * FIXME: This check for BAD_ALIEN_MAGIC 657 continue;
640 * should go away when common slab code is taught to 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
641 * work even without alien caches. 659 alc = l3->alien;
642 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 660 /*
643 * for alloc_alien_cache, 661 * FIXME: This check for BAD_ALIEN_MAGIC
644 */ 662 * should go away when common slab code is taught to
645 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 663 * work even without alien caches.
646 continue; 664 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 for_each_node(r) { 665 * for alloc_alien_cache,
648 if (alc[r]) 666 */
649 lockdep_set_class(&alc[r]->lock, 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 &on_slab_alc_key); 668 continue;
651 } 669 for_each_node(r) {
670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock,
672 &on_slab_alc_key);
652 } 673 }
653 s++;
654 } 674 }
655} 675}
676
677static inline void init_lock_keys(void)
678{
679 int node;
680
681 for_each_node(node)
682 init_node_lock_keys(node);
683}
656#else 684#else
685static void init_node_lock_keys(int q)
686{
687}
688
657static inline void init_lock_keys(void) 689static inline void init_lock_keys(void)
658{ 690{
659} 691}
@@ -665,26 +697,6 @@ static inline void init_lock_keys(void)
665static DEFINE_MUTEX(cache_chain_mutex); 697static DEFINE_MUTEX(cache_chain_mutex);
666static struct list_head cache_chain; 698static struct list_head cache_chain;
667 699
668/*
669 * chicken and egg problem: delay the per-cpu array allocation
670 * until the general caches are up.
671 */
672static enum {
673 NONE,
674 PARTIAL_AC,
675 PARTIAL_L3,
676 EARLY,
677 FULL
678} g_cpucache_up;
679
680/*
681 * used by boot code to determine if it can use slab based allocator
682 */
683int slab_is_available(void)
684{
685 return g_cpucache_up >= EARLY;
686}
687
688static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 700static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
689 701
690static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 702static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -1120,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1120 if (nc) 1132 if (nc)
1121 free_block(cachep, nc->entry, nc->avail, node); 1133 free_block(cachep, nc->entry, nc->avail, node);
1122 1134
1123 if (!cpus_empty(*mask)) { 1135 if (!cpumask_empty(mask)) {
1124 spin_unlock_irq(&l3->list_lock); 1136 spin_unlock_irq(&l3->list_lock);
1125 goto free_array_cache; 1137 goto free_array_cache;
1126 } 1138 }
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1254 kfree(shared); 1266 kfree(shared);
1255 free_alien_cache(alien); 1267 free_alien_cache(alien);
1256 } 1268 }
1269 init_node_lock_keys(node);
1270
1257 return 0; 1271 return 0;
1258bad: 1272bad:
1259 cpuup_canceled(cpu); 1273 cpuup_canceled(cpu);
@@ -2261,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2261 /* 2275 /*
2262 * Determine if the slab management is 'on' or 'off' slab. 2276 * Determine if the slab management is 'on' or 'off' slab.
2263 * (bootstrapping cannot cope with offslab caches so don't do 2277 * (bootstrapping cannot cope with offslab caches so don't do
2264 * it too early on.) 2278 * it too early on. Always use on-slab management when
2279 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2265 */ 2280 */
2266 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2281 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2282 !(flags & SLAB_NOLEAKTRACE))
2267 /* 2283 /*
2268 * Size is large, assume best to place the slab management obj 2284 * Size is large, assume best to place the slab management obj
2269 * off-slab (should allow better packing of objs). 2285 * off-slab (should allow better packing of objs).
@@ -2582,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2582 * kmemleak does not treat the ->s_mem pointer as a reference 2598 * kmemleak does not treat the ->s_mem pointer as a reference
2583 * to the object. Otherwise we will not report the leak. 2599 * to the object. Otherwise we will not report the leak.
2584 */ 2600 */
2585 kmemleak_scan_area(slabp, offsetof(struct slab, list), 2601 kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2586 sizeof(struct list_head), local_flags); 2602 local_flags);
2587 if (!slabp) 2603 if (!slabp)
2588 return NULL; 2604 return NULL;
2589 } else { 2605 } else {
@@ -3103,13 +3119,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3103 } else { 3119 } else {
3104 STATS_INC_ALLOCMISS(cachep); 3120 STATS_INC_ALLOCMISS(cachep);
3105 objp = cache_alloc_refill(cachep, flags); 3121 objp = cache_alloc_refill(cachep, flags);
3122 /*
3123 * the 'ac' may be updated by cache_alloc_refill(),
3124 * and kmemleak_erase() requires its correct value.
3125 */
3126 ac = cpu_cache_get(cachep);
3106 } 3127 }
3107 /* 3128 /*
3108 * To avoid a false negative, if an object that is in one of the 3129 * To avoid a false negative, if an object that is in one of the
3109 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3130 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3110 * treat the array pointers as a reference to the object. 3131 * treat the array pointers as a reference to the object.
3111 */ 3132 */
3112 kmemleak_erase(&ac->entry[ac->avail]); 3133 if (objp)
3134 kmemleak_erase(&ac->entry[ac->avail]);
3113 return objp; 3135 return objp;
3114} 3136}
3115 3137
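
The comment added in the hunk above captures a general hazard: a pointer cached from per-cpu state goes stale once a callee may reallocate that state. A stripped-down, non-kernel sketch of the same shape, with all names hypothetical:

struct pcpu_cache {
	void **entry;
	unsigned int avail;
};

static void *take_object(struct pcpu_cache *(*get_cache)(void),
			 void (*refill_cache)(void))
{
	struct pcpu_cache *ac = get_cache();

	if (!ac->avail) {
		refill_cache();		/* may replace the per-cpu cache... */
		ac = get_cache();	/* ...so re-read before dereferencing */
	}
	return ac->avail ? ac->entry[--ac->avail] : NULL;
}
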
@@ -3306,7 +3328,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3306 cache_alloc_debugcheck_before(cachep, flags); 3328 cache_alloc_debugcheck_before(cachep, flags);
3307 local_irq_save(save_flags); 3329 local_irq_save(save_flags);
3308 3330
3309 if (unlikely(nodeid == -1)) 3331 if (nodeid == -1)
3310 nodeid = numa_node_id(); 3332 nodeid = numa_node_id();
3311 3333
3312 if (unlikely(!cachep->nodelists[nodeid])) { 3334 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3558,7 +3580,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3558} 3580}
3559EXPORT_SYMBOL(kmem_cache_alloc); 3581EXPORT_SYMBOL(kmem_cache_alloc);
3560 3582
3561#ifdef CONFIG_KMEMTRACE 3583#ifdef CONFIG_TRACING
3562void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3584void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3563{ 3585{
3564 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3586 return __cache_alloc(cachep, flags, __builtin_return_address(0));
@@ -3621,7 +3643,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3621} 3643}
3622EXPORT_SYMBOL(kmem_cache_alloc_node); 3644EXPORT_SYMBOL(kmem_cache_alloc_node);
3623 3645
3624#ifdef CONFIG_KMEMTRACE 3646#ifdef CONFIG_TRACING
3625void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3647void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3626 gfp_t flags, 3648 gfp_t flags,
3627 int nodeid) 3649 int nodeid)
@@ -3649,7 +3671,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3649 return ret; 3671 return ret;
3650} 3672}
3651 3673
3652#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3674#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3653void *__kmalloc_node(size_t size, gfp_t flags, int node) 3675void *__kmalloc_node(size_t size, gfp_t flags, int node)
3654{ 3676{
3655 return __do_kmalloc_node(size, flags, node, 3677 return __do_kmalloc_node(size, flags, node,
@@ -3669,7 +3691,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3669 return __do_kmalloc_node(size, flags, node, NULL); 3691 return __do_kmalloc_node(size, flags, node, NULL);
3670} 3692}
3671EXPORT_SYMBOL(__kmalloc_node); 3693EXPORT_SYMBOL(__kmalloc_node);
3672#endif /* CONFIG_DEBUG_SLAB */ 3694#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3673#endif /* CONFIG_NUMA */ 3695#endif /* CONFIG_NUMA */
3674 3696
3675/** 3697/**
@@ -3701,7 +3723,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3701} 3723}
3702 3724
3703 3725
3704#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3726#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3705void *__kmalloc(size_t size, gfp_t flags) 3727void *__kmalloc(size_t size, gfp_t flags)
3706{ 3728{
3707 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3729 return __do_kmalloc(size, flags, __builtin_return_address(0));
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc719552..8d71aaf888d7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1735 } 1735 }
1736 local_irq_restore(flags); 1736 local_irq_restore(flags);
1737 1737
1738 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1738 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1739 memset(object, 0, objsize);
1740 1740
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
@@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1754} 1754}
1755EXPORT_SYMBOL(kmem_cache_alloc); 1755EXPORT_SYMBOL(kmem_cache_alloc);
1756 1756
1757#ifdef CONFIG_KMEMTRACE 1757#ifdef CONFIG_TRACING
1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1759{ 1759{
1760 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1760 return slab_alloc(s, gfpflags, -1, _RET_IP_);
@@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1775EXPORT_SYMBOL(kmem_cache_alloc_node); 1775EXPORT_SYMBOL(kmem_cache_alloc_node);
1776#endif 1776#endif
1777 1777
1778#ifdef CONFIG_KMEMTRACE 1778#ifdef CONFIG_TRACING
1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1780 gfp_t gfpflags, 1780 gfp_t gfpflags,
1781 int node) 1781 int node)
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4371 return len + sprintf(buf + len, "\n"); 4371 return len + sprintf(buf + len, "\n");
4372} 4372}
4373 4373
4374static void clear_stat(struct kmem_cache *s, enum stat_item si)
4375{
4376 int cpu;
4377
4378 for_each_online_cpu(cpu)
4379 get_cpu_slab(s, cpu)->stat[si] = 0;
4380}
4381
4374#define STAT_ATTR(si, text) \ 4382#define STAT_ATTR(si, text) \
4375static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4383static ssize_t text##_show(struct kmem_cache *s, char *buf) \
4376{ \ 4384{ \
4377 return show_stat(s, buf, si); \ 4385 return show_stat(s, buf, si); \
4378} \ 4386} \
4379SLAB_ATTR_RO(text); \ 4387static ssize_t text##_store(struct kmem_cache *s, \
4388 const char *buf, size_t length) \
4389{ \
4390 if (buf[0] != '0') \
4391 return -EINVAL; \
4392 clear_stat(s, si); \
4393 return length; \
4394} \
4395SLAB_ATTR(text); \
4380 4396
4381STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4397STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4382STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4398STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
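
Roughly what one STAT_ATTR() instance now expands to with the new store handler: writing "0" to the sysfs attribute (under /sys/kernel/slab/<cache>/) clears the per-cpu counter via clear_stat(). The expansion is written out here only for clarity; the patch generates it from the macro.

static ssize_t alloc_fastpath_show(struct kmem_cache *s, char *buf)
{
	return show_stat(s, buf, ALLOC_FASTPATH);
}

static ssize_t alloc_fastpath_store(struct kmem_cache *s,
				    const char *buf, size_t length)
{
	if (buf[0] != '0')	/* only "echo 0 >" resets the statistic */
		return -EINVAL;
	clear_stat(s, ALLOC_FASTPATH);
	return length;
}
SLAB_ATTR(alloc_fastpath);
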
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4de7f02f820b..6c0585b16418 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/ksm.h>
25#include <linux/rmap.h> 26#include <linux/rmap.h>
26#include <linux/security.h> 27#include <linux/security.h>
27#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
@@ -35,11 +36,15 @@
35#include <linux/swapops.h> 36#include <linux/swapops.h>
36#include <linux/page_cgroup.h> 37#include <linux/page_cgroup.h>
37 38
39static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
40 unsigned char);
41static void free_swap_count_continuations(struct swap_info_struct *);
42static sector_t map_swap_entry(swp_entry_t, struct block_device**);
43
38static DEFINE_SPINLOCK(swap_lock); 44static DEFINE_SPINLOCK(swap_lock);
39static unsigned int nr_swapfiles; 45static unsigned int nr_swapfiles;
40long nr_swap_pages; 46long nr_swap_pages;
41long total_swap_pages; 47long total_swap_pages;
42static int swap_overflow;
43static int least_priority; 48static int least_priority;
44 49
45static const char Bad_file[] = "Bad swap file entry "; 50static const char Bad_file[] = "Bad swap file entry ";
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry ";
49 54
50static struct swap_list_t swap_list = {-1, -1}; 55static struct swap_list_t swap_list = {-1, -1};
51 56
52static struct swap_info_struct swap_info[MAX_SWAPFILES]; 57static struct swap_info_struct *swap_info[MAX_SWAPFILES];
53 58
54static DEFINE_MUTEX(swapon_mutex); 59static DEFINE_MUTEX(swapon_mutex);
55 60
56/* For reference count accounting in swap_map */ 61static inline unsigned char swap_count(unsigned char ent)
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{ 62{
70 return !!(ent & SWAP_HAS_CACHE); 63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
71} 64}
72 65
73static inline unsigned short encode_swapmap(int count, bool has_cache) 66/* returns 1 if swap entry is freed */
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
82/* returnes 1 if swap entry is freed */
83static int 67static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 68__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{ 69{
86 int type = si - swap_info; 70 swp_entry_t entry = swp_entry(si->type, offset);
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page; 71 struct page *page;
89 int ret = 0; 72 int ret = 0;
90 73
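
A hedged decode of one swap_map byte under the new unsigned char scheme: it mirrors swap_count() above and the special values (SWAP_HAS_CACHE, COUNT_CONTINUED, SWAP_MAP_SHMEM) handled later in this diff. The helper itself is illustrative only, not part of the patch.

static void describe_swap_map(unsigned char ent)
{
	unsigned char count = ent & ~SWAP_HAS_CACHE;	/* i.e. swap_count(ent) */

	if (ent & SWAP_HAS_CACHE)
		printk(KERN_DEBUG "entry also present in swap cache\n");

	if (count == SWAP_MAP_SHMEM)
		printk(KERN_DEBUG "entry owned by shmem/tmpfs\n");
	else if (count & COUNT_CONTINUED)
		printk(KERN_DEBUG "count continues in a continuation page\n");
	else
		printk(KERN_DEBUG "pte references: %u\n", count);
}
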
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
120 down_read(&swap_unplug_sem); 103 down_read(&swap_unplug_sem);
121 entry.val = page_private(page); 104 entry.val = page_private(page);
122 if (PageSwapCache(page)) { 105 if (PageSwapCache(page)) {
123 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 106 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
124 struct backing_dev_info *bdi; 107 struct backing_dev_info *bdi;
125 108
126 /* 109 /*
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
146static int discard_swap(struct swap_info_struct *si) 129static int discard_swap(struct swap_info_struct *si)
147{ 130{
148 struct swap_extent *se; 131 struct swap_extent *se;
132 sector_t start_block;
133 sector_t nr_blocks;
149 int err = 0; 134 int err = 0;
150 135
151 list_for_each_entry(se, &si->extent_list, list) { 136 /* Do not discard the swap header page! */
152 sector_t start_block = se->start_block << (PAGE_SHIFT - 9); 137 se = &si->first_swap_extent;
153 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 138 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
143 if (err)
144 return err;
145 cond_resched();
146 }
154 147
155 if (se->start_page == 0) { 148 list_for_each_entry(se, &si->first_swap_extent.list, list) {
156 /* Do not discard the swap header page! */ 149 start_block = se->start_block << (PAGE_SHIFT - 9);
157 start_block += 1 << (PAGE_SHIFT - 9); 150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
158 nr_blocks -= 1 << (PAGE_SHIFT - 9);
159 if (!nr_blocks)
160 continue;
161 }
162 151
163 err = blkdev_issue_discard(si->bdev, start_block, 152 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL, 153 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
165 DISCARD_FL_BARRIER);
166 if (err) 154 if (err)
167 break; 155 break;
168 156
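
Unit note for the shifts in discard_swap() above: extents are kept in page units, while blkdev_issue_discard() takes 512-byte sectors, hence the << (PAGE_SHIFT - 9) conversions and the +1/-1 that skip the swap header page. An illustrative helper only:

static inline sector_t swap_page_to_sector(pgoff_t page_no)
{
	/* PAGE_SHIFT - 9 == log2(PAGE_SIZE / 512) */
	return (sector_t)page_no << (PAGE_SHIFT - 9);
}
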
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
201 start_block <<= PAGE_SHIFT - 9; 189 start_block <<= PAGE_SHIFT - 9;
202 nr_blocks <<= PAGE_SHIFT - 9; 190 nr_blocks <<= PAGE_SHIFT - 9;
203 if (blkdev_issue_discard(si->bdev, start_block, 191 if (blkdev_issue_discard(si->bdev, start_block,
204 nr_blocks, GFP_NOIO, 192 nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
205 DISCARD_FL_BARRIER))
206 break; 193 break;
207 } 194 }
208 195
209 lh = se->list.next; 196 lh = se->list.next;
210 if (lh == &si->extent_list)
211 lh = lh->next;
212 se = list_entry(lh, struct swap_extent, list); 197 se = list_entry(lh, struct swap_extent, list);
213 } 198 }
214} 199}
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word)
223#define LATENCY_LIMIT 256 208#define LATENCY_LIMIT 256
224 209
225static inline unsigned long scan_swap_map(struct swap_info_struct *si, 210static inline unsigned long scan_swap_map(struct swap_info_struct *si,
226 int cache) 211 unsigned char usage)
227{ 212{
228 unsigned long offset; 213 unsigned long offset;
229 unsigned long scan_base; 214 unsigned long scan_base;
@@ -354,10 +339,7 @@ checks:
354 si->lowest_bit = si->max; 339 si->lowest_bit = si->max;
355 si->highest_bit = 0; 340 si->highest_bit = 0;
356 } 341 }
357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 342 si->swap_map[offset] = usage;
358 si->swap_map[offset] = encode_swapmap(0, true);
359 else /* at suspend */
360 si->swap_map[offset] = encode_swapmap(1, false);
361 si->cluster_next = offset + 1; 343 si->cluster_next = offset + 1;
362 si->flags -= SWP_SCANNING; 344 si->flags -= SWP_SCANNING;
363 345
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void)
467 nr_swap_pages--; 449 nr_swap_pages--;
468 450
469 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 451 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
470 si = swap_info + type; 452 si = swap_info[type];
471 next = si->next; 453 next = si->next;
472 if (next < 0 || 454 if (next < 0 ||
473 (!wrapped && si->prio != swap_info[next].prio)) { 455 (!wrapped && si->prio != swap_info[next]->prio)) {
474 next = swap_list.head; 456 next = swap_list.head;
475 wrapped++; 457 wrapped++;
476 } 458 }
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void)
482 464
483 swap_list.next = next; 465 swap_list.next = next;
484 /* This is called for allocating swap entry for cache */ 466 /* This is called for allocating swap entry for cache */
485 offset = scan_swap_map(si, SWAP_CACHE); 467 offset = scan_swap_map(si, SWAP_HAS_CACHE);
486 if (offset) { 468 if (offset) {
487 spin_unlock(&swap_lock); 469 spin_unlock(&swap_lock);
488 return swp_entry(type, offset); 470 return swp_entry(type, offset);
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type)
503 pgoff_t offset; 485 pgoff_t offset;
504 486
505 spin_lock(&swap_lock); 487 spin_lock(&swap_lock);
506 si = swap_info + type; 488 si = swap_info[type];
507 if (si->flags & SWP_WRITEOK) { 489 if (si && (si->flags & SWP_WRITEOK)) {
508 nr_swap_pages--; 490 nr_swap_pages--;
509 /* This is called for allocating swap entry, not cache */ 491 /* This is called for allocating swap entry, not cache */
510 offset = scan_swap_map(si, SWAP_MAP); 492 offset = scan_swap_map(si, 1);
511 if (offset) { 493 if (offset) {
512 spin_unlock(&swap_lock); 494 spin_unlock(&swap_lock);
513 return swp_entry(type, offset); 495 return swp_entry(type, offset);
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type)
518 return (swp_entry_t) {0}; 500 return (swp_entry_t) {0};
519} 501}
520 502
521static struct swap_info_struct * swap_info_get(swp_entry_t entry) 503static struct swap_info_struct *swap_info_get(swp_entry_t entry)
522{ 504{
523 struct swap_info_struct * p; 505 struct swap_info_struct *p;
524 unsigned long offset, type; 506 unsigned long offset, type;
525 507
526 if (!entry.val) 508 if (!entry.val)
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
528 type = swp_type(entry); 510 type = swp_type(entry);
529 if (type >= nr_swapfiles) 511 if (type >= nr_swapfiles)
530 goto bad_nofile; 512 goto bad_nofile;
531 p = & swap_info[type]; 513 p = swap_info[type];
532 if (!(p->flags & SWP_USED)) 514 if (!(p->flags & SWP_USED))
533 goto bad_device; 515 goto bad_device;
534 offset = swp_offset(entry); 516 offset = swp_offset(entry);
@@ -554,41 +536,56 @@ out:
554 return NULL; 536 return NULL;
555} 537}
556 538
557static int swap_entry_free(struct swap_info_struct *p, 539static unsigned char swap_entry_free(struct swap_info_struct *p,
558 swp_entry_t ent, int cache) 540 swp_entry_t entry, unsigned char usage)
559{ 541{
560 unsigned long offset = swp_offset(ent); 542 unsigned long offset = swp_offset(entry);
561 int count = swap_count(p->swap_map[offset]); 543 unsigned char count;
562 bool has_cache; 544 unsigned char has_cache;
563 545
564 has_cache = swap_has_cache(p->swap_map[offset]); 546 count = p->swap_map[offset];
547 has_cache = count & SWAP_HAS_CACHE;
548 count &= ~SWAP_HAS_CACHE;
565 549
566 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 550 if (usage == SWAP_HAS_CACHE) {
567 if (count < SWAP_MAP_MAX) {
568 count--;
569 p->swap_map[offset] = encode_swapmap(count, has_cache);
570 }
571 } else { /* dropping swap cache flag */
572 VM_BUG_ON(!has_cache); 551 VM_BUG_ON(!has_cache);
573 p->swap_map[offset] = encode_swapmap(count, false); 552 has_cache = 0;
574 553 } else if (count == SWAP_MAP_SHMEM) {
554 /*
555 * Or we could insist on shmem.c using a special
556 * swap_shmem_free() and free_shmem_swap_and_cache()...
557 */
558 count = 0;
559 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
560 if (count == COUNT_CONTINUED) {
561 if (swap_count_continued(p, offset, count))
562 count = SWAP_MAP_MAX | COUNT_CONTINUED;
563 else
564 count = SWAP_MAP_MAX;
565 } else
566 count--;
575 } 567 }
576 /* return code. */ 568
577 count = p->swap_map[offset]; 569 if (!count)
570 mem_cgroup_uncharge_swap(entry);
571
572 usage = count | has_cache;
573 p->swap_map[offset] = usage;
574
578 /* free if no reference */ 575 /* free if no reference */
579 if (!count) { 576 if (!usage) {
580 if (offset < p->lowest_bit) 577 if (offset < p->lowest_bit)
581 p->lowest_bit = offset; 578 p->lowest_bit = offset;
582 if (offset > p->highest_bit) 579 if (offset > p->highest_bit)
583 p->highest_bit = offset; 580 p->highest_bit = offset;
584 if (p->prio > swap_info[swap_list.next].prio) 581 if (swap_list.next >= 0 &&
585 swap_list.next = p - swap_info; 582 p->prio > swap_info[swap_list.next]->prio)
583 swap_list.next = p->type;
586 nr_swap_pages++; 584 nr_swap_pages++;
587 p->inuse_pages--; 585 p->inuse_pages--;
588 } 586 }
589 if (!swap_count(count)) 587
590 mem_cgroup_uncharge_swap(ent); 588 return usage;
591 return count;
592} 589}
593 590
594/* 591/*
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p,
597 */ 594 */
598void swap_free(swp_entry_t entry) 595void swap_free(swp_entry_t entry)
599{ 596{
600 struct swap_info_struct * p; 597 struct swap_info_struct *p;
601 598
602 p = swap_info_get(entry); 599 p = swap_info_get(entry);
603 if (p) { 600 if (p) {
604 swap_entry_free(p, entry, SWAP_MAP); 601 swap_entry_free(p, entry, 1);
605 spin_unlock(&swap_lock); 602 spin_unlock(&swap_lock);
606 } 603 }
607} 604}
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry)
612void swapcache_free(swp_entry_t entry, struct page *page) 609void swapcache_free(swp_entry_t entry, struct page *page)
613{ 610{
614 struct swap_info_struct *p; 611 struct swap_info_struct *p;
615 int ret; 612 unsigned char count;
616 613
617 p = swap_info_get(entry); 614 p = swap_info_get(entry);
618 if (p) { 615 if (p) {
619 ret = swap_entry_free(p, entry, SWAP_CACHE); 616 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
620 if (page) { 617 if (page)
621 bool swapout; 618 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
622 if (ret)
623 swapout = true; /* the end of swap out */
624 else
625 swapout = false; /* no more swap users! */
626 mem_cgroup_uncharge_swapcache(page, entry, swapout);
627 }
628 spin_unlock(&swap_lock); 619 spin_unlock(&swap_lock);
629 } 620 }
630 return;
631} 621}
632 622
633/* 623/*
634 * How many references to page are currently swapped out? 624 * How many references to page are currently swapped out?
625 * This does not give an exact answer when swap count is continued,
626 * but does include the high COUNT_CONTINUED flag to allow for that.
635 */ 627 */
636static inline int page_swapcount(struct page *page) 628static inline int page_swapcount(struct page *page)
637{ 629{
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page)
659 int count; 651 int count;
660 652
661 VM_BUG_ON(!PageLocked(page)); 653 VM_BUG_ON(!PageLocked(page));
654 if (unlikely(PageKsm(page)))
655 return 0;
662 count = page_mapcount(page); 656 count = page_mapcount(page);
663 if (count <= 1 && PageSwapCache(page)) { 657 if (count <= 1 && PageSwapCache(page)) {
664 count += page_swapcount(page); 658 count += page_swapcount(page);
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page)
667 SetPageDirty(page); 661 SetPageDirty(page);
668 } 662 }
669 } 663 }
670 return count == 1; 664 return count <= 1;
671} 665}
672 666
673/* 667/*
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry)
704 698
705 p = swap_info_get(entry); 699 p = swap_info_get(entry);
706 if (p) { 700 if (p) {
707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
708 page = find_get_page(&swapper_space, entry.val); 702 page = find_get_page(&swapper_space, entry.val);
709 if (page && !trylock_page(page)) { 703 if (page && !trylock_page(page)) {
710 page_cache_release(page); 704 page_cache_release(page);
@@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry)
741int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 735int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
742{ 736{
743 struct block_device *bdev = NULL; 737 struct block_device *bdev = NULL;
744 int i; 738 int type;
745 739
746 if (device) 740 if (device)
747 bdev = bdget(device); 741 bdev = bdget(device);
748 742
749 spin_lock(&swap_lock); 743 spin_lock(&swap_lock);
750 for (i = 0; i < nr_swapfiles; i++) { 744 for (type = 0; type < nr_swapfiles; type++) {
751 struct swap_info_struct *sis = swap_info + i; 745 struct swap_info_struct *sis = swap_info[type];
752 746
753 if (!(sis->flags & SWP_WRITEOK)) 747 if (!(sis->flags & SWP_WRITEOK))
754 continue; 748 continue;
@@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
758 *bdev_p = bdgrab(sis->bdev); 752 *bdev_p = bdgrab(sis->bdev);
759 753
760 spin_unlock(&swap_lock); 754 spin_unlock(&swap_lock);
761 return i; 755 return type;
762 } 756 }
763 if (bdev == sis->bdev) { 757 if (bdev == sis->bdev) {
764 struct swap_extent *se; 758 struct swap_extent *se = &sis->first_swap_extent;
765 759
766 se = list_entry(sis->extent_list.next,
767 struct swap_extent, list);
768 if (se->start_block == offset) { 760 if (se->start_block == offset) {
769 if (bdev_p) 761 if (bdev_p)
770 *bdev_p = bdgrab(sis->bdev); 762 *bdev_p = bdgrab(sis->bdev);
771 763
772 spin_unlock(&swap_lock); 764 spin_unlock(&swap_lock);
773 bdput(bdev); 765 bdput(bdev);
774 return i; 766 return type;
775 } 767 }
776 } 768 }
777 } 769 }
@@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
783} 775}
784 776
785/* 777/*
778 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
779 * corresponding to given index in swap_info (swap type).
780 */
781sector_t swapdev_block(int type, pgoff_t offset)
782{
783 struct block_device *bdev;
784
785 if ((unsigned int)type >= nr_swapfiles)
786 return 0;
787 if (!(swap_info[type]->flags & SWP_WRITEOK))
788 return 0;
789 return map_swap_entry(swp_entry(type, offset), &bdev);
790}
791
792/*
786 * Return either the total number of swap pages of given type, or the number 793 * Return either the total number of swap pages of given type, or the number
787 * of free pages of that type (depending on @free) 794 * of free pages of that type (depending on @free)
788 * 795 *
@@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free)
792{ 799{
793 unsigned int n = 0; 800 unsigned int n = 0;
794 801
795 if (type < nr_swapfiles) { 802 spin_lock(&swap_lock);
796 spin_lock(&swap_lock); 803 if ((unsigned int)type < nr_swapfiles) {
797 if (swap_info[type].flags & SWP_WRITEOK) { 804 struct swap_info_struct *sis = swap_info[type];
798 n = swap_info[type].pages; 805
806 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages;
799 if (free) 808 if (free)
800 n -= swap_info[type].inuse_pages; 809 n -= sis->inuse_pages;
801 } 810 }
802 spin_unlock(&swap_lock);
803 } 811 }
812 spin_unlock(&swap_lock);
804 return n; 813 return n;
805} 814}
806#endif 815#endif /* CONFIG_HIBERNATION */
807 816
808/* 817/*
809 * No need to decide whether this PTE shares the swap entry with others, 818 * No need to decide whether this PTE shares the swap entry with others,
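
An illustrative hibernation-style caller of the relocated swapdev_block() above: per the code added next to swap_type_of(), it returns 0 when the swap type is out of range or not marked SWP_WRITEOK, so 0 can be treated as failure. The wrapper name is hypothetical.

static sector_t resume_block_of(int swap_type, pgoff_t swap_offset)
{
	sector_t block = swapdev_block(swap_type, swap_offset);

	if (!block)
		printk(KERN_WARNING "swap type %d unusable at offset %lu\n",
		       swap_type, (unsigned long)swap_offset);
	return block;
}
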
@@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma,
932 unsigned long addr, end, next; 941 unsigned long addr, end, next;
933 int ret; 942 int ret;
934 943
935 if (page->mapping) { 944 if (page_anon_vma(page)) {
936 addr = page_address_in_vma(page, vma); 945 addr = page_address_in_vma(page, vma);
937 if (addr == -EFAULT) 946 if (addr == -EFAULT)
938 return 0; 947 return 0;
@@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
988{ 997{
989 unsigned int max = si->max; 998 unsigned int max = si->max;
990 unsigned int i = prev; 999 unsigned int i = prev;
991 int count; 1000 unsigned char count;
992 1001
993 /* 1002 /*
994 * No need for swap_lock here: we're just looking 1003 * No need for swap_lock here: we're just looking
@@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1024 */ 1033 */
1025static int try_to_unuse(unsigned int type) 1034static int try_to_unuse(unsigned int type)
1026{ 1035{
1027 struct swap_info_struct * si = &swap_info[type]; 1036 struct swap_info_struct *si = swap_info[type];
1028 struct mm_struct *start_mm; 1037 struct mm_struct *start_mm;
1029 unsigned short *swap_map; 1038 unsigned char *swap_map;
1030 unsigned short swcount; 1039 unsigned char swcount;
1031 struct page *page; 1040 struct page *page;
1032 swp_entry_t entry; 1041 swp_entry_t entry;
1033 unsigned int i = 0; 1042 unsigned int i = 0;
1034 int retval = 0; 1043 int retval = 0;
1035 int reset_overflow = 0;
1036 int shmem;
1037 1044
1038 /* 1045 /*
1039 * When searching mms for an entry, a good strategy is to 1046 * When searching mms for an entry, a good strategy is to
@@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type)
1047 * together, child after parent. If we race with dup_mmap(), we 1054 * together, child after parent. If we race with dup_mmap(), we
1048 * prefer to resolve parent before child, lest we miss entries 1055 * prefer to resolve parent before child, lest we miss entries
1049 * duplicated after we scanned child: using last mm would invert 1056 * duplicated after we scanned child: using last mm would invert
1050 * that. Though it's only a serious concern when an overflowed 1057 * that.
1051 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
1052 */ 1058 */
1053 start_mm = &init_mm; 1059 start_mm = &init_mm;
1054 atomic_inc(&init_mm.mm_users); 1060 atomic_inc(&init_mm.mm_users);
@@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type)
1110 1116
1111 /* 1117 /*
1112 * Remove all references to entry. 1118 * Remove all references to entry.
1113 * Whenever we reach init_mm, there's no address space
1114 * to search, but use it as a reminder to search shmem.
1115 */ 1119 */
1116 shmem = 0;
1117 swcount = *swap_map; 1120 swcount = *swap_map;
1118 if (swap_count(swcount)) { 1121 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1119 if (start_mm == &init_mm) 1122 retval = shmem_unuse(entry, page);
1120 shmem = shmem_unuse(entry, page); 1123 /* page has already been unlocked and released */
1121 else 1124 if (retval < 0)
1122 retval = unuse_mm(start_mm, entry, page); 1125 break;
1126 continue;
1123 } 1127 }
1128 if (swap_count(swcount) && start_mm != &init_mm)
1129 retval = unuse_mm(start_mm, entry, page);
1130
1124 if (swap_count(*swap_map)) { 1131 if (swap_count(*swap_map)) {
1125 int set_start_mm = (*swap_map >= swcount); 1132 int set_start_mm = (*swap_map >= swcount);
1126 struct list_head *p = &start_mm->mmlist; 1133 struct list_head *p = &start_mm->mmlist;
@@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type)
1131 atomic_inc(&new_start_mm->mm_users); 1138 atomic_inc(&new_start_mm->mm_users);
1132 atomic_inc(&prev_mm->mm_users); 1139 atomic_inc(&prev_mm->mm_users);
1133 spin_lock(&mmlist_lock); 1140 spin_lock(&mmlist_lock);
1134 while (swap_count(*swap_map) && !retval && !shmem && 1141 while (swap_count(*swap_map) && !retval &&
1135 (p = p->next) != &start_mm->mmlist) { 1142 (p = p->next) != &start_mm->mmlist) {
1136 mm = list_entry(p, struct mm_struct, mmlist); 1143 mm = list_entry(p, struct mm_struct, mmlist);
1137 if (!atomic_inc_not_zero(&mm->mm_users)) 1144 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1145,14 +1152,12 @@ static int try_to_unuse(unsigned int type)
1145 swcount = *swap_map; 1152 swcount = *swap_map;
1146 if (!swap_count(swcount)) /* any usage ? */ 1153 if (!swap_count(swcount)) /* any usage ? */
1147 ; 1154 ;
1148 else if (mm == &init_mm) { 1155 else if (mm == &init_mm)
1149 set_start_mm = 1; 1156 set_start_mm = 1;
1150 shmem = shmem_unuse(entry, page); 1157 else
1151 } else
1152 retval = unuse_mm(mm, entry, page); 1158 retval = unuse_mm(mm, entry, page);
1153 1159
1154 if (set_start_mm && 1160 if (set_start_mm && *swap_map < swcount) {
1155 swap_count(*swap_map) < swcount) {
1156 mmput(new_start_mm); 1161 mmput(new_start_mm);
1157 atomic_inc(&mm->mm_users); 1162 atomic_inc(&mm->mm_users);
1158 new_start_mm = mm; 1163 new_start_mm = mm;
@@ -1165,13 +1170,6 @@ static int try_to_unuse(unsigned int type)
1165 mmput(start_mm); 1170 mmput(start_mm);
1166 start_mm = new_start_mm; 1171 start_mm = new_start_mm;
1167 } 1172 }
1168 if (shmem) {
1169 /* page has already been unlocked and released */
1170 if (shmem > 0)
1171 continue;
1172 retval = shmem;
1173 break;
1174 }
1175 if (retval) { 1173 if (retval) {
1176 unlock_page(page); 1174 unlock_page(page);
1177 page_cache_release(page); 1175 page_cache_release(page);
@@ -1179,30 +1177,6 @@ static int try_to_unuse(unsigned int type)
1179 } 1177 }
1180 1178
1181 /* 1179 /*
1182 * How could swap count reach 0x7ffe ?
1183 * There's no way to repeat a swap page within an mm
1184 * (except in shmem, where it's the shared object which takes
1185 * the reference count)?
1186 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
1187 * short is too small....)
1188 * If that's wrong, then we should worry more about
1189 * exit_mmap() and do_munmap() cases described above:
1190 * we might be resetting SWAP_MAP_MAX too early here.
1191 * We know "Undead"s can happen, they're okay, so don't
1192 * report them; but do report if we reset SWAP_MAP_MAX.
1193 */
1194 /* We might release the lock_page() in unuse_mm(). */
1195 if (!PageSwapCache(page) || page_private(page) != entry.val)
1196 goto retry;
1197
1198 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1199 spin_lock(&swap_lock);
1200 *swap_map = encode_swapmap(0, true);
1201 spin_unlock(&swap_lock);
1202 reset_overflow = 1;
1203 }
1204
1205 /*
1206 * If a reference remains (rare), we would like to leave 1180 * If a reference remains (rare), we would like to leave
1207 * the page in the swap cache; but try_to_unmap could 1181 * the page in the swap cache; but try_to_unmap could
1208 * then re-duplicate the entry once we drop page lock, 1182 * then re-duplicate the entry once we drop page lock,
@@ -1214,6 +1188,12 @@ static int try_to_unuse(unsigned int type)
1214 * read from disk into another page. Splitting into two 1188 * read from disk into another page. Splitting into two
1215 * pages would be incorrect if swap supported "shared 1189 * pages would be incorrect if swap supported "shared
1216 * private" pages, but they are handled by tmpfs files. 1190 * private" pages, but they are handled by tmpfs files.
1191 *
1192 * Given how unuse_vma() targets one particular offset
1193 * in an anon_vma, once the anon_vma has been determined,
1194 * this splitting happens to be just what is needed to
1195 * handle where KSM pages have been swapped out: re-reading
1196 * is unnecessarily slow, but we can fix that later on.
1217 */ 1197 */
1218 if (swap_count(*swap_map) && 1198 if (swap_count(*swap_map) &&
1219 PageDirty(page) && PageSwapCache(page)) { 1199 PageDirty(page) && PageSwapCache(page)) {
@@ -1243,7 +1223,6 @@ static int try_to_unuse(unsigned int type)
1243 * mark page dirty so shrink_page_list will preserve it. 1223 * mark page dirty so shrink_page_list will preserve it.
1244 */ 1224 */
1245 SetPageDirty(page); 1225 SetPageDirty(page);
1246retry:
1247 unlock_page(page); 1226 unlock_page(page);
1248 page_cache_release(page); 1227 page_cache_release(page);
1249 1228
@@ -1255,10 +1234,6 @@ retry:
1255 } 1234 }
1256 1235
1257 mmput(start_mm); 1236 mmput(start_mm);
1258 if (reset_overflow) {
1259 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
1260 swap_overflow = 0;
1261 }
1262 return retval; 1237 return retval;
1263} 1238}
1264 1239
@@ -1271,10 +1246,10 @@ retry:
1271static void drain_mmlist(void) 1246static void drain_mmlist(void)
1272{ 1247{
1273 struct list_head *p, *next; 1248 struct list_head *p, *next;
1274 unsigned int i; 1249 unsigned int type;
1275 1250
1276 for (i = 0; i < nr_swapfiles; i++) 1251 for (type = 0; type < nr_swapfiles; type++)
1277 if (swap_info[i].inuse_pages) 1252 if (swap_info[type]->inuse_pages)
1278 return; 1253 return;
1279 spin_lock(&mmlist_lock); 1254 spin_lock(&mmlist_lock);
1280 list_for_each_safe(p, next, &init_mm.mmlist) 1255 list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1284,12 +1259,23 @@ static void drain_mmlist(void)
1284 1259
1285/* 1260/*
1286 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 1261 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1287 * corresponds to page offset `offset'. 1262 * corresponds to page offset for the specified swap entry.
1263 * Note that the type of this function is sector_t, but it returns page offset
1264 * into the bdev, not sector offset.
1288 */ 1265 */
1289sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 1266static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1290{ 1267{
1291 struct swap_extent *se = sis->curr_swap_extent; 1268 struct swap_info_struct *sis;
1292 struct swap_extent *start_se = se; 1269 struct swap_extent *start_se;
1270 struct swap_extent *se;
1271 pgoff_t offset;
1272
1273 sis = swap_info[swp_type(entry)];
1274 *bdev = sis->bdev;
1275
1276 offset = swp_offset(entry);
1277 start_se = sis->curr_swap_extent;
1278 se = start_se;
1293 1279
1294 for ( ; ; ) { 1280 for ( ; ; ) {
1295 struct list_head *lh; 1281 struct list_head *lh;
@@ -1299,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
1299 return se->start_block + (offset - se->start_page); 1285 return se->start_block + (offset - se->start_page);
1300 } 1286 }
1301 lh = se->list.next; 1287 lh = se->list.next;
1302 if (lh == &sis->extent_list)
1303 lh = lh->next;
1304 se = list_entry(lh, struct swap_extent, list); 1288 se = list_entry(lh, struct swap_extent, list);
1305 sis->curr_swap_extent = se; 1289 sis->curr_swap_extent = se;
1306 BUG_ON(se == start_se); /* It *must* be present */ 1290 BUG_ON(se == start_se); /* It *must* be present */
1307 } 1291 }
1308} 1292}
1309 1293
1310#ifdef CONFIG_HIBERNATION
1311/* 1294/*
1312 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1295 * Returns the page offset into bdev for the specified page's swap entry.
1313 * corresponding to given index in swap_info (swap type).
1314 */ 1296 */
1315sector_t swapdev_block(int swap_type, pgoff_t offset) 1297sector_t map_swap_page(struct page *page, struct block_device **bdev)
1316{ 1298{
1317 struct swap_info_struct *sis; 1299 swp_entry_t entry;
1318 1300 entry.val = page_private(page);
1319 if (swap_type >= nr_swapfiles) 1301 return map_swap_entry(entry, bdev);
1320 return 0;
1321
1322 sis = swap_info + swap_type;
1323 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
1324} 1302}
1325#endif /* CONFIG_HIBERNATION */
1326 1303
1327/* 1304/*
1328 * Free all of a swapdev's extent information 1305 * Free all of a swapdev's extent information
1329 */ 1306 */
1330static void destroy_swap_extents(struct swap_info_struct *sis) 1307static void destroy_swap_extents(struct swap_info_struct *sis)
1331{ 1308{
1332 while (!list_empty(&sis->extent_list)) { 1309 while (!list_empty(&sis->first_swap_extent.list)) {
1333 struct swap_extent *se; 1310 struct swap_extent *se;
1334 1311
1335 se = list_entry(sis->extent_list.next, 1312 se = list_entry(sis->first_swap_extent.list.next,
1336 struct swap_extent, list); 1313 struct swap_extent, list);
1337 list_del(&se->list); 1314 list_del(&se->list);
1338 kfree(se); 1315 kfree(se);
@@ -1353,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1353 struct swap_extent *new_se; 1330 struct swap_extent *new_se;
1354 struct list_head *lh; 1331 struct list_head *lh;
1355 1332
1356 lh = sis->extent_list.prev; /* The highest page extent */ 1333 if (start_page == 0) {
1357 if (lh != &sis->extent_list) { 1334 se = &sis->first_swap_extent;
1335 sis->curr_swap_extent = se;
1336 se->start_page = 0;
1337 se->nr_pages = nr_pages;
1338 se->start_block = start_block;
1339 return 1;
1340 } else {
1341 lh = sis->first_swap_extent.list.prev; /* Highest extent */
1358 se = list_entry(lh, struct swap_extent, list); 1342 se = list_entry(lh, struct swap_extent, list);
1359 BUG_ON(se->start_page + se->nr_pages != start_page); 1343 BUG_ON(se->start_page + se->nr_pages != start_page);
1360 if (se->start_block + se->nr_pages == start_block) { 1344 if (se->start_block + se->nr_pages == start_block) {
@@ -1374,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1374 new_se->nr_pages = nr_pages; 1358 new_se->nr_pages = nr_pages;
1375 new_se->start_block = start_block; 1359 new_se->start_block = start_block;
1376 1360
1377 list_add_tail(&new_se->list, &sis->extent_list); 1361 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1378 return 1; 1362 return 1;
1379} 1363}
1380 1364
@@ -1426,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1426 if (S_ISBLK(inode->i_mode)) { 1410 if (S_ISBLK(inode->i_mode)) {
1427 ret = add_swap_extent(sis, 0, sis->max, 0); 1411 ret = add_swap_extent(sis, 0, sis->max, 0);
1428 *span = sis->pages; 1412 *span = sis->pages;
1429 goto done; 1413 goto out;
1430 } 1414 }
1431 1415
1432 blkbits = inode->i_blkbits; 1416 blkbits = inode->i_blkbits;
@@ -1497,25 +1481,22 @@ reprobe:
1497 sis->max = page_no; 1481 sis->max = page_no;
1498 sis->pages = page_no - 1; 1482 sis->pages = page_no - 1;
1499 sis->highest_bit = page_no - 1; 1483 sis->highest_bit = page_no - 1;
1500done: 1484out:
1501 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1485 return ret;
1502 struct swap_extent, list);
1503 goto out;
1504bad_bmap: 1486bad_bmap:
1505 printk(KERN_ERR "swapon: swapfile has holes\n"); 1487 printk(KERN_ERR "swapon: swapfile has holes\n");
1506 ret = -EINVAL; 1488 ret = -EINVAL;
1507out: 1489 goto out;
1508 return ret;
1509} 1490}
1510 1491
1511SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1492SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1512{ 1493{
1513 struct swap_info_struct * p = NULL; 1494 struct swap_info_struct *p = NULL;
1514 unsigned short *swap_map; 1495 unsigned char *swap_map;
1515 struct file *swap_file, *victim; 1496 struct file *swap_file, *victim;
1516 struct address_space *mapping; 1497 struct address_space *mapping;
1517 struct inode *inode; 1498 struct inode *inode;
1518 char * pathname; 1499 char *pathname;
1519 int i, type, prev; 1500 int i, type, prev;
1520 int err; 1501 int err;
1521 1502
@@ -1536,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1536 mapping = victim->f_mapping; 1517 mapping = victim->f_mapping;
1537 prev = -1; 1518 prev = -1;
1538 spin_lock(&swap_lock); 1519 spin_lock(&swap_lock);
1539 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1520 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1540 p = swap_info + type; 1521 p = swap_info[type];
1541 if (p->flags & SWP_WRITEOK) { 1522 if (p->flags & SWP_WRITEOK) {
1542 if (p->swap_file->f_mapping == mapping) 1523 if (p->swap_file->f_mapping == mapping)
1543 break; 1524 break;
@@ -1556,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1556 spin_unlock(&swap_lock); 1537 spin_unlock(&swap_lock);
1557 goto out_dput; 1538 goto out_dput;
1558 } 1539 }
1559 if (prev < 0) { 1540 if (prev < 0)
1560 swap_list.head = p->next; 1541 swap_list.head = p->next;
1561 } else { 1542 else
1562 swap_info[prev].next = p->next; 1543 swap_info[prev]->next = p->next;
1563 }
1564 if (type == swap_list.next) { 1544 if (type == swap_list.next) {
1565 /* just pick something that's safe... */ 1545 /* just pick something that's safe... */
1566 swap_list.next = swap_list.head; 1546 swap_list.next = swap_list.head;
1567 } 1547 }
1568 if (p->prio < 0) { 1548 if (p->prio < 0) {
1569 for (i = p->next; i >= 0; i = swap_info[i].next) 1549 for (i = p->next; i >= 0; i = swap_info[i]->next)
1570 swap_info[i].prio = p->prio--; 1550 swap_info[i]->prio = p->prio--;
1571 least_priority++; 1551 least_priority++;
1572 } 1552 }
1573 nr_swap_pages -= p->pages; 1553 nr_swap_pages -= p->pages;
@@ -1585,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1585 if (p->prio < 0) 1565 if (p->prio < 0)
1586 p->prio = --least_priority; 1566 p->prio = --least_priority;
1587 prev = -1; 1567 prev = -1;
1588 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1568 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1589 if (p->prio >= swap_info[i].prio) 1569 if (p->prio >= swap_info[i]->prio)
1590 break; 1570 break;
1591 prev = i; 1571 prev = i;
1592 } 1572 }
1593 p->next = i; 1573 p->next = i;
1594 if (prev < 0) 1574 if (prev < 0)
1595 swap_list.head = swap_list.next = p - swap_info; 1575 swap_list.head = swap_list.next = type;
1596 else 1576 else
1597 swap_info[prev].next = p - swap_info; 1577 swap_info[prev]->next = type;
1598 nr_swap_pages += p->pages; 1578 nr_swap_pages += p->pages;
1599 total_swap_pages += p->pages; 1579 total_swap_pages += p->pages;
1600 p->flags |= SWP_WRITEOK; 1580 p->flags |= SWP_WRITEOK;
@@ -1607,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1607 up_write(&swap_unplug_sem); 1587 up_write(&swap_unplug_sem);
1608 1588
1609 destroy_swap_extents(p); 1589 destroy_swap_extents(p);
1590 if (p->flags & SWP_CONTINUED)
1591 free_swap_count_continuations(p);
1592
1610 mutex_lock(&swapon_mutex); 1593 mutex_lock(&swapon_mutex);
1611 spin_lock(&swap_lock); 1594 spin_lock(&swap_lock);
1612 drain_mmlist(); 1595 drain_mmlist();
@@ -1654,8 +1637,8 @@ out:
1654/* iterator */ 1637/* iterator */
1655static void *swap_start(struct seq_file *swap, loff_t *pos) 1638static void *swap_start(struct seq_file *swap, loff_t *pos)
1656{ 1639{
1657 struct swap_info_struct *ptr = swap_info; 1640 struct swap_info_struct *si;
1658 int i; 1641 int type;
1659 loff_t l = *pos; 1642 loff_t l = *pos;
1660 1643
1661 mutex_lock(&swapon_mutex); 1644 mutex_lock(&swapon_mutex);
@@ -1663,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1663 if (!l) 1646 if (!l)
1664 return SEQ_START_TOKEN; 1647 return SEQ_START_TOKEN;
1665 1648
1666 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1649 for (type = 0; type < nr_swapfiles; type++) {
1667 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1650 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1651 si = swap_info[type];
1652 if (!(si->flags & SWP_USED) || !si->swap_map)
1668 continue; 1653 continue;
1669 if (!--l) 1654 if (!--l)
1670 return ptr; 1655 return si;
1671 } 1656 }
1672 1657
1673 return NULL; 1658 return NULL;
@@ -1675,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1675 1660
1676static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1661static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1677{ 1662{
1678 struct swap_info_struct *ptr; 1663 struct swap_info_struct *si = v;
1679 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1664 int type;
1680 1665
1681 if (v == SEQ_START_TOKEN) 1666 if (v == SEQ_START_TOKEN)
1682 ptr = swap_info; 1667 type = 0;
1683 else { 1668 else
1684 ptr = v; 1669 type = si->type + 1;
1685 ptr++;
1686 }
1687 1670
1688 for (; ptr < endptr; ptr++) { 1671 for (; type < nr_swapfiles; type++) {
1689 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1672 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1673 si = swap_info[type];
1674 if (!(si->flags & SWP_USED) || !si->swap_map)
1690 continue; 1675 continue;
1691 ++*pos; 1676 ++*pos;
1692 return ptr; 1677 return si;
1693 } 1678 }
1694 1679
1695 return NULL; 1680 return NULL;
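The smp_rmb() calls added above pair with an smp_wmb() in the swapon path further down: swap_info[type] is filled in and published before nr_swapfiles is raised, and the procfs iterators load nr_swapfiles before dereferencing the slot. A minimal userspace model of that publish pattern, using C11 fences as stand-ins for the kernel barriers; the names and types are illustrative, not the kernel API.

#include <stdatomic.h>
#include <stddef.h>

#define MAX_SLOTS 32

struct info { int type; };

static struct info *slot[MAX_SLOTS];
static _Atomic int nr_slots;

static void publish(struct info *p)             /* writer, serialized by a lock */
{
        int n = atomic_load_explicit(&nr_slots, memory_order_relaxed);

        p->type = n;
        slot[n] = p;
        /* order the slot store before the counter store, like smp_wmb() */
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&nr_slots, n + 1, memory_order_relaxed);
}

static struct info *lookup(int type)            /* reader, e.g. a /proc iterator */
{
        if (type >= atomic_load_explicit(&nr_slots, memory_order_relaxed))
                return NULL;
        /* order the counter load before the slot load, like smp_rmb() */
        atomic_thread_fence(memory_order_acquire);
        return slot[type];
}

int main(void)
{
        static struct info si;

        publish(&si);
        return lookup(0) == &si ? 0 : 1;
}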
@@ -1702,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v)
1702 1687
1703static int swap_show(struct seq_file *swap, void *v) 1688static int swap_show(struct seq_file *swap, void *v)
1704{ 1689{
1705 struct swap_info_struct *ptr = v; 1690 struct swap_info_struct *si = v;
1706 struct file *file; 1691 struct file *file;
1707 int len; 1692 int len;
1708 1693
1709 if (ptr == SEQ_START_TOKEN) { 1694 if (si == SEQ_START_TOKEN) {
1710 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1695 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1711 return 0; 1696 return 0;
1712 } 1697 }
1713 1698
1714 file = ptr->swap_file; 1699 file = si->swap_file;
1715 len = seq_path(swap, &file->f_path, " \t\n\\"); 1700 len = seq_path(swap, &file->f_path, " \t\n\\");
1716 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1701 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1717 len < 40 ? 40 - len : 1, " ", 1702 len < 40 ? 40 - len : 1, " ",
1718 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1703 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1719 "partition" : "file\t", 1704 "partition" : "file\t",
1720 ptr->pages << (PAGE_SHIFT - 10), 1705 si->pages << (PAGE_SHIFT - 10),
1721 ptr->inuse_pages << (PAGE_SHIFT - 10), 1706 si->inuse_pages << (PAGE_SHIFT - 10),
1722 ptr->prio); 1707 si->prio);
1723 return 0; 1708 return 0;
1724} 1709}
1725 1710
@@ -1766,7 +1751,7 @@ late_initcall(max_swapfiles_check);
1766 */ 1751 */
1767SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1752SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1768{ 1753{
1769 struct swap_info_struct * p; 1754 struct swap_info_struct *p;
1770 char *name = NULL; 1755 char *name = NULL;
1771 struct block_device *bdev = NULL; 1756 struct block_device *bdev = NULL;
1772 struct file *swap_file = NULL; 1757 struct file *swap_file = NULL;
@@ -1780,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1780 sector_t span; 1765 sector_t span;
1781 unsigned long maxpages = 1; 1766 unsigned long maxpages = 1;
1782 unsigned long swapfilepages; 1767 unsigned long swapfilepages;
1783 unsigned short *swap_map = NULL; 1768 unsigned char *swap_map = NULL;
1784 struct page *page = NULL; 1769 struct page *page = NULL;
1785 struct inode *inode = NULL; 1770 struct inode *inode = NULL;
1786 int did_down = 0; 1771 int did_down = 0;
1787 1772
1788 if (!capable(CAP_SYS_ADMIN)) 1773 if (!capable(CAP_SYS_ADMIN))
1789 return -EPERM; 1774 return -EPERM;
1775
1776 p = kzalloc(sizeof(*p), GFP_KERNEL);
1777 if (!p)
1778 return -ENOMEM;
1779
1790 spin_lock(&swap_lock); 1780 spin_lock(&swap_lock);
1791 p = swap_info; 1781 for (type = 0; type < nr_swapfiles; type++) {
1792 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1782 if (!(swap_info[type]->flags & SWP_USED))
1793 if (!(p->flags & SWP_USED))
1794 break; 1783 break;
1784 }
1795 error = -EPERM; 1785 error = -EPERM;
1796 if (type >= MAX_SWAPFILES) { 1786 if (type >= MAX_SWAPFILES) {
1797 spin_unlock(&swap_lock); 1787 spin_unlock(&swap_lock);
1788 kfree(p);
1798 goto out; 1789 goto out;
1799 } 1790 }
1800 if (type >= nr_swapfiles) 1791 if (type >= nr_swapfiles) {
1801 nr_swapfiles = type+1; 1792 p->type = type;
1802 memset(p, 0, sizeof(*p)); 1793 swap_info[type] = p;
1803 INIT_LIST_HEAD(&p->extent_list); 1794 /*
1795 * Write swap_info[type] before nr_swapfiles, in case a
1796 * racing procfs swap_start() or swap_next() is reading them.
1797 * (We never shrink nr_swapfiles, we never free this entry.)
1798 */
1799 smp_wmb();
1800 nr_swapfiles++;
1801 } else {
1802 kfree(p);
1803 p = swap_info[type];
1804 /*
1805 * Do not memset this entry: a racing procfs swap_next()
1806 * would be relying on p->type to remain valid.
1807 */
1808 }
1809 INIT_LIST_HEAD(&p->first_swap_extent.list);
1804 p->flags = SWP_USED; 1810 p->flags = SWP_USED;
1805 p->next = -1; 1811 p->next = -1;
1806 spin_unlock(&swap_lock); 1812 spin_unlock(&swap_lock);
1813
1807 name = getname(specialfile); 1814 name = getname(specialfile);
1808 error = PTR_ERR(name); 1815 error = PTR_ERR(name);
1809 if (IS_ERR(name)) { 1816 if (IS_ERR(name)) {
@@ -1823,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1823 1830
1824 error = -EBUSY; 1831 error = -EBUSY;
1825 for (i = 0; i < nr_swapfiles; i++) { 1832 for (i = 0; i < nr_swapfiles; i++) {
1826 struct swap_info_struct *q = &swap_info[i]; 1833 struct swap_info_struct *q = swap_info[i];
1827 1834
1828 if (i == type || !q->swap_file) 1835 if (i == type || !q->swap_file)
1829 continue; 1836 continue;
@@ -1898,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1898 1905
1899 p->lowest_bit = 1; 1906 p->lowest_bit = 1;
1900 p->cluster_next = 1; 1907 p->cluster_next = 1;
1908 p->cluster_nr = 0;
1901 1909
1902 /* 1910 /*
1903 * Find out how many pages are allowed for a single swap 1911 * Find out how many pages are allowed for a single swap
@@ -1933,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1933 goto bad_swap; 1941 goto bad_swap;
1934 1942
1935 /* OK, set up the swap map and apply the bad block list */ 1943 /* OK, set up the swap map and apply the bad block list */
1936 swap_map = vmalloc(maxpages * sizeof(short)); 1944 swap_map = vmalloc(maxpages);
1937 if (!swap_map) { 1945 if (!swap_map) {
1938 error = -ENOMEM; 1946 error = -ENOMEM;
1939 goto bad_swap; 1947 goto bad_swap;
1940 } 1948 }
1941 1949
1942 memset(swap_map, 0, maxpages * sizeof(short)); 1950 memset(swap_map, 0, maxpages);
1943 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1951 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1944 int page_nr = swap_header->info.badpages[i]; 1952 int page_nr = swap_header->info.badpages[i];
1945 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1953 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -1974,12 +1982,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1974 goto bad_swap; 1982 goto bad_swap;
1975 } 1983 }
1976 1984
1977 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1985 if (p->bdev) {
1978 p->flags |= SWP_SOLIDSTATE; 1986 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1979 p->cluster_next = 1 + (random32() % p->highest_bit); 1987 p->flags |= SWP_SOLIDSTATE;
1988 p->cluster_next = 1 + (random32() % p->highest_bit);
1989 }
1990 if (discard_swap(p) == 0)
1991 p->flags |= SWP_DISCARDABLE;
1980 } 1992 }
1981 if (discard_swap(p) == 0)
1982 p->flags |= SWP_DISCARDABLE;
1983 1993
1984 mutex_lock(&swapon_mutex); 1994 mutex_lock(&swapon_mutex);
1985 spin_lock(&swap_lock); 1995 spin_lock(&swap_lock);
@@ -2002,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2002 2012
2003 /* insert swap space into swap_list: */ 2013 /* insert swap space into swap_list: */
2004 prev = -1; 2014 prev = -1;
2005 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 2015 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2006 if (p->prio >= swap_info[i].prio) { 2016 if (p->prio >= swap_info[i]->prio)
2007 break; 2017 break;
2008 }
2009 prev = i; 2018 prev = i;
2010 } 2019 }
2011 p->next = i; 2020 p->next = i;
2012 if (prev < 0) { 2021 if (prev < 0)
2013 swap_list.head = swap_list.next = p - swap_info; 2022 swap_list.head = swap_list.next = type;
2014 } else { 2023 else
2015 swap_info[prev].next = p - swap_info; 2024 swap_info[prev]->next = type;
2016 }
2017 spin_unlock(&swap_lock); 2025 spin_unlock(&swap_lock);
2018 mutex_unlock(&swapon_mutex); 2026 mutex_unlock(&swapon_mutex);
2019 error = 0; 2027 error = 0;
@@ -2050,15 +2058,15 @@ out:
2050 2058
2051void si_swapinfo(struct sysinfo *val) 2059void si_swapinfo(struct sysinfo *val)
2052{ 2060{
2053 unsigned int i; 2061 unsigned int type;
2054 unsigned long nr_to_be_unused = 0; 2062 unsigned long nr_to_be_unused = 0;
2055 2063
2056 spin_lock(&swap_lock); 2064 spin_lock(&swap_lock);
2057 for (i = 0; i < nr_swapfiles; i++) { 2065 for (type = 0; type < nr_swapfiles; type++) {
2058 if (!(swap_info[i].flags & SWP_USED) || 2066 struct swap_info_struct *si = swap_info[type];
2059 (swap_info[i].flags & SWP_WRITEOK)) 2067
2060 continue; 2068 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2061 nr_to_be_unused += swap_info[i].inuse_pages; 2069 nr_to_be_unused += si->inuse_pages;
2062 } 2070 }
2063 val->freeswap = nr_swap_pages + nr_to_be_unused; 2071 val->freeswap = nr_swap_pages + nr_to_be_unused;
2064 val->totalswap = total_swap_pages + nr_to_be_unused; 2072 val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2068,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val)
2068/* 2076/*
2069 * Verify that a swap entry is valid and increment its swap map count. 2077 * Verify that a swap entry is valid and increment its swap map count.
2070 * 2078 *
2071 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
2072 * "permanent", but will be reclaimed by the next swapoff.
2073 * Returns error code in following case. 2079 * Returns error code in following case.
2074 * - success -> 0 2080 * - success -> 0
2075 * - swp_entry is invalid -> EINVAL 2081 * - swp_entry is invalid -> EINVAL
2076 * - swp_entry is migration entry -> EINVAL 2082 * - swp_entry is migration entry -> EINVAL
2077 * - swap-cache reference is requested but there is already one. -> EEXIST 2083 * - swap-cache reference is requested but there is already one. -> EEXIST
2078 * - swap-cache reference is requested but the entry is not used. -> ENOENT 2084 * - swap-cache reference is requested but the entry is not used. -> ENOENT
2085 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
2079 */ 2086 */
2080static int __swap_duplicate(swp_entry_t entry, bool cache) 2087static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2081{ 2088{
2082 struct swap_info_struct * p; 2089 struct swap_info_struct *p;
2083 unsigned long offset, type; 2090 unsigned long offset, type;
2084 int result = -EINVAL; 2091 unsigned char count;
2085 int count; 2092 unsigned char has_cache;
2086 bool has_cache; 2093 int err = -EINVAL;
2087 2094
2088 if (non_swap_entry(entry)) 2095 if (non_swap_entry(entry))
2089 return -EINVAL; 2096 goto out;
2090 2097
2091 type = swp_type(entry); 2098 type = swp_type(entry);
2092 if (type >= nr_swapfiles) 2099 if (type >= nr_swapfiles)
2093 goto bad_file; 2100 goto bad_file;
2094 p = type + swap_info; 2101 p = swap_info[type];
2095 offset = swp_offset(entry); 2102 offset = swp_offset(entry);
2096 2103
2097 spin_lock(&swap_lock); 2104 spin_lock(&swap_lock);
2098
2099 if (unlikely(offset >= p->max)) 2105 if (unlikely(offset >= p->max))
2100 goto unlock_out; 2106 goto unlock_out;
2101 2107
2102 count = swap_count(p->swap_map[offset]); 2108 count = p->swap_map[offset];
2103 has_cache = swap_has_cache(p->swap_map[offset]); 2109 has_cache = count & SWAP_HAS_CACHE;
2110 count &= ~SWAP_HAS_CACHE;
2111 err = 0;
2104 2112
2105 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ 2113 if (usage == SWAP_HAS_CACHE) {
2106 2114
2107 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 2115 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2108 if (!has_cache && count) { 2116 if (!has_cache && count)
2109 p->swap_map[offset] = encode_swapmap(count, true); 2117 has_cache = SWAP_HAS_CACHE;
2110 result = 0; 2118 else if (has_cache) /* someone else added cache */
2111 } else if (has_cache) /* someone added cache */ 2119 err = -EEXIST;
2112 result = -EEXIST; 2120 else /* no users remaining */
2113 else if (!count) /* no users */ 2121 err = -ENOENT;
2114 result = -ENOENT;
2115 2122
2116 } else if (count || has_cache) { 2123 } else if (count || has_cache) {
2117 if (count < SWAP_MAP_MAX - 1) { 2124
2118 p->swap_map[offset] = encode_swapmap(count + 1, 2125 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2119 has_cache); 2126 count += usage;
2120 result = 0; 2127 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2121 } else if (count <= SWAP_MAP_MAX) { 2128 err = -EINVAL;
2122 if (swap_overflow++ < 5) 2129 else if (swap_count_continued(p, offset, count))
2123 printk(KERN_WARNING 2130 count = COUNT_CONTINUED;
2124 "swap_dup: swap entry overflow\n"); 2131 else
2125 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, 2132 err = -ENOMEM;
2126 has_cache);
2127 result = 0;
2128 }
2129 } else 2133 } else
2130 result = -ENOENT; /* unused swap entry */ 2134 err = -ENOENT; /* unused swap entry */
2135
2136 p->swap_map[offset] = count | has_cache;
2137
2131unlock_out: 2138unlock_out:
2132 spin_unlock(&swap_lock); 2139 spin_unlock(&swap_lock);
2133out: 2140out:
2134 return result; 2141 return err;
2135 2142
2136bad_file: 2143bad_file:
2137 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2144 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2138 goto out; 2145 goto out;
2139} 2146}
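The rewritten __swap_duplicate() above treats each swap_map entry as a single byte: the reference count sits in the low bits, SWAP_HAS_CACHE marks a swapcache reference, and COUNT_CONTINUED says the count overflows onto a continuation page. A small userspace sketch of that packing; the numeric flag values below are assumptions for illustration, not taken from this diff.

#include <stdio.h>

#define SWAP_HAS_CACHE  0x40    /* assumed: entry also has a swapcache page */
#define COUNT_CONTINUED 0x80    /* assumed: count continues on another page */

int main(void)
{
        unsigned char map = 3 | SWAP_HAS_CACHE; /* three pte references + cache */
        unsigned char count = map & ~SWAP_HAS_CACHE;

        printf("references=%d in-cache=%s continued=%s\n",
               count & ~COUNT_CONTINUED,
               (map & SWAP_HAS_CACHE) ? "yes" : "no",
               (count & COUNT_CONTINUED) ? "yes" : "no");
        return 0;
}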
2147
2148/*
2149 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
2150 * (in which case its reference count is never incremented).
2151 */
2152void swap_shmem_alloc(swp_entry_t entry)
2153{
2154 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2155}
2156
2140/* 2157/*
2141 * increase reference count of swap entry by 1. 2158 * increase reference count of swap entry by 1.
2142 */ 2159 */
2143void swap_duplicate(swp_entry_t entry) 2160int swap_duplicate(swp_entry_t entry)
2144{ 2161{
2145 __swap_duplicate(entry, SWAP_MAP); 2162 int err = 0;
2163
2164 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2165 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2166 return err;
2146} 2167}
2147 2168
2148/* 2169/*
2149 * @entry: swap entry for which we allocate swap cache. 2170 * @entry: swap entry for which we allocate swap cache.
2150 * 2171 *
2151 * Called when allocating swap cache for exising swap entry, 2172 * Called when allocating swap cache for existing swap entry,
2152 * This can return error codes. Returns 0 at success. 2173 * This can return error codes. Returns 0 at success.
2153 * -EBUSY means there is a swap cache. 2174 * -EBUSY means there is a swap cache.
2154 * Note: return code is different from swap_duplicate(). 2175 * Note: return code is different from swap_duplicate().
2155 */ 2176 */
2156int swapcache_prepare(swp_entry_t entry) 2177int swapcache_prepare(swp_entry_t entry)
2157{ 2178{
2158 return __swap_duplicate(entry, SWAP_CACHE); 2179 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2159}
2160
2161
2162struct swap_info_struct *
2163get_swap_info_struct(unsigned type)
2164{
2165 return &swap_info[type];
2166} 2180}
2167 2181
2168/* 2182/*
@@ -2180,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2180 if (!our_page_cluster) /* no readahead */ 2194 if (!our_page_cluster) /* no readahead */
2181 return 0; 2195 return 0;
2182 2196
2183 si = &swap_info[swp_type(entry)]; 2197 si = swap_info[swp_type(entry)];
2184 target = swp_offset(entry); 2198 target = swp_offset(entry);
2185 base = (target >> our_page_cluster) << our_page_cluster; 2199 base = (target >> our_page_cluster) << our_page_cluster;
2186 end = base + (1 << our_page_cluster); 2200 end = base + (1 << our_page_cluster);
@@ -2216,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2216 *offset = ++toff; 2230 *offset = ++toff;
2217 return nr_pages? ++nr_pages: 0; 2231 return nr_pages? ++nr_pages: 0;
2218} 2232}
2233
2234/*
2235 * add_swap_count_continuation - called when a swap count is duplicated
2236 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2237 * page of the original vmalloc'ed swap_map, to hold the continuation count
2238 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
2239 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2240 *
2241 * These continuation pages are seldom referenced: the common paths all work
2242 * on the original swap_map, only referring to a continuation page when the
2243 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2244 *
2245 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2246 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2247 * can be called after dropping locks.
2248 */
2249int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2250{
2251 struct swap_info_struct *si;
2252 struct page *head;
2253 struct page *page;
2254 struct page *list_page;
2255 pgoff_t offset;
2256 unsigned char count;
2257
2258 /*
2259 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2260 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2261 */
2262 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2263
2264 si = swap_info_get(entry);
2265 if (!si) {
2266 /*
2267 * An acceptable race has occurred since the failing
2268 * __swap_duplicate(): the swap entry has been freed,
2269 * perhaps even the whole swap_map cleared for swapoff.
2270 */
2271 goto outer;
2272 }
2273
2274 offset = swp_offset(entry);
2275 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2276
2277 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2278 /*
2279 * The higher the swap count, the more likely it is that tasks
2280 * will race to add swap count continuation: we need to avoid
2281 * over-provisioning.
2282 */
2283 goto out;
2284 }
2285
2286 if (!page) {
2287 spin_unlock(&swap_lock);
2288 return -ENOMEM;
2289 }
2290
2291 /*
2292 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2293 * no architecture is using highmem pages for kernel pagetables: so it
2294 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2295 */
2296 head = vmalloc_to_page(si->swap_map + offset);
2297 offset &= ~PAGE_MASK;
2298
2299 /*
2300 * Page allocation does not initialize the page's lru field,
2301 * but it does always reset its private field.
2302 */
2303 if (!page_private(head)) {
2304 BUG_ON(count & COUNT_CONTINUED);
2305 INIT_LIST_HEAD(&head->lru);
2306 set_page_private(head, SWP_CONTINUED);
2307 si->flags |= SWP_CONTINUED;
2308 }
2309
2310 list_for_each_entry(list_page, &head->lru, lru) {
2311 unsigned char *map;
2312
2313 /*
2314 * If the previous map said no continuation, but we've found
2315 * a continuation page, free our allocation and use this one.
2316 */
2317 if (!(count & COUNT_CONTINUED))
2318 goto out;
2319
2320 map = kmap_atomic(list_page, KM_USER0) + offset;
2321 count = *map;
2322 kunmap_atomic(map, KM_USER0);
2323
2324 /*
2325 * If this continuation count now has some space in it,
2326 * free our allocation and use this one.
2327 */
2328 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2329 goto out;
2330 }
2331
2332 list_add_tail(&page->lru, &head->lru);
2333 page = NULL; /* now it's attached, don't free it */
2334out:
2335 spin_unlock(&swap_lock);
2336outer:
2337 if (page)
2338 __free_page(page);
2339 return 0;
2340}
2341
2342/*
2343 * swap_count_continued - when the original swap_map count is incremented
2344 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2345 * into, carry if so, or else fail until a new continuation page is allocated;
2346 * when the original swap_map count is decremented from 0 with continuation,
2347 * borrow from the continuation and report whether it still holds more.
2348 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2349 */
2350static bool swap_count_continued(struct swap_info_struct *si,
2351 pgoff_t offset, unsigned char count)
2352{
2353 struct page *head;
2354 struct page *page;
2355 unsigned char *map;
2356
2357 head = vmalloc_to_page(si->swap_map + offset);
2358 if (page_private(head) != SWP_CONTINUED) {
2359 BUG_ON(count & COUNT_CONTINUED);
2360 return false; /* need to add count continuation */
2361 }
2362
2363 offset &= ~PAGE_MASK;
2364 page = list_entry(head->lru.next, struct page, lru);
2365 map = kmap_atomic(page, KM_USER0) + offset;
2366
2367 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2368 goto init_map; /* jump over SWAP_CONT_MAX checks */
2369
2370 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2371 /*
2372 * Think of how you add 1 to 999
2373 */
2374 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2375 kunmap_atomic(map, KM_USER0);
2376 page = list_entry(page->lru.next, struct page, lru);
2377 BUG_ON(page == head);
2378 map = kmap_atomic(page, KM_USER0) + offset;
2379 }
2380 if (*map == SWAP_CONT_MAX) {
2381 kunmap_atomic(map, KM_USER0);
2382 page = list_entry(page->lru.next, struct page, lru);
2383 if (page == head)
2384 return false; /* add count continuation */
2385 map = kmap_atomic(page, KM_USER0) + offset;
2386init_map: *map = 0; /* we didn't zero the page */
2387 }
2388 *map += 1;
2389 kunmap_atomic(map, KM_USER0);
2390 page = list_entry(page->lru.prev, struct page, lru);
2391 while (page != head) {
2392 map = kmap_atomic(page, KM_USER0) + offset;
2393 *map = COUNT_CONTINUED;
2394 kunmap_atomic(map, KM_USER0);
2395 page = list_entry(page->lru.prev, struct page, lru);
2396 }
2397 return true; /* incremented */
2398
2399 } else { /* decrementing */
2400 /*
2401 * Think of how you subtract 1 from 1000
2402 */
2403 BUG_ON(count != COUNT_CONTINUED);
2404 while (*map == COUNT_CONTINUED) {
2405 kunmap_atomic(map, KM_USER0);
2406 page = list_entry(page->lru.next, struct page, lru);
2407 BUG_ON(page == head);
2408 map = kmap_atomic(page, KM_USER0) + offset;
2409 }
2410 BUG_ON(*map == 0);
2411 *map -= 1;
2412 if (*map == 0)
2413 count = 0;
2414 kunmap_atomic(map, KM_USER0);
2415 page = list_entry(page->lru.prev, struct page, lru);
2416 while (page != head) {
2417 map = kmap_atomic(page, KM_USER0) + offset;
2418 *map = SWAP_CONT_MAX | count;
2419 count = COUNT_CONTINUED;
2420 kunmap_atomic(map, KM_USER0);
2421 page = list_entry(page->lru.prev, struct page, lru);
2422 }
2423 return count == COUNT_CONTINUED;
2424 }
2425}
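swap_count_continued() above carries and borrows between the swap_map byte and the continuation bytes much like decimal digits ("add 1 to 999"). A self-contained userspace model of that mixed-radix increment; the digit capacities and the fixed four-digit array are assumptions for illustration only.

#include <stdbool.h>
#include <stdio.h>

#define SWAP_MAP_MAX  0x3e      /* assumed capacity of the swap_map "digit" */
#define SWAP_CONT_MAX 0x7f      /* assumed capacity of a continuation "digit" */
#define NDIGITS       4

static bool count_increment(unsigned char digit[NDIGITS])
{
        int i;

        if (digit[0] < SWAP_MAP_MAX) {  /* common case: no carry needed */
                digit[0]++;
                return true;
        }
        for (i = 1; i < NDIGITS; i++)
                if (digit[i] < SWAP_CONT_MAX)
                        break;
        if (i == NDIGITS)
                return false;   /* every digit full: another page is needed */

        digit[0] = 0;           /* carry out of the swap_map "digit"... */
        for (i = 1; digit[i] == SWAP_CONT_MAX; i++)
                digit[i] = 0;   /* ...and through any full continuation digits */
        digit[i]++;
        return true;
}

int main(void)
{
        unsigned char d[NDIGITS] = { SWAP_MAP_MAX, SWAP_CONT_MAX, 5, 0 };

        printf("incremented: %s -> %d %d %d %d\n",
               count_increment(d) ? "yes" : "no", d[0], d[1], d[2], d[3]);
        return 0;
}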
2426
2427/*
2428 * free_swap_count_continuations - swapoff free all the continuation pages
2429 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2430 */
2431static void free_swap_count_continuations(struct swap_info_struct *si)
2432{
2433 pgoff_t offset;
2434
2435 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2436 struct page *head;
2437 head = vmalloc_to_page(si->swap_map + offset);
2438 if (page_private(head)) {
2439 struct list_head *this, *next;
2440 list_for_each_safe(this, next, &head->lru) {
2441 struct page *page;
2442 page = list_entry(this, struct page, lru);
2443 list_del(this);
2444 __free_page(page);
2445 }
2446 }
2447 }
2448}
diff --git a/mm/truncate.c b/mm/truncate.c
index 450cebdabfc0..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
272 pagevec_release(&pvec); 272 pagevec_release(&pvec);
273 break; 273 break;
274 } 274 }
275 mem_cgroup_uncharge_start();
275 for (i = 0; i < pagevec_count(&pvec); i++) { 276 for (i = 0; i < pagevec_count(&pvec); i++) {
276 struct page *page = pvec.pages[i]; 277 struct page *page = pvec.pages[i];
277 278
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
286 unlock_page(page); 287 unlock_page(page);
287 } 288 }
288 pagevec_release(&pvec); 289 pagevec_release(&pvec);
290 mem_cgroup_uncharge_end();
289 } 291 }
290} 292}
291EXPORT_SYMBOL(truncate_inode_pages_range); 293EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
327 pagevec_init(&pvec, 0); 329 pagevec_init(&pvec, 0);
328 while (next <= end && 330 while (next <= end &&
329 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 331 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
332 mem_cgroup_uncharge_start();
330 for (i = 0; i < pagevec_count(&pvec); i++) { 333 for (i = 0; i < pagevec_count(&pvec); i++) {
331 struct page *page = pvec.pages[i]; 334 struct page *page = pvec.pages[i];
332 pgoff_t index; 335 pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
354 break; 357 break;
355 } 358 }
356 pagevec_release(&pvec); 359 pagevec_release(&pvec);
360 mem_cgroup_uncharge_end();
357 cond_resched(); 361 cond_resched();
358 } 362 }
359 return ret; 363 return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 while (next <= end && !wrapped && 432 while (next <= end && !wrapped &&
429 pagevec_lookup(&pvec, mapping, next, 433 pagevec_lookup(&pvec, mapping, next,
430 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 434 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
435 mem_cgroup_uncharge_start();
431 for (i = 0; i < pagevec_count(&pvec); i++) { 436 for (i = 0; i < pagevec_count(&pvec); i++) {
432 struct page *page = pvec.pages[i]; 437 struct page *page = pvec.pages[i];
433 pgoff_t page_index; 438 pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
477 unlock_page(page); 482 unlock_page(page);
478 } 483 }
479 pagevec_release(&pvec); 484 pagevec_release(&pvec);
485 mem_cgroup_uncharge_end();
480 cond_resched(); 486 cond_resched();
481 } 487 }
482 return ret; 488 return ret;
@@ -490,7 +496,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
490 * Any pages which are found to be mapped into pagetables are unmapped prior to 496 * Any pages which are found to be mapped into pagetables are unmapped prior to
491 * invalidation. 497 * invalidation.
492 * 498 *
493 * Returns -EIO if any pages could not be invalidated. 499 * Returns -EBUSY if any pages could not be invalidated.
494 */ 500 */
495int invalidate_inode_pages2(struct address_space *mapping) 501int invalidate_inode_pages2(struct address_space *mapping)
496{ 502{
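The truncate.c hunks above bracket each pagevec loop with mem_cgroup_uncharge_start()/end(), so per-page uncharges can be accumulated and settled once per pagevec instead of page by page. A tiny userspace model of that start/accumulate/end shape; the function names and the printed "settle" step are illustrative, not the memcg implementation.

#include <stdio.h>

static unsigned long deferred;
static int batching;

static void uncharge_start(void) { batching = 1; }

static void uncharge_page(void)
{
        if (batching)
                deferred++;             /* accumulate instead of settling now */
        else
                printf("settle 1 page\n");
}

static void uncharge_end(void)
{
        batching = 0;
        printf("settle %lu pages in one go\n", deferred);
        deferred = 0;
}

int main(void)
{
        int i;

        uncharge_start();
        for (i = 0; i < 14; i++)        /* roughly one pagevec of pages */
                uncharge_page();
        uncharge_end();
        return 0;
}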
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b65cfe44a562..37e69295f250 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/sched.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/spinlock.h> 17#include <linux/spinlock.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
@@ -25,10 +26,10 @@
25#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
28#include <linux/highmem.h>
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h>
32 33
33 34
34/*** Page table manipulation functions ***/ 35/*** Page table manipulation functions ***/
@@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1156} 1157}
1157 1158
1158static struct vm_struct *__get_vm_area_node(unsigned long size, 1159static struct vm_struct *__get_vm_area_node(unsigned long size,
1159 unsigned long flags, unsigned long start, unsigned long end, 1160 unsigned long align, unsigned long flags, unsigned long start,
1160 int node, gfp_t gfp_mask, void *caller) 1161 unsigned long end, int node, gfp_t gfp_mask, void *caller)
1161{ 1162{
1162 static struct vmap_area *va; 1163 static struct vmap_area *va;
1163 struct vm_struct *area; 1164 struct vm_struct *area;
1164 unsigned long align = 1;
1165 1165
1166 BUG_ON(in_interrupt()); 1166 BUG_ON(in_interrupt());
1167 if (flags & VM_IOREMAP) { 1167 if (flags & VM_IOREMAP) {
@@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1202 unsigned long start, unsigned long end) 1202 unsigned long start, unsigned long end)
1203{ 1203{
1204 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1204 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1205 __builtin_return_address(0)); 1205 __builtin_return_address(0));
1206} 1206}
1207EXPORT_SYMBOL_GPL(__get_vm_area); 1207EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1210 unsigned long start, unsigned long end, 1210 unsigned long start, unsigned long end,
1211 void *caller) 1211 void *caller)
1212{ 1212{
1213 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1213 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1214 caller); 1214 caller);
1215} 1215}
1216 1216
@@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1225 */ 1225 */
1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1227{ 1227{
1228 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1228 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1229 -1, GFP_KERNEL, __builtin_return_address(0)); 1229 -1, GFP_KERNEL, __builtin_return_address(0));
1230} 1230}
1231 1231
1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1233 void *caller) 1233 void *caller)
1234{ 1234{
1235 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1235 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1236 -1, GFP_KERNEL, caller); 1236 -1, GFP_KERNEL, caller);
1237} 1237}
1238 1238
1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1240 int node, gfp_t gfp_mask) 1240 int node, gfp_t gfp_mask)
1241{ 1241{
1242 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 1242 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1243 gfp_mask, __builtin_return_address(0)); 1243 node, gfp_mask, __builtin_return_address(0));
1244} 1244}
1245 1245
1246static struct vm_struct *find_vm_area(const void *addr) 1246static struct vm_struct *find_vm_area(const void *addr)
@@ -1403,13 +1403,15 @@ void *vmap(struct page **pages, unsigned int count,
1403} 1403}
1404EXPORT_SYMBOL(vmap); 1404EXPORT_SYMBOL(vmap);
1405 1405
1406static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1406static void *__vmalloc_node(unsigned long size, unsigned long align,
1407 gfp_t gfp_mask, pgprot_t prot,
1407 int node, void *caller); 1408 int node, void *caller);
1408static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1409static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1409 pgprot_t prot, int node, void *caller) 1410 pgprot_t prot, int node, void *caller)
1410{ 1411{
1411 struct page **pages; 1412 struct page **pages;
1412 unsigned int nr_pages, array_size, i; 1413 unsigned int nr_pages, array_size, i;
1414 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1413 1415
1414 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1416 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1415 array_size = (nr_pages * sizeof(struct page *)); 1417 array_size = (nr_pages * sizeof(struct page *));
@@ -1417,13 +1419,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1417 area->nr_pages = nr_pages; 1419 area->nr_pages = nr_pages;
1418 /* Please note that the recursion is strictly bounded. */ 1420 /* Please note that the recursion is strictly bounded. */
1419 if (array_size > PAGE_SIZE) { 1421 if (array_size > PAGE_SIZE) {
1420 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 1422 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1421 PAGE_KERNEL, node, caller); 1423 PAGE_KERNEL, node, caller);
1422 area->flags |= VM_VPAGES; 1424 area->flags |= VM_VPAGES;
1423 } else { 1425 } else {
1424 pages = kmalloc_node(array_size, 1426 pages = kmalloc_node(array_size, nested_gfp, node);
1425 (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
1426 node);
1427 } 1427 }
1428 area->pages = pages; 1428 area->pages = pages;
1429 area->caller = caller; 1429 area->caller = caller;
@@ -1476,6 +1476,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1476/** 1476/**
1477 * __vmalloc_node - allocate virtually contiguous memory 1477 * __vmalloc_node - allocate virtually contiguous memory
1478 * @size: allocation size 1478 * @size: allocation size
1479 * @align: desired alignment
1479 * @gfp_mask: flags for the page level allocator 1480 * @gfp_mask: flags for the page level allocator
1480 * @prot: protection mask for the allocated pages 1481 * @prot: protection mask for the allocated pages
1481 * @node: node to use for allocation or -1 1482 * @node: node to use for allocation or -1
@@ -1485,8 +1486,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1485 * allocator with @gfp_mask flags. Map them into contiguous 1486 * allocator with @gfp_mask flags. Map them into contiguous
1486 * kernel virtual space, using a pagetable protection of @prot. 1487 * kernel virtual space, using a pagetable protection of @prot.
1487 */ 1488 */
1488static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1489static void *__vmalloc_node(unsigned long size, unsigned long align,
1489 int node, void *caller) 1490 gfp_t gfp_mask, pgprot_t prot,
1491 int node, void *caller)
1490{ 1492{
1491 struct vm_struct *area; 1493 struct vm_struct *area;
1492 void *addr; 1494 void *addr;
@@ -1496,8 +1498,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1496 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1498 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1497 return NULL; 1499 return NULL;
1498 1500
1499 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1501 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
1500 node, gfp_mask, caller); 1502 VMALLOC_END, node, gfp_mask, caller);
1501 1503
1502 if (!area) 1504 if (!area)
1503 return NULL; 1505 return NULL;
@@ -1516,7 +1518,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1516 1518
1517void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1519void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1518{ 1520{
1519 return __vmalloc_node(size, gfp_mask, prot, -1, 1521 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
1520 __builtin_return_address(0)); 1522 __builtin_return_address(0));
1521} 1523}
1522EXPORT_SYMBOL(__vmalloc); 1524EXPORT_SYMBOL(__vmalloc);
@@ -1532,7 +1534,7 @@ EXPORT_SYMBOL(__vmalloc);
1532 */ 1534 */
1533void *vmalloc(unsigned long size) 1535void *vmalloc(unsigned long size)
1534{ 1536{
1535 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1537 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1536 -1, __builtin_return_address(0)); 1538 -1, __builtin_return_address(0));
1537} 1539}
1538EXPORT_SYMBOL(vmalloc); 1540EXPORT_SYMBOL(vmalloc);
@@ -1549,7 +1551,8 @@ void *vmalloc_user(unsigned long size)
1549 struct vm_struct *area; 1551 struct vm_struct *area;
1550 void *ret; 1552 void *ret;
1551 1553
1552 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1554 ret = __vmalloc_node(size, SHMLBA,
1555 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1553 PAGE_KERNEL, -1, __builtin_return_address(0)); 1556 PAGE_KERNEL, -1, __builtin_return_address(0));
1554 if (ret) { 1557 if (ret) {
1555 area = find_vm_area(ret); 1558 area = find_vm_area(ret);
@@ -1572,7 +1575,7 @@ EXPORT_SYMBOL(vmalloc_user);
1572 */ 1575 */
1573void *vmalloc_node(unsigned long size, int node) 1576void *vmalloc_node(unsigned long size, int node)
1574{ 1577{
1575 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1578 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1576 node, __builtin_return_address(0)); 1579 node, __builtin_return_address(0));
1577} 1580}
1578EXPORT_SYMBOL(vmalloc_node); 1581EXPORT_SYMBOL(vmalloc_node);
@@ -1595,7 +1598,7 @@ EXPORT_SYMBOL(vmalloc_node);
1595 1598
1596void *vmalloc_exec(unsigned long size) 1599void *vmalloc_exec(unsigned long size)
1597{ 1600{
1598 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1601 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1599 -1, __builtin_return_address(0)); 1602 -1, __builtin_return_address(0));
1600} 1603}
1601 1604
@@ -1616,7 +1619,7 @@ void *vmalloc_exec(unsigned long size)
1616 */ 1619 */
1617void *vmalloc_32(unsigned long size) 1620void *vmalloc_32(unsigned long size)
1618{ 1621{
1619 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, 1622 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1620 -1, __builtin_return_address(0)); 1623 -1, __builtin_return_address(0));
1621} 1624}
1622EXPORT_SYMBOL(vmalloc_32); 1625EXPORT_SYMBOL(vmalloc_32);
@@ -1633,7 +1636,7 @@ void *vmalloc_32_user(unsigned long size)
1633 struct vm_struct *area; 1636 struct vm_struct *area;
1634 void *ret; 1637 void *ret;
1635 1638
1636 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1639 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1637 -1, __builtin_return_address(0)); 1640 -1, __builtin_return_address(0));
1638 if (ret) { 1641 if (ret) {
1639 area = find_vm_area(ret); 1642 area = find_vm_area(ret);
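The vmalloc.c changes above thread an align argument down to the vmap area allocator: existing callers pass 1, while vmalloc_user() now asks for SHMLBA alignment, presumably so the area can later be mapped to userspace without cache-aliasing surprises. A minimal userspace sketch of the power-of-two round-up such an alignment request implies; the SHMLBA value used here is an assumed example, the real one is per-architecture.

#include <stdint.h>
#include <stdio.h>

#define SHMLBA_ASSUMED 0x4000UL         /* illustrative only */

static uintptr_t align_up(uintptr_t addr, uintptr_t align)
{
        return (addr + align - 1) & ~(align - 1);   /* align: power of two */
}

int main(void)
{
        uintptr_t base = 0xf0001234UL;

        printf("align 1      -> %#lx\n", (unsigned long)align_up(base, 1));
        printf("align SHMLBA -> %#lx\n",
               (unsigned long)align_up(base, SHMLBA_ASSUMED));
        return 0;
}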
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64e438898832..885207a6b6b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,11 @@ struct scan_control {
55 /* Number of pages freed so far during a call to shrink_zones() */ 55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed; 56 unsigned long nr_reclaimed;
57 57
58 /* How many pages shrink_list() should reclaim */
59 unsigned long nr_to_reclaim;
60
61 unsigned long hibernation_mode;
62
58 /* This context's GFP mask */ 63 /* This context's GFP mask */
59 gfp_t gfp_mask; 64 gfp_t gfp_mask;
60 65
@@ -66,12 +71,6 @@ struct scan_control {
66 /* Can pages be swapped as part of reclaim? */ 71 /* Can pages be swapped as part of reclaim? */
67 int may_swap; 72 int may_swap;
68 73
69 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
70 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
71 * In this context, it doesn't matter that we scan the
72 * whole list at once. */
73 int swap_cluster_max;
74
75 int swappiness; 74 int swappiness;
76 75
77 int all_unreclaimable; 76 int all_unreclaimable;
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
358 * stalls if we need to run get_block(). We could test 357 * stalls if we need to run get_block(). We could test
359 * PagePrivate for that. 358 * PagePrivate for that.
360 * 359 *
361 * If this process is currently in generic_file_write() against 360 * If this process is currently in __generic_file_aio_write() against
362 * this page's queue, we can perform writeback even if that 361 * this page's queue, we can perform writeback even if that
363 * will block. 362 * will block.
364 * 363 *
@@ -544,6 +543,16 @@ redo:
544 */ 543 */
545 lru = LRU_UNEVICTABLE; 544 lru = LRU_UNEVICTABLE;
546 add_page_to_unevictable_list(page); 545 add_page_to_unevictable_list(page);
546 /*
547 * When racing with an mlock clearing (page is
548 * unlocked), make sure that if the other thread does
549 * not observe our setting of PG_lru and fails
550 * isolation, we see PG_mlocked cleared below and move
551 * the page back to the evictable list.
552 *
553 * The other side is TestClearPageMlocked().
554 */
555 smp_mb();
547 } 556 }
548 557
549 /* 558 /*
@@ -1088,7 +1097,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1088 int lumpy_reclaim = 0; 1097 int lumpy_reclaim = 0;
1089 1098
1090 while (unlikely(too_many_isolated(zone, file, sc))) { 1099 while (unlikely(too_many_isolated(zone, file, sc))) {
1091 congestion_wait(WRITE, HZ/10); 1100 congestion_wait(BLK_RW_ASYNC, HZ/10);
1092 1101
1093 /* We are about to die and free our memory. Return now. */ 1102 /* We are about to die and free our memory. Return now. */
1094 if (fatal_signal_pending(current)) 1103 if (fatal_signal_pending(current))
@@ -1122,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1122 unsigned long nr_anon; 1131 unsigned long nr_anon;
1123 unsigned long nr_file; 1132 unsigned long nr_file;
1124 1133
1125 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1134 nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
1126 &page_list, &nr_scan, sc->order, mode, 1135 &page_list, &nr_scan, sc->order, mode,
1127 zone, sc->mem_cgroup, 0, file); 1136 zone, sc->mem_cgroup, 0, file);
1128 1137
@@ -1156,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1156 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1165 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1157 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); 1166 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1158 1167
1159 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1168 reclaim_stat->recent_scanned[0] += nr_anon;
1160 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1169 reclaim_stat->recent_scanned[1] += nr_file;
1161 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1162 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1163 1170
1164 spin_unlock_irq(&zone->lru_lock); 1171 spin_unlock_irq(&zone->lru_lock);
1165 1172
@@ -1356,7 +1363,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1356 * IO, plus JVM can create lots of anon VM_EXEC pages, 1363 * IO, plus JVM can create lots of anon VM_EXEC pages,
1357 * so we ignore them here. 1364 * so we ignore them here.
1358 */ 1365 */
1359 if ((vm_flags & VM_EXEC) && !PageAnon(page)) { 1366 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1360 list_add(&page->lru, &l_active); 1367 list_add(&page->lru, &l_active);
1361 continue; 1368 continue;
1362 } 1369 }
@@ -1454,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1454 return low; 1461 return low;
1455} 1462}
1456 1463
1464static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1465 int file)
1466{
1467 if (file)
1468 return inactive_file_is_low(zone, sc);
1469 else
1470 return inactive_anon_is_low(zone, sc);
1471}
1472
1457static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1473static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1458 struct zone *zone, struct scan_control *sc, int priority) 1474 struct zone *zone, struct scan_control *sc, int priority)
1459{ 1475{
1460 int file = is_file_lru(lru); 1476 int file = is_file_lru(lru);
1461 1477
1462 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { 1478 if (is_active_lru(lru)) {
1463 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1479 if (inactive_list_is_low(zone, sc, file))
1480 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1464 return 0; 1481 return 0;
1465 } 1482 }
1466 1483
1467 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1468 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1469 return 0;
1470 }
1471 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1484 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1472} 1485}
1473 1486
@@ -1557,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1557 * until we collected @swap_cluster_max pages to scan. 1570 * until we collected @swap_cluster_max pages to scan.
1558 */ 1571 */
1559static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, 1572static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1560 unsigned long *nr_saved_scan, 1573 unsigned long *nr_saved_scan)
1561 unsigned long swap_cluster_max)
1562{ 1574{
1563 unsigned long nr; 1575 unsigned long nr;
1564 1576
1565 *nr_saved_scan += nr_to_scan; 1577 *nr_saved_scan += nr_to_scan;
1566 nr = *nr_saved_scan; 1578 nr = *nr_saved_scan;
1567 1579
1568 if (nr >= swap_cluster_max) 1580 if (nr >= SWAP_CLUSTER_MAX)
1569 *nr_saved_scan = 0; 1581 *nr_saved_scan = 0;
1570 else 1582 else
1571 nr = 0; 1583 nr = 0;
@@ -1584,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone,
1584 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1596 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1585 enum lru_list l; 1597 enum lru_list l;
1586 unsigned long nr_reclaimed = sc->nr_reclaimed; 1598 unsigned long nr_reclaimed = sc->nr_reclaimed;
1587 unsigned long swap_cluster_max = sc->swap_cluster_max; 1599 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1588 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1600 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1589 int noswap = 0; 1601 int noswap = 0;
1590 1602
@@ -1606,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone,
1606 scan = (scan * percent[file]) / 100; 1618 scan = (scan * percent[file]) / 100;
1607 } 1619 }
1608 nr[l] = nr_scan_try_batch(scan, 1620 nr[l] = nr_scan_try_batch(scan,
1609 &reclaim_stat->nr_saved_scan[l], 1621 &reclaim_stat->nr_saved_scan[l]);
1610 swap_cluster_max);
1611 } 1622 }
1612 1623
1613 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1624 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1614 nr[LRU_INACTIVE_FILE]) { 1625 nr[LRU_INACTIVE_FILE]) {
1615 for_each_evictable_lru(l) { 1626 for_each_evictable_lru(l) {
1616 if (nr[l]) { 1627 if (nr[l]) {
1617 nr_to_scan = min(nr[l], swap_cluster_max); 1628 nr_to_scan = min_t(unsigned long,
1629 nr[l], SWAP_CLUSTER_MAX);
1618 nr[l] -= nr_to_scan; 1630 nr[l] -= nr_to_scan;
1619 1631
1620 nr_reclaimed += shrink_list(l, nr_to_scan, 1632 nr_reclaimed += shrink_list(l, nr_to_scan,
@@ -1629,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone,
1629 * with multiple processes reclaiming pages, the total 1641 * with multiple processes reclaiming pages, the total
1630 * freeing target can get unreasonably large. 1642 * freeing target can get unreasonably large.
1631 */ 1643 */
1632 if (nr_reclaimed > swap_cluster_max && 1644 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1633 priority < DEF_PRIORITY && !current_is_kswapd())
1634 break; 1645 break;
1635 } 1646 }
1636 1647
@@ -1728,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1728 struct zoneref *z; 1739 struct zoneref *z;
1729 struct zone *zone; 1740 struct zone *zone;
1730 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1741 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1742 unsigned long writeback_threshold;
1731 1743
1732 delayacct_freepages_start(); 1744 delayacct_freepages_start();
1733 1745
@@ -1763,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1763 } 1775 }
1764 } 1776 }
1765 total_scanned += sc->nr_scanned; 1777 total_scanned += sc->nr_scanned;
1766 if (sc->nr_reclaimed >= sc->swap_cluster_max) { 1778 if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
1767 ret = sc->nr_reclaimed; 1779 ret = sc->nr_reclaimed;
1768 goto out; 1780 goto out;
1769 } 1781 }
@@ -1775,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1775 * that's undesirable in laptop mode, where we *want* lumpy 1787 * that's undesirable in laptop mode, where we *want* lumpy
1776 * writeout. So in laptop mode, write out the whole world. 1788 * writeout. So in laptop mode, write out the whole world.
1777 */ 1789 */
1778 if (total_scanned > sc->swap_cluster_max + 1790 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
1779 sc->swap_cluster_max / 2) { 1791 if (total_scanned > writeback_threshold) {
1780 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 1792 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1781 sc->may_writepage = 1; 1793 sc->may_writepage = 1;
1782 } 1794 }
1783 1795
1784 /* Take a nap, wait for some writeback to complete */ 1796 /* Take a nap, wait for some writeback to complete */
1785 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1797 if (!sc->hibernation_mode && sc->nr_scanned &&
1798 priority < DEF_PRIORITY - 2)
1786 congestion_wait(BLK_RW_ASYNC, HZ/10); 1799 congestion_wait(BLK_RW_ASYNC, HZ/10);
1787 } 1800 }
1788 /* top priority shrink_zones still had more to do? don't OOM, then */ 1801 /* top priority shrink_zones still had more to do? don't OOM, then */
@@ -1821,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1821 struct scan_control sc = { 1834 struct scan_control sc = {
1822 .gfp_mask = gfp_mask, 1835 .gfp_mask = gfp_mask,
1823 .may_writepage = !laptop_mode, 1836 .may_writepage = !laptop_mode,
1824 .swap_cluster_max = SWAP_CLUSTER_MAX, 1837 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1825 .may_unmap = 1, 1838 .may_unmap = 1,
1826 .may_swap = 1, 1839 .may_swap = 1,
1827 .swappiness = vm_swappiness, 1840 .swappiness = vm_swappiness,
@@ -1845,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1845 .may_writepage = !laptop_mode, 1858 .may_writepage = !laptop_mode,
1846 .may_unmap = 1, 1859 .may_unmap = 1,
1847 .may_swap = !noswap, 1860 .may_swap = !noswap,
1848 .swap_cluster_max = SWAP_CLUSTER_MAX,
1849 .swappiness = swappiness, 1861 .swappiness = swappiness,
1850 .order = 0, 1862 .order = 0,
1851 .mem_cgroup = mem, 1863 .mem_cgroup = mem,
@@ -1879,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1879 .may_writepage = !laptop_mode, 1891 .may_writepage = !laptop_mode,
1880 .may_unmap = 1, 1892 .may_unmap = 1,
1881 .may_swap = !noswap, 1893 .may_swap = !noswap,
1882 .swap_cluster_max = SWAP_CLUSTER_MAX, 1894 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1883 .swappiness = swappiness, 1895 .swappiness = swappiness,
1884 .order = 0, 1896 .order = 0,
1885 .mem_cgroup = mem_cont, 1897 .mem_cgroup = mem_cont,
@@ -1894,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1894} 1906}
1895#endif 1907#endif
1896 1908
1909/* is kswapd sleeping prematurely? */
1910static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1911{
1912 int i;
1913
1914 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
1915 if (remaining)
1916 return 1;
1917
1918 /* If after HZ/10, a zone is below the high mark, it's premature */
1919 for (i = 0; i < pgdat->nr_zones; i++) {
1920 struct zone *zone = pgdat->node_zones + i;
1921
1922 if (!populated_zone(zone))
1923 continue;
1924
1925 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1926 0, 0))
1927 return 1;
1928 }
1929
1930 return 0;
1931}
1932
1897/* 1933/*
1898 * For kswapd, balance_pgdat() will work across all this node's zones until 1934 * For kswapd, balance_pgdat() will work across all this node's zones until
1899 * they are all at high_wmark_pages(zone). 1935 * they are all at high_wmark_pages(zone).
@@ -1926,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1926 .gfp_mask = GFP_KERNEL, 1962 .gfp_mask = GFP_KERNEL,
1927 .may_unmap = 1, 1963 .may_unmap = 1,
1928 .may_swap = 1, 1964 .may_swap = 1,
1929 .swap_cluster_max = SWAP_CLUSTER_MAX, 1965 /*
1966 * kswapd doesn't want to be bailed out of reclaim early, because
1967 * we want to put equal scanning pressure on each zone.
1968 */
1969 .nr_to_reclaim = ULONG_MAX,
1930 .swappiness = vm_swappiness, 1970 .swappiness = vm_swappiness,
1931 .order = order, 1971 .order = order,
1932 .mem_cgroup = NULL, 1972 .mem_cgroup = NULL,
@@ -1951,6 +1991,7 @@ loop_again:
1951 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1991 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1952 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1992 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1953 unsigned long lru_pages = 0; 1993 unsigned long lru_pages = 0;
1994 int has_under_min_watermark_zone = 0;
1954 1995
1955 /* The swap token gets in the way of swapout... */ 1996 /* The swap token gets in the way of swapout... */
1956 if (!priority) 1997 if (!priority)
@@ -2057,6 +2098,15 @@ loop_again:
2057 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2098 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2058 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2099 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2059 sc.may_writepage = 1; 2100 sc.may_writepage = 1;
2101
2102 /*
2103 * We are still under the min watermark. This means there is a
2104 * risk of GFP_ATOMIC allocation failure. Hurry up!
2105 */
2106 if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
2107 end_zone, 0))
2108 has_under_min_watermark_zone = 1;
2109
2060 } 2110 }
2061 if (all_zones_ok) 2111 if (all_zones_ok)
2062 break; /* kswapd: all done */ 2112 break; /* kswapd: all done */
@@ -2064,8 +2114,12 @@ loop_again:
2064 * OK, kswapd is getting into trouble. Take a nap, then take 2114 * OK, kswapd is getting into trouble. Take a nap, then take
2065 * another pass across the zones. 2115 * another pass across the zones.
2066 */ 2116 */
2067 if (total_scanned && priority < DEF_PRIORITY - 2) 2117 if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2068 congestion_wait(BLK_RW_ASYNC, HZ/10); 2118 if (has_under_min_watermark_zone)
2119 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2120 else
2121 congestion_wait(BLK_RW_ASYNC, HZ/10);
2122 }
2069 2123
2070 /* 2124 /*
2071 * We do this so kswapd doesn't build up large priorities for 2125 * We do this so kswapd doesn't build up large priorities for
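In the hunk above, kswapd's end-of-priority nap becomes conditional: if any zone in the range being balanced is still below its min watermark, the congestion_wait() is skipped (and counted as KSWAPD_SKIP_CONGESTION_WAIT), since GFP_ATOMIC allocations could fail while kswapd dozes. A condensed userspace model of that decision follows; kswapd_should_nap() is an invented name, and DEF_PRIORITY is 12 as in mainline of this era.

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

/*
 * Model of kswapd's per-priority nap: nap only when we have scanned
 * something at a low priority AND no zone is below its min watermark;
 * otherwise keep going (the kernel counts KSWAPD_SKIP_CONGESTION_WAIT).
 */
static bool kswapd_should_nap(unsigned long total_scanned, int priority,
			      bool has_under_min_watermark_zone)
{
	if (!total_scanned || priority >= DEF_PRIORITY - 2)
		return false;			/* not in trouble yet */
	return !has_under_min_watermark_zone;	/* else skip the wait */
}

int main(void)
{
	printf("%d\n", kswapd_should_nap(1024, 9, false));	/* 1: take the nap */
	printf("%d\n", kswapd_should_nap(1024, 9, true));	/* 0: hurry up */
	return 0;
}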
@@ -2163,6 +2217,7 @@ static int kswapd(void *p)
2163 order = 0; 2217 order = 0;
2164 for ( ; ; ) { 2218 for ( ; ; ) {
2165 unsigned long new_order; 2219 unsigned long new_order;
2220 int ret;
2166 2221
2167 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2168 new_order = pgdat->kswapd_max_order; 2223 new_order = pgdat->kswapd_max_order;
@@ -2174,19 +2229,45 @@ static int kswapd(void *p)
2174 */ 2229 */
2175 order = new_order; 2230 order = new_order;
2176 } else { 2231 } else {
2177 if (!freezing(current)) 2232 if (!freezing(current) && !kthread_should_stop()) {
2178 schedule(); 2233 long remaining = 0;
2234
2235 /* Try to sleep for a short interval */
2236 if (!sleeping_prematurely(pgdat, order, remaining)) {
2237 remaining = schedule_timeout(HZ/10);
2238 finish_wait(&pgdat->kswapd_wait, &wait);
2239 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2240 }
2241
2242 /*
2243 * After the short sleep, check whether the sleep was
2244 * premature. If it was not, go fully to sleep
2245 * until explicitly woken up.
2246 */
2247 if (!sleeping_prematurely(pgdat, order, remaining))
2248 schedule();
2249 else {
2250 if (remaining)
2251 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2252 else
2253 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2254 }
2255 }
2179 2256
2180 order = pgdat->kswapd_max_order; 2257 order = pgdat->kswapd_max_order;
2181 } 2258 }
2182 finish_wait(&pgdat->kswapd_wait, &wait); 2259 finish_wait(&pgdat->kswapd_wait, &wait);
2183 2260
2184 if (!try_to_freeze()) { 2261 ret = try_to_freeze();
2185 /* We can speed up thawing tasks if we don't call 2262 if (kthread_should_stop())
2186 * balance_pgdat after returning from the refrigerator 2263 break;
2187 */ 2264
2265 /*
2266 * We can speed up thawing tasks if we don't call balance_pgdat
2267 * after returning from the refrigerator
2268 */
2269 if (!ret)
2188 balance_pgdat(pgdat, order); 2270 balance_pgdat(pgdat, order);
2189 }
2190 } 2271 }
2191 return 0; 2272 return 0;
2192} 2273}
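The rewritten idle path above sleeps in two phases: a HZ/10 catnap first, and only if that catnap was not cut short and the node still looks balanced does kswapd commit to an indefinite schedule(); otherwise it counts which watermark brought it back. The userspace model below follows that control flow; premature(), catnap(), deep_sleep() and zone_below_high_wmark() are stand-ins for sleeping_prematurely(), schedule_timeout(HZ/10), schedule() and the zone_watermark_ok() check, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel primitives in kswapd's idle path. */
static bool zone_below_high_wmark(void) { return false; }	/* node balanced */
static long catnap(void) { return 0; }	/* schedule_timeout(HZ/10); 0 = timer expired */
static void deep_sleep(void) { puts("sleeping until woken"); }	/* schedule() */

/* Mirrors sleeping_prematurely(): premature if re-woken or still unbalanced. */
static bool premature(long remaining)
{
	return remaining != 0 || zone_below_high_wmark();
}

/*
 * Model of the new two-phase sleep: catnap for HZ/10 first, and only
 * commit to the indefinite sleep if nothing woke us and the node still
 * looks balanced; otherwise record which watermark brought us back.
 */
static void kswapd_try_to_sleep_model(void)
{
	long remaining = 0;

	if (!premature(remaining))
		remaining = catnap();

	if (!premature(remaining))
		deep_sleep();
	else if (remaining)
		puts("KSWAPD_LOW_WMARK_HIT_QUICKLY");	/* woken during the catnap */
	else
		puts("KSWAPD_HIGH_WMARK_HIT_QUICKLY");	/* fell below high wmark again */
}

int main(void)
{
	kswapd_try_to_sleep_model();
	return 0;
}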
@@ -2250,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
2250 2331
2251#ifdef CONFIG_HIBERNATION 2332#ifdef CONFIG_HIBERNATION
2252/* 2333/*
2253 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2334 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
2254 * from LRU lists system-wide, for given pass and priority.
2255 *
2256 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
2257 */
2258static void shrink_all_zones(unsigned long nr_pages, int prio,
2259 int pass, struct scan_control *sc)
2260{
2261 struct zone *zone;
2262 unsigned long nr_reclaimed = 0;
2263 struct zone_reclaim_stat *reclaim_stat;
2264
2265 for_each_populated_zone(zone) {
2266 enum lru_list l;
2267
2268 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2269 continue;
2270
2271 for_each_evictable_lru(l) {
2272 enum zone_stat_item ls = NR_LRU_BASE + l;
2273 unsigned long lru_pages = zone_page_state(zone, ls);
2274
2275 /* For pass = 0, we don't shrink the active list */
2276 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2277 l == LRU_ACTIVE_FILE))
2278 continue;
2279
2280 reclaim_stat = get_reclaim_stat(zone, sc);
2281 reclaim_stat->nr_saved_scan[l] +=
2282 (lru_pages >> prio) + 1;
2283 if (reclaim_stat->nr_saved_scan[l]
2284 >= nr_pages || pass > 3) {
2285 unsigned long nr_to_scan;
2286
2287 reclaim_stat->nr_saved_scan[l] = 0;
2288 nr_to_scan = min(nr_pages, lru_pages);
2289 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2290 sc, prio);
2291 if (nr_reclaimed >= nr_pages) {
2292 sc->nr_reclaimed += nr_reclaimed;
2293 return;
2294 }
2295 }
2296 }
2297 }
2298 sc->nr_reclaimed += nr_reclaimed;
2299}
2300
2301/*
2302 * Try to free `nr_pages' of memory, system-wide, and return the number of
2303 * freed pages. 2335 * freed pages.
2304 * 2336 *
2305 * Rather than trying to age LRUs the aim is to preserve the overall 2337 * Rather than trying to age LRUs the aim is to preserve the overall
2306 * LRU order by reclaiming preferentially 2338 * LRU order by reclaiming preferentially
2307 * inactive > active > active referenced > active mapped 2339 * inactive > active > active referenced > active mapped
2308 */ 2340 */
2309unsigned long shrink_all_memory(unsigned long nr_pages) 2341unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2310{ 2342{
2311 unsigned long lru_pages, nr_slab;
2312 int pass;
2313 struct reclaim_state reclaim_state; 2343 struct reclaim_state reclaim_state;
2314 struct scan_control sc = { 2344 struct scan_control sc = {
2315 .gfp_mask = GFP_KERNEL, 2345 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2316 .may_unmap = 0, 2346 .may_swap = 1,
2347 .may_unmap = 1,
2317 .may_writepage = 1, 2348 .may_writepage = 1,
2349 .nr_to_reclaim = nr_to_reclaim,
2350 .hibernation_mode = 1,
2351 .swappiness = vm_swappiness,
2352 .order = 0,
2318 .isolate_pages = isolate_pages_global, 2353 .isolate_pages = isolate_pages_global,
2319 .nr_reclaimed = 0,
2320 }; 2354 };
2355 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2356 struct task_struct *p = current;
2357 unsigned long nr_reclaimed;
2321 2358
2322 current->reclaim_state = &reclaim_state; 2359 p->flags |= PF_MEMALLOC;
2323 2360 lockdep_set_current_reclaim_state(sc.gfp_mask);
2324 lru_pages = global_reclaimable_pages(); 2361 reclaim_state.reclaimed_slab = 0;
2325 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2362 p->reclaim_state = &reclaim_state;
2326 /* If slab caches are huge, it's better to hit them first */
2327 while (nr_slab >= lru_pages) {
2328 reclaim_state.reclaimed_slab = 0;
2329 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2330 if (!reclaim_state.reclaimed_slab)
2331 break;
2332
2333 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2334 if (sc.nr_reclaimed >= nr_pages)
2335 goto out;
2336
2337 nr_slab -= reclaim_state.reclaimed_slab;
2338 }
2339
2340 /*
2341 * We try to shrink LRUs in 5 passes:
2342 * 0 = Reclaim from inactive_list only
2343 * 1 = Reclaim from active list but don't reclaim mapped
2344 * 2 = 2nd pass of type 1
2345 * 3 = Reclaim mapped (normal reclaim)
2346 * 4 = 2nd pass of type 3
2347 */
2348 for (pass = 0; pass < 5; pass++) {
2349 int prio;
2350
2351 /* Force reclaiming mapped pages in the passes #3 and #4 */
2352 if (pass > 2)
2353 sc.may_unmap = 1;
2354
2355 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2356 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2357
2358 sc.nr_scanned = 0;
2359 sc.swap_cluster_max = nr_to_scan;
2360 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2361 if (sc.nr_reclaimed >= nr_pages)
2362 goto out;
2363
2364 reclaim_state.reclaimed_slab = 0;
2365 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2366 global_reclaimable_pages());
2367 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2368 if (sc.nr_reclaimed >= nr_pages)
2369 goto out;
2370
2371 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2372 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2373 }
2374 }
2375
2376 /*
2377 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
2378 * something in slab caches
2379 */
2380 if (!sc.nr_reclaimed) {
2381 do {
2382 reclaim_state.reclaimed_slab = 0;
2383 shrink_slab(nr_pages, sc.gfp_mask,
2384 global_reclaimable_pages());
2385 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2386 } while (sc.nr_reclaimed < nr_pages &&
2387 reclaim_state.reclaimed_slab > 0);
2388 }
2389 2363
2364 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2390 2365
2391out: 2366 p->reclaim_state = NULL;
2392 current->reclaim_state = NULL; 2367 lockdep_clear_current_reclaim_state();
2368 p->flags &= ~PF_MEMALLOC;
2393 2369
2394 return sc.nr_reclaimed; 2370 return nr_reclaimed;
2395} 2371}
2396#endif /* CONFIG_HIBERNATION */ 2372#endif /* CONFIG_HIBERNATION */
2397 2373
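shrink_all_memory() is reduced above to a thin wrapper: it marks the caller PF_MEMALLOC, installs a reclaim_state, and hands a hibernation-flavoured scan_control straight to do_try_to_free_pages() instead of running its own five-pass loop over the LRUs and slab. A caller is expected to simply retry while progress is being made; the sketch below models that pattern in userspace with a stubbed shrink_all_memory_stub() and made-up page counts, so it is an illustration of the calling convention, not of the hibernation code itself.

#include <stdio.h>

/*
 * Userspace stand-in for shrink_all_memory(): pretend each call frees
 * at most 100 "pages" until a fixed pool of 250 is exhausted.
 */
static unsigned long pool = 250;
static unsigned long shrink_all_memory_stub(unsigned long nr_to_reclaim)
{
	unsigned long got = nr_to_reclaim < 100 ? nr_to_reclaim : 100;

	if (got > pool)
		got = pool;
	pool -= got;
	return got;
}

int main(void)
{
	unsigned long to_free = 400, freed = 0, got;

	/*
	 * Retry until the target is met or reclaim stops making progress,
	 * the pattern a hibernation-style caller would presumably use.
	 */
	while (to_free && (got = shrink_all_memory_stub(to_free)) > 0) {
		freed += got;
		to_free -= got;
	}
	printf("freed %lu, still wanted %lu\n", freed, to_free);
	return 0;
}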
@@ -2441,6 +2417,17 @@ int kswapd_run(int nid)
2441 return ret; 2417 return ret;
2442} 2418}
2443 2419
2420/*
2421 * Called by memory hotplug when all memory in a node is offlined.
2422 */
2423void kswapd_stop(int nid)
2424{
2425 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2426
2427 if (kswapd)
2428 kthread_stop(kswapd);
2429}
2430
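kswapd_stop() gives memory hotplug a counterpart to kswapd_run(): once a node has no online memory left, its kswapd thread is stopped with kthread_stop(), which the new kthread_should_stop() check in kswapd()'s loop turns into a clean exit. The userspace sketch below models that stop handshake with a pthread and an atomic flag; kswapd_model() and should_stop are invented names standing in for the kthread machinery, not kernel interfaces. Build with: cc -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Userspace model of the kthread_stop() handshake added to kswapd():
 * the worker polls a stop flag at the top of its loop and exits
 * cleanly; the "hotplug" side sets the flag and joins the thread.
 */
static atomic_bool should_stop;

static void *kswapd_model(void *arg)
{
	(void)arg;
	for (;;) {
		if (should_stop)		/* kthread_should_stop() */
			break;
		usleep(100 * 1000);		/* stand-in for sleeping/reclaiming */
	}
	return NULL;
}

int main(void)
{
	pthread_t kswapd;

	pthread_create(&kswapd, NULL, kswapd_model, NULL);	/* kswapd_run() */
	usleep(300 * 1000);
	should_stop = true;					/* kswapd_stop() */
	pthread_join(kswapd, NULL);
	puts("kswapd thread stopped");
	return 0;
}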
2444static int __init kswapd_init(void) 2431static int __init kswapd_init(void)
2445{ 2432{
2446 int nid; 2433 int nid;
@@ -2543,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2543 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2530 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2544 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2531 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2545 .may_swap = 1, 2532 .may_swap = 1,
2546 .swap_cluster_max = max_t(unsigned long, nr_pages, 2533 .nr_to_reclaim = max_t(unsigned long, nr_pages,
2547 SWAP_CLUSTER_MAX), 2534 SWAP_CLUSTER_MAX),
2548 .gfp_mask = gfp_mask, 2535 .gfp_mask = gfp_mask,
2549 .swappiness = vm_swappiness, 2536 .swappiness = vm_swappiness,
2550 .order = order, 2537 .order = order,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index dad2327e4580..6051fbab67ba 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -683,6 +683,9 @@ static const char * const vmstat_text[] = {
683 "slabs_scanned", 683 "slabs_scanned",
684 "kswapd_steal", 684 "kswapd_steal",
685 "kswapd_inodesteal", 685 "kswapd_inodesteal",
686 "kswapd_low_wmark_hit_quickly",
687 "kswapd_high_wmark_hit_quickly",
688 "kswapd_skip_congestion_wait",
686 "pageoutrun", 689 "pageoutrun",
687 "allocstall", 690 "allocstall",
688 691
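The three counters added to vmstat_text above show up in /proc/vmstat, so the effect of the new catnap and skip-congestion-wait logic can be observed from userspace. The small reader below prints just those fields; only the counter names come from this patch, the rest is plain illustration code.

#include <stdio.h>
#include <string.h>

/* Print the kswapd-related counters this patch adds to /proc/vmstat. */
int main(void)
{
	static const char *keys[] = {
		"kswapd_low_wmark_hit_quickly",
		"kswapd_high_wmark_hit_quickly",
		"kswapd_skip_congestion_wait",
	};
	char name[64];
	unsigned long long value;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%63s %llu", name, &value) == 2) {
		for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strcmp(name, keys[i]))
				printf("%s %llu\n", name, value);
	}
	fclose(f);
	return 0;
}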