Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              65
-rw-r--r--  mm/Kconfig.debug         1
-rw-r--r--  mm/Makefile              4
-rw-r--r--  mm/bootmem.c            12
-rw-r--r--  mm/bounce.c             10
-rw-r--r--  mm/fadvise.c             2
-rw-r--r--  mm/filemap.c           175
-rw-r--r--  mm/highmem.c             1
-rw-r--r--  mm/hugetlb.c           132
-rw-r--r--  mm/init-mm.c            20
-rw-r--r--  mm/internal.h           33
-rw-r--r--  mm/kmemcheck.c         122
-rw-r--r--  mm/kmemleak-test.c     111
-rw-r--r--  mm/kmemleak.c         1498
-rw-r--r--  mm/maccess.c             2
-rw-r--r--  mm/madvise.c            26
-rw-r--r--  mm/memcontrol.c         63
-rw-r--r--  mm/memory.c            240
-rw-r--r--  mm/memory_hotplug.c      6
-rw-r--r--  mm/mempolicy.c         145
-rw-r--r--  mm/migrate.c             6
-rw-r--r--  mm/mlock.c              73
-rw-r--r--  mm/mmap.c               20
-rw-r--r--  mm/mmzone.c             15
-rw-r--r--  mm/mprotect.c            2
-rw-r--r--  mm/nommu.c              20
-rw-r--r--  mm/oom_kill.c          128
-rw-r--r--  mm/page-writeback.c     25
-rw-r--r--  mm/page_alloc.c        872
-rw-r--r--  mm/page_cgroup.c        17
-rw-r--r--  mm/page_io.c             2
-rw-r--r--  mm/pdflush.c            31
-rw-r--r--  mm/percpu.c            141
-rw-r--r--  mm/readahead.c         145
-rw-r--r--  mm/rmap.c               42
-rw-r--r--  mm/shmem.c              14
-rw-r--r--  mm/slab.c              280
-rw-r--r--  mm/slob.c               18
-rw-r--r--  mm/slub.c              173
-rw-r--r--  mm/swap.c               46
-rw-r--r--  mm/swap_state.c         19
-rw-r--r--  mm/swapfile.c          276
-rw-r--r--  mm/truncate.c           40
-rw-r--r--  mm/util.c               31
-rw-r--r--  mm/vmalloc.c            34
-rw-r--r--  mm/vmscan.c            378
-rw-r--r--  mm/vmstat.c             38
47 files changed, 4059 insertions, 1495 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 57971d2ab848..c948d4ca8bde 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP
128config MEMORY_HOTPLUG 128config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH || S390) 132 depends on (IA64 || X86 || PPC64 || SUPERH || S390)
133 133
134comment "Memory hotplug is currently incompatible with Software Suspend" 134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION 135 depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
136 136
137config MEMORY_HOTPLUG_SPARSE 137config MEMORY_HOTPLUG_SPARSE
138 def_bool y 138 def_bool y
@@ -203,25 +203,60 @@ config VIRT_TO_BUS
203 def_bool y 203 def_bool y
204 depends on !ARCH_NO_VIRT_TO_BUS 204 depends on !ARCH_NO_VIRT_TO_BUS
205 205
206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages"
208 default y
209 help
210 Keeps unevictable pages off of the active and inactive pageout
211 lists, so kswapd will not waste CPU time or have its balancing
212 algorithms thrown off by scanning these pages. Selecting this
213 will use one page flag and increase the code size a little,
214 say Y unless you know what you are doing.
215
216 See Documentation/vm/unevictable-lru.txt for more information.
217
218config HAVE_MLOCK 206config HAVE_MLOCK
219 bool 207 bool
220 default y if MMU=y 208 default y if MMU=y
221 209
222config HAVE_MLOCKED_PAGE_BIT 210config HAVE_MLOCKED_PAGE_BIT
223 bool 211 bool
224 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y 212 default y if HAVE_MLOCK=y
225 213
226config MMU_NOTIFIER 214config MMU_NOTIFIER
227 bool 215 bool
216
217config DEFAULT_MMAP_MIN_ADDR
218 int "Low address space to protect from user allocation"
219 default 4096
220 help
221 This is the portion of low virtual memory which should be protected
222 from userspace allocation. Keeping a user from writing to low pages
223 can help reduce the impact of kernel NULL pointer bugs.
224
225 For most ia64, ppc64 and x86 users with lots of address space
226 a value of 65536 is reasonable and should cause no problems.
227 On arm and other archs it should not be higher than 32768.
228 Programs which use vm86 functionality would either need additional
229 permissions from either the LSM or the capabilities module or have
230 this protection disabled.
231
232 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable.
234
235
236config NOMMU_INITIAL_TRIM_EXCESS
237 int "Turn on mmap() excess space trimming before booting"
238 depends on !MMU
239 default 1
240 help
241 The NOMMU mmap() frequently needs to allocate large contiguous chunks
242 of memory on which to store mappings, but it can only ask the system
243 allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
244 more than it requires. To deal with this, mmap() is able to trim off
245 the excess and return it to the allocator.
246
247 If trimming is enabled, the excess is trimmed off and returned to the
248 system allocator, which can cause extra fragmentation, particularly
249 if there are a lot of transient processes.
250
251 If trimming is disabled, the excess is kept, but not used, which for
252 long-term mappings means that the space is wasted.
253
254 Trimming can be dynamically controlled through a sysctl option
255 (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
256 excess pages there must be before trimming should occur, or zero if
257 no trimming is to occur.
258
259 This option specifies the initial value of this option. The default
260 of 1 says that all excess pages should be trimmed.
261
262 See Documentation/nommu-mmap.txt for more information.
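
As the DEFAULT_MMAP_MIN_ADDR help text above notes, the limit can be read and changed at runtime through /proc/sys/vm/mmap_min_addr. A minimal user-space sketch (editor's illustration, assuming a kernel built with this option) that reads the current value:

#include <stdio.h>

int main(void)
{
	unsigned long min_addr;
	FILE *f = fopen("/proc/sys/vm/mmap_min_addr", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%lu", &min_addr) != 1) {
		fprintf(stderr, "could not parse mmap_min_addr\n");
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("mmap() below 0x%lx is refused for unprivileged tasks\n", min_addr);
	return 0;
}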
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f260..aa99fd1f7109 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || !PPC && !SPARC
5 depends on !KMEMCHECK
5 ---help--- 6 ---help---
6 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b6015..5e0bd6426693 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15obj-y += init-mm.o
15 16
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
31obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
30obj-$(CONFIG_FAILSLAB) += failslab.o 32obj-$(CONFIG_FAILSLAB) += failslab.o
31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
32obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
@@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o
38endif 40endif
39obj-$(CONFIG_QUICKLIST) += quicklist.o 41obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
43obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
44obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de..282df0a09e6f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 unsigned long size, unsigned long align, 532 unsigned long size, unsigned long align,
533 unsigned long goal, unsigned long limit) 533 unsigned long goal, unsigned long limit)
534{ 534{
535 if (WARN_ON_ONCE(slab_is_available()))
536 return kzalloc(size, GFP_NOWAIT);
537
535#ifdef CONFIG_HAVE_ARCH_BOOTMEM 538#ifdef CONFIG_HAVE_ARCH_BOOTMEM
536 bootmem_data_t *p_bdata; 539 bootmem_data_t *p_bdata;
537 540
@@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
662void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 665void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
663 unsigned long align, unsigned long goal) 666 unsigned long align, unsigned long goal)
664{ 667{
668 if (WARN_ON_ONCE(slab_is_available()))
669 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
670
665 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 671 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
666} 672}
667 673
@@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
693{ 699{
694 void *ptr; 700 void *ptr;
695 701
702 if (WARN_ON_ONCE(slab_is_available()))
703 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
704
696 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 705 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
697 if (ptr) 706 if (ptr)
698 return ptr; 707 return ptr;
@@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
745void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 754void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
746 unsigned long align, unsigned long goal) 755 unsigned long align, unsigned long goal)
747{ 756{
757 if (WARN_ON_ONCE(slab_is_available()))
758 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
759
748 return ___alloc_bootmem_node(pgdat->bdata, size, align, 760 return ___alloc_bootmem_node(pgdat->bdata, size, align,
749 goal, ARCH_LOW_ADDRESS_LIMIT); 761 goal, ARCH_LOW_ADDRESS_LIMIT);
750} 762}
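
The bootmem hunks above all add the same guard: once slab_is_available(), a late bootmem call is flagged with WARN_ON_ONCE() and redirected to kzalloc()/kzalloc_node() with GFP_NOWAIT. A rough user-space model of that fallback pattern (editor's sketch; malloc/calloc and a flag stand in for the kernel allocators, and this version warns on every call rather than once):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int slab_up;                    /* stands in for slab_is_available() */
static unsigned char early_pool[4096]; /* stands in for the bootmem pool */
static size_t early_used;

static void *early_alloc(size_t size)
{
	void *p;

	if (slab_up) {
		/* late caller: warn and hand the request to the main allocator */
		fprintf(stderr, "warning: bootmem-style alloc after slab init\n");
		return calloc(1, size);  /* zeroed, like kzalloc(..., GFP_NOWAIT) */
	}
	if (early_used + size > sizeof(early_pool))
		return NULL;
	p = early_pool + early_used;
	early_used += size;
	return memset(p, 0, size);
}

int main(void)
{
	void *a = early_alloc(128);  /* served from the boot pool */
	slab_up = 1;
	void *b = early_alloc(128);  /* warns and falls back to calloc */
	printf("a=%p b=%p\n", a, b);
	free(b);
	return 0;
}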
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,17 +13,15 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/blktrace_api.h>
17#include <trace/block.h>
18#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
19 17
18#include <trace/events/block.h>
19
20#define POOL_SIZE 64 20#define POOL_SIZE 64
21#define ISA_POOL_SIZE 16 21#define ISA_POOL_SIZE 16
22 22
23static mempool_t *page_pool, *isa_page_pool; 23static mempool_t *page_pool, *isa_page_pool;
24 24
25DEFINE_TRACE(block_bio_bounce);
26
27#ifdef CONFIG_HIGHMEM 25#ifdef CONFIG_HIGHMEM
28static __init int init_emergency_pool(void) 26static __init int init_emergency_pool(void)
29{ 27{
@@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 190 /*
193 * is destination page below bounce pfn? 191 * is destination page below bounce pfn?
194 */ 192 */
195 if (page_to_pfn(page) <= q->bounce_pfn) 193 if (page_to_pfn(page) <= queue_bounce_pfn(q))
196 continue; 194 continue;
197 195
198 /* 196 /*
@@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
284 * don't waste time iterating over bio segments 282 * don't waste time iterating over bio segments
285 */ 283 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 284 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (q->bounce_pfn >= blk_max_pfn) 285 if (queue_bounce_pfn(q) >= blk_max_pfn)
288 return; 286 return;
289 pool = page_pool; 287 pool = page_pool;
290 } else { 288 } else {
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
101 101
102 ret = force_page_cache_readahead(mapping, file, 102 ret = force_page_cache_readahead(mapping, file,
103 start_index, 103 start_index,
104 max_sane_readahead(nrpages)); 104 nrpages);
105 if (ret > 0) 105 if (ret > 0)
106 ret = 0; 106 ret = 0;
107 break; 107 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 379ff0bcbf6e..22396713feb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,7 +121,6 @@ void __remove_from_page_cache(struct page *page)
121 mapping->nrpages--; 121 mapping->nrpages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
123 BUG_ON(page_mapped(page)); 123 BUG_ON(page_mapped(page));
124 mem_cgroup_uncharge_cache_page(page);
125 124
126 /* 125 /*
127 * Some filesystems seem to re-dirty the page even after 126 * Some filesystems seem to re-dirty the page even after
@@ -145,6 +144,7 @@ void remove_from_page_cache(struct page *page)
145 spin_lock_irq(&mapping->tree_lock); 144 spin_lock_irq(&mapping->tree_lock);
146 __remove_from_page_cache(page); 145 __remove_from_page_cache(page);
147 spin_unlock_irq(&mapping->tree_lock); 146 spin_unlock_irq(&mapping->tree_lock);
147 mem_cgroup_uncharge_cache_page(page);
148} 148}
149 149
150static int sync_page(void *word) 150static int sync_page(void *word)
@@ -476,13 +476,13 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
476 if (likely(!error)) { 476 if (likely(!error)) {
477 mapping->nrpages++; 477 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 478 __inc_zone_page_state(page, NR_FILE_PAGES);
479 spin_unlock_irq(&mapping->tree_lock);
479 } else { 480 } else {
480 page->mapping = NULL; 481 page->mapping = NULL;
482 spin_unlock_irq(&mapping->tree_lock);
481 mem_cgroup_uncharge_cache_page(page); 483 mem_cgroup_uncharge_cache_page(page);
482 page_cache_release(page); 484 page_cache_release(page);
483 } 485 }
484
485 spin_unlock_irq(&mapping->tree_lock);
486 radix_tree_preload_end(); 486 radix_tree_preload_end();
487 } else 487 } else
488 mem_cgroup_uncharge_cache_page(page); 488 mem_cgroup_uncharge_cache_page(page);
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
521{ 521{
522 if (cpuset_do_page_mem_spread()) { 522 if (cpuset_do_page_mem_spread()) {
523 int n = cpuset_mem_spread_node(); 523 int n = cpuset_mem_spread_node();
524 return alloc_pages_node(n, gfp, 0); 524 return alloc_pages_exact_node(n, gfp, 0);
525 } 525 }
526 return alloc_pages(gfp, 0); 526 return alloc_pages(gfp, 0);
527} 527}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
1004static void shrink_readahead_size_eio(struct file *filp, 1004static void shrink_readahead_size_eio(struct file *filp,
1005 struct file_ra_state *ra) 1005 struct file_ra_state *ra)
1006{ 1006{
1007 if (!ra->ra_pages)
1008 return;
1009
1010 ra->ra_pages /= 4; 1007 ra->ra_pages /= 4;
1011} 1008}
1012 1009
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1390 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1387 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1391 return -EINVAL; 1388 return -EINVAL;
1392 1389
1393 force_page_cache_readahead(mapping, filp, index, 1390 force_page_cache_readahead(mapping, filp, index, nr);
1394 max_sane_readahead(nr));
1395 return 0; 1391 return 0;
1396} 1392}
1397 1393
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1457 1453
1458#define MMAP_LOTSAMISS (100) 1454#define MMAP_LOTSAMISS (100)
1459 1455
1456/*
1457 * Synchronous readahead happens when we don't even find
1458 * a page in the page cache at all.
1459 */
1460static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1461 struct file_ra_state *ra,
1462 struct file *file,
1463 pgoff_t offset)
1464{
1465 unsigned long ra_pages;
1466 struct address_space *mapping = file->f_mapping;
1467
1468 /* If we don't want any read-ahead, don't bother */
1469 if (VM_RandomReadHint(vma))
1470 return;
1471
1472 if (VM_SequentialReadHint(vma) ||
1473 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1474 page_cache_sync_readahead(mapping, ra, file, offset,
1475 ra->ra_pages);
1476 return;
1477 }
1478
1479 if (ra->mmap_miss < INT_MAX)
1480 ra->mmap_miss++;
1481
1482 /*
1483 * Do we miss much more than hit in this file? If so,
1484 * stop bothering with read-ahead. It will only hurt.
1485 */
1486 if (ra->mmap_miss > MMAP_LOTSAMISS)
1487 return;
1488
1489 /*
1490 * mmap read-around
1491 */
1492 ra_pages = max_sane_readahead(ra->ra_pages);
1493 if (ra_pages) {
1494 ra->start = max_t(long, 0, offset - ra_pages/2);
1495 ra->size = ra_pages;
1496 ra->async_size = 0;
1497 ra_submit(ra, mapping, file);
1498 }
1499}
1500
1501/*
1502 * Asynchronous readahead happens when we find the page and PG_readahead,
1503 * so we want to possibly extend the readahead further..
1504 */
1505static void do_async_mmap_readahead(struct vm_area_struct *vma,
1506 struct file_ra_state *ra,
1507 struct file *file,
1508 struct page *page,
1509 pgoff_t offset)
1510{
1511 struct address_space *mapping = file->f_mapping;
1512
1513 /* If we don't want any read-ahead, don't bother */
1514 if (VM_RandomReadHint(vma))
1515 return;
1516 if (ra->mmap_miss > 0)
1517 ra->mmap_miss--;
1518 if (PageReadahead(page))
1519 page_cache_async_readahead(mapping, ra, file,
1520 page, offset, ra->ra_pages);
1521}
1522
1460/** 1523/**
1461 * filemap_fault - read in file data for page fault handling 1524 * filemap_fault - read in file data for page fault handling
1462 * @vma: vma in which the fault was taken 1525 * @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1476 struct address_space *mapping = file->f_mapping; 1539 struct address_space *mapping = file->f_mapping;
1477 struct file_ra_state *ra = &file->f_ra; 1540 struct file_ra_state *ra = &file->f_ra;
1478 struct inode *inode = mapping->host; 1541 struct inode *inode = mapping->host;
1542 pgoff_t offset = vmf->pgoff;
1479 struct page *page; 1543 struct page *page;
1480 pgoff_t size; 1544 pgoff_t size;
1481 int did_readaround = 0;
1482 int ret = 0; 1545 int ret = 0;
1483 1546
1484 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1547 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1485 if (vmf->pgoff >= size) 1548 if (offset >= size)
1486 return VM_FAULT_SIGBUS; 1549 return VM_FAULT_SIGBUS;
1487 1550
1488 /* If we don't want any read-ahead, don't bother */
1489 if (VM_RandomReadHint(vma))
1490 goto no_cached_page;
1491
1492 /* 1551 /*
1493 * Do we have something in the page cache already? 1552 * Do we have something in the page cache already?
1494 */ 1553 */
1495retry_find: 1554 page = find_get_page(mapping, offset);
1496 page = find_lock_page(mapping, vmf->pgoff); 1555 if (likely(page)) {
1497 /*
1498 * For sequential accesses, we use the generic readahead logic.
1499 */
1500 if (VM_SequentialReadHint(vma)) {
1501 if (!page) {
1502 page_cache_sync_readahead(mapping, ra, file,
1503 vmf->pgoff, 1);
1504 page = find_lock_page(mapping, vmf->pgoff);
1505 if (!page)
1506 goto no_cached_page;
1507 }
1508 if (PageReadahead(page)) {
1509 page_cache_async_readahead(mapping, ra, file, page,
1510 vmf->pgoff, 1);
1511 }
1512 }
1513
1514 if (!page) {
1515 unsigned long ra_pages;
1516
1517 ra->mmap_miss++;
1518
1519 /* 1556 /*
1520 * Do we miss much more than hit in this file? If so, 1557 * We found the page, so try async readahead before
1521 * stop bothering with read-ahead. It will only hurt. 1558 * waiting for the lock.
1522 */ 1559 */
1523 if (ra->mmap_miss > MMAP_LOTSAMISS) 1560 do_async_mmap_readahead(vma, ra, file, page, offset);
1524 goto no_cached_page; 1561 lock_page(page);
1525 1562
1526 /* 1563 /* Did it get truncated? */
1527 * To keep the pgmajfault counter straight, we need to 1564 if (unlikely(page->mapping != mapping)) {
1528 * check did_readaround, as this is an inner loop. 1565 unlock_page(page);
1529 */ 1566 put_page(page);
1530 if (!did_readaround) { 1567 goto no_cached_page;
1531 ret = VM_FAULT_MAJOR;
1532 count_vm_event(PGMAJFAULT);
1533 }
1534 did_readaround = 1;
1535 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1536 if (ra_pages) {
1537 pgoff_t start = 0;
1538
1539 if (vmf->pgoff > ra_pages / 2)
1540 start = vmf->pgoff - ra_pages / 2;
1541 do_page_cache_readahead(mapping, file, start, ra_pages);
1542 } 1568 }
1543 page = find_lock_page(mapping, vmf->pgoff); 1569 } else {
1570 /* No page in the page cache at all */
1571 do_sync_mmap_readahead(vma, ra, file, offset);
1572 count_vm_event(PGMAJFAULT);
1573 ret = VM_FAULT_MAJOR;
1574retry_find:
1575 page = find_lock_page(mapping, offset);
1544 if (!page) 1576 if (!page)
1545 goto no_cached_page; 1577 goto no_cached_page;
1546 } 1578 }
1547 1579
1548 if (!did_readaround)
1549 ra->mmap_miss--;
1550
1551 /* 1580 /*
1552 * We have a locked page in the page cache, now we need to check 1581 * We have a locked page in the page cache, now we need to check
1553 * that it's up-to-date. If not, it is going to be due to an error. 1582 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
1555 if (unlikely(!PageUptodate(page))) 1584 if (unlikely(!PageUptodate(page)))
1556 goto page_not_uptodate; 1585 goto page_not_uptodate;
1557 1586
1558 /* Must recheck i_size under page lock */ 1587 /*
1588 * Found the page and have a reference on it.
1589 * We must recheck i_size under page lock.
1590 */
1559 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1591 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1560 if (unlikely(vmf->pgoff >= size)) { 1592 if (unlikely(offset >= size)) {
1561 unlock_page(page); 1593 unlock_page(page);
1562 page_cache_release(page); 1594 page_cache_release(page);
1563 return VM_FAULT_SIGBUS; 1595 return VM_FAULT_SIGBUS;
1564 } 1596 }
1565 1597
1566 /* 1598 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1567 * Found the page and have a reference on it.
1568 */
1569 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1570 vmf->page = page; 1599 vmf->page = page;
1571 return ret | VM_FAULT_LOCKED; 1600 return ret | VM_FAULT_LOCKED;
1572 1601
@@ -1575,7 +1604,7 @@ no_cached_page:
1575 * We're only likely to ever get here if MADV_RANDOM is in 1604 * We're only likely to ever get here if MADV_RANDOM is in
1576 * effect. 1605 * effect.
1577 */ 1606 */
1578 error = page_cache_read(file, vmf->pgoff); 1607 error = page_cache_read(file, offset);
1579 1608
1580 /* 1609 /*
1581 * The page we want has now been added to the page cache. 1610 * The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
1595 return VM_FAULT_SIGBUS; 1624 return VM_FAULT_SIGBUS;
1596 1625
1597page_not_uptodate: 1626page_not_uptodate:
1598 /* IO error path */
1599 if (!did_readaround) {
1600 ret = VM_FAULT_MAJOR;
1601 count_vm_event(PGMAJFAULT);
1602 }
1603
1604 /* 1627 /*
1605 * Umm, take care of errors if the page isn't up-to-date. 1628 * Umm, take care of errors if the page isn't up-to-date.
1606 * Try to re-read it _once_. We do this synchronously, 1629 * Try to re-read it _once_. We do this synchronously,
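
The mmap read-ahead logic is split out above into do_sync_mmap_readahead() (page-cache miss) and do_async_mmap_readahead() (page found with PG_readahead set), with the mmap_miss counter suppressing read-around once more than MMAP_LOTSAMISS faults have missed. A simplified user-space model of the sync-side decision (editor's sketch; the struct, hint values and return convention are stand-ins, not the kernel's):

#include <stdio.h>
#include <limits.h>

#define MMAP_LOTSAMISS 100

struct ra_state { unsigned long ra_pages; int mmap_miss; };

enum hint { HINT_NONE, HINT_RANDOM, HINT_SEQUENTIAL };

/* Returns how many pages to read around the faulting offset, or 0. */
static unsigned long sync_readahead(struct ra_state *ra, enum hint h)
{
	if (h == HINT_RANDOM)                /* MADV_RANDOM: no readahead */
		return 0;
	if (h == HINT_SEQUENTIAL)            /* MADV_SEQUENTIAL: full window */
		return ra->ra_pages;
	if (ra->mmap_miss < INT_MAX)
		ra->mmap_miss++;
	if (ra->mmap_miss > MMAP_LOTSAMISS)  /* mostly missing: stop trying */
		return 0;
	return ra->ra_pages;                 /* read-around, centred on offset */
}

int main(void)
{
	struct ra_state ra = { .ra_pages = 32, .mmap_miss = 0 };

	printf("first miss -> readahead %lu pages\n", sync_readahead(&ra, HINT_NONE));
	ra.mmap_miss = MMAP_LOTSAMISS + 1;
	printf("after many misses -> %lu pages\n", sync_readahead(&ra, HINT_NONE));
	return 0;
}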
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..25878cc49daa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/blktrace_api.h>
30#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
31 30
32/* 31/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 28c655ba9353..a56e6f3ce979 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -316,7 +316,7 @@ static void resv_map_release(struct kref *ref)
316static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 316static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
317{ 317{
318 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 318 VM_BUG_ON(!is_vm_hugetlb_page(vma));
319 if (!(vma->vm_flags & VM_SHARED)) 319 if (!(vma->vm_flags & VM_MAYSHARE))
320 return (struct resv_map *)(get_vma_private_data(vma) & 320 return (struct resv_map *)(get_vma_private_data(vma) &
321 ~HPAGE_RESV_MASK); 321 ~HPAGE_RESV_MASK);
322 return NULL; 322 return NULL;
@@ -325,7 +325,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
325static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 325static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
326{ 326{
327 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 327 VM_BUG_ON(!is_vm_hugetlb_page(vma));
328 VM_BUG_ON(vma->vm_flags & VM_SHARED); 328 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
329 329
330 set_vma_private_data(vma, (get_vma_private_data(vma) & 330 set_vma_private_data(vma, (get_vma_private_data(vma) &
331 HPAGE_RESV_MASK) | (unsigned long)map); 331 HPAGE_RESV_MASK) | (unsigned long)map);
@@ -334,7 +334,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
334static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 334static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
335{ 335{
336 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 336 VM_BUG_ON(!is_vm_hugetlb_page(vma));
337 VM_BUG_ON(vma->vm_flags & VM_SHARED); 337 VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
338 338
339 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 339 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
340} 340}
@@ -353,7 +353,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
353 if (vma->vm_flags & VM_NORESERVE) 353 if (vma->vm_flags & VM_NORESERVE)
354 return; 354 return;
355 355
356 if (vma->vm_flags & VM_SHARED) { 356 if (vma->vm_flags & VM_MAYSHARE) {
357 /* Shared mappings always use reserves */ 357 /* Shared mappings always use reserves */
358 h->resv_huge_pages--; 358 h->resv_huge_pages--;
359 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 359 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
@@ -369,14 +369,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h,
369void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 369void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
370{ 370{
371 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 371 VM_BUG_ON(!is_vm_hugetlb_page(vma));
372 if (!(vma->vm_flags & VM_SHARED)) 372 if (!(vma->vm_flags & VM_MAYSHARE))
373 vma->vm_private_data = (void *)0; 373 vma->vm_private_data = (void *)0;
374} 374}
375 375
376/* Returns true if the VMA has associated reserve pages */ 376/* Returns true if the VMA has associated reserve pages */
377static int vma_has_reserves(struct vm_area_struct *vma) 377static int vma_has_reserves(struct vm_area_struct *vma)
378{ 378{
379 if (vma->vm_flags & VM_SHARED) 379 if (vma->vm_flags & VM_MAYSHARE)
380 return 1; 380 return 1;
381 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 381 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
382 return 1; 382 return 1;
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
578 hugetlb_put_quota(mapping, 1); 578 hugetlb_put_quota(mapping, 1);
579} 579}
580 580
581/*
582 * Increment or decrement surplus_huge_pages. Keep node-specific counters
583 * balanced by operating on them in a round-robin fashion.
584 * Returns 1 if an adjustment was made.
585 */
586static int adjust_pool_surplus(struct hstate *h, int delta)
587{
588 static int prev_nid;
589 int nid = prev_nid;
590 int ret = 0;
591
592 VM_BUG_ON(delta != -1 && delta != 1);
593 do {
594 nid = next_node(nid, node_online_map);
595 if (nid == MAX_NUMNODES)
596 nid = first_node(node_online_map);
597
598 /* To shrink on this node, there must be a surplus page */
599 if (delta < 0 && !h->surplus_huge_pages_node[nid])
600 continue;
601 /* Surplus cannot exceed the total number of pages */
602 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
603 h->nr_huge_pages_node[nid])
604 continue;
605
606 h->surplus_huge_pages += delta;
607 h->surplus_huge_pages_node[nid] += delta;
608 ret = 1;
609 break;
610 } while (nid != prev_nid);
611
612 prev_nid = nid;
613 return ret;
614}
615
616static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 581static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
617{ 582{
618 set_compound_page_dtor(page, free_huge_page); 583 set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
623 put_page(page); /* free it into the hugepage allocator */ 588 put_page(page); /* free it into the hugepage allocator */
624} 589}
625 590
591static void prep_compound_gigantic_page(struct page *page, unsigned long order)
592{
593 int i;
594 int nr_pages = 1 << order;
595 struct page *p = page + 1;
596
597 /* we rely on prep_new_huge_page to set the destructor */
598 set_compound_order(page, order);
599 __SetPageHead(page);
600 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
601 __SetPageTail(p);
602 p->first_page = page;
603 }
604}
605
606int PageHuge(struct page *page)
607{
608 compound_page_dtor *dtor;
609
610 if (!PageCompound(page))
611 return 0;
612
613 page = compound_head(page);
614 dtor = get_compound_page_dtor(page);
615
616 return dtor == free_huge_page;
617}
618
626static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 619static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
627{ 620{
628 struct page *page; 621 struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
630 if (h->order >= MAX_ORDER) 623 if (h->order >= MAX_ORDER)
631 return NULL; 624 return NULL;
632 625
633 page = alloc_pages_node(nid, 626 page = alloc_pages_exact_node(nid,
634 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 627 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
635 __GFP_REPEAT|__GFP_NOWARN, 628 __GFP_REPEAT|__GFP_NOWARN,
636 huge_page_order(h)); 629 huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 * Use a helper variable to find the next node and then 642 * Use a helper variable to find the next node and then
650 * copy it back to hugetlb_next_nid afterwards: 643 * copy it back to hugetlb_next_nid afterwards:
651 * otherwise there's a window in which a racer might 644 * otherwise there's a window in which a racer might
652 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 645 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
653 * But we don't need to use a spin_lock here: it really 646 * But we don't need to use a spin_lock here: it really
654 * doesn't matter if occasionally a racer chooses the 647 * doesn't matter if occasionally a racer chooses the
655 * same nid as we do. Move nid forward in the mask even 648 * same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
875 * can no longer free unreserved surplus pages. This occurs when 868 * can no longer free unreserved surplus pages. This occurs when
876 * the nodes with surplus pages have no free pages. 869 * the nodes with surplus pages have no free pages.
877 */ 870 */
878 unsigned long remaining_iterations = num_online_nodes(); 871 unsigned long remaining_iterations = nr_online_nodes;
879 872
880 /* Uncommit the reservation */ 873 /* Uncommit the reservation */
881 h->resv_huge_pages -= unused_resv_pages; 874 h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
904 h->surplus_huge_pages--; 897 h->surplus_huge_pages--;
905 h->surplus_huge_pages_node[nid]--; 898 h->surplus_huge_pages_node[nid]--;
906 nr_pages--; 899 nr_pages--;
907 remaining_iterations = num_online_nodes(); 900 remaining_iterations = nr_online_nodes;
908 } 901 }
909 } 902 }
910} 903}
@@ -924,7 +917,7 @@ static long vma_needs_reservation(struct hstate *h,
924 struct address_space *mapping = vma->vm_file->f_mapping; 917 struct address_space *mapping = vma->vm_file->f_mapping;
925 struct inode *inode = mapping->host; 918 struct inode *inode = mapping->host;
926 919
927 if (vma->vm_flags & VM_SHARED) { 920 if (vma->vm_flags & VM_MAYSHARE) {
928 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 921 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
929 return region_chg(&inode->i_mapping->private_list, 922 return region_chg(&inode->i_mapping->private_list,
930 idx, idx + 1); 923 idx, idx + 1);
@@ -949,7 +942,7 @@ static void vma_commit_reservation(struct hstate *h,
949 struct address_space *mapping = vma->vm_file->f_mapping; 942 struct address_space *mapping = vma->vm_file->f_mapping;
950 struct inode *inode = mapping->host; 943 struct inode *inode = mapping->host;
951 944
952 if (vma->vm_flags & VM_SHARED) { 945 if (vma->vm_flags & VM_MAYSHARE) {
953 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 946 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
954 region_add(&inode->i_mapping->private_list, idx, idx + 1); 947 region_add(&inode->i_mapping->private_list, idx, idx + 1);
955 948
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140} 1133}
1141#endif 1134#endif
1142 1135
1136/*
1137 * Increment or decrement surplus_huge_pages. Keep node-specific counters
1138 * balanced by operating on them in a round-robin fashion.
1139 * Returns 1 if an adjustment was made.
1140 */
1141static int adjust_pool_surplus(struct hstate *h, int delta)
1142{
1143 static int prev_nid;
1144 int nid = prev_nid;
1145 int ret = 0;
1146
1147 VM_BUG_ON(delta != -1 && delta != 1);
1148 do {
1149 nid = next_node(nid, node_online_map);
1150 if (nid == MAX_NUMNODES)
1151 nid = first_node(node_online_map);
1152
1153 /* To shrink on this node, there must be a surplus page */
1154 if (delta < 0 && !h->surplus_huge_pages_node[nid])
1155 continue;
1156 /* Surplus cannot exceed the total number of pages */
1157 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
1158 h->nr_huge_pages_node[nid])
1159 continue;
1160
1161 h->surplus_huge_pages += delta;
1162 h->surplus_huge_pages_node[nid] += delta;
1163 ret = 1;
1164 break;
1165 } while (nid != prev_nid);
1166
1167 prev_nid = nid;
1168 return ret;
1169}
1170
1143#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1171#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1144static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1172static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1145{ 1173{
@@ -1893,7 +1921,7 @@ retry_avoidcopy:
1893 * at the time of fork() could consume its reserves on COW instead 1921 * at the time of fork() could consume its reserves on COW instead
1894 * of the full address range. 1922 * of the full address range.
1895 */ 1923 */
1896 if (!(vma->vm_flags & VM_SHARED) && 1924 if (!(vma->vm_flags & VM_MAYSHARE) &&
1897 is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 1925 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1898 old_page != pagecache_page) 1926 old_page != pagecache_page)
1899 outside_reserve = 1; 1927 outside_reserve = 1;
@@ -2000,7 +2028,7 @@ retry:
2000 clear_huge_page(page, address, huge_page_size(h)); 2028 clear_huge_page(page, address, huge_page_size(h));
2001 __SetPageUptodate(page); 2029 __SetPageUptodate(page);
2002 2030
2003 if (vma->vm_flags & VM_SHARED) { 2031 if (vma->vm_flags & VM_MAYSHARE) {
2004 int err; 2032 int err;
2005 struct inode *inode = mapping->host; 2033 struct inode *inode = mapping->host;
2006 2034
@@ -2104,7 +2132,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2104 goto out_mutex; 2132 goto out_mutex;
2105 } 2133 }
2106 2134
2107 if (!(vma->vm_flags & VM_SHARED)) 2135 if (!(vma->vm_flags & VM_MAYSHARE))
2108 pagecache_page = hugetlbfs_pagecache_page(h, 2136 pagecache_page = hugetlbfs_pagecache_page(h,
2109 vma, address); 2137 vma, address);
2110 } 2138 }
@@ -2289,7 +2317,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2289 * to reserve the full area even if read-only as mprotect() may be 2317 * to reserve the full area even if read-only as mprotect() may be
2290 * called to make the mapping read-write. Assume !vma is a shm mapping 2318 * called to make the mapping read-write. Assume !vma is a shm mapping
2291 */ 2319 */
2292 if (!vma || vma->vm_flags & VM_SHARED) 2320 if (!vma || vma->vm_flags & VM_MAYSHARE)
2293 chg = region_chg(&inode->i_mapping->private_list, from, to); 2321 chg = region_chg(&inode->i_mapping->private_list, from, to);
2294 else { 2322 else {
2295 struct resv_map *resv_map = resv_map_alloc(); 2323 struct resv_map *resv_map = resv_map_alloc();
@@ -2330,7 +2358,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2330 * consumed reservations are stored in the map. Hence, nothing 2358 * consumed reservations are stored in the map. Hence, nothing
2331 * else has to be done for private mappings here 2359 * else has to be done for private mappings here
2332 */ 2360 */
2333 if (!vma || vma->vm_flags & VM_SHARED) 2361 if (!vma || vma->vm_flags & VM_MAYSHARE)
2334 region_add(&inode->i_mapping->private_list, from, to); 2362 region_add(&inode->i_mapping->private_list, from, to);
2335 return 0; 2363 return 0;
2336} 2364}
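
Besides switching the reservation logic from VM_SHARED to VM_MAYSHARE, the hunk above adds PageHuge(), which identifies a hugetlb page by comparing the compound page's destructor against free_huge_page. A small user-space sketch of that identify-by-destructor technique (editor's illustration; the types are stand-ins for struct page):

#include <stdio.h>

struct fake_page {
	int compound;                       /* models PageCompound() */
	void (*dtor)(struct fake_page *);   /* models the compound destructor */
};

static void free_huge_page(struct fake_page *p)     { (void)p; }
static void free_compound_page(struct fake_page *p) { (void)p; }

static int page_huge(struct fake_page *page)
{
	if (!page->compound)
		return 0;
	return page->dtor == free_huge_page;  /* type recognised via its dtor */
}

int main(void)
{
	struct fake_page huge  = { 1, free_huge_page };
	struct fake_page other = { 1, free_compound_page };

	printf("huge: %d, other compound: %d\n", page_huge(&huge), page_huge(&other));
	return 0;
}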
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
1#include <linux/mm_types.h>
2#include <linux/rbtree.h>
3#include <linux/rwsem.h>
4#include <linux/spinlock.h>
5#include <linux/list.h>
6#include <linux/cpumask.h>
7
8#include <asm/atomic.h>
9#include <asm/pgtable.h>
10
11struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT,
13 .pgd = swapper_pg_dir,
14 .mm_users = ATOMIC_INIT(2),
15 .mm_count = ATOMIC_INIT(1),
16 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL,
20};
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..f290c4db528b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19extern void prep_compound_page(struct page *page, unsigned long order);
20extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
21
22static inline void set_page_count(struct page *page, int v) 19static inline void set_page_count(struct page *page, int v)
23{ 20{
24 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
51 */ 48 */
52extern unsigned long highest_memmap_pfn; 49extern unsigned long highest_memmap_pfn;
53extern void __free_pages_bootmem(struct page *page, unsigned int order); 50extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order);
52
54 53
55/* 54/*
56 * function for dealing with page's order in buddy system. 55 * function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
74} 73}
75#endif 74#endif
76 75
77#ifdef CONFIG_UNEVICTABLE_LRU
78/* 76/*
79 * unevictable_migrate_page() called only from migrate_page_copy() to 77 * unevictable_migrate_page() called only from migrate_page_copy() to
80 * migrate unevictable flag to new page. 78 * migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
86 if (TestClearPageUnevictable(old)) 84 if (TestClearPageUnevictable(old))
87 SetPageUnevictable(new); 85 SetPageUnevictable(new);
88} 86}
89#else
90static inline void unevictable_migrate_page(struct page *new, struct page *old)
91{
92}
93#endif
94 87
95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT 88#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
96/* 89/*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
150 } 143 }
151} 144}
152 145
153/*
154 * free_page_mlock() -- clean up attempts to free and mlocked() page.
155 * Page should not be on lru, so no need to fix that up.
156 * free_pages_check() will verify...
157 */
158static inline void free_page_mlock(struct page *page)
159{
160 if (unlikely(TestClearPageMlocked(page))) {
161 unsigned long flags;
162
163 local_irq_save(flags);
164 __dec_zone_page_state(page, NR_MLOCK);
165 __count_vm_event(UNEVICTABLE_MLOCKFREED);
166 local_irq_restore(flags);
167 }
168}
169
170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 147static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
172{ 148{
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
175static inline void clear_page_mlock(struct page *page) { } 151static inline void clear_page_mlock(struct page *page) { }
176static inline void mlock_vma_page(struct page *page) { } 152static inline void mlock_vma_page(struct page *page) { }
177static inline void mlock_migrate_page(struct page *new, struct page *old) { } 153static inline void mlock_migrate_page(struct page *new, struct page *old) { }
178static inline void free_page_mlock(struct page *page) { }
179 154
180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 155#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
181 156
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
284 unsigned long start, int len, int flags, 259 unsigned long start, int len, int flags,
285 struct page **pages, struct vm_area_struct **vmas); 260 struct page **pages, struct vm_area_struct **vmas);
286 261
262#define ZONE_RECLAIM_NOSCAN -2
263#define ZONE_RECLAIM_FULL -1
264#define ZONE_RECLAIM_SOME 0
265#define ZONE_RECLAIM_SUCCESS 1
287#endif 266#endif
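
The four ZONE_RECLAIM_* codes added above give zone reclaim a richer return value than a plain boolean; the consumer (presumably zone_reclaim() in mm/vmscan.c) is outside this excerpt. A hypothetical caller sketch (editor's illustration; the result strings are a paraphrase, not kernel text):

#include <stdio.h>

#define ZONE_RECLAIM_NOSCAN	-2	/* reclaim did not scan the zone */
#define ZONE_RECLAIM_FULL	-1	/* zone treated as full, scan skipped */
#define ZONE_RECLAIM_SOME	0	/* scanned, but not enough was freed */
#define ZONE_RECLAIM_SUCCESS	1	/* enough pages were reclaimed */

static const char *reclaim_result(int ret)
{
	switch (ret) {
	case ZONE_RECLAIM_NOSCAN:  return "not scanned";
	case ZONE_RECLAIM_FULL:    return "skipped, zone considered full";
	case ZONE_RECLAIM_SOME:    return "scanned, still fall back to other zones";
	case ZONE_RECLAIM_SUCCESS: return "reclaimed enough, retry this zone";
	default:                   return "unexpected value";
	}
}

int main(void)
{
	for (int ret = ZONE_RECLAIM_NOSCAN; ret <= ZONE_RECLAIM_SUCCESS; ret++)
		printf("%2d -> %s\n", ret, reclaim_result(ret));
	return 0;
}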
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
1#include <linux/gfp.h>
2#include <linux/mm_types.h>
3#include <linux/mm.h>
4#include <linux/slab.h>
5#include <linux/kmemcheck.h>
6
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
8{
9 struct page *shadow;
10 int pages;
11 int i;
12
13 pages = 1 << order;
14
15 /*
16 * With kmemcheck enabled, we need to allocate a memory area for the
17 * shadow bits as well.
18 */
19 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
20 if (!shadow) {
21 if (printk_ratelimit())
22 printk(KERN_ERR "kmemcheck: failed to allocate "
23 "shadow bitmap\n");
24 return;
25 }
26
27 for(i = 0; i < pages; ++i)
28 page[i].shadow = page_address(&shadow[i]);
29
30 /*
31 * Mark it as non-present for the MMU so that our accesses to
32 * this memory will trigger a page fault and let us analyze
33 * the memory accesses.
34 */
35 kmemcheck_hide_pages(page, pages);
36}
37
38void kmemcheck_free_shadow(struct page *page, int order)
39{
40 struct page *shadow;
41 int pages;
42 int i;
43
44 if (!kmemcheck_page_is_tracked(page))
45 return;
46
47 pages = 1 << order;
48
49 kmemcheck_show_pages(page, pages);
50
51 shadow = virt_to_page(page[0].shadow);
52
53 for(i = 0; i < pages; ++i)
54 page[i].shadow = NULL;
55
56 __free_pages(shadow, order);
57}
58
59void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
60 size_t size)
61{
62 /*
63 * Has already been memset(), which initializes the shadow for us
64 * as well.
65 */
66 if (gfpflags & __GFP_ZERO)
67 return;
68
69 /* No need to initialize the shadow of a non-tracked slab. */
70 if (s->flags & SLAB_NOTRACK)
71 return;
72
73 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
74 /*
75 * Allow notracked objects to be allocated from
76 * tracked caches. Note however that these objects
77 * will still get page faults on access, they just
78 * won't ever be flagged as uninitialized. If page
79 * faults are not acceptable, the slab cache itself
80 * should be marked NOTRACK.
81 */
82 kmemcheck_mark_initialized(object, size);
83 } else if (!s->ctor) {
84 /*
85 * New objects should be marked uninitialized before
86 * they're returned to the caller.
87 */
88 kmemcheck_mark_uninitialized(object, size);
89 }
90}
91
92void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
93{
94 /* TODO: RCU freeing is unsupported for now; hide false positives. */
95 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
96 kmemcheck_mark_freed(object, size);
97}
98
99void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
100 gfp_t gfpflags)
101{
102 int pages;
103
104 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
105 return;
106
107 pages = 1 << order;
108
109 /*
110 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
111 * can become uninitialized by copying uninitialized memory
112 * into them.
113 */
114
115 /* XXX: Can use zone->node for node? */
116 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
117
118 if (gfpflags & __GFP_ZERO)
119 kmemcheck_mark_initialized_pages(page, pages);
120 else
121 kmemcheck_mark_uninitialized_pages(page, pages);
122}
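
kmemcheck pairs every tracked page with a shadow page and marks the tracked page non-present so that each access faults and can be checked against the shadow state. A loose user-space analogy of the shadow idea (editor's sketch, not the kmemcheck implementation, which relies on page faults rather than explicit checks):

#include <stdio.h>
#include <stdlib.h>

enum shadow { SHADOW_UNINIT, SHADOW_INIT };

struct tracked {
	unsigned char *data;
	unsigned char *shadow;  /* one shadow byte per data byte */
	size_t size;
};

static struct tracked tracked_alloc(size_t size)
{
	struct tracked t = { malloc(size), calloc(size, 1), size };
	return t;  /* shadow starts all SHADOW_UNINIT, like a fresh allocation */
}

static void tracked_write(struct tracked *t, size_t off, unsigned char v)
{
	t->data[off] = v;
	t->shadow[off] = SHADOW_INIT;   /* mark this byte initialized */
}

static int tracked_read(struct tracked *t, size_t off, unsigned char *out)
{
	if (t->shadow[off] == SHADOW_UNINIT) {
		fprintf(stderr, "read of uninitialized byte at offset %zu\n", off);
		return -1;
	}
	*out = t->data[off];
	return 0;
}

int main(void)
{
	struct tracked t = tracked_alloc(16);
	unsigned char v;

	tracked_write(&t, 0, 42);
	tracked_read(&t, 0, &v);   /* fine: byte 0 was written */
	tracked_read(&t, 1, &v);   /* reported: byte 1 never written */
	free(t.data);
	free(t.shadow);
	return 0;
}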
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 000000000000..d5292fc6f523
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
1/*
2 * mm/kmemleak-test.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/init.h>
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/vmalloc.h>
26#include <linux/list.h>
27#include <linux/percpu.h>
28#include <linux/fdtable.h>
29
30#include <linux/kmemleak.h>
31
32struct test_node {
33 long header[25];
34 struct list_head list;
35 long footer[25];
36};
37
38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer);
40
41/*
42 * Some very simple testing. This function needs to be extended for
43 * proper testing.
44 */
45static int __init kmemleak_test_init(void)
46{
47 struct test_node *elem;
48 int i;
49
50 printk(KERN_INFO "Kmemleak testing\n");
51
52 /* make some orphan objects */
53 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
54 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
55 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
56 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
57 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
58 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
59 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
60 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
61#ifndef CONFIG_MODULES
62 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
63 kmem_cache_alloc(files_cachep, GFP_KERNEL));
64 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
65 kmem_cache_alloc(files_cachep, GFP_KERNEL));
66#endif
67 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
68 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
69 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
70 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
71 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
72
73 /*
74 * Add elements to a list. They should only appear as orphan
75 * after the module is removed.
76 */
77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem)
81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list);
86 }
87
88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i));
92 }
93
94 return 0;
95}
96module_init(kmemleak_test_init);
97
98static void __exit kmemleak_test_exit(void)
99{
100 struct test_node *elem, *tmp;
101
102 /*
103 * Remove the list elements without actually freeing the
104 * memory.
105 */
106 list_for_each_entry_safe(elem, tmp, &test_list, list)
107 list_del(&elem->list);
108}
109module_exit(kmemleak_test_exit);
110
111MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 000000000000..58ec86c9e58a
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1498 @@
1/*
2 * mm/kmemleak.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 *
21 * For more information on the algorithm and kmemleak usage, please see
22 * Documentation/kmemleak.txt.
23 *
24 * Notes on locking
25 * ----------------
26 *
27 * The following locks and mutexes are used by kmemleak:
28 *
29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the
36 * kmemleak_alloc() callback and removed in delete_object() called from the
37 * kmemleak_free() callback
38 * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
39 * the metadata (e.g. count) are protected by this lock. Note that some
40 * members of this structure may be protected by other means (atomic or
41 * kmemleak_lock). This lock is also held when scanning the corresponding
42 * memory block to avoid the kernel freeing it via the kmemleak_free()
43 * callback. This is less heavyweight than holding a global lock like
44 * kmemleak_lock during scanning
45 * - scan_mutex (mutex): ensures that only one thread may scan the memory for
46 * unreferenced objects at a time. The gray_list contains the objects which
47 * are already referenced or marked as false positives and need to be
48 * scanned. This list is only modified during a scanning episode when the
49 * scan_mutex is held. At the end of a scan, the gray_list is always empty.
50 * Note that the kmemleak_object.use_count is incremented when an object is
51 * added to the gray_list and therefore cannot be freed
52 * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs
53 * file together with modifications to the memory scanning parameters
54 * including the scan_thread pointer
55 *
56 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes
58 * 0, this count can no longer be incremented and put_object() schedules the
59 * kmemleak_object freeing via an RCU callback. All calls to the get_object()
60 * function must be protected by rcu_read_lock() to avoid accessing a freed
61 * structure.
62 */
63
64#include <linux/init.h>
65#include <linux/kernel.h>
66#include <linux/list.h>
67#include <linux/sched.h>
68#include <linux/jiffies.h>
69#include <linux/delay.h>
70#include <linux/module.h>
71#include <linux/kthread.h>
72#include <linux/prio_tree.h>
73#include <linux/gfp.h>
74#include <linux/fs.h>
75#include <linux/debugfs.h>
76#include <linux/seq_file.h>
77#include <linux/cpumask.h>
78#include <linux/spinlock.h>
79#include <linux/mutex.h>
80#include <linux/rcupdate.h>
81#include <linux/stacktrace.h>
82#include <linux/cache.h>
83#include <linux/percpu.h>
84#include <linux/hardirq.h>
85#include <linux/mmzone.h>
86#include <linux/slab.h>
87#include <linux/thread_info.h>
88#include <linux/err.h>
89#include <linux/uaccess.h>
90#include <linux/string.h>
91#include <linux/nodemask.h>
92#include <linux/mm.h>
93
94#include <asm/sections.h>
95#include <asm/processor.h>
96#include <asm/atomic.h>
97
98#include <linux/kmemleak.h>
99
100/*
101 * Kmemleak configuration and common defines.
102 */
103#define MAX_TRACE 16 /* stack trace length */
104#define REPORTS_NR 50 /* maximum number of reported leaks */
105#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
106#define MSECS_SCAN_YIELD 10 /* CPU yielding period */
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109
110#define BYTES_PER_POINTER sizeof(void *)
111
112/* scanning area inside a memory block */
113struct kmemleak_scan_area {
114 struct hlist_node node;
115 unsigned long offset;
116 size_t length;
117};
118
119/*
120 * Structure holding the metadata for each allocated memory block.
121 * Modifications to such objects should be made while holding the
122 * object->lock. Insertions or deletions from object_list, gray_list or
123 * tree_node are already protected by the corresponding locks or mutex (see
124 * the notes on locking above). These objects are reference-counted
125 * (use_count) and freed using the RCU mechanism.
126 */
127struct kmemleak_object {
128 spinlock_t lock;
129 unsigned long flags; /* object status flags */
130 struct list_head object_list;
131 struct list_head gray_list;
132 struct prio_tree_node tree_node;
133 struct rcu_head rcu; /* object_list lockless traversal */
134 /* object usage count; object freed when use_count == 0 */
135 atomic_t use_count;
136 unsigned long pointer;
137 size_t size;
138 /* minimum number of pointers found before it is considered a leak */
139 int min_count;
140 /* the total number of pointers found pointing to this object */
141 int count;
142 /* memory ranges to be scanned inside an object (empty for all) */
143 struct hlist_head area_list;
144 unsigned long trace[MAX_TRACE];
145 unsigned int trace_len;
146 unsigned long jiffies; /* creation timestamp */
147 pid_t pid; /* pid of the current task */
148 char comm[TASK_COMM_LEN]; /* executable name */
149};
150
151/* flag representing the memory block allocation status */
152#define OBJECT_ALLOCATED (1 << 0)
153/* flag set after the first reporting of an unreferenced object */
154#define OBJECT_REPORTED (1 << 1)
155/* flag set to not scan the object */
156#define OBJECT_NO_SCAN (1 << 2)
157
158/* the list of all allocated objects */
159static LIST_HEAD(object_list);
160/* the list of gray-colored objects (see color_gray comment below) */
161static LIST_HEAD(gray_list);
162/* prio search tree for object boundaries */
163static struct prio_tree_root object_tree_root;
164/* rw_lock protecting the access to object_list and prio_tree_root */
165static DEFINE_RWLOCK(kmemleak_lock);
166
167/* allocation caches for kmemleak internal data */
168static struct kmem_cache *object_cache;
169static struct kmem_cache *scan_area_cache;
170
171/* set if tracing memory operations is enabled */
172static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
173/* set in the late_initcall if there were no errors */
174static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
175/* enables or disables early logging of the memory operations */
176static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
177/* set if a fatal kmemleak error has occurred */
178static atomic_t kmemleak_error = ATOMIC_INIT(0);
179
180/* minimum and maximum address that may be valid pointers */
181static unsigned long min_addr = ULONG_MAX;
182static unsigned long max_addr;
183
184/* used for yielding the CPU to other tasks during scanning */
185static unsigned long next_scan_yield;
186static struct task_struct *scan_thread;
187static unsigned long jiffies_scan_yield;
188static unsigned long jiffies_min_age;
189/* delay between automatic memory scans */
190static signed long jiffies_scan_wait;
191/* enables or disables the task stacks scanning */
192static int kmemleak_stack_scan;
193/* mutex protecting the memory scanning */
194static DEFINE_MUTEX(scan_mutex);
195/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
196static DEFINE_MUTEX(kmemleak_mutex);
197
198/* number of leaks reported (for limitation purposes) */
199static int reported_leaks;
200
201/*
202 * Early object allocation/freeing logging. Kmemleak is initialized after the
203 * kernel allocator. However, both the kernel allocator and kmemleak may
204 * allocate memory blocks which need to be tracked. Kmemleak defines an
205 * arbitrary buffer to hold the allocation/freeing information before it is
206 * fully initialized.
207 */
208
209/* kmemleak operation type for early logging */
210enum {
211 KMEMLEAK_ALLOC,
212 KMEMLEAK_FREE,
213 KMEMLEAK_NOT_LEAK,
214 KMEMLEAK_IGNORE,
215 KMEMLEAK_SCAN_AREA,
216 KMEMLEAK_NO_SCAN
217};
218
219/*
220 * Structure holding the information passed to kmemleak callbacks during the
221 * early logging.
222 */
223struct early_log {
224 int op_type; /* kmemleak operation type */
225 const void *ptr; /* allocated/freed memory block */
226 size_t size; /* memory block size */
227 int min_count; /* minimum reference count */
228 unsigned long offset; /* scan area offset */
229 size_t length; /* scan area length */
230};
231
232/* early logging buffer and current position */
233static struct early_log early_log[200];
234static int crt_early_log;
235
236static void kmemleak_disable(void);
237
238/*
239 * Print a warning and dump the stack trace.
240 */
241#define kmemleak_warn(x...) do { \
242 pr_warning(x); \
243 dump_stack(); \
244} while (0)
245
246/*
247 * Macro invoked when a serious kmemleak condition occurred and cannot be
248 * recovered from. Kmemleak will be disabled and further allocation/freeing
249 * tracing will no longer be available.
250 */
251#define kmemleak_panic(x...) do { \
252 kmemleak_warn(x); \
253 kmemleak_disable(); \
254} while (0)
255
256/*
257 * Object colors, encoded with count and min_count:
258 * - white - orphan object, not enough references to it (count < min_count)
259 * - gray - not orphan, not marked as false positive (min_count == 0) or
260 * sufficient references to it (count >= min_count)
261 * - black - ignore, it doesn't contain references (e.g. text section)
262 * (min_count == -1). No function defined for this color.
263 * Newly created objects don't have any color assigned (object->count == -1)
264 * before the next memory scan when they become white.
265 */
266static int color_white(const struct kmemleak_object *object)
267{
268 return object->count != -1 && object->count < object->min_count;
269}
270
271static int color_gray(const struct kmemleak_object *object)
272{
273 return object->min_count != -1 && object->count >= object->min_count;
274}
275
276/*
277 * Objects are considered referenced if their color is gray and they have not
278 * been deleted.
279 */
280static int referenced_object(struct kmemleak_object *object)
281{
282 return (object->flags & OBJECT_ALLOCATED) && color_gray(object);
283}
284
285/*
286 * Objects are considered unreferenced only if their color is white, they have
287 * not been deleted and have a minimum age to avoid false positives caused by
288 * pointers temporarily stored in CPU registers.
289 */
290static int unreferenced_object(struct kmemleak_object *object)
291{
292 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
293 time_is_before_eq_jiffies(object->jiffies + jiffies_min_age);
294}
295
296/*
297 * Printing of the (un)referenced objects information, either to the seq file
298 * or to the kernel log. The print_referenced/print_unreferenced functions
299 * must be called with the object->lock held.
300 */
301#define print_helper(seq, x...) do { \
302 struct seq_file *s = (seq); \
303 if (s) \
304 seq_printf(s, x); \
305 else \
306 pr_info(x); \
307} while (0)
308
309static void print_referenced(struct kmemleak_object *object)
310{
311 pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n",
312 object->pointer, object->size);
313}
314
315static void print_unreferenced(struct seq_file *seq,
316 struct kmemleak_object *object)
317{
318 int i;
319
320 print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n",
321 object->pointer, object->size);
322 print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n",
323 object->comm, object->pid, object->jiffies);
324 print_helper(seq, " backtrace:\n");
325
326 for (i = 0; i < object->trace_len; i++) {
327 void *ptr = (void *)object->trace[i];
328 print_helper(seq, " [<%p>] %pS\n", ptr, ptr);
329 }
330}
331
332/*
333 * Print the kmemleak_object information. This function is used mainly for
334 * debugging special cases of kmemleak operations. It must be called with
335 * the object->lock held.
336 */
337static void dump_object_info(struct kmemleak_object *object)
338{
339 struct stack_trace trace;
340
341 trace.nr_entries = object->trace_len;
342 trace.entries = object->trace;
343
344 pr_notice("kmemleak: Object 0x%08lx (size %zu):\n",
345 object->tree_node.start, object->size);
346 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
347 object->comm, object->pid, object->jiffies);
348 pr_notice(" min_count = %d\n", object->min_count);
349 pr_notice(" count = %d\n", object->count);
350 pr_notice(" backtrace:\n");
351 print_stack_trace(&trace, 4);
352}
353
354/*
355 * Look up the metadata (kmemleak_object) of a memory block in the priority search
356 * tree based on a pointer value. If alias is 0, only values pointing to the
357 * beginning of the memory block are allowed. The kmemleak_lock must be held
358 * when calling this function.
359 */
360static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
361{
362 struct prio_tree_node *node;
363 struct prio_tree_iter iter;
364 struct kmemleak_object *object;
365
366 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
367 node = prio_tree_next(&iter);
368 if (node) {
369 object = prio_tree_entry(node, struct kmemleak_object,
370 tree_node);
371 if (!alias && object->pointer != ptr) {
372 kmemleak_warn("kmemleak: Found object by alias");
373 object = NULL;
374 }
375 } else
376 object = NULL;
377
378 return object;
379}
380
381/*
382 * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
383 * that once an object's use_count reaches 0, the RCU freeing has already been
384 * registered and the object should no longer be used. This function must be
385 * called under the protection of rcu_read_lock().
386 */
387static int get_object(struct kmemleak_object *object)
388{
389 return atomic_inc_not_zero(&object->use_count);
390}
391
392/*
393 * RCU callback to free a kmemleak_object.
394 */
395static void free_object_rcu(struct rcu_head *rcu)
396{
397 struct hlist_node *elem, *tmp;
398 struct kmemleak_scan_area *area;
399 struct kmemleak_object *object =
400 container_of(rcu, struct kmemleak_object, rcu);
401
402 /*
403 * Once use_count is 0 (guaranteed by put_object), there is no other
404 * code accessing this object, hence no need for locking.
405 */
406 hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
407 hlist_del(elem);
408 kmem_cache_free(scan_area_cache, area);
409 }
410 kmem_cache_free(object_cache, object);
411}
412
413/*
414 * Decrement the object use_count. Once the count is 0, free the object using
415 * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
416 * delete_object() path, the delayed RCU freeing ensures that there is no
417 * recursive call to the kernel allocator. Lock-less RCU object_list traversal
418 * is also possible.
419 */
420static void put_object(struct kmemleak_object *object)
421{
422 if (!atomic_dec_and_test(&object->use_count))
423 return;
424
425 /* should only get here after delete_object was called */
426 WARN_ON(object->flags & OBJECT_ALLOCATED);
427
428 call_rcu(&object->rcu, free_object_rcu);
429}
430
431/*
432 * Look up an object in the prio search tree and increase its use_count.
433 */
434static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
435{
436 unsigned long flags;
437 struct kmemleak_object *object = NULL;
438
439 rcu_read_lock();
440 read_lock_irqsave(&kmemleak_lock, flags);
441 if (ptr >= min_addr && ptr < max_addr)
442 object = lookup_object(ptr, alias);
443 read_unlock_irqrestore(&kmemleak_lock, flags);
444
445 /* check whether the object is still available */
446 if (object && !get_object(object))
447 object = NULL;
448 rcu_read_unlock();
449
450 return object;
451}
452
453/*
454 * Create the metadata (struct kmemleak_object) corresponding to an allocated
455 * memory block and add it to the object_list and object_tree_root.
456 */
457static void create_object(unsigned long ptr, size_t size, int min_count,
458 gfp_t gfp)
459{
460 unsigned long flags;
461 struct kmemleak_object *object;
462 struct prio_tree_node *node;
463 struct stack_trace trace;
464
465 object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK);
466 if (!object) {
467 kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object "
468 "structure\n");
469 return;
470 }
471
472 INIT_LIST_HEAD(&object->object_list);
473 INIT_LIST_HEAD(&object->gray_list);
474 INIT_HLIST_HEAD(&object->area_list);
475 spin_lock_init(&object->lock);
476 atomic_set(&object->use_count, 1);
477 object->flags = OBJECT_ALLOCATED;
478 object->pointer = ptr;
479 object->size = size;
480 object->min_count = min_count;
481 object->count = -1; /* no color initially */
482 object->jiffies = jiffies;
483
484 /* task information */
485 if (in_irq()) {
486 object->pid = 0;
487 strncpy(object->comm, "hardirq", sizeof(object->comm));
488 } else if (in_softirq()) {
489 object->pid = 0;
490 strncpy(object->comm, "softirq", sizeof(object->comm));
491 } else {
492 object->pid = current->pid;
493 /*
494 * There is a small chance of a race with set_task_comm(),
495 * however using get_task_comm() here may cause locking
496 * dependency issues with current->alloc_lock. In the worst
497 * case, the command line is not correct.
498 */
499 strncpy(object->comm, current->comm, sizeof(object->comm));
500 }
501
502 /* kernel backtrace */
503 trace.max_entries = MAX_TRACE;
504 trace.nr_entries = 0;
505 trace.entries = object->trace;
506 trace.skip = 1;
507 save_stack_trace(&trace);
508 object->trace_len = trace.nr_entries;
509
510 INIT_PRIO_TREE_NODE(&object->tree_node);
511 object->tree_node.start = ptr;
512 object->tree_node.last = ptr + size - 1;
513
514 write_lock_irqsave(&kmemleak_lock, flags);
515 min_addr = min(min_addr, ptr);
516 max_addr = max(max_addr, ptr + size);
517 node = prio_tree_insert(&object_tree_root, &object->tree_node);
518 /*
519 * The code calling the kernel does not yet have the pointer to the
520 * memory block to be able to free it. However, we still hold the
521 * kmemleak_lock here in case parts of the kernel started freeing
522 * random memory blocks.
523 */
524 if (node != &object->tree_node) {
525 unsigned long flags;
526
527 kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object "
528 "search tree (already existing)\n", ptr);
529 object = lookup_object(ptr, 1);
530 spin_lock_irqsave(&object->lock, flags);
531 dump_object_info(object);
532 spin_unlock_irqrestore(&object->lock, flags);
533
534 goto out;
535 }
536 list_add_tail_rcu(&object->object_list, &object_list);
537out:
538 write_unlock_irqrestore(&kmemleak_lock, flags);
539}
540
541/*
542 * Remove the metadata (struct kmemleak_object) for a memory block from the
543 * object_list and object_tree_root and decrement its use_count.
544 */
545static void delete_object(unsigned long ptr)
546{
547 unsigned long flags;
548 struct kmemleak_object *object;
549
550 write_lock_irqsave(&kmemleak_lock, flags);
551 object = lookup_object(ptr, 0);
552 if (!object) {
553 kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n",
554 ptr);
555 write_unlock_irqrestore(&kmemleak_lock, flags);
556 return;
557 }
558 prio_tree_remove(&object_tree_root, &object->tree_node);
559 list_del_rcu(&object->object_list);
560 write_unlock_irqrestore(&kmemleak_lock, flags);
561
562 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
563 WARN_ON(atomic_read(&object->use_count) < 1);
564
565 /*
566 * Locking here also ensures that the corresponding memory block
567 * cannot be freed when it is being scanned.
568 */
569 spin_lock_irqsave(&object->lock, flags);
570 if (object->flags & OBJECT_REPORTED)
571 print_referenced(object);
572 object->flags &= ~OBJECT_ALLOCATED;
573 spin_unlock_irqrestore(&object->lock, flags);
574 put_object(object);
575}
576
577/*
578 * Mark an object permanently as gray-colored so that it can no longer be
579 * reported as a leak. This is used in general to mark a false positive.
580 */
581static void make_gray_object(unsigned long ptr)
582{
583 unsigned long flags;
584 struct kmemleak_object *object;
585
586 object = find_and_get_object(ptr, 0);
587 if (!object) {
588 kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n",
589 ptr);
590 return;
591 }
592
593 spin_lock_irqsave(&object->lock, flags);
594 object->min_count = 0;
595 spin_unlock_irqrestore(&object->lock, flags);
596 put_object(object);
597}
598
599/*
600 * Mark the object as black-colored so that it is ignored from scans and
601 * reporting.
602 */
603static void make_black_object(unsigned long ptr)
604{
605 unsigned long flags;
606 struct kmemleak_object *object;
607
608 object = find_and_get_object(ptr, 0);
609 if (!object) {
610 kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n",
611 ptr);
612 return;
613 }
614
615 spin_lock_irqsave(&object->lock, flags);
616 object->min_count = -1;
617 spin_unlock_irqrestore(&object->lock, flags);
618 put_object(object);
619}
620
621/*
622 * Add a scanning area to the object. If at least one such area is added,
623 * kmemleak will only scan these ranges rather than the whole memory block.
624 */
625static void add_scan_area(unsigned long ptr, unsigned long offset,
626 size_t length, gfp_t gfp)
627{
628 unsigned long flags;
629 struct kmemleak_object *object;
630 struct kmemleak_scan_area *area;
631
632 object = find_and_get_object(ptr, 0);
633 if (!object) {
634 kmemleak_warn("kmemleak: Adding scan area to unknown "
635 "object at 0x%08lx\n", ptr);
636 return;
637 }
638
639 area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK);
640 if (!area) {
641 kmemleak_warn("kmemleak: Cannot allocate a scan area\n");
642 goto out;
643 }
644
645 spin_lock_irqsave(&object->lock, flags);
646 if (offset + length > object->size) {
647 kmemleak_warn("kmemleak: Scan area larger than object "
648 "0x%08lx\n", ptr);
649 dump_object_info(object);
650 kmem_cache_free(scan_area_cache, area);
651 goto out_unlock;
652 }
653
654 INIT_HLIST_NODE(&area->node);
655 area->offset = offset;
656 area->length = length;
657
658 hlist_add_head(&area->node, &object->area_list);
659out_unlock:
660 spin_unlock_irqrestore(&object->lock, flags);
661out:
662 put_object(object);
663}
664
665/*
666 * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
667 * pointer. Such an object will not be scanned by kmemleak but references to it
668 * are searched.
669 */
670static void object_no_scan(unsigned long ptr)
671{
672 unsigned long flags;
673 struct kmemleak_object *object;
674
675 object = find_and_get_object(ptr, 0);
676 if (!object) {
677 kmemleak_warn("kmemleak: Not scanning unknown object at "
678 "0x%08lx\n", ptr);
679 return;
680 }
681
682 spin_lock_irqsave(&object->lock, flags);
683 object->flags |= OBJECT_NO_SCAN;
684 spin_unlock_irqrestore(&object->lock, flags);
685 put_object(object);
686}
687
688/*
689 * Log an early kmemleak_* call to the early_log buffer. These calls will be
690 * processed later once kmemleak is fully initialized.
691 */
692static void log_early(int op_type, const void *ptr, size_t size,
693 int min_count, unsigned long offset, size_t length)
694{
695 unsigned long flags;
696 struct early_log *log;
697
698 if (crt_early_log >= ARRAY_SIZE(early_log)) {
699 kmemleak_panic("kmemleak: Early log buffer exceeded\n");
700 return;
701 }
702
703 /*
704 * There is no need for locking since the kernel is still in UP mode
705 * at this stage. Disabling the IRQs is enough.
706 */
707 local_irq_save(flags);
708 log = &early_log[crt_early_log];
709 log->op_type = op_type;
710 log->ptr = ptr;
711 log->size = size;
712 log->min_count = min_count;
713 log->offset = offset;
714 log->length = length;
715 crt_early_log++;
716 local_irq_restore(flags);
717}
718
719/*
720 * Memory allocation function callback. This function is called from the
721 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
722 * vmalloc etc.).
723 */
724void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp)
725{
726 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
727
728 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
729 create_object((unsigned long)ptr, size, min_count, gfp);
730 else if (atomic_read(&kmemleak_early_log))
731 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
732}
733EXPORT_SYMBOL_GPL(kmemleak_alloc);
734
735/*
736 * Memory freeing function callback. This function is called from the kernel
737 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
738 */
739void kmemleak_free(const void *ptr)
740{
741 pr_debug("%s(0x%p)\n", __func__, ptr);
742
743 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
744 delete_object((unsigned long)ptr);
745 else if (atomic_read(&kmemleak_early_log))
746 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
747}
748EXPORT_SYMBOL_GPL(kmemleak_free);
749
750/*
751 * Mark an already allocated memory block as a false positive. This will cause
752 * the block to no longer be reported as a leak and always be scanned.
753 */
754void kmemleak_not_leak(const void *ptr)
755{
756 pr_debug("%s(0x%p)\n", __func__, ptr);
757
758 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
759 make_gray_object((unsigned long)ptr);
760 else if (atomic_read(&kmemleak_early_log))
761 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
762}
763EXPORT_SYMBOL(kmemleak_not_leak);
764
765/*
766 * Ignore a memory block. This is usually done when it is known that the
767 * corresponding block is not a leak and does not contain any references to
768 * other allocated memory blocks.
769 */
770void kmemleak_ignore(const void *ptr)
771{
772 pr_debug("%s(0x%p)\n", __func__, ptr);
773
774 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
775 make_black_object((unsigned long)ptr);
776 else if (atomic_read(&kmemleak_early_log))
777 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
778}
779EXPORT_SYMBOL(kmemleak_ignore);
780
781/*
782 * Limit the range to be scanned in an allocated memory block.
783 */
784void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length,
785 gfp_t gfp)
786{
787 pr_debug("%s(0x%p)\n", __func__, ptr);
788
789 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
790 add_scan_area((unsigned long)ptr, offset, length, gfp);
791 else if (atomic_read(&kmemleak_early_log))
792 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
793}
794EXPORT_SYMBOL(kmemleak_scan_area);
795
796/*
797 * Inform kmemleak not to scan the given memory block.
798 */
799void kmemleak_no_scan(const void *ptr)
800{
801 pr_debug("%s(0x%p)\n", __func__, ptr);
802
803 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
804 object_no_scan((unsigned long)ptr);
805 else if (atomic_read(&kmemleak_early_log))
806 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
807}
808EXPORT_SYMBOL(kmemleak_no_scan);
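The six kmemleak_* callbacks above form the annotation API the rest of the kernel uses to suppress false positives. As a minimal sketch of how a caller might use it (example_buf, example_init() and the hardware-register scenario are assumptions made purely for illustration, not part of this patch):

#include <linux/slab.h>
#include <linux/kmemleak.h>

/* hypothetical driver state, for illustration only */
static void *example_buf;

static int example_init(void)
{
	example_buf = kmalloc(1024, GFP_KERNEL);
	if (!example_buf)
		return -ENOMEM;
	/*
	 * Assume the only reference is handed to hardware as a physical
	 * address, where the scanner cannot see it; gray the object with
	 * kmemleak_not_leak() so it is never reported as a leak. An object
	 * known to contain no pointers could instead be passed to
	 * kmemleak_no_scan() so that its contents are not scanned at all.
	 */
	kmemleak_not_leak(example_buf);
	return 0;
}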
809
810/*
811 * Yield the CPU so that other tasks get a chance to run. The yielding is
812 * rate-limited to avoid an excessive number of calls to the schedule() function
813 * during memory scanning.
814 */
815static void scan_yield(void)
816{
817 might_sleep();
818
819 if (time_is_before_eq_jiffies(next_scan_yield)) {
820 schedule();
821 next_scan_yield = jiffies + jiffies_scan_yield;
822 }
823}
824
825/*
826 * Memory scanning is a long process and it needs to be interruptible. This
827 * function checks whether such an interrupt condition has occurred.
828 */
829static int scan_should_stop(void)
830{
831 if (!atomic_read(&kmemleak_enabled))
832 return 1;
833
834 /*
835 * This function may be called from either process or kthread context,
836 * hence the need to check for both stop conditions.
837 */
838 if (current->mm)
839 return signal_pending(current);
840 else
841 return kthread_should_stop();
842
843 return 0;
844}
845
846/*
847 * Scan a memory block (exclusive range) for valid pointers and add those
848 * found to the gray list.
849 */
850static void scan_block(void *_start, void *_end,
851 struct kmemleak_object *scanned)
852{
853 unsigned long *ptr;
854 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
855 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
856
857 for (ptr = start; ptr < end; ptr++) {
858 unsigned long flags;
859 unsigned long pointer = *ptr;
860 struct kmemleak_object *object;
861
862 if (scan_should_stop())
863 break;
864
865 /*
866 * When scanning a memory block with a corresponding
867 * kmemleak_object, the CPU yielding is handled in the calling
868 * code since it holds the object->lock to avoid the block
869 * freeing.
870 */
871 if (!scanned)
872 scan_yield();
873
874 object = find_and_get_object(pointer, 1);
875 if (!object)
876 continue;
877 if (object == scanned) {
878 /* self referenced, ignore */
879 put_object(object);
880 continue;
881 }
882
883 /*
884 * Avoid the lockdep recursive warning on object->lock being
885 * previously acquired in scan_object(). These locks are
886 * enclosed by scan_mutex.
887 */
888 spin_lock_irqsave_nested(&object->lock, flags,
889 SINGLE_DEPTH_NESTING);
890 if (!color_white(object)) {
891 /* non-orphan, ignored or new */
892 spin_unlock_irqrestore(&object->lock, flags);
893 put_object(object);
894 continue;
895 }
896
897 /*
898 * Increase the object's reference count (number of pointers
899 * to the memory block). If this count reaches the required
900 * minimum, the object's color will become gray and it will be
901 * added to the gray_list.
902 */
903 object->count++;
904 if (color_gray(object))
905 list_add_tail(&object->gray_list, &gray_list);
906 else
907 put_object(object);
908 spin_unlock_irqrestore(&object->lock, flags);
909 }
910}
911
912/*
913 * Scan a memory block corresponding to a kmemleak_object. The caller must
914 * ensure that object->use_count >= 1.
915 */
916static void scan_object(struct kmemleak_object *object)
917{
918 struct kmemleak_scan_area *area;
919 struct hlist_node *elem;
920 unsigned long flags;
921
922 /*
923 * Once the object->lock is acquired, the corresponding memory block
924 * cannot be freed (the same lock is acquired in delete_object).
925 */
926 spin_lock_irqsave(&object->lock, flags);
927 if (object->flags & OBJECT_NO_SCAN)
928 goto out;
929 if (!(object->flags & OBJECT_ALLOCATED))
930 /* already freed object */
931 goto out;
932 if (hlist_empty(&object->area_list))
933 scan_block((void *)object->pointer,
934 (void *)(object->pointer + object->size), object);
935 else
936 hlist_for_each_entry(area, elem, &object->area_list, node)
937 scan_block((void *)(object->pointer + area->offset),
938 (void *)(object->pointer + area->offset
939 + area->length), object);
940out:
941 spin_unlock_irqrestore(&object->lock, flags);
942}
943
944/*
945 * Scan data sections and all the referenced memory blocks allocated via the
946 * kernel's standard allocators. This function must be called with the
947 * scan_mutex held.
948 */
949static void kmemleak_scan(void)
950{
951 unsigned long flags;
952 struct kmemleak_object *object, *tmp;
953 struct task_struct *task;
954 int i;
955
956 /* prepare the kmemleak_object's */
957 rcu_read_lock();
958 list_for_each_entry_rcu(object, &object_list, object_list) {
959 spin_lock_irqsave(&object->lock, flags);
960#ifdef DEBUG
961 /*
962 * With a few exceptions there should be a maximum of
963 * 1 reference to any object at this point.
964 */
965 if (atomic_read(&object->use_count) > 1) {
966 pr_debug("kmemleak: object->use_count = %d\n",
967 atomic_read(&object->use_count));
968 dump_object_info(object);
969 }
970#endif
971 /* reset the reference count (whiten the object) */
972 object->count = 0;
973 if (color_gray(object) && get_object(object))
974 list_add_tail(&object->gray_list, &gray_list);
975
976 spin_unlock_irqrestore(&object->lock, flags);
977 }
978 rcu_read_unlock();
979
980 /* data/bss scanning */
981 scan_block(_sdata, _edata, NULL);
982 scan_block(__bss_start, __bss_stop, NULL);
983
984#ifdef CONFIG_SMP
985 /* per-cpu sections scanning */
986 for_each_possible_cpu(i)
987 scan_block(__per_cpu_start + per_cpu_offset(i),
988 __per_cpu_end + per_cpu_offset(i), NULL);
989#endif
990
991 /*
992 * Struct page scanning for each node. The code below is not yet safe
993 * with MEMORY_HOTPLUG.
994 */
995 for_each_online_node(i) {
996 pg_data_t *pgdat = NODE_DATA(i);
997 unsigned long start_pfn = pgdat->node_start_pfn;
998 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
999 unsigned long pfn;
1000
1001 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1002 struct page *page;
1003
1004 if (!pfn_valid(pfn))
1005 continue;
1006 page = pfn_to_page(pfn);
1007 /* only scan if page is in use */
1008 if (page_count(page) == 0)
1009 continue;
1010 scan_block(page, page + 1, NULL);
1011 }
1012 }
1013
1014 /*
1015 * Scanning the task stacks may introduce false negatives and it is
1016 * not enabled by default.
1017 */
1018 if (kmemleak_stack_scan) {
1019 read_lock(&tasklist_lock);
1020 for_each_process(task)
1021 scan_block(task_stack_page(task),
1022 task_stack_page(task) + THREAD_SIZE, NULL);
1023 read_unlock(&tasklist_lock);
1024 }
1025
1026 /*
1027 * Scan the objects already referenced from the sections scanned
1028 * above. More objects will be referenced and, if there are no memory
1029 * leaks, all the objects will be scanned. The list traversal is safe
1030 * for both tail additions and removals from inside the loop. The
1031 * kmemleak objects cannot be freed from outside the loop because their
1032 * use_count was increased.
1033 */
1034 object = list_entry(gray_list.next, typeof(*object), gray_list);
1035 while (&object->gray_list != &gray_list) {
1036 scan_yield();
1037
1038 /* may add new objects to the list */
1039 if (!scan_should_stop())
1040 scan_object(object);
1041
1042 tmp = list_entry(object->gray_list.next, typeof(*object),
1043 gray_list);
1044
1045 /* remove the object from the list and release it */
1046 list_del(&object->gray_list);
1047 put_object(object);
1048
1049 object = tmp;
1050 }
1051 WARN_ON(!list_empty(&gray_list));
1052}
1053
1054/*
1055 * Thread function performing automatic memory scanning. Unreferenced objects
1056 * at the end of a memory scan are reported but only the first time.
1057 */
1058static int kmemleak_scan_thread(void *arg)
1059{
1060 static int first_run = 1;
1061
1062 pr_info("kmemleak: Automatic memory scanning thread started\n");
1063
1064 /*
1065 * Wait before the first scan to allow the system to fully initialize.
1066 */
1067 if (first_run) {
1068 first_run = 0;
1069 ssleep(SECS_FIRST_SCAN);
1070 }
1071
1072 while (!kthread_should_stop()) {
1073 struct kmemleak_object *object;
1074 signed long timeout = jiffies_scan_wait;
1075
1076 mutex_lock(&scan_mutex);
1077
1078 kmemleak_scan();
1079 reported_leaks = 0;
1080
1081 rcu_read_lock();
1082 list_for_each_entry_rcu(object, &object_list, object_list) {
1083 unsigned long flags;
1084
1085 if (reported_leaks >= REPORTS_NR)
1086 break;
1087 spin_lock_irqsave(&object->lock, flags);
1088 if (!(object->flags & OBJECT_REPORTED) &&
1089 unreferenced_object(object)) {
1090 print_unreferenced(NULL, object);
1091 object->flags |= OBJECT_REPORTED;
1092 reported_leaks++;
1093 } else if ((object->flags & OBJECT_REPORTED) &&
1094 referenced_object(object)) {
1095 print_referenced(object);
1096 object->flags &= ~OBJECT_REPORTED;
1097 }
1098 spin_unlock_irqrestore(&object->lock, flags);
1099 }
1100 rcu_read_unlock();
1101
1102 mutex_unlock(&scan_mutex);
1103 /* wait before the next scan */
1104 while (timeout && !kthread_should_stop())
1105 timeout = schedule_timeout_interruptible(timeout);
1106 }
1107
1108 pr_info("kmemleak: Automatic memory scanning thread ended\n");
1109
1110 return 0;
1111}
1112
1113/*
1114 * Start the automatic memory scanning thread. This function must be called
1115 * with the kmemleak_mutex held.
1116 */
1117void start_scan_thread(void)
1118{
1119 if (scan_thread)
1120 return;
1121 scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
1122 if (IS_ERR(scan_thread)) {
1123 pr_warning("kmemleak: Failed to create the scan thread\n");
1124 scan_thread = NULL;
1125 }
1126}
1127
1128/*
1129 * Stop the automatic memory scanning thread. This function must be called
1130 * with the kmemleak_mutex held.
1131 */
1132void stop_scan_thread(void)
1133{
1134 if (scan_thread) {
1135 kthread_stop(scan_thread);
1136 scan_thread = NULL;
1137 }
1138}
1139
1140/*
1141 * Iterate over the object_list and return the first valid object at or after
1142 * the required position with its use_count incremented. The function triggers
1143 * a memory scan when the pos argument points to the first position.
1144 */
1145static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1146{
1147 struct kmemleak_object *object;
1148 loff_t n = *pos;
1149
1150 if (!n) {
1151 kmemleak_scan();
1152 reported_leaks = 0;
1153 }
1154 if (reported_leaks >= REPORTS_NR)
1155 return NULL;
1156
1157 rcu_read_lock();
1158 list_for_each_entry_rcu(object, &object_list, object_list) {
1159 if (n-- > 0)
1160 continue;
1161 if (get_object(object))
1162 goto out;
1163 }
1164 object = NULL;
1165out:
1166 rcu_read_unlock();
1167 return object;
1168}
1169
1170/*
1171 * Return the next object in the object_list. The function decrements the
1172 * use_count of the previous object and increases that of the next one.
1173 */
1174static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1175{
1176 struct kmemleak_object *prev_obj = v;
1177 struct kmemleak_object *next_obj = NULL;
1178 struct list_head *n = &prev_obj->object_list;
1179
1180 ++(*pos);
1181 if (reported_leaks >= REPORTS_NR)
1182 goto out;
1183
1184 rcu_read_lock();
1185 list_for_each_continue_rcu(n, &object_list) {
1186 next_obj = list_entry(n, struct kmemleak_object, object_list);
1187 if (get_object(next_obj))
1188 break;
1189 }
1190 rcu_read_unlock();
1191out:
1192 put_object(prev_obj);
1193 return next_obj;
1194}
1195
1196/*
1197 * Decrement the use_count of the last object required, if any.
1198 */
1199static void kmemleak_seq_stop(struct seq_file *seq, void *v)
1200{
1201 if (v)
1202 put_object(v);
1203}
1204
1205/*
1206 * Print the information for an unreferenced object to the seq file.
1207 */
1208static int kmemleak_seq_show(struct seq_file *seq, void *v)
1209{
1210 struct kmemleak_object *object = v;
1211 unsigned long flags;
1212
1213 spin_lock_irqsave(&object->lock, flags);
1214 if (!unreferenced_object(object))
1215 goto out;
1216 print_unreferenced(seq, object);
1217 reported_leaks++;
1218out:
1219 spin_unlock_irqrestore(&object->lock, flags);
1220 return 0;
1221}
1222
1223static const struct seq_operations kmemleak_seq_ops = {
1224 .start = kmemleak_seq_start,
1225 .next = kmemleak_seq_next,
1226 .stop = kmemleak_seq_stop,
1227 .show = kmemleak_seq_show,
1228};
1229
1230static int kmemleak_open(struct inode *inode, struct file *file)
1231{
1232 int ret = 0;
1233
1234 if (!atomic_read(&kmemleak_enabled))
1235 return -EBUSY;
1236
1237 ret = mutex_lock_interruptible(&kmemleak_mutex);
1238 if (ret < 0)
1239 goto out;
1240 if (file->f_mode & FMODE_READ) {
1241 ret = mutex_lock_interruptible(&scan_mutex);
1242 if (ret < 0)
1243 goto kmemleak_unlock;
1244 ret = seq_open(file, &kmemleak_seq_ops);
1245 if (ret < 0)
1246 goto scan_unlock;
1247 }
1248 return ret;
1249
1250scan_unlock:
1251 mutex_unlock(&scan_mutex);
1252kmemleak_unlock:
1253 mutex_unlock(&kmemleak_mutex);
1254out:
1255 return ret;
1256}
1257
1258static int kmemleak_release(struct inode *inode, struct file *file)
1259{
1260 int ret = 0;
1261
1262 if (file->f_mode & FMODE_READ) {
1263 seq_release(inode, file);
1264 mutex_unlock(&scan_mutex);
1265 }
1266 mutex_unlock(&kmemleak_mutex);
1267
1268 return ret;
1269}
1270
1271/*
1272 * File write operation to configure kmemleak at run-time. The following
1273 * commands can be written to the /sys/kernel/debug/kmemleak file:
1274 * off - disable kmemleak (irreversible)
1275 * stack=on - enable the task stacks scanning
1276 * stack=off - disable the task stacks scanning
1277 * scan=on - start the automatic memory scanning thread
1278 * scan=off - stop the automatic memory scanning thread
1279 * scan=... - set the automatic memory scanning period in seconds (0 to
1280 * disable it)
1281 */
1282static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1283 size_t size, loff_t *ppos)
1284{
1285 char buf[64];
1286 int buf_size;
1287
1288 if (!atomic_read(&kmemleak_enabled))
1289 return -EBUSY;
1290
1291 buf_size = min(size, (sizeof(buf) - 1));
1292 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1293 return -EFAULT;
1294 buf[buf_size] = 0;
1295
1296 if (strncmp(buf, "off", 3) == 0)
1297 kmemleak_disable();
1298 else if (strncmp(buf, "stack=on", 8) == 0)
1299 kmemleak_stack_scan = 1;
1300 else if (strncmp(buf, "stack=off", 9) == 0)
1301 kmemleak_stack_scan = 0;
1302 else if (strncmp(buf, "scan=on", 7) == 0)
1303 start_scan_thread();
1304 else if (strncmp(buf, "scan=off", 8) == 0)
1305 stop_scan_thread();
1306 else if (strncmp(buf, "scan=", 5) == 0) {
1307 unsigned long secs;
1308 int err;
1309
1310 err = strict_strtoul(buf + 5, 0, &secs);
1311 if (err < 0)
1312 return err;
1313 stop_scan_thread();
1314 if (secs) {
1315 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
1316 start_scan_thread();
1317 }
1318 } else
1319 return -EINVAL;
1320
1321 /* ignore the rest of the buffer, only one command at a time */
1322 *ppos += size;
1323 return size;
1324}
1325
1326static const struct file_operations kmemleak_fops = {
1327 .owner = THIS_MODULE,
1328 .open = kmemleak_open,
1329 .read = seq_read,
1330 .write = kmemleak_write,
1331 .llseek = seq_lseek,
1332 .release = kmemleak_release,
1333};
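The file operations above back /sys/kernel/debug/kmemleak. Below is a small user-space sketch of driving them, assuming debugfs is mounted at /sys/kernel/debug (the program itself is illustrative, not part of this patch): a write sends one of the commands handled by kmemleak_write(), and a read from offset 0 triggers a scan via kmemleak_seq_start() and lists the unreferenced objects.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* send one runtime command per write(); see kmemleak_write() above */
	fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
	if (fd < 0) {
		perror("open for write");
		return 1;
	}
	if (write(fd, "stack=on", strlen("stack=on")) < 0)
		perror("write");
	close(fd);

	/* reading from offset 0 runs a scan and lists unreferenced objects */
	fd = open("/sys/kernel/debug/kmemleak", O_RDONLY);
	if (fd < 0) {
		perror("open for read");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}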
1334
1335/*
1336 * Perform the freeing of the kmemleak internal objects after waiting for any
1337 * current memory scan to complete.
1338 */
1339static int kmemleak_cleanup_thread(void *arg)
1340{
1341 struct kmemleak_object *object;
1342
1343 mutex_lock(&kmemleak_mutex);
1344 stop_scan_thread();
1345 mutex_unlock(&kmemleak_mutex);
1346
1347 mutex_lock(&scan_mutex);
1348 rcu_read_lock();
1349 list_for_each_entry_rcu(object, &object_list, object_list)
1350 delete_object(object->pointer);
1351 rcu_read_unlock();
1352 mutex_unlock(&scan_mutex);
1353
1354 return 0;
1355}
1356
1357/*
1358 * Start the clean-up thread.
1359 */
1360static void kmemleak_cleanup(void)
1361{
1362 struct task_struct *cleanup_thread;
1363
1364 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1365 "kmemleak-clean");
1366 if (IS_ERR(cleanup_thread))
1367 pr_warning("kmemleak: Failed to create the clean-up thread\n");
1368}
1369
1370/*
1371 * Disable kmemleak. No memory allocation/freeing will be traced once this
1372 * function is called. Disabling kmemleak is an irreversible operation.
1373 */
1374static void kmemleak_disable(void)
1375{
1376 /* atomically check whether it was already invoked */
1377 if (atomic_cmpxchg(&kmemleak_error, 0, 1))
1378 return;
1379
1380 /* stop any memory operation tracing */
1381 atomic_set(&kmemleak_early_log, 0);
1382 atomic_set(&kmemleak_enabled, 0);
1383
1384 /* check whether it is too early for a kernel thread */
1385 if (atomic_read(&kmemleak_initialized))
1386 kmemleak_cleanup();
1387
1388 pr_info("Kernel memory leak detector disabled\n");
1389}
1390
1391/*
1392 * Allow boot-time kmemleak disabling (enabled by default).
1393 */
1394static int kmemleak_boot_config(char *str)
1395{
1396 if (!str)
1397 return -EINVAL;
1398 if (strcmp(str, "off") == 0)
1399 kmemleak_disable();
1400 else if (strcmp(str, "on") != 0)
1401 return -EINVAL;
1402 return 0;
1403}
1404early_param("kmemleak", kmemleak_boot_config);
1405
1406/*
1407 * Kmemleak initialization.
1408 */
1409void __init kmemleak_init(void)
1410{
1411 int i;
1412 unsigned long flags;
1413
1414 jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
1415 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
1416 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
1417
1418 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1419 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1420 INIT_PRIO_TREE_ROOT(&object_tree_root);
1421
1422 /* the kernel is still in UP mode, so disabling the IRQs is enough */
1423 local_irq_save(flags);
1424 if (!atomic_read(&kmemleak_error)) {
1425 atomic_set(&kmemleak_enabled, 1);
1426 atomic_set(&kmemleak_early_log, 0);
1427 }
1428 local_irq_restore(flags);
1429
1430 /*
1431 * This is the point where tracking allocations is safe. Automatic
1432 * scanning is started during the late initcall. Add the early logged
1433 * callbacks to the kmemleak infrastructure.
1434 */
1435 for (i = 0; i < crt_early_log; i++) {
1436 struct early_log *log = &early_log[i];
1437
1438 switch (log->op_type) {
1439 case KMEMLEAK_ALLOC:
1440 kmemleak_alloc(log->ptr, log->size, log->min_count,
1441 GFP_KERNEL);
1442 break;
1443 case KMEMLEAK_FREE:
1444 kmemleak_free(log->ptr);
1445 break;
1446 case KMEMLEAK_NOT_LEAK:
1447 kmemleak_not_leak(log->ptr);
1448 break;
1449 case KMEMLEAK_IGNORE:
1450 kmemleak_ignore(log->ptr);
1451 break;
1452 case KMEMLEAK_SCAN_AREA:
1453 kmemleak_scan_area(log->ptr, log->offset, log->length,
1454 GFP_KERNEL);
1455 break;
1456 case KMEMLEAK_NO_SCAN:
1457 kmemleak_no_scan(log->ptr);
1458 break;
1459 default:
1460 WARN_ON(1);
1461 }
1462 }
1463}
1464
1465/*
1466 * Late initialization function.
1467 */
1468static int __init kmemleak_late_init(void)
1469{
1470 struct dentry *dentry;
1471
1472 atomic_set(&kmemleak_initialized, 1);
1473
1474 if (atomic_read(&kmemleak_error)) {
1475 /*
1476 * Some error occurred and kmemleak was disabled. There is a
1477 * small chance that kmemleak_disable() was called immediately
1478 * after setting kmemleak_initialized and we may end up with
1479 * two clean-up threads but serialized by scan_mutex.
1480 */
1481 kmemleak_cleanup();
1482 return -ENOMEM;
1483 }
1484
1485 dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
1486 &kmemleak_fops);
1487 if (!dentry)
1488 pr_warning("kmemleak: Failed to create the debugfs kmemleak "
1489 "file\n");
1490 mutex_lock(&kmemleak_mutex);
1491 start_scan_thread();
1492 mutex_unlock(&kmemleak_mutex);
1493
1494 pr_info("Kernel memory leak detector initialized\n");
1495
1496 return 0;
1497}
1498late_initcall(kmemleak_late_init);
diff --git a/mm/maccess.c b/mm/maccess.c
index ac40796cfb15..9073695ff25f 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 39 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 40 * happens, handle that and return -EFAULT.
41 */ 41 */
42long probe_kernel_write(void *dst, void *src, size_t size) 42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
43{ 43{
44 long ret; 44 long ret;
45 mm_segment_t old_fs = get_fs(); 45 mm_segment_t old_fs = get_fs();
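For context, probe_kernel_write() is the fault-tolerant counterpart of a plain memory write: it returns -EFAULT instead of oopsing if the destination faults, and marking it notrace and __weak keeps it out of the function tracer and lets architectures override it. A rough caller sketch, where example_poke() and its warning message are assumptions for illustration only:

#include <linux/kernel.h>
#include <linux/uaccess.h>

static long example_poke(void *addr, unsigned long val)
{
	long ret = probe_kernel_write(addr, &val, sizeof(val));

	if (ret)	/* -EFAULT if the destination faulted */
		pr_warning("example: cannot write to %p\n", addr);
	return ret;
}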
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..76eb4193acdd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
123 end = vma->vm_end; 123 end = vma->vm_end;
124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
125 125
126 force_page_cache_readahead(file->f_mapping, 126 force_page_cache_readahead(file->f_mapping, file, start, end - start);
127 file, start, max_sane_readahead(end - start));
128 return 0; 127 return 0;
129} 128}
130 129
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
239 break; 238 break;
240 239
241 default: 240 default:
242 error = -EINVAL; 241 BUG();
243 break; 242 break;
244 } 243 }
245 return error; 244 return error;
246} 245}
247 246
247static int
248madvise_behavior_valid(int behavior)
249{
250 switch (behavior) {
251 case MADV_DOFORK:
252 case MADV_DONTFORK:
253 case MADV_NORMAL:
254 case MADV_SEQUENTIAL:
255 case MADV_RANDOM:
256 case MADV_REMOVE:
257 case MADV_WILLNEED:
258 case MADV_DONTNEED:
259 return 1;
260
261 default:
262 return 0;
263 }
264}
248/* 265/*
249 * The madvise(2) system call. 266 * The madvise(2) system call.
250 * 267 *
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
290 int write; 307 int write;
291 size_t len; 308 size_t len;
292 309
310 if (!madvise_behavior_valid(behavior))
311 return error;
312
293 write = madvise_need_mmap_write(behavior); 313 write = madvise_need_mmap_write(behavior);
294 if (write) 314 if (write)
295 down_write(&current->mm->mmap_sem); 315 down_write(&current->mm->mmap_sem);
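The new madvise_behavior_valid() check rejects an unrecognized advice value up front, before any locking or vma walking, which is also what allows the default case in madvise_vma() to become BUG(). A user-space sketch of the visible behaviour, where the anonymous mapping and the bogus advice value 1234 are illustrative assumptions:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (madvise(p, len, MADV_SEQUENTIAL))	/* valid advice */
		perror("MADV_SEQUENTIAL");
	if (madvise(p, len, 1234))		/* bogus advice, now rejected early */
		printf("bogus advice rejected: %s\n", strerror(errno));
	munmap(p, len);
	return 0;
}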
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e44fb0fbb80e..70db6e0a5eec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -314,14 +314,6 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
314 return mem; 314 return mem;
315} 315}
316 316
317static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
318{
319 if (!mem)
320 return true;
321 return css_is_removed(&mem->css);
322}
323
324
325/* 317/*
326 * Call callback function against all cgroup under hierarchy tree. 318 * Call callback function against all cgroup under hierarchy tree.
327 */ 319 */
@@ -578,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
578 return 0; 570 return 0;
579} 571}
580 572
573int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
574{
575 unsigned long active;
576 unsigned long inactive;
577
578 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
579 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
580
581 return (active > inactive);
582}
583
581unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 584unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
582 struct zone *zone, 585 struct zone *zone,
583 enum lru_list lru) 586 enum lru_list lru)
@@ -932,7 +935,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
932 if (unlikely(!mem)) 935 if (unlikely(!mem))
933 return 0; 936 return 0;
934 937
935 VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); 938 VM_BUG_ON(css_is_removed(&mem->css));
936 939
937 while (1) { 940 while (1) {
938 int ret; 941 int ret;
@@ -1024,9 +1027,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
1024 return NULL; 1027 return NULL;
1025 1028
1026 pc = lookup_page_cgroup(page); 1029 pc = lookup_page_cgroup(page);
1027 /* 1030 lock_page_cgroup(pc);
1028 * Used bit of swapcache is solid under page lock.
1029 */
1030 if (PageCgroupUsed(pc)) { 1031 if (PageCgroupUsed(pc)) {
1031 mem = pc->mem_cgroup; 1032 mem = pc->mem_cgroup;
1032 if (mem && !css_tryget(&mem->css)) 1033 if (mem && !css_tryget(&mem->css))
@@ -1040,6 +1041,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
1040 mem = NULL; 1041 mem = NULL;
1041 rcu_read_unlock(); 1042 rcu_read_unlock();
1042 } 1043 }
1044 unlock_page_cgroup(pc);
1043 return mem; 1045 return mem;
1044} 1046}
1045 1047
@@ -1489,8 +1491,9 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1489 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 1491 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1490} 1492}
1491 1493
1494#ifdef CONFIG_SWAP
1492/* 1495/*
1493 * called from __delete_from_swap_cache() and drop "page" account. 1496 * called after __delete_from_swap_cache() and drop "page" account.
1494 * memcg information is recorded to swap_cgroup of "ent" 1497 * memcg information is recorded to swap_cgroup of "ent"
1495 */ 1498 */
1496void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) 1499void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
@@ -1507,6 +1510,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1507 if (memcg) 1510 if (memcg)
1508 css_put(&memcg->css); 1511 css_put(&memcg->css);
1509} 1512}
1513#endif
1510 1514
1511#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 1515#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1512/* 1516/*
@@ -1618,37 +1622,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1618} 1622}
1619 1623
1620/* 1624/*
1621 * A call to try to shrink memory usage under specified resource controller. 1625 * A call to try to shrink memory usage on charge failure at shmem's swapin.
1622 * This is typically used for page reclaiming for shmem for reducing side 1626 * Calling hierarchical_reclaim is not enough because we should update
1623 * effect of page allocation from shmem, which is used by some mem_cgroup. 1627 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
1628 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
1629 * not from the memcg which this page would be charged to.
1630 * try_charge_swapin does all of these works properly.
1624 */ 1631 */
1625int mem_cgroup_shrink_usage(struct page *page, 1632int mem_cgroup_shmem_charge_fallback(struct page *page,
1626 struct mm_struct *mm, 1633 struct mm_struct *mm,
1627 gfp_t gfp_mask) 1634 gfp_t gfp_mask)
1628{ 1635{
1629 struct mem_cgroup *mem = NULL; 1636 struct mem_cgroup *mem = NULL;
1630 int progress = 0; 1637 int ret;
1631 int retry = MEM_CGROUP_RECLAIM_RETRIES;
1632 1638
1633 if (mem_cgroup_disabled()) 1639 if (mem_cgroup_disabled())
1634 return 0; 1640 return 0;
1635 if (page)
1636 mem = try_get_mem_cgroup_from_swapcache(page);
1637 if (!mem && mm)
1638 mem = try_get_mem_cgroup_from_mm(mm);
1639 if (unlikely(!mem))
1640 return 0;
1641 1641
1642 do { 1642 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1643 progress = mem_cgroup_hierarchical_reclaim(mem, 1643 if (!ret)
1644 gfp_mask, true, false); 1644 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
1645 progress += mem_cgroup_check_under_limit(mem);
1646 } while (!progress && --retry);
1647 1645
1648 css_put(&mem->css); 1646 return ret;
1649 if (!retry)
1650 return -ENOMEM;
1651 return 0;
1652} 1647}
1653 1648
1654static DEFINE_MUTEX(set_limit_mutex); 1649static DEFINE_MUTEX(set_limit_mutex);
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6a..d5d1653d60a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1360 return i; 1360 return i;
1361} 1361}
1362 1362
1363/**
1364 * get_user_pages() - pin user pages in memory
1365 * @tsk: task_struct of target task
1366 * @mm: mm_struct of target mm
1367 * @start: starting user address
1368 * @len: number of pages from start to pin
1369 * @write: whether pages will be written to by the caller
1370 * @force: whether to force write access even if user mapping is
1371 * readonly. This will result in the page being COWed even
1372 * in MAP_SHARED mappings. You do not want this.
1373 * @pages: array that receives pointers to the pages pinned.
1374 * Should be at least nr_pages long. Or NULL, if caller
1375 * only intends to ensure the pages are faulted in.
1376 * @vmas: array of pointers to vmas corresponding to each page.
1377 * Or NULL if the caller does not require them.
1378 *
1379 * Returns number of pages pinned. This may be fewer than the number
1380 * requested. If len is 0 or negative, returns 0. If no pages
1381 * were pinned, returns -errno. Each page returned must be released
1382 * with a put_page() call when it is finished with. vmas will only
1383 * remain valid while mmap_sem is held.
1384 *
1385 * Must be called with mmap_sem held for read or write.
1386 *
1387 * get_user_pages walks a process's page tables and takes a reference to
1388 * each struct page that each user address corresponds to at a given
1389 * instant. That is, it takes the page that would be accessed if a user
1390 * thread accesses the given user virtual address at that instant.
1391 *
1392 * This does not guarantee that the page exists in the user mappings when
1393 * get_user_pages returns, and there may even be a completely different
1394 * page there in some cases (eg. if mmapped pagecache has been invalidated
1395 * and subsequently re-faulted). However it does guarantee that the page
1396 * won't be freed completely. And mostly callers simply care that the page
1397 * contains data that was valid *at some point in time*. Typically, an IO
1398 * or similar operation cannot guarantee anything stronger anyway because
1399 * locks can't be held over the syscall boundary.
1400 *
1401 * If write=0, the page must not be written to. If the page is written to,
1402 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
1403 * after the page is finished with, and before put_page is called.
1404 *
1405 * get_user_pages is typically used for fewer-copy IO operations, to get a
1406 * handle on the memory by some means other than accesses via the user virtual
1407 * addresses. The pages may be submitted for DMA to devices or accessed via
1408 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1409 * use the correct cache flushing APIs.
1410 *
1411 * See also get_user_pages_fast, for performance critical applications.
1412 */
1363int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1364 unsigned long start, int len, int write, int force, 1414 unsigned long start, int len, int write, int force,
1365 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas)
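The kernel-doc added above spells out the calling convention; the sketch below (example_pin_user_range() is a hypothetical helper, not part of this patch) shows the typical pattern it describes: take mmap_sem for read, pin the pages, use them, then drop each reference with put_page().

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

static int example_pin_user_range(unsigned long addr, int nr)
{
	struct page **pages;
	int i, got;

	pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);	/* required for get_user_pages */
	got = get_user_pages(current, current->mm, addr, nr,
			     0 /* write */, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (got > 0) {
		/* ... access the page contents, e.g. via kmap() ... */
		for (i = 0; i < got; i++)
			put_page(pages[i]);	/* release every pinned page */
	}

	kfree(pages);
	return got < 0 ? got : 0;
}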
@@ -1971,6 +2021,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1971 ret = tmp; 2021 ret = tmp;
1972 goto unwritable_page; 2022 goto unwritable_page;
1973 } 2023 }
2024 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2025 lock_page(old_page);
2026 if (!old_page->mapping) {
2027 ret = 0; /* retry the fault */
2028 unlock_page(old_page);
2029 goto unwritable_page;
2030 }
2031 } else
2032 VM_BUG_ON(!PageLocked(old_page));
1974 2033
1975 /* 2034 /*
1976 * Since we dropped the lock we need to revalidate 2035 * Since we dropped the lock we need to revalidate
@@ -1980,9 +2039,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1980 */ 2039 */
1981 page_table = pte_offset_map_lock(mm, pmd, address, 2040 page_table = pte_offset_map_lock(mm, pmd, address,
1982 &ptl); 2041 &ptl);
1983 page_cache_release(old_page); 2042 if (!pte_same(*page_table, orig_pte)) {
1984 if (!pte_same(*page_table, orig_pte)) 2043 unlock_page(old_page);
2044 page_cache_release(old_page);
1985 goto unlock; 2045 goto unlock;
2046 }
1986 2047
1987 page_mkwrite = 1; 2048 page_mkwrite = 1;
1988 } 2049 }
@@ -2094,9 +2155,6 @@ gotten:
2094unlock: 2155unlock:
2095 pte_unmap_unlock(page_table, ptl); 2156 pte_unmap_unlock(page_table, ptl);
2096 if (dirty_page) { 2157 if (dirty_page) {
2097 if (vma->vm_file)
2098 file_update_time(vma->vm_file);
2099
2100 /* 2158 /*
2101 * Yes, Virginia, this is actually required to prevent a race 2159 * Yes, Virginia, this is actually required to prevent a race
2102 * with clear_page_dirty_for_io() from clearing the page dirty 2160 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2163,41 @@ unlock:
2105 * 2163 *
2106 * do_no_page is protected similarly. 2164 * do_no_page is protected similarly.
2107 */ 2165 */
2108 wait_on_page_locked(dirty_page); 2166 if (!page_mkwrite) {
2109 set_page_dirty_balance(dirty_page, page_mkwrite); 2167 wait_on_page_locked(dirty_page);
2168 set_page_dirty_balance(dirty_page, page_mkwrite);
2169 }
2110 put_page(dirty_page); 2170 put_page(dirty_page);
2171 if (page_mkwrite) {
2172 struct address_space *mapping = dirty_page->mapping;
2173
2174 set_page_dirty(dirty_page);
2175 unlock_page(dirty_page);
2176 page_cache_release(dirty_page);
2177 if (mapping) {
2178 /*
2179 * Some device drivers do not set page.mapping
2180 * but still dirty their pages
2181 */
2182 balance_dirty_pages_ratelimited(mapping);
2183 }
2184 }
2185
2186 /* file_update_time outside page_lock */
2187 if (vma->vm_file)
2188 file_update_time(vma->vm_file);
2111 } 2189 }
2112 return ret; 2190 return ret;
2113oom_free_new: 2191oom_free_new:
2114 page_cache_release(new_page); 2192 page_cache_release(new_page);
2115oom: 2193oom:
2116 if (old_page) 2194 if (old_page) {
2195 if (page_mkwrite) {
2196 unlock_page(old_page);
2197 page_cache_release(old_page);
2198 }
2117 page_cache_release(old_page); 2199 page_cache_release(old_page);
2200 }
2118 return VM_FAULT_OOM; 2201 return VM_FAULT_OOM;
2119 2202
2120unwritable_page: 2203unwritable_page:
@@ -2458,8 +2541,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2458 2541
2459 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2542 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2460 ret = VM_FAULT_OOM; 2543 ret = VM_FAULT_OOM;
2461 unlock_page(page); 2544 goto out_page;
2462 goto out;
2463 } 2545 }
2464 2546
2465 /* 2547 /*
@@ -2521,6 +2603,7 @@ out:
2521out_nomap: 2603out_nomap:
2522 mem_cgroup_cancel_charge_swapin(ptr); 2604 mem_cgroup_cancel_charge_swapin(ptr);
2523 pte_unmap_unlock(page_table, ptl); 2605 pte_unmap_unlock(page_table, ptl);
2606out_page:
2524 unlock_page(page); 2607 unlock_page(page);
2525 page_cache_release(page); 2608 page_cache_release(page);
2526 return ret; 2609 return ret;
@@ -2664,27 +2747,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2664 int tmp; 2747 int tmp;
2665 2748
2666 unlock_page(page); 2749 unlock_page(page);
2667 vmf.flags |= FAULT_FLAG_MKWRITE; 2750 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2668 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2751 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2669 if (unlikely(tmp & 2752 if (unlikely(tmp &
2670 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2753 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2671 ret = tmp; 2754 ret = tmp;
2672 anon = 1; /* no anon but release vmf.page */ 2755 goto unwritable_page;
2673 goto out_unlocked;
2674 }
2675 lock_page(page);
2676 /*
2677 * XXX: this is not quite right (racy vs
2678 * invalidate) to unlock and relock the page
2679 * like this, however a better fix requires
2680 * reworking page_mkwrite locking API, which
2681 * is better done later.
2682 */
2683 if (!page->mapping) {
2684 ret = 0;
2685 anon = 1; /* no anon but release vmf.page */
2686 goto out;
2687 } 2756 }
2757 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2758 lock_page(page);
2759 if (!page->mapping) {
2760 ret = 0; /* retry the fault */
2761 unlock_page(page);
2762 goto unwritable_page;
2763 }
2764 } else
2765 VM_BUG_ON(!PageLocked(page));
2688 page_mkwrite = 1; 2766 page_mkwrite = 1;
2689 } 2767 }
2690 } 2768 }
@@ -2736,19 +2814,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2736 pte_unmap_unlock(page_table, ptl); 2814 pte_unmap_unlock(page_table, ptl);
2737 2815
2738out: 2816out:
2739 unlock_page(vmf.page); 2817 if (dirty_page) {
2740out_unlocked: 2818 struct address_space *mapping = page->mapping;
2741 if (anon)
2742 page_cache_release(vmf.page);
2743 else if (dirty_page) {
2744 if (vma->vm_file)
2745 file_update_time(vma->vm_file);
2746 2819
2747 set_page_dirty_balance(dirty_page, page_mkwrite); 2820 if (set_page_dirty(dirty_page))
2821 page_mkwrite = 1;
2822 unlock_page(dirty_page);
2748 put_page(dirty_page); 2823 put_page(dirty_page);
2824 if (page_mkwrite && mapping) {
2825 /*
2826 * Some device drivers do not set page.mapping but still
2827 * dirty their pages
2828 */
2829 balance_dirty_pages_ratelimited(mapping);
2830 }
2831
2832 /* file_update_time outside page_lock */
2833 if (vma->vm_file)
2834 file_update_time(vma->vm_file);
2835 } else {
2836 unlock_page(vmf.page);
2837 if (anon)
2838 page_cache_release(vmf.page);
2749 } 2839 }
2750 2840
2751 return ret; 2841 return ret;
2842
2843unwritable_page:
2844 page_cache_release(page);
2845 return ret;
2752} 2846}
2753 2847
2754static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2848static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3009,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
3009 3103
3010#endif /* __HAVE_ARCH_GATE_AREA */ 3104#endif /* __HAVE_ARCH_GATE_AREA */
3011 3105
3012#ifdef CONFIG_HAVE_IOREMAP_PROT 3106static int follow_pte(struct mm_struct *mm, unsigned long address,
3013int follow_phys(struct vm_area_struct *vma, 3107 pte_t **ptepp, spinlock_t **ptlp)
3014 unsigned long address, unsigned int flags,
3015 unsigned long *prot, resource_size_t *phys)
3016{ 3108{
3017 pgd_t *pgd; 3109 pgd_t *pgd;
3018 pud_t *pud; 3110 pud_t *pud;
3019 pmd_t *pmd; 3111 pmd_t *pmd;
3020 pte_t *ptep, pte; 3112 pte_t *ptep;
3021 spinlock_t *ptl;
3022 resource_size_t phys_addr = 0;
3023 struct mm_struct *mm = vma->vm_mm;
3024 int ret = -EINVAL;
3025
3026 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3027 goto out;
3028 3113
3029 pgd = pgd_offset(mm, address); 3114 pgd = pgd_offset(mm, address);
3030 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3115 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3042,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
3042 if (pmd_huge(*pmd)) 3127 if (pmd_huge(*pmd))
3043 goto out; 3128 goto out;
3044 3129
3045 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 3130 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3046 if (!ptep) 3131 if (!ptep)
3047 goto out; 3132 goto out;
3133 if (!pte_present(*ptep))
3134 goto unlock;
3135 *ptepp = ptep;
3136 return 0;
3137unlock:
3138 pte_unmap_unlock(ptep, *ptlp);
3139out:
3140 return -EINVAL;
3141}
3142
3143/**
3144 * follow_pfn - look up PFN at a user virtual address
3145 * @vma: memory mapping
3146 * @address: user virtual address
3147 * @pfn: location to store found PFN
3148 *
3149 * Only IO mappings and raw PFN mappings are allowed.
3150 *
3151 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3152 */
3153int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3154 unsigned long *pfn)
3155{
3156 int ret = -EINVAL;
3157 spinlock_t *ptl;
3158 pte_t *ptep;
3159
3160 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3161 return ret;
3162
3163 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3164 if (ret)
3165 return ret;
3166 *pfn = pte_pfn(*ptep);
3167 pte_unmap_unlock(ptep, ptl);
3168 return 0;
3169}
3170EXPORT_SYMBOL(follow_pfn);
3171
3172#ifdef CONFIG_HAVE_IOREMAP_PROT
3173int follow_phys(struct vm_area_struct *vma,
3174 unsigned long address, unsigned int flags,
3175 unsigned long *prot, resource_size_t *phys)
3176{
3177 int ret = -EINVAL;
3178 pte_t *ptep, pte;
3179 spinlock_t *ptl;
3180
3181 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3182 goto out;
3048 3183
3184 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3185 goto out;
3049 pte = *ptep; 3186 pte = *ptep;
3050 if (!pte_present(pte)) 3187
3051 goto unlock;
3052 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3188 if ((flags & FOLL_WRITE) && !pte_write(pte))
3053 goto unlock; 3189 goto unlock;
3054 phys_addr = pte_pfn(pte);
3055 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
3056 3190
3057 *prot = pgprot_val(pte_pgprot(pte)); 3191 *prot = pgprot_val(pte_pgprot(pte));
3058 *phys = phys_addr; 3192 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3059 ret = 0;
3060 3193
3194 ret = 0;
3061unlock: 3195unlock:
3062 pte_unmap_unlock(ptep, ptl); 3196 pte_unmap_unlock(ptep, ptl);
3063out: 3197out:
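
An illustrative aside, not part of the patch: the memory.c hunks above split the page-table walk out of follow_phys() into follow_pte() and export a new follow_pfn() helper for VM_IO/VM_PFNMAP mappings. A minimal sketch of how a driver might call it follows; only follow_pfn() and its semantics come from the hunk, the surrounding lookup is hypothetical.

	#include <linux/mm.h>

	/* Resolve a user virtual address in an IO/PFN mapping to its pfn. */
	static int example_user_addr_to_pfn(struct mm_struct *mm,
					    unsigned long uaddr,
					    unsigned long *pfn)
	{
		struct vm_area_struct *vma;
		int ret = -EINVAL;

		down_read(&mm->mmap_sem);		/* mmap_sem-era locking */
		vma = find_vma(mm, uaddr);
		if (vma && uaddr >= vma->vm_start)
			/* follow_pfn() rejects anything but VM_IO/VM_PFNMAP */
			ret = follow_pfn(vma, uaddr, pfn);
		up_read(&mm->mmap_sem);
		return ret;
	}
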
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c083cf5fd6df..e4412a676c88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 422 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 423 zone->zone_pgdat->node_present_pages += onlined_pages;
424 424
425 setup_per_zone_pages_min(); 425 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone);
426 if (onlined_pages) { 427 if (onlined_pages) {
427 kswapd_run(zone_to_nid(zone)); 428 kswapd_run(zone_to_nid(zone));
428 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 429 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
832 totalram_pages -= offlined_pages; 833 totalram_pages -= offlined_pages;
833 num_physpages -= offlined_pages; 834 num_physpages -= offlined_pages;
834 835
836 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone);
838
835 vm_total_pages = nr_free_pagecache_pages(); 839 vm_total_pages = nr_free_pagecache_pages();
836 writeback_set_ratelimit(); 840 writeback_set_ratelimit();
837 841
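
A brief sketch, not part of the patch: after these hunks both the online and offline paths recompute the same per-zone state once the zone size changes; the two calls are taken verbatim from the hunks above, the wrapper is hypothetical.

	/* Shared recompute step after a zone grows or shrinks. */
	static void example_zone_size_changed(struct zone *zone)
	{
		setup_per_zone_wmarks();		/* refresh min/low/high watermarks */
		calculate_zone_inactive_ratio(zone);	/* rescale the inactive ratio */
	}
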
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..e08e2c4da63a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
182 return 0; 182 return 0;
183} 183}
184 184
185/* Create a new policy */ 185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy. mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags. But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
195{
196 nodemask_t cpuset_context_nmask;
197 int ret;
198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL)
201 return 0;
202
203 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */
206 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
209 &cpuset_current_mems_allowed);
210 else
211 nodes_and(cpuset_context_nmask, *nodes,
212 cpuset_current_mems_allowed);
213 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes;
215 else
216 pol->w.cpuset_mems_allowed =
217 cpuset_current_mems_allowed;
218 }
219
220 ret = mpol_ops[pol->mode].create(pol,
221 nodes ? &cpuset_context_nmask : NULL);
222 return ret;
223}
224
225/*
226 * This function just creates a new policy, does some check and simple
227 * initialization. You must invoke mpol_set_nodemask() to set nodes.
228 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 229static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes) 230 nodemask_t *nodes)
188{ 231{
189 struct mempolicy *policy; 232 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192 233
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 234 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 235 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
210 if (((flags & MPOL_F_STATIC_NODES) || 251 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES))) 252 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL); 253 return ERR_PTR(-EINVAL);
213 nodes = NULL; /* flag local alloc */
214 } 254 }
215 } else if (nodes_empty(*nodes)) 255 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL); 256 return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
221 policy->mode = mode; 261 policy->mode = mode;
222 policy->flags = flags; 262 policy->flags = flags;
223 263
224 if (nodes) {
225 /*
226 * cpuset related setup doesn't apply to local allocation
227 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy; 264 return policy;
249} 265}
250 266
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
324/* 340/*
325 * Wrapper for mpol_rebind_policy() that just requires task 341 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy. 342 * pointer, and updates task mempolicy.
343 *
344 * Called with task's alloc_lock held.
327 */ 345 */
328 346
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 347void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
600static long do_set_mempolicy(unsigned short mode, unsigned short flags, 618static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes) 619 nodemask_t *nodes)
602{ 620{
603 struct mempolicy *new; 621 struct mempolicy *new, *old;
604 struct mm_struct *mm = current->mm; 622 struct mm_struct *mm = current->mm;
623 int ret;
605 624
606 new = mpol_new(mode, flags, nodes); 625 new = mpol_new(mode, flags, nodes);
607 if (IS_ERR(new)) 626 if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
615 */ 634 */
616 if (mm) 635 if (mm)
617 down_write(&mm->mmap_sem); 636 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy); 637 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes);
639 if (ret) {
640 task_unlock(current);
641 if (mm)
642 up_write(&mm->mmap_sem);
643 mpol_put(new);
644 return ret;
645 }
646 old = current->mempolicy;
619 current->mempolicy = new; 647 current->mempolicy = new;
620 mpol_set_task_struct_flag(); 648 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE && 649 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes)) 650 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes); 651 current->il_next = first_node(new->v.nodes);
652 task_unlock(current);
624 if (mm) 653 if (mm)
625 up_write(&mm->mmap_sem); 654 up_write(&mm->mmap_sem);
626 655
656 mpol_put(old);
627 return 0; 657 return 0;
628} 658}
629 659
630/* 660/*
631 * Return nodemask for policy for get_mempolicy() query 661 * Return nodemask for policy for get_mempolicy() query
662 *
663 * Called with task's alloc_lock held
632 */ 664 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 665static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{ 666{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 struct vm_area_struct *vma = NULL; 706 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy; 707 struct mempolicy *pol = current->mempolicy;
676 708
677 cpuset_update_task_memory_state();
678 if (flags & 709 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 710 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL; 711 return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 714 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL; 715 return -EINVAL;
685 *policy = 0; /* just so it's initialized */ 716 *policy = 0; /* just so it's initialized */
717 task_lock(current);
686 *nmask = cpuset_current_mems_allowed; 718 *nmask = cpuset_current_mems_allowed;
719 task_unlock(current);
687 return 0; 720 return 0;
688 } 721 }
689 722
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
738 } 771 }
739 772
740 err = 0; 773 err = 0;
741 if (nmask) 774 if (nmask) {
775 task_lock(current);
742 get_policy_nodemask(pol, nmask); 776 get_policy_nodemask(pol, nmask);
777 task_unlock(current);
778 }
743 779
744 out: 780 out:
745 mpol_cond_put(pol); 781 mpol_cond_put(pol);
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 803
768static struct page *new_node_page(struct page *page, unsigned long node, int **x) 804static struct page *new_node_page(struct page *page, unsigned long node, int **x)
769{ 805{
770 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); 806 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
771} 807}
772 808
773/* 809/*
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
979 return err; 1015 return err;
980 } 1016 }
981 down_write(&mm->mmap_sem); 1017 down_write(&mm->mmap_sem);
1018 task_lock(current);
1019 err = mpol_set_nodemask(new, nmask);
1020 task_unlock(current);
1021 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new);
1024 return err;
1025 }
982 vma = check_range(mm, start, end, nmask, 1026 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist); 1027 flags | MPOL_MF_INVERT, &pagelist);
984 1028
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1545 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1589 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 struct zonelist *zl; 1590 struct zonelist *zl;
1547 1591
1548 cpuset_update_task_memory_state();
1549
1550 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1592 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 unsigned nid; 1593 unsigned nid;
1552 1594
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1593{ 1635{
1594 struct mempolicy *pol = current->mempolicy; 1636 struct mempolicy *pol = current->mempolicy;
1595 1637
1596 if ((gfp & __GFP_WAIT) && !in_interrupt())
1597 cpuset_update_task_memory_state();
1598 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1638 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 pol = &default_policy; 1639 pol = &default_policy;
1600 1640
@@ -1854,6 +1894,8 @@ restart:
1854 */ 1894 */
1855void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856{ 1896{
1897 int ret;
1898
1857 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 1899 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 spin_lock_init(&sp->lock); 1900 spin_lock_init(&sp->lock);
1859 1901
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1863 1905
1864 /* contextualize the tmpfs mount point mempolicy */ 1906 /* contextualize the tmpfs mount point mempolicy */
1865 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866 mpol_put(mpol); /* drop our ref on sb mpol */ 1908 if (IS_ERR(new)) {
1867 if (IS_ERR(new)) 1909 mpol_put(mpol); /* drop our ref on sb mpol */
1868 return; /* no valid nodemask intersection */ 1910 return; /* no valid nodemask intersection */
1911 }
1912
1913 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
1915 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) {
1918 mpol_put(new);
1919 return;
1920 }
1869 1921
1870 /* Create pseudo-vma that contains just the policy */ 1922 /* Create pseudo-vma that contains just the policy */
1871 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1923 memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2086 new = mpol_new(mode, mode_flags, &nodes); 2138 new = mpol_new(mode, mode_flags, &nodes);
2087 if (IS_ERR(new)) 2139 if (IS_ERR(new))
2088 err = 1; 2140 err = 1;
2089 else if (no_context) 2141 else {
2090 new->w.user_nodemask = nodes; /* save for contextualization */ 2142 int ret;
2143
2144 task_lock(current);
2145 ret = mpol_set_nodemask(new, &nodes);
2146 task_unlock(current);
2147 if (ret)
2148 err = 1;
2149 else if (no_context) {
2150 /* save for contextualization */
2151 new->w.user_nodemask = nodes;
2152 }
2153 }
2091 2154
2092out: 2155out:
2093 /* Restore string for error message */ 2156 /* Restore string for error message */
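
An illustrative sketch, not part of the patch: the mempolicy.c hunks split policy construction into mpol_new(), which only validates and allocates, followed by mpol_set_nodemask() under task_lock() to contextualize the nodemask, as do_set_mempolicy(), do_mbind() and mpol_parse_str() now do. The helper below is hypothetical and assumes it lives in mm/mempolicy.c, since mpol_set_nodemask() is static there; error handling is abbreviated.

	static struct mempolicy *example_build_policy(unsigned short mode,
						      unsigned short flags,
						      nodemask_t *nodes)
	{
		struct mempolicy *new;
		int err;

		new = mpol_new(mode, flags, nodes);	/* validate + allocate only */
		if (IS_ERR(new))
			return new;

		task_lock(current);			/* protects mems_allowed and mempolicy */
		err = mpol_set_nodemask(new, nodes);	/* contextualize the nodemask */
		task_unlock(current);
		if (err) {
			mpol_put(new);
			return ERR_PTR(err);
		}
		return new;
	}
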
diff --git a/mm/migrate.c b/mm/migrate.c
index 068655d8f883..939888f9ddab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
802 802
803 *result = &pm->status; 803 *result = &pm->status;
804 804
805 return alloc_pages_node(pm->node, 805 return alloc_pages_exact_node(pm->node,
806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
807} 807}
808 808
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
820 struct page_to_node *pp; 820 struct page_to_node *pp;
821 LIST_HEAD(pagelist); 821 LIST_HEAD(pagelist);
822 822
823 migrate_prep();
824 down_read(&mm->mmap_sem); 823 down_read(&mm->mmap_sem);
825 824
826 /* 825 /*
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 906 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
908 if (!pm) 907 if (!pm)
909 goto out; 908 goto out;
909
910 migrate_prep();
911
910 /* 912 /*
911 * Store a chunk of page_to_node array in a page, 913 * Store a chunk of page_to_node array in a page,
912 * but keep the last one as a marker 914 * but keep the last one as a marker
diff --git a/mm/mlock.c b/mm/mlock.c
index cbe9e0581b75..45eb650b9654 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -31,7 +31,6 @@ int can_do_mlock(void)
31} 31}
32EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
33 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/* 34/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing 35 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate 36 * in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval)
261 return retval; 260 return retval;
262} 261}
263 262
264#else /* CONFIG_UNEVICTABLE_LRU */
265
266/*
267 * Just make pages present if VM_LOCKED. No-op if unlocking.
268 */
269static long __mlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end,
271 int mlock)
272{
273 if (mlock && (vma->vm_flags & VM_LOCKED))
274 return make_pages_present(start, end);
275 return 0;
276}
277
278static inline int __mlock_posix_error_return(long retval)
279{
280 return 0;
281}
282
283#endif /* CONFIG_UNEVICTABLE_LRU */
284
285/** 263/**
286 * mlock_vma_pages_range() - mlock pages in specified vma range. 264 * mlock_vma_pages_range() - mlock pages in specified vma range.
287 * @vma - the vma containing the specfied address range 265 * @vma - the vma containing the specfied address range
@@ -629,52 +607,43 @@ void user_shm_unlock(size_t size, struct user_struct *user)
629 free_uid(user); 607 free_uid(user);
630} 608}
631 609
632void *alloc_locked_buffer(size_t size) 610int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
611 size_t size)
633{ 612{
634 unsigned long rlim, vm, pgsz; 613 unsigned long lim, vm, pgsz;
635 void *buffer = NULL; 614 int error = -ENOMEM;
636 615
637 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 616 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
638 617
639 down_write(&current->mm->mmap_sem); 618 down_write(&mm->mmap_sem);
640 619
641 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 620 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
642 vm = current->mm->total_vm + pgsz; 621 vm = mm->total_vm + pgsz;
643 if (rlim < vm) 622 if (lim < vm)
644 goto out; 623 goto out;
645 624
646 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 625 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
647 vm = current->mm->locked_vm + pgsz; 626 vm = mm->locked_vm + pgsz;
648 if (rlim < vm) 627 if (lim < vm)
649 goto out; 628 goto out;
650 629
651 buffer = kzalloc(size, GFP_KERNEL); 630 mm->total_vm += pgsz;
652 if (!buffer) 631 mm->locked_vm += pgsz;
653 goto out;
654
655 current->mm->total_vm += pgsz;
656 current->mm->locked_vm += pgsz;
657 632
633 error = 0;
658 out: 634 out:
659 up_write(&current->mm->mmap_sem); 635 up_write(&mm->mmap_sem);
660 return buffer; 636 return error;
661} 637}
662 638
663void release_locked_buffer(void *buffer, size_t size) 639void refund_locked_memory(struct mm_struct *mm, size_t size)
664{ 640{
665 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 641 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
666 642
667 down_write(&current->mm->mmap_sem); 643 down_write(&mm->mmap_sem);
668
669 current->mm->total_vm -= pgsz;
670 current->mm->locked_vm -= pgsz;
671
672 up_write(&current->mm->mmap_sem);
673}
674 644
675void free_locked_buffer(void *buffer, size_t size) 645 mm->total_vm -= pgsz;
676{ 646 mm->locked_vm -= pgsz;
677 release_locked_buffer(buffer, size);
678 647
679 kfree(buffer); 648 up_write(&mm->mmap_sem);
680} 649}
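
An illustrative sketch, not part of the patch: alloc_locked_buffer()/free_locked_buffer() are replaced by the accounting-only helpers account_locked_memory() and refund_locked_memory(), so a caller now pairs them with its own allocation. The wrapper below is hypothetical; the helper names and signatures come from the hunk.

	#include <linux/mm.h>
	#include <linux/sched.h>
	#include <linux/slab.h>

	static void *example_alloc_accounted(struct mm_struct *mm, size_t size)
	{
		void *buf;

		/* charges total_vm/locked_vm against RLIMIT_AS and RLIMIT_MEMLOCK */
		if (account_locked_memory(mm, current->signal->rlim, size))
			return NULL;

		buf = kzalloc(size, GFP_KERNEL);
		if (!buf)
			refund_locked_memory(mm, size);	/* undo the charge on failure */
		return buf;
	}
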
diff --git a/mm/mmap.c b/mm/mmap.c
index 3303d1ba8e87..34579b23ebd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -85,7 +86,10 @@ EXPORT_SYMBOL(vm_get_page_prot);
85int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 86int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
86int sysctl_overcommit_ratio = 50; /* default is 50% */ 87int sysctl_overcommit_ratio = 50; /* default is 50% */
87int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
88atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 89struct percpu_counter vm_committed_as;
90
91/* amount of vm to protect from userspace access */
92unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
89 93
90/* 94/*
91 * Check that a process has enough memory to allocate a new virtual 95 * Check that a process has enough memory to allocate a new virtual
@@ -179,11 +183,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
179 if (mm) 183 if (mm)
180 allowed -= mm->total_vm / 32; 184 allowed -= mm->total_vm / 32;
181 185
182 /* 186 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
183 * cast `allowed' as a signed long because vm_committed_space
184 * sometimes has a negative value
185 */
186 if (atomic_long_read(&vm_committed_space) < (long)allowed)
187 return 0; 187 return 0;
188error: 188error:
189 vm_unacct_memory(pages); 189 vm_unacct_memory(pages);
@@ -1223,6 +1223,8 @@ munmap_back:
1223 if (correct_wcount) 1223 if (correct_wcount)
1224 atomic_inc(&inode->i_writecount); 1224 atomic_inc(&inode->i_writecount);
1225out: 1225out:
1226 perf_counter_mmap(vma);
1227
1226 mm->total_vm += len >> PAGE_SHIFT; 1228 mm->total_vm += len >> PAGE_SHIFT;
1227 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1229 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1228 if (vm_flags & VM_LOCKED) { 1230 if (vm_flags & VM_LOCKED) {
@@ -2309,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm,
2309 2311
2310 mm->total_vm += len >> PAGE_SHIFT; 2312 mm->total_vm += len >> PAGE_SHIFT;
2311 2313
2314 perf_counter_mmap(vma);
2315
2312 return 0; 2316 return 0;
2313} 2317}
2314 2318
@@ -2481,4 +2485,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
2481 */ 2485 */
2482void __init mmap_init(void) 2486void __init mmap_init(void)
2483{ 2487{
2488 int ret;
2489
2490 ret = percpu_counter_init(&vm_committed_as, 0);
2491 VM_BUG_ON(ret);
2484} 2492}
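
An illustrative sketch, not part of the patch: vm_committed_space becomes the percpu_counter vm_committed_as, so writers add batched per-CPU deltas and the overcommit check reads an approximate, never-negative sum, which is why the signed-cast comment could go. The wrappers below are hypothetical; the real accounting helper lives elsewhere in mm/.

	#include <linux/percpu_counter.h>

	static inline void example_acct_memory(long pages)
	{
		percpu_counter_add(&vm_committed_as, pages);	/* batched per-CPU update */
	}

	static inline int example_within_commit_limit(unsigned long allowed)
	{
		/* transiently negative sums read back as 0, unlike the old atomic */
		return percpu_counter_read_positive(&vm_committed_as) < allowed;
	}
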
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 16ce8b955dcf..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -6,6 +6,7 @@
6 6
7 7
8#include <linux/stddef.h> 8#include <linux/stddef.h>
9#include <linux/mm.h>
9#include <linux/mmzone.h> 10#include <linux/mmzone.h>
10#include <linux/module.h> 11#include <linux/module.h>
11 12
@@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
72 *zone = zonelist_zone(z); 73 *zone = zonelist_zone(z);
73 return z; 74 return z;
74} 75}
76
77#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
78int memmap_valid_within(unsigned long pfn,
79 struct page *page, struct zone *zone)
80{
81 if (page_to_pfn(page) != pfn)
82 return 0;
83
84 if (page_zone(page) != zone)
85 return 0;
86
87 return 1;
88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
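
An illustrative sketch, not part of the patch: a pfn walker on an architecture with memmap holes (CONFIG_ARCH_HAS_HOLES_MEMORYMODEL) would call the new memmap_valid_within() before touching a struct page; the walker itself is hypothetical.

	#include <linux/mm.h>
	#include <linux/mmzone.h>

	static void example_walk_zone_pfns(struct zone *zone,
					   unsigned long start_pfn,
					   unsigned long end_pfn)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
			struct page *page = pfn_to_page(pfn);

			/* skip memmap entries that sit inside a hole */
			if (!memmap_valid_within(pfn, page, zone))
				continue;

			/* ... safe to inspect the page here ... */
		}
	}
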
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b76fb4..d80311baeb2d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
299 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
300 if (error) 301 if (error)
301 goto out; 302 goto out;
303 perf_counter_mmap(vma);
302 nstart = tmp; 304 nstart = tmp;
303 305
304 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/nommu.c b/mm/nommu.c
index 72eda4aee2cb..2fd2ad5da98e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,13 +62,16 @@ void *high_memory;
62struct page *mem_map; 62struct page *mem_map;
63unsigned long max_mapnr; 63unsigned long max_mapnr;
64unsigned long num_physpages; 64unsigned long num_physpages;
65atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 65struct percpu_counter vm_committed_as;
66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
67int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ 69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72/* amount of vm to protect from userspace access */
73unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
74
72atomic_long_t mmap_pages_allocated; 75atomic_long_t mmap_pages_allocated;
73 76
74EXPORT_SYMBOL(mem_map); 77EXPORT_SYMBOL(mem_map);
@@ -463,6 +466,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
463 */ 466 */
464void __init mmap_init(void) 467void __init mmap_init(void)
465{ 468{
469 int ret;
470
471 ret = percpu_counter_init(&vm_committed_as, 0);
472 VM_BUG_ON(ret);
466 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 473 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
467} 474}
468 475
@@ -511,8 +518,6 @@ static void add_nommu_region(struct vm_region *region)
511 518
512 validate_nommu_regions(); 519 validate_nommu_regions();
513 520
514 BUG_ON(region->vm_start & ~PAGE_MASK);
515
516 parent = NULL; 521 parent = NULL;
517 p = &nommu_region_tree.rb_node; 522 p = &nommu_region_tree.rb_node;
518 while (*p) { 523 while (*p) {
@@ -1847,12 +1852,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1847 if (mm) 1852 if (mm)
1848 allowed -= mm->total_vm / 32; 1853 allowed -= mm->total_vm / 32;
1849 1854
1850 /* 1855 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1851 * cast `allowed' as a signed long because vm_committed_space
1852 * sometimes has a negative value
1853 */
1854 if (atomic_long_read(&vm_committed_space) < (long)allowed)
1855 return 0; 1856 return 0;
1857
1856error: 1858error:
1857 vm_unacct_memory(pages); 1859 vm_unacct_memory(pages);
1858 1860
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2f3166e308d9..175a67a78a99 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
61 62
62 task_lock(p); 63 task_lock(p);
63 mm = p->mm; 64 mm = p->mm;
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
65 task_unlock(p); 66 task_unlock(p);
66 return 0; 67 return 0;
67 } 68 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
68 74
69 /* 75 /*
70 * The memory size of the process is the basis for the badness. 76 * The memory size of the process is the basis for the badness.
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
148 points /= 8; 154 points /= 8;
149 155
150 /* 156 /*
151 * Adjust the score by oomkilladj. 157 * Adjust the score by oom_adj.
152 */ 158 */
153 if (p->oomkilladj) { 159 if (oom_adj) {
154 if (p->oomkilladj > 0) { 160 if (oom_adj > 0) {
155 if (!points) 161 if (!points)
156 points = 1; 162 points = 1;
157 points <<= p->oomkilladj; 163 points <<= oom_adj;
158 } else 164 } else
159 points >>= -(p->oomkilladj); 165 points >>= -(oom_adj);
160 } 166 }
161 167
162#ifdef DEBUG 168#ifdef DEBUG
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 257 *ppoints = ULONG_MAX;
252 } 258 }
253 259
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
257 points = badness(p, uptime.tv_sec); 260 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 261 if (points > *ppoints) {
259 chosen = p; 262 chosen = p;
260 *ppoints = points; 263 *ppoints = points;
261 } 264 }
@@ -284,22 +287,27 @@ static void dump_tasks(const struct mem_cgroup *mem)
284 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " 287 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
285 "name\n"); 288 "name\n");
286 do_each_thread(g, p) { 289 do_each_thread(g, p) {
287 /* 290 struct mm_struct *mm;
288 * total_vm and rss sizes do not exist for tasks with a 291
289 * detached mm so there's no need to report them.
290 */
291 if (!p->mm)
292 continue;
293 if (mem && !task_in_mem_cgroup(p, mem)) 292 if (mem && !task_in_mem_cgroup(p, mem))
294 continue; 293 continue;
295 if (!thread_group_leader(p)) 294 if (!thread_group_leader(p))
296 continue; 295 continue;
297 296
298 task_lock(p); 297 task_lock(p);
298 mm = p->mm;
299 if (!mm) {
300 /*
301 * total_vm and rss sizes do not exist for tasks with no
302 * mm so there's no need to report them; they can't be
303 * oom killed anyway.
304 */
305 task_unlock(p);
306 continue;
307 }
299 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
300 p->pid, __task_cred(p)->uid, p->tgid, 309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
301 p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), 310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
302 p->oomkilladj, p->comm);
303 task_unlock(p); 311 task_unlock(p);
304 } while_each_thread(g, p); 312 } while_each_thread(g, p);
305} 313}
@@ -317,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
317 return; 325 return;
318 } 326 }
319 327
320 if (!p->mm) { 328 if (!p->mm)
321 WARN_ON(1);
322 printk(KERN_WARNING "tried to kill an mm-less task!\n");
323 return; 329 return;
324 }
325 330
326 if (verbose) 331 if (verbose)
327 printk(KERN_ERR "Killed process %d (%s)\n", 332 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -343,28 +348,13 @@ static int oom_kill_task(struct task_struct *p)
343 struct mm_struct *mm; 348 struct mm_struct *mm;
344 struct task_struct *g, *q; 349 struct task_struct *g, *q;
345 350
351 task_lock(p);
346 mm = p->mm; 352 mm = p->mm;
347 353 if (!mm || mm->oom_adj == OOM_DISABLE) {
348 /* WARNING: mm may not be dereferenced since we did not obtain its 354 task_unlock(p);
349 * value from get_task_mm(p). This is OK since all we need to do is
350 * compare mm to q->mm below.
351 *
352 * Furthermore, even if mm contains a non-NULL value, p->mm may
353 * change to NULL at any time since we do not hold task_lock(p).
354 * However, this is of no concern to us.
355 */
356
357 if (mm == NULL)
358 return 1; 355 return 1;
359 356 }
360 /* 357 task_unlock(p);
361 * Don't kill the process if any threads are set to OOM_DISABLE
362 */
363 do_each_thread(g, q) {
364 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
365 return 1;
366 } while_each_thread(g, q);
367
368 __oom_kill_task(p, 1); 358 __oom_kill_task(p, 1);
369 359
370 /* 360 /*
@@ -387,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
387 struct task_struct *c; 377 struct task_struct *c;
388 378
389 if (printk_ratelimit()) { 379 if (printk_ratelimit()) {
390 printk(KERN_WARNING "%s invoked oom-killer: "
391 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
392 current->comm, gfp_mask, order, current->oomkilladj);
393 task_lock(current); 380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
383 current->comm, gfp_mask, order,
384 current->mm ? current->mm->oom_adj : OOM_DISABLE);
394 cpuset_print_task_mems_allowed(current); 385 cpuset_print_task_mems_allowed(current);
395 task_unlock(current); 386 task_unlock(current);
396 dump_stack(); 387 dump_stack();
@@ -403,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
403 /* 394 /*
404 * If the task is already exiting, don't alarm the sysadmin or kill 395 * If the task is already exiting, don't alarm the sysadmin or kill
405 * its children or threads, just set TIF_MEMDIE so it can die quickly 396 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
406 */ 398 */
407 if (p->flags & PF_EXITING) { 399 if (p->mm && (p->flags & PF_EXITING)) {
408 __oom_kill_task(p, 0); 400 __oom_kill_task(p, 0);
409 return 0; 401 return 0;
410 } 402 }
@@ -514,34 +506,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
514 */ 506 */
515static void __out_of_memory(gfp_t gfp_mask, int order) 507static void __out_of_memory(gfp_t gfp_mask, int order)
516{ 508{
517 if (sysctl_oom_kill_allocating_task) { 509 struct task_struct *p;
518 oom_kill_process(current, gfp_mask, order, 0, NULL, 510 unsigned long points;
519 "Out of memory (oom_kill_allocating_task)");
520
521 } else {
522 unsigned long points;
523 struct task_struct *p;
524
525retry:
526 /*
527 * Rambo mode: Shoot down a process and hope it solves whatever
528 * issues we may have.
529 */
530 p = select_bad_process(&points, NULL);
531 511
532 if (PTR_ERR(p) == -1UL) 512 if (sysctl_oom_kill_allocating_task)
513 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
514 "Out of memory (oom_kill_allocating_task)"))
533 return; 515 return;
516retry:
517 /*
518 * Rambo mode: Shoot down a process and hope it solves whatever
519 * issues we may have.
520 */
521 p = select_bad_process(&points, NULL);
534 522
535 /* Found nothing?!?! Either we hang forever, or we panic. */ 523 if (PTR_ERR(p) == -1UL)
536 if (!p) { 524 return;
537 read_unlock(&tasklist_lock);
538 panic("Out of memory and no killable processes...\n");
539 }
540 525
541 if (oom_kill_process(p, gfp_mask, order, points, NULL, 526 /* Found nothing?!?! Either we hang forever, or we panic. */
542 "Out of memory")) 527 if (!p) {
543 goto retry; 528 read_unlock(&tasklist_lock);
529 panic("Out of memory and no killable processes...\n");
544 } 530 }
531
532 if (oom_kill_process(p, gfp_mask, order, points, NULL,
533 "Out of memory"))
534 goto retry;
545} 535}
546 536
547/* 537/*
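
A minimal sketch, not part of the patch, of the score adjustment badness() applies above, now driven by mm->oom_adj read under task_lock() rather than the per-task oomkilladj field:

	static unsigned long example_apply_oom_adj(unsigned long points, int oom_adj)
	{
		if (oom_adj > 0) {
			if (!points)
				points = 1;
			points <<= oom_adj;	/* positive adj inflates the score */
		} else if (oom_adj < 0) {
			points >>= -oom_adj;	/* negative adj shields the task */
		}
		return points;
	}
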
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 30351f0063ac..7b0dcea4935b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -94,12 +94,12 @@ unsigned long vm_dirty_bytes;
94/* 94/*
95 * The interval between `kupdate'-style writebacks 95 * The interval between `kupdate'-style writebacks
96 */ 96 */
97unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ 97unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
98 98
99/* 99/*
100 * The longest time for which data is allowed to remain dirty 100 * The longest time for which data is allowed to remain dirty
101 */ 101 */
102unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ 102unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
103 103
104/* 104/*
105 * Flag that makes the machine dump writes/reads and block dirtyings. 105 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
265 * This avoids exceeding the total dirty_limit when the floating averages 265 * This avoids exceeding the total dirty_limit when the floating averages
266 * fluctuate too quickly. 266 * fluctuate too quickly.
267 */ 267 */
268static void 268static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
269clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) 269 unsigned long dirty, unsigned long *pbdi_dirty)
270{ 270{
271 long avail_dirty; 271 unsigned long avail_dirty;
272 272
273 avail_dirty = dirty - 273 avail_dirty = global_page_state(NR_FILE_DIRTY) +
274 (global_page_state(NR_FILE_DIRTY) +
275 global_page_state(NR_WRITEBACK) + 274 global_page_state(NR_WRITEBACK) +
276 global_page_state(NR_UNSTABLE_NFS) + 275 global_page_state(NR_UNSTABLE_NFS) +
277 global_page_state(NR_WRITEBACK_TEMP)); 276 global_page_state(NR_WRITEBACK_TEMP);
278 277
279 if (avail_dirty < 0) 278 if (avail_dirty < dirty)
279 avail_dirty = dirty - avail_dirty;
280 else
280 avail_dirty = 0; 281 avail_dirty = 0;
281 282
282 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + 283 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
299 * 300 *
300 * dirty -= (dirty/8) * p_{t} 301 * dirty -= (dirty/8) * p_{t}
301 */ 302 */
302static void task_dirty_limit(struct task_struct *tsk, long *pdirty) 303static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303{ 304{
304 long numerator, denominator; 305 long numerator, denominator;
305 long dirty = *pdirty; 306 unsigned long dirty = *pdirty;
306 u64 inv = dirty >> 3; 307 u64 inv = dirty >> 3;
307 308
308 task_dirties_fraction(tsk, &numerator, &denominator); 309 task_dirties_fraction(tsk, &numerator, &denominator);
@@ -770,7 +771,7 @@ static void wb_kupdate(unsigned long arg)
770 771
771 sync_supers(); 772 sync_supers();
772 773
773 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); 774 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
774 start_jif = jiffies; 775 start_jif = jiffies;
775 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); 776 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
776 nr_to_write = global_page_state(NR_FILE_DIRTY) + 777 nr_to_write = global_page_state(NR_FILE_DIRTY) +
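
A minimal sketch, not part of the patch: both writeback intervals are stored in centiseconds, so converting to jiffies goes through milliseconds with a factor of 10, which is the fix wb_kupdate() gains for dirty_expire_interval above; the helper name is hypothetical.

	#include <linux/jiffies.h>

	static inline unsigned long example_centisecs_to_jiffies(unsigned int cs)
	{
		return msecs_to_jiffies(cs * 10);	/* 1 centisecond = 10 ms */
	}
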
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f26991fff1..a5f3c278c573 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -46,6 +47,7 @@
46#include <linux/page-isolation.h> 47#include <linux/page-isolation.h>
47#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h>
49 51
50#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
51#include <asm/div64.h> 53#include <asm/div64.h>
@@ -149,10 +151,6 @@ static unsigned long __meminitdata dma_reserve;
149 static int __meminitdata nr_nodemap_entries; 151 static int __meminitdata nr_nodemap_entries;
150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 152 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 153 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
152#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 static unsigned long __initdata required_kernelcore; 154 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 155 static unsigned long __initdata required_movablecore;
158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 156 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -164,17 +162,25 @@ static unsigned long __meminitdata dma_reserve;
164 162
165#if MAX_NUMNODES > 1 163#if MAX_NUMNODES > 1
166int nr_node_ids __read_mostly = MAX_NUMNODES; 164int nr_node_ids __read_mostly = MAX_NUMNODES;
165int nr_online_nodes __read_mostly = 1;
167EXPORT_SYMBOL(nr_node_ids); 166EXPORT_SYMBOL(nr_node_ids);
167EXPORT_SYMBOL(nr_online_nodes);
168#endif 168#endif
169 169
170int page_group_by_mobility_disabled __read_mostly; 170int page_group_by_mobility_disabled __read_mostly;
171 171
172static void set_pageblock_migratetype(struct page *page, int migratetype) 172static void set_pageblock_migratetype(struct page *page, int migratetype)
173{ 173{
174
175 if (unlikely(page_group_by_mobility_disabled))
176 migratetype = MIGRATE_UNMOVABLE;
177
174 set_pageblock_flags_group(page, (unsigned long)migratetype, 178 set_pageblock_flags_group(page, (unsigned long)migratetype,
175 PB_migrate, PB_migrate_end); 179 PB_migrate, PB_migrate_end);
176} 180}
177 181
182bool oom_killer_disabled __read_mostly;
183
178#ifdef CONFIG_DEBUG_VM 184#ifdef CONFIG_DEBUG_VM
179static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 185static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
180{ 186{
@@ -297,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order)
297 } 303 }
298} 304}
299 305
300#ifdef CONFIG_HUGETLBFS
301void prep_compound_gigantic_page(struct page *page, unsigned long order)
302{
303 int i;
304 int nr_pages = 1 << order;
305 struct page *p = page + 1;
306
307 set_compound_page_dtor(page, free_compound_page);
308 set_compound_order(page, order);
309 __SetPageHead(page);
310 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
311 __SetPageTail(p);
312 p->first_page = page;
313 }
314}
315#endif
316
317static int destroy_compound_page(struct page *page, unsigned long order) 306static int destroy_compound_page(struct page *page, unsigned long order)
318{ 307{
319 int i; 308 int i;
@@ -420,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
420 return 0; 409 return 0;
421 410
422 if (PageBuddy(buddy) && page_order(buddy) == order) { 411 if (PageBuddy(buddy) && page_order(buddy) == order) {
423 BUG_ON(page_count(buddy) != 0); 412 VM_BUG_ON(page_count(buddy) != 0);
424 return 1; 413 return 1;
425 } 414 }
426 return 0; 415 return 0;
@@ -451,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
451 */ 440 */
452 441
453static inline void __free_one_page(struct page *page, 442static inline void __free_one_page(struct page *page,
454 struct zone *zone, unsigned int order) 443 struct zone *zone, unsigned int order,
444 int migratetype)
455{ 445{
456 unsigned long page_idx; 446 unsigned long page_idx;
457 int order_size = 1 << order;
458 int migratetype = get_pageblock_migratetype(page);
459 447
460 if (unlikely(PageCompound(page))) 448 if (unlikely(PageCompound(page)))
461 if (unlikely(destroy_compound_page(page, order))) 449 if (unlikely(destroy_compound_page(page, order)))
462 return; 450 return;
463 451
452 VM_BUG_ON(migratetype == -1);
453
464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 454 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
465 455
466 VM_BUG_ON(page_idx & (order_size - 1)); 456 VM_BUG_ON(page_idx & ((1 << order) - 1));
467 VM_BUG_ON(bad_range(zone, page)); 457 VM_BUG_ON(bad_range(zone, page));
468 458
469 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
470 while (order < MAX_ORDER-1) { 459 while (order < MAX_ORDER-1) {
471 unsigned long combined_idx; 460 unsigned long combined_idx;
472 struct page *buddy; 461 struct page *buddy;
@@ -490,12 +479,27 @@ static inline void __free_one_page(struct page *page,
490 zone->free_area[order].nr_free++; 479 zone->free_area[order].nr_free++;
491} 480}
492 481
482#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
483/*
484 * free_page_mlock() -- clean up attempts to free and mlocked() page.
485 * Page should not be on lru, so no need to fix that up.
486 * free_pages_check() will verify...
487 */
488static inline void free_page_mlock(struct page *page)
489{
490 __ClearPageMlocked(page);
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
493static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
494{ 499{
495 free_page_mlock(page);
496 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
497 (page->mapping != NULL) | 501 (page->mapping != NULL) |
498 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
500 bad_page(page); 504 bad_page(page);
501 return 1; 505 return 1;
@@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
522 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
523 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
524 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
525 while (count--) { 531 while (count--) {
526 struct page *page; 532 struct page *page;
527 533
@@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
529 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
530 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
531 list_del(&page->lru); 537 list_del(&page->lru);
532 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
533 } 539 }
534 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
535} 541}
536 542
537static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
538{ 545{
539 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
540 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
541 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
542 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
543 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
544} 553}
545 554
@@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
548 unsigned long flags; 557 unsigned long flags;
549 int i; 558 int i;
550 int bad = 0; 559 int bad = 0;
560 int clearMlocked = PageMlocked(page);
561
562 kmemcheck_free_shadow(page, order);
551 563
552 for (i = 0 ; i < (1 << order) ; ++i) 564 for (i = 0 ; i < (1 << order) ; ++i)
553 bad += free_pages_check(page + i); 565 bad += free_pages_check(page + i);
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
563 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
564 576
565 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(clearMlocked))
579 free_page_mlock(page);
566 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
567 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
568 local_irq_restore(flags); 583 local_irq_restore(flags);
569} 584}
570 585
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
635{ 650{
636 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 652 (page->mapping != NULL) |
638 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
640 bad_page(page); 655 bad_page(page);
641 return 1; 656 return 1;
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
660 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
661 * the smallest available page from the freelists 676 * the smallest available page from the freelists
662 */ 677 */
663static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
664 int migratetype) 680 int migratetype)
665{ 681{
666 unsigned int current_order; 682 unsigned int current_order;
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
678 list_del(&page->lru); 694 list_del(&page->lru);
679 rmv_page_order(page); 695 rmv_page_order(page);
680 area->nr_free--; 696 area->nr_free--;
681 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
682 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
683 return page; 698 return page;
684 } 699 }
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
769} 784}
770 785
771/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
772static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
773 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
774{ 789{
775 struct free_area * area; 790 struct free_area * area;
776 int current_order; 791 int current_order;
@@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
818 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
819 list_del(&page->lru); 834 list_del(&page->lru);
820 rmv_page_order(page); 835 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823 836
824 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
825 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
830 } 843 }
831 } 844 }
832 845
833 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835} 847}
836 848
837/* 849/*
@@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
843{ 855{
844 struct page *page; 856 struct page *page;
845 857
858retry_reserve:
846 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
847 860
848 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
849 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
850 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
851 return page; 875 return page;
852} 876}
853 877
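
An illustrative sketch, not part of the patch: the allocation order the reworked __rmqueue() implements, requested migratetype first, then the fallback lists, and MIGRATE_RESERVE only as a last resort. The real code uses a goto so the inline __rmqueue_smallest() keeps a single call site; the unrolled form below is for readability only.

	static struct page *example_rmqueue(struct zone *zone, unsigned int order,
					    int migratetype)
	{
		struct page *page;

		page = __rmqueue_smallest(zone, order, migratetype);
		if (page || migratetype == MIGRATE_RESERVE)
			return page;

		page = __rmqueue_fallback(zone, order, migratetype);
		if (page)
			return page;

		/* last resort: dip into the reserve pageblocks */
		return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
	}
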
@@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
881 set_page_private(page, migratetype); 905 set_page_private(page, migratetype);
882 list = &page->lru; 906 list = &page->lru;
883 } 907 }
908 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
884 spin_unlock(&zone->lock); 909 spin_unlock(&zone->lock);
885 return i; 910 return i;
886} 911}
@@ -996,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold)
996 struct zone *zone = page_zone(page); 1021 struct zone *zone = page_zone(page);
997 struct per_cpu_pages *pcp; 1022 struct per_cpu_pages *pcp;
998 unsigned long flags; 1023 unsigned long flags;
1024 int clearMlocked = PageMlocked(page);
1025
1026 kmemcheck_free_shadow(page, 0);
999 1027
1000 if (PageAnon(page)) 1028 if (PageAnon(page))
1001 page->mapping = NULL; 1029 page->mapping = NULL;
@@ -1010,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1010 kernel_map_pages(page, 1, 0); 1038 kernel_map_pages(page, 1, 0);
1011 1039
1012 pcp = &zone_pcp(zone, get_cpu())->pcp; 1040 pcp = &zone_pcp(zone, get_cpu())->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page));
1013 local_irq_save(flags); 1042 local_irq_save(flags);
1043 if (unlikely(clearMlocked))
1044 free_page_mlock(page);
1014 __count_vm_event(PGFREE); 1045 __count_vm_event(PGFREE);
1046
1015 if (cold) 1047 if (cold)
1016 list_add_tail(&page->lru, &pcp->list); 1048 list_add_tail(&page->lru, &pcp->list);
1017 else 1049 else
1018 list_add(&page->lru, &pcp->list); 1050 list_add(&page->lru, &pcp->list);
1019 set_page_private(page, get_pageblock_migratetype(page));
1020 pcp->count++; 1051 pcp->count++;
1021 if (pcp->count >= pcp->high) { 1052 if (pcp->count >= pcp->high) {
1022 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1050,6 +1081,16 @@ void split_page(struct page *page, unsigned int order)
1050 1081
1051 VM_BUG_ON(PageCompound(page)); 1082 VM_BUG_ON(PageCompound(page));
1052 VM_BUG_ON(!page_count(page)); 1083 VM_BUG_ON(!page_count(page));
1084
1085#ifdef CONFIG_KMEMCHECK
1086 /*
1087 * Split shadow pages too, because free(page[0]) would
1088 * otherwise free the whole shadow.
1089 */
1090 if (kmemcheck_page_is_tracked(page))
1091 split_page(virt_to_page(page[0].shadow), order);
1092#endif
1093
1053 for (i = 1; i < (1 << order); i++) 1094 for (i = 1; i < (1 << order); i++)
1054 set_page_refcounted(page + i); 1095 set_page_refcounted(page + i);
1055} 1096}
@@ -1059,14 +1100,15 @@ void split_page(struct page *page, unsigned int order)
1059 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1100 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1060 * or two. 1101 * or two.
1061 */ 1102 */
1062static struct page *buffered_rmqueue(struct zone *preferred_zone, 1103static inline
1063 struct zone *zone, int order, gfp_t gfp_flags) 1104struct page *buffered_rmqueue(struct zone *preferred_zone,
1105 struct zone *zone, int order, gfp_t gfp_flags,
1106 int migratetype)
1064{ 1107{
1065 unsigned long flags; 1108 unsigned long flags;
1066 struct page *page; 1109 struct page *page;
1067 int cold = !!(gfp_flags & __GFP_COLD); 1110 int cold = !!(gfp_flags & __GFP_COLD);
1068 int cpu; 1111 int cpu;
1069 int migratetype = allocflags_to_migratetype(gfp_flags);
1070 1112
1071again: 1113again:
1072 cpu = get_cpu(); 1114 cpu = get_cpu();
@@ -1103,8 +1145,22 @@ again:
1103 list_del(&page->lru); 1145 list_del(&page->lru);
1104 pcp->count--; 1146 pcp->count--;
1105 } else { 1147 } else {
1148 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1149 /*
1150 * __GFP_NOFAIL is not to be used in new code.
1151 *
1152 * All __GFP_NOFAIL callers should be fixed so that they
1153 * properly detect and handle allocation failures.
1154 *
1155 * We most definitely don't want callers attempting to
1156 * allocate greater than single-page units with
1157 * __GFP_NOFAIL.
1158 */
1159 WARN_ON_ONCE(order > 0);
1160 }
1106 spin_lock_irqsave(&zone->lock, flags); 1161 spin_lock_irqsave(&zone->lock, flags);
1107 page = __rmqueue(zone, order, migratetype); 1162 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1108 spin_unlock(&zone->lock); 1164 spin_unlock(&zone->lock);
1109 if (!page) 1165 if (!page)
1110 goto failed; 1166 goto failed;
@@ -1126,10 +1182,15 @@ failed:
1126 return NULL; 1182 return NULL;
1127} 1183}
1128 1184
1129#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1185/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1130#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1186#define ALLOC_WMARK_MIN WMARK_MIN
1131#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1187#define ALLOC_WMARK_LOW WMARK_LOW
1132#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1188#define ALLOC_WMARK_HIGH WMARK_HIGH
1189#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1190
1191/* Mask to get the watermark bits */
1192#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1193
1133#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1194#define ALLOC_HARDER 0x10 /* try to alloc harder */
1134#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1195#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1135#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1196#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
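The hunk above turns ALLOC_WMARK_MIN/LOW/HIGH from independent flag bits into an index into zone->watermark[], with ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS - 1) extracting that index from alloc_flags. A minimal userspace sketch of the same encoding; the flag values and watermark numbers are illustrative, not the kernel's definitions:

#include <assert.h>
#include <stdio.h>

enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

#define ALLOC_WMARK_MIN     WMARK_MIN
#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_WMARK_HIGH    WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04		/* first bit above the index */
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HIGH          0x20

int main(void)
{
	unsigned long watermark[NR_WMARK] = { 128, 160, 192 };
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH;

	/* the low bits select which watermark to test against */
	unsigned long mark = watermark[alloc_flags & ALLOC_WMARK_MASK];

	assert(mark == 160);
	printf("checking against watermark %lu\n", mark);
	return 0;
}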
@@ -1387,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1387 */ 1448 */
1388static struct page * 1449static struct page *
1389get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1450get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1390 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1451 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1452 struct zone *preferred_zone, int migratetype)
1391{ 1453{
1392 struct zoneref *z; 1454 struct zoneref *z;
1393 struct page *page = NULL; 1455 struct page *page = NULL;
1394 int classzone_idx; 1456 int classzone_idx;
1395 struct zone *zone, *preferred_zone; 1457 struct zone *zone;
1396 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1458 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1397 int zlc_active = 0; /* set if using zonelist_cache */ 1459 int zlc_active = 0; /* set if using zonelist_cache */
1398 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1460 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1399 1461
1400 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1401 &preferred_zone);
1402 if (!preferred_zone)
1403 return NULL;
1404
1405 classzone_idx = zone_idx(preferred_zone); 1462 classzone_idx = zone_idx(preferred_zone);
1406
1407zonelist_scan: 1463zonelist_scan:
1408 /* 1464 /*
1409 * Scan zonelist, looking for a zone with enough free. 1465 * Scan zonelist, looking for a zone with enough free.
@@ -1418,31 +1474,49 @@ zonelist_scan:
1418 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1474 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1419 goto try_next_zone; 1475 goto try_next_zone;
1420 1476
1477 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1421 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1478 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1422 unsigned long mark; 1479 unsigned long mark;
1423 if (alloc_flags & ALLOC_WMARK_MIN) 1480 int ret;
1424 mark = zone->pages_min; 1481
1425 else if (alloc_flags & ALLOC_WMARK_LOW) 1482 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1426 mark = zone->pages_low; 1483 if (zone_watermark_ok(zone, order, mark,
1427 else 1484 classzone_idx, alloc_flags))
1428 mark = zone->pages_high; 1485 goto try_this_zone;
1429 if (!zone_watermark_ok(zone, order, mark, 1486
1430 classzone_idx, alloc_flags)) { 1487 if (zone_reclaim_mode == 0)
1431 if (!zone_reclaim_mode || 1488 goto this_zone_full;
1432 !zone_reclaim(zone, gfp_mask, order)) 1489
1490 ret = zone_reclaim(zone, gfp_mask, order);
1491 switch (ret) {
1492 case ZONE_RECLAIM_NOSCAN:
1493 /* did not scan */
1494 goto try_next_zone;
1495 case ZONE_RECLAIM_FULL:
1496 /* scanned but unreclaimable */
1497 goto this_zone_full;
1498 default:
1499 /* did we reclaim enough */
1500 if (!zone_watermark_ok(zone, order, mark,
1501 classzone_idx, alloc_flags))
1433 goto this_zone_full; 1502 goto this_zone_full;
1434 } 1503 }
1435 } 1504 }
1436 1505
1437 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1506try_this_zone:
1507 page = buffered_rmqueue(preferred_zone, zone, order,
1508 gfp_mask, migratetype);
1438 if (page) 1509 if (page)
1439 break; 1510 break;
1440this_zone_full: 1511this_zone_full:
1441 if (NUMA_BUILD) 1512 if (NUMA_BUILD)
1442 zlc_mark_zone_full(zonelist, z); 1513 zlc_mark_zone_full(zonelist, z);
1443try_next_zone: 1514try_next_zone:
1444 if (NUMA_BUILD && !did_zlc_setup) { 1515 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1445 /* we do zlc_setup after the first zone is tried */ 1516 /*
1517 * we do zlc_setup after the first zone is tried but only
 1518 * if there are multiple nodes to make it worthwhile
1519 */
1446 allowednodes = zlc_setup(zonelist, alloc_flags); 1520 allowednodes = zlc_setup(zonelist, alloc_flags);
1447 zlc_active = 1; 1521 zlc_active = 1;
1448 did_zlc_setup = 1; 1522 did_zlc_setup = 1;
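The rewritten scan above replaces the old boolean zone_reclaim() check with a three-way result: ZONE_RECLAIM_NOSCAN moves on to the next zone, ZONE_RECLAIM_FULL marks the zone full, and any other return rechecks the watermark to see whether enough was reclaimed. A self-contained sketch of that dispatch, with invented page counts and a toy watermark test standing in for the real predicates:

#include <stdio.h>

enum { ZONE_RECLAIM_NOSCAN = -2, ZONE_RECLAIM_FULL = -1 };

/* toy stand-in for zone_watermark_ok() */
static int watermark_ok(int free_pages, int mark) { return free_pages >= mark; }

static const char *classify(int free_pages, int mark, int reclaim_ret)
{
	if (watermark_ok(free_pages, mark))
		return "try this zone";

	switch (reclaim_ret) {
	case ZONE_RECLAIM_NOSCAN:
		return "not scanned, try next zone";
	case ZONE_RECLAIM_FULL:
		return "scanned but unreclaimable, mark zone full";
	default:
		/* did we reclaim enough? (simplified: add what was freed) */
		return watermark_ok(free_pages + reclaim_ret, mark) ?
			"try this zone" : "mark zone full";
	}
}

int main(void)
{
	printf("%s\n", classify(200, 160, 0));			/* plenty free */
	printf("%s\n", classify(100, 160, ZONE_RECLAIM_NOSCAN));	/* reclaim off */
	printf("%s\n", classify(100, 160, 80));			/* reclaimed 80 */
	return 0;
}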
@@ -1457,47 +1531,217 @@ try_next_zone:
1457 return page; 1531 return page;
1458} 1532}
1459 1533
1534static inline int
1535should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1536 unsigned long pages_reclaimed)
1537{
1538 /* Do not loop if specifically requested */
1539 if (gfp_mask & __GFP_NORETRY)
1540 return 0;
1541
1542 /*
1543 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1544 * means __GFP_NOFAIL, but that may not be true in other
1545 * implementations.
1546 */
1547 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1548 return 1;
1549
1550 /*
1551 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1552 * specified, then we retry until we no longer reclaim any pages
1553 * (above), or we've reclaimed an order of pages at least as
1554 * large as the allocation's order. In both cases, if the
1555 * allocation still fails, we stop retrying.
1556 */
1557 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1558 return 1;
1559
1560 /*
1561 * Don't let big-order allocations loop unless the caller
1562 * explicitly requests that.
1563 */
1564 if (gfp_mask & __GFP_NOFAIL)
1565 return 1;
1566
1567 return 0;
1568}
1569
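should_alloc_retry() above centralises the retry policy: never retry for __GFP_NORETRY, always for orders up to PAGE_ALLOC_COSTLY_ORDER, for __GFP_REPEAT only until at least 1 << order pages have been reclaimed in total, and always for __GFP_NOFAIL. A standalone illustration of the same decision table; the flag values and the costly-order cutoff below are made up for the demo:

#include <stdio.h>

#define GFP_NORETRY  0x1
#define GFP_REPEAT   0x2
#define GFP_NOFAIL   0x4
#define COSTLY_ORDER 3

static int should_retry(unsigned gfp, unsigned order, unsigned long reclaimed)
{
	if (gfp & GFP_NORETRY)
		return 0;
	if (order <= COSTLY_ORDER)
		return 1;
	if ((gfp & GFP_REPEAT) && reclaimed < (1UL << order))
		return 1;
	return (gfp & GFP_NOFAIL) != 0;
}

int main(void)
{
	/* order-4 (16 pages) __GFP_REPEAT-style request: keep retrying
	 * until at least 16 pages have been reclaimed in total */
	printf("%d\n", should_retry(GFP_REPEAT, 4, 10)); /* 1: reclaimed < 16 */
	printf("%d\n", should_retry(GFP_REPEAT, 4, 20)); /* 0: reclaimed >= 16 */
	printf("%d\n", should_retry(0, 2, 0));           /* 1: cheap order */
	return 0;
}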
1570static inline struct page *
1571__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, struct zone *preferred_zone,
1574 int migratetype)
1575{
1576 struct page *page;
1577
1578 /* Acquire the OOM killer lock for the zones in zonelist */
1579 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1580 schedule_timeout_uninterruptible(1);
1581 return NULL;
1582 }
1583
1584 /*
1585 * Go through the zonelist yet one more time, keep very high watermark
1586 * here, this is only to catch a parallel oom killing, we must fail if
1587 * we're still under heavy pressure.
1588 */
1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1590 order, zonelist, high_zoneidx,
1591 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1592 preferred_zone, migratetype);
1593 if (page)
1594 goto out;
1595
1596 /* The OOM killer will not help higher order allocs */
1597 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1598 goto out;
1599
1600 /* Exhausted what can be done so it's blamo time */
1601 out_of_memory(zonelist, gfp_mask, order);
1602
1603out:
1604 clear_zonelist_oom(zonelist, gfp_mask);
1605 return page;
1606}
1607
1608/* The really slow allocator path where we enter direct reclaim */
1609static inline struct page *
1610__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1611 struct zonelist *zonelist, enum zone_type high_zoneidx,
1612 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1613 int migratetype, unsigned long *did_some_progress)
1614{
1615 struct page *page = NULL;
1616 struct reclaim_state reclaim_state;
1617 struct task_struct *p = current;
1618
1619 cond_resched();
1620
1621 /* We now go into synchronous reclaim */
1622 cpuset_memory_pressure_bump();
1623
1624 /*
1625 * The task's cpuset might have expanded its set of allowable nodes
1626 */
1627 p->flags |= PF_MEMALLOC;
1628 lockdep_set_current_reclaim_state(gfp_mask);
1629 reclaim_state.reclaimed_slab = 0;
1630 p->reclaim_state = &reclaim_state;
1631
1632 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1633
1634 p->reclaim_state = NULL;
1635 lockdep_clear_current_reclaim_state();
1636 p->flags &= ~PF_MEMALLOC;
1637
1638 cond_resched();
1639
1640 if (order != 0)
1641 drain_all_pages();
1642
1643 if (likely(*did_some_progress))
1644 page = get_page_from_freelist(gfp_mask, nodemask, order,
1645 zonelist, high_zoneidx,
1646 alloc_flags, preferred_zone,
1647 migratetype);
1648 return page;
1649}
1650
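__alloc_pages_direct_reclaim() above brackets try_to_free_pages() with PF_MEMALLOC and a per-task reclaim_state, which is also what the later "Avoid recursion of direct reclaim" check keys off. A toy version of that set-call-clear bracketing in plain C; the task struct and flag value are stand-ins, not kernel definitions:

#include <stdio.h>

#define PF_MEMALLOC 0x800	/* illustrative value only */

struct task { unsigned flags; };

static unsigned long reclaim(struct task *p)
{
	/* a nested allocation made here would see PF_MEMALLOC set
	 * and refuse to recurse into reclaim */
	printf("reclaiming with flags 0x%x\n", p->flags);
	return 32;	/* pretend 32 pages were freed */
}

static unsigned long direct_reclaim(struct task *p)
{
	unsigned long progress;

	p->flags |= PF_MEMALLOC;	/* mark: we are the reclaimer */
	progress = reclaim(p);
	p->flags &= ~PF_MEMALLOC;	/* unmark on the way out */
	return progress;
}

int main(void)
{
	struct task t = { 0 };

	printf("reclaimed %lu pages\n", direct_reclaim(&t));
	return 0;
}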
1460/* 1651/*
1461 * This is the 'heart' of the zoned buddy allocator. 1652 * This is called in the allocator slow-path if the allocation request is of
1653 * sufficient urgency to ignore watermarks and take other desperate measures
1462 */ 1654 */
1463struct page * 1655static inline struct page *
1464__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1656__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1465 struct zonelist *zonelist, nodemask_t *nodemask) 1657 struct zonelist *zonelist, enum zone_type high_zoneidx,
1658 nodemask_t *nodemask, struct zone *preferred_zone,
1659 int migratetype)
1660{
1661 struct page *page;
1662
1663 do {
1664 page = get_page_from_freelist(gfp_mask, nodemask, order,
1665 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1666 preferred_zone, migratetype);
1667
1668 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671
1672 return page;
1673}
1674
1675static inline
1676void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1677 enum zone_type high_zoneidx)
1466{ 1678{
1467 const gfp_t wait = gfp_mask & __GFP_WAIT;
1468 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1469 struct zoneref *z; 1679 struct zoneref *z;
1470 struct zone *zone; 1680 struct zone *zone;
1471 struct page *page;
1472 struct reclaim_state reclaim_state;
1473 struct task_struct *p = current;
1474 int do_retry;
1475 int alloc_flags;
1476 unsigned long did_some_progress;
1477 unsigned long pages_reclaimed = 0;
1478 1681
1479 lockdep_trace_alloc(gfp_mask); 1682 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1683 wakeup_kswapd(zone, order);
1684}
1480 1685
1481 might_sleep_if(wait); 1686static inline int
1687gfp_to_alloc_flags(gfp_t gfp_mask)
1688{
1689 struct task_struct *p = current;
1690 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1691 const gfp_t wait = gfp_mask & __GFP_WAIT;
1482 1692
1483 if (should_fail_alloc_page(gfp_mask, order)) 1693 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1484 return NULL; 1694 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1485 1695
1486restart: 1696 /*
1487 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1697 * The caller may dip into page reserves a bit more if the caller
1698 * cannot run direct reclaim, or if the caller has realtime scheduling
1699 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1700 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1701 */
1702 alloc_flags |= (gfp_mask & __GFP_HIGH);
1488 1703
1489 if (unlikely(!z->zone)) { 1704 if (!wait) {
1705 alloc_flags |= ALLOC_HARDER;
1490 /* 1706 /*
1491 * Happens if we have an empty zonelist as a result of 1707 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1492 * GFP_THISNODE being used on a memoryless node 1708 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1493 */ 1709 */
1494 return NULL; 1710 alloc_flags &= ~ALLOC_CPUSET;
1711 } else if (unlikely(rt_task(p)))
1712 alloc_flags |= ALLOC_HARDER;
1713
1714 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1715 if (!in_interrupt() &&
1716 ((p->flags & PF_MEMALLOC) ||
1717 unlikely(test_thread_flag(TIF_MEMDIE))))
1718 alloc_flags |= ALLOC_NO_WATERMARKS;
1495 } 1719 }
1496 1720
1497 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1721 return alloc_flags;
1498 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1722}
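gfp_to_alloc_flags() above folds the old open-coded flag juggling into one helper: start from ALLOC_WMARK_MIN | ALLOC_CPUSET, OR in __GFP_HIGH as ALLOC_HIGH, add ALLOC_HARDER and drop ALLOC_CPUSET for atomic (!__GFP_WAIT) requests, add ALLOC_HARDER for realtime tasks, and grant ALLOC_NO_WATERMARKS to reclaimers and OOM victims. A compressed userspace sketch of that mapping; flag values are illustrative and the __GFP_NOMEMALLOC exception is left out for brevity:

#include <stdio.h>

#define GFP_WAIT 0x10
#define GFP_HIGH 0x20			/* doubles as ALLOC_HIGH, as in the patch */

#define ALLOC_WMARK_MIN     0x00
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_HARDER        0x10
#define ALLOC_HIGH          0x20
#define ALLOC_CPUSET        0x40

static int to_alloc_flags(unsigned gfp, int rt_task, int is_reclaimer)
{
	int flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	flags |= (gfp & GFP_HIGH);		/* __GFP_HIGH maps to ALLOC_HIGH */

	if (!(gfp & GFP_WAIT))			/* atomic: try harder, ignore cpuset */
		flags = (flags | ALLOC_HARDER) & ~ALLOC_CPUSET;
	else if (rt_task)
		flags |= ALLOC_HARDER;

	if (is_reclaimer)			/* PF_MEMALLOC / TIF_MEMDIE case */
		flags |= ALLOC_NO_WATERMARKS;

	return flags;
}

int main(void)
{
	/* GFP_ATOMIC-like request: high + harder, no cpuset enforcement */
	printf("atomic: 0x%x\n", to_alloc_flags(GFP_HIGH, 0, 0));
	/* ordinary sleeping allocation from a realtime task */
	printf("rt:     0x%x\n", to_alloc_flags(GFP_WAIT, 1, 0));
	return 0;
}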
1499 if (page) 1723
1500 goto got_pg; 1724static inline struct page *
1725__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1726 struct zonelist *zonelist, enum zone_type high_zoneidx,
1727 nodemask_t *nodemask, struct zone *preferred_zone,
1728 int migratetype)
1729{
1730 const gfp_t wait = gfp_mask & __GFP_WAIT;
1731 struct page *page = NULL;
1732 int alloc_flags;
1733 unsigned long pages_reclaimed = 0;
1734 unsigned long did_some_progress;
1735 struct task_struct *p = current;
1736
1737 /*
1738 * In the slowpath, we sanity check order to avoid ever trying to
1739 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1740 * be using allocators in order of preference for an area that is
1741 * too large.
1742 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER))
1744 return NULL;
1501 1745
1502 /* 1746 /*
1503 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,154 +1754,83 @@ restart:
1510 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1754 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1511 goto nopage; 1755 goto nopage;
1512 1756
1513 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1757 wake_all_kswapd(order, zonelist, high_zoneidx);
1514 wakeup_kswapd(zone, order);
1515 1758
1516 /* 1759 /*
1517 * OK, we're below the kswapd watermark and have kicked background 1760 * OK, we're below the kswapd watermark and have kicked background
1518 * reclaim. Now things get more complex, so set up alloc_flags according 1761 * reclaim. Now things get more complex, so set up alloc_flags according
1519 * to how we want to proceed. 1762 * to how we want to proceed.
1520 *
1521 * The caller may dip into page reserves a bit more if the caller
1522 * cannot run direct reclaim, or if the caller has realtime scheduling
1523 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1524 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1525 */ 1763 */
1526 alloc_flags = ALLOC_WMARK_MIN; 1764 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1527 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1528 alloc_flags |= ALLOC_HARDER;
1529 if (gfp_mask & __GFP_HIGH)
1530 alloc_flags |= ALLOC_HIGH;
1531 if (wait)
1532 alloc_flags |= ALLOC_CPUSET;
1533 1765
1534 /* 1766restart:
1535 * Go through the zonelist again. Let __GFP_HIGH and allocations 1767 /* This is the last chance, in general, before the goto nopage. */
1536 * coming from realtime tasks go deeper into reserves.
1537 *
1538 * This is the last chance, in general, before the goto nopage.
1539 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1540 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1541 */
1542 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1768 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1543 high_zoneidx, alloc_flags); 1769 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1770 preferred_zone, migratetype);
1544 if (page) 1771 if (page)
1545 goto got_pg; 1772 goto got_pg;
1546 1773
1547 /* This allocation should allow future memory freeing. */
1548
1549rebalance: 1774rebalance:
1550 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1775 /* Allocate without watermarks if the context allows */
1551 && !in_interrupt()) { 1776 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1552 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1777 page = __alloc_pages_high_priority(gfp_mask, order,
1553nofail_alloc: 1778 zonelist, high_zoneidx, nodemask,
1554 /* go through the zonelist yet again, ignoring mins */ 1779 preferred_zone, migratetype);
1555 page = get_page_from_freelist(gfp_mask, nodemask, order, 1780 if (page)
1556 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1781 goto got_pg;
1557 if (page)
1558 goto got_pg;
1559 if (gfp_mask & __GFP_NOFAIL) {
1560 congestion_wait(WRITE, HZ/50);
1561 goto nofail_alloc;
1562 }
1563 }
1564 goto nopage;
1565 } 1782 }
1566 1783
1567 /* Atomic allocations - we can't balance anything */ 1784 /* Atomic allocations - we can't balance anything */
1568 if (!wait) 1785 if (!wait)
1569 goto nopage; 1786 goto nopage;
1570 1787
1571 cond_resched(); 1788 /* Avoid recursion of direct reclaim */
1789 if (p->flags & PF_MEMALLOC)
1790 goto nopage;
1791
1792 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx,
1795 nodemask,
1796 alloc_flags, preferred_zone,
1797 migratetype, &did_some_progress);
1798 if (page)
1799 goto got_pg;
1572 1800
1573 /* We now go into synchronous reclaim */
1574 cpuset_memory_pressure_bump();
1575 /* 1801 /*
1576 * The task's cpuset might have expanded its set of allowable nodes 1802 * If we failed to make any progress reclaiming, then we are
1803 * running out of options and have to consider going OOM
1577 */ 1804 */
1578 cpuset_update_task_memory_state(); 1805 if (!did_some_progress) {
1579 p->flags |= PF_MEMALLOC; 1806 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1580 1807 if (oom_killer_disabled)
1581 lockdep_set_current_reclaim_state(gfp_mask); 1808 goto nopage;
1582 reclaim_state.reclaimed_slab = 0; 1809 page = __alloc_pages_may_oom(gfp_mask, order,
1583 p->reclaim_state = &reclaim_state; 1810 zonelist, high_zoneidx,
1584 1811 nodemask, preferred_zone,
1585 did_some_progress = try_to_free_pages(zonelist, order, 1812 migratetype);
1586 gfp_mask, nodemask); 1813 if (page)
1587 1814 goto got_pg;
1588 p->reclaim_state = NULL;
1589 lockdep_clear_current_reclaim_state();
1590 p->flags &= ~PF_MEMALLOC;
1591
1592 cond_resched();
1593 1815
1594 if (order != 0) 1816 /*
1595 drain_all_pages(); 1817 * The OOM killer does not trigger for high-order
1818 * ~__GFP_NOFAIL allocations so if no progress is being
1819 * made, there are no other options and retrying is
1820 * unlikely to help.
1821 */
1822 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1823 !(gfp_mask & __GFP_NOFAIL))
1824 goto nopage;
1596 1825
1597 if (likely(did_some_progress)) {
1598 page = get_page_from_freelist(gfp_mask, nodemask, order,
1599 zonelist, high_zoneidx, alloc_flags);
1600 if (page)
1601 goto got_pg;
1602 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1603 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1604 schedule_timeout_uninterruptible(1);
1605 goto restart; 1826 goto restart;
1606 } 1827 }
1607
1608 /*
1609 * Go through the zonelist yet one more time, keep
1610 * very high watermark here, this is only to catch
1611 * a parallel oom killing, we must fail if we're still
1612 * under heavy pressure.
1613 */
1614 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1615 order, zonelist, high_zoneidx,
1616 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1617 if (page) {
1618 clear_zonelist_oom(zonelist, gfp_mask);
1619 goto got_pg;
1620 }
1621
1622 /* The OOM killer will not help higher order allocs so fail */
1623 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1624 clear_zonelist_oom(zonelist, gfp_mask);
1625 goto nopage;
1626 }
1627
1628 out_of_memory(zonelist, gfp_mask, order);
1629 clear_zonelist_oom(zonelist, gfp_mask);
1630 goto restart;
1631 } 1828 }
1632 1829
1633 /* 1830 /* Check if we should retry the allocation */
1634 * Don't let big-order allocations loop unless the caller explicitly
1635 * requests that. Wait for some write requests to complete then retry.
1636 *
1637 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1638 * means __GFP_NOFAIL, but that may not be true in other
1639 * implementations.
1640 *
1641 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1642 * specified, then we retry until we no longer reclaim any pages
1643 * (above), or we've reclaimed an order of pages at least as
1644 * large as the allocation's order. In both cases, if the
1645 * allocation still fails, we stop retrying.
1646 */
1647 pages_reclaimed += did_some_progress; 1831 pages_reclaimed += did_some_progress;
1648 do_retry = 0; 1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1649 if (!(gfp_mask & __GFP_NORETRY)) { 1833 /* Wait for some write requests to complete then retry */
1650 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1651 do_retry = 1;
1652 } else {
1653 if (gfp_mask & __GFP_REPEAT &&
1654 pages_reclaimed < (1 << order))
1655 do_retry = 1;
1656 }
1657 if (gfp_mask & __GFP_NOFAIL)
1658 do_retry = 1;
1659 }
1660 if (do_retry) {
1661 congestion_wait(WRITE, HZ/50); 1834 congestion_wait(WRITE, HZ/50);
1662 goto rebalance; 1835 goto rebalance;
1663 } 1836 }
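The tail of the slow path above accumulates pages_reclaimed, asks should_alloc_retry(), and on a positive answer waits for writeback congestion before jumping back to rebalance. A standalone caricature of that loop; the reclaim and allocation stubs and all numbers are invented, and a plain sleep stands in for congestion_wait():

#include <stdio.h>
#include <unistd.h>

/* pretend each reclaim pass frees 8 pages and the allocation
 * succeeds once 16 pages have been recovered in total */
static unsigned long try_reclaim(void)    { return 8; }
static int try_alloc(unsigned long freed) { return freed >= 16; }

int main(void)
{
	unsigned long pages_reclaimed = 0;
	unsigned int order = 4;			/* want 16 contiguous pages */

	for (;;) {
		unsigned long progress = try_reclaim();

		pages_reclaimed += progress;
		if (try_alloc(pages_reclaimed)) {
			printf("allocated after reclaiming %lu pages\n",
			       pages_reclaimed);
			return 0;
		}
		/* mirror of should_alloc_retry(): give up once at least
		 * 1 << order pages were reclaimed and we still fail */
		if (!progress || pages_reclaimed >= (1UL << order))
			break;
		usleep(20000);		/* stand-in for congestion_wait() */
	}
	printf("giving up after reclaiming %lu pages\n", pages_reclaimed);
	return 0;
}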
@@ -1670,10 +1843,58 @@ nopage:
1670 dump_stack(); 1843 dump_stack();
1671 show_mem(); 1844 show_mem();
1672 } 1845 }
1846 return page;
1673got_pg: 1847got_pg:
1848 if (kmemcheck_enabled)
1849 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1674 return page; 1850 return page;
1851
1675} 1852}
1676EXPORT_SYMBOL(__alloc_pages_internal); 1853
1854/*
1855 * This is the 'heart' of the zoned buddy allocator.
1856 */
1857struct page *
1858__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1859 struct zonelist *zonelist, nodemask_t *nodemask)
1860{
1861 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1862 struct zone *preferred_zone;
1863 struct page *page;
1864 int migratetype = allocflags_to_migratetype(gfp_mask);
1865
1866 lockdep_trace_alloc(gfp_mask);
1867
1868 might_sleep_if(gfp_mask & __GFP_WAIT);
1869
1870 if (should_fail_alloc_page(gfp_mask, order))
1871 return NULL;
1872
1873 /*
1874 * Check the zones suitable for the gfp_mask contain at least one
1875 * valid zone. It's possible to have an empty zonelist as a result
1876 * of GFP_THISNODE and a memoryless node
1877 */
1878 if (unlikely(!zonelist->_zonerefs->zone))
1879 return NULL;
1880
1881 /* The preferred zone is used for statistics later */
1882 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1883 if (!preferred_zone)
1884 return NULL;
1885
1886 /* First allocation attempt */
1887 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1888 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1889 preferred_zone, migratetype);
1890 if (unlikely(!page))
1891 page = __alloc_pages_slowpath(gfp_mask, order,
1892 zonelist, high_zoneidx, nodemask,
1893 preferred_zone, migratetype);
1894
1895 return page;
1896}
1897EXPORT_SYMBOL(__alloc_pages_nodemask);
1677 1898
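__alloc_pages_nodemask() above is now a thin wrapper: one optimistic get_page_from_freelist() pass at the low watermark, then the heavyweight __alloc_pages_slowpath() only if that fails. A generic two-tier sketch of that shape; the function names below are placeholders, not kernel symbols:

#include <stdio.h>
#include <stdlib.h>

/* cheap attempt: succeeds only while the request is easy to satisfy */
static void *fast_path(size_t size)
{
	return (size <= 4096) ? malloc(size) : NULL;
}

/* expensive fallback: in the kernel this reclaims, retries and may
 * invoke the OOM killer; here it simply tries again */
static void *slow_path(size_t size)
{
	fprintf(stderr, "fast path failed, entering slow path\n");
	return malloc(size);
}

static void *alloc_pages_like(size_t size)
{
	void *p = fast_path(size);

	if (!p)
		p = slow_path(size);
	return p;
}

int main(void)
{
	void *a = alloc_pages_like(1024);	/* served by the fast path */
	void *b = alloc_pages_like(1 << 20);	/* falls through to the slow path */

	free(a);
	free(b);
	return 0;
}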
1678/* 1899/*
1679 * Common helper functions. 1900 * Common helper functions.
@@ -1802,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset)
1802 2023
1803 for_each_zone_zonelist(zone, z, zonelist, offset) { 2024 for_each_zone_zonelist(zone, z, zonelist, offset) {
1804 unsigned long size = zone->present_pages; 2025 unsigned long size = zone->present_pages;
1805 unsigned long high = zone->pages_high; 2026 unsigned long high = high_wmark_pages(zone);
1806 if (size > high) 2027 if (size > high)
1807 sum += size - high; 2028 sum += size - high;
1808 } 2029 }
@@ -1894,19 +2115,14 @@ void show_free_areas(void)
1894 2115
1895 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2116 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1896 " inactive_file:%lu" 2117 " inactive_file:%lu"
1897//TODO: check/adjust line lengths
1898#ifdef CONFIG_UNEVICTABLE_LRU
1899 " unevictable:%lu" 2118 " unevictable:%lu"
1900#endif
1901 " dirty:%lu writeback:%lu unstable:%lu\n" 2119 " dirty:%lu writeback:%lu unstable:%lu\n"
1902 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2120 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1903 global_page_state(NR_ACTIVE_ANON), 2121 global_page_state(NR_ACTIVE_ANON),
1904 global_page_state(NR_ACTIVE_FILE), 2122 global_page_state(NR_ACTIVE_FILE),
1905 global_page_state(NR_INACTIVE_ANON), 2123 global_page_state(NR_INACTIVE_ANON),
1906 global_page_state(NR_INACTIVE_FILE), 2124 global_page_state(NR_INACTIVE_FILE),
1907#ifdef CONFIG_UNEVICTABLE_LRU
1908 global_page_state(NR_UNEVICTABLE), 2125 global_page_state(NR_UNEVICTABLE),
1909#endif
1910 global_page_state(NR_FILE_DIRTY), 2126 global_page_state(NR_FILE_DIRTY),
1911 global_page_state(NR_WRITEBACK), 2127 global_page_state(NR_WRITEBACK),
1912 global_page_state(NR_UNSTABLE_NFS), 2128 global_page_state(NR_UNSTABLE_NFS),
@@ -1930,25 +2146,21 @@ void show_free_areas(void)
1930 " inactive_anon:%lukB" 2146 " inactive_anon:%lukB"
1931 " active_file:%lukB" 2147 " active_file:%lukB"
1932 " inactive_file:%lukB" 2148 " inactive_file:%lukB"
1933#ifdef CONFIG_UNEVICTABLE_LRU
1934 " unevictable:%lukB" 2149 " unevictable:%lukB"
1935#endif
1936 " present:%lukB" 2150 " present:%lukB"
1937 " pages_scanned:%lu" 2151 " pages_scanned:%lu"
1938 " all_unreclaimable? %s" 2152 " all_unreclaimable? %s"
1939 "\n", 2153 "\n",
1940 zone->name, 2154 zone->name,
1941 K(zone_page_state(zone, NR_FREE_PAGES)), 2155 K(zone_page_state(zone, NR_FREE_PAGES)),
1942 K(zone->pages_min), 2156 K(min_wmark_pages(zone)),
1943 K(zone->pages_low), 2157 K(low_wmark_pages(zone)),
1944 K(zone->pages_high), 2158 K(high_wmark_pages(zone)),
1945 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2159 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1946 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2160 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1947 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2161 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1948 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2162 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1949#ifdef CONFIG_UNEVICTABLE_LRU
1950 K(zone_page_state(zone, NR_UNEVICTABLE)), 2163 K(zone_page_state(zone, NR_UNEVICTABLE)),
1951#endif
1952 K(zone->present_pages), 2164 K(zone->present_pages),
1953 zone->pages_scanned, 2165 zone->pages_scanned,
1954 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2166 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2106,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2106} 2318}
2107 2319
2108 2320
2109#define MAX_NODE_LOAD (num_online_nodes()) 2321#define MAX_NODE_LOAD (nr_online_nodes)
2110static int node_load[MAX_NUMNODES]; 2322static int node_load[MAX_NUMNODES];
2111 2323
2112/** 2324/**
@@ -2315,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat)
2315 2527
2316 /* NUMA-aware ordering of nodes */ 2528 /* NUMA-aware ordering of nodes */
2317 local_node = pgdat->node_id; 2529 local_node = pgdat->node_id;
2318 load = num_online_nodes(); 2530 load = nr_online_nodes;
2319 prev_node = local_node; 2531 prev_node = local_node;
2320 nodes_clear(used_mask); 2532 nodes_clear(used_mask);
2321 2533
@@ -2466,7 +2678,7 @@ void build_all_zonelists(void)
2466 2678
2467 printk("Built %i zonelists in %s order, mobility grouping %s. " 2679 printk("Built %i zonelists in %s order, mobility grouping %s. "
2468 "Total pages: %ld\n", 2680 "Total pages: %ld\n",
2469 num_online_nodes(), 2681 nr_online_nodes,
2470 zonelist_order_name[current_zonelist_order], 2682 zonelist_order_name[current_zonelist_order],
2471 page_group_by_mobility_disabled ? "off" : "on", 2683 page_group_by_mobility_disabled ? "off" : "on",
2472 vm_total_pages); 2684 vm_total_pages);
@@ -2545,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2545 2757
2546/* 2758/*
2547 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2759 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2548 * of blocks reserved is based on zone->pages_min. The memory within the 2760 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2549 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2761 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2550 * higher will lead to a bigger reserve which will get freed as contiguous 2762 * higher will lead to a bigger reserve which will get freed as contiguous
2551 * blocks as reclaim kicks in 2763 * blocks as reclaim kicks in
2552 */ 2764 */
@@ -2559,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2559 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2771 /* Get the start pfn, end pfn and the number of blocks to reserve */
2560 start_pfn = zone->zone_start_pfn; 2772 start_pfn = zone->zone_start_pfn;
2561 end_pfn = start_pfn + zone->spanned_pages; 2773 end_pfn = start_pfn + zone->spanned_pages;
2562 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2774 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2563 pageblock_order; 2775 pageblock_order;
2564 2776
2565 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2777 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
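The reserve sizing above rounds the zone's minimum watermark up to whole pageblocks and shifts it down to a block count. Assuming 4KB pages and 512-page pageblocks (pageblock_order 9), a 1000-page minimum watermark rounds up to 1024 pages, i.e. 2 MIGRATE_RESERVE blocks. The same arithmetic in isolation:

#include <stdio.h>

#define PAGEBLOCK_ORDER    9			/* e.g. 2MB blocks of 4KB pages */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_to(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long min_wmark = 1000;		/* pages, invented value */
	unsigned long reserve;

	reserve = roundup_to(min_wmark, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	printf("%lu MIGRATE_RESERVE pageblocks\n", reserve);	/* prints 2 */
	return 0;
}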
@@ -2681,6 +2893,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
2681 2893
2682static int zone_batchsize(struct zone *zone) 2894static int zone_batchsize(struct zone *zone)
2683{ 2895{
2896#ifdef CONFIG_MMU
2684 int batch; 2897 int batch;
2685 2898
2686 /* 2899 /*
@@ -2706,9 +2919,26 @@ static int zone_batchsize(struct zone *zone)
2706 * of pages of one half of the possible page colors 2919 * of pages of one half of the possible page colors
2707 * and the other with pages of the other colors. 2920 * and the other with pages of the other colors.
2708 */ 2921 */
2709 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2922 batch = rounddown_pow_of_two(batch + batch/2) - 1;
2710 2923
2711 return batch; 2924 return batch;
2925
2926#else
2927 /* The deferral and batching of frees should be suppressed under NOMMU
2928 * conditions.
2929 *
2930 * The problem is that NOMMU needs to be able to allocate large chunks
2931 * of contiguous memory as there's no hardware page translation to
2932 * assemble apparent contiguous memory from discontiguous pages.
2933 *
2934 * Queueing large contiguous runs of pages for batching, however,
2935 * causes the pages to actually be freed in smaller chunks. As there
2936 * can be a significant delay between the individual batches being
2937 * recycled, this leads to the once large chunks of space being
2938 * fragmented and becoming unavailable for high-order allocations.
2939 */
2940 return 0;
2941#endif
2712} 2942}
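The MMU branch of zone_batchsize() above swaps the open-coded fls() expression for rounddown_pow_of_two(batch + batch/2) - 1, keeping the per-cpu batch just under a power of two so the lists of different zones do not drain in lock-step. For instance, a heuristic batch of 32 becomes 48, rounds down to 32, and ends up as 31. A quick check of that step alone (the helper below is a simple loop, not the kernel's implementation):

#include <stdio.h>

/* largest power of two <= x, for x > 0 */
static unsigned long rounddown_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p <= x / 2)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long batch = 32;	/* say the zone-size heuristic picked 32 */

	batch = rounddown_pow_of_two(batch + batch / 2) - 1;
	printf("batch = %lu\n", batch);	/* 32 + 16 = 48 -> 32 -> 31 */
	return 0;
}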
2713 2943
2714static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2944static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
@@ -3085,64 +3315,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
3085} 3315}
3086 3316
3087/** 3317/**
3088 * push_node_boundaries - Push node boundaries to at least the requested boundary
3089 * @nid: The nid of the node to push the boundary for
3090 * @start_pfn: The start pfn of the node
3091 * @end_pfn: The end pfn of the node
3092 *
3093 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
3094 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
3095 * be hotplugged even though no physical memory exists. This function allows
3096 * an arch to push out the node boundaries so mem_map is allocated that can
3097 * be used later.
3098 */
3099#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3100void __init push_node_boundaries(unsigned int nid,
3101 unsigned long start_pfn, unsigned long end_pfn)
3102{
3103 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3104 "Entering push_node_boundaries(%u, %lu, %lu)\n",
3105 nid, start_pfn, end_pfn);
3106
3107 /* Initialise the boundary for this node if necessary */
3108 if (node_boundary_end_pfn[nid] == 0)
3109 node_boundary_start_pfn[nid] = -1UL;
3110
3111 /* Update the boundaries */
3112 if (node_boundary_start_pfn[nid] > start_pfn)
3113 node_boundary_start_pfn[nid] = start_pfn;
3114 if (node_boundary_end_pfn[nid] < end_pfn)
3115 node_boundary_end_pfn[nid] = end_pfn;
3116}
3117
3118/* If necessary, push the node boundary out for reserve hotadd */
3119static void __meminit account_node_boundary(unsigned int nid,
3120 unsigned long *start_pfn, unsigned long *end_pfn)
3121{
3122 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3123 "Entering account_node_boundary(%u, %lu, %lu)\n",
3124 nid, *start_pfn, *end_pfn);
3125
3126 /* Return if boundary information has not been provided */
3127 if (node_boundary_end_pfn[nid] == 0)
3128 return;
3129
3130 /* Check the boundaries and update if necessary */
3131 if (node_boundary_start_pfn[nid] < *start_pfn)
3132 *start_pfn = node_boundary_start_pfn[nid];
3133 if (node_boundary_end_pfn[nid] > *end_pfn)
3134 *end_pfn = node_boundary_end_pfn[nid];
3135}
3136#else
3137void __init push_node_boundaries(unsigned int nid,
3138 unsigned long start_pfn, unsigned long end_pfn) {}
3139
3140static void __meminit account_node_boundary(unsigned int nid,
3141 unsigned long *start_pfn, unsigned long *end_pfn) {}
3142#endif
3143
3144
3145/**
3146 * get_pfn_range_for_nid - Return the start and end page frames for a node 3318 * get_pfn_range_for_nid - Return the start and end page frames for a node
3147 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3319 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3148 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3320 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3167,9 +3339,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3167 3339
3168 if (*start_pfn == -1UL) 3340 if (*start_pfn == -1UL)
3169 *start_pfn = 0; 3341 *start_pfn = 0;
3170
3171 /* Push the node boundaries out if requested */
3172 account_node_boundary(nid, start_pfn, end_pfn);
3173} 3342}
3174 3343
3175/* 3344/*
@@ -3534,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3534 zone_pcp_init(zone); 3703 zone_pcp_init(zone);
3535 for_each_lru(l) { 3704 for_each_lru(l) {
3536 INIT_LIST_HEAD(&zone->lru[l].list); 3705 INIT_LIST_HEAD(&zone->lru[l].list);
3537 zone->lru[l].nr_scan = 0; 3706 zone->lru[l].nr_saved_scan = 0;
3538 } 3707 }
3539 zone->reclaim_stat.recent_rotated[0] = 0; 3708 zone->reclaim_stat.recent_rotated[0] = 0;
3540 zone->reclaim_stat.recent_rotated[1] = 0; 3709 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -3775,10 +3944,6 @@ void __init remove_all_active_ranges(void)
3775{ 3944{
3776 memset(early_node_map, 0, sizeof(early_node_map)); 3945 memset(early_node_map, 0, sizeof(early_node_map));
3777 nr_nodemap_entries = 0; 3946 nr_nodemap_entries = 0;
3778#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3779 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3780 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3781#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3782} 3947}
3783 3948
3784/* Compare two active node_active_regions */ 3949/* Compare two active node_active_regions */
@@ -4075,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4075 early_node_map[i].start_pfn, 4240 early_node_map[i].start_pfn,
4076 early_node_map[i].end_pfn); 4241 early_node_map[i].end_pfn);
4077 4242
4243 /*
 4244 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages will
 4245 * initialise node_states[N_HIGH_MEMORY], so clear it first
4246 */
4247 nodes_clear(node_states[N_HIGH_MEMORY]);
4078 /* Initialise every node */ 4248 /* Initialise every node */
4079 mminit_verify_pageflags_layout(); 4249 mminit_verify_pageflags_layout();
4080 setup_nr_node_ids(); 4250 setup_nr_node_ids();
@@ -4209,8 +4379,8 @@ static void calculate_totalreserve_pages(void)
4209 max = zone->lowmem_reserve[j]; 4379 max = zone->lowmem_reserve[j];
4210 } 4380 }
4211 4381
4212 /* we treat pages_high as reserved pages. */ 4382 /* we treat the high watermark as reserved pages. */
4213 max += zone->pages_high; 4383 max += high_wmark_pages(zone);
4214 4384
4215 if (max > zone->present_pages) 4385 if (max > zone->present_pages)
4216 max = zone->present_pages; 4386 max = zone->present_pages;
@@ -4260,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void)
4260} 4430}
4261 4431
4262/** 4432/**
4263 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4433 * setup_per_zone_wmarks - called when min_free_kbytes changes
4434 * or when memory is hot-{added|removed}
4264 * 4435 *
4265 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4436 * Ensures that the watermark[min,low,high] values for each zone are set
4266 * with respect to min_free_kbytes. 4437 * correctly with respect to min_free_kbytes.
4267 */ 4438 */
4268void setup_per_zone_pages_min(void) 4439void setup_per_zone_wmarks(void)
4269{ 4440{
4270 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4441 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4271 unsigned long lowmem_pages = 0; 4442 unsigned long lowmem_pages = 0;
@@ -4290,7 +4461,7 @@ void setup_per_zone_pages_min(void)
4290 * need highmem pages, so cap pages_min to a small 4461 * need highmem pages, so cap pages_min to a small
4291 * value here. 4462 * value here.
4292 * 4463 *
4293 * The (pages_high-pages_low) and (pages_low-pages_min) 4464 * The (WMARK_HIGH-WMARK_LOW) and (WMARK_LOW-WMARK_MIN)
4294 * deltas control asynch page reclaim, and so should 4465 * deltas control asynch page reclaim, and so should
4295 * not be capped for highmem. 4466 * not be capped for highmem.
4296 */ 4467 */
@@ -4301,17 +4472,17 @@ void setup_per_zone_pages_min(void)
4301 min_pages = SWAP_CLUSTER_MAX; 4472 min_pages = SWAP_CLUSTER_MAX;
4302 if (min_pages > 128) 4473 if (min_pages > 128)
4303 min_pages = 128; 4474 min_pages = 128;
4304 zone->pages_min = min_pages; 4475 zone->watermark[WMARK_MIN] = min_pages;
4305 } else { 4476 } else {
4306 /* 4477 /*
4307 * If it's a lowmem zone, reserve a number of pages 4478 * If it's a lowmem zone, reserve a number of pages
4308 * proportionate to the zone's size. 4479 * proportionate to the zone's size.
4309 */ 4480 */
4310 zone->pages_min = tmp; 4481 zone->watermark[WMARK_MIN] = tmp;
4311 } 4482 }
4312 4483
4313 zone->pages_low = zone->pages_min + (tmp >> 2); 4484 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4314 zone->pages_high = zone->pages_min + (tmp >> 1); 4485 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4315 setup_zone_migrate_reserve(zone); 4486 setup_zone_migrate_reserve(zone);
4316 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4317 } 4488 }
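setup_per_zone_wmarks() above gives each zone a share of min_free_kbytes proportional to its size (tmp) and then spaces the other marks off it: WMARK_LOW = WMARK_MIN + tmp/4 and WMARK_HIGH = WMARK_MIN + tmp/2. For a lowmem zone whose share works out to 1024 pages that yields min 1024, low 1280 and high 1536. The arithmetic on its own, with the 1024-page share as an assumed input:

#include <stdio.h>

enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

int main(void)
{
	unsigned long watermark[NR_WMARK];
	unsigned long tmp = 1024;	/* this zone's share of pages_min */

	watermark[WMARK_MIN]  = tmp;			/* lowmem zone case */
	watermark[WMARK_LOW]  = watermark[WMARK_MIN] + (tmp >> 2);
	watermark[WMARK_HIGH] = watermark[WMARK_MIN] + (tmp >> 1);

	printf("min=%lu low=%lu high=%lu\n",
	       watermark[WMARK_MIN], watermark[WMARK_LOW], watermark[WMARK_HIGH]);
	return 0;
}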
@@ -4321,8 +4492,6 @@ void setup_per_zone_pages_min(void)
4321} 4492}
4322 4493
4323/** 4494/**
4324 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4325 *
4326 * The inactive anon list should be small enough that the VM never has to 4495 * The inactive anon list should be small enough that the VM never has to
4327 * do too much work, but large enough that each inactive page has a chance 4496 * do too much work, but large enough that each inactive page has a chance
4328 * to be referenced again before it is swapped out. 4497 * to be referenced again before it is swapped out.
@@ -4343,21 +4512,26 @@ void setup_per_zone_pages_min(void)
4343 * 1TB 101 10GB 4512 * 1TB 101 10GB
4344 * 10TB 320 32GB 4513 * 10TB 320 32GB
4345 */ 4514 */
4346static void setup_per_zone_inactive_ratio(void) 4515void calculate_zone_inactive_ratio(struct zone *zone)
4347{ 4516{
4348 struct zone *zone; 4517 unsigned int gb, ratio;
4349
4350 for_each_zone(zone) {
4351 unsigned int gb, ratio;
4352 4518
4353 /* Zone size in gigabytes */ 4519 /* Zone size in gigabytes */
4354 gb = zone->present_pages >> (30 - PAGE_SHIFT); 4520 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4521 if (gb)
4355 ratio = int_sqrt(10 * gb); 4522 ratio = int_sqrt(10 * gb);
4356 if (!ratio) 4523 else
4357 ratio = 1; 4524 ratio = 1;
4358 4525
4359 zone->inactive_ratio = ratio; 4526 zone->inactive_ratio = ratio;
4360 } 4527}
4528
4529static void __init setup_per_zone_inactive_ratio(void)
4530{
4531 struct zone *zone;
4532
4533 for_each_zone(zone)
4534 calculate_zone_inactive_ratio(zone);
4361} 4535}
4362 4536
4363/* 4537/*
@@ -4384,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void)
4384 * 8192MB: 11584k 4558 * 8192MB: 11584k
4385 * 16384MB: 16384k 4559 * 16384MB: 16384k
4386 */ 4560 */
4387static int __init init_per_zone_pages_min(void) 4561static int __init init_per_zone_wmark_min(void)
4388{ 4562{
4389 unsigned long lowmem_kbytes; 4563 unsigned long lowmem_kbytes;
4390 4564
@@ -4395,12 +4569,12 @@ static int __init init_per_zone_pages_min(void)
4395 min_free_kbytes = 128; 4569 min_free_kbytes = 128;
4396 if (min_free_kbytes > 65536) 4570 if (min_free_kbytes > 65536)
4397 min_free_kbytes = 65536; 4571 min_free_kbytes = 65536;
4398 setup_per_zone_pages_min(); 4572 setup_per_zone_wmarks();
4399 setup_per_zone_lowmem_reserve(); 4573 setup_per_zone_lowmem_reserve();
4400 setup_per_zone_inactive_ratio(); 4574 setup_per_zone_inactive_ratio();
4401 return 0; 4575 return 0;
4402} 4576}
4403module_init(init_per_zone_pages_min) 4577module_init(init_per_zone_wmark_min)
4404 4578
4405/* 4579/*
4406 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4580 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4412,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4412{ 4586{
4413 proc_dointvec(table, write, file, buffer, length, ppos); 4587 proc_dointvec(table, write, file, buffer, length, ppos);
4414 if (write) 4588 if (write)
4415 setup_per_zone_pages_min(); 4589 setup_per_zone_wmarks();
4416 return 0; 4590 return 0;
4417} 4591}
4418 4592
@@ -4456,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4456 * whenever sysctl_lowmem_reserve_ratio changes. 4630 * whenever sysctl_lowmem_reserve_ratio changes.
4457 * 4631 *
4458 * The reserve ratio obviously has absolutely no relation with the 4632 * The reserve ratio obviously has absolutely no relation with the
4459 * pages_min watermarks. The lowmem reserve ratio can only make sense 4633 * minimum watermarks. The lowmem reserve ratio can only make sense
4460 * if in function of the boot time zone sizes. 4634 * if in function of the boot time zone sizes.
4461 */ 4635 */
4462int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4636int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4563,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename,
4563 else if (hashdist) 4737 else if (hashdist)
4564 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4738 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4565 else { 4739 else {
4566 unsigned long order = get_order(size);
4567 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4568 /* 4740 /*
4569 * If bucketsize is not a power-of-two, we may free 4741 * If bucketsize is not a power-of-two, we may free
4570 * some pages at the end of hash table. 4742 * some pages at the end of hash table which
4743 * alloc_pages_exact() automatically does
4571 */ 4744 */
4572 if (table) { 4745 if (get_order(size) < MAX_ORDER)
4573 unsigned long alloc_end = (unsigned long)table + 4746 table = alloc_pages_exact(size, GFP_ATOMIC);
4574 (PAGE_SIZE << order);
4575 unsigned long used = (unsigned long)table +
4576 PAGE_ALIGN(size);
4577 split_page(virt_to_page(table), order);
4578 while (used < alloc_end) {
4579 free_page(used);
4580 used += PAGE_SIZE;
4581 }
4582 }
4583 } 4747 }
4584 } while (!table && size > PAGE_SIZE && --log2qty); 4748 } while (!table && size > PAGE_SIZE && --log2qty);
4585 4749
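The hash-table hunk above replaces the hand-rolled __get_free_pages() + split_page() + free-the-tail sequence with alloc_pages_exact(), which does essentially the same trimming internally and is only attempted while get_order(size) stays below MAX_ORDER. A userspace sketch of the page accounting behind that idea; the 384KB table size is an invented example:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* smallest order such that (PAGE_SIZE << order) >= size */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long size = 384 * 1024;	/* hash table of 384KB */
	unsigned int order = get_order(size);	/* 7 -> one 512KB block */
	unsigned long used_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long alloc_pages = 1UL << order;

	/* the exact-size allocation grabs the 2^order block, splits it
	 * and hands the trailing pages straight back to the allocator */
	printf("order %u: %lu pages allocated, %lu kept, %lu freed back\n",
	       order, alloc_pages, used_pages, alloc_pages - used_pages);
	return 0;
}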
@@ -4597,6 +4761,16 @@ void *__init alloc_large_system_hash(const char *tablename,
4597 if (_hash_mask) 4761 if (_hash_mask)
4598 *_hash_mask = (1 << log2qty) - 1; 4762 *_hash_mask = (1 << log2qty) - 1;
4599 4763
4764 /*
4765 * If hashdist is set, the table allocation is done with __vmalloc()
4766 * which invokes the kmemleak_alloc() callback. This function may also
4767 * be called before the slab and kmemleak are initialised when
4768 * kmemleak simply buffers the request to be executed later
4769 * (GFP_ATOMIC flag ignored in this case).
4770 */
4771 if (!hashdist)
4772 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4773
4600 return table; 4774 return table;
4601} 4775}
4602 4776
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c991df..11a8a10a3909 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid)
69 return 0; 69 return 0;
70} 70}
71 71
72void __init page_cgroup_init(void) 72void __init page_cgroup_init_flatmem(void)
73{ 73{
74 74
75 int nid, fail; 75 int nid, fail;
@@ -113,16 +113,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
113 if (!section->page_cgroup) { 113 if (!section->page_cgroup) {
114 nid = page_to_nid(pfn_to_page(pfn)); 114 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
116 if (slab_is_available()) { 116 VM_BUG_ON(!slab_is_available());
117 base = kmalloc_node(table_size, 117 base = kmalloc_node(table_size,
118 GFP_KERNEL | __GFP_NOWARN, nid); 118 GFP_KERNEL | __GFP_NOWARN, nid);
119 if (!base) 119 if (!base)
120 base = vmalloc_node(table_size, nid); 120 base = vmalloc_node(table_size, nid);
121 } else {
122 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
123 table_size,
124 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
125 }
126 } else { 121 } else {
127 /* 122 /*
128 * We don't have to allocate page_cgroup again, but 123 * We don't have to allocate page_cgroup again, but
diff --git a/mm/page_io.c b/mm/page_io.c
index 3023c475e041..c6f3e5071de3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -120,7 +120,7 @@ out:
120 return ret; 120 return ret;
121} 121}
122 122
123int swap_readpage(struct file *file, struct page *page) 123int swap_readpage(struct page *page)
124{ 124{
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index f2caf96993f8..235ac440c44e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -58,14 +58,6 @@ static DEFINE_SPINLOCK(pdflush_lock);
58int nr_pdflush_threads = 0; 58int nr_pdflush_threads = 0;
59 59
60/* 60/*
61 * The max/min number of pdflush threads. R/W by sysctl at
62 * /proc/sys/vm/nr_pdflush_threads_max/min
63 */
64int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
65int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
66
67
68/*
69 * The time at which the pdflush thread pool last went empty 61 * The time at which the pdflush thread pool last went empty
70 */ 62 */
71static unsigned long last_empty_jifs; 63static unsigned long last_empty_jifs;
@@ -76,7 +68,7 @@ static unsigned long last_empty_jifs;
76 * Thread pool management algorithm: 68 * Thread pool management algorithm:
77 * 69 *
78 * - The minimum and maximum number of pdflush instances are bound 70 * - The minimum and maximum number of pdflush instances are bound
79 * by nr_pdflush_threads_min and nr_pdflush_threads_max. 71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
80 * 72 *
81 * - If there have been no idle pdflush instances for 1 second, create 73 * - If there have been no idle pdflush instances for 1 second, create
82 * a new one. 74 * a new one.
@@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work)
142 * To throttle creation, we reset last_empty_jifs. 134 * To throttle creation, we reset last_empty_jifs.
143 */ 135 */
144 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { 136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
145 if (list_empty(&pdflush_list) && 137 if (list_empty(&pdflush_list)) {
146 nr_pdflush_threads < nr_pdflush_threads_max) { 138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
147 last_empty_jifs = jiffies; 139 last_empty_jifs = jiffies;
148 nr_pdflush_threads++; 140 nr_pdflush_threads++;
149 spin_unlock_irq(&pdflush_lock); 141 spin_unlock_irq(&pdflush_lock);
150 start_one_pdflush_thread(); 142 start_one_pdflush_thread();
151 spin_lock_irq(&pdflush_lock); 143 spin_lock_irq(&pdflush_lock);
144 }
152 } 145 }
153 } 146 }
154 147
@@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work)
160 */ 153 */
161 if (list_empty(&pdflush_list)) 154 if (list_empty(&pdflush_list))
162 continue; 155 continue;
163 if (nr_pdflush_threads <= nr_pdflush_threads_min) 156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
164 continue; 157 continue;
165 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
166 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { 159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -266,9 +259,9 @@ static int __init pdflush_init(void)
266 * Pre-set nr_pdflush_threads... If we fail to create, 259 * Pre-set nr_pdflush_threads... If we fail to create,
267 * the count will be decremented. 260 * the count will be decremented.
268 */ 261 */
269 nr_pdflush_threads = nr_pdflush_threads_min; 262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
270 263
271 for (i = 0; i < nr_pdflush_threads_min; i++) 264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
272 start_one_pdflush_thread(); 265 start_one_pdflush_thread();
273 return 0; 266 return 0;
274} 267}
diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8fbca12..c0b2c1a76e81 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -23,7 +23,7 @@
23 * Allocation is done in offset-size areas of single unit space. Ie, 23 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart. 26 * percpu base registers pcpu_unit_size apart.
27 * 27 *
28 * There are usually many small percpu allocations many of them as 28 * There are usually many small percpu allocations many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists 29 * small as 4 bytes. The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
38 * region and negative allocated. Allocation inside a chunk is done 38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching 39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator. 40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk 41 * Chunks can be determined from the address using the index field
42 * mapping during free. 42 * in the page struct. The index field contains a pointer to the chunk.
43 * 43 *
44 * To use this allocator, arch code should do the followings. 44 * To use this allocator, arch code should do the followings.
45 * 45 *
@@ -61,7 +61,6 @@
61#include <linux/mutex.h> 61#include <linux/mutex.h>
62#include <linux/percpu.h> 62#include <linux/percpu.h>
63#include <linux/pfn.h> 63#include <linux/pfn.h>
64#include <linux/rbtree.h>
65#include <linux/slab.h> 64#include <linux/slab.h>
66#include <linux/spinlock.h> 65#include <linux/spinlock.h>
67#include <linux/vmalloc.h> 66#include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
88 87
89struct pcpu_chunk { 88struct pcpu_chunk {
90 struct list_head list; /* linked to pcpu_slot lists */ 89 struct list_head list; /* linked to pcpu_slot lists */
91 struct rb_node rb_node; /* key is chunk->vm->addr */
92 int free_size; /* free bytes in the chunk */ 90 int free_size; /* free bytes in the chunk */
93 int contig_hint; /* max contiguous size hint */ 91 int contig_hint; /* max contiguous size hint */
94 struct vm_struct *vm; /* mapped vmalloc region */ 92 struct vm_struct *vm; /* mapped vmalloc region */
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
110void *pcpu_base_addr __read_mostly; 108void *pcpu_base_addr __read_mostly;
111EXPORT_SYMBOL_GPL(pcpu_base_addr); 109EXPORT_SYMBOL_GPL(pcpu_base_addr);
112 110
113/* optional reserved chunk, only accessible for reserved allocations */ 111/*
112 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different
114 * ways and thus often doesn't live in the vmalloc area.
115 */
116static struct pcpu_chunk *pcpu_first_chunk;
117
118/*
119 * Optional reserved chunk. This chunk reserves part of the first
120 * chunk and serves it for reserved allocations. The amount of
121 * reserved offset is in pcpu_reserved_chunk_limit. When reserved
122 * area doesn't exist, the following variables contain NULL and 0
123 * respectively.
124 */
114static struct pcpu_chunk *pcpu_reserved_chunk; 125static struct pcpu_chunk *pcpu_reserved_chunk;
115/* offset limit of the reserved chunk */
116static int pcpu_reserved_chunk_limit; 126static int pcpu_reserved_chunk_limit;
117 127
118/* 128/*
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
121 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
122 * protects allocation/reclaim paths, chunks and chunk->page arrays. 132 * protects allocation/reclaim paths, chunks and chunk->page arrays.
123 * The latter is a spinlock and protects the index data structures - 133 * The latter is a spinlock and protects the index data structures -
124 * chunk slots, rbtree, chunks and area maps in chunks. 134 * chunk slots, chunks and area maps in chunks.
125 * 135 *
126 * During allocation, pcpu_alloc_mutex is kept locked all the time and 136 * During allocation, pcpu_alloc_mutex is kept locked all the time and
127 * pcpu_lock is grabbed and released as necessary. All actual memory 137 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
140static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 150static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
141 151
142static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 152static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
143static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
144 153
145/* reclaim work to release fully free chunks, scheduled from free path */ 154/* reclaim work to release fully free chunks, scheduled from free path */
146static void pcpu_reclaim(struct work_struct *work); 155static void pcpu_reclaim(struct work_struct *work);
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
191 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
192} 201}
193 202
203/* set the pointer to a chunk in a page struct */
204static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
205{
206 page->index = (unsigned long)pcpu;
207}
208
209/* obtain pointer to a chunk from a page struct */
210static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
211{
212 return (struct pcpu_chunk *)page->index;
213}
214
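The two helpers above replace the rb-tree lookup with a back-pointer stored in the page's otherwise unused index field, so pcpu_chunk_addr_search() can go address -> vmalloc page -> owning chunk directly. A minimal round trip in ordinary C, using cut-down stand-ins for struct page and struct pcpu_chunk:

#include <assert.h>
#include <stdio.h>

struct page { unsigned long index; };	/* only the field we need */
struct pcpu_chunk { int free_size; };

static void set_page_chunk(struct page *page, struct pcpu_chunk *chunk)
{
	page->index = (unsigned long)chunk;
}

static struct pcpu_chunk *get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

int main(void)
{
	struct pcpu_chunk chunk = { .free_size = 4096 };
	struct page page;

	set_page_chunk(&page, &chunk);		/* done when the chunk is populated */
	assert(get_page_chunk(&page) == &chunk);	/* done on free/address search */
	printf("chunk has %d bytes free\n", get_page_chunk(&page)->free_size);
	return 0;
}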
194/** 215/**
195 * pcpu_mem_alloc - allocate memory 216 * pcpu_mem_alloc - allocate memory
196 * @size: bytes to allocate 217 * @size: bytes to allocate
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
257 } 278 }
258} 279}
259 280
260static struct rb_node **pcpu_chunk_rb_search(void *addr,
261 struct rb_node **parentp)
262{
263 struct rb_node **p = &pcpu_addr_root.rb_node;
264 struct rb_node *parent = NULL;
265 struct pcpu_chunk *chunk;
266
267 while (*p) {
268 parent = *p;
269 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
270
271 if (addr < chunk->vm->addr)
272 p = &(*p)->rb_left;
273 else if (addr > chunk->vm->addr)
274 p = &(*p)->rb_right;
275 else
276 break;
277 }
278
279 if (parentp)
280 *parentp = parent;
281 return p;
282}
283
284/** 281/**
285 * pcpu_chunk_addr_search - search for chunk containing specified address 282 * pcpu_chunk_addr_search - determine chunk containing specified address
286 * @addr: address to search for 283 * @addr: address for which the chunk needs to be determined.
287 *
288 * Look for chunk which might contain @addr. More specifically, it
289 * searchs for the chunk with the highest start address which isn't
290 * beyond @addr.
291 *
292 * CONTEXT:
293 * pcpu_lock.
294 * 284 *
295 * RETURNS: 285 * RETURNS:
296 * The address of the found chunk. 286 * The address of the found chunk.
297 */ 287 */
298static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 288static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
299{ 289{
300 struct rb_node *n, *parent; 290 void *first_start = pcpu_first_chunk->vm->addr;
301 struct pcpu_chunk *chunk;
302 291
303 /* is it in the reserved chunk? */ 292 /* is it in the first chunk? */
304 if (pcpu_reserved_chunk) { 293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
305 void *start = pcpu_reserved_chunk->vm->addr; 294 /* is it in the reserved area? */
306 295 if (addr < first_start + pcpu_reserved_chunk_limit)
307 if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
308 return pcpu_reserved_chunk; 296 return pcpu_reserved_chunk;
297 return pcpu_first_chunk;
309 } 298 }
310 299
311 /* nah... search the regular ones */ 300 return pcpu_get_page_chunk(vmalloc_to_page(addr));
312 n = *pcpu_chunk_rb_search(addr, &parent);
313 if (!n) {
314 /* no exactly matching chunk, the parent is the closest */
315 n = parent;
316 BUG_ON(!n);
317 }
318 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
319
320 if (addr < chunk->vm->addr) {
321 /* the parent was the next one, look for the previous one */
322 n = rb_prev(n);
323 BUG_ON(!n);
324 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
325 }
326
327 return chunk;
328}
329
330/**
331 * pcpu_chunk_addr_insert - insert chunk into address rb tree
332 * @new: chunk to insert
333 *
334 * Insert @new into address rb tree.
335 *
336 * CONTEXT:
337 * pcpu_lock.
338 */
339static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
340{
341 struct rb_node **p, *parent;
342
343 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
344 BUG_ON(*p);
345 rb_link_node(&new->rb_node, parent, p);
346 rb_insert_color(&new->rb_node, &pcpu_addr_root);
347} 301}
348 302
349/** 303/**
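
The hunks above replace the address rbtree with a per-page back-pointer: pcpu_populate_chunk() stores the owning chunk in page->index, and pcpu_chunk_addr_search() only special-cases the first (static/reserved) chunk before falling back to vmalloc_to_page(). A minimal user-space sketch of that lookup idea follows; the types, the pages[] array standing in for vmalloc_to_page(), and all other names are illustrative stand-ins, not kernel API.

/* Sketch: map an address to its owning chunk via a per-page back-pointer,
 * mirroring pcpu_set_page_chunk()/pcpu_get_page_chunk() in the patch.
 * "struct page" and the page lookup here are simplified stand-ins. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12
#define NR_PAGES   16

struct page  { unsigned long index; };      /* index doubles as chunk pointer */
struct chunk { const char *name; };

static struct page pages[NR_PAGES];         /* stand-in for vmalloc_to_page() */

static struct page *addr_to_page(void *base, void *addr)
{
    return &pages[((char *)addr - (char *)base) >> PAGE_SHIFT];
}

static void set_page_chunk(struct page *pg, struct chunk *c)
{
    pg->index = (unsigned long)c;           /* back-pointer, set at populate time */
}

static struct chunk *get_page_chunk(struct page *pg)
{
    return (struct chunk *)pg->index;
}

int main(void)
{
    char *base = malloc(NR_PAGES << PAGE_SHIFT);
    struct chunk c = { "dynamic chunk" };

    for (int i = 0; i < NR_PAGES; i++)
        set_page_chunk(&pages[i], &c);       /* done when pages are populated */

    void *addr = base + 5 * (1 << PAGE_SHIFT) + 123;
    printf("%s\n", get_page_chunk(addr_to_page(base, addr))->name);
    free(base);
    return 0;
}

In the real code the first-chunk range check still runs before this lookup, because the static and reserved areas are not backed by vmalloc pages.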
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
755 alloc_mask, 0); 709 alloc_mask, 0);
756 if (!*pagep) 710 if (!*pagep)
757 goto err; 711 goto err;
712 pcpu_set_page_chunk(*pagep, chunk);
758 } 713 }
759 } 714 }
760 715
@@ -879,7 +834,6 @@ restart:
879 834
880 spin_lock_irq(&pcpu_lock); 835 spin_lock_irq(&pcpu_lock);
881 pcpu_chunk_relocate(chunk, -1); 836 pcpu_chunk_relocate(chunk, -1);
882 pcpu_chunk_addr_insert(chunk);
883 goto restart; 837 goto restart;
884 838
885area_found: 839area_found:
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
968 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 922 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
969 continue; 923 continue;
970 924
971 rb_erase(&chunk->rb_node, &pcpu_addr_root);
972 list_move(&chunk->list, &todo); 925 list_move(&chunk->list, &todo);
973 } 926 }
974 927
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1147 1100
1148 if (reserved_size) { 1101 if (reserved_size) {
1149 schunk->free_size = reserved_size; 1102 schunk->free_size = reserved_size;
1150 pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ 1103 pcpu_reserved_chunk = schunk;
1104 pcpu_reserved_chunk_limit = static_size + reserved_size;
1151 } else { 1105 } else {
1152 schunk->free_size = dyn_size; 1106 schunk->free_size = dyn_size;
1153 dyn_size = 0; /* dynamic area covered */ 1107 dyn_size = 0; /* dynamic area covered */
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1158 if (schunk->free_size) 1112 if (schunk->free_size)
1159 schunk->map[schunk->map_used++] = schunk->free_size; 1113 schunk->map[schunk->map_used++] = schunk->free_size;
1160 1114
1161 pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1162
1163 /* init dynamic chunk if necessary */ 1115 /* init dynamic chunk if necessary */
1164 if (dyn_size) { 1116 if (dyn_size) {
1165 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1117 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1226 } 1178 }
1227 1179
1228 /* link the first chunk in */ 1180 /* link the first chunk in */
1229 if (!dchunk) { 1181 pcpu_first_chunk = dchunk ?: schunk;
1230 pcpu_chunk_relocate(schunk, -1); 1182 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1231 pcpu_chunk_addr_insert(schunk);
1232 } else {
1233 pcpu_chunk_relocate(dchunk, -1);
1234 pcpu_chunk_addr_insert(dchunk);
1235 }
1236 1183
1237 /* we're done */ 1184 /* we're done */
1238 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1185 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d525513..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -133,15 +133,12 @@ out:
133} 133}
134 134
135/* 135/*
136 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 136 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
137 * the pages first, then submits them all for I/O. This avoids the very bad 137 * the pages first, then submits them all for I/O. This avoids the very bad
138 * behaviour which would occur if page allocations are causing VM writeback. 138 * behaviour which would occur if page allocations are causing VM writeback.
139 * We really don't want to intermingle reads and writes like that. 139 * We really don't want to intermingle reads and writes like that.
140 * 140 *
141 * Returns the number of pages requested, or the maximum amount of I/O allowed. 141 * Returns the number of pages requested, or the maximum amount of I/O allowed.
142 *
143 * do_page_cache_readahead() returns -1 if it encountered request queue
144 * congestion.
145 */ 142 */
146static int 143static int
147__do_page_cache_readahead(struct address_space *mapping, struct file *filp, 144__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 207 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
211 return -EINVAL; 208 return -EINVAL;
212 209
210 nr_to_read = max_sane_readahead(nr_to_read);
213 while (nr_to_read) { 211 while (nr_to_read) {
214 int err; 212 int err;
215 213
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
231} 229}
232 230
233/* 231/*
234 * This version skips the IO if the queue is read-congested, and will tell the
235 * block layer to abandon the readahead if request allocation would block.
236 *
237 * force_page_cache_readahead() will ignore queue congestion and will block on
238 * request queues.
239 */
240int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
241 pgoff_t offset, unsigned long nr_to_read)
242{
243 if (bdi_read_congested(mapping->backing_dev_info))
244 return -1;
245
246 return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
247}
248
249/*
250 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 232 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
251 * sensible upper limit. 233 * sensible upper limit.
252 */ 234 */
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
259/* 241/*
260 * Submit IO for the read-ahead request in file_ra_state. 242 * Submit IO for the read-ahead request in file_ra_state.
261 */ 243 */
262static unsigned long ra_submit(struct file_ra_state *ra, 244unsigned long ra_submit(struct file_ra_state *ra,
263 struct address_space *mapping, struct file *filp) 245 struct address_space *mapping, struct file *filp)
264{ 246{
265 int actual; 247 int actual;
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
348 */ 330 */
349 331
350/* 332/*
333 * Count contiguously cached pages from @offset-1 to @offset-@max,
334 * this count is a conservative estimation of
335 * - length of the sequential read sequence, or
336 * - thrashing threshold in memory tight systems
337 */
338static pgoff_t count_history_pages(struct address_space *mapping,
339 struct file_ra_state *ra,
340 pgoff_t offset, unsigned long max)
341{
342 pgoff_t head;
343
344 rcu_read_lock();
345 head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
346 rcu_read_unlock();
347
348 return offset - 1 - head;
349}
350
351/*
352 * page cache context based read-ahead
353 */
354static int try_context_readahead(struct address_space *mapping,
355 struct file_ra_state *ra,
356 pgoff_t offset,
357 unsigned long req_size,
358 unsigned long max)
359{
360 pgoff_t size;
361
362 size = count_history_pages(mapping, ra, offset, max);
363
364 /*
365 * no history pages:
366 * it could be a random read
367 */
368 if (!size)
369 return 0;
370
371 /*
372 * starts from beginning of file:
373 * it is a strong indication of long-run stream (or whole-file-read)
374 */
375 if (size >= offset)
376 size *= 2;
377
378 ra->start = offset;
379 ra->size = get_init_ra_size(size + req_size, max);
380 ra->async_size = ra->size;
381
382 return 1;
383}
384
385/*
351 * A minimal readahead algorithm for trivial sequential/random reads. 386 * A minimal readahead algorithm for trivial sequential/random reads.
352 */ 387 */
353static unsigned long 388static unsigned long
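
try_context_readahead() above sizes the next window from the run of already-cached pages ending just before the current offset, and doubles that estimate when the run reaches back to the start of the file. A rough, self-contained sketch of the heuristic, with a plain bool array standing in for the page-cache radix tree and every name invented for illustration:

/* Toy model of the context read-ahead heuristic: count how many pages
 * immediately before 'offset' are already cached, then derive a window. */
#include <stdio.h>
#include <stdbool.h>

static unsigned long count_history(const bool *cached, unsigned long offset,
                                   unsigned long max)
{
    unsigned long n = 0;

    while (n < max && offset >= n + 1 && cached[offset - 1 - n])
        n++;                            /* contiguous run ending at offset-1 */
    return n;
}

static unsigned long context_window(const bool *cached, unsigned long offset,
                                    unsigned long req, unsigned long max)
{
    unsigned long hist = count_history(cached, offset, max);

    if (!hist)
        return 0;                       /* no history: looks like a random read */
    if (hist >= offset)
        hist *= 2;                      /* stream appears to start at file start */
    return hist + req > max ? max : hist + req;
}

int main(void)
{
    bool cached[32] = { [0] = true, [1] = true, [2] = true, [3] = true };

    /* 4 cached history pages reaching file start, request of 2 -> window 10 */
    printf("window = %lu pages\n", context_window(cached, 4, 2, 16));
    return 0;
}

The kernel version additionally clamps the result through get_init_ra_size() and queries the radix tree under rcu_read_lock(); those details are omitted here.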
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
356 bool hit_readahead_marker, pgoff_t offset, 391 bool hit_readahead_marker, pgoff_t offset,
357 unsigned long req_size) 392 unsigned long req_size)
358{ 393{
359 int max = ra->ra_pages; /* max readahead pages */ 394 unsigned long max = max_sane_readahead(ra->ra_pages);
360 pgoff_t prev_offset; 395
361 int sequential; 396 /*
397 * start of file
398 */
399 if (!offset)
400 goto initial_readahead;
362 401
363 /* 402 /*
364 * It's the expected callback offset, assume sequential access. 403 * It's the expected callback offset, assume sequential access.
365 * Ramp up sizes, and push forward the readahead window. 404 * Ramp up sizes, and push forward the readahead window.
366 */ 405 */
367 if (offset && (offset == (ra->start + ra->size - ra->async_size) || 406 if ((offset == (ra->start + ra->size - ra->async_size) ||
368 offset == (ra->start + ra->size))) { 407 offset == (ra->start + ra->size))) {
369 ra->start += ra->size; 408 ra->start += ra->size;
370 ra->size = get_next_ra_size(ra, max); 409 ra->size = get_next_ra_size(ra, max);
371 ra->async_size = ra->size; 410 ra->async_size = ra->size;
372 goto readit; 411 goto readit;
373 } 412 }
374 413
375 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
376 sequential = offset - prev_offset <= 1UL || req_size > max;
377
378 /*
379 * Standalone, small read.
380 * Read as is, and do not pollute the readahead state.
381 */
382 if (!hit_readahead_marker && !sequential) {
383 return __do_page_cache_readahead(mapping, filp,
384 offset, req_size, 0);
385 }
386
387 /* 414 /*
388 * Hit a marked page without valid readahead state. 415 * Hit a marked page without valid readahead state.
389 * E.g. interleaved reads. 416 * E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
394 pgoff_t start; 421 pgoff_t start;
395 422
396 rcu_read_lock(); 423 rcu_read_lock();
397 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); 424 start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
398 rcu_read_unlock(); 425 rcu_read_unlock();
399 426
400 if (!start || start - offset > max) 427 if (!start || start - offset > max)
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
402 429
403 ra->start = start; 430 ra->start = start;
404 ra->size = start - offset; /* old async_size */ 431 ra->size = start - offset; /* old async_size */
432 ra->size += req_size;
405 ra->size = get_next_ra_size(ra, max); 433 ra->size = get_next_ra_size(ra, max);
406 ra->async_size = ra->size; 434 ra->async_size = ra->size;
407 goto readit; 435 goto readit;
408 } 436 }
409 437
410 /* 438 /*
411 * It may be one of 439 * oversize read
412 * - first read on start of file 440 */
413 * - sequential cache miss 441 if (req_size > max)
414 * - oversize random read 442 goto initial_readahead;
415 * Start readahead for it. 443
444 /*
445 * sequential cache miss
446 */
447 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
448 goto initial_readahead;
449
450 /*
451 * Query the page cache and look for the traces (cached history pages)
452 * that a sequential stream would leave behind.
453 */
454 if (try_context_readahead(mapping, ra, offset, req_size, max))
455 goto readit;
456
457 /*
458 * standalone, small random read
459 * Read as is, and do not pollute the readahead state.
416 */ 460 */
461 return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
462
463initial_readahead:
417 ra->start = offset; 464 ra->start = offset;
418 ra->size = get_init_ra_size(req_size, max); 465 ra->size = get_init_ra_size(req_size, max);
419 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 466 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
420 467
421readit: 468readit:
469 /*
470 * Will this read hit the readahead marker made by itself?
471 * If so, trigger the readahead marker hit now, and merge
472 * the resulted next readahead window into the current one.
473 */
474 if (offset == ra->start && ra->size == ra->async_size) {
475 ra->async_size = get_next_ra_size(ra, max);
476 ra->size += ra->async_size;
477 }
478
422 return ra_submit(ra, mapping, filp); 479 return ra_submit(ra, mapping, filp);
423} 480}
424 481
diff --git a/mm/rmap.c b/mm/rmap.c
index 16521664010d..c9ccc1a72dc3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -14,7 +14,7 @@
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 17 * Contributions by Hugh Dickins 2003, 2004
18 */ 18 */
19 19
20/* 20/*
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
333 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
334 */ 334 */
335static int page_referenced_one(struct page *page, 335static int page_referenced_one(struct page *page,
336 struct vm_area_struct *vma, unsigned int *mapcount) 336 struct vm_area_struct *vma,
337 unsigned int *mapcount,
338 unsigned long *vm_flags)
337{ 339{
338 struct mm_struct *mm = vma->vm_mm; 340 struct mm_struct *mm = vma->vm_mm;
339 unsigned long address; 341 unsigned long address;
@@ -381,11 +383,14 @@ out_unmap:
381 (*mapcount)--; 383 (*mapcount)--;
382 pte_unmap_unlock(pte, ptl); 384 pte_unmap_unlock(pte, ptl);
383out: 385out:
386 if (referenced)
387 *vm_flags |= vma->vm_flags;
384 return referenced; 388 return referenced;
385} 389}
386 390
387static int page_referenced_anon(struct page *page, 391static int page_referenced_anon(struct page *page,
388 struct mem_cgroup *mem_cont) 392 struct mem_cgroup *mem_cont,
393 unsigned long *vm_flags)
389{ 394{
390 unsigned int mapcount; 395 unsigned int mapcount;
391 struct anon_vma *anon_vma; 396 struct anon_vma *anon_vma;
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page,
405 */ 410 */
406 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 411 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
407 continue; 412 continue;
408 referenced += page_referenced_one(page, vma, &mapcount); 413 referenced += page_referenced_one(page, vma,
414 &mapcount, vm_flags);
409 if (!mapcount) 415 if (!mapcount)
410 break; 416 break;
411 } 417 }
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page,
418 * page_referenced_file - referenced check for object-based rmap 424 * page_referenced_file - referenced check for object-based rmap
419 * @page: the page we're checking references on. 425 * @page: the page we're checking references on.
420 * @mem_cont: target memory controller 426 * @mem_cont: target memory controller
427 * @vm_flags: collect encountered vma->vm_flags of VMAs which actually referenced the page
421 * 428 *
422 * For an object-based mapped page, find all the places it is mapped and 429 * For an object-based mapped page, find all the places it is mapped and
423 * check/clear the referenced flag. This is done by following the page->mapping 430 * check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page,
427 * This function is only called from page_referenced for object-based pages. 434 * This function is only called from page_referenced for object-based pages.
428 */ 435 */
429static int page_referenced_file(struct page *page, 436static int page_referenced_file(struct page *page,
430 struct mem_cgroup *mem_cont) 437 struct mem_cgroup *mem_cont,
438 unsigned long *vm_flags)
431{ 439{
432 unsigned int mapcount; 440 unsigned int mapcount;
433 struct address_space *mapping = page->mapping; 441 struct address_space *mapping = page->mapping;
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page,
467 */ 475 */
468 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 476 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
469 continue; 477 continue;
470 referenced += page_referenced_one(page, vma, &mapcount); 478 referenced += page_referenced_one(page, vma,
479 &mapcount, vm_flags);
471 if (!mapcount) 480 if (!mapcount)
472 break; 481 break;
473 } 482 }
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page,
481 * @page: the page to test 490 * @page: the page to test
482 * @is_locked: caller holds lock on the page 491 * @is_locked: caller holds lock on the page
483 * @mem_cont: target memory controller 492 * @mem_cont: target memory controller
493 * @vm_flags: collect encountered vma->vm_flags of VMAs which actually referenced the page
484 * 494 *
485 * Quick test_and_clear_referenced for all mappings to a page, 495 * Quick test_and_clear_referenced for all mappings to a page,
486 * returns the number of ptes which referenced the page. 496 * returns the number of ptes which referenced the page.
487 */ 497 */
488int page_referenced(struct page *page, int is_locked, 498int page_referenced(struct page *page,
489 struct mem_cgroup *mem_cont) 499 int is_locked,
500 struct mem_cgroup *mem_cont,
501 unsigned long *vm_flags)
490{ 502{
491 int referenced = 0; 503 int referenced = 0;
492 504
493 if (TestClearPageReferenced(page)) 505 if (TestClearPageReferenced(page))
494 referenced++; 506 referenced++;
495 507
508 *vm_flags = 0;
496 if (page_mapped(page) && page->mapping) { 509 if (page_mapped(page) && page->mapping) {
497 if (PageAnon(page)) 510 if (PageAnon(page))
498 referenced += page_referenced_anon(page, mem_cont); 511 referenced += page_referenced_anon(page, mem_cont,
512 vm_flags);
499 else if (is_locked) 513 else if (is_locked)
500 referenced += page_referenced_file(page, mem_cont); 514 referenced += page_referenced_file(page, mem_cont,
515 vm_flags);
501 else if (!trylock_page(page)) 516 else if (!trylock_page(page))
502 referenced++; 517 referenced++;
503 else { 518 else {
504 if (page->mapping) 519 if (page->mapping)
505 referenced += 520 referenced += page_referenced_file(page,
506 page_referenced_file(page, mem_cont); 521 mem_cont, vm_flags);
507 unlock_page(page); 522 unlock_page(page);
508 } 523 }
509 } 524 }
@@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration)
1202 return ret; 1217 return ret;
1203} 1218}
1204 1219
1205#ifdef CONFIG_UNEVICTABLE_LRU
1206/** 1220/**
1207 * try_to_munlock - try to munlock a page 1221 * try_to_munlock - try to munlock a page
1208 * @page: the page to be munlocked 1222 * @page: the page to be munlocked
@@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page)
1226 else 1240 else
1227 return try_to_unmap_file(page, 1, 0); 1241 return try_to_unmap_file(page, 1, 0);
1228} 1242}
1229#endif 1243
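
The rmap changes thread a vm_flags out-parameter through page_referenced_one()/page_referenced_anon()/page_referenced_file(), so a caller learns not only how many ptes referenced the page but also the union of the vm_flags of the referencing VMAs. A toy sketch of that calling convention; the stub types, flag values and function names below are chosen purely for illustration:

#include <stdio.h>

#define VM_EXEC   0x00000004UL
#define VM_LOCKED 0x00002000UL

struct vma { unsigned long vm_flags; int referenced; };

/* stand-in for page_referenced(): count referencing VMAs and OR their
 * flags into *vm_flags, exactly the pattern added by the patch */
static int page_referenced_stub(struct vma *vmas, int n, unsigned long *vm_flags)
{
    int referenced = 0;

    *vm_flags = 0;
    for (int i = 0; i < n; i++) {
        if (vmas[i].referenced) {
            referenced++;
            *vm_flags |= vmas[i].vm_flags;
        }
    }
    return referenced;
}

int main(void)
{
    struct vma vmas[] = {
        { VM_EXEC,   1 },
        { VM_LOCKED, 0 },
    };
    unsigned long flags;
    int ref = page_referenced_stub(vmas, 2, &flags);

    /* a caller (e.g. reclaim) can now tell *how* the page was referenced */
    printf("referenced=%d exec=%d\n", ref, !!(flags & VM_EXEC));
    return 0;
}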
diff --git a/mm/shmem.c b/mm/shmem.c
index f9cb20ebb990..e89d7ec18eda 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 swap_free(swap); 1100 swapcache_free(swap, NULL);
1101redirty: 1101redirty:
1102 set_page_dirty(page); 1102 set_page_dirty(page);
1103 if (wbc->for_reclaim) 1103 if (wbc->for_reclaim)
@@ -1340,8 +1340,12 @@ repeat:
1340 shmem_swp_unmap(entry); 1340 shmem_swp_unmap(entry);
1341 spin_unlock(&info->lock); 1341 spin_unlock(&info->lock);
1342 if (error == -ENOMEM) { 1342 if (error == -ENOMEM) {
1343 /* allow reclaim from this memory cgroup */ 1343 /*
1344 error = mem_cgroup_shrink_usage(swappage, 1344 * reclaim from proper memory cgroup and
1345 * call memcg's OOM if needed.
1346 */
1347 error = mem_cgroup_shmem_charge_fallback(
1348 swappage,
1345 current->mm, 1349 current->mm,
1346 gfp); 1350 gfp);
1347 if (error) { 1351 if (error) {
@@ -2608,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2608 * @size: size to be set for the file 2612 * @size: size to be set for the file
2609 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2613 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2610 */ 2614 */
2611struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) 2615struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2612{ 2616{
2613 int error; 2617 int error;
2614 struct file *file; 2618 struct file *file;
@@ -2655,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2655 if (error) 2659 if (error)
2656 goto close_file; 2660 goto close_file;
2657#endif 2661#endif
2662 ima_counts_get(file);
2658 return file; 2663 return file;
2659 2664
2660close_file: 2665close_file:
@@ -2680,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2680 if (IS_ERR(file)) 2685 if (IS_ERR(file))
2681 return PTR_ERR(file); 2686 return PTR_ERR(file);
2682 2687
2683 ima_shm_check(file);
2684 if (vma->vm_file) 2688 if (vma->vm_file)
2685 fput(vma->vm_file); 2689 fput(vma->vm_file);
2686 vma->vm_file = file; 2690 vma->vm_file = file;
diff --git a/mm/slab.c b/mm/slab.c
index 9a90b00d2f91..d08692303f6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,17 +102,19 @@
102#include <linux/cpu.h> 102#include <linux/cpu.h>
103#include <linux/sysctl.h> 103#include <linux/sysctl.h>
104#include <linux/module.h> 104#include <linux/module.h>
105#include <trace/kmemtrace.h> 105#include <linux/kmemtrace.h>
106#include <linux/rcupdate.h> 106#include <linux/rcupdate.h>
107#include <linux/string.h> 107#include <linux/string.h>
108#include <linux/uaccess.h> 108#include <linux/uaccess.h>
109#include <linux/nodemask.h> 109#include <linux/nodemask.h>
110#include <linux/kmemleak.h>
110#include <linux/mempolicy.h> 111#include <linux/mempolicy.h>
111#include <linux/mutex.h> 112#include <linux/mutex.h>
112#include <linux/fault-inject.h> 113#include <linux/fault-inject.h>
113#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
114#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
115#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
116 118
117#include <asm/cacheflush.h> 119#include <asm/cacheflush.h>
118#include <asm/tlbflush.h> 120#include <asm/tlbflush.h>
@@ -178,13 +180,13 @@
178 SLAB_STORE_USER | \ 180 SLAB_STORE_USER | \
179 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
180 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
181 SLAB_DEBUG_OBJECTS) 183 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
182#else 184#else
183# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 185# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
184 SLAB_CACHE_DMA | \ 186 SLAB_CACHE_DMA | \
185 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 187 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
186 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 188 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
187 SLAB_DEBUG_OBJECTS) 189 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
188#endif 190#endif
189 191
190/* 192/*
@@ -303,6 +305,12 @@ struct kmem_list3 {
303}; 305};
304 306
305/* 307/*
308 * The slab allocator is initialized with interrupts disabled. Therefore, make
309 * sure early boot allocations don't accidentally enable interrupts.
310 */
311static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
312
313/*
306 * Need this for bootstrapping a per node allocator. 314 * Need this for bootstrapping a per node allocator.
307 */ 315 */
308#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 316#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
@@ -315,7 +323,7 @@ static int drain_freelist(struct kmem_cache *cache,
315 struct kmem_list3 *l3, int tofree); 323 struct kmem_list3 *l3, int tofree);
316static void free_block(struct kmem_cache *cachep, void **objpp, int len, 324static void free_block(struct kmem_cache *cachep, void **objpp, int len,
317 int node); 325 int node);
318static int enable_cpucache(struct kmem_cache *cachep); 326static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
319static void cache_reap(struct work_struct *unused); 327static void cache_reap(struct work_struct *unused);
320 328
321/* 329/*
@@ -373,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
373 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 381 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
374 } while (0) 382 } while (0)
375 383
376/*
377 * struct kmem_cache
378 *
379 * manages a cache.
380 */
381
382struct kmem_cache {
383/* 1) per-cpu data, touched during every alloc/free */
384 struct array_cache *array[NR_CPUS];
385/* 2) Cache tunables. Protected by cache_chain_mutex */
386 unsigned int batchcount;
387 unsigned int limit;
388 unsigned int shared;
389
390 unsigned int buffer_size;
391 u32 reciprocal_buffer_size;
392/* 3) touched by every alloc & free from the backend */
393
394 unsigned int flags; /* constant flags */
395 unsigned int num; /* # of objs per slab */
396
397/* 4) cache_grow/shrink */
398 /* order of pgs per slab (2^n) */
399 unsigned int gfporder;
400
401 /* force GFP flags, e.g. GFP_DMA */
402 gfp_t gfpflags;
403
404 size_t colour; /* cache colouring range */
405 unsigned int colour_off; /* colour offset */
406 struct kmem_cache *slabp_cache;
407 unsigned int slab_size;
408 unsigned int dflags; /* dynamic flags */
409
410 /* constructor func */
411 void (*ctor)(void *obj);
412
413/* 5) cache creation/removal */
414 const char *name;
415 struct list_head next;
416
417/* 6) statistics */
418#if STATS
419 unsigned long num_active;
420 unsigned long num_allocations;
421 unsigned long high_mark;
422 unsigned long grown;
423 unsigned long reaped;
424 unsigned long errors;
425 unsigned long max_freeable;
426 unsigned long node_allocs;
427 unsigned long node_frees;
428 unsigned long node_overflow;
429 atomic_t allochit;
430 atomic_t allocmiss;
431 atomic_t freehit;
432 atomic_t freemiss;
433#endif
434#if DEBUG
435 /*
436 * If debugging is enabled, then the allocator can add additional
437 * fields and/or padding to every object. buffer_size contains the total
438 * object size including these internal fields, the following two
439 * variables contain the offset to the user object and its size.
440 */
441 int obj_offset;
442 int obj_size;
443#endif
444 /*
445 * We put nodelists[] at the end of kmem_cache, because we want to size
446 * this array to nr_node_ids slots instead of MAX_NUMNODES
447 * (see kmem_cache_init())
448 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
449 * is statically defined, so we reserve the max number of nodes.
450 */
451 struct kmem_list3 *nodelists[MAX_NUMNODES];
452 /*
453 * Do not add fields after nodelists[]
454 */
455};
456
457#define CFLGS_OFF_SLAB (0x80000000UL) 384#define CFLGS_OFF_SLAB (0x80000000UL)
458#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 385#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
459 386
@@ -752,6 +679,7 @@ static enum {
752 NONE, 679 NONE,
753 PARTIAL_AC, 680 PARTIAL_AC,
754 PARTIAL_L3, 681 PARTIAL_L3,
682 EARLY,
755 FULL 683 FULL
756} g_cpucache_up; 684} g_cpucache_up;
757 685
@@ -760,7 +688,7 @@ static enum {
760 */ 688 */
761int slab_is_available(void) 689int slab_is_available(void)
762{ 690{
763 return g_cpucache_up == FULL; 691 return g_cpucache_up >= EARLY;
764} 692}
765 693
766static DEFINE_PER_CPU(struct delayed_work, reap_work); 694static DEFINE_PER_CPU(struct delayed_work, reap_work);
@@ -890,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
890 */ 818 */
891 819
892static int use_alien_caches __read_mostly = 1; 820static int use_alien_caches __read_mostly = 1;
893static int numa_platform __read_mostly = 1;
894static int __init noaliencache_setup(char *s) 821static int __init noaliencache_setup(char *s)
895{ 822{
896 use_alien_caches = 0; 823 use_alien_caches = 0;
@@ -958,12 +885,20 @@ static void __cpuinit start_cpu_timer(int cpu)
958} 885}
959 886
960static struct array_cache *alloc_arraycache(int node, int entries, 887static struct array_cache *alloc_arraycache(int node, int entries,
961 int batchcount) 888 int batchcount, gfp_t gfp)
962{ 889{
963 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 890 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
964 struct array_cache *nc = NULL; 891 struct array_cache *nc = NULL;
965 892
966 nc = kmalloc_node(memsize, GFP_KERNEL, node); 893 nc = kmalloc_node(memsize, gfp, node);
894 /*
895 * The array_cache structures contain pointers to free object.
896 * However, when such objects are allocated or transfered to another
897 * cache the pointers are not cleared and they could be counted as
898 * valid references during a kmemleak scan. Therefore, kmemleak must
899 * not scan such objects.
900 */
901 kmemleak_no_scan(nc);
967 if (nc) { 902 if (nc) {
968 nc->avail = 0; 903 nc->avail = 0;
969 nc->limit = entries; 904 nc->limit = entries;
@@ -1003,7 +938,7 @@ static int transfer_objects(struct array_cache *to,
1003#define drain_alien_cache(cachep, alien) do { } while (0) 938#define drain_alien_cache(cachep, alien) do { } while (0)
1004#define reap_alien(cachep, l3) do { } while (0) 939#define reap_alien(cachep, l3) do { } while (0)
1005 940
1006static inline struct array_cache **alloc_alien_cache(int node, int limit) 941static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1007{ 942{
1008 return (struct array_cache **)BAD_ALIEN_MAGIC; 943 return (struct array_cache **)BAD_ALIEN_MAGIC;
1009} 944}
@@ -1034,7 +969,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1034static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 969static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1035static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 970static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1036 971
1037static struct array_cache **alloc_alien_cache(int node, int limit) 972static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1038{ 973{
1039 struct array_cache **ac_ptr; 974 struct array_cache **ac_ptr;
1040 int memsize = sizeof(void *) * nr_node_ids; 975 int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +977,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
1042 977
1043 if (limit > 1) 978 if (limit > 1)
1044 limit = 12; 979 limit = 12;
1045 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 980 ac_ptr = kmalloc_node(memsize, gfp, node);
1046 if (ac_ptr) { 981 if (ac_ptr) {
1047 for_each_node(i) { 982 for_each_node(i) {
1048 if (i == node || !node_online(i)) { 983 if (i == node || !node_online(i)) {
1049 ac_ptr[i] = NULL; 984 ac_ptr[i] = NULL;
1050 continue; 985 continue;
1051 } 986 }
1052 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 987 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
1053 if (!ac_ptr[i]) { 988 if (!ac_ptr[i]) {
1054 for (i--; i >= 0; i--) 989 for (i--; i >= 0; i--)
1055 kfree(ac_ptr[i]); 990 kfree(ac_ptr[i]);
@@ -1282,20 +1217,20 @@ static int __cpuinit cpuup_prepare(long cpu)
1282 struct array_cache **alien = NULL; 1217 struct array_cache **alien = NULL;
1283 1218
1284 nc = alloc_arraycache(node, cachep->limit, 1219 nc = alloc_arraycache(node, cachep->limit,
1285 cachep->batchcount); 1220 cachep->batchcount, GFP_KERNEL);
1286 if (!nc) 1221 if (!nc)
1287 goto bad; 1222 goto bad;
1288 if (cachep->shared) { 1223 if (cachep->shared) {
1289 shared = alloc_arraycache(node, 1224 shared = alloc_arraycache(node,
1290 cachep->shared * cachep->batchcount, 1225 cachep->shared * cachep->batchcount,
1291 0xbaadf00d); 1226 0xbaadf00d, GFP_KERNEL);
1292 if (!shared) { 1227 if (!shared) {
1293 kfree(nc); 1228 kfree(nc);
1294 goto bad; 1229 goto bad;
1295 } 1230 }
1296 } 1231 }
1297 if (use_alien_caches) { 1232 if (use_alien_caches) {
1298 alien = alloc_alien_cache(node, cachep->limit); 1233 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1299 if (!alien) { 1234 if (!alien) {
1300 kfree(shared); 1235 kfree(shared);
1301 kfree(nc); 1236 kfree(nc);
@@ -1399,10 +1334,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1399{ 1334{
1400 struct kmem_list3 *ptr; 1335 struct kmem_list3 *ptr;
1401 1336
1402 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1337 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1403 BUG_ON(!ptr); 1338 BUG_ON(!ptr);
1404 1339
1405 local_irq_disable();
1406 memcpy(ptr, list, sizeof(struct kmem_list3)); 1340 memcpy(ptr, list, sizeof(struct kmem_list3));
1407 /* 1341 /*
1408 * Do not assume that spinlocks can be initialized via memcpy: 1342 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1345,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1411 1345
1412 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1346 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1413 cachep->nodelists[nodeid] = ptr; 1347 cachep->nodelists[nodeid] = ptr;
1414 local_irq_enable();
1415} 1348}
1416 1349
1417/* 1350/*
@@ -1443,10 +1376,8 @@ void __init kmem_cache_init(void)
1443 int order; 1376 int order;
1444 int node; 1377 int node;
1445 1378
1446 if (num_possible_nodes() == 1) { 1379 if (num_possible_nodes() == 1)
1447 use_alien_caches = 0; 1380 use_alien_caches = 0;
1448 numa_platform = 0;
1449 }
1450 1381
1451 for (i = 0; i < NUM_INIT_LISTS; i++) { 1382 for (i = 0; i < NUM_INIT_LISTS; i++) {
1452 kmem_list3_init(&initkmem_list3[i]); 1383 kmem_list3_init(&initkmem_list3[i]);
@@ -1575,9 +1506,8 @@ void __init kmem_cache_init(void)
1575 { 1506 {
1576 struct array_cache *ptr; 1507 struct array_cache *ptr;
1577 1508
1578 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1509 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1579 1510
1580 local_irq_disable();
1581 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1511 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1582 memcpy(ptr, cpu_cache_get(&cache_cache), 1512 memcpy(ptr, cpu_cache_get(&cache_cache),
1583 sizeof(struct arraycache_init)); 1513 sizeof(struct arraycache_init));
@@ -1587,11 +1517,9 @@ void __init kmem_cache_init(void)
1587 spin_lock_init(&ptr->lock); 1517 spin_lock_init(&ptr->lock);
1588 1518
1589 cache_cache.array[smp_processor_id()] = ptr; 1519 cache_cache.array[smp_processor_id()] = ptr;
1590 local_irq_enable();
1591 1520
1592 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1521 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1593 1522
1594 local_irq_disable();
1595 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1523 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1596 != &initarray_generic.cache); 1524 != &initarray_generic.cache);
1597 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1525 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1531,6 @@ void __init kmem_cache_init(void)
1603 1531
1604 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1532 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1605 ptr; 1533 ptr;
1606 local_irq_enable();
1607 } 1534 }
1608 /* 5) Replace the bootstrap kmem_list3's */ 1535 /* 5) Replace the bootstrap kmem_list3's */
1609 { 1536 {
@@ -1622,19 +1549,27 @@ void __init kmem_cache_init(void)
1622 } 1549 }
1623 } 1550 }
1624 1551
1625 /* 6) resize the head arrays to their final sizes */ 1552 g_cpucache_up = EARLY;
1626 {
1627 struct kmem_cache *cachep;
1628 mutex_lock(&cache_chain_mutex);
1629 list_for_each_entry(cachep, &cache_chain, next)
1630 if (enable_cpucache(cachep))
1631 BUG();
1632 mutex_unlock(&cache_chain_mutex);
1633 }
1634 1553
1635 /* Annotate slab for lockdep -- annotate the malloc caches */ 1554 /* Annotate slab for lockdep -- annotate the malloc caches */
1636 init_lock_keys(); 1555 init_lock_keys();
1556}
1557
1558void __init kmem_cache_init_late(void)
1559{
1560 struct kmem_cache *cachep;
1637 1561
1562 /*
1563 * Interrupts are enabled now so all GFP allocations are safe.
1564 */
1565 slab_gfp_mask = __GFP_BITS_MASK;
1566
1567 /* 6) resize the head arrays to their final sizes */
1568 mutex_lock(&cache_chain_mutex);
1569 list_for_each_entry(cachep, &cache_chain, next)
1570 if (enable_cpucache(cachep, GFP_NOWAIT))
1571 BUG();
1572 mutex_unlock(&cache_chain_mutex);
1638 1573
1639 /* Done! */ 1574 /* Done! */
1640 g_cpucache_up = FULL; 1575 g_cpucache_up = FULL;
@@ -1689,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1689 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1624 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1690 flags |= __GFP_RECLAIMABLE; 1625 flags |= __GFP_RECLAIMABLE;
1691 1626
1692 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1627 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1693 if (!page) 1628 if (!page)
1694 return NULL; 1629 return NULL;
1695 1630
@@ -1702,6 +1637,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1702 NR_SLAB_UNRECLAIMABLE, nr_pages); 1637 NR_SLAB_UNRECLAIMABLE, nr_pages);
1703 for (i = 0; i < nr_pages; i++) 1638 for (i = 0; i < nr_pages; i++)
1704 __SetPageSlab(page + i); 1639 __SetPageSlab(page + i);
1640
1641 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1642 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1643
1644 if (cachep->ctor)
1645 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1646 else
1647 kmemcheck_mark_unallocated_pages(page, nr_pages);
1648 }
1649
1705 return page_address(page); 1650 return page_address(page);
1706} 1651}
1707 1652
@@ -1714,6 +1659,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1714 struct page *page = virt_to_page(addr); 1659 struct page *page = virt_to_page(addr);
1715 const unsigned long nr_freed = i; 1660 const unsigned long nr_freed = i;
1716 1661
1662 kmemcheck_free_shadow(page, cachep->gfporder);
1663
1717 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1664 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1718 sub_zone_page_state(page_zone(page), 1665 sub_zone_page_state(page_zone(page),
1719 NR_SLAB_RECLAIMABLE, nr_freed); 1666 NR_SLAB_RECLAIMABLE, nr_freed);
@@ -2064,10 +2011,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2064 return left_over; 2011 return left_over;
2065} 2012}
2066 2013
2067static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) 2014static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2068{ 2015{
2069 if (g_cpucache_up == FULL) 2016 if (g_cpucache_up == FULL)
2070 return enable_cpucache(cachep); 2017 return enable_cpucache(cachep, gfp);
2071 2018
2072 if (g_cpucache_up == NONE) { 2019 if (g_cpucache_up == NONE) {
2073 /* 2020 /*
@@ -2089,7 +2036,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2089 g_cpucache_up = PARTIAL_AC; 2036 g_cpucache_up = PARTIAL_AC;
2090 } else { 2037 } else {
2091 cachep->array[smp_processor_id()] = 2038 cachep->array[smp_processor_id()] =
2092 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 2039 kmalloc(sizeof(struct arraycache_init), gfp);
2093 2040
2094 if (g_cpucache_up == PARTIAL_AC) { 2041 if (g_cpucache_up == PARTIAL_AC) {
2095 set_up_list3s(cachep, SIZE_L3); 2042 set_up_list3s(cachep, SIZE_L3);
@@ -2099,7 +2046,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2099 for_each_online_node(node) { 2046 for_each_online_node(node) {
2100 cachep->nodelists[node] = 2047 cachep->nodelists[node] =
2101 kmalloc_node(sizeof(struct kmem_list3), 2048 kmalloc_node(sizeof(struct kmem_list3),
2102 GFP_KERNEL, node); 2049 gfp, node);
2103 BUG_ON(!cachep->nodelists[node]); 2050 BUG_ON(!cachep->nodelists[node]);
2104 kmem_list3_init(cachep->nodelists[node]); 2051 kmem_list3_init(cachep->nodelists[node]);
2105 } 2052 }
@@ -2153,6 +2100,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2153{ 2100{
2154 size_t left_over, slab_size, ralign; 2101 size_t left_over, slab_size, ralign;
2155 struct kmem_cache *cachep = NULL, *pc; 2102 struct kmem_cache *cachep = NULL, *pc;
2103 gfp_t gfp;
2156 2104
2157 /* 2105 /*
2158 * Sanity checks... these are all serious usage bugs. 2106 * Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2116,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2168 * We use cache_chain_mutex to ensure a consistent view of 2116 * We use cache_chain_mutex to ensure a consistent view of
2169 * cpu_online_mask as well. Please see cpuup_callback 2117 * cpu_online_mask as well. Please see cpuup_callback
2170 */ 2118 */
2171 get_online_cpus(); 2119 if (slab_is_available()) {
2172 mutex_lock(&cache_chain_mutex); 2120 get_online_cpus();
2121 mutex_lock(&cache_chain_mutex);
2122 }
2173 2123
2174 list_for_each_entry(pc, &cache_chain, next) { 2124 list_for_each_entry(pc, &cache_chain, next) {
2175 char tmp; 2125 char tmp;
@@ -2278,8 +2228,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2278 */ 2228 */
2279 align = ralign; 2229 align = ralign;
2280 2230
2231 if (slab_is_available())
2232 gfp = GFP_KERNEL;
2233 else
2234 gfp = GFP_NOWAIT;
2235
2281 /* Get cache's description obj. */ 2236 /* Get cache's description obj. */
2282 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); 2237 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2283 if (!cachep) 2238 if (!cachep)
2284 goto oops; 2239 goto oops;
2285 2240
@@ -2353,6 +2308,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2353 /* really off slab. No need for manual alignment */ 2308 /* really off slab. No need for manual alignment */
2354 slab_size = 2309 slab_size =
2355 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2310 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2311
2312#ifdef CONFIG_PAGE_POISONING
2313 /* If we're going to use the generic kernel_map_pages()
2314 * poisoning, then it's going to smash the contents of
2315 * the redzone and userword anyhow, so switch them off.
2316 */
2317 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2318 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2319#endif
2356 } 2320 }
2357 2321
2358 cachep->colour_off = cache_line_size(); 2322 cachep->colour_off = cache_line_size();
@@ -2382,7 +2346,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2382 cachep->ctor = ctor; 2346 cachep->ctor = ctor;
2383 cachep->name = name; 2347 cachep->name = name;
2384 2348
2385 if (setup_cpu_cache(cachep)) { 2349 if (setup_cpu_cache(cachep, gfp)) {
2386 __kmem_cache_destroy(cachep); 2350 __kmem_cache_destroy(cachep);
2387 cachep = NULL; 2351 cachep = NULL;
2388 goto oops; 2352 goto oops;
@@ -2394,8 +2358,10 @@ oops:
2394 if (!cachep && (flags & SLAB_PANIC)) 2358 if (!cachep && (flags & SLAB_PANIC))
2395 panic("kmem_cache_create(): failed to create slab `%s'\n", 2359 panic("kmem_cache_create(): failed to create slab `%s'\n",
2396 name); 2360 name);
2397 mutex_unlock(&cache_chain_mutex); 2361 if (slab_is_available()) {
2398 put_online_cpus(); 2362 mutex_unlock(&cache_chain_mutex);
2363 put_online_cpus();
2364 }
2399 return cachep; 2365 return cachep;
2400} 2366}
2401EXPORT_SYMBOL(kmem_cache_create); 2367EXPORT_SYMBOL(kmem_cache_create);
@@ -2621,6 +2587,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2621 /* Slab management obj is off-slab. */ 2587 /* Slab management obj is off-slab. */
2622 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2588 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2623 local_flags, nodeid); 2589 local_flags, nodeid);
2590 /*
2591 * If the first object in the slab is leaked (it's allocated
2592 * but no one has a reference to it), we want to make sure
2593 * kmemleak does not treat the ->s_mem pointer as a reference
2594 * to the object. Otherwise we will not report the leak.
2595 */
2596 kmemleak_scan_area(slabp, offsetof(struct slab, list),
2597 sizeof(struct list_head), local_flags);
2624 if (!slabp) 2598 if (!slabp)
2625 return NULL; 2599 return NULL;
2626 } else { 2600 } else {
@@ -3141,6 +3115,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3141 STATS_INC_ALLOCMISS(cachep); 3115 STATS_INC_ALLOCMISS(cachep);
3142 objp = cache_alloc_refill(cachep, flags); 3116 objp = cache_alloc_refill(cachep, flags);
3143 } 3117 }
3118 /*
3119 * To avoid a false negative, if an object that is in one of the
3120 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3121 * treat the array pointers as a reference to the object.
3122 */
3123 kmemleak_erase(&ac->entry[ac->avail]);
3144 return objp; 3124 return objp;
3145} 3125}
3146 3126
@@ -3219,7 +3199,7 @@ retry:
3219 if (local_flags & __GFP_WAIT) 3199 if (local_flags & __GFP_WAIT)
3220 local_irq_enable(); 3200 local_irq_enable();
3221 kmem_flagcheck(cache, flags); 3201 kmem_flagcheck(cache, flags);
3222 obj = kmem_getpages(cache, local_flags, -1); 3202 obj = kmem_getpages(cache, local_flags, numa_node_id());
3223 if (local_flags & __GFP_WAIT) 3203 if (local_flags & __GFP_WAIT)
3224 local_irq_disable(); 3204 local_irq_disable();
3225 if (obj) { 3205 if (obj) {
@@ -3327,6 +3307,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3327 unsigned long save_flags; 3307 unsigned long save_flags;
3328 void *ptr; 3308 void *ptr;
3329 3309
3310 flags &= slab_gfp_mask;
3311
3330 lockdep_trace_alloc(flags); 3312 lockdep_trace_alloc(flags);
3331 3313
3332 if (slab_should_failslab(cachep, flags)) 3314 if (slab_should_failslab(cachep, flags))
@@ -3360,6 +3342,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3360 out: 3342 out:
3361 local_irq_restore(save_flags); 3343 local_irq_restore(save_flags);
3362 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3344 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3345 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3346 flags);
3347
3348 if (likely(ptr))
3349 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3363 3350
3364 if (unlikely((flags & __GFP_ZERO) && ptr)) 3351 if (unlikely((flags & __GFP_ZERO) && ptr))
3365 memset(ptr, 0, obj_size(cachep)); 3352 memset(ptr, 0, obj_size(cachep));
@@ -3405,6 +3392,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3405 unsigned long save_flags; 3392 unsigned long save_flags;
3406 void *objp; 3393 void *objp;
3407 3394
3395 flags &= slab_gfp_mask;
3396
3408 lockdep_trace_alloc(flags); 3397 lockdep_trace_alloc(flags);
3409 3398
3410 if (slab_should_failslab(cachep, flags)) 3399 if (slab_should_failslab(cachep, flags))
@@ -3415,8 +3404,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3415 objp = __do_cache_alloc(cachep, flags); 3404 objp = __do_cache_alloc(cachep, flags);
3416 local_irq_restore(save_flags); 3405 local_irq_restore(save_flags);
3417 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3406 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3407 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3408 flags);
3418 prefetchw(objp); 3409 prefetchw(objp);
3419 3410
3411 if (likely(objp))
3412 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3413
3420 if (unlikely((flags & __GFP_ZERO) && objp)) 3414 if (unlikely((flags & __GFP_ZERO) && objp))
3421 memset(objp, 0, obj_size(cachep)); 3415 memset(objp, 0, obj_size(cachep));
3422 3416
@@ -3530,8 +3524,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3530 struct array_cache *ac = cpu_cache_get(cachep); 3524 struct array_cache *ac = cpu_cache_get(cachep);
3531 3525
3532 check_irq_off(); 3526 check_irq_off();
3527 kmemleak_free_recursive(objp, cachep->flags);
3533 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3528 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3534 3529
3530 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3531
3535 /* 3532 /*
3536 * Skip calling cache_free_alien() when the platform is not numa. 3533 * Skip calling cache_free_alien() when the platform is not numa.
3537 * This will avoid cache misses that happen while accessing slabp (which 3534 * This will avoid cache misses that happen while accessing slabp (which
@@ -3539,7 +3536,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3539 * variable to skip the call, which is mostly likely to be present in 3536 * variable to skip the call, which is mostly likely to be present in
3540 * the cache. 3537 * the cache.
3541 */ 3538 */
3542 if (numa_platform && cache_free_alien(cachep, objp)) 3539 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3543 return; 3540 return;
3544 3541
3545 if (likely(ac->avail < ac->limit)) { 3542 if (likely(ac->avail < ac->limit)) {
@@ -3802,7 +3799,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3802/* 3799/*
3803 * This initializes kmem_list3 or resizes various caches for all nodes. 3800 * This initializes kmem_list3 or resizes various caches for all nodes.
3804 */ 3801 */
3805static int alloc_kmemlist(struct kmem_cache *cachep) 3802static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3806{ 3803{
3807 int node; 3804 int node;
3808 struct kmem_list3 *l3; 3805 struct kmem_list3 *l3;
@@ -3812,7 +3809,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3812 for_each_online_node(node) { 3809 for_each_online_node(node) {
3813 3810
3814 if (use_alien_caches) { 3811 if (use_alien_caches) {
3815 new_alien = alloc_alien_cache(node, cachep->limit); 3812 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3816 if (!new_alien) 3813 if (!new_alien)
3817 goto fail; 3814 goto fail;
3818 } 3815 }
@@ -3821,7 +3818,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3821 if (cachep->shared) { 3818 if (cachep->shared) {
3822 new_shared = alloc_arraycache(node, 3819 new_shared = alloc_arraycache(node,
3823 cachep->shared*cachep->batchcount, 3820 cachep->shared*cachep->batchcount,
3824 0xbaadf00d); 3821 0xbaadf00d, gfp);
3825 if (!new_shared) { 3822 if (!new_shared) {
3826 free_alien_cache(new_alien); 3823 free_alien_cache(new_alien);
3827 goto fail; 3824 goto fail;
@@ -3850,7 +3847,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3850 free_alien_cache(new_alien); 3847 free_alien_cache(new_alien);
3851 continue; 3848 continue;
3852 } 3849 }
3853 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3850 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3854 if (!l3) { 3851 if (!l3) {
3855 free_alien_cache(new_alien); 3852 free_alien_cache(new_alien);
3856 kfree(new_shared); 3853 kfree(new_shared);
@@ -3906,18 +3903,18 @@ static void do_ccupdate_local(void *info)
3906 3903
3907/* Always called with the cache_chain_mutex held */ 3904/* Always called with the cache_chain_mutex held */
3908static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3905static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3909 int batchcount, int shared) 3906 int batchcount, int shared, gfp_t gfp)
3910{ 3907{
3911 struct ccupdate_struct *new; 3908 struct ccupdate_struct *new;
3912 int i; 3909 int i;
3913 3910
3914 new = kzalloc(sizeof(*new), GFP_KERNEL); 3911 new = kzalloc(sizeof(*new), gfp);
3915 if (!new) 3912 if (!new)
3916 return -ENOMEM; 3913 return -ENOMEM;
3917 3914
3918 for_each_online_cpu(i) { 3915 for_each_online_cpu(i) {
3919 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3916 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3920 batchcount); 3917 batchcount, gfp);
3921 if (!new->new[i]) { 3918 if (!new->new[i]) {
3922 for (i--; i >= 0; i--) 3919 for (i--; i >= 0; i--)
3923 kfree(new->new[i]); 3920 kfree(new->new[i]);
@@ -3944,11 +3941,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3944 kfree(ccold); 3941 kfree(ccold);
3945 } 3942 }
3946 kfree(new); 3943 kfree(new);
3947 return alloc_kmemlist(cachep); 3944 return alloc_kmemlist(cachep, gfp);
3948} 3945}
3949 3946
3950/* Called with cache_chain_mutex held always */ 3947/* Called with cache_chain_mutex held always */
3951static int enable_cpucache(struct kmem_cache *cachep) 3948static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3952{ 3949{
3953 int err; 3950 int err;
3954 int limit, shared; 3951 int limit, shared;
@@ -3994,7 +3991,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
3994 if (limit > 32) 3991 if (limit > 32)
3995 limit = 32; 3992 limit = 32;
3996#endif 3993#endif
3997 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3994 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
3998 if (err) 3995 if (err)
3999 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3996 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4000 cachep->name, -err); 3997 cachep->name, -err);
@@ -4300,7 +4297,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4300 res = 0; 4297 res = 0;
4301 } else { 4298 } else {
4302 res = do_tune_cpucache(cachep, limit, 4299 res = do_tune_cpucache(cachep, limit,
4303 batchcount, shared); 4300 batchcount, shared,
4301 GFP_KERNEL);
4304 } 4302 }
4305 break; 4303 break;
4306 } 4304 }
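
Several slab hunks above revolve around the new slab_gfp_mask: until kmem_cache_init_late() declares interrupts safe, __cache_alloc() and __cache_alloc_node() AND their gfp flags with a restrictive boot-time mask so sleeping allocations cannot slip through during early init. A compact sketch of that masking pattern; the flag values and names here are made up for illustration and are not the kernel's definitions:

#include <stdio.h>

typedef unsigned int gfp_t;

#define __GFP_WAIT      0x10u
#define __GFP_IO        0x40u
#define __GFP_FS        0x80u
#define GFP_BOOT_MASK   (~(__GFP_WAIT | __GFP_IO | __GFP_FS))
#define GFP_FULL_MASK   0xffffffffu
#define GFP_KERNEL      (__GFP_WAIT | __GFP_IO | __GFP_FS)

static gfp_t gfp_allowed_mask = GFP_BOOT_MASK;  /* like slab_gfp_mask at boot */

static void init_late(void)
{
    gfp_allowed_mask = GFP_FULL_MASK;           /* interrupts are enabled now */
}

static void alloc(gfp_t flags)
{
    flags &= gfp_allowed_mask;                  /* the masking step in the patch */
    printf("effective flags: %#x (may sleep: %s)\n",
           flags, flags & __GFP_WAIT ? "yes" : "no");
}

int main(void)
{
    alloc(GFP_KERNEL);   /* early boot: sleeping bits are stripped */
    init_late();
    alloc(GFP_KERNEL);   /* after late init: passed through unchanged */
    return 0;
}

The same reasoning explains the GFP_NOWAIT bootstrap allocations and the removal of the local_irq_disable()/enable() pairs in kmem_cache_init() above: once early allocations cannot sleep, those irq games are unnecessary.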
diff --git a/mm/slob.c b/mm/slob.c
index aad9dad2e820..c78742defdc6 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -46,7 +46,7 @@
46 * NUMA support in SLOB is fairly simplistic, pushing most of the real 46 * NUMA support in SLOB is fairly simplistic, pushing most of the real
47 * logic down to the page allocator, and simply doing the node accounting 47 * logic down to the page allocator, and simply doing the node accounting
48 * on the upper levels. In the event that a node id is explicitly 48 * on the upper levels. In the event that a node id is explicitly
49 * provided, alloc_pages_node() with the specified node id is used 49 * provided, alloc_pages_exact_node() with the specified node id is used
50 * instead. The common case (or when the node id isn't explicitly provided) 50 * instead. The common case (or when the node id isn't explicitly provided)
51 * will default to the current node, as per numa_node_id(). 51 * will default to the current node, as per numa_node_id().
52 * 52 *
@@ -60,12 +60,14 @@
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include <linux/mm.h> 62#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */
63#include <linux/cache.h> 64#include <linux/cache.h>
64#include <linux/init.h> 65#include <linux/init.h>
65#include <linux/module.h> 66#include <linux/module.h>
66#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
67#include <linux/list.h> 68#include <linux/list.h>
68#include <trace/kmemtrace.h> 69#include <linux/kmemtrace.h>
70#include <linux/kmemleak.h>
69#include <asm/atomic.h> 71#include <asm/atomic.h>
70 72
71/* 73/*
@@ -242,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
242 244
243#ifdef CONFIG_NUMA 245#ifdef CONFIG_NUMA
244 if (node != -1) 246 if (node != -1)
245 page = alloc_pages_node(node, gfp, order); 247 page = alloc_pages_exact_node(node, gfp, order);
246 else 248 else
247#endif 249#endif
248 page = alloc_pages(gfp, order); 250 page = alloc_pages(gfp, order);
@@ -255,6 +257,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
255 257
256static void slob_free_pages(void *b, int order) 258static void slob_free_pages(void *b, int order)
257{ 259{
260 if (current->reclaim_state)
261 current->reclaim_state->reclaimed_slab += 1 << order;
258 free_pages((unsigned long)b, order); 262 free_pages((unsigned long)b, order);
259} 263}
260 264
@@ -407,7 +411,7 @@ static void slob_free(void *block, int size)
407 spin_unlock_irqrestore(&slob_lock, flags); 411 spin_unlock_irqrestore(&slob_lock, flags);
408 clear_slob_page(sp); 412 clear_slob_page(sp);
409 free_slob_page(sp); 413 free_slob_page(sp);
410 free_page((unsigned long)b); 414 slob_free_pages(b, 0);
411 return; 415 return;
412 } 416 }
413 417
@@ -506,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
506 size, PAGE_SIZE << order, gfp, node); 510 size, PAGE_SIZE << order, gfp, node);
507 } 511 }
508 512
513 kmemleak_alloc(ret, size, 1, gfp);
509 return ret; 514 return ret;
510} 515}
511EXPORT_SYMBOL(__kmalloc_node); 516EXPORT_SYMBOL(__kmalloc_node);
@@ -518,6 +523,7 @@ void kfree(const void *block)
518 523
519 if (unlikely(ZERO_OR_NULL_PTR(block))) 524 if (unlikely(ZERO_OR_NULL_PTR(block)))
520 return; 525 return;
526 kmemleak_free(block);
521 527
522 sp = slob_page(block); 528 sp = slob_page(block);
523 if (is_slob_page(sp)) { 529 if (is_slob_page(sp)) {
@@ -581,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
581 } else if (flags & SLAB_PANIC) 587 } else if (flags & SLAB_PANIC)
582 panic("Cannot create slab cache %s\n", name); 588 panic("Cannot create slab cache %s\n", name);
583 589
590 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
584 return c; 591 return c;
585} 592}
586EXPORT_SYMBOL(kmem_cache_create); 593EXPORT_SYMBOL(kmem_cache_create);
587 594
588void kmem_cache_destroy(struct kmem_cache *c) 595void kmem_cache_destroy(struct kmem_cache *c)
589{ 596{
597 kmemleak_free(c);
590 slob_free(c, sizeof(struct kmem_cache)); 598 slob_free(c, sizeof(struct kmem_cache));
591} 599}
592EXPORT_SYMBOL(kmem_cache_destroy); 600EXPORT_SYMBOL(kmem_cache_destroy);
@@ -610,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
610 if (c->ctor) 618 if (c->ctor)
611 c->ctor(b); 619 c->ctor(b);
612 620
621 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
613 return b; 622 return b;
614} 623}
615EXPORT_SYMBOL(kmem_cache_alloc_node); 624EXPORT_SYMBOL(kmem_cache_alloc_node);
@@ -632,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head)
632 641
633void kmem_cache_free(struct kmem_cache *c, void *b) 642void kmem_cache_free(struct kmem_cache *c, void *b)
634{ 643{
644 kmemleak_free_recursive(b, c->flags);
635 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 645 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
636 struct slob_rcu *slob_rcu; 646 struct slob_rcu *slob_rcu;
637 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 647 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
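Editor's note on the slob.c hunks above: every allocation path gains a kmemleak_alloc()/kmemleak_alloc_recursive() call and every matching free path gains kmemleak_free()/kmemleak_free_recursive(). The snippet below is a minimal user-space model of that pairing, not kernel code: leak_register(), leak_forget() and the fixed-size array are invented stand-ins for kmemleak's object tree.

	#include <stdio.h>
	#include <stdlib.h>

	#define MAX_TRACKED 64

	static void *tracked[MAX_TRACKED];      /* crude stand-in for kmemleak's object registry */

	static void leak_register(void *p)      /* models kmemleak_alloc() */
	{
		for (int i = 0; i < MAX_TRACKED; i++)
			if (!tracked[i]) { tracked[i] = p; return; }
	}

	static void leak_forget(void *p)        /* models kmemleak_free() */
	{
		for (int i = 0; i < MAX_TRACKED; i++)
			if (tracked[i] == p) { tracked[i] = NULL; return; }
	}

	static void leak_report(void)           /* models the periodic scan/report step */
	{
		for (int i = 0; i < MAX_TRACKED; i++)
			if (tracked[i])
				printf("possible leak: %p\n", tracked[i]);
	}

	int main(void)
	{
		void *a = malloc(32), *b = malloc(64);

		leak_register(a);
		leak_register(b);
		free(a);
		leak_forget(a);                 /* freed and unregistered: not reported */
		leak_report();                  /* only 'b' shows up */
		free(b);
		leak_forget(b);
		return 0;
	}

The point of pairing the hooks at this level is that an allocation which is freed without ever being unregistered (or vice versa) is exactly what the tracker later reports.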
diff --git a/mm/slub.c b/mm/slub.c
index 7ab54ecbd3f3..b2b0c78ae35d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h> /* struct reclaim_state */
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/bit_spinlock.h> 14#include <linux/bit_spinlock.h>
14#include <linux/interrupt.h> 15#include <linux/interrupt.h>
@@ -16,9 +17,11 @@
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <trace/kmemtrace.h> 20#include <linux/kmemtrace.h>
21#include <linux/kmemcheck.h>
20#include <linux/cpu.h> 22#include <linux/cpu.h>
21#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/kmemleak.h>
22#include <linux/mempolicy.h> 25#include <linux/mempolicy.h>
23#include <linux/ctype.h> 26#include <linux/ctype.h>
24#include <linux/debugobjects.h> 27#include <linux/debugobjects.h>
@@ -142,10 +145,10 @@
142 * Set of flags that will prevent slab merging 145 * Set of flags that will prevent slab merging
143 */ 146 */
144#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 147#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
145 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 148 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
146 149
147#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 150#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
148 SLAB_CACHE_DMA) 151 SLAB_CACHE_DMA | SLAB_NOTRACK)
149 152
150#ifndef ARCH_KMALLOC_MINALIGN 153#ifndef ARCH_KMALLOC_MINALIGN
151#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 154#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
@@ -176,6 +179,12 @@ static enum {
176 SYSFS /* Sysfs up */ 179 SYSFS /* Sysfs up */
177} slab_state = DOWN; 180} slab_state = DOWN;
178 181
182/*
183 * The slab allocator is initialized with interrupts disabled. Therefore, make
184 * sure early boot allocations don't accidentally enable interrupts.
185 */
186static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
187
179/* A list of all slab caches on the system */ 188/* A list of all slab caches on the system */
180static DECLARE_RWSEM(slub_lock); 189static DECLARE_RWSEM(slub_lock);
181static LIST_HEAD(slab_caches); 190static LIST_HEAD(slab_caches);
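Editor's note: the slab_gfp_mask introduced here clamps every allocation's flags while interrupts are still disabled; a later hunk in this file (kmem_cache_init_late) widens the mask once sleeping allocations are safe. Below is a self-contained user-space model of that clamp, assuming SLAB_GFP_BOOT_MASK simply excludes the sleeping bit; the flag values are illustrative, not the kernel's.

	#include <assert.h>
	#include <stdio.h>

	#define GFP_WAIT 0x01u                        /* allocation may sleep */
	#define GFP_IO   0x02u
	#define GFP_FS   0x04u
	#define SLAB_GFP_BOOT_MASK (GFP_IO | GFP_FS)  /* assumed: everything except __GFP_WAIT */
	#define GFP_BITS_MASK      (GFP_WAIT | GFP_IO | GFP_FS)

	static unsigned int slab_gfp_mask = SLAB_GFP_BOOT_MASK;

	static unsigned int slab_alloc_flags(unsigned int gfpflags)
	{
		return gfpflags & slab_gfp_mask;      /* mirrors "gfpflags &= slab_gfp_mask" */
	}

	static void kmem_cache_init_late_model(void)
	{
		slab_gfp_mask = GFP_BITS_MASK;        /* interrupts enabled: stop clamping */
	}

	int main(void)
	{
		assert(!(slab_alloc_flags(GFP_WAIT | GFP_FS) & GFP_WAIT)); /* early boot: no sleeping */
		kmem_cache_init_late_model();
		assert(slab_alloc_flags(GFP_WAIT | GFP_FS) & GFP_WAIT);    /* late: passes through */
		puts("ok");
		return 0;
	}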
@@ -831,6 +840,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node)
831 return atomic_long_read(&n->nr_slabs); 840 return atomic_long_read(&n->nr_slabs);
832} 841}
833 842
843static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
844{
845 return atomic_long_read(&n->nr_slabs);
846}
847
834static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 848static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
835{ 849{
836 struct kmem_cache_node *n = get_node(s, node); 850 struct kmem_cache_node *n = get_node(s, node);
@@ -1049,6 +1063,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1049 1063
1050static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1064static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1051 { return 0; } 1065 { return 0; }
1066static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1067 { return 0; }
1052static inline void inc_slabs_node(struct kmem_cache *s, int node, 1068static inline void inc_slabs_node(struct kmem_cache *s, int node,
1053 int objects) {} 1069 int objects) {}
1054static inline void dec_slabs_node(struct kmem_cache *s, int node, 1070static inline void dec_slabs_node(struct kmem_cache *s, int node,
@@ -1063,6 +1079,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1063{ 1079{
1064 int order = oo_order(oo); 1080 int order = oo_order(oo);
1065 1081
1082 flags |= __GFP_NOTRACK;
1083
1066 if (node == -1) 1084 if (node == -1)
1067 return alloc_pages(flags, order); 1085 return alloc_pages(flags, order);
1068 else 1086 else
@@ -1090,6 +1108,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1090 1108
1091 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1109 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1092 } 1110 }
1111
1112 if (kmemcheck_enabled
1113 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
1114 {
1115 int pages = 1 << oo_order(oo);
1116
1117 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1118
1119 /*
1120 * Objects from caches that have a constructor don't get
1121 * cleared when they're allocated, so we need to do it here.
1122 */
1123 if (s->ctor)
1124 kmemcheck_mark_uninitialized_pages(page, pages);
1125 else
1126 kmemcheck_mark_unallocated_pages(page, pages);
1127 }
1128
1093 page->objects = oo_objects(oo); 1129 page->objects = oo_objects(oo);
1094 mod_zone_page_state(page_zone(page), 1130 mod_zone_page_state(page_zone(page),
1095 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1131 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
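Editor's note: the kmemcheck calls added above maintain a per-byte "shadow" state alongside each slab page, and reads of bytes whose shadow still says uninitialized or unallocated are what kmemcheck later warns about. The toy, single-page model below only borrows the state names; everything else is invented for illustration.

	#include <stdio.h>

	enum shadow { SHADOW_UNALLOCATED, SHADOW_UNINITIALIZED, SHADOW_INITIALIZED };

	#define PAGE_BYTES 64
	static enum shadow shadow[PAGE_BYTES];  /* one state per byte of the "page" */
	static unsigned char page[PAGE_BYTES];

	static void mark_range(int start, int len, enum shadow s)
	{
		for (int i = start; i < start + len; i++)
			shadow[i] = s;
	}

	static unsigned char checked_read(int i)
	{
		if (shadow[i] != SHADOW_INITIALIZED)
			fprintf(stderr, "kmemcheck-style warning: byte %d read %s\n", i,
				shadow[i] == SHADOW_UNALLOCATED ? "while unallocated"
								: "before being written");
		return page[i];
	}

	static void checked_write(int i, unsigned char v)
	{
		page[i] = v;
		shadow[i] = SHADOW_INITIALIZED;
	}

	int main(void)
	{
		mark_range(0, PAGE_BYTES, SHADOW_UNALLOCATED);  /* fresh slab, no constructor */
		mark_range(0, 16, SHADOW_UNINITIALIZED);        /* one object handed out */
		checked_write(0, 42);
		checked_read(0);                                /* fine */
		checked_read(1);                                /* warns: uninitialized */
		checked_read(32);                               /* warns: unallocated */
		return 0;
	}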
@@ -1163,6 +1199,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1163 __ClearPageSlubDebug(page); 1199 __ClearPageSlubDebug(page);
1164 } 1200 }
1165 1201
1202 kmemcheck_free_shadow(page, compound_order(page));
1203
1166 mod_zone_page_state(page_zone(page), 1204 mod_zone_page_state(page_zone(page),
1167 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1205 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1168 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1206 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -1170,6 +1208,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1170 1208
1171 __ClearPageSlab(page); 1209 __ClearPageSlab(page);
1172 reset_page_mapcount(page); 1210 reset_page_mapcount(page);
1211 if (current->reclaim_state)
1212 current->reclaim_state->reclaimed_slab += pages;
1173 __free_pages(page, order); 1213 __free_pages(page, order);
1174} 1214}
1175 1215
@@ -1481,6 +1521,65 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
1481 return 1; 1521 return 1;
1482} 1522}
1483 1523
1524static int count_free(struct page *page)
1525{
1526 return page->objects - page->inuse;
1527}
1528
1529static unsigned long count_partial(struct kmem_cache_node *n,
1530 int (*get_count)(struct page *))
1531{
1532 unsigned long flags;
1533 unsigned long x = 0;
1534 struct page *page;
1535
1536 spin_lock_irqsave(&n->list_lock, flags);
1537 list_for_each_entry(page, &n->partial, lru)
1538 x += get_count(page);
1539 spin_unlock_irqrestore(&n->list_lock, flags);
1540 return x;
1541}
1542
1543static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1544{
1545#ifdef CONFIG_SLUB_DEBUG
1546 return atomic_long_read(&n->total_objects);
1547#else
1548 return 0;
1549#endif
1550}
1551
1552static noinline void
1553slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1554{
1555 int node;
1556
1557 printk(KERN_WARNING
1558 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1559 nid, gfpflags);
1560 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
1561 "default order: %d, min order: %d\n", s->name, s->objsize,
1562 s->size, oo_order(s->oo), oo_order(s->min));
1563
1564 for_each_online_node(node) {
1565 struct kmem_cache_node *n = get_node(s, node);
1566 unsigned long nr_slabs;
1567 unsigned long nr_objs;
1568 unsigned long nr_free;
1569
1570 if (!n)
1571 continue;
1572
1573 nr_free = count_partial(n, count_free);
1574 nr_slabs = node_nr_slabs(n);
1575 nr_objs = node_nr_objs(n);
1576
1577 printk(KERN_WARNING
1578 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
1579 node, nr_slabs, nr_objs, nr_free);
1580 }
1581}
1582
1484/* 1583/*
1485 * Slow path. The lockless freelist is empty or we need to perform 1584 * Slow path. The lockless freelist is empty or we need to perform
1486 * debugging duties. 1585 * debugging duties.
@@ -1562,6 +1661,8 @@ new_slab:
1562 c->page = new; 1661 c->page = new;
1563 goto load_freelist; 1662 goto load_freelist;
1564 } 1663 }
1664 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1665 slab_out_of_memory(s, gfpflags, node);
1565 return NULL; 1666 return NULL;
1566debug: 1667debug:
1567 if (!alloc_debug_processing(s, c->page, object, addr)) 1668 if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1591,6 +1692,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1591 unsigned long flags; 1692 unsigned long flags;
1592 unsigned int objsize; 1693 unsigned int objsize;
1593 1694
1695 gfpflags &= slab_gfp_mask;
1696
1594 lockdep_trace_alloc(gfpflags); 1697 lockdep_trace_alloc(gfpflags);
1595 might_sleep_if(gfpflags & __GFP_WAIT); 1698 might_sleep_if(gfpflags & __GFP_WAIT);
1596 1699
@@ -1614,6 +1717,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1614 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1717 if (unlikely((gfpflags & __GFP_ZERO) && object))
1615 memset(object, 0, objsize); 1718 memset(object, 0, objsize);
1616 1719
1720 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
1721 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1722
1617 return object; 1723 return object;
1618} 1724}
1619 1725
@@ -1743,8 +1849,10 @@ static __always_inline void slab_free(struct kmem_cache *s,
1743 struct kmem_cache_cpu *c; 1849 struct kmem_cache_cpu *c;
1744 unsigned long flags; 1850 unsigned long flags;
1745 1851
1852 kmemleak_free_recursive(x, s->flags);
1746 local_irq_save(flags); 1853 local_irq_save(flags);
1747 c = get_cpu_slab(s, smp_processor_id()); 1854 c = get_cpu_slab(s, smp_processor_id());
1855 kmemcheck_slab_free(s, object, c->objsize);
1748 debug_check_no_locks_freed(object, c->objsize); 1856 debug_check_no_locks_freed(object, c->objsize);
1749 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1857 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1750 debug_check_no_obj_freed(object, c->objsize); 1858 debug_check_no_obj_freed(object, c->objsize);
@@ -1909,7 +2017,7 @@ static inline int calculate_order(int size)
1909 * Doh this slab cannot be placed using slub_max_order. 2017 * Doh this slab cannot be placed using slub_max_order.
1910 */ 2018 */
1911 order = slab_order(size, 1, MAX_ORDER, 1); 2019 order = slab_order(size, 1, MAX_ORDER, 1);
1912 if (order <= MAX_ORDER) 2020 if (order < MAX_ORDER)
1913 return order; 2021 return order;
1914 return -ENOSYS; 2022 return -ENOSYS;
1915} 2023}
@@ -2522,6 +2630,7 @@ __setup("slub_min_order=", setup_slub_min_order);
2522static int __init setup_slub_max_order(char *str) 2630static int __init setup_slub_max_order(char *str)
2523{ 2631{
2524 get_option(&str, &slub_max_order); 2632 get_option(&str, &slub_max_order);
2633 slub_max_order = min(slub_max_order, MAX_ORDER - 1);
2525 2634
2526 return 1; 2635 return 1;
2527} 2636}
@@ -2553,13 +2662,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2553 if (gfp_flags & SLUB_DMA) 2662 if (gfp_flags & SLUB_DMA)
2554 flags = SLAB_CACHE_DMA; 2663 flags = SLAB_CACHE_DMA;
2555 2664
2556 down_write(&slub_lock); 2665 /*
 2666 * This function is called with IRQs disabled during early boot on
 2667 * a single CPU, so there's no need to take slub_lock here.
2668 */
2557 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2669 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2558 flags, NULL)) 2670 flags, NULL))
2559 goto panic; 2671 goto panic;
2560 2672
2561 list_add(&s->list, &slab_caches); 2673 list_add(&s->list, &slab_caches);
2562 up_write(&slub_lock); 2674
2563 if (sysfs_slab_add(s)) 2675 if (sysfs_slab_add(s))
2564 goto panic; 2676 goto panic;
2565 return s; 2677 return s;
@@ -2615,7 +2727,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2615 2727
2616 if (!s || !text || !kmem_cache_open(s, flags, text, 2728 if (!s || !text || !kmem_cache_open(s, flags, text,
2617 realsize, ARCH_KMALLOC_MINALIGN, 2729 realsize, ARCH_KMALLOC_MINALIGN,
2618 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { 2730 SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED,
2731 NULL)) {
2619 kfree(s); 2732 kfree(s);
2620 kfree(text); 2733 kfree(text);
2621 goto unlock_out; 2734 goto unlock_out;
@@ -2709,9 +2822,10 @@ EXPORT_SYMBOL(__kmalloc);
2709 2822
2710static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2823static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2711{ 2824{
2712 struct page *page = alloc_pages_node(node, flags | __GFP_COMP, 2825 struct page *page;
2713 get_order(size));
2714 2826
2827 flags |= __GFP_COMP | __GFP_NOTRACK;
2828 page = alloc_pages_node(node, flags, get_order(size));
2715 if (page) 2829 if (page)
2716 return page_address(page); 2830 return page_address(page);
2717 else 2831 else
@@ -3017,7 +3131,7 @@ void __init kmem_cache_init(void)
3017 * kmem_cache_open for slab_state == DOWN. 3131 * kmem_cache_open for slab_state == DOWN.
3018 */ 3132 */
3019 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3133 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
3020 sizeof(struct kmem_cache_node), GFP_KERNEL); 3134 sizeof(struct kmem_cache_node), GFP_NOWAIT);
3021 kmalloc_caches[0].refcount = -1; 3135 kmalloc_caches[0].refcount = -1;
3022 caches++; 3136 caches++;
3023 3137
@@ -3030,16 +3144,16 @@ void __init kmem_cache_init(void)
3030 /* Caches that are not of the two-to-the-power-of size */ 3144 /* Caches that are not of the two-to-the-power-of size */
3031 if (KMALLOC_MIN_SIZE <= 64) { 3145 if (KMALLOC_MIN_SIZE <= 64) {
3032 create_kmalloc_cache(&kmalloc_caches[1], 3146 create_kmalloc_cache(&kmalloc_caches[1],
3033 "kmalloc-96", 96, GFP_KERNEL); 3147 "kmalloc-96", 96, GFP_NOWAIT);
3034 caches++; 3148 caches++;
3035 create_kmalloc_cache(&kmalloc_caches[2], 3149 create_kmalloc_cache(&kmalloc_caches[2],
3036 "kmalloc-192", 192, GFP_KERNEL); 3150 "kmalloc-192", 192, GFP_NOWAIT);
3037 caches++; 3151 caches++;
3038 } 3152 }
3039 3153
3040 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3154 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3041 create_kmalloc_cache(&kmalloc_caches[i], 3155 create_kmalloc_cache(&kmalloc_caches[i],
3042 "kmalloc", 1 << i, GFP_KERNEL); 3156 "kmalloc", 1 << i, GFP_NOWAIT);
3043 caches++; 3157 caches++;
3044 } 3158 }
3045 3159
@@ -3076,7 +3190,7 @@ void __init kmem_cache_init(void)
3076 /* Provide the correct kmalloc names now that the caches are up */ 3190 /* Provide the correct kmalloc names now that the caches are up */
3077 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) 3191 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3078 kmalloc_caches[i]. name = 3192 kmalloc_caches[i]. name =
3079 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3193 kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3080 3194
3081#ifdef CONFIG_SMP 3195#ifdef CONFIG_SMP
3082 register_cpu_notifier(&slab_notifier); 3196 register_cpu_notifier(&slab_notifier);
@@ -3094,6 +3208,14 @@ void __init kmem_cache_init(void)
3094 nr_cpu_ids, nr_node_ids); 3208 nr_cpu_ids, nr_node_ids);
3095} 3209}
3096 3210
3211void __init kmem_cache_init_late(void)
3212{
3213 /*
3214 * Interrupts are enabled now so all GFP allocations are safe.
3215 */
3216 slab_gfp_mask = __GFP_BITS_MASK;
3217}
3218
3097/* 3219/*
3098 * Find a mergeable slab cache 3220 * Find a mergeable slab cache
3099 */ 3221 */
@@ -3314,20 +3436,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3314} 3436}
3315 3437
3316#ifdef CONFIG_SLUB_DEBUG 3438#ifdef CONFIG_SLUB_DEBUG
3317static unsigned long count_partial(struct kmem_cache_node *n,
3318 int (*get_count)(struct page *))
3319{
3320 unsigned long flags;
3321 unsigned long x = 0;
3322 struct page *page;
3323
3324 spin_lock_irqsave(&n->list_lock, flags);
3325 list_for_each_entry(page, &n->partial, lru)
3326 x += get_count(page);
3327 spin_unlock_irqrestore(&n->list_lock, flags);
3328 return x;
3329}
3330
3331static int count_inuse(struct page *page) 3439static int count_inuse(struct page *page)
3332{ 3440{
3333 return page->inuse; 3441 return page->inuse;
@@ -3338,11 +3446,6 @@ static int count_total(struct page *page)
3338 return page->objects; 3446 return page->objects;
3339} 3447}
3340 3448
3341static int count_free(struct page *page)
3342{
3343 return page->objects - page->inuse;
3344}
3345
3346static int validate_slab(struct kmem_cache *s, struct page *page, 3449static int validate_slab(struct kmem_cache *s, struct page *page,
3347 unsigned long *map) 3450 unsigned long *map)
3348{ 3451{
@@ -3711,7 +3814,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3711 to_cpumask(l->cpus)); 3814 to_cpumask(l->cpus));
3712 } 3815 }
3713 3816
3714 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3817 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3715 len < PAGE_SIZE - 60) { 3818 len < PAGE_SIZE - 60) {
3716 len += sprintf(buf + len, " nodes="); 3819 len += sprintf(buf + len, " nodes=");
3717 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3820 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
@@ -4386,6 +4489,8 @@ static char *create_unique_id(struct kmem_cache *s)
4386 *p++ = 'a'; 4489 *p++ = 'a';
4387 if (s->flags & SLAB_DEBUG_FREE) 4490 if (s->flags & SLAB_DEBUG_FREE)
4388 *p++ = 'F'; 4491 *p++ = 'F';
4492 if (!(s->flags & SLAB_NOTRACK))
4493 *p++ = 't';
4389 if (p != name + 1) 4494 if (p != name + 1)
4390 *p++ = '-'; 4495 *p++ = '-';
4391 p += sprintf(p, "%07d", s->size); 4496 p += sprintf(p, "%07d", s->size);
diff --git a/mm/swap.c b/mm/swap.c
index bede23ce64ea..cb29ae5d33ab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
491 491
492EXPORT_SYMBOL(pagevec_lookup_tag); 492EXPORT_SYMBOL(pagevec_lookup_tag);
493 493
494#ifdef CONFIG_SMP
495/*
496 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
497 * CPUs
498 */
499#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
500
501static DEFINE_PER_CPU(long, committed_space);
502
503void vm_acct_memory(long pages)
504{
505 long *local;
506
507 preempt_disable();
508 local = &__get_cpu_var(committed_space);
509 *local += pages;
510 if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
511 atomic_long_add(*local, &vm_committed_space);
512 *local = 0;
513 }
514 preempt_enable();
515}
516
517#ifdef CONFIG_HOTPLUG_CPU
518
519/* Drop the CPU's cached committed space back into the central pool. */
520static int cpu_swap_callback(struct notifier_block *nfb,
521 unsigned long action,
522 void *hcpu)
523{
524 long *committed;
525
526 committed = &per_cpu(committed_space, (long)hcpu);
527 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
528 atomic_long_add(*committed, &vm_committed_space);
529 *committed = 0;
530 drain_cpu_pagevecs((long)hcpu);
531 }
532 return NOTIFY_OK;
533}
534#endif /* CONFIG_HOTPLUG_CPU */
535#endif /* CONFIG_SMP */
536
537/* 494/*
538 * Perform any setup for the swap system 495 * Perform any setup for the swap system
539 */ 496 */
@@ -554,7 +511,4 @@ void __init swap_setup(void)
554 * Right now other parts of the system means that we 511 * Right now other parts of the system means that we
555 * _really_ don't want to cluster much more 512 * _really_ don't want to cluster much more
556 */ 513 */
557#ifdef CONFIG_HOTPLUG_CPU
558 hotcpu_notifier(cpu_swap_callback, 0);
559#endif
560} 514}
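Editor's note: the block removed from swap.c above implemented a classic batched per-CPU counter: each CPU accumulates small deltas locally and folds them into the shared atomic only when they exceed a threshold, trading a little accuracy for far less cache-line ping-pong. A standalone, single-threaded model of that pattern follows; the per-CPU array and the threshold value are stand-ins, not the kernel's.

	#include <stdio.h>

	#define NR_CPUS        4
	#define ACCT_THRESHOLD 16                    /* flush once |local| exceeds this */

	static long global_committed;                /* models vm_committed_space */
	static long local_committed[NR_CPUS];        /* models the per-CPU counters */

	static void vm_acct_memory_model(int cpu, long pages)
	{
		long *local = &local_committed[cpu];

		*local += pages;
		if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
			global_committed += *local;  /* the only write to shared state */
			*local = 0;
		}
	}

	int main(void)
	{
		for (int i = 0; i < 10; i++)
			vm_acct_memory_model(0, 3);  /* 30 pages, flushed in one chunk */
		printf("global=%ld local=%ld\n", global_committed, local_committed[0]);
		return 0;
	}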
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3ecea98ecb45..42cd38eba79f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -109,8 +109,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
109 */ 109 */
110void __delete_from_swap_cache(struct page *page) 110void __delete_from_swap_cache(struct page *page)
111{ 111{
112 swp_entry_t ent = {.val = page_private(page)};
113
114 VM_BUG_ON(!PageLocked(page)); 112 VM_BUG_ON(!PageLocked(page));
115 VM_BUG_ON(!PageSwapCache(page)); 113 VM_BUG_ON(!PageSwapCache(page));
116 VM_BUG_ON(PageWriteback(page)); 114 VM_BUG_ON(PageWriteback(page));
@@ -121,13 +119,11 @@ void __delete_from_swap_cache(struct page *page)
121 total_swapcache_pages--; 119 total_swapcache_pages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 120 __dec_zone_page_state(page, NR_FILE_PAGES);
123 INC_CACHE_INFO(del_total); 121 INC_CACHE_INFO(del_total);
124 mem_cgroup_uncharge_swapcache(page, ent);
125} 122}
126 123
127/** 124/**
128 * add_to_swap - allocate swap space for a page 125 * add_to_swap - allocate swap space for a page
129 * @page: page we want to move to swap 126 * @page: page we want to move to swap
130 * @gfp_mask: memory allocation flags
131 * 127 *
132 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
133 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
@@ -165,11 +161,11 @@ int add_to_swap(struct page *page)
165 return 1; 161 return 1;
166 case -EEXIST: 162 case -EEXIST:
167 /* Raced with "speculative" read_swap_cache_async */ 163 /* Raced with "speculative" read_swap_cache_async */
168 swap_free(entry); 164 swapcache_free(entry, NULL);
169 continue; 165 continue;
170 default: 166 default:
171 /* -ENOMEM radix-tree allocation failure */ 167 /* -ENOMEM radix-tree allocation failure */
172 swap_free(entry); 168 swapcache_free(entry, NULL);
173 return 0; 169 return 0;
174 } 170 }
175 } 171 }
@@ -191,7 +187,7 @@ void delete_from_swap_cache(struct page *page)
191 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
192 spin_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
193 189
194 swap_free(entry); 190 swapcache_free(entry, page);
195 page_cache_release(page); 191 page_cache_release(page);
196} 192}
197 193
@@ -295,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
295 /* 291 /*
296 * Swap entry may have been freed since our caller observed it. 292 * Swap entry may have been freed since our caller observed it.
297 */ 293 */
298 if (!swap_duplicate(entry)) 294 err = swapcache_prepare(entry);
295 if (err == -EEXIST) /* seems racy */
296 continue;
297 if (err) /* swp entry is obsolete ? */
299 break; 298 break;
300 299
301 /* 300 /*
@@ -314,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
314 * Initiate read into locked page and return. 313 * Initiate read into locked page and return.
315 */ 314 */
316 lru_cache_add_anon(new_page); 315 lru_cache_add_anon(new_page);
317 swap_readpage(NULL, new_page); 316 swap_readpage(new_page);
318 return new_page; 317 return new_page;
319 } 318 }
320 ClearPageSwapBacked(new_page); 319 ClearPageSwapBacked(new_page);
321 __clear_page_locked(new_page); 320 __clear_page_locked(new_page);
322 swap_free(entry); 321 swapcache_free(entry, NULL);
323 } while (err != -ENOMEM); 322 } while (err != -ENOMEM);
324 323
325 if (new_page) 324 if (new_page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..28faa01cf578 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54static DEFINE_MUTEX(swapon_mutex); 54static DEFINE_MUTEX(swapon_mutex);
55 55
56/* For reference count accounting in swap_map */
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
 82/* returns 1 if swap entry is freed */
83static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{
86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page;
89 int ret = 0;
90
91 page = find_get_page(&swapper_space, entry.val);
92 if (!page)
93 return 0;
94 /*
 95 * This function is called from scan_swap_map(), which is reached
 96 * from vmscan.c while reclaiming pages, so a page lock is already
 97 * held here. We have to use trylock to avoid deadlock. This is a
 98 * special case: in normal operation use try_to_free_swap() with an
 99 * explicit lock_page().
100 */
101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page);
103 unlock_page(page);
104 }
105 page_cache_release(page);
106 return ret;
107}
108
56/* 109/*
57 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
58 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
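Editor's note: the helpers added at the top of this hunk pack two things into each unsigned short swap_map slot: a reference count and a SWAP_HAS_CACHE flag meaning "the swap cache also holds this entry". The self-contained model below mirrors those helpers; the actual bit values live in swap.h and are assumed here, so treat them as illustrative.

	#include <assert.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define SWAP_HAS_CACHE  0x8000u              /* assumed top bit of the unsigned short */
	#define SWAP_COUNT_MASK 0x7fffu

	static int swap_count(unsigned short ent)
	{
		return ent & SWAP_COUNT_MASK;
	}

	static bool swap_has_cache(unsigned short ent)
	{
		return !!(ent & SWAP_HAS_CACHE);
	}

	static unsigned short encode_swapmap(int count, bool has_cache)
	{
		unsigned short ret = (unsigned short)count;

		return has_cache ? (unsigned short)(SWAP_HAS_CACHE | ret) : ret;
	}

	int main(void)
	{
		/* usual swap-out: entry owned by the swap cache only, no pte references yet */
		unsigned short e = encode_swapmap(0, true);

		assert(swap_count(e) == 0 && swap_has_cache(e));

		/* a pte starts referencing it (swap_duplicate) */
		e = encode_swapmap(swap_count(e) + 1, swap_has_cache(e));
		assert(swap_count(e) == 1 && swap_has_cache(e));

		/* swap cache dropped (swapcache_free): count survives, flag cleared */
		e = encode_swapmap(swap_count(e), false);
		printf("count=%d has_cache=%d\n", swap_count(e), swap_has_cache(e));
		return 0;
	}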
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
167#define SWAPFILE_CLUSTER 256 220#define SWAPFILE_CLUSTER 256
168#define LATENCY_LIMIT 256 221#define LATENCY_LIMIT 256
169 222
170static inline unsigned long scan_swap_map(struct swap_info_struct *si) 223static inline unsigned long scan_swap_map(struct swap_info_struct *si,
224 int cache)
171{ 225{
172 unsigned long offset; 226 unsigned long offset;
173 unsigned long scan_base; 227 unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
273 goto no_page; 327 goto no_page;
274 if (offset > si->highest_bit) 328 if (offset > si->highest_bit)
275 scan_base = offset = si->lowest_bit; 329 scan_base = offset = si->lowest_bit;
330
331 /* reuse swap entry of cache-only swap if not busy. */
332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
333 int swap_was_freed;
334 spin_unlock(&swap_lock);
335 swap_was_freed = __try_to_reclaim_swap(si, offset);
336 spin_lock(&swap_lock);
337 /* entry was freed successfully, try to use this again */
338 if (swap_was_freed)
339 goto checks;
340 goto scan; /* check next one */
341 }
342
276 if (si->swap_map[offset]) 343 if (si->swap_map[offset])
277 goto scan; 344 goto scan;
278 345
@@ -285,7 +352,10 @@ checks:
285 si->lowest_bit = si->max; 352 si->lowest_bit = si->max;
286 si->highest_bit = 0; 353 si->highest_bit = 0;
287 } 354 }
288 si->swap_map[offset] = 1; 355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
356 si->swap_map[offset] = encode_swapmap(0, true);
357 else /* at suspend */
358 si->swap_map[offset] = encode_swapmap(1, false);
289 si->cluster_next = offset + 1; 359 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING; 360 si->flags -= SWP_SCANNING;
291 361
@@ -351,6 +421,10 @@ scan:
351 spin_lock(&swap_lock); 421 spin_lock(&swap_lock);
352 goto checks; 422 goto checks;
353 } 423 }
424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
425 spin_lock(&swap_lock);
426 goto checks;
427 }
354 if (unlikely(--latency_ration < 0)) { 428 if (unlikely(--latency_ration < 0)) {
355 cond_resched(); 429 cond_resched();
356 latency_ration = LATENCY_LIMIT; 430 latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
362 spin_lock(&swap_lock); 436 spin_lock(&swap_lock);
363 goto checks; 437 goto checks;
364 } 438 }
439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
440 spin_lock(&swap_lock);
441 goto checks;
442 }
365 if (unlikely(--latency_ration < 0)) { 443 if (unlikely(--latency_ration < 0)) {
366 cond_resched(); 444 cond_resched();
367 latency_ration = LATENCY_LIMIT; 445 latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
401 continue; 479 continue;
402 480
403 swap_list.next = next; 481 swap_list.next = next;
404 offset = scan_swap_map(si); 482 /* This is called for allocating swap entry for cache */
483 offset = scan_swap_map(si, SWAP_CACHE);
405 if (offset) { 484 if (offset) {
406 spin_unlock(&swap_lock); 485 spin_unlock(&swap_lock);
407 return swp_entry(type, offset); 486 return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
415 return (swp_entry_t) {0}; 494 return (swp_entry_t) {0};
416} 495}
417 496
497/* The only caller of this function is now the suspend routine */
418swp_entry_t get_swap_page_of_type(int type) 498swp_entry_t get_swap_page_of_type(int type)
419{ 499{
420 struct swap_info_struct *si; 500 struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
424 si = swap_info + type; 504 si = swap_info + type;
425 if (si->flags & SWP_WRITEOK) { 505 if (si->flags & SWP_WRITEOK) {
426 nr_swap_pages--; 506 nr_swap_pages--;
427 offset = scan_swap_map(si); 507 /* This is called for allocating swap entry, not cache */
508 offset = scan_swap_map(si, SWAP_MAP);
428 if (offset) { 509 if (offset) {
429 spin_unlock(&swap_lock); 510 spin_unlock(&swap_lock);
430 return swp_entry(type, offset); 511 return swp_entry(type, offset);
@@ -471,25 +552,38 @@ out:
471 return NULL; 552 return NULL;
472} 553}
473 554
474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) 555static int swap_entry_free(struct swap_info_struct *p,
556 swp_entry_t ent, int cache)
475{ 557{
476 unsigned long offset = swp_offset(ent); 558 unsigned long offset = swp_offset(ent);
477 int count = p->swap_map[offset]; 559 int count = swap_count(p->swap_map[offset]);
478 560 bool has_cache;
479 if (count < SWAP_MAP_MAX) { 561
480 count--; 562 has_cache = swap_has_cache(p->swap_map[offset]);
481 p->swap_map[offset] = count; 563
482 if (!count) { 564 if (cache == SWAP_MAP) { /* dropping usage count of swap */
483 if (offset < p->lowest_bit) 565 if (count < SWAP_MAP_MAX) {
484 p->lowest_bit = offset; 566 count--;
485 if (offset > p->highest_bit) 567 p->swap_map[offset] = encode_swapmap(count, has_cache);
486 p->highest_bit = offset;
487 if (p->prio > swap_info[swap_list.next].prio)
488 swap_list.next = p - swap_info;
489 nr_swap_pages++;
490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
492 } 568 }
569 } else { /* dropping swap cache flag */
570 VM_BUG_ON(!has_cache);
571 p->swap_map[offset] = encode_swapmap(count, false);
572
573 }
574 /* return code. */
575 count = p->swap_map[offset];
576 /* free if no reference */
577 if (!count) {
578 if (offset < p->lowest_bit)
579 p->lowest_bit = offset;
580 if (offset > p->highest_bit)
581 p->highest_bit = offset;
582 if (p->prio > swap_info[swap_list.next].prio)
583 swap_list.next = p - swap_info;
584 nr_swap_pages++;
585 p->inuse_pages--;
586 mem_cgroup_uncharge_swap(ent);
493 } 587 }
494 return count; 588 return count;
495} 589}
@@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry)
504 598
505 p = swap_info_get(entry); 599 p = swap_info_get(entry);
506 if (p) { 600 if (p) {
507 swap_entry_free(p, entry); 601 swap_entry_free(p, entry, SWAP_MAP);
602 spin_unlock(&swap_lock);
603 }
604}
605
606/*
607 * Called after dropping swapcache to decrease refcnt to swap entries.
608 */
609void swapcache_free(swp_entry_t entry, struct page *page)
610{
611 struct swap_info_struct *p;
612
613 if (page)
614 mem_cgroup_uncharge_swapcache(page, entry);
615 p = swap_info_get(entry);
616 if (p) {
617 swap_entry_free(p, entry, SWAP_CACHE);
508 spin_unlock(&swap_lock); 618 spin_unlock(&swap_lock);
509 } 619 }
620 return;
510} 621}
511 622
512/* 623/*
@@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page)
521 entry.val = page_private(page); 632 entry.val = page_private(page);
522 p = swap_info_get(entry); 633 p = swap_info_get(entry);
523 if (p) { 634 if (p) {
524 /* Subtract the 1 for the swap cache itself */ 635 count = swap_count(p->swap_map[swp_offset(entry)]);
525 count = p->swap_map[swp_offset(entry)] - 1;
526 spin_unlock(&swap_lock); 636 spin_unlock(&swap_lock);
527 } 637 }
528 return count; 638 return count;
@@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry)
584 694
585 p = swap_info_get(entry); 695 p = swap_info_get(entry);
586 if (p) { 696 if (p) {
587 if (swap_entry_free(p, entry) == 1) { 697 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
588 page = find_get_page(&swapper_space, entry.val); 698 page = find_get_page(&swapper_space, entry.val);
589 if (page && !trylock_page(page)) { 699 if (page && !trylock_page(page)) {
590 page_cache_release(page); 700 page_cache_release(page);
@@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
891 i = 1; 1001 i = 1;
892 } 1002 }
893 count = si->swap_map[i]; 1003 count = si->swap_map[i];
894 if (count && count != SWAP_MAP_BAD) 1004 if (count && swap_count(count) != SWAP_MAP_BAD)
895 break; 1005 break;
896 } 1006 }
897 return i; 1007 return i;
@@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type)
995 */ 1105 */
996 shmem = 0; 1106 shmem = 0;
997 swcount = *swap_map; 1107 swcount = *swap_map;
998 if (swcount > 1) { 1108 if (swap_count(swcount)) {
999 if (start_mm == &init_mm) 1109 if (start_mm == &init_mm)
1000 shmem = shmem_unuse(entry, page); 1110 shmem = shmem_unuse(entry, page);
1001 else 1111 else
1002 retval = unuse_mm(start_mm, entry, page); 1112 retval = unuse_mm(start_mm, entry, page);
1003 } 1113 }
1004 if (*swap_map > 1) { 1114 if (swap_count(*swap_map)) {
1005 int set_start_mm = (*swap_map >= swcount); 1115 int set_start_mm = (*swap_map >= swcount);
1006 struct list_head *p = &start_mm->mmlist; 1116 struct list_head *p = &start_mm->mmlist;
1007 struct mm_struct *new_start_mm = start_mm; 1117 struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type)
1011 atomic_inc(&new_start_mm->mm_users); 1121 atomic_inc(&new_start_mm->mm_users);
1012 atomic_inc(&prev_mm->mm_users); 1122 atomic_inc(&prev_mm->mm_users);
1013 spin_lock(&mmlist_lock); 1123 spin_lock(&mmlist_lock);
1014 while (*swap_map > 1 && !retval && !shmem && 1124 while (swap_count(*swap_map) && !retval && !shmem &&
1015 (p = p->next) != &start_mm->mmlist) { 1125 (p = p->next) != &start_mm->mmlist) {
1016 mm = list_entry(p, struct mm_struct, mmlist); 1126 mm = list_entry(p, struct mm_struct, mmlist);
1017 if (!atomic_inc_not_zero(&mm->mm_users)) 1127 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type)
1023 cond_resched(); 1133 cond_resched();
1024 1134
1025 swcount = *swap_map; 1135 swcount = *swap_map;
1026 if (swcount <= 1) 1136 if (!swap_count(swcount)) /* any usage ? */
1027 ; 1137 ;
1028 else if (mm == &init_mm) { 1138 else if (mm == &init_mm) {
1029 set_start_mm = 1; 1139 set_start_mm = 1;
1030 shmem = shmem_unuse(entry, page); 1140 shmem = shmem_unuse(entry, page);
1031 } else 1141 } else
1032 retval = unuse_mm(mm, entry, page); 1142 retval = unuse_mm(mm, entry, page);
1033 if (set_start_mm && *swap_map < swcount) { 1143
1144 if (set_start_mm &&
1145 swap_count(*swap_map) < swcount) {
1034 mmput(new_start_mm); 1146 mmput(new_start_mm);
1035 atomic_inc(&mm->mm_users); 1147 atomic_inc(&mm->mm_users);
1036 new_start_mm = mm; 1148 new_start_mm = mm;
@@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type)
1057 } 1169 }
1058 1170
1059 /* 1171 /*
1060 * How could swap count reach 0x7fff when the maximum 1172 * How could swap count reach 0x7ffe ?
1061 * pid is 0x7fff, and there's no way to repeat a swap 1173 * There's no way to repeat a swap page within an mm
1062 * page within an mm (except in shmem, where it's the 1174 * (except in shmem, where it's the shared object which takes
1063 * shared object which takes the reference count)? 1175 * the reference count)?
 1064 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 1176 * We believe SWAP_MAP_MAX cannot occur (if it did, an unsigned
 1065 * 1177 * short would be too small....)
1066 * If that's wrong, then we should worry more about 1178 * If that's wrong, then we should worry more about
1067 * exit_mmap() and do_munmap() cases described above: 1179 * exit_mmap() and do_munmap() cases described above:
1068 * we might be resetting SWAP_MAP_MAX too early here. 1180 * we might be resetting SWAP_MAP_MAX too early here.
1069 * We know "Undead"s can happen, they're okay, so don't 1181 * We know "Undead"s can happen, they're okay, so don't
1070 * report them; but do report if we reset SWAP_MAP_MAX. 1182 * report them; but do report if we reset SWAP_MAP_MAX.
1071 */ 1183 */
1072 if (*swap_map == SWAP_MAP_MAX) { 1184 /* We might release the lock_page() in unuse_mm(). */
1185 if (!PageSwapCache(page) || page_private(page) != entry.val)
1186 goto retry;
1187
1188 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1073 spin_lock(&swap_lock); 1189 spin_lock(&swap_lock);
1074 *swap_map = 1; 1190 *swap_map = encode_swapmap(0, true);
1075 spin_unlock(&swap_lock); 1191 spin_unlock(&swap_lock);
1076 reset_overflow = 1; 1192 reset_overflow = 1;
1077 } 1193 }
@@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type)
1089 * pages would be incorrect if swap supported "shared 1205 * pages would be incorrect if swap supported "shared
1090 * private" pages, but they are handled by tmpfs files. 1206 * private" pages, but they are handled by tmpfs files.
1091 */ 1207 */
1092 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 1208 if (swap_count(*swap_map) &&
1209 PageDirty(page) && PageSwapCache(page)) {
1093 struct writeback_control wbc = { 1210 struct writeback_control wbc = {
1094 .sync_mode = WB_SYNC_NONE, 1211 .sync_mode = WB_SYNC_NONE,
1095 }; 1212 };
@@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type)
1116 * mark page dirty so shrink_page_list will preserve it. 1233 * mark page dirty so shrink_page_list will preserve it.
1117 */ 1234 */
1118 SetPageDirty(page); 1235 SetPageDirty(page);
1236retry:
1119 unlock_page(page); 1237 unlock_page(page);
1120 page_cache_release(page); 1238 page_cache_release(page);
1121 1239
@@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val)
1942 * 2060 *
1943 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2061 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1944 * "permanent", but will be reclaimed by the next swapoff. 2062 * "permanent", but will be reclaimed by the next swapoff.
 2063 * Returns error code in the following cases.
2064 * - success -> 0
2065 * - swp_entry is invalid -> EINVAL
2066 * - swp_entry is migration entry -> EINVAL
2067 * - swap-cache reference is requested but there is already one. -> EEXIST
2068 * - swap-cache reference is requested but the entry is not used. -> ENOENT
1945 */ 2069 */
1946int swap_duplicate(swp_entry_t entry) 2070static int __swap_duplicate(swp_entry_t entry, bool cache)
1947{ 2071{
1948 struct swap_info_struct * p; 2072 struct swap_info_struct * p;
1949 unsigned long offset, type; 2073 unsigned long offset, type;
1950 int result = 0; 2074 int result = -EINVAL;
2075 int count;
2076 bool has_cache;
1951 2077
1952 if (is_migration_entry(entry)) 2078 if (is_migration_entry(entry))
1953 return 1; 2079 return -EINVAL;
1954 2080
1955 type = swp_type(entry); 2081 type = swp_type(entry);
1956 if (type >= nr_swapfiles) 2082 if (type >= nr_swapfiles)
@@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry)
1959 offset = swp_offset(entry); 2085 offset = swp_offset(entry);
1960 2086
1961 spin_lock(&swap_lock); 2087 spin_lock(&swap_lock);
1962 if (offset < p->max && p->swap_map[offset]) { 2088
1963 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 2089 if (unlikely(offset >= p->max))
1964 p->swap_map[offset]++; 2090 goto unlock_out;
1965 result = 1; 2091
1966 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 2092 count = swap_count(p->swap_map[offset]);
2093 has_cache = swap_has_cache(p->swap_map[offset]);
2094
2095 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
2096
2097 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2098 if (!has_cache && count) {
2099 p->swap_map[offset] = encode_swapmap(count, true);
2100 result = 0;
2101 } else if (has_cache) /* someone added cache */
2102 result = -EEXIST;
2103 else if (!count) /* no users */
2104 result = -ENOENT;
2105
2106 } else if (count || has_cache) {
2107 if (count < SWAP_MAP_MAX - 1) {
2108 p->swap_map[offset] = encode_swapmap(count + 1,
2109 has_cache);
2110 result = 0;
2111 } else if (count <= SWAP_MAP_MAX) {
1967 if (swap_overflow++ < 5) 2112 if (swap_overflow++ < 5)
1968 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 2113 printk(KERN_WARNING
1969 p->swap_map[offset] = SWAP_MAP_MAX; 2114 "swap_dup: swap entry overflow\n");
1970 result = 1; 2115 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
2116 has_cache);
2117 result = 0;
1971 } 2118 }
1972 } 2119 } else
2120 result = -ENOENT; /* unused swap entry */
2121unlock_out:
1973 spin_unlock(&swap_lock); 2122 spin_unlock(&swap_lock);
1974out: 2123out:
1975 return result; 2124 return result;
@@ -1978,6 +2127,27 @@ bad_file:
1978 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2127 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1979 goto out; 2128 goto out;
1980} 2129}
2130/*
2131 * increase reference count of swap entry by 1.
2132 */
2133void swap_duplicate(swp_entry_t entry)
2134{
2135 __swap_duplicate(entry, SWAP_MAP);
2136}
2137
2138/*
2139 * @entry: swap entry for which we allocate swap cache.
2140 *
 2141 * Called when allocating swap cache for an existing swap entry.
 2142 * This can return error codes. Returns 0 on success.
 2143 * -EEXIST means there is already a swap cache for the entry.
 2144 * Note: return code is different from swap_duplicate().
2145 */
2146int swapcache_prepare(swp_entry_t entry)
2147{
2148 return __swap_duplicate(entry, SWAP_CACHE);
2149}
2150
1981 2151
1982struct swap_info_struct * 2152struct swap_info_struct *
1983get_swap_info_struct(unsigned type) 2153get_swap_info_struct(unsigned type)
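Editor's note: the rewritten __swap_duplicate() above serves two callers through one state machine over the encoded swap_map slot: swap_duplicate() bumps the pte reference count, while swapcache_prepare() claims the SWAP_HAS_CACHE bit and reports -EEXIST/-ENOENT so swap-in can retry or give up. The sketch below models only that decision table in user space; the single-slot "map", the bit values and SWAP_MAP_MAX are assumptions, and the overflow-saturation path is deliberately omitted.

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define SWAP_HAS_CACHE  0x8000u
	#define SWAP_COUNT_MASK 0x7fffu
	#define SWAP_MAP_MAX    0x7ffeu              /* assumed: just below the count mask */

	enum { SWAP_MAP, SWAP_CACHE };               /* same meaning as in the hunk above */

	static unsigned short slot;                  /* one swap_map entry */

	static int model_swap_duplicate(int cache)
	{
		int count = slot & SWAP_COUNT_MASK;
		bool has_cache = slot & SWAP_HAS_CACHE;

		if (cache == SWAP_CACHE) {
			if (!has_cache && count) {   /* claim the cache bit for this entry */
				slot = SWAP_HAS_CACHE | count;
				return 0;
			}
			return has_cache ? -EEXIST : -ENOENT;
		}
		if (count || has_cache) {            /* pte reference on a live entry */
			if (count < SWAP_MAP_MAX)    /* overflow saturation path omitted */
				slot = (has_cache ? SWAP_HAS_CACHE : 0) | (count + 1);
			return 0;
		}
		return -ENOENT;                      /* unused swap entry */
	}

	int main(void)
	{
		slot = 1;                            /* one pte reference, no swap cache */
		printf("%d\n", model_swap_duplicate(SWAP_CACHE));  /* 0: cache bit claimed  */
		printf("%d\n", model_swap_duplicate(SWAP_CACHE));  /* -EEXIST: lost the race */
		slot = 0;
		printf("%d\n", model_swap_duplicate(SWAP_CACHE));  /* -ENOENT: entry unused  */
		return 0;
	}

This is the protocol read_swap_cache_async() relies on in the swap_state.c hunk earlier: -EEXIST means another swap-in won the race, so the lookup loop simply retries.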
@@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2016 /* Don't read in free or bad pages */ 2186 /* Don't read in free or bad pages */
2017 if (!si->swap_map[toff]) 2187 if (!si->swap_map[toff])
2018 break; 2188 break;
2019 if (si->swap_map[toff] == SWAP_MAP_BAD) 2189 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2020 break; 2190 break;
2021 } 2191 }
2022 /* Count contiguous allocated slots below our target */ 2192 /* Count contiguous allocated slots below our target */
@@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2024 /* Don't read in free or bad pages */ 2194 /* Don't read in free or bad pages */
2025 if (!si->swap_map[toff]) 2195 if (!si->swap_map[toff])
2026 break; 2196 break;
2027 if (si->swap_map[toff] == SWAP_MAP_BAD) 2197 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2028 break; 2198 break;
2029 } 2199 }
2030 spin_unlock(&swap_lock); 2200 spin_unlock(&swap_lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index 55206fab7b99..ccc3ecf7cb98 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
267} 267}
268EXPORT_SYMBOL(truncate_inode_pages); 268EXPORT_SYMBOL(truncate_inode_pages);
269 269
270unsigned long __invalidate_mapping_pages(struct address_space *mapping, 270/**
271 pgoff_t start, pgoff_t end, bool be_atomic) 271 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
272 * @mapping: the address_space which holds the pages to invalidate
273 * @start: the offset 'from' which to invalidate
274 * @end: the offset 'to' which to invalidate (inclusive)
275 *
 276 * This function only removes the unlocked pages; if you want to
 277 * remove all the pages of one inode, you must call truncate_inode_pages.
278 *
279 * invalidate_mapping_pages() will not block on IO activity. It will not
280 * invalidate pages which are dirty, locked, under writeback or mapped into
281 * pagetables.
282 */
283unsigned long invalidate_mapping_pages(struct address_space *mapping,
284 pgoff_t start, pgoff_t end)
272{ 285{
273 struct pagevec pvec; 286 struct pagevec pvec;
274 pgoff_t next = start; 287 pgoff_t next = start;
@@ -309,30 +322,10 @@ unlock:
309 break; 322 break;
310 } 323 }
311 pagevec_release(&pvec); 324 pagevec_release(&pvec);
312 if (likely(!be_atomic)) 325 cond_resched();
313 cond_resched();
314 } 326 }
315 return ret; 327 return ret;
316} 328}
317
318/**
319 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
320 * @mapping: the address_space which holds the pages to invalidate
321 * @start: the offset 'from' which to invalidate
322 * @end: the offset 'to' which to invalidate (inclusive)
323 *
324 * This function only removes the unlocked pages, if you want to
325 * remove all the pages of one inode, you must call truncate_inode_pages.
326 *
327 * invalidate_mapping_pages() will not block on IO activity. It will not
328 * invalidate pages which are dirty, locked, under writeback or mapped into
329 * pagetables.
330 */
331unsigned long invalidate_mapping_pages(struct address_space *mapping,
332 pgoff_t start, pgoff_t end)
333{
334 return __invalidate_mapping_pages(mapping, start, end, false);
335}
336EXPORT_SYMBOL(invalidate_mapping_pages); 329EXPORT_SYMBOL(invalidate_mapping_pages);
337 330
338/* 331/*
@@ -359,6 +352,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
359 BUG_ON(page_has_private(page)); 352 BUG_ON(page_has_private(page));
360 __remove_from_page_cache(page); 353 __remove_from_page_cache(page);
361 spin_unlock_irq(&mapping->tree_lock); 354 spin_unlock_irq(&mapping->tree_lock);
355 mem_cgroup_uncharge_cache_page(page);
362 page_cache_release(page); /* pagecache ref */ 356 page_cache_release(page); /* pagecache ref */
363 return 1; 357 return 1;
364failed: 358failed:
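Editor's note: per the kernel-doc block that now sits directly above invalidate_mapping_pages(), the function is the non-blocking, best-effort counterpart to truncate_inode_pages(). A caller-side sketch follows; the filesystem context is hypothetical and error handling is elided.

	/* Drop every clean, unlocked, unmapped page cached for this inode;
	 * dirty or busy pages are skipped rather than waited for. */
	invalidate_mapping_pages(inode->i_mapping, 0, -1);

	/* Tearing the inode down for good still requires the blocking
	 * variant, which really does remove everything: */
	truncate_inode_pages(inode->i_mapping, 0);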
diff --git a/mm/util.c b/mm/util.c
index 55bef160b9f1..7c35ad95f927 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,9 +4,11 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/tracepoint.h>
8#include <asm/uaccess.h> 7#include <asm/uaccess.h>
9 8
9#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h>
11
10/** 12/**
11 * kstrdup - allocate space for and copy an existing string 13 * kstrdup - allocate space for and copy an existing string
12 * @s: the string to duplicate 14 * @s: the string to duplicate
@@ -166,6 +168,10 @@ EXPORT_SYMBOL(krealloc);
166 * 168 *
167 * The memory of the object @p points to is zeroed before freed. 169 * The memory of the object @p points to is zeroed before freed.
168 * If @p is %NULL, kzfree() does nothing. 170 * If @p is %NULL, kzfree() does nothing.
171 *
172 * Note: this function zeroes the whole allocated buffer which can be a good
173 * deal bigger than the requested buffer size passed to kmalloc(). So be
174 * careful when using this function in performance sensitive code.
169 */ 175 */
170void kzfree(const void *p) 176void kzfree(const void *p)
171{ 177{
@@ -231,13 +237,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
231 * @pages: array that receives pointers to the pages pinned. 237 * @pages: array that receives pointers to the pages pinned.
232 * Should be at least nr_pages long. 238 * Should be at least nr_pages long.
233 * 239 *
234 * Attempt to pin user pages in memory without taking mm->mmap_sem.
235 * If not successful, it will fall back to taking the lock and
236 * calling get_user_pages().
237 *
238 * Returns number of pages pinned. This may be fewer than the number 240 * Returns number of pages pinned. This may be fewer than the number
239 * requested. If nr_pages is 0 or negative, returns 0. If no pages 241 * requested. If nr_pages is 0 or negative, returns 0. If no pages
240 * were pinned, returns -errno. 242 * were pinned, returns -errno.
243 *
244 * get_user_pages_fast provides equivalent functionality to get_user_pages,
245 * operating on current and current->mm, with force=0 and vma=NULL. However
246 * unlike get_user_pages, it must be called without mmap_sem held.
247 *
248 * get_user_pages_fast may take mmap_sem and page table locks, so no
249 * assumptions can be made about lack of locking. get_user_pages_fast is to be
250 * implemented in a way that is advantageous (vs get_user_pages()) when the
251 * user memory area is already faulted in and present in ptes. However if the
252 * pages have to be faulted in, it may turn out to be slightly slower so
253 * callers need to carefully consider what to use. On many architectures,
254 * get_user_pages_fast simply falls back to get_user_pages.
241 */ 255 */
242int __attribute__((weak)) get_user_pages_fast(unsigned long start, 256int __attribute__((weak)) get_user_pages_fast(unsigned long start,
243 int nr_pages, int write, struct page **pages) 257 int nr_pages, int write, struct page **pages)
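Editor's note: the relocated kernel-doc above spells out the contract, the same semantics as get_user_pages() on current->mm but callable without mmap_sem held. A caller-side sketch is below; the driver context and user_addr variable are hypothetical, and only the signature documented here plus put_page() are relied on.

	struct page *pages[16];
	int i, got;

	/* Pin up to 16 user pages for writing; may fall back to get_user_pages(). */
	got = get_user_pages_fast(user_addr, 16, 1, pages);
	if (got < 0)
		return got;                  /* nothing was pinned */

	/* ... access the first 'got' pages via kmap()/DMA/etc. ... */

	for (i = 0; i < got; i++)
		put_page(pages[i]);          /* drop the references taken by the pin */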
@@ -255,13 +269,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
255EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
256 270
257/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
258DEFINE_TRACE(kmalloc);
259DEFINE_TRACE(kmem_cache_alloc);
260DEFINE_TRACE(kmalloc_node);
261DEFINE_TRACE(kmem_cache_alloc_node);
262DEFINE_TRACE(kfree);
263DEFINE_TRACE(kmem_cache_free);
264
265EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
266EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
267EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); 274EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fab19876b4d1..f8189a4b3e13 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,8 +23,8 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
27#include <linux/pfn.h> 26#include <linux/pfn.h>
27#include <linux/kmemleak.h>
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -402,6 +402,7 @@ overflow:
402 printk(KERN_WARNING 402 printk(KERN_WARNING
403 "vmap allocation for size %lu failed: " 403 "vmap allocation for size %lu failed: "
404 "use vmalloc=<size> to increase size.\n", size); 404 "use vmalloc=<size> to increase size.\n", size);
405 kfree(va);
405 return ERR_PTR(-EBUSY); 406 return ERR_PTR(-EBUSY);
406 } 407 }
407 408
@@ -1031,7 +1032,7 @@ void __init vmalloc_init(void)
1031 1032
1032 /* Import existing vmlist entries. */ 1033 /* Import existing vmlist entries. */
1033 for (tmp = vmlist; tmp; tmp = tmp->next) { 1034 for (tmp = vmlist; tmp; tmp = tmp->next) {
1034 va = alloc_bootmem(sizeof(struct vmap_area)); 1035 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1035 va->flags = tmp->flags | VM_VM_AREA; 1036 va->flags = tmp->flags | VM_VM_AREA;
1036 va->va_start = (unsigned long)tmp->addr; 1037 va->va_start = (unsigned long)tmp->addr;
1037 va->va_end = va->va_start + tmp->size; 1038 va->va_end = va->va_start + tmp->size;
@@ -1326,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1326void vfree(const void *addr) 1327void vfree(const void *addr)
1327{ 1328{
1328 BUG_ON(in_interrupt()); 1329 BUG_ON(in_interrupt());
1330
1331 kmemleak_free(addr);
1332
1329 __vunmap(addr, 1); 1333 __vunmap(addr, 1);
1330} 1334}
1331EXPORT_SYMBOL(vfree); 1335EXPORT_SYMBOL(vfree);
@@ -1438,8 +1442,17 @@ fail:
1438 1442
1439void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 1443void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1440{ 1444{
1441 return __vmalloc_area_node(area, gfp_mask, prot, -1, 1445 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1442 __builtin_return_address(0)); 1446 __builtin_return_address(0));
1447
1448 /*
1449 * A ref_count = 3 is needed because the vm_struct and vmap_area
1450 * structures allocated in the __get_vm_area_node() function contain
1451 * references to the virtual address of the vmalloc'ed block.
1452 */
1453 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1454
1455 return addr;
1443} 1456}
1444 1457
1445/** 1458/**
@@ -1458,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1458 int node, void *caller) 1471 int node, void *caller)
1459{ 1472{
1460 struct vm_struct *area; 1473 struct vm_struct *area;
1474 void *addr;
1475 unsigned long real_size = size;
1461 1476
1462 size = PAGE_ALIGN(size); 1477 size = PAGE_ALIGN(size);
1463 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1478 if (!size || (size >> PAGE_SHIFT) > num_physpages)
@@ -1469,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1469 if (!area) 1484 if (!area)
1470 return NULL; 1485 return NULL;
1471 1486
1472 return __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1487 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1488
1489 /*
1490 * A ref_count = 3 is needed because the vm_struct and vmap_area
1491 * structures allocated in the __get_vm_area_node() function contain
1492 * references to the virtual address of the vmalloc'ed block.
1493 */
1494 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1495
1496 return addr;
1473} 1497}
1474 1498
1475void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1499void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eac9577941f9..4139aa52b941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,10 +470,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
470 swp_entry_t swap = { .val = page_private(page) }; 470 swp_entry_t swap = { .val = page_private(page) };
471 __delete_from_swap_cache(page); 471 __delete_from_swap_cache(page);
472 spin_unlock_irq(&mapping->tree_lock); 472 spin_unlock_irq(&mapping->tree_lock);
473 swap_free(swap); 473 swapcache_free(swap, page);
474 } else { 474 } else {
475 __remove_from_page_cache(page); 475 __remove_from_page_cache(page);
476 spin_unlock_irq(&mapping->tree_lock); 476 spin_unlock_irq(&mapping->tree_lock);
477 mem_cgroup_uncharge_cache_page(page);
477 } 478 }
478 479
479 return 1; 480 return 1;
@@ -512,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
512 * 513 *
513 * lru_lock must not be held, interrupts must be enabled. 514 * lru_lock must not be held, interrupts must be enabled.
514 */ 515 */
515#ifdef CONFIG_UNEVICTABLE_LRU
516void putback_lru_page(struct page *page) 516void putback_lru_page(struct page *page)
517{ 517{
518 int lru; 518 int lru;
@@ -566,20 +566,6 @@ redo:
566 put_page(page); /* drop ref from isolate */ 566 put_page(page); /* drop ref from isolate */
567} 567}
568 568
569#else /* CONFIG_UNEVICTABLE_LRU */
570
571void putback_lru_page(struct page *page)
572{
573 int lru;
574 VM_BUG_ON(PageLRU(page));
575
576 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
577 lru_cache_add_lru(page, lru);
578 put_page(page);
579}
580#endif /* CONFIG_UNEVICTABLE_LRU */
581
582
583/* 569/*
584 * shrink_page_list() returns the number of reclaimed pages 570 * shrink_page_list() returns the number of reclaimed pages
585 */ 571 */
@@ -591,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
591 struct pagevec freed_pvec; 577 struct pagevec freed_pvec;
592 int pgactivate = 0; 578 int pgactivate = 0;
593 unsigned long nr_reclaimed = 0; 579 unsigned long nr_reclaimed = 0;
580 unsigned long vm_flags;
594 581
595 cond_resched(); 582 cond_resched();
596 583
@@ -641,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
641 goto keep_locked; 628 goto keep_locked;
642 } 629 }
643 630
644 referenced = page_referenced(page, 1, sc->mem_cgroup); 631 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags);
645 /* In active use or really unfreeable? Activate it. */ 633 /* In active use or really unfreeable? Activate it. */
646 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
647 referenced && page_mapping_inuse(page)) 635 referenced && page_mapping_inuse(page))
@@ -941,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
941 /* Check that we have not crossed a zone boundary. */ 929 /* Check that we have not crossed a zone boundary. */
942 if (unlikely(page_zone_id(cursor_page) != zone_id)) 930 if (unlikely(page_zone_id(cursor_page) != zone_id))
943 continue; 931 continue;
944 switch (__isolate_lru_page(cursor_page, mode, file)) { 932 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
945 case 0:
946 list_move(&cursor_page->lru, dst); 933 list_move(&cursor_page->lru, dst);
947 nr_taken++; 934 nr_taken++;
948 scan++; 935 scan++;
949 break;
950
951 case -EBUSY:
952 /* else it is being freed elsewhere */
953 list_move(&cursor_page->lru, src);
954 default:
955 break; /* ! on LRU or wrong list */
956 } 936 }
957 } 937 }
958 } 938 }
@@ -1059,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1059 unsigned long nr_scanned = 0; 1039 unsigned long nr_scanned = 0;
1060 unsigned long nr_reclaimed = 0; 1040 unsigned long nr_reclaimed = 0;
1061 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1041 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1042 int lumpy_reclaim = 0;
1043
1044 /*
1045 * If we need a large contiguous chunk of memory, or have
1046 * trouble getting a small set of contiguous pages, we
1047 * will reclaim both active and inactive pages.
1048 *
1049 * We use the same threshold as pageout congestion_wait below.
1050 */
1051 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1052 lumpy_reclaim = 1;
1053 else if (sc->order && priority < DEF_PRIORITY - 2)
1054 lumpy_reclaim = 1;
1062 1055
1063 pagevec_init(&pvec, 1); 1056 pagevec_init(&pvec, 1);
1064 1057
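The hunk above hoists the lumpy-reclaim decision out of the scan loop into a single lumpy_reclaim flag, which later hunks reuse for both the isolation mode and the congestion_wait() throttle. The decision itself is a small predicate; here is a standalone sketch, with the two constants assumed (they are not shown in this diff) to have the values usual for kernels of this era.

```c
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER	3	/* assumed value, typical for this era */
#define DEF_PRIORITY		12	/* assumed value, typical for this era */

/*
 * Reclaim both active and inactive pages (ISOLATE_BOTH) when the caller
 * wants a large contiguous chunk, or a smaller contiguous chunk is proving
 * hard to satisfy at elevated scan priority.
 */
static int lumpy_reclaim(int order, int priority)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return 1;
	if (order && priority < DEF_PRIORITY - 2)
		return 1;
	return 0;
}

int main(void)
{
	printf("order 0, priority 12 -> %d\n", lumpy_reclaim(0, 12));	/* 0 */
	printf("order 4, priority 12 -> %d\n", lumpy_reclaim(4, 12));	/* 1 */
	printf("order 2, priority 9  -> %d\n", lumpy_reclaim(2, 9));	/* 1 */
	return 0;
}
```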
@@ -1071,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1071 unsigned long nr_freed; 1064 unsigned long nr_freed;
1072 unsigned long nr_active; 1065 unsigned long nr_active;
1073 unsigned int count[NR_LRU_LISTS] = { 0, }; 1066 unsigned int count[NR_LRU_LISTS] = { 0, };
1074 int mode = ISOLATE_INACTIVE; 1067 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1075
1076 /*
1077 * If we need a large contiguous chunk of memory, or have
1078 * trouble getting a small set of contiguous pages, we
1079 * will reclaim both active and inactive pages.
1080 *
1081 * We use the same threshold as pageout congestion_wait below.
1082 */
1083 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1084 mode = ISOLATE_BOTH;
1085 else if (sc->order && priority < DEF_PRIORITY - 2)
1086 mode = ISOLATE_BOTH;
1087 1068
1088 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1069 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1089 &page_list, &nr_scan, sc->order, mode, 1070 &page_list, &nr_scan, sc->order, mode,
@@ -1120,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1120 * but that should be acceptable to the caller 1101 * but that should be acceptable to the caller
1121 */ 1102 */
1122 if (nr_freed < nr_taken && !current_is_kswapd() && 1103 if (nr_freed < nr_taken && !current_is_kswapd() &&
1123 sc->order > PAGE_ALLOC_COSTLY_ORDER) { 1104 lumpy_reclaim) {
1124 congestion_wait(WRITE, HZ/10); 1105 congestion_wait(WRITE, HZ/10);
1125 1106
1126 /* 1107 /*
@@ -1215,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1215 * But we had to alter page->flags anyway. 1196 * But we had to alter page->flags anyway.
1216 */ 1197 */
1217 1198
1199static void move_active_pages_to_lru(struct zone *zone,
1200 struct list_head *list,
1201 enum lru_list lru)
1202{
1203 unsigned long pgmoved = 0;
1204 struct pagevec pvec;
1205 struct page *page;
1206
1207 pagevec_init(&pvec, 1);
1208
1209 while (!list_empty(list)) {
1210 page = lru_to_page(list);
1211 prefetchw_prev_lru_page(page, list, flags);
1212
1213 VM_BUG_ON(PageLRU(page));
1214 SetPageLRU(page);
1215
1216 VM_BUG_ON(!PageActive(page));
1217 if (!is_active_lru(lru))
1218 ClearPageActive(page); /* we are de-activating */
1219
1220 list_move(&page->lru, &zone->lru[lru].list);
1221 mem_cgroup_add_lru_list(page, lru);
1222 pgmoved++;
1223
1224 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1225 spin_unlock_irq(&zone->lru_lock);
1226 if (buffer_heads_over_limit)
1227 pagevec_strip(&pvec);
1228 __pagevec_release(&pvec);
1229 spin_lock_irq(&zone->lru_lock);
1230 }
1231 }
1232 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1233 if (!is_active_lru(lru))
1234 __count_vm_events(PGDEACTIVATE, pgmoved);
1235}
1218 1236
1219static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1237static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1220 struct scan_control *sc, int priority, int file) 1238 struct scan_control *sc, int priority, int file)
1221{ 1239{
1222 unsigned long pgmoved; 1240 unsigned long pgmoved;
1223 int pgdeactivate = 0;
1224 unsigned long pgscanned; 1241 unsigned long pgscanned;
1242 unsigned long vm_flags;
1225 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1243 LIST_HEAD(l_hold); /* The pages which were snipped off */
1244 LIST_HEAD(l_active);
1226 LIST_HEAD(l_inactive); 1245 LIST_HEAD(l_inactive);
1227 struct page *page; 1246 struct page *page;
1228 struct pagevec pvec;
1229 enum lru_list lru;
1230 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1247 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1231 1248
1232 lru_add_drain(); 1249 lru_add_drain();
@@ -1243,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1243 } 1260 }
1244 reclaim_stat->recent_scanned[!!file] += pgmoved; 1261 reclaim_stat->recent_scanned[!!file] += pgmoved;
1245 1262
1263 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1246 if (file) 1264 if (file)
1247 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1265 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1248 else 1266 else
1249 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1267 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1250 spin_unlock_irq(&zone->lru_lock); 1268 spin_unlock_irq(&zone->lru_lock);
1251 1269
1252 pgmoved = 0; 1270 pgmoved = 0; /* count referenced (mapping) mapped pages */
1253 while (!list_empty(&l_hold)) { 1271 while (!list_empty(&l_hold)) {
1254 cond_resched(); 1272 cond_resched();
1255 page = lru_to_page(&l_hold); 1273 page = lru_to_page(&l_hold);
@@ -1262,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1262 1280
1263 /* page_referenced clears PageReferenced */ 1281 /* page_referenced clears PageReferenced */
1264 if (page_mapping_inuse(page) && 1282 if (page_mapping_inuse(page) &&
1265 page_referenced(page, 0, sc->mem_cgroup)) 1283 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1266 pgmoved++; 1284 pgmoved++;
1285 /*
1286 * Identify referenced, file-backed active pages and
1287 * give them one more trip around the active list. So
1288 * that executable code get better chances to stay in
1289 * memory under moderate memory pressure. Anon pages
1290 * are not likely to be evicted by use-once streaming
1291 * IO, plus JVM can create lots of anon VM_EXEC pages,
1292 * so we ignore them here.
1293 */
1294 if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
1295 list_add(&page->lru, &l_active);
1296 continue;
1297 }
1298 }
1267 1299
1268 list_add(&page->lru, &l_inactive); 1300 list_add(&page->lru, &l_inactive);
1269 } 1301 }
1270 1302
1271 /* 1303 /*
1272 * Move the pages to the [file or anon] inactive list. 1304 * Move pages back to the lru list.
1273 */ 1305 */
1274 pagevec_init(&pvec, 1);
1275 lru = LRU_BASE + file * LRU_FILE;
1276
1277 spin_lock_irq(&zone->lru_lock); 1306 spin_lock_irq(&zone->lru_lock);
1278 /* 1307 /*
1279 * Count referenced pages from currently used mappings as 1308 * Count referenced pages from currently used mappings as rotated,
1280 * rotated, even though they are moved to the inactive list. 1309 * even though only some of them are actually re-activated. This
1281 * This helps balance scan pressure between file and anonymous 1310 * helps balance scan pressure between file and anonymous pages in
1282 * pages in get_scan_ratio. 1311 * get_scan_ratio.
1283 */ 1312 */
1284 reclaim_stat->recent_rotated[!!file] += pgmoved; 1313 reclaim_stat->recent_rotated[!!file] += pgmoved;
1285 1314
1286 pgmoved = 0; 1315 move_active_pages_to_lru(zone, &l_active,
1287 while (!list_empty(&l_inactive)) { 1316 LRU_ACTIVE + file * LRU_FILE);
1288 page = lru_to_page(&l_inactive); 1317 move_active_pages_to_lru(zone, &l_inactive,
1289 prefetchw_prev_lru_page(page, &l_inactive, flags); 1318 LRU_BASE + file * LRU_FILE);
1290 VM_BUG_ON(PageLRU(page));
1291 SetPageLRU(page);
1292 VM_BUG_ON(!PageActive(page));
1293 ClearPageActive(page);
1294 1319
1295 list_move(&page->lru, &zone->lru[lru].list);
1296 mem_cgroup_add_lru_list(page, lru);
1297 pgmoved++;
1298 if (!pagevec_add(&pvec, page)) {
1299 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1300 spin_unlock_irq(&zone->lru_lock);
1301 pgdeactivate += pgmoved;
1302 pgmoved = 0;
1303 if (buffer_heads_over_limit)
1304 pagevec_strip(&pvec);
1305 __pagevec_release(&pvec);
1306 spin_lock_irq(&zone->lru_lock);
1307 }
1308 }
1309 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1310 pgdeactivate += pgmoved;
1311 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1312 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1313 spin_unlock_irq(&zone->lru_lock); 1320 spin_unlock_irq(&zone->lru_lock);
1314 if (buffer_heads_over_limit)
1315 pagevec_strip(&pvec);
1316 pagevec_release(&pvec);
1317} 1321}
1318 1322
1319static int inactive_anon_is_low_global(struct zone *zone) 1323static int inactive_anon_is_low_global(struct zone *zone)
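The rewritten shrink_active_list() above splits the scanned pages into two lists: referenced, file-backed pages with an executable mapping go back onto the active list, everything else is demoted, and both lists are handed to the new move_active_pages_to_lru(). The routing rule on its own is small enough to model directly; VM_EXEC's numeric value below is assumed for illustration only.

```c
#include <stdio.h>

#define VM_EXEC 0x00000004UL	/* assumed bit value, illustration only */

/*
 * 1: give the page another round on the active list,
 * 0: demote it to the inactive list.
 * "referenced" stands for page_mapping_inuse() && page_referenced(),
 * "anon" for PageAnon().
 */
static int keep_active(int referenced, int anon, unsigned long vm_flags)
{
	return referenced && !anon && (vm_flags & VM_EXEC);
}

int main(void)
{
	/* referenced executable file page: protected */
	printf("%d\n", keep_active(1, 0, VM_EXEC));
	/* referenced anonymous page mapped VM_EXEC (e.g. a JIT heap): demoted */
	printf("%d\n", keep_active(1, 1, VM_EXEC));
	/* referenced plain file page: demoted, but still counted as rotated */
	printf("%d\n", keep_active(1, 0, 0));
	return 0;
}
```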
@@ -1348,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1348 return low; 1352 return low;
1349} 1353}
1350 1354
1355static int inactive_file_is_low_global(struct zone *zone)
1356{
1357 unsigned long active, inactive;
1358
1359 active = zone_page_state(zone, NR_ACTIVE_FILE);
1360 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1361
1362 return (active > inactive);
1363}
1364
1365/**
1366 * inactive_file_is_low - check if file pages need to be deactivated
1367 * @zone: zone to check
1368 * @sc: scan control of this context
1369 *
1370 * When the system is doing streaming IO, memory pressure here
1371 * ensures that active file pages get deactivated, until more
1372 * than half of the file pages are on the inactive list.
1373 *
1374 * Once we get to that situation, protect the system's working
1375 * set from being evicted by disabling active file page aging.
1376 *
1377 * This uses a different ratio than the anonymous pages, because
1378 * the page cache uses a use-once replacement algorithm.
1379 */
1380static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1381{
1382 int low;
1383
1384 if (scanning_global_lru(sc))
1385 low = inactive_file_is_low_global(zone);
1386 else
1387 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1388 return low;
1389}
1390
1351static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1391static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1352 struct zone *zone, struct scan_control *sc, int priority) 1392 struct zone *zone, struct scan_control *sc, int priority)
1353{ 1393{
1354 int file = is_file_lru(lru); 1394 int file = is_file_lru(lru);
1355 1395
1356 if (lru == LRU_ACTIVE_FILE) { 1396 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
1357 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1397 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1358 return 0; 1398 return 0;
1359 } 1399 }
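With the new inactive_file_is_low() check, shrink_list() deactivates active file pages only while the active file list is still larger than the inactive one; once the inactive list catches up, active file aging stops and the working set is left alone. A trivial standalone model of that cut-off:

```c
#include <stdio.h>

/* Model of inactive_file_is_low_global(): "low" means the inactive file
 * list is still smaller than the active one, so deactivation continues. */
static int inactive_file_is_low(unsigned long nr_active_file,
				unsigned long nr_inactive_file)
{
	return nr_active_file > nr_inactive_file;
}

int main(void)
{
	/* streaming IO phase: keep deactivating active file pages */
	printf("%d\n", inactive_file_is_low(800, 200));	/* 1 */
	/* inactive list has caught up: protect the working set */
	printf("%d\n", inactive_file_is_low(400, 600));	/* 0 */
	return 0;
}
```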
@@ -1382,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1382 unsigned long ap, fp; 1422 unsigned long ap, fp;
1383 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1423 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1384 1424
1385 /* If we have no swap space, do not bother scanning anon pages. */
1386 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1387 percent[0] = 0;
1388 percent[1] = 100;
1389 return;
1390 }
1391
1392 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1425 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1393 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1426 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1394 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1427 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1398,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1398 free = zone_page_state(zone, NR_FREE_PAGES); 1431 free = zone_page_state(zone, NR_FREE_PAGES);
1399 /* If we have very few page cache pages, 1432 /* If we have very few page cache pages,
1400 force-scan anon pages. */ 1433 force-scan anon pages. */
1401 if (unlikely(file + free <= zone->pages_high)) { 1434 if (unlikely(file + free <= high_wmark_pages(zone))) {
1402 percent[0] = 100; 1435 percent[0] = 100;
1403 percent[1] = 0; 1436 percent[1] = 0;
1404 return; 1437 return;
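From this point on the diff replaces direct reads of zone->pages_min/pages_low/pages_high with the min_wmark_pages()/low_wmark_pages()/high_wmark_pages() accessors. Their definitions belong to the corresponding include/linux/mmzone.h change, which is outside this diff; assuming the three fields were folded into a per-zone watermark array, the accessors would look roughly like the reconstruction below.

```c
#include <stdio.h>

/* Reconstruction only -- the real definitions live in include/linux/mmzone.h
 * and may differ in detail. */
enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

struct zone_sketch {
	unsigned long watermark[NR_WMARK];
	/* ... rest of struct zone ... */
};

#define min_wmark_pages(z)	((z)->watermark[WMARK_MIN])
#define low_wmark_pages(z)	((z)->watermark[WMARK_LOW])
#define high_wmark_pages(z)	((z)->watermark[WMARK_HIGH])

int main(void)
{
	struct zone_sketch z = { .watermark = { 32, 48, 64 } };

	printf("min %lu, low %lu, high %lu\n",
	       min_wmark_pages(&z), low_wmark_pages(&z), high_wmark_pages(&z));
	return 0;
}
```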
@@ -1453,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1453 percent[1] = 100 - percent[0]; 1486 percent[1] = 100 - percent[0];
1454} 1487}
1455 1488
1489/*
1490 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
 1491 * until we have collected @swap_cluster_max pages to scan.
1492 */
1493static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1494 unsigned long *nr_saved_scan,
1495 unsigned long swap_cluster_max)
1496{
1497 unsigned long nr;
1498
1499 *nr_saved_scan += nr_to_scan;
1500 nr = *nr_saved_scan;
1501
1502 if (nr >= swap_cluster_max)
1503 *nr_saved_scan = 0;
1504 else
1505 nr = 0;
1506
1507 return nr;
1508}
1456 1509
1457/* 1510/*
1458 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1511 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
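nr_scan_try_batch() defers small per-LRU scan requests until they add up to a full batch; the shrink_zone() hunk below feeds it through zone->lru[l].nr_saved_scan. The helper is self-contained, so it can be exercised directly; a swap_cluster_max of 32 is just an example value.

```c
#include <stdio.h>

/* Copy of the batching helper added above, usable standalone. */
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= swap_cluster_max)
		*nr_saved_scan = 0;
	else
		nr = 0;

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	unsigned long rounds[] = { 10, 10, 15, 5 };
	int i;

	/* With swap_cluster_max = 32, the small requests are deferred until
	 * the accumulated total crosses the threshold: 0, 0, 35, 0. */
	for (i = 0; i < 4; i++) {
		unsigned long scan = nr_scan_try_batch(rounds[i], &saved, 32);

		printf("round %d: scan %lu, carried over %lu\n", i, scan, saved);
	}
	return 0;
}
```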
@@ -1466,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone,
1466 enum lru_list l; 1519 enum lru_list l;
1467 unsigned long nr_reclaimed = sc->nr_reclaimed; 1520 unsigned long nr_reclaimed = sc->nr_reclaimed;
1468 unsigned long swap_cluster_max = sc->swap_cluster_max; 1521 unsigned long swap_cluster_max = sc->swap_cluster_max;
1522 int noswap = 0;
1469 1523
1470 get_scan_ratio(zone, sc, percent); 1524 /* If we have no swap space, do not bother scanning anon pages. */
1525 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1526 noswap = 1;
1527 percent[0] = 0;
1528 percent[1] = 100;
1529 } else
1530 get_scan_ratio(zone, sc, percent);
1471 1531
1472 for_each_evictable_lru(l) { 1532 for_each_evictable_lru(l) {
1473 int file = is_file_lru(l); 1533 int file = is_file_lru(l);
1474 int scan; 1534 unsigned long scan;
1475 1535
1476 scan = zone_nr_pages(zone, sc, l); 1536 scan = zone_nr_pages(zone, sc, l);
1477 if (priority) { 1537 if (priority || noswap) {
1478 scan >>= priority; 1538 scan >>= priority;
1479 scan = (scan * percent[file]) / 100; 1539 scan = (scan * percent[file]) / 100;
1480 } 1540 }
1481 if (scanning_global_lru(sc)) { 1541 if (scanning_global_lru(sc))
1482 zone->lru[l].nr_scan += scan; 1542 nr[l] = nr_scan_try_batch(scan,
1483 nr[l] = zone->lru[l].nr_scan; 1543 &zone->lru[l].nr_saved_scan,
1484 if (nr[l] >= swap_cluster_max) 1544 swap_cluster_max);
1485 zone->lru[l].nr_scan = 0; 1545 else
1486 else
1487 nr[l] = 0;
1488 } else
1489 nr[l] = scan; 1546 nr[l] = scan;
1490 } 1547 }
1491 1548
@@ -1519,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone,
1519 * Even if we did not try to evict anon pages at all, we want to 1576 * Even if we did not try to evict anon pages at all, we want to
1520 * rebalance the anon lru active/inactive ratio. 1577 * rebalance the anon lru active/inactive ratio.
1521 */ 1578 */
1522 if (inactive_anon_is_low(zone, sc)) 1579 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
1523 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1580 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1524 1581
1525 throttle_vm_writeout(sc->gfp_mask); 1582 throttle_vm_writeout(sc->gfp_mask);
@@ -1530,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone,
1530 * try to reclaim pages from zones which will satisfy the caller's allocation 1587 * try to reclaim pages from zones which will satisfy the caller's allocation
1531 * request. 1588 * request.
1532 * 1589 *
1533 * We reclaim from a zone even if that zone is over pages_high. Because: 1590 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
1591 * Because:
1534 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 1592 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1535 * allocation or 1593 * allocation or
1536 * b) The zones may be over pages_high but they must go *over* pages_high to 1594 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
1537 * satisfy the `incremental min' zone defense algorithm. 1595 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
1596 * zone defense algorithm.
1538 * 1597 *
1539 * If a zone is deemed to be full of pinned pages then just give it a light 1598 * If a zone is deemed to be full of pinned pages then just give it a light
1540 * scan then give up on it. 1599 * scan then give up on it.
@@ -1740,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1740 1799
1741/* 1800/*
1742 * For kswapd, balance_pgdat() will work across all this node's zones until 1801 * For kswapd, balance_pgdat() will work across all this node's zones until
1743 * they are all at pages_high. 1802 * they are all at high_wmark_pages(zone).
1744 * 1803 *
1745 * Returns the number of pages which were actually freed. 1804 * Returns the number of pages which were actually freed.
1746 * 1805 *
@@ -1753,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1753 * the zone for when the problem goes away. 1812 * the zone for when the problem goes away.
1754 * 1813 *
1755 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1814 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1756 * zones which have free_pages > pages_high, but once a zone is found to have 1815 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
1757 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1816 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
1758 * of the number of free pages in the lower zones. This interoperates with 1817 * lower zones regardless of the number of free pages in the lower zones. This
1759 * the page allocator fallback scheme to ensure that aging of pages is balanced 1818 * interoperates with the page allocator fallback scheme to ensure that aging
1760 * across the zones. 1819 * of pages is balanced across the zones.
1761 */ 1820 */
1762static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1821static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1763{ 1822{
@@ -1778,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1778 }; 1837 };
1779 /* 1838 /*
1780 * temp_priority is used to remember the scanning priority at which 1839 * temp_priority is used to remember the scanning priority at which
1781 * this zone was successfully refilled to free_pages == pages_high. 1840 * this zone was successfully refilled to
1841 * free_pages == high_wmark_pages(zone).
1782 */ 1842 */
1783 int temp_priority[MAX_NR_ZONES]; 1843 int temp_priority[MAX_NR_ZONES];
1784 1844
@@ -1823,8 +1883,8 @@ loop_again:
1823 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1883 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1824 &sc, priority, 0); 1884 &sc, priority, 0);
1825 1885
1826 if (!zone_watermark_ok(zone, order, zone->pages_high, 1886 if (!zone_watermark_ok(zone, order,
1827 0, 0)) { 1887 high_wmark_pages(zone), 0, 0)) {
1828 end_zone = i; 1888 end_zone = i;
1829 break; 1889 break;
1830 } 1890 }
@@ -1858,8 +1918,8 @@ loop_again:
1858 priority != DEF_PRIORITY) 1918 priority != DEF_PRIORITY)
1859 continue; 1919 continue;
1860 1920
1861 if (!zone_watermark_ok(zone, order, zone->pages_high, 1921 if (!zone_watermark_ok(zone, order,
1862 end_zone, 0)) 1922 high_wmark_pages(zone), end_zone, 0))
1863 all_zones_ok = 0; 1923 all_zones_ok = 0;
1864 temp_priority[i] = priority; 1924 temp_priority[i] = priority;
1865 sc.nr_scanned = 0; 1925 sc.nr_scanned = 0;
@@ -1868,8 +1928,8 @@ loop_again:
1868 * We put equal pressure on every zone, unless one 1928 * We put equal pressure on every zone, unless one
1869 * zone has way too many pages free already. 1929 * zone has way too many pages free already.
1870 */ 1930 */
1871 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1931 if (!zone_watermark_ok(zone, order,
1872 end_zone, 0)) 1932 8*high_wmark_pages(zone), end_zone, 0))
1873 shrink_zone(priority, zone, &sc); 1933 shrink_zone(priority, zone, &sc);
1874 reclaim_state->reclaimed_slab = 0; 1934 reclaim_state->reclaimed_slab = 0;
1875 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1935 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2035,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2035 return; 2095 return;
2036 2096
2037 pgdat = zone->zone_pgdat; 2097 pgdat = zone->zone_pgdat;
2038 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 2098 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2039 return; 2099 return;
2040 if (pgdat->kswapd_max_order < order) 2100 if (pgdat->kswapd_max_order < order)
2041 pgdat->kswapd_max_order = order; 2101 pgdat->kswapd_max_order = order;
@@ -2054,7 +2114,7 @@ unsigned long global_lru_pages(void)
2054 + global_page_state(NR_INACTIVE_FILE); 2114 + global_page_state(NR_INACTIVE_FILE);
2055} 2115}
2056 2116
2057#ifdef CONFIG_PM 2117#ifdef CONFIG_HIBERNATION
2058/* 2118/*
2059 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2119 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
2060 * from LRU lists system-wide, for given pass and priority. 2120 * from LRU lists system-wide, for given pass and priority.
@@ -2082,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2082 l == LRU_ACTIVE_FILE)) 2142 l == LRU_ACTIVE_FILE))
2083 continue; 2143 continue;
2084 2144
2085 zone->lru[l].nr_scan += (lru_pages >> prio) + 1; 2145 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
2086 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { 2146 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
2087 unsigned long nr_to_scan; 2147 unsigned long nr_to_scan;
2088 2148
2089 zone->lru[l].nr_scan = 0; 2149 zone->lru[l].nr_saved_scan = 0;
2090 nr_to_scan = min(nr_pages, lru_pages); 2150 nr_to_scan = min(nr_pages, lru_pages);
2091 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2151 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2092 sc, prio); 2152 sc, prio);
@@ -2194,7 +2254,7 @@ out:
2194 2254
2195 return sc.nr_reclaimed; 2255 return sc.nr_reclaimed;
2196} 2256}
2197#endif 2257#endif /* CONFIG_HIBERNATION */
2198 2258
2199/* It's optimal to keep kswapds on the same CPUs as their memory, but 2259/* It's optimal to keep kswapds on the same CPUs as their memory, but
2200 not required for correctness. So if the last cpu in a node goes 2260 not required for correctness. So if the last cpu in a node goes
@@ -2288,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1;
2288 */ 2348 */
2289int sysctl_min_slab_ratio = 5; 2349int sysctl_min_slab_ratio = 5;
2290 2350
2351static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2352{
2353 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2354 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2355 zone_page_state(zone, NR_ACTIVE_FILE);
2356
2357 /*
2358 * It's possible for there to be more file mapped pages than
2359 * accounted for by the pages on the file LRU lists because
2360 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
2361 */
2362 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2363}
2364
2365/* Work out how many page cache pages we can reclaim in this reclaim_mode */
2366static long zone_pagecache_reclaimable(struct zone *zone)
2367{
2368 long nr_pagecache_reclaimable;
2369 long delta = 0;
2370
2371 /*
2372 * If RECLAIM_SWAP is set, then all file pages are considered
2373 * potentially reclaimable. Otherwise, we have to worry about
 2374 * pages like swapcache, and zone_unmapped_file_pages() provides
 2375 * a better estimate.
2376 */
2377 if (zone_reclaim_mode & RECLAIM_SWAP)
2378 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2379 else
2380 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2381
2382 /* If we can't clean pages, remove dirty pages from consideration */
2383 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2384 delta += zone_page_state(zone, NR_FILE_DIRTY);
2385
2386 /* Watch for any possible underflows due to delta */
2387 if (unlikely(delta > nr_pagecache_reclaimable))
2388 delta = nr_pagecache_reclaimable;
2389
2390 return nr_pagecache_reclaimable - delta;
2391}
2392
2291/* 2393/*
2292 * Try to free up some pages from this zone through reclaim. 2394 * Try to free up some pages from this zone through reclaim.
2293 */ 2395 */
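zone_pagecache_reclaimable() above estimates how much page cache this reclaim mode is actually allowed to touch: all file pages if RECLAIM_SWAP is set, otherwise only the unmapped part of the file LRU, minus dirty pages when RECLAIM_WRITE is clear. The arithmetic is easy to check in isolation; the model below feeds it hypothetical counters instead of zone_page_state(), and the RECLAIM_* bit values are assumptions for illustration only.

```c
#include <stdio.h>

#define RECLAIM_WRITE	(1 << 1)	/* assumed bit values, illustration only */
#define RECLAIM_SWAP	(1 << 2)

static long pagecache_reclaimable(int reclaim_mode,
				  long nr_file_pages, long nr_file_lru,
				  long nr_file_mapped, long nr_file_dirty)
{
	long reclaimable, delta = 0;
	long unmapped = (nr_file_lru > nr_file_mapped) ?
			nr_file_lru - nr_file_mapped : 0;

	/* with RECLAIM_SWAP every file page is fair game; otherwise only
	 * the unmapped portion of the file LRU is counted */
	reclaimable = (reclaim_mode & RECLAIM_SWAP) ? nr_file_pages : unmapped;

	/* if we may not write pages back, dirty ones do not count */
	if (!(reclaim_mode & RECLAIM_WRITE))
		delta += nr_file_dirty;

	if (delta > reclaimable)	/* underflow guard, as in the original */
		delta = reclaimable;

	return reclaimable - delta;
}

int main(void)
{
	/* 1200 file pages total, 1000 on the file LRU, 300 mapped, 150 dirty */
	printf("conservative mode: %ld\n",
	       pagecache_reclaimable(0, 1200, 1000, 300, 150));		/* 550 */
	printf("aggressive mode:   %ld\n",
	       pagecache_reclaimable(RECLAIM_SWAP | RECLAIM_WRITE,
				     1200, 1000, 300, 150));		/* 1200 */
	return 0;
}
```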
@@ -2322,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2322 reclaim_state.reclaimed_slab = 0; 2424 reclaim_state.reclaimed_slab = 0;
2323 p->reclaim_state = &reclaim_state; 2425 p->reclaim_state = &reclaim_state;
2324 2426
2325 if (zone_page_state(zone, NR_FILE_PAGES) - 2427 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2326 zone_page_state(zone, NR_FILE_MAPPED) >
2327 zone->min_unmapped_pages) {
2328 /* 2428 /*
2329 * Free memory by calling shrink zone with increasing 2429 * Free memory by calling shrink zone with increasing
2330 * priorities until we have enough memory freed. 2430 * priorities until we have enough memory freed.
@@ -2382,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2382 * if less than a specified percentage of the zone is used by 2482 * if less than a specified percentage of the zone is used by
2383 * unmapped file backed pages. 2483 * unmapped file backed pages.
2384 */ 2484 */
2385 if (zone_page_state(zone, NR_FILE_PAGES) - 2485 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
2386 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages 2486 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2387 && zone_page_state(zone, NR_SLAB_RECLAIMABLE) 2487 return ZONE_RECLAIM_FULL;
2388 <= zone->min_slab_pages)
2389 return 0;
2390 2488
2391 if (zone_is_all_unreclaimable(zone)) 2489 if (zone_is_all_unreclaimable(zone))
2392 return 0; 2490 return ZONE_RECLAIM_FULL;
2393 2491
2394 /* 2492 /*
2395 * Do not scan if the allocation should not be delayed. 2493 * Do not scan if the allocation should not be delayed.
2396 */ 2494 */
2397 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 2495 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2398 return 0; 2496 return ZONE_RECLAIM_NOSCAN;
2399 2497
2400 /* 2498 /*
2401 * Only run zone reclaim on the local zone or on zones that do not 2499 * Only run zone reclaim on the local zone or on zones that do not
@@ -2405,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2405 */ 2503 */
2406 node_id = zone_to_nid(zone); 2504 node_id = zone_to_nid(zone);
2407 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 2505 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2408 return 0; 2506 return ZONE_RECLAIM_NOSCAN;
2409 2507
2410 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 2508 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2411 return 0; 2509 return ZONE_RECLAIM_NOSCAN;
2510
2412 ret = __zone_reclaim(zone, gfp_mask, order); 2511 ret = __zone_reclaim(zone, gfp_mask, order);
2413 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 2512 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2414 2513
2514 if (!ret)
2515 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
2516
2415 return ret; 2517 return ret;
2416} 2518}
2417#endif 2519#endif
2418 2520
2419#ifdef CONFIG_UNEVICTABLE_LRU
2420/* 2521/*
2421 * page_evictable - test whether a page is evictable 2522 * page_evictable - test whether a page is evictable
2422 * @page: the page to test 2523 * @page: the page to test
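zone_reclaim() now returns distinct ZONE_RECLAIM_* codes instead of a bare 0, so the page allocator can tell "no scan was attempted" apart from "scanned and found nothing", and the failed case is counted via PGSCAN_ZONE_RECLAIM_FAILED. The constants themselves are introduced elsewhere in the series (mm/internal.h) and are not visible in this hunk; the values and the caller-side branching below are assumptions based on how the return sites read.

```c
#include <stdio.h>

/* Assumed values -- the authoritative definitions were added to mm/internal.h
 * by the same patch series and are not visible in this diff. */
#define ZONE_RECLAIM_NOSCAN	-2	/* reclaim was not attempted */
#define ZONE_RECLAIM_FULL	-1	/* zone scanned, nothing reclaimable */
#define ZONE_RECLAIM_SOME	0	/* reclaimed some pages, not enough */
#define ZONE_RECLAIM_SUCCESS	1	/* reclaimed enough for the allocation */

int main(void)
{
	int ret = ZONE_RECLAIM_NOSCAN;	/* pretend zone_reclaim() bailed early */

	/* rough shape of how an allocator-side caller could branch */
	switch (ret) {
	case ZONE_RECLAIM_NOSCAN:
		printf("no scan happened, move on to the next zone\n");
		break;
	case ZONE_RECLAIM_FULL:
		printf("zone is effectively full, remember that\n");
		break;
	default:
		printf("recheck the watermark before giving up on the zone\n");
		break;
	}
	return 0;
}
```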
@@ -2663,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node)
2663 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 2764 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2664} 2765}
2665 2766
2666#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 66f6130976cb..138bed53706e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
509 continue; 509 continue;
510 510
511 page = pfn_to_page(pfn); 511 page = pfn_to_page(pfn);
512#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES 512
513 /* 513 /* Watch for unexpected holes punched in the memmap */
514 * Ordinarily, memory holes in flatmem still have a valid 514 if (!memmap_valid_within(pfn, page, zone))
515 * memmap for the PFN range. However, an architecture for
516 * embedded systems (e.g. ARM) can free up the memmap backing
517 * holes to save memory on the assumption the memmap is
518 * never used. The page_zone linkages are then broken even
519 * though pfn_valid() returns true. Skip the page if the
520 * linkages are broken. Even if this test passed, the impact
521 * is that the counters for the movable type are off but
522 * fragmentation monitoring is likely meaningless on small
523 * systems.
524 */
525 if (page_zone(page) != zone)
526 continue; 515 continue;
527#endif 516
528 mtype = get_pageblock_migratetype(page); 517 mtype = get_pageblock_migratetype(page);
529 518
530 if (mtype < MIGRATE_TYPES) 519 if (mtype < MIGRATE_TYPES)
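memmap_valid_within() replaces the open-coded FLATMEM check above: as the removed comment explains, a hole punched in the memmap leaves struct page entries whose pfn/zone linkage no longer matches the pfn they were looked up by, even though pfn_valid() still returns true. The model below captures only that linkage check as the comment describes it; the real helper operates on struct page and lives elsewhere in mm/.

```c
#include <stdio.h>

/* Toy stand-in for struct page, for illustration only. */
struct page_model {
	unsigned long pfn;	/* what page_to_pfn() would return */
	int zone_id;		/* what page_zone() would identify */
};

/* 1 if the memmap entry still maps back to the pfn/zone it was derived
 * from, 0 if a freed memmap hole left stale linkage behind. */
static int memmap_valid_within_model(unsigned long pfn,
				     const struct page_model *page, int zone_id)
{
	return page->pfn == pfn && page->zone_id == zone_id;
}

int main(void)
{
	struct page_model good = { .pfn = 4096, .zone_id = 1 };
	struct page_model stale = { .pfn = 0, .zone_id = 0 };

	printf("%d\n", memmap_valid_within_model(4096, &good, 1));	/* 1: count it */
	printf("%d\n", memmap_valid_within_model(4096, &stale, 1));	/* 0: skip pfn */
	return 0;
}
```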
@@ -640,10 +629,8 @@ static const char * const vmstat_text[] = {
640 "nr_active_anon", 629 "nr_active_anon",
641 "nr_inactive_file", 630 "nr_inactive_file",
642 "nr_active_file", 631 "nr_active_file",
643#ifdef CONFIG_UNEVICTABLE_LRU
644 "nr_unevictable", 632 "nr_unevictable",
645 "nr_mlock", 633 "nr_mlock",
646#endif
647 "nr_anon_pages", 634 "nr_anon_pages",
648 "nr_mapped", 635 "nr_mapped",
649 "nr_file_pages", 636 "nr_file_pages",
@@ -686,6 +673,9 @@ static const char * const vmstat_text[] = {
686 TEXTS_FOR_ZONES("pgscan_kswapd") 673 TEXTS_FOR_ZONES("pgscan_kswapd")
687 TEXTS_FOR_ZONES("pgscan_direct") 674 TEXTS_FOR_ZONES("pgscan_direct")
688 675
676#ifdef CONFIG_NUMA
677 "zone_reclaim_failed",
678#endif
689 "pginodesteal", 679 "pginodesteal",
690 "slabs_scanned", 680 "slabs_scanned",
691 "kswapd_steal", 681 "kswapd_steal",
@@ -698,7 +688,6 @@ static const char * const vmstat_text[] = {
698 "htlb_buddy_alloc_success", 688 "htlb_buddy_alloc_success",
699 "htlb_buddy_alloc_fail", 689 "htlb_buddy_alloc_fail",
700#endif 690#endif
701#ifdef CONFIG_UNEVICTABLE_LRU
702 "unevictable_pgs_culled", 691 "unevictable_pgs_culled",
703 "unevictable_pgs_scanned", 692 "unevictable_pgs_scanned",
704 "unevictable_pgs_rescued", 693 "unevictable_pgs_rescued",
@@ -708,7 +697,6 @@ static const char * const vmstat_text[] = {
708 "unevictable_pgs_stranded", 697 "unevictable_pgs_stranded",
709 "unevictable_pgs_mlockfreed", 698 "unevictable_pgs_mlockfreed",
710#endif 699#endif
711#endif
712}; 700};
713 701
714static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 702static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -721,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
721 "\n min %lu" 709 "\n min %lu"
722 "\n low %lu" 710 "\n low %lu"
723 "\n high %lu" 711 "\n high %lu"
724 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" 712 "\n scanned %lu"
725 "\n spanned %lu" 713 "\n spanned %lu"
726 "\n present %lu", 714 "\n present %lu",
727 zone_page_state(zone, NR_FREE_PAGES), 715 zone_page_state(zone, NR_FREE_PAGES),
728 zone->pages_min, 716 min_wmark_pages(zone),
729 zone->pages_low, 717 low_wmark_pages(zone),
730 zone->pages_high, 718 high_wmark_pages(zone),
731 zone->pages_scanned, 719 zone->pages_scanned,
732 zone->lru[LRU_ACTIVE_ANON].nr_scan,
733 zone->lru[LRU_INACTIVE_ANON].nr_scan,
734 zone->lru[LRU_ACTIVE_FILE].nr_scan,
735 zone->lru[LRU_INACTIVE_FILE].nr_scan,
736 zone->spanned_pages, 720 zone->spanned_pages,
737 zone->present_pages); 721 zone->present_pages);
738 722