aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2009-09-08 20:55:21 -0400
committerDan Williams <dan.j.williams@intel.com>2009-09-08 20:55:21 -0400
commitbbb20089a3275a19e475dbc21320c3742e3ca423 (patch)
tree216fdc1cbef450ca688135c5b8969169482d9a48 /mm
parent3e48e656903e9fd8bc805c6a2c4264d7808d315b (diff)
parent657a77fa7284d8ae28dfa48f1dc5d919bf5b2843 (diff)
Merge branch 'dmaengine' into async-tx-next
Conflicts: crypto/async_tx/async_xor.c drivers/dma/ioat/dma_v2.h drivers/dma/ioat/pci.c drivers/md/raid5.c
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig37
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/Makefile4
-rw-r--r--mm/bootmem.c26
-rw-r--r--mm/bounce.c10
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c169
-rw-r--r--mm/highmem.c1
-rw-r--r--mm/hugetlb.c123
-rw-r--r--mm/init-mm.c20
-rw-r--r--mm/internal.h33
-rw-r--r--mm/kmemcheck.c122
-rw-r--r--mm/kmemleak-test.c111
-rw-r--r--mm/kmemleak.c1497
-rw-r--r--mm/maccess.c2
-rw-r--r--mm/madvise.c26
-rw-r--r--mm/memcontrol.c137
-rw-r--r--mm/memory.c178
-rw-r--r--mm/memory_hotplug.c6
-rw-r--r--mm/mempolicy.c145
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mlock.c73
-rw-r--r--mm/mmap.c8
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/nommu.c3
-rw-r--r--mm/oom_kill.c64
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c860
-rw-r--r--mm/page_cgroup.c41
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/percpu.c141
-rw-r--r--mm/readahead.c145
-rw-r--r--mm/rmap.c45
-rw-r--r--mm/shmem.c12
-rw-r--r--mm/shmem_acl.c29
-rw-r--r--mm/slab.c269
-rw-r--r--mm/slob.c19
-rw-r--r--mm/slub.c181
-rw-r--r--mm/swap_state.c17
-rw-r--r--mm/swapfile.c284
-rw-r--r--mm/thrash.c32
-rw-r--r--mm/truncate.c39
-rw-r--r--mm/util.c31
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmscan.c380
-rw-r--r--mm/vmstat.c19
46 files changed, 4006 insertions, 1398 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2b57d81e153..c948d4ca8bde 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP
128config MEMORY_HOTPLUG 128config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH || S390) 132 depends on (IA64 || X86 || PPC64 || SUPERH || S390)
133 133
134comment "Memory hotplug is currently incompatible with Software Suspend" 134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION 135 depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
136 136
137config MEMORY_HOTPLUG_SPARSE 137config MEMORY_HOTPLUG_SPARSE
138 def_bool y 138 def_bool y
@@ -203,29 +203,36 @@ config VIRT_TO_BUS
203 def_bool y 203 def_bool y
204 depends on !ARCH_NO_VIRT_TO_BUS 204 depends on !ARCH_NO_VIRT_TO_BUS
205 205
206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages"
208 default y
209 help
210 Keeps unevictable pages off of the active and inactive pageout
211 lists, so kswapd will not waste CPU time or have its balancing
212 algorithms thrown off by scanning these pages. Selecting this
213 will use one page flag and increase the code size a little,
214 say Y unless you know what you are doing.
215
216 See Documentation/vm/unevictable-lru.txt for more information.
217
218config HAVE_MLOCK 206config HAVE_MLOCK
219 bool 207 bool
220 default y if MMU=y 208 default y if MMU=y
221 209
222config HAVE_MLOCKED_PAGE_BIT 210config HAVE_MLOCKED_PAGE_BIT
223 bool 211 bool
224 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y 212 default y if HAVE_MLOCK=y
225 213
226config MMU_NOTIFIER 214config MMU_NOTIFIER
227 bool 215 bool
228 216
217config DEFAULT_MMAP_MIN_ADDR
218 int "Low address space to protect from user allocation"
219 default 4096
220 help
221 This is the portion of low virtual memory which should be protected
222 from userspace allocation. Keeping a user from writing to low pages
223 can help reduce the impact of kernel NULL pointer bugs.
224
225 For most ia64, ppc64 and x86 users with lots of address space
226 a value of 65536 is reasonable and should cause no problems.
227 On arm and other archs it should not be higher than 32768.
228 Programs which use vm86 functionality would either need additional
229 permissions from either the LSM or the capabilities module or have
230 this protection disabled.
231
232 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable.
234
235
229config NOMMU_INITIAL_TRIM_EXCESS 236config NOMMU_INITIAL_TRIM_EXCESS
230 int "Turn on mmap() excess space trimming before booting" 237 int "Turn on mmap() excess space trimming before booting"
231 depends on !MMU 238 depends on !MMU
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f260..aa99fd1f7109 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || !PPC && !SPARC
5 depends on !KMEMCHECK
5 ---help--- 6 ---help---
6 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b6015..5e0bd6426693 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15obj-y += init-mm.o
15 16
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
31obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
30obj-$(CONFIG_FAILSLAB) += failslab.o 32obj-$(CONFIG_FAILSLAB) += failslab.o
31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
32obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
@@ -38,3 +40,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o
38endif 40endif
39obj-$(CONFIG_QUICKLIST) += quicklist.o 41obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
43obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
44obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de..d2a9ce952768 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,12 +532,19 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 unsigned long size, unsigned long align, 532 unsigned long size, unsigned long align,
533 unsigned long goal, unsigned long limit) 533 unsigned long goal, unsigned long limit)
534{ 534{
535#ifdef CONFIG_HAVE_ARCH_BOOTMEM 535 if (WARN_ON_ONCE(slab_is_available()))
536 bootmem_data_t *p_bdata; 536 return kzalloc(size, GFP_NOWAIT);
537 537
538 p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); 538#ifdef CONFIG_HAVE_ARCH_BOOTMEM
539 if (p_bdata) 539 {
540 return alloc_bootmem_core(p_bdata, size, align, goal, limit); 540 bootmem_data_t *p_bdata;
541
542 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
543 goal, limit);
544 if (p_bdata)
545 return alloc_bootmem_core(p_bdata, size, align,
546 goal, limit);
547 }
541#endif 548#endif
542 return NULL; 549 return NULL;
543} 550}
@@ -662,6 +669,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
662void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 669void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
663 unsigned long align, unsigned long goal) 670 unsigned long align, unsigned long goal)
664{ 671{
672 if (WARN_ON_ONCE(slab_is_available()))
673 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
674
665 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 675 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
666} 676}
667 677
@@ -693,6 +703,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
693{ 703{
694 void *ptr; 704 void *ptr;
695 705
706 if (WARN_ON_ONCE(slab_is_available()))
707 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
708
696 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 709 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
697 if (ptr) 710 if (ptr)
698 return ptr; 711 return ptr;
@@ -745,6 +758,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
745void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 758void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
746 unsigned long align, unsigned long goal) 759 unsigned long align, unsigned long goal)
747{ 760{
761 if (WARN_ON_ONCE(slab_is_available()))
762 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
763
748 return ___alloc_bootmem_node(pgdat->bdata, size, align, 764 return ___alloc_bootmem_node(pgdat->bdata, size, align,
749 goal, ARCH_LOW_ADDRESS_LIMIT); 765 goal, ARCH_LOW_ADDRESS_LIMIT);
750} 766}
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,17 +13,15 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/blktrace_api.h>
17#include <trace/block.h>
18#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
19 17
18#include <trace/events/block.h>
19
20#define POOL_SIZE 64 20#define POOL_SIZE 64
21#define ISA_POOL_SIZE 16 21#define ISA_POOL_SIZE 16
22 22
23static mempool_t *page_pool, *isa_page_pool; 23static mempool_t *page_pool, *isa_page_pool;
24 24
25DEFINE_TRACE(block_bio_bounce);
26
27#ifdef CONFIG_HIGHMEM 25#ifdef CONFIG_HIGHMEM
28static __init int init_emergency_pool(void) 26static __init int init_emergency_pool(void)
29{ 27{
@@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
192 /* 190 /*
193 * is destination page below bounce pfn? 191 * is destination page below bounce pfn?
194 */ 192 */
195 if (page_to_pfn(page) <= q->bounce_pfn) 193 if (page_to_pfn(page) <= queue_bounce_pfn(q))
196 continue; 194 continue;
197 195
198 /* 196 /*
@@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
284 * don't waste time iterating over bio segments 282 * don't waste time iterating over bio segments
285 */ 283 */
286 if (!(q->bounce_gfp & GFP_DMA)) { 284 if (!(q->bounce_gfp & GFP_DMA)) {
287 if (q->bounce_pfn >= blk_max_pfn) 285 if (queue_bounce_pfn(q) >= blk_max_pfn)
288 return; 286 return;
289 pool = page_pool; 287 pool = page_pool;
290 } else { 288 } else {
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
101 101
102 ret = force_page_cache_readahead(mapping, file, 102 ret = force_page_cache_readahead(mapping, file,
103 start_index, 103 start_index,
104 max_sane_readahead(nrpages)); 104 nrpages);
105 if (ret > 0) 105 if (ret > 0)
106 ret = 0; 106 ret = 0;
107 break; 107 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1b60f30cebfa..22396713feb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
521{ 521{
522 if (cpuset_do_page_mem_spread()) { 522 if (cpuset_do_page_mem_spread()) {
523 int n = cpuset_mem_spread_node(); 523 int n = cpuset_mem_spread_node();
524 return alloc_pages_node(n, gfp, 0); 524 return alloc_pages_exact_node(n, gfp, 0);
525 } 525 }
526 return alloc_pages(gfp, 0); 526 return alloc_pages(gfp, 0);
527} 527}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
1004static void shrink_readahead_size_eio(struct file *filp, 1004static void shrink_readahead_size_eio(struct file *filp,
1005 struct file_ra_state *ra) 1005 struct file_ra_state *ra)
1006{ 1006{
1007 if (!ra->ra_pages)
1008 return;
1009
1010 ra->ra_pages /= 4; 1007 ra->ra_pages /= 4;
1011} 1008}
1012 1009
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1390 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1387 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1391 return -EINVAL; 1388 return -EINVAL;
1392 1389
1393 force_page_cache_readahead(mapping, filp, index, 1390 force_page_cache_readahead(mapping, filp, index, nr);
1394 max_sane_readahead(nr));
1395 return 0; 1391 return 0;
1396} 1392}
1397 1393
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1457 1453
1458#define MMAP_LOTSAMISS (100) 1454#define MMAP_LOTSAMISS (100)
1459 1455
1456/*
1457 * Synchronous readahead happens when we don't even find
1458 * a page in the page cache at all.
1459 */
1460static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1461 struct file_ra_state *ra,
1462 struct file *file,
1463 pgoff_t offset)
1464{
1465 unsigned long ra_pages;
1466 struct address_space *mapping = file->f_mapping;
1467
1468 /* If we don't want any read-ahead, don't bother */
1469 if (VM_RandomReadHint(vma))
1470 return;
1471
1472 if (VM_SequentialReadHint(vma) ||
1473 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1474 page_cache_sync_readahead(mapping, ra, file, offset,
1475 ra->ra_pages);
1476 return;
1477 }
1478
1479 if (ra->mmap_miss < INT_MAX)
1480 ra->mmap_miss++;
1481
1482 /*
1483 * Do we miss much more than hit in this file? If so,
1484 * stop bothering with read-ahead. It will only hurt.
1485 */
1486 if (ra->mmap_miss > MMAP_LOTSAMISS)
1487 return;
1488
1489 /*
1490 * mmap read-around
1491 */
1492 ra_pages = max_sane_readahead(ra->ra_pages);
1493 if (ra_pages) {
1494 ra->start = max_t(long, 0, offset - ra_pages/2);
1495 ra->size = ra_pages;
1496 ra->async_size = 0;
1497 ra_submit(ra, mapping, file);
1498 }
1499}
1500
1501/*
1502 * Asynchronous readahead happens when we find the page and PG_readahead,
1503 * so we want to possibly extend the readahead further..
1504 */
1505static void do_async_mmap_readahead(struct vm_area_struct *vma,
1506 struct file_ra_state *ra,
1507 struct file *file,
1508 struct page *page,
1509 pgoff_t offset)
1510{
1511 struct address_space *mapping = file->f_mapping;
1512
1513 /* If we don't want any read-ahead, don't bother */
1514 if (VM_RandomReadHint(vma))
1515 return;
1516 if (ra->mmap_miss > 0)
1517 ra->mmap_miss--;
1518 if (PageReadahead(page))
1519 page_cache_async_readahead(mapping, ra, file,
1520 page, offset, ra->ra_pages);
1521}
1522
1460/** 1523/**
1461 * filemap_fault - read in file data for page fault handling 1524 * filemap_fault - read in file data for page fault handling
1462 * @vma: vma in which the fault was taken 1525 * @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1476 struct address_space *mapping = file->f_mapping; 1539 struct address_space *mapping = file->f_mapping;
1477 struct file_ra_state *ra = &file->f_ra; 1540 struct file_ra_state *ra = &file->f_ra;
1478 struct inode *inode = mapping->host; 1541 struct inode *inode = mapping->host;
1542 pgoff_t offset = vmf->pgoff;
1479 struct page *page; 1543 struct page *page;
1480 pgoff_t size; 1544 pgoff_t size;
1481 int did_readaround = 0;
1482 int ret = 0; 1545 int ret = 0;
1483 1546
1484 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1547 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1485 if (vmf->pgoff >= size) 1548 if (offset >= size)
1486 return VM_FAULT_SIGBUS; 1549 return VM_FAULT_SIGBUS;
1487 1550
1488 /* If we don't want any read-ahead, don't bother */
1489 if (VM_RandomReadHint(vma))
1490 goto no_cached_page;
1491
1492 /* 1551 /*
1493 * Do we have something in the page cache already? 1552 * Do we have something in the page cache already?
1494 */ 1553 */
1495retry_find: 1554 page = find_get_page(mapping, offset);
1496 page = find_lock_page(mapping, vmf->pgoff); 1555 if (likely(page)) {
1497 /*
1498 * For sequential accesses, we use the generic readahead logic.
1499 */
1500 if (VM_SequentialReadHint(vma)) {
1501 if (!page) {
1502 page_cache_sync_readahead(mapping, ra, file,
1503 vmf->pgoff, 1);
1504 page = find_lock_page(mapping, vmf->pgoff);
1505 if (!page)
1506 goto no_cached_page;
1507 }
1508 if (PageReadahead(page)) {
1509 page_cache_async_readahead(mapping, ra, file, page,
1510 vmf->pgoff, 1);
1511 }
1512 }
1513
1514 if (!page) {
1515 unsigned long ra_pages;
1516
1517 ra->mmap_miss++;
1518
1519 /* 1556 /*
1520 * Do we miss much more than hit in this file? If so, 1557 * We found the page, so try async readahead before
1521 * stop bothering with read-ahead. It will only hurt. 1558 * waiting for the lock.
1522 */ 1559 */
1523 if (ra->mmap_miss > MMAP_LOTSAMISS) 1560 do_async_mmap_readahead(vma, ra, file, page, offset);
1524 goto no_cached_page; 1561 lock_page(page);
1525 1562
1526 /* 1563 /* Did it get truncated? */
1527 * To keep the pgmajfault counter straight, we need to 1564 if (unlikely(page->mapping != mapping)) {
1528 * check did_readaround, as this is an inner loop. 1565 unlock_page(page);
1529 */ 1566 put_page(page);
1530 if (!did_readaround) { 1567 goto no_cached_page;
1531 ret = VM_FAULT_MAJOR;
1532 count_vm_event(PGMAJFAULT);
1533 }
1534 did_readaround = 1;
1535 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1536 if (ra_pages) {
1537 pgoff_t start = 0;
1538
1539 if (vmf->pgoff > ra_pages / 2)
1540 start = vmf->pgoff - ra_pages / 2;
1541 do_page_cache_readahead(mapping, file, start, ra_pages);
1542 } 1568 }
1543 page = find_lock_page(mapping, vmf->pgoff); 1569 } else {
1570 /* No page in the page cache at all */
1571 do_sync_mmap_readahead(vma, ra, file, offset);
1572 count_vm_event(PGMAJFAULT);
1573 ret = VM_FAULT_MAJOR;
1574retry_find:
1575 page = find_lock_page(mapping, offset);
1544 if (!page) 1576 if (!page)
1545 goto no_cached_page; 1577 goto no_cached_page;
1546 } 1578 }
1547 1579
1548 if (!did_readaround)
1549 ra->mmap_miss--;
1550
1551 /* 1580 /*
1552 * We have a locked page in the page cache, now we need to check 1581 * We have a locked page in the page cache, now we need to check
1553 * that it's up-to-date. If not, it is going to be due to an error. 1582 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
1555 if (unlikely(!PageUptodate(page))) 1584 if (unlikely(!PageUptodate(page)))
1556 goto page_not_uptodate; 1585 goto page_not_uptodate;
1557 1586
1558 /* Must recheck i_size under page lock */ 1587 /*
1588 * Found the page and have a reference on it.
1589 * We must recheck i_size under page lock.
1590 */
1559 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1591 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1560 if (unlikely(vmf->pgoff >= size)) { 1592 if (unlikely(offset >= size)) {
1561 unlock_page(page); 1593 unlock_page(page);
1562 page_cache_release(page); 1594 page_cache_release(page);
1563 return VM_FAULT_SIGBUS; 1595 return VM_FAULT_SIGBUS;
1564 } 1596 }
1565 1597
1566 /* 1598 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1567 * Found the page and have a reference on it.
1568 */
1569 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1570 vmf->page = page; 1599 vmf->page = page;
1571 return ret | VM_FAULT_LOCKED; 1600 return ret | VM_FAULT_LOCKED;
1572 1601
@@ -1575,7 +1604,7 @@ no_cached_page:
1575 * We're only likely to ever get here if MADV_RANDOM is in 1604 * We're only likely to ever get here if MADV_RANDOM is in
1576 * effect. 1605 * effect.
1577 */ 1606 */
1578 error = page_cache_read(file, vmf->pgoff); 1607 error = page_cache_read(file, offset);
1579 1608
1580 /* 1609 /*
1581 * The page we want has now been added to the page cache. 1610 * The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
1595 return VM_FAULT_SIGBUS; 1624 return VM_FAULT_SIGBUS;
1596 1625
1597page_not_uptodate: 1626page_not_uptodate:
1598 /* IO error path */
1599 if (!did_readaround) {
1600 ret = VM_FAULT_MAJOR;
1601 count_vm_event(PGMAJFAULT);
1602 }
1603
1604 /* 1627 /*
1605 * Umm, take care of errors if the page isn't up-to-date. 1628 * Umm, take care of errors if the page isn't up-to-date.
1606 * Try to re-read it _once_. We do this synchronously, 1629 * Try to re-read it _once_. We do this synchronously,
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..25878cc49daa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/blktrace_api.h>
30#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
31 30
32/* 31/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e83ad2c9228c..d0351e31f474 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
578 hugetlb_put_quota(mapping, 1); 578 hugetlb_put_quota(mapping, 1);
579} 579}
580 580
581/*
582 * Increment or decrement surplus_huge_pages. Keep node-specific counters
583 * balanced by operating on them in a round-robin fashion.
584 * Returns 1 if an adjustment was made.
585 */
586static int adjust_pool_surplus(struct hstate *h, int delta)
587{
588 static int prev_nid;
589 int nid = prev_nid;
590 int ret = 0;
591
592 VM_BUG_ON(delta != -1 && delta != 1);
593 do {
594 nid = next_node(nid, node_online_map);
595 if (nid == MAX_NUMNODES)
596 nid = first_node(node_online_map);
597
598 /* To shrink on this node, there must be a surplus page */
599 if (delta < 0 && !h->surplus_huge_pages_node[nid])
600 continue;
601 /* Surplus cannot exceed the total number of pages */
602 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
603 h->nr_huge_pages_node[nid])
604 continue;
605
606 h->surplus_huge_pages += delta;
607 h->surplus_huge_pages_node[nid] += delta;
608 ret = 1;
609 break;
610 } while (nid != prev_nid);
611
612 prev_nid = nid;
613 return ret;
614}
615
616static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 581static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
617{ 582{
618 set_compound_page_dtor(page, free_huge_page); 583 set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
623 put_page(page); /* free it into the hugepage allocator */ 588 put_page(page); /* free it into the hugepage allocator */
624} 589}
625 590
591static void prep_compound_gigantic_page(struct page *page, unsigned long order)
592{
593 int i;
594 int nr_pages = 1 << order;
595 struct page *p = page + 1;
596
597 /* we rely on prep_new_huge_page to set the destructor */
598 set_compound_order(page, order);
599 __SetPageHead(page);
600 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
601 __SetPageTail(p);
602 p->first_page = page;
603 }
604}
605
606int PageHuge(struct page *page)
607{
608 compound_page_dtor *dtor;
609
610 if (!PageCompound(page))
611 return 0;
612
613 page = compound_head(page);
614 dtor = get_compound_page_dtor(page);
615
616 return dtor == free_huge_page;
617}
618
626static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 619static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
627{ 620{
628 struct page *page; 621 struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
630 if (h->order >= MAX_ORDER) 623 if (h->order >= MAX_ORDER)
631 return NULL; 624 return NULL;
632 625
633 page = alloc_pages_node(nid, 626 page = alloc_pages_exact_node(nid,
634 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 627 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
635 __GFP_REPEAT|__GFP_NOWARN, 628 __GFP_REPEAT|__GFP_NOWARN,
636 huge_page_order(h)); 629 huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 * Use a helper variable to find the next node and then 642 * Use a helper variable to find the next node and then
650 * copy it back to hugetlb_next_nid afterwards: 643 * copy it back to hugetlb_next_nid afterwards:
651 * otherwise there's a window in which a racer might 644 * otherwise there's a window in which a racer might
652 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 645 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
653 * But we don't need to use a spin_lock here: it really 646 * But we don't need to use a spin_lock here: it really
654 * doesn't matter if occasionally a racer chooses the 647 * doesn't matter if occasionally a racer chooses the
655 * same nid as we do. Move nid forward in the mask even 648 * same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
875 * can no longer free unreserved surplus pages. This occurs when 868 * can no longer free unreserved surplus pages. This occurs when
876 * the nodes with surplus pages have no free pages. 869 * the nodes with surplus pages have no free pages.
877 */ 870 */
878 unsigned long remaining_iterations = num_online_nodes(); 871 unsigned long remaining_iterations = nr_online_nodes;
879 872
880 /* Uncommit the reservation */ 873 /* Uncommit the reservation */
881 h->resv_huge_pages -= unused_resv_pages; 874 h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
904 h->surplus_huge_pages--; 897 h->surplus_huge_pages--;
905 h->surplus_huge_pages_node[nid]--; 898 h->surplus_huge_pages_node[nid]--;
906 nr_pages--; 899 nr_pages--;
907 remaining_iterations = num_online_nodes(); 900 remaining_iterations = nr_online_nodes;
908 } 901 }
909 } 902 }
910} 903}
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140} 1133}
1141#endif 1134#endif
1142 1135
1136/*
1137 * Increment or decrement surplus_huge_pages. Keep node-specific counters
1138 * balanced by operating on them in a round-robin fashion.
1139 * Returns 1 if an adjustment was made.
1140 */
1141static int adjust_pool_surplus(struct hstate *h, int delta)
1142{
1143 static int prev_nid;
1144 int nid = prev_nid;
1145 int ret = 0;
1146
1147 VM_BUG_ON(delta != -1 && delta != 1);
1148 do {
1149 nid = next_node(nid, node_online_map);
1150 if (nid == MAX_NUMNODES)
1151 nid = first_node(node_online_map);
1152
1153 /* To shrink on this node, there must be a surplus page */
1154 if (delta < 0 && !h->surplus_huge_pages_node[nid])
1155 continue;
1156 /* Surplus cannot exceed the total number of pages */
1157 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
1158 h->nr_huge_pages_node[nid])
1159 continue;
1160
1161 h->surplus_huge_pages += delta;
1162 h->surplus_huge_pages_node[nid] += delta;
1163 ret = 1;
1164 break;
1165 } while (nid != prev_nid);
1166
1167 prev_nid = nid;
1168 return ret;
1169}
1170
1143#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1171#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1144static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1172static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1145{ 1173{
@@ -1957,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1957} 1985}
1958 1986
1959static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1987static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1960 unsigned long address, pte_t *ptep, int write_access) 1988 unsigned long address, pte_t *ptep, unsigned int flags)
1961{ 1989{
1962 struct hstate *h = hstate_vma(vma); 1990 struct hstate *h = hstate_vma(vma);
1963 int ret = VM_FAULT_SIGBUS; 1991 int ret = VM_FAULT_SIGBUS;
@@ -2025,7 +2053,7 @@ retry:
2025 * any allocations necessary to record that reservation occur outside 2053 * any allocations necessary to record that reservation occur outside
2026 * the spinlock. 2054 * the spinlock.
2027 */ 2055 */
2028 if (write_access && !(vma->vm_flags & VM_SHARED)) 2056 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2029 if (vma_needs_reservation(h, vma, address) < 0) { 2057 if (vma_needs_reservation(h, vma, address) < 0) {
2030 ret = VM_FAULT_OOM; 2058 ret = VM_FAULT_OOM;
2031 goto backout_unlocked; 2059 goto backout_unlocked;
@@ -2044,7 +2072,7 @@ retry:
2044 && (vma->vm_flags & VM_SHARED))); 2072 && (vma->vm_flags & VM_SHARED)));
2045 set_huge_pte_at(mm, address, ptep, new_pte); 2073 set_huge_pte_at(mm, address, ptep, new_pte);
2046 2074
2047 if (write_access && !(vma->vm_flags & VM_SHARED)) { 2075 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2048 /* Optimization, do the COW without a second fault */ 2076 /* Optimization, do the COW without a second fault */
2049 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); 2077 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
2050 } 2078 }
@@ -2063,7 +2091,7 @@ backout_unlocked:
2063} 2091}
2064 2092
2065int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2093int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2066 unsigned long address, int write_access) 2094 unsigned long address, unsigned int flags)
2067{ 2095{
2068 pte_t *ptep; 2096 pte_t *ptep;
2069 pte_t entry; 2097 pte_t entry;
@@ -2084,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2084 mutex_lock(&hugetlb_instantiation_mutex); 2112 mutex_lock(&hugetlb_instantiation_mutex);
2085 entry = huge_ptep_get(ptep); 2113 entry = huge_ptep_get(ptep);
2086 if (huge_pte_none(entry)) { 2114 if (huge_pte_none(entry)) {
2087 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2115 ret = hugetlb_no_page(mm, vma, address, ptep, flags);
2088 goto out_mutex; 2116 goto out_mutex;
2089 } 2117 }
2090 2118
@@ -2098,7 +2126,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2098 * page now as it is used to determine if a reservation has been 2126 * page now as it is used to determine if a reservation has been
2099 * consumed. 2127 * consumed.
2100 */ 2128 */
2101 if (write_access && !pte_write(entry)) { 2129 if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
2102 if (vma_needs_reservation(h, vma, address) < 0) { 2130 if (vma_needs_reservation(h, vma, address) < 0) {
2103 ret = VM_FAULT_OOM; 2131 ret = VM_FAULT_OOM;
2104 goto out_mutex; 2132 goto out_mutex;
@@ -2115,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2115 goto out_page_table_lock; 2143 goto out_page_table_lock;
2116 2144
2117 2145
2118 if (write_access) { 2146 if (flags & FAULT_FLAG_WRITE) {
2119 if (!pte_write(entry)) { 2147 if (!pte_write(entry)) {
2120 ret = hugetlb_cow(mm, vma, address, ptep, entry, 2148 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2121 pagecache_page); 2149 pagecache_page);
@@ -2124,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2124 entry = pte_mkdirty(entry); 2152 entry = pte_mkdirty(entry);
2125 } 2153 }
2126 entry = pte_mkyoung(entry); 2154 entry = pte_mkyoung(entry);
2127 if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) 2155 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2156 flags & FAULT_FLAG_WRITE))
2128 update_mmu_cache(vma, address, entry); 2157 update_mmu_cache(vma, address, entry);
2129 2158
2130out_page_table_lock: 2159out_page_table_lock:
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
1#include <linux/mm_types.h>
2#include <linux/rbtree.h>
3#include <linux/rwsem.h>
4#include <linux/spinlock.h>
5#include <linux/list.h>
6#include <linux/cpumask.h>
7
8#include <asm/atomic.h>
9#include <asm/pgtable.h>
10
11struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT,
13 .pgd = swapper_pg_dir,
14 .mm_users = ATOMIC_INIT(2),
15 .mm_count = ATOMIC_INIT(1),
16 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL,
20};
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..f290c4db528b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19extern void prep_compound_page(struct page *page, unsigned long order);
20extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
21
22static inline void set_page_count(struct page *page, int v) 19static inline void set_page_count(struct page *page, int v)
23{ 20{
24 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
51 */ 48 */
52extern unsigned long highest_memmap_pfn; 49extern unsigned long highest_memmap_pfn;
53extern void __free_pages_bootmem(struct page *page, unsigned int order); 50extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order);
52
54 53
55/* 54/*
56 * function for dealing with page's order in buddy system. 55 * function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
74} 73}
75#endif 74#endif
76 75
77#ifdef CONFIG_UNEVICTABLE_LRU
78/* 76/*
79 * unevictable_migrate_page() called only from migrate_page_copy() to 77 * unevictable_migrate_page() called only from migrate_page_copy() to
80 * migrate unevictable flag to new page. 78 * migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
86 if (TestClearPageUnevictable(old)) 84 if (TestClearPageUnevictable(old))
87 SetPageUnevictable(new); 85 SetPageUnevictable(new);
88} 86}
89#else
90static inline void unevictable_migrate_page(struct page *new, struct page *old)
91{
92}
93#endif
94 87
95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT 88#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
96/* 89/*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
150 } 143 }
151} 144}
152 145
153/*
154 * free_page_mlock() -- clean up attempts to free and mlocked() page.
155 * Page should not be on lru, so no need to fix that up.
156 * free_pages_check() will verify...
157 */
158static inline void free_page_mlock(struct page *page)
159{
160 if (unlikely(TestClearPageMlocked(page))) {
161 unsigned long flags;
162
163 local_irq_save(flags);
164 __dec_zone_page_state(page, NR_MLOCK);
165 __count_vm_event(UNEVICTABLE_MLOCKFREED);
166 local_irq_restore(flags);
167 }
168}
169
170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 147static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
172{ 148{
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
175static inline void clear_page_mlock(struct page *page) { } 151static inline void clear_page_mlock(struct page *page) { }
176static inline void mlock_vma_page(struct page *page) { } 152static inline void mlock_vma_page(struct page *page) { }
177static inline void mlock_migrate_page(struct page *new, struct page *old) { } 153static inline void mlock_migrate_page(struct page *new, struct page *old) { }
178static inline void free_page_mlock(struct page *page) { }
179 154
180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 155#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
181 156
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
284 unsigned long start, int len, int flags, 259 unsigned long start, int len, int flags,
285 struct page **pages, struct vm_area_struct **vmas); 260 struct page **pages, struct vm_area_struct **vmas);
286 261
262#define ZONE_RECLAIM_NOSCAN -2
263#define ZONE_RECLAIM_FULL -1
264#define ZONE_RECLAIM_SOME 0
265#define ZONE_RECLAIM_SUCCESS 1
287#endif 266#endif
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
1#include <linux/gfp.h>
2#include <linux/mm_types.h>
3#include <linux/mm.h>
4#include <linux/slab.h>
5#include <linux/kmemcheck.h>
6
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
8{
9 struct page *shadow;
10 int pages;
11 int i;
12
13 pages = 1 << order;
14
15 /*
16 * With kmemcheck enabled, we need to allocate a memory area for the
17 * shadow bits as well.
18 */
19 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
20 if (!shadow) {
21 if (printk_ratelimit())
22 printk(KERN_ERR "kmemcheck: failed to allocate "
23 "shadow bitmap\n");
24 return;
25 }
26
27 for(i = 0; i < pages; ++i)
28 page[i].shadow = page_address(&shadow[i]);
29
30 /*
31 * Mark it as non-present for the MMU so that our accesses to
32 * this memory will trigger a page fault and let us analyze
33 * the memory accesses.
34 */
35 kmemcheck_hide_pages(page, pages);
36}
37
38void kmemcheck_free_shadow(struct page *page, int order)
39{
40 struct page *shadow;
41 int pages;
42 int i;
43
44 if (!kmemcheck_page_is_tracked(page))
45 return;
46
47 pages = 1 << order;
48
49 kmemcheck_show_pages(page, pages);
50
51 shadow = virt_to_page(page[0].shadow);
52
53 for(i = 0; i < pages; ++i)
54 page[i].shadow = NULL;
55
56 __free_pages(shadow, order);
57}
58
59void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
60 size_t size)
61{
62 /*
63 * Has already been memset(), which initializes the shadow for us
64 * as well.
65 */
66 if (gfpflags & __GFP_ZERO)
67 return;
68
69 /* No need to initialize the shadow of a non-tracked slab. */
70 if (s->flags & SLAB_NOTRACK)
71 return;
72
73 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
74 /*
75 * Allow notracked objects to be allocated from
76 * tracked caches. Note however that these objects
77 * will still get page faults on access, they just
78 * won't ever be flagged as uninitialized. If page
79 * faults are not acceptable, the slab cache itself
80 * should be marked NOTRACK.
81 */
82 kmemcheck_mark_initialized(object, size);
83 } else if (!s->ctor) {
84 /*
85 * New objects should be marked uninitialized before
86 * they're returned to the called.
87 */
88 kmemcheck_mark_uninitialized(object, size);
89 }
90}
91
92void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
93{
94 /* TODO: RCU freeing is unsupported for now; hide false positives. */
95 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
96 kmemcheck_mark_freed(object, size);
97}
98
99void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
100 gfp_t gfpflags)
101{
102 int pages;
103
104 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
105 return;
106
107 pages = 1 << order;
108
109 /*
110 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
111 * can become uninitialized by copying uninitialized memory
112 * into them.
113 */
114
115 /* XXX: Can use zone->node for node? */
116 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
117
118 if (gfpflags & __GFP_ZERO)
119 kmemcheck_mark_initialized_pages(page, pages);
120 else
121 kmemcheck_mark_uninitialized_pages(page, pages);
122}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 000000000000..d5292fc6f523
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
1/*
2 * mm/kmemleak-test.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/init.h>
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/vmalloc.h>
26#include <linux/list.h>
27#include <linux/percpu.h>
28#include <linux/fdtable.h>
29
30#include <linux/kmemleak.h>
31
32struct test_node {
33 long header[25];
34 struct list_head list;
35 long footer[25];
36};
37
38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer);
40
41/*
42 * Some very simple testing. This function needs to be extended for
43 * proper testing.
44 */
45static int __init kmemleak_test_init(void)
46{
47 struct test_node *elem;
48 int i;
49
50 printk(KERN_INFO "Kmemleak testing\n");
51
52 /* make some orphan objects */
53 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
54 pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
55 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
56 pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
57 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
58 pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
59 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
60 pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
61#ifndef CONFIG_MODULES
62 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
63 kmem_cache_alloc(files_cachep, GFP_KERNEL));
64 pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
65 kmem_cache_alloc(files_cachep, GFP_KERNEL));
66#endif
67 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
68 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
69 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
70 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
71 pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
72
73 /*
74 * Add elements to a list. They should only appear as orphan
75 * after the module is removed.
76 */
77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem)
81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list);
86 }
87
88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i));
92 }
93
94 return 0;
95}
96module_init(kmemleak_test_init);
97
98static void __exit kmemleak_test_exit(void)
99{
100 struct test_node *elem, *tmp;
101
102 /*
103 * Remove the list elements without actually freeing the
104 * memory.
105 */
106 list_for_each_entry_safe(elem, tmp, &test_list, list)
107 list_del(&elem->list);
108}
109module_exit(kmemleak_test_exit);
110
111MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 000000000000..c96f2c8700aa
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1497 @@
1/*
2 * mm/kmemleak.c
3 *
4 * Copyright (C) 2008 ARM Limited
5 * Written by Catalin Marinas <catalin.marinas@arm.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 *
21 * For more information on the algorithm and kmemleak usage, please see
22 * Documentation/kmemleak.txt.
23 *
24 * Notes on locking
25 * ----------------
26 *
27 * The following locks and mutexes are used by kmemleak:
28 *
29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the
36 * kmemleak_alloc() callback and removed in delete_object() called from the
37 * kmemleak_free() callback
38 * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
39 * the metadata (e.g. count) are protected by this lock. Note that some
40 * members of this structure may be protected by other means (atomic or
41 * kmemleak_lock). This lock is also held when scanning the corresponding
42 * memory block to avoid the kernel freeing it via the kmemleak_free()
43 * callback. This is less heavyweight than holding a global lock like
44 * kmemleak_lock during scanning
45 * - scan_mutex (mutex): ensures that only one thread may scan the memory for
46 * unreferenced objects at a time. The gray_list contains the objects which
47 * are already referenced or marked as false positives and need to be
48 * scanned. This list is only modified during a scanning episode when the
49 * scan_mutex is held. At the end of a scan, the gray_list is always empty.
50 * Note that the kmemleak_object.use_count is incremented when an object is
51 * added to the gray_list and therefore cannot be freed
52 * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs
53 * file together with modifications to the memory scanning parameters
54 * including the scan_thread pointer
55 *
56 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes
58 * 0, this count can no longer be incremented and put_object() schedules the
59 * kmemleak_object freeing via an RCU callback. All calls to the get_object()
60 * function must be protected by rcu_read_lock() to avoid accessing a freed
61 * structure.
62 */
63
64#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
65
66#include <linux/init.h>
67#include <linux/kernel.h>
68#include <linux/list.h>
69#include <linux/sched.h>
70#include <linux/jiffies.h>
71#include <linux/delay.h>
72#include <linux/module.h>
73#include <linux/kthread.h>
74#include <linux/prio_tree.h>
75#include <linux/gfp.h>
76#include <linux/fs.h>
77#include <linux/debugfs.h>
78#include <linux/seq_file.h>
79#include <linux/cpumask.h>
80#include <linux/spinlock.h>
81#include <linux/mutex.h>
82#include <linux/rcupdate.h>
83#include <linux/stacktrace.h>
84#include <linux/cache.h>
85#include <linux/percpu.h>
86#include <linux/hardirq.h>
87#include <linux/mmzone.h>
88#include <linux/slab.h>
89#include <linux/thread_info.h>
90#include <linux/err.h>
91#include <linux/uaccess.h>
92#include <linux/string.h>
93#include <linux/nodemask.h>
94#include <linux/mm.h>
95
96#include <asm/sections.h>
97#include <asm/processor.h>
98#include <asm/atomic.h>
99
100#include <linux/kmemleak.h>
101
102/*
103 * Kmemleak configuration and common defines.
104 */
105#define MAX_TRACE 16 /* stack trace length */
106#define REPORTS_NR 50 /* maximum number of reported leaks */
107#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
108#define MSECS_SCAN_YIELD 10 /* CPU yielding period */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111
112#define BYTES_PER_POINTER sizeof(void *)
113
114/* GFP bitmask for kmemleak internal allocations */
115#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC)
116
117/* scanning area inside a memory block */
118struct kmemleak_scan_area {
119 struct hlist_node node;
120 unsigned long offset;
121 size_t length;
122};
123
124/*
125 * Structure holding the metadata for each allocated memory block.
126 * Modifications to such objects should be made while holding the
127 * object->lock. Insertions or deletions from object_list, gray_list or
128 * tree_node are already protected by the corresponding locks or mutex (see
129 * the notes on locking above). These objects are reference-counted
130 * (use_count) and freed using the RCU mechanism.
131 */
132struct kmemleak_object {
133 spinlock_t lock;
134 unsigned long flags; /* object status flags */
135 struct list_head object_list;
136 struct list_head gray_list;
137 struct prio_tree_node tree_node;
138 struct rcu_head rcu; /* object_list lockless traversal */
139 /* object usage count; object freed when use_count == 0 */
140 atomic_t use_count;
141 unsigned long pointer;
142 size_t size;
143 /* minimum number of a pointers found before it is considered leak */
144 int min_count;
145 /* the total number of pointers found pointing to this object */
146 int count;
147 /* memory ranges to be scanned inside an object (empty for all) */
148 struct hlist_head area_list;
149 unsigned long trace[MAX_TRACE];
150 unsigned int trace_len;
151 unsigned long jiffies; /* creation timestamp */
152 pid_t pid; /* pid of the current task */
153 char comm[TASK_COMM_LEN]; /* executable name */
154};
155
156/* flag representing the memory block allocation status */
157#define OBJECT_ALLOCATED (1 << 0)
158/* flag set after the first reporting of an unreference object */
159#define OBJECT_REPORTED (1 << 1)
160/* flag set to not scan the object */
161#define OBJECT_NO_SCAN (1 << 2)
162
163/* the list of all allocated objects */
164static LIST_HEAD(object_list);
165/* the list of gray-colored objects (see color_gray comment below) */
166static LIST_HEAD(gray_list);
167/* prio search tree for object boundaries */
168static struct prio_tree_root object_tree_root;
169/* rw_lock protecting the access to object_list and prio_tree_root */
170static DEFINE_RWLOCK(kmemleak_lock);
171
172/* allocation caches for kmemleak internal data */
173static struct kmem_cache *object_cache;
174static struct kmem_cache *scan_area_cache;
175
176/* set if tracing memory operations is enabled */
177static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
178/* set in the late_initcall if there were no errors */
179static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
180/* enables or disables early logging of the memory operations */
181static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
182/* set if a fata kmemleak error has occurred */
183static atomic_t kmemleak_error = ATOMIC_INIT(0);
184
185/* minimum and maximum address that may be valid pointers */
186static unsigned long min_addr = ULONG_MAX;
187static unsigned long max_addr;
188
189/* used for yielding the CPU to other tasks during scanning */
190static unsigned long next_scan_yield;
191static struct task_struct *scan_thread;
192static unsigned long jiffies_scan_yield;
193static unsigned long jiffies_min_age;
194/* delay between automatic memory scannings */
195static signed long jiffies_scan_wait;
196/* enables or disables the task stacks scanning */
197static int kmemleak_stack_scan;
198/* mutex protecting the memory scanning */
199static DEFINE_MUTEX(scan_mutex);
200/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
201static DEFINE_MUTEX(kmemleak_mutex);
202
203/* number of leaks reported (for limitation purposes) */
204static int reported_leaks;
205
206/*
207 * Early object allocation/freeing logging. Kmemleak is initialized after the
208 * kernel allocator. However, both the kernel allocator and kmemleak may
209 * allocate memory blocks which need to be tracked. Kmemleak defines an
210 * arbitrary buffer to hold the allocation/freeing information before it is
211 * fully initialized.
212 */
213
214/* kmemleak operation type for early logging */
215enum {
216 KMEMLEAK_ALLOC,
217 KMEMLEAK_FREE,
218 KMEMLEAK_NOT_LEAK,
219 KMEMLEAK_IGNORE,
220 KMEMLEAK_SCAN_AREA,
221 KMEMLEAK_NO_SCAN
222};
223
224/*
225 * Structure holding the information passed to kmemleak callbacks during the
226 * early logging.
227 */
228struct early_log {
229 int op_type; /* kmemleak operation type */
230 const void *ptr; /* allocated/freed memory block */
231 size_t size; /* memory block size */
232 int min_count; /* minimum reference count */
233 unsigned long offset; /* scan area offset */
234 size_t length; /* scan area length */
235};
236
237/* early logging buffer and current position */
238static struct early_log early_log[200];
239static int crt_early_log;
240
241static void kmemleak_disable(void);
242
243/*
244 * Print a warning and dump the stack trace.
245 */
246#define kmemleak_warn(x...) do { \
247 pr_warning(x); \
248 dump_stack(); \
249} while (0)
250
251/*
252 * Macro invoked when a serious kmemleak condition occured and cannot be
253 * recovered from. Kmemleak will be disabled and further allocation/freeing
254 * tracing no longer available.
255 */
256#define kmemleak_stop(x...) do { \
257 kmemleak_warn(x); \
258 kmemleak_disable(); \
259} while (0)
260
261/*
262 * Object colors, encoded with count and min_count:
263 * - white - orphan object, not enough references to it (count < min_count)
264 * - gray - not orphan, not marked as false positive (min_count == 0) or
265 * sufficient references to it (count >= min_count)
266 * - black - ignore, it doesn't contain references (e.g. text section)
267 * (min_count == -1). No function defined for this color.
268 * Newly created objects don't have any color assigned (object->count == -1)
269 * before the next memory scan when they become white.
270 */
271static int color_white(const struct kmemleak_object *object)
272{
273 return object->count != -1 && object->count < object->min_count;
274}
275
276static int color_gray(const struct kmemleak_object *object)
277{
278 return object->min_count != -1 && object->count >= object->min_count;
279}
280
281/*
282 * Objects are considered referenced if their color is gray and they have not
283 * been deleted.
284 */
285static int referenced_object(struct kmemleak_object *object)
286{
287 return (object->flags & OBJECT_ALLOCATED) && color_gray(object);
288}
289
290/*
291 * Objects are considered unreferenced only if their color is white, they have
292 * not be deleted and have a minimum age to avoid false positives caused by
293 * pointers temporarily stored in CPU registers.
294 */
295static int unreferenced_object(struct kmemleak_object *object)
296{
297 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
298 time_is_before_eq_jiffies(object->jiffies + jiffies_min_age);
299}
300
301/*
302 * Printing of the (un)referenced objects information, either to the seq file
303 * or to the kernel log. The print_referenced/print_unreferenced functions
304 * must be called with the object->lock held.
305 */
306#define print_helper(seq, x...) do { \
307 struct seq_file *s = (seq); \
308 if (s) \
309 seq_printf(s, x); \
310 else \
311 pr_info(x); \
312} while (0)
313
314static void print_referenced(struct kmemleak_object *object)
315{
316 pr_info("referenced object 0x%08lx (size %zu)\n",
317 object->pointer, object->size);
318}
319
320static void print_unreferenced(struct seq_file *seq,
321 struct kmemleak_object *object)
322{
323 int i;
324
325 print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n",
326 object->pointer, object->size);
327 print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n",
328 object->comm, object->pid, object->jiffies);
329 print_helper(seq, " backtrace:\n");
330
331 for (i = 0; i < object->trace_len; i++) {
332 void *ptr = (void *)object->trace[i];
333 print_helper(seq, " [<%p>] %pS\n", ptr, ptr);
334 }
335}
336
337/*
338 * Print the kmemleak_object information. This function is used mainly for
339 * debugging special cases when kmemleak operations. It must be called with
340 * the object->lock held.
341 */
342static void dump_object_info(struct kmemleak_object *object)
343{
344 struct stack_trace trace;
345
346 trace.nr_entries = object->trace_len;
347 trace.entries = object->trace;
348
349 pr_notice("Object 0x%08lx (size %zu):\n",
350 object->tree_node.start, object->size);
351 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
352 object->comm, object->pid, object->jiffies);
353 pr_notice(" min_count = %d\n", object->min_count);
354 pr_notice(" count = %d\n", object->count);
355 pr_notice(" backtrace:\n");
356 print_stack_trace(&trace, 4);
357}
358
359/*
360 * Look-up a memory block metadata (kmemleak_object) in the priority search
361 * tree based on a pointer value. If alias is 0, only values pointing to the
362 * beginning of the memory block are allowed. The kmemleak_lock must be held
363 * when calling this function.
364 */
365static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
366{
367 struct prio_tree_node *node;
368 struct prio_tree_iter iter;
369 struct kmemleak_object *object;
370
371 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
372 node = prio_tree_next(&iter);
373 if (node) {
374 object = prio_tree_entry(node, struct kmemleak_object,
375 tree_node);
376 if (!alias && object->pointer != ptr) {
377 kmemleak_warn("Found object by alias");
378 object = NULL;
379 }
380 } else
381 object = NULL;
382
383 return object;
384}
385
386/*
387 * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
388 * that once an object's use_count reached 0, the RCU freeing was already
389 * registered and the object should no longer be used. This function must be
390 * called under the protection of rcu_read_lock().
391 */
392static int get_object(struct kmemleak_object *object)
393{
394 return atomic_inc_not_zero(&object->use_count);
395}
396
397/*
398 * RCU callback to free a kmemleak_object.
399 */
400static void free_object_rcu(struct rcu_head *rcu)
401{
402 struct hlist_node *elem, *tmp;
403 struct kmemleak_scan_area *area;
404 struct kmemleak_object *object =
405 container_of(rcu, struct kmemleak_object, rcu);
406
407 /*
408 * Once use_count is 0 (guaranteed by put_object), there is no other
409 * code accessing this object, hence no need for locking.
410 */
411 hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
412 hlist_del(elem);
413 kmem_cache_free(scan_area_cache, area);
414 }
415 kmem_cache_free(object_cache, object);
416}
417
418/*
419 * Decrement the object use_count. Once the count is 0, free the object using
420 * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
421 * delete_object() path, the delayed RCU freeing ensures that there is no
422 * recursive call to the kernel allocator. Lock-less RCU object_list traversal
423 * is also possible.
424 */
425static void put_object(struct kmemleak_object *object)
426{
427 if (!atomic_dec_and_test(&object->use_count))
428 return;
429
430 /* should only get here after delete_object was called */
431 WARN_ON(object->flags & OBJECT_ALLOCATED);
432
433 call_rcu(&object->rcu, free_object_rcu);
434}
435
436/*
437 * Look up an object in the prio search tree and increase its use_count.
438 */
439static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
440{
441 unsigned long flags;
442 struct kmemleak_object *object = NULL;
443
444 rcu_read_lock();
445 read_lock_irqsave(&kmemleak_lock, flags);
446 if (ptr >= min_addr && ptr < max_addr)
447 object = lookup_object(ptr, alias);
448 read_unlock_irqrestore(&kmemleak_lock, flags);
449
450 /* check whether the object is still available */
451 if (object && !get_object(object))
452 object = NULL;
453 rcu_read_unlock();
454
455 return object;
456}
457
458/*
459 * Create the metadata (struct kmemleak_object) corresponding to an allocated
460 * memory block and add it to the object_list and object_tree_root.
461 */
462static void create_object(unsigned long ptr, size_t size, int min_count,
463 gfp_t gfp)
464{
465 unsigned long flags;
466 struct kmemleak_object *object;
467 struct prio_tree_node *node;
468 struct stack_trace trace;
469
470 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
471 if (!object) {
472 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
473 return;
474 }
475
476 INIT_LIST_HEAD(&object->object_list);
477 INIT_LIST_HEAD(&object->gray_list);
478 INIT_HLIST_HEAD(&object->area_list);
479 spin_lock_init(&object->lock);
480 atomic_set(&object->use_count, 1);
481 object->flags = OBJECT_ALLOCATED;
482 object->pointer = ptr;
483 object->size = size;
484 object->min_count = min_count;
485 object->count = -1; /* no color initially */
486 object->jiffies = jiffies;
487
488 /* task information */
489 if (in_irq()) {
490 object->pid = 0;
491 strncpy(object->comm, "hardirq", sizeof(object->comm));
492 } else if (in_softirq()) {
493 object->pid = 0;
494 strncpy(object->comm, "softirq", sizeof(object->comm));
495 } else {
496 object->pid = current->pid;
497 /*
498 * There is a small chance of a race with set_task_comm(),
499 * however using get_task_comm() here may cause locking
500 * dependency issues with current->alloc_lock. In the worst
501 * case, the command line is not correct.
502 */
503 strncpy(object->comm, current->comm, sizeof(object->comm));
504 }
505
506 /* kernel backtrace */
507 trace.max_entries = MAX_TRACE;
508 trace.nr_entries = 0;
509 trace.entries = object->trace;
510 trace.skip = 1;
511 save_stack_trace(&trace);
512 object->trace_len = trace.nr_entries;
513
514 INIT_PRIO_TREE_NODE(&object->tree_node);
515 object->tree_node.start = ptr;
516 object->tree_node.last = ptr + size - 1;
517
518 write_lock_irqsave(&kmemleak_lock, flags);
519 min_addr = min(min_addr, ptr);
520 max_addr = max(max_addr, ptr + size);
521 node = prio_tree_insert(&object_tree_root, &object->tree_node);
522 /*
523 * The code calling the kernel does not yet have the pointer to the
524 * memory block to be able to free it. However, we still hold the
525 * kmemleak_lock here in case parts of the kernel started freeing
526 * random memory blocks.
527 */
528 if (node != &object->tree_node) {
529 unsigned long flags;
530
531 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
532 "(already existing)\n", ptr);
533 object = lookup_object(ptr, 1);
534 spin_lock_irqsave(&object->lock, flags);
535 dump_object_info(object);
536 spin_unlock_irqrestore(&object->lock, flags);
537
538 goto out;
539 }
540 list_add_tail_rcu(&object->object_list, &object_list);
541out:
542 write_unlock_irqrestore(&kmemleak_lock, flags);
543}
544
545/*
546 * Remove the metadata (struct kmemleak_object) for a memory block from the
547 * object_list and object_tree_root and decrement its use_count.
548 */
static void delete_object(unsigned long ptr)
{
	unsigned long flags;
	struct kmemleak_object *object;

	/* unlink from both lookup structures under the global write lock */
	write_lock_irqsave(&kmemleak_lock, flags);
	object = lookup_object(ptr, 0);
	if (!object) {
		kmemleak_warn("Freeing unknown object at 0x%08lx\n",
			      ptr);
		write_unlock_irqrestore(&kmemleak_lock, flags);
		return;
	}
	prio_tree_remove(&object_tree_root, &object->tree_node);
	list_del_rcu(&object->object_list);
	write_unlock_irqrestore(&kmemleak_lock, flags);

	/* the object must still be allocated and hold its creation ref */
	WARN_ON(!(object->flags & OBJECT_ALLOCATED));
	WARN_ON(atomic_read(&object->use_count) < 1);

	/*
	 * Locking here also ensures that the corresponding memory block
	 * cannot be freed when it is being scanned.
	 */
	spin_lock_irqsave(&object->lock, flags);
	if (object->flags & OBJECT_REPORTED)
		/* object was reported earlier; note that it is now freed */
		print_referenced(object);
	object->flags &= ~OBJECT_ALLOCATED;
	spin_unlock_irqrestore(&object->lock, flags);
	/* drop the creation reference; RCU delays the actual kfree */
	put_object(object);
}
580
581/*
 * Mark an object permanently as gray-colored so that it can no longer be
 * reported as a leak. This is used in general to mark a false positive.
584 */
585static void make_gray_object(unsigned long ptr)
586{
587 unsigned long flags;
588 struct kmemleak_object *object;
589
590 object = find_and_get_object(ptr, 0);
591 if (!object) {
592 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr);
593 return;
594 }
595
596 spin_lock_irqsave(&object->lock, flags);
597 object->min_count = 0;
598 spin_unlock_irqrestore(&object->lock, flags);
599 put_object(object);
600}
601
602/*
603 * Mark the object as black-colored so that it is ignored from scans and
604 * reporting.
605 */
606static void make_black_object(unsigned long ptr)
607{
608 unsigned long flags;
609 struct kmemleak_object *object;
610
611 object = find_and_get_object(ptr, 0);
612 if (!object) {
613 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
614 return;
615 }
616
617 spin_lock_irqsave(&object->lock, flags);
618 object->min_count = -1;
619 spin_unlock_irqrestore(&object->lock, flags);
620 put_object(object);
621}
622
623/*
624 * Add a scanning area to the object. If at least one such area is added,
625 * kmemleak will only scan these ranges rather than the whole memory block.
626 */
static void add_scan_area(unsigned long ptr, unsigned long offset,
			  size_t length, gfp_t gfp)
{
	unsigned long flags;
	struct kmemleak_object *object;
	struct kmemleak_scan_area *area;

	object = find_and_get_object(ptr, 0);
	if (!object) {
		kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
			      ptr);
		return;
	}

	/* allocate outside object->lock; gfp is masked for kmemleak's needs */
	area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
	if (!area) {
		kmemleak_warn("Cannot allocate a scan area\n");
		goto out;
	}

	spin_lock_irqsave(&object->lock, flags);
	/* reject areas extending past the end of the tracked block */
	if (offset + length > object->size) {
		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
		dump_object_info(object);
		kmem_cache_free(scan_area_cache, area);
		goto out_unlock;
	}

	INIT_HLIST_NODE(&area->node);
	area->offset = offset;
	area->length = length;

	hlist_add_head(&area->node, &object->area_list);
out_unlock:
	spin_unlock_irqrestore(&object->lock, flags);
out:
	put_object(object);
}
665
666/*
 * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
668 * pointer. Such object will not be scanned by kmemleak but references to it
669 * are searched.
670 */
671static void object_no_scan(unsigned long ptr)
672{
673 unsigned long flags;
674 struct kmemleak_object *object;
675
676 object = find_and_get_object(ptr, 0);
677 if (!object) {
678 kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
679 return;
680 }
681
682 spin_lock_irqsave(&object->lock, flags);
683 object->flags |= OBJECT_NO_SCAN;
684 spin_unlock_irqrestore(&object->lock, flags);
685 put_object(object);
686}
687
688/*
689 * Log an early kmemleak_* call to the early_log buffer. These calls will be
690 * processed later once kmemleak is fully initialized.
691 */
692static void log_early(int op_type, const void *ptr, size_t size,
693 int min_count, unsigned long offset, size_t length)
694{
695 unsigned long flags;
696 struct early_log *log;
697
698 if (crt_early_log >= ARRAY_SIZE(early_log)) {
699 kmemleak_stop("Early log buffer exceeded\n");
700 return;
701 }
702
703 /*
704 * There is no need for locking since the kernel is still in UP mode
705 * at this stage. Disabling the IRQs is enough.
706 */
707 local_irq_save(flags);
708 log = &early_log[crt_early_log];
709 log->op_type = op_type;
710 log->ptr = ptr;
711 log->size = size;
712 log->min_count = min_count;
713 log->offset = offset;
714 log->length = length;
715 crt_early_log++;
716 local_irq_restore(flags);
717}
718
719/*
720 * Memory allocation function callback. This function is called from the
721 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
722 * vmalloc etc.).
723 */
724void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp)
725{
726 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
727
728 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
729 create_object((unsigned long)ptr, size, min_count, gfp);
730 else if (atomic_read(&kmemleak_early_log))
731 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
732}
733EXPORT_SYMBOL_GPL(kmemleak_alloc);
734
735/*
736 * Memory freeing function callback. This function is called from the kernel
737 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
738 */
739void kmemleak_free(const void *ptr)
740{
741 pr_debug("%s(0x%p)\n", __func__, ptr);
742
743 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
744 delete_object((unsigned long)ptr);
745 else if (atomic_read(&kmemleak_early_log))
746 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
747}
748EXPORT_SYMBOL_GPL(kmemleak_free);
749
750/*
751 * Mark an already allocated memory block as a false positive. This will cause
752 * the block to no longer be reported as leak and always be scanned.
753 */
754void kmemleak_not_leak(const void *ptr)
755{
756 pr_debug("%s(0x%p)\n", __func__, ptr);
757
758 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
759 make_gray_object((unsigned long)ptr);
760 else if (atomic_read(&kmemleak_early_log))
761 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
762}
763EXPORT_SYMBOL(kmemleak_not_leak);
764
765/*
766 * Ignore a memory block. This is usually done when it is known that the
767 * corresponding block is not a leak and does not contain any references to
768 * other allocated memory blocks.
769 */
770void kmemleak_ignore(const void *ptr)
771{
772 pr_debug("%s(0x%p)\n", __func__, ptr);
773
774 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
775 make_black_object((unsigned long)ptr);
776 else if (atomic_read(&kmemleak_early_log))
777 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
778}
779EXPORT_SYMBOL(kmemleak_ignore);
780
781/*
782 * Limit the range to be scanned in an allocated memory block.
783 */
784void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length,
785 gfp_t gfp)
786{
787 pr_debug("%s(0x%p)\n", __func__, ptr);
788
789 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
790 add_scan_area((unsigned long)ptr, offset, length, gfp);
791 else if (atomic_read(&kmemleak_early_log))
792 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
793}
794EXPORT_SYMBOL(kmemleak_scan_area);
795
796/*
797 * Inform kmemleak not to scan the given memory block.
798 */
799void kmemleak_no_scan(const void *ptr)
800{
801 pr_debug("%s(0x%p)\n", __func__, ptr);
802
803 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
804 object_no_scan((unsigned long)ptr);
805 else if (atomic_read(&kmemleak_early_log))
806 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
807}
808EXPORT_SYMBOL(kmemleak_no_scan);
809
810/*
811 * Yield the CPU so that other tasks get a chance to run. The yielding is
812 * rate-limited to avoid excessive number of calls to the schedule() function
813 * during memory scanning.
814 */
815static void scan_yield(void)
816{
817 might_sleep();
818
819 if (time_is_before_eq_jiffies(next_scan_yield)) {
820 schedule();
821 next_scan_yield = jiffies + jiffies_scan_yield;
822 }
823}
824
825/*
 * Memory scanning is a long process and it needs to be interruptible. This
 * function checks whether such an interrupt condition occurred.
828 */
829static int scan_should_stop(void)
830{
831 if (!atomic_read(&kmemleak_enabled))
832 return 1;
833
834 /*
835 * This function may be called from either process or kthread context,
836 * hence the need to check for both stop conditions.
837 */
838 if (current->mm)
839 return signal_pending(current);
840 else
841 return kthread_should_stop();
842
843 return 0;
844}
845
846/*
847 * Scan a memory block (exclusive range) for valid pointers and add those
848 * found to the gray list.
849 */
static void scan_block(void *_start, void *_end,
		       struct kmemleak_object *scanned)
{
	unsigned long *ptr;
	/* align the start and stop before the last full pointer-sized word */
	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
	unsigned long *end = _end - (BYTES_PER_POINTER - 1);

	for (ptr = start; ptr < end; ptr++) {
		unsigned long flags;
		unsigned long pointer = *ptr;
		struct kmemleak_object *object;

		if (scan_should_stop())
			break;

		/*
		 * When scanning a memory block with a corresponding
		 * kmemleak_object, the CPU yielding is handled in the calling
		 * code since it holds the object->lock to avoid the block
		 * freeing.
		 */
		if (!scanned)
			scan_yield();

		/* treat the word as a candidate pointer; alias lookups ok */
		object = find_and_get_object(pointer, 1);
		if (!object)
			continue;
		if (object == scanned) {
			/* self referenced, ignore */
			put_object(object);
			continue;
		}

		/*
		 * Avoid the lockdep recursive warning on object->lock being
		 * previously acquired in scan_object(). These locks are
		 * enclosed by scan_mutex.
		 */
		spin_lock_irqsave_nested(&object->lock, flags,
					 SINGLE_DEPTH_NESTING);
		if (!color_white(object)) {
			/* non-orphan, ignored or new */
			spin_unlock_irqrestore(&object->lock, flags);
			put_object(object);
			continue;
		}

		/*
		 * Increase the object's reference count (number of pointers
		 * to the memory block). If this count reaches the required
		 * minimum, the object's color will become gray and it will be
		 * added to the gray_list.
		 */
		object->count++;
		if (color_gray(object))
			/* keep the reference; released when scanned */
			list_add_tail(&object->gray_list, &gray_list);
		else
			put_object(object);
		spin_unlock_irqrestore(&object->lock, flags);
	}
}
911
912/*
913 * Scan a memory block corresponding to a kmemleak_object. A condition is
914 * that object->use_count >= 1.
915 */
static void scan_object(struct kmemleak_object *object)
{
	struct kmemleak_scan_area *area;
	struct hlist_node *elem;
	unsigned long flags;

	/*
	 * Once the object->lock is acquired, the corresponding memory block
	 * cannot be freed (the same lock is acquired in delete_object).
	 */
	spin_lock_irqsave(&object->lock, flags);
	if (object->flags & OBJECT_NO_SCAN)
		goto out;
	if (!(object->flags & OBJECT_ALLOCATED))
		/* already freed object */
		goto out;
	if (hlist_empty(&object->area_list))
		/* no explicit areas: scan the whole block */
		scan_block((void *)object->pointer,
			   (void *)(object->pointer + object->size), object);
	else
		/* only scan the registered sub-ranges of the block */
		hlist_for_each_entry(area, elem, &object->area_list, node)
			scan_block((void *)(object->pointer + area->offset),
				   (void *)(object->pointer + area->offset
					    + area->length), object);
out:
	spin_unlock_irqrestore(&object->lock, flags);
}
943
944/*
945 * Scan data sections and all the referenced memory blocks allocated via the
946 * kernel's standard allocators. This function must be called with the
947 * scan_mutex held.
948 */
static void kmemleak_scan(void)
{
	unsigned long flags;
	struct kmemleak_object *object, *tmp;
	struct task_struct *task;
	int i;

	/* prepare the kmemleak_object's: whiten all, seed the gray list */
	rcu_read_lock();
	list_for_each_entry_rcu(object, &object_list, object_list) {
		spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
		/*
		 * With a few exceptions there should be a maximum of
		 * 1 reference to any object at this point.
		 */
		if (atomic_read(&object->use_count) > 1) {
			pr_debug("object->use_count = %d\n",
				 atomic_read(&object->use_count));
			dump_object_info(object);
		}
#endif
		/* reset the reference count (whiten the object) */
		object->count = 0;
		/* objects with min_count 0 are gray immediately */
		if (color_gray(object) && get_object(object))
			list_add_tail(&object->gray_list, &gray_list);

		spin_unlock_irqrestore(&object->lock, flags);
	}
	rcu_read_unlock();

	/* data/bss scanning */
	scan_block(_sdata, _edata, NULL);
	scan_block(__bss_start, __bss_stop, NULL);

#ifdef CONFIG_SMP
	/* per-cpu sections scanning */
	for_each_possible_cpu(i)
		scan_block(__per_cpu_start + per_cpu_offset(i),
			   __per_cpu_end + per_cpu_offset(i), NULL);
#endif

	/*
	 * Struct page scanning for each node. The code below is not yet safe
	 * with MEMORY_HOTPLUG.
	 */
	for_each_online_node(i) {
		pg_data_t *pgdat = NODE_DATA(i);
		unsigned long start_pfn = pgdat->node_start_pfn;
		unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
		unsigned long pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
			struct page *page;

			if (!pfn_valid(pfn))
				continue;
			page = pfn_to_page(pfn);
			/* only scan if page is in use */
			if (page_count(page) == 0)
				continue;
			/* scan the struct page itself, not its contents */
			scan_block(page, page + 1, NULL);
		}
	}

	/*
	 * Scanning the task stacks may introduce false negatives and it is
	 * not enabled by default.
	 */
	if (kmemleak_stack_scan) {
		read_lock(&tasklist_lock);
		for_each_process(task)
			scan_block(task_stack_page(task),
				   task_stack_page(task) + THREAD_SIZE, NULL);
		read_unlock(&tasklist_lock);
	}

	/*
	 * Scan the objects already referenced from the sections scanned
	 * above. More objects will be referenced and, if there are no memory
	 * leaks, all the objects will be scanned. The list traversal is safe
	 * for both tail additions and removals from inside the loop. The
	 * kmemleak objects cannot be freed from outside the loop because their
	 * use_count was increased.
	 */
	object = list_entry(gray_list.next, typeof(*object), gray_list);
	while (&object->gray_list != &gray_list) {
		scan_yield();

		/* may add new objects to the list */
		if (!scan_should_stop())
			scan_object(object);

		/* fetch the next node before unlinking the current one */
		tmp = list_entry(object->gray_list.next, typeof(*object),
				 gray_list);

		/* remove the object from the list and release it */
		list_del(&object->gray_list);
		put_object(object);

		object = tmp;
	}
	WARN_ON(!list_empty(&gray_list));
}
1053
1054/*
1055 * Thread function performing automatic memory scanning. Unreferenced objects
1056 * at the end of a memory scan are reported but only the first time.
1057 */
static int kmemleak_scan_thread(void *arg)
{
	static int first_run = 1;

	pr_info("Automatic memory scanning thread started\n");

	/*
	 * Wait before the first scan to allow the system to fully initialize.
	 */
	if (first_run) {
		first_run = 0;
		ssleep(SECS_FIRST_SCAN);
	}

	while (!kthread_should_stop()) {
		struct kmemleak_object *object;
		signed long timeout = jiffies_scan_wait;

		mutex_lock(&scan_mutex);

		kmemleak_scan();
		/* report at most REPORTS_NR leaks per scan iteration */
		reported_leaks = 0;

		rcu_read_lock();
		list_for_each_entry_rcu(object, &object_list, object_list) {
			unsigned long flags;

			if (reported_leaks >= REPORTS_NR)
				break;
			spin_lock_irqsave(&object->lock, flags);
			if (!(object->flags & OBJECT_REPORTED) &&
			    unreferenced_object(object)) {
				/* newly detected leak; report it once */
				print_unreferenced(NULL, object);
				object->flags |= OBJECT_REPORTED;
				reported_leaks++;
			} else if ((object->flags & OBJECT_REPORTED) &&
				   referenced_object(object)) {
				/* previously reported leak found a referrer */
				print_referenced(object);
				object->flags &= ~OBJECT_REPORTED;
			}
			spin_unlock_irqrestore(&object->lock, flags);
		}
		rcu_read_unlock();

		mutex_unlock(&scan_mutex);
		/* wait before the next scan */
		while (timeout && !kthread_should_stop())
			timeout = schedule_timeout_interruptible(timeout);
	}

	pr_info("Automatic memory scanning thread ended\n");

	return 0;
}
1112
1113/*
1114 * Start the automatic memory scanning thread. This function must be called
1115 * with the kmemleak_mutex held.
1116 */
1117void start_scan_thread(void)
1118{
1119 if (scan_thread)
1120 return;
1121 scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
1122 if (IS_ERR(scan_thread)) {
1123 pr_warning("Failed to create the scan thread\n");
1124 scan_thread = NULL;
1125 }
1126}
1127
1128/*
1129 * Stop the automatic memory scanning thread. This function must be called
1130 * with the kmemleak_mutex held.
1131 */
1132void stop_scan_thread(void)
1133{
1134 if (scan_thread) {
1135 kthread_stop(scan_thread);
1136 scan_thread = NULL;
1137 }
1138}
1139
1140/*
1141 * Iterate over the object_list and return the first valid object at or after
1142 * the required position with its use_count incremented. The function triggers
1143 * a memory scanning when the pos argument points to the first position.
1144 */
static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct kmemleak_object *object;
	loff_t n = *pos;

	/* a read from position 0 triggers a fresh memory scan */
	if (!n) {
		kmemleak_scan();
		reported_leaks = 0;
	}
	if (reported_leaks >= REPORTS_NR)
		return NULL;

	rcu_read_lock();
	/* skip to the n-th object still present in the list */
	list_for_each_entry_rcu(object, &object_list, object_list) {
		if (n-- > 0)
			continue;
		/* get_object() fails for objects being freed; try the next */
		if (get_object(object))
			goto out;
	}
	object = NULL;
out:
	rcu_read_unlock();
	return object;
}
1169
1170/*
1171 * Return the next object in the object_list. The function decrements the
1172 * use_count of the previous object and increases that of the next one.
1173 */
static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct kmemleak_object *prev_obj = v;
	struct kmemleak_object *next_obj = NULL;
	struct list_head *n = &prev_obj->object_list;

	++(*pos);
	/* stop iterating once the per-read report quota is reached */
	if (reported_leaks >= REPORTS_NR)
		goto out;

	rcu_read_lock();
	/* advance to the next object whose use_count can be taken */
	list_for_each_continue_rcu(n, &object_list) {
		next_obj = list_entry(n, struct kmemleak_object, object_list);
		if (get_object(next_obj))
			break;
	}
	rcu_read_unlock();
out:
	/* always release the reference taken on the previous object */
	put_object(prev_obj);
	return next_obj;
}
1195
1196/*
1197 * Decrement the use_count of the last object required, if any.
1198 */
static void kmemleak_seq_stop(struct seq_file *seq, void *v)
{
	/* v is NULL when the iteration ran off the end of the list */
	if (v)
		put_object(v);
}
1204
1205/*
1206 * Print the information for an unreferenced object to the seq file.
1207 */
1208static int kmemleak_seq_show(struct seq_file *seq, void *v)
1209{
1210 struct kmemleak_object *object = v;
1211 unsigned long flags;
1212
1213 spin_lock_irqsave(&object->lock, flags);
1214 if (!unreferenced_object(object))
1215 goto out;
1216 print_unreferenced(seq, object);
1217 reported_leaks++;
1218out:
1219 spin_unlock_irqrestore(&object->lock, flags);
1220 return 0;
1221}
1222
/* seq_file iterator over the leak reports shown via debugfs */
static const struct seq_operations kmemleak_seq_ops = {
	.start = kmemleak_seq_start,
	.next  = kmemleak_seq_next,
	.stop  = kmemleak_seq_stop,
	.show  = kmemleak_seq_show,
};
1229
static int kmemleak_open(struct inode *inode, struct file *file)
{
	int ret = 0;

	if (!atomic_read(&kmemleak_enabled))
		return -EBUSY;

	/*
	 * On success the mutexes taken below stay held for the lifetime of
	 * the open file and are released in kmemleak_release().
	 */
	ret = mutex_lock_interruptible(&kmemleak_mutex);
	if (ret < 0)
		goto out;
	if (file->f_mode & FMODE_READ) {
		/* readers also exclude concurrent memory scans */
		ret = mutex_lock_interruptible(&scan_mutex);
		if (ret < 0)
			goto kmemleak_unlock;
		ret = seq_open(file, &kmemleak_seq_ops);
		if (ret < 0)
			goto scan_unlock;
	}
	return ret;

scan_unlock:
	mutex_unlock(&scan_mutex);
kmemleak_unlock:
	mutex_unlock(&kmemleak_mutex);
out:
	return ret;
}
1257
1258static int kmemleak_release(struct inode *inode, struct file *file)
1259{
1260 int ret = 0;
1261
1262 if (file->f_mode & FMODE_READ) {
1263 seq_release(inode, file);
1264 mutex_unlock(&scan_mutex);
1265 }
1266 mutex_unlock(&kmemleak_mutex);
1267
1268 return ret;
1269}
1270
1271/*
1272 * File write operation to configure kmemleak at run-time. The following
1273 * commands can be written to the /sys/kernel/debug/kmemleak file:
1274 * off - disable kmemleak (irreversible)
1275 * stack=on - enable the task stacks scanning
1276 * stack=off - disable the tasks stacks scanning
1277 * scan=on - start the automatic memory scanning thread
1278 * scan=off - stop the automatic memory scanning thread
1279 * scan=... - set the automatic memory scanning period in seconds (0 to
1280 * disable it)
1281 */
1282static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1283 size_t size, loff_t *ppos)
1284{
1285 char buf[64];
1286 int buf_size;
1287
1288 if (!atomic_read(&kmemleak_enabled))
1289 return -EBUSY;
1290
1291 buf_size = min(size, (sizeof(buf) - 1));
1292 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1293 return -EFAULT;
1294 buf[buf_size] = 0;
1295
1296 if (strncmp(buf, "off", 3) == 0)
1297 kmemleak_disable();
1298 else if (strncmp(buf, "stack=on", 8) == 0)
1299 kmemleak_stack_scan = 1;
1300 else if (strncmp(buf, "stack=off", 9) == 0)
1301 kmemleak_stack_scan = 0;
1302 else if (strncmp(buf, "scan=on", 7) == 0)
1303 start_scan_thread();
1304 else if (strncmp(buf, "scan=off", 8) == 0)
1305 stop_scan_thread();
1306 else if (strncmp(buf, "scan=", 5) == 0) {
1307 unsigned long secs;
1308 int err;
1309
1310 err = strict_strtoul(buf + 5, 0, &secs);
1311 if (err < 0)
1312 return err;
1313 stop_scan_thread();
1314 if (secs) {
1315 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
1316 start_scan_thread();
1317 }
1318 } else
1319 return -EINVAL;
1320
1321 /* ignore the rest of the buffer, only one command at a time */
1322 *ppos += size;
1323 return size;
1324}
1325
/* file operations backing the /sys/kernel/debug/kmemleak entry */
static const struct file_operations kmemleak_fops = {
	.owner		= THIS_MODULE,
	.open		= kmemleak_open,
	.read		= seq_read,
	.write		= kmemleak_write,
	.llseek		= seq_lseek,
	.release	= kmemleak_release,
};
1334
1335/*
1336 * Perform the freeing of the kmemleak internal objects after waiting for any
1337 * current memory scan to complete.
1338 */
static int kmemleak_cleanup_thread(void *arg)
{
	struct kmemleak_object *object;

	/* make sure no scan is running before tearing the objects down */
	mutex_lock(&kmemleak_mutex);
	stop_scan_thread();
	mutex_unlock(&kmemleak_mutex);

	/* scan_mutex also serializes against readers of the debugfs file */
	mutex_lock(&scan_mutex);
	rcu_read_lock();
	list_for_each_entry_rcu(object, &object_list, object_list)
		delete_object(object->pointer);
	rcu_read_unlock();
	mutex_unlock(&scan_mutex);

	return 0;
}
1356
1357/*
1358 * Start the clean-up thread.
1359 */
1360static void kmemleak_cleanup(void)
1361{
1362 struct task_struct *cleanup_thread;
1363
1364 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1365 "kmemleak-clean");
1366 if (IS_ERR(cleanup_thread))
1367 pr_warning("Failed to create the clean-up thread\n");
1368}
1369
1370/*
1371 * Disable kmemleak. No memory allocation/freeing will be traced once this
1372 * function is called. Disabling kmemleak is an irreversible operation.
1373 */
static void kmemleak_disable(void)
{
	/* atomically check whether it was already invoked */
	if (atomic_cmpxchg(&kmemleak_error, 0, 1))
		return;

	/* stop any memory operation tracing */
	atomic_set(&kmemleak_early_log, 0);
	atomic_set(&kmemleak_enabled, 0);

	/*
	 * Check whether it is too early for a kernel thread; before
	 * kmemleak_initialized is set, kmemleak_late_init() performs the
	 * clean-up instead.
	 */
	if (atomic_read(&kmemleak_initialized))
		kmemleak_cleanup();

	pr_info("Kernel memory leak detector disabled\n");
}
1390
1391/*
1392 * Allow boot-time kmemleak disabling (enabled by default).
1393 */
1394static int kmemleak_boot_config(char *str)
1395{
1396 if (!str)
1397 return -EINVAL;
1398 if (strcmp(str, "off") == 0)
1399 kmemleak_disable();
1400 else if (strcmp(str, "on") != 0)
1401 return -EINVAL;
1402 return 0;
1403}
1404early_param("kmemleak", kmemleak_boot_config);
1405
1406/*
1407 * Kmemleak initialization.
1408 */
void __init kmemleak_init(void)
{
	int i;
	unsigned long flags;

	/* convert the configured millisecond/second intervals to jiffies */
	jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);

	/* SLAB_NOLEAKTRACE keeps kmemleak from tracking its own metadata */
	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
	INIT_PRIO_TREE_ROOT(&object_tree_root);

	/* the kernel is still in UP mode, so disabling the IRQs is enough */
	local_irq_save(flags);
	if (!atomic_read(&kmemleak_error)) {
		atomic_set(&kmemleak_enabled, 1);
		atomic_set(&kmemleak_early_log, 0);
	}
	local_irq_restore(flags);

	/*
	 * This is the point where tracking allocations is safe. Automatic
	 * scanning is started during the late initcall. Add the early logged
	 * callbacks to the kmemleak infrastructure.
	 */
	for (i = 0; i < crt_early_log; i++) {
		struct early_log *log = &early_log[i];

		/* replay each recorded operation through the normal API */
		switch (log->op_type) {
		case KMEMLEAK_ALLOC:
			kmemleak_alloc(log->ptr, log->size, log->min_count,
				       GFP_KERNEL);
			break;
		case KMEMLEAK_FREE:
			kmemleak_free(log->ptr);
			break;
		case KMEMLEAK_NOT_LEAK:
			kmemleak_not_leak(log->ptr);
			break;
		case KMEMLEAK_IGNORE:
			kmemleak_ignore(log->ptr);
			break;
		case KMEMLEAK_SCAN_AREA:
			kmemleak_scan_area(log->ptr, log->offset, log->length,
					   GFP_KERNEL);
			break;
		case KMEMLEAK_NO_SCAN:
			kmemleak_no_scan(log->ptr);
			break;
		default:
			WARN_ON(1);
		}
	}
}
1464
1465/*
1466 * Late initialization function.
1467 */
static int __init kmemleak_late_init(void)
{
	struct dentry *dentry;

	atomic_set(&kmemleak_initialized, 1);

	if (atomic_read(&kmemleak_error)) {
		/*
		 * Some error occured and kmemleak was disabled. There is a
		 * small chance that kmemleak_disable() was called immediately
		 * after setting kmemleak_initialized and we may end up with
		 * two clean-up threads but serialized by scan_mutex.
		 */
		kmemleak_cleanup();
		return -ENOMEM;
	}

	/* expose the report/control interface under debugfs */
	dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
				     &kmemleak_fops);
	if (!dentry)
		pr_warning("Failed to create the debugfs kmemleak file\n");
	/* kmemleak_mutex protects scan_thread; see start_scan_thread() */
	mutex_lock(&kmemleak_mutex);
	start_scan_thread();
	mutex_unlock(&kmemleak_mutex);

	pr_info("Kernel memory leak detector initialized\n");

	return 0;
}
late_initcall(kmemleak_late_init);
diff --git a/mm/maccess.c b/mm/maccess.c
index ac40796cfb15..9073695ff25f 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 39 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 40 * happens, handle that and return -EFAULT.
41 */ 41 */
42long probe_kernel_write(void *dst, void *src, size_t size) 42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
43{ 43{
44 long ret; 44 long ret;
45 mm_segment_t old_fs = get_fs(); 45 mm_segment_t old_fs = get_fs();
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..76eb4193acdd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
123 end = vma->vm_end; 123 end = vma->vm_end;
124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
125 125
126 force_page_cache_readahead(file->f_mapping, 126 force_page_cache_readahead(file->f_mapping, file, start, end - start);
127 file, start, max_sane_readahead(end - start));
128 return 0; 127 return 0;
129} 128}
130 129
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
239 break; 238 break;
240 239
241 default: 240 default:
242 error = -EINVAL; 241 BUG();
243 break; 242 break;
244 } 243 }
245 return error; 244 return error;
246} 245}
247 246
247static int
248madvise_behavior_valid(int behavior)
249{
250 switch (behavior) {
251 case MADV_DOFORK:
252 case MADV_DONTFORK:
253 case MADV_NORMAL:
254 case MADV_SEQUENTIAL:
255 case MADV_RANDOM:
256 case MADV_REMOVE:
257 case MADV_WILLNEED:
258 case MADV_DONTNEED:
259 return 1;
260
261 default:
262 return 0;
263 }
264}
248/* 265/*
249 * The madvise(2) system call. 266 * The madvise(2) system call.
250 * 267 *
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
290 int write; 307 int write;
291 size_t len; 308 size_t len;
292 309
310 if (!madvise_behavior_valid(behavior))
311 return error;
312
293 write = madvise_need_mmap_write(behavior); 313 write = madvise_need_mmap_write(behavior);
294 if (write) 314 if (write)
295 down_write(&current->mm->mmap_sem); 315 down_write(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78eb8552818b..e2fa20dadf40 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,7 +45,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 45#define MEM_CGROUP_RECLAIM_RETRIES 5
46 46
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ 48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
49int do_swap_account __read_mostly; 49int do_swap_account __read_mostly;
50static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 50static int really_do_swap_account __initdata = 1; /* for remember boot option*/
51#else 51#else
@@ -62,7 +62,8 @@ enum mem_cgroup_stat_index {
62 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 62 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
63 */ 63 */
64 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 64 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
65 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ 65 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
66 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
67 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
68 69
@@ -176,6 +177,9 @@ struct mem_cgroup {
176 177
177 unsigned int swappiness; 178 unsigned int swappiness;
178 179
180 /* set when res.limit == memsw.limit */
181 bool memsw_is_minimum;
182
179 /* 183 /*
180 * statistics. This must be placed at the end of memcg. 184 * statistics. This must be placed at the end of memcg.
181 */ 185 */
@@ -188,6 +192,7 @@ enum charge_type {
188 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 192 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
189 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 193 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
190 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 194 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
195 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
191 NR_CHARGE_TYPE, 196 NR_CHARGE_TYPE,
192}; 197};
193 198
@@ -570,6 +575,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
570 return 0; 575 return 0;
571} 576}
572 577
578int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
579{
580 unsigned long active;
581 unsigned long inactive;
582
583 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
584 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
585
586 return (active > inactive);
587}
588
573unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 589unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
574 struct zone *zone, 590 struct zone *zone,
575 enum lru_list lru) 591 enum lru_list lru)
@@ -633,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
633 int zid = zone_idx(z); 649 int zid = zone_idx(z);
634 struct mem_cgroup_per_zone *mz; 650 struct mem_cgroup_per_zone *mz;
635 int lru = LRU_FILE * !!file + !!active; 651 int lru = LRU_FILE * !!file + !!active;
652 int ret;
636 653
637 BUG_ON(!mem_cont); 654 BUG_ON(!mem_cont);
638 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 655 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
@@ -650,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
650 continue; 667 continue;
651 668
652 scan++; 669 scan++;
653 if (__isolate_lru_page(page, mode, file) == 0) { 670 ret = __isolate_lru_page(page, mode, file);
671 switch (ret) {
672 case 0:
654 list_move(&page->lru, dst); 673 list_move(&page->lru, dst);
674 mem_cgroup_del_lru(page);
655 nr_taken++; 675 nr_taken++;
676 break;
677 case -EBUSY:
678 /* we don't affect global LRU but rotate in our LRU */
679 mem_cgroup_rotate_lru_list(page, page_lru(page));
680 break;
681 default:
682 break;
656 } 683 }
657 } 684 }
658 685
@@ -834,6 +861,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
834 int ret, total = 0; 861 int ret, total = 0;
835 int loop = 0; 862 int loop = 0;
836 863
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum)
866 noswap = true;
867
837 while (loop < 2) { 868 while (loop < 2) {
838 victim = mem_cgroup_select_victim(root_mem); 869 victim = mem_cgroup_select_victim(root_mem);
839 if (victim == root_mem) 870 if (victim == root_mem)
@@ -889,6 +920,44 @@ static void record_last_oom(struct mem_cgroup *mem)
889 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 920 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
890} 921}
891 922
923/*
924 * Currently used to update mapped file statistics, but the routine can be
925 * generalized to update other statistics as well.
926 */
927void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
928{
929 struct mem_cgroup *mem;
930 struct mem_cgroup_stat *stat;
931 struct mem_cgroup_stat_cpu *cpustat;
932 int cpu;
933 struct page_cgroup *pc;
934
935 if (!page_is_file_cache(page))
936 return;
937
938 pc = lookup_page_cgroup(page);
939 if (unlikely(!pc))
940 return;
941
942 lock_page_cgroup(pc);
943 mem = pc->mem_cgroup;
944 if (!mem)
945 goto done;
946
947 if (!PageCgroupUsed(pc))
948 goto done;
949
950 /*
951 * Preemption is already disabled, we don't need get_cpu()
952 */
953 cpu = smp_processor_id();
954 stat = &mem->stat;
955 cpustat = &stat->cpustat[cpu];
956
957 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
958done:
959 unlock_page_cgroup(pc);
960}
892 961
893/* 962/*
894 * Unlike exported interface, "oom" parameter is added. if oom==true, 963 * Unlike exported interface, "oom" parameter is added. if oom==true,
@@ -1087,6 +1156,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1087 struct mem_cgroup_per_zone *from_mz, *to_mz; 1156 struct mem_cgroup_per_zone *from_mz, *to_mz;
1088 int nid, zid; 1157 int nid, zid;
1089 int ret = -EBUSY; 1158 int ret = -EBUSY;
1159 struct page *page;
1160 int cpu;
1161 struct mem_cgroup_stat *stat;
1162 struct mem_cgroup_stat_cpu *cpustat;
1090 1163
1091 VM_BUG_ON(from == to); 1164 VM_BUG_ON(from == to);
1092 VM_BUG_ON(PageLRU(pc->page)); 1165 VM_BUG_ON(PageLRU(pc->page));
@@ -1107,6 +1180,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1107 1180
1108 res_counter_uncharge(&from->res, PAGE_SIZE); 1181 res_counter_uncharge(&from->res, PAGE_SIZE);
1109 mem_cgroup_charge_statistics(from, pc, false); 1182 mem_cgroup_charge_statistics(from, pc, false);
1183
1184 page = pc->page;
1185 if (page_is_file_cache(page) && page_mapped(page)) {
1186 cpu = smp_processor_id();
1187 /* Update mapped_file data for mem_cgroup "from" */
1188 stat = &from->stat;
1189 cpustat = &stat->cpustat[cpu];
1190 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1191 -1);
1192
1193 /* Update mapped_file data for mem_cgroup "to" */
1194 stat = &to->stat;
1195 cpustat = &stat->cpustat[cpu];
1196 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1197 1);
1198 }
1199
1110 if (do_swap_account) 1200 if (do_swap_account)
1111 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1201 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1112 css_put(&from->css); 1202 css_put(&from->css);
@@ -1422,6 +1512,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1422 1512
1423 switch (ctype) { 1513 switch (ctype) {
1424 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1514 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1515 case MEM_CGROUP_CHARGE_TYPE_DROP:
1425 if (page_mapped(page)) 1516 if (page_mapped(page))
1426 goto unlock_out; 1517 goto unlock_out;
1427 break; 1518 break;
@@ -1485,18 +1576,23 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1485 * called after __delete_from_swap_cache() and drop "page" account. 1576 * called after __delete_from_swap_cache() and drop "page" account.
1486 * memcg information is recorded to swap_cgroup of "ent" 1577 * memcg information is recorded to swap_cgroup of "ent"
1487 */ 1578 */
1488void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) 1579void
1580mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
1489{ 1581{
1490 struct mem_cgroup *memcg; 1582 struct mem_cgroup *memcg;
1583 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
1584
1585 if (!swapout) /* this was a swap cache but the swap is unused ! */
1586 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
1587
1588 memcg = __mem_cgroup_uncharge_common(page, ctype);
1491 1589
1492 memcg = __mem_cgroup_uncharge_common(page,
1493 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1494 /* record memcg information */ 1590 /* record memcg information */
1495 if (do_swap_account && memcg) { 1591 if (do_swap_account && swapout && memcg) {
1496 swap_cgroup_record(ent, css_id(&memcg->css)); 1592 swap_cgroup_record(ent, css_id(&memcg->css));
1497 mem_cgroup_get(memcg); 1593 mem_cgroup_get(memcg);
1498 } 1594 }
1499 if (memcg) 1595 if (swapout && memcg)
1500 css_put(&memcg->css); 1596 css_put(&memcg->css);
1501} 1597}
1502#endif 1598#endif
@@ -1674,6 +1770,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1674 break; 1770 break;
1675 } 1771 }
1676 ret = res_counter_set_limit(&memcg->res, val); 1772 ret = res_counter_set_limit(&memcg->res, val);
1773 if (!ret) {
1774 if (memswlimit == val)
1775 memcg->memsw_is_minimum = true;
1776 else
1777 memcg->memsw_is_minimum = false;
1778 }
1677 mutex_unlock(&set_limit_mutex); 1779 mutex_unlock(&set_limit_mutex);
1678 1780
1679 if (!ret) 1781 if (!ret)
@@ -1692,16 +1794,14 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1692 return ret; 1794 return ret;
1693} 1795}
1694 1796
1695int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1797static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1696 unsigned long long val) 1798 unsigned long long val)
1697{ 1799{
1698 int retry_count; 1800 int retry_count;
1699 u64 memlimit, oldusage, curusage; 1801 u64 memlimit, oldusage, curusage;
1700 int children = mem_cgroup_count_children(memcg); 1802 int children = mem_cgroup_count_children(memcg);
1701 int ret = -EBUSY; 1803 int ret = -EBUSY;
1702 1804
1703 if (!do_swap_account)
1704 return -EINVAL;
1705 /* see mem_cgroup_resize_res_limit */ 1805 /* see mem_cgroup_resize_res_limit */
1706 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 1806 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1707 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1807 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -1723,6 +1823,12 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1723 break; 1823 break;
1724 } 1824 }
1725 ret = res_counter_set_limit(&memcg->memsw, val); 1825 ret = res_counter_set_limit(&memcg->memsw, val);
1826 if (!ret) {
1827 if (memlimit == val)
1828 memcg->memsw_is_minimum = true;
1829 else
1830 memcg->memsw_is_minimum = false;
1831 }
1726 mutex_unlock(&set_limit_mutex); 1832 mutex_unlock(&set_limit_mutex);
1727 1833
1728 if (!ret) 1834 if (!ret)
@@ -1936,8 +2042,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1936 val = res_counter_read_u64(&mem->res, name); 2042 val = res_counter_read_u64(&mem->res, name);
1937 break; 2043 break;
1938 case _MEMSWAP: 2044 case _MEMSWAP:
1939 if (do_swap_account) 2045 val = res_counter_read_u64(&mem->memsw, name);
1940 val = res_counter_read_u64(&mem->memsw, name);
1941 break; 2046 break;
1942 default: 2047 default:
1943 BUG(); 2048 BUG();
@@ -2035,6 +2140,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2035enum { 2140enum {
2036 MCS_CACHE, 2141 MCS_CACHE,
2037 MCS_RSS, 2142 MCS_RSS,
2143 MCS_MAPPED_FILE,
2038 MCS_PGPGIN, 2144 MCS_PGPGIN,
2039 MCS_PGPGOUT, 2145 MCS_PGPGOUT,
2040 MCS_INACTIVE_ANON, 2146 MCS_INACTIVE_ANON,
@@ -2055,6 +2161,7 @@ struct {
2055} memcg_stat_strings[NR_MCS_STAT] = { 2161} memcg_stat_strings[NR_MCS_STAT] = {
2056 {"cache", "total_cache"}, 2162 {"cache", "total_cache"},
2057 {"rss", "total_rss"}, 2163 {"rss", "total_rss"},
2164 {"mapped_file", "total_mapped_file"},
2058 {"pgpgin", "total_pgpgin"}, 2165 {"pgpgin", "total_pgpgin"},
2059 {"pgpgout", "total_pgpgout"}, 2166 {"pgpgout", "total_pgpgout"},
2060 {"inactive_anon", "total_inactive_anon"}, 2167 {"inactive_anon", "total_inactive_anon"},
@@ -2075,6 +2182,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2075 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2182 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2076 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2183 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2077 s->stat[MCS_RSS] += val * PAGE_SIZE; 2184 s->stat[MCS_RSS] += val * PAGE_SIZE;
2185 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
2186 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
2078 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2187 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2079 s->stat[MCS_PGPGIN] += val; 2188 s->stat[MCS_PGPGIN] += val;
2080 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2189 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
diff --git a/mm/memory.c b/mm/memory.c
index 4126dd16778c..f46ac18ba231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1310,8 +1310,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1310 cond_resched(); 1310 cond_resched();
1311 while (!(page = follow_page(vma, start, foll_flags))) { 1311 while (!(page = follow_page(vma, start, foll_flags))) {
1312 int ret; 1312 int ret;
1313
1313 ret = handle_mm_fault(mm, vma, start, 1314 ret = handle_mm_fault(mm, vma, start,
1314 foll_flags & FOLL_WRITE); 1315 (foll_flags & FOLL_WRITE) ?
1316 FAULT_FLAG_WRITE : 0);
1317
1315 if (ret & VM_FAULT_ERROR) { 1318 if (ret & VM_FAULT_ERROR) {
1316 if (ret & VM_FAULT_OOM) 1319 if (ret & VM_FAULT_OOM)
1317 return i ? i : -ENOMEM; 1320 return i ? i : -ENOMEM;
@@ -1360,6 +1363,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1360 return i; 1363 return i;
1361} 1364}
1362 1365
1366/**
1367 * get_user_pages() - pin user pages in memory
1368 * @tsk: task_struct of target task
1369 * @mm: mm_struct of target mm
1370 * @start: starting user address
1371 * @len: number of pages from start to pin
1372 * @write: whether pages will be written to by the caller
1373 * @force: whether to force write access even if user mapping is
1374 * readonly. This will result in the page being COWed even
1375 * in MAP_SHARED mappings. You do not want this.
1376 * @pages: array that receives pointers to the pages pinned.
1377 * Should be at least nr_pages long. Or NULL, if caller
1378 * only intends to ensure the pages are faulted in.
1379 * @vmas: array of pointers to vmas corresponding to each page.
1380 * Or NULL if the caller does not require them.
1381 *
1382 * Returns number of pages pinned. This may be fewer than the number
1383 * requested. If len is 0 or negative, returns 0. If no pages
1384 * were pinned, returns -errno. Each page returned must be released
1385 * with a put_page() call when it is finished with. vmas will only
1386 * remain valid while mmap_sem is held.
1387 *
1388 * Must be called with mmap_sem held for read or write.
1389 *
1390 * get_user_pages walks a process's page tables and takes a reference to
1391 * each struct page that each user address corresponds to at a given
1392 * instant. That is, it takes the page that would be accessed if a user
1393 * thread accesses the given user virtual address at that instant.
1394 *
1395 * This does not guarantee that the page exists in the user mappings when
1396 * get_user_pages returns, and there may even be a completely different
1397 * page there in some cases (eg. if mmapped pagecache has been invalidated
1398 * and subsequently re faulted). However it does guarantee that the page
1399 * won't be freed completely. And mostly callers simply care that the page
1400 * contains data that was valid *at some point in time*. Typically, an IO
1401 * or similar operation cannot guarantee anything stronger anyway because
1402 * locks can't be held over the syscall boundary.
1403 *
1404 * If write=0, the page must not be written to. If the page is written to,
1405 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
1406 * after the page is finished with, and before put_page is called.
1407 *
1408 * get_user_pages is typically used for fewer-copy IO operations, to get a
1409 * handle on the memory by some means other than accesses via the user virtual
1410 * addresses. The pages may be submitted for DMA to devices or accessed via
1411 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1412 * use the correct cache flushing APIs.
1413 *
1414 * See also get_user_pages_fast, for performance critical applications.
1415 */
1363int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1416int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1364 unsigned long start, int len, int write, int force, 1417 unsigned long start, int len, int write, int force,
1365 struct page **pages, struct vm_area_struct **vmas) 1418 struct page **pages, struct vm_area_struct **vmas)
@@ -2446,7 +2499,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2446 */ 2499 */
2447static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2500static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2448 unsigned long address, pte_t *page_table, pmd_t *pmd, 2501 unsigned long address, pte_t *page_table, pmd_t *pmd,
2449 int write_access, pte_t orig_pte) 2502 unsigned int flags, pte_t orig_pte)
2450{ 2503{
2451 spinlock_t *ptl; 2504 spinlock_t *ptl;
2452 struct page *page; 2505 struct page *page;
@@ -2466,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2466 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2519 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2467 page = lookup_swap_cache(entry); 2520 page = lookup_swap_cache(entry);
2468 if (!page) { 2521 if (!page) {
2469 grab_swap_token(); /* Contend for token _before_ read-in */ 2522 grab_swap_token(mm); /* Contend for token _before_ read-in */
2470 page = swapin_readahead(entry, 2523 page = swapin_readahead(entry,
2471 GFP_HIGHUSER_MOVABLE, vma, address); 2524 GFP_HIGHUSER_MOVABLE, vma, address);
2472 if (!page) { 2525 if (!page) {
@@ -2522,9 +2575,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2522 2575
2523 inc_mm_counter(mm, anon_rss); 2576 inc_mm_counter(mm, anon_rss);
2524 pte = mk_pte(page, vma->vm_page_prot); 2577 pte = mk_pte(page, vma->vm_page_prot);
2525 if (write_access && reuse_swap_page(page)) { 2578 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2526 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2579 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2527 write_access = 0; 2580 flags &= ~FAULT_FLAG_WRITE;
2528 } 2581 }
2529 flush_icache_page(vma, page); 2582 flush_icache_page(vma, page);
2530 set_pte_at(mm, address, page_table, pte); 2583 set_pte_at(mm, address, page_table, pte);
@@ -2537,7 +2590,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2537 try_to_free_swap(page); 2590 try_to_free_swap(page);
2538 unlock_page(page); 2591 unlock_page(page);
2539 2592
2540 if (write_access) { 2593 if (flags & FAULT_FLAG_WRITE) {
2541 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2594 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2542 if (ret & VM_FAULT_ERROR) 2595 if (ret & VM_FAULT_ERROR)
2543 ret &= VM_FAULT_ERROR; 2596 ret &= VM_FAULT_ERROR;
@@ -2566,7 +2619,7 @@ out_page:
2566 */ 2619 */
2567static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2620static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2568 unsigned long address, pte_t *page_table, pmd_t *pmd, 2621 unsigned long address, pte_t *page_table, pmd_t *pmd,
2569 int write_access) 2622 unsigned int flags)
2570{ 2623{
2571 struct page *page; 2624 struct page *page;
2572 spinlock_t *ptl; 2625 spinlock_t *ptl;
@@ -2726,7 +2779,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2726 * due to the bad i386 page protection. But it's valid 2779 * due to the bad i386 page protection. But it's valid
2727 * for other architectures too. 2780 * for other architectures too.
2728 * 2781 *
2729 * Note that if write_access is true, we either now have 2782 * Note that if FAULT_FLAG_WRITE is set, we either now have
2730 * an exclusive copy of the page, or this is a shared mapping, 2783 * an exclusive copy of the page, or this is a shared mapping,
2731 * so we can make it writable and dirty to avoid having to 2784 * so we can make it writable and dirty to avoid having to
2732 * handle that later. 2785 * handle that later.
@@ -2797,11 +2850,10 @@ unwritable_page:
2797 2850
2798static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2851static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2799 unsigned long address, pte_t *page_table, pmd_t *pmd, 2852 unsigned long address, pte_t *page_table, pmd_t *pmd,
2800 int write_access, pte_t orig_pte) 2853 unsigned int flags, pte_t orig_pte)
2801{ 2854{
2802 pgoff_t pgoff = (((address & PAGE_MASK) 2855 pgoff_t pgoff = (((address & PAGE_MASK)
2803 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2856 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2804 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2805 2857
2806 pte_unmap(page_table); 2858 pte_unmap(page_table);
2807 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2859 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
@@ -2818,12 +2870,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2818 */ 2870 */
2819static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2871static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2820 unsigned long address, pte_t *page_table, pmd_t *pmd, 2872 unsigned long address, pte_t *page_table, pmd_t *pmd,
2821 int write_access, pte_t orig_pte) 2873 unsigned int flags, pte_t orig_pte)
2822{ 2874{
2823 unsigned int flags = FAULT_FLAG_NONLINEAR |
2824 (write_access ? FAULT_FLAG_WRITE : 0);
2825 pgoff_t pgoff; 2875 pgoff_t pgoff;
2826 2876
2877 flags |= FAULT_FLAG_NONLINEAR;
2878
2827 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2879 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2828 return 0; 2880 return 0;
2829 2881
@@ -2854,7 +2906,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2854 */ 2906 */
2855static inline int handle_pte_fault(struct mm_struct *mm, 2907static inline int handle_pte_fault(struct mm_struct *mm,
2856 struct vm_area_struct *vma, unsigned long address, 2908 struct vm_area_struct *vma, unsigned long address,
2857 pte_t *pte, pmd_t *pmd, int write_access) 2909 pte_t *pte, pmd_t *pmd, unsigned int flags)
2858{ 2910{
2859 pte_t entry; 2911 pte_t entry;
2860 spinlock_t *ptl; 2912 spinlock_t *ptl;
@@ -2865,30 +2917,30 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2865 if (vma->vm_ops) { 2917 if (vma->vm_ops) {
2866 if (likely(vma->vm_ops->fault)) 2918 if (likely(vma->vm_ops->fault))
2867 return do_linear_fault(mm, vma, address, 2919 return do_linear_fault(mm, vma, address,
2868 pte, pmd, write_access, entry); 2920 pte, pmd, flags, entry);
2869 } 2921 }
2870 return do_anonymous_page(mm, vma, address, 2922 return do_anonymous_page(mm, vma, address,
2871 pte, pmd, write_access); 2923 pte, pmd, flags);
2872 } 2924 }
2873 if (pte_file(entry)) 2925 if (pte_file(entry))
2874 return do_nonlinear_fault(mm, vma, address, 2926 return do_nonlinear_fault(mm, vma, address,
2875 pte, pmd, write_access, entry); 2927 pte, pmd, flags, entry);
2876 return do_swap_page(mm, vma, address, 2928 return do_swap_page(mm, vma, address,
2877 pte, pmd, write_access, entry); 2929 pte, pmd, flags, entry);
2878 } 2930 }
2879 2931
2880 ptl = pte_lockptr(mm, pmd); 2932 ptl = pte_lockptr(mm, pmd);
2881 spin_lock(ptl); 2933 spin_lock(ptl);
2882 if (unlikely(!pte_same(*pte, entry))) 2934 if (unlikely(!pte_same(*pte, entry)))
2883 goto unlock; 2935 goto unlock;
2884 if (write_access) { 2936 if (flags & FAULT_FLAG_WRITE) {
2885 if (!pte_write(entry)) 2937 if (!pte_write(entry))
2886 return do_wp_page(mm, vma, address, 2938 return do_wp_page(mm, vma, address,
2887 pte, pmd, ptl, entry); 2939 pte, pmd, ptl, entry);
2888 entry = pte_mkdirty(entry); 2940 entry = pte_mkdirty(entry);
2889 } 2941 }
2890 entry = pte_mkyoung(entry); 2942 entry = pte_mkyoung(entry);
2891 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { 2943 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2892 update_mmu_cache(vma, address, entry); 2944 update_mmu_cache(vma, address, entry);
2893 } else { 2945 } else {
2894 /* 2946 /*
@@ -2897,7 +2949,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2897 * This still avoids useless tlb flushes for .text page faults 2949 * This still avoids useless tlb flushes for .text page faults
2898 * with threads. 2950 * with threads.
2899 */ 2951 */
2900 if (write_access) 2952 if (flags & FAULT_FLAG_WRITE)
2901 flush_tlb_page(vma, address); 2953 flush_tlb_page(vma, address);
2902 } 2954 }
2903unlock: 2955unlock:
@@ -2909,7 +2961,7 @@ unlock:
2909 * By the time we get here, we already hold the mm semaphore 2961 * By the time we get here, we already hold the mm semaphore
2910 */ 2962 */
2911int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2963int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2912 unsigned long address, int write_access) 2964 unsigned long address, unsigned int flags)
2913{ 2965{
2914 pgd_t *pgd; 2966 pgd_t *pgd;
2915 pud_t *pud; 2967 pud_t *pud;
@@ -2921,7 +2973,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2921 count_vm_event(PGFAULT); 2973 count_vm_event(PGFAULT);
2922 2974
2923 if (unlikely(is_vm_hugetlb_page(vma))) 2975 if (unlikely(is_vm_hugetlb_page(vma)))
2924 return hugetlb_fault(mm, vma, address, write_access); 2976 return hugetlb_fault(mm, vma, address, flags);
2925 2977
2926 pgd = pgd_offset(mm, address); 2978 pgd = pgd_offset(mm, address);
2927 pud = pud_alloc(mm, pgd, address); 2979 pud = pud_alloc(mm, pgd, address);
@@ -2934,7 +2986,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2934 if (!pte) 2986 if (!pte)
2935 return VM_FAULT_OOM; 2987 return VM_FAULT_OOM;
2936 2988
2937 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2989 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
2938} 2990}
2939 2991
2940#ifndef __PAGETABLE_PUD_FOLDED 2992#ifndef __PAGETABLE_PUD_FOLDED
@@ -3053,22 +3105,13 @@ int in_gate_area_no_task(unsigned long addr)
3053 3105
3054#endif /* __HAVE_ARCH_GATE_AREA */ 3106#endif /* __HAVE_ARCH_GATE_AREA */
3055 3107
3056#ifdef CONFIG_HAVE_IOREMAP_PROT 3108static int follow_pte(struct mm_struct *mm, unsigned long address,
3057int follow_phys(struct vm_area_struct *vma, 3109 pte_t **ptepp, spinlock_t **ptlp)
3058 unsigned long address, unsigned int flags,
3059 unsigned long *prot, resource_size_t *phys)
3060{ 3110{
3061 pgd_t *pgd; 3111 pgd_t *pgd;
3062 pud_t *pud; 3112 pud_t *pud;
3063 pmd_t *pmd; 3113 pmd_t *pmd;
3064 pte_t *ptep, pte; 3114 pte_t *ptep;
3065 spinlock_t *ptl;
3066 resource_size_t phys_addr = 0;
3067 struct mm_struct *mm = vma->vm_mm;
3068 int ret = -EINVAL;
3069
3070 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3071 goto out;
3072 3115
3073 pgd = pgd_offset(mm, address); 3116 pgd = pgd_offset(mm, address);
3074 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3117 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3086,22 +3129,71 @@ int follow_phys(struct vm_area_struct *vma,
3086 if (pmd_huge(*pmd)) 3129 if (pmd_huge(*pmd))
3087 goto out; 3130 goto out;
3088 3131
3089 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 3132 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3090 if (!ptep) 3133 if (!ptep)
3091 goto out; 3134 goto out;
3135 if (!pte_present(*ptep))
3136 goto unlock;
3137 *ptepp = ptep;
3138 return 0;
3139unlock:
3140 pte_unmap_unlock(ptep, *ptlp);
3141out:
3142 return -EINVAL;
3143}
3092 3144
3145/**
3146 * follow_pfn - look up PFN at a user virtual address
3147 * @vma: memory mapping
3148 * @address: user virtual address
3149 * @pfn: location to store found PFN
3150 *
3151 * Only IO mappings and raw PFN mappings are allowed.
3152 *
3153 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3154 */
3155int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3156 unsigned long *pfn)
3157{
3158 int ret = -EINVAL;
3159 spinlock_t *ptl;
3160 pte_t *ptep;
3161
3162 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3163 return ret;
3164
3165 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3166 if (ret)
3167 return ret;
3168 *pfn = pte_pfn(*ptep);
3169 pte_unmap_unlock(ptep, ptl);
3170 return 0;
3171}
3172EXPORT_SYMBOL(follow_pfn);
3173
3174#ifdef CONFIG_HAVE_IOREMAP_PROT
3175int follow_phys(struct vm_area_struct *vma,
3176 unsigned long address, unsigned int flags,
3177 unsigned long *prot, resource_size_t *phys)
3178{
3179 int ret = -EINVAL;
3180 pte_t *ptep, pte;
3181 spinlock_t *ptl;
3182
3183 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3184 goto out;
3185
3186 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3187 goto out;
3093 pte = *ptep; 3188 pte = *ptep;
3094 if (!pte_present(pte)) 3189
3095 goto unlock;
3096 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3190 if ((flags & FOLL_WRITE) && !pte_write(pte))
3097 goto unlock; 3191 goto unlock;
3098 phys_addr = pte_pfn(pte);
3099 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
3100 3192
3101 *prot = pgprot_val(pte_pgprot(pte)); 3193 *prot = pgprot_val(pte_pgprot(pte));
3102 *phys = phys_addr; 3194 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3103 ret = 0;
3104 3195
3196 ret = 0;
3105unlock: 3197unlock:
3106 pte_unmap_unlock(ptep, ptl); 3198 pte_unmap_unlock(ptep, ptl);
3107out: 3199out:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c083cf5fd6df..e4412a676c88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 422 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 423 zone->zone_pgdat->node_present_pages += onlined_pages;
424 424
425 setup_per_zone_pages_min(); 425 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone);
426 if (onlined_pages) { 427 if (onlined_pages) {
427 kswapd_run(zone_to_nid(zone)); 428 kswapd_run(zone_to_nid(zone));
428 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 429 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
832 totalram_pages -= offlined_pages; 833 totalram_pages -= offlined_pages;
833 num_physpages -= offlined_pages; 834 num_physpages -= offlined_pages;
834 835
836 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone);
838
835 vm_total_pages = nr_free_pagecache_pages(); 839 vm_total_pages = nr_free_pagecache_pages();
836 writeback_set_ratelimit(); 840 writeback_set_ratelimit();
837 841
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..e08e2c4da63a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
182 return 0; 182 return 0;
183} 183}
184 184
185/* Create a new policy */ 185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy. mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags. But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
195{
196 nodemask_t cpuset_context_nmask;
197 int ret;
198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL)
201 return 0;
202
203 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */
206 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
209 &cpuset_current_mems_allowed);
210 else
211 nodes_and(cpuset_context_nmask, *nodes,
212 cpuset_current_mems_allowed);
213 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes;
215 else
216 pol->w.cpuset_mems_allowed =
217 cpuset_current_mems_allowed;
218 }
219
220 ret = mpol_ops[pol->mode].create(pol,
221 nodes ? &cpuset_context_nmask : NULL);
222 return ret;
223}
224
225/*
226 * This function just creates a new policy, does some check and simple
227 * initialization. You must invoke mpol_set_nodemask() to set nodes.
228 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 229static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes) 230 nodemask_t *nodes)
188{ 231{
189 struct mempolicy *policy; 232 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192 233
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 234 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 235 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
210 if (((flags & MPOL_F_STATIC_NODES) || 251 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES))) 252 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL); 253 return ERR_PTR(-EINVAL);
213 nodes = NULL; /* flag local alloc */
214 } 254 }
215 } else if (nodes_empty(*nodes)) 255 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL); 256 return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
221 policy->mode = mode; 261 policy->mode = mode;
222 policy->flags = flags; 262 policy->flags = flags;
223 263
224 if (nodes) {
225 /*
226 * cpuset related setup doesn't apply to local allocation
227 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy; 264 return policy;
249} 265}
250 266
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
324/* 340/*
325 * Wrapper for mpol_rebind_policy() that just requires task 341 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy. 342 * pointer, and updates task mempolicy.
343 *
344 * Called with task's alloc_lock held.
327 */ 345 */
328 346
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 347void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
600static long do_set_mempolicy(unsigned short mode, unsigned short flags, 618static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes) 619 nodemask_t *nodes)
602{ 620{
603 struct mempolicy *new; 621 struct mempolicy *new, *old;
604 struct mm_struct *mm = current->mm; 622 struct mm_struct *mm = current->mm;
623 int ret;
605 624
606 new = mpol_new(mode, flags, nodes); 625 new = mpol_new(mode, flags, nodes);
607 if (IS_ERR(new)) 626 if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
615 */ 634 */
616 if (mm) 635 if (mm)
617 down_write(&mm->mmap_sem); 636 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy); 637 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes);
639 if (ret) {
640 task_unlock(current);
641 if (mm)
642 up_write(&mm->mmap_sem);
643 mpol_put(new);
644 return ret;
645 }
646 old = current->mempolicy;
619 current->mempolicy = new; 647 current->mempolicy = new;
620 mpol_set_task_struct_flag(); 648 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE && 649 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes)) 650 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes); 651 current->il_next = first_node(new->v.nodes);
652 task_unlock(current);
624 if (mm) 653 if (mm)
625 up_write(&mm->mmap_sem); 654 up_write(&mm->mmap_sem);
626 655
656 mpol_put(old);
627 return 0; 657 return 0;
628} 658}
629 659
630/* 660/*
631 * Return nodemask for policy for get_mempolicy() query 661 * Return nodemask for policy for get_mempolicy() query
662 *
663 * Called with task's alloc_lock held
632 */ 664 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 665static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{ 666{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 struct vm_area_struct *vma = NULL; 706 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy; 707 struct mempolicy *pol = current->mempolicy;
676 708
677 cpuset_update_task_memory_state();
678 if (flags & 709 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 710 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL; 711 return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 714 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL; 715 return -EINVAL;
685 *policy = 0; /* just so it's initialized */ 716 *policy = 0; /* just so it's initialized */
717 task_lock(current);
686 *nmask = cpuset_current_mems_allowed; 718 *nmask = cpuset_current_mems_allowed;
719 task_unlock(current);
687 return 0; 720 return 0;
688 } 721 }
689 722
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
738 } 771 }
739 772
740 err = 0; 773 err = 0;
741 if (nmask) 774 if (nmask) {
775 task_lock(current);
742 get_policy_nodemask(pol, nmask); 776 get_policy_nodemask(pol, nmask);
777 task_unlock(current);
778 }
743 779
744 out: 780 out:
745 mpol_cond_put(pol); 781 mpol_cond_put(pol);
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 803
768static struct page *new_node_page(struct page *page, unsigned long node, int **x) 804static struct page *new_node_page(struct page *page, unsigned long node, int **x)
769{ 805{
770 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); 806 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
771} 807}
772 808
773/* 809/*
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
979 return err; 1015 return err;
980 } 1016 }
981 down_write(&mm->mmap_sem); 1017 down_write(&mm->mmap_sem);
1018 task_lock(current);
1019 err = mpol_set_nodemask(new, nmask);
1020 task_unlock(current);
1021 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new);
1024 return err;
1025 }
982 vma = check_range(mm, start, end, nmask, 1026 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist); 1027 flags | MPOL_MF_INVERT, &pagelist);
984 1028
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1545 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1589 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 struct zonelist *zl; 1590 struct zonelist *zl;
1547 1591
1548 cpuset_update_task_memory_state();
1549
1550 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1592 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 unsigned nid; 1593 unsigned nid;
1552 1594
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1593{ 1635{
1594 struct mempolicy *pol = current->mempolicy; 1636 struct mempolicy *pol = current->mempolicy;
1595 1637
1596 if ((gfp & __GFP_WAIT) && !in_interrupt())
1597 cpuset_update_task_memory_state();
1598 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1638 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 pol = &default_policy; 1639 pol = &default_policy;
1600 1640
@@ -1854,6 +1894,8 @@ restart:
1854 */ 1894 */
1855void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856{ 1896{
1897 int ret;
1898
1857 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 1899 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 spin_lock_init(&sp->lock); 1900 spin_lock_init(&sp->lock);
1859 1901
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1863 1905
1864 /* contextualize the tmpfs mount point mempolicy */ 1906 /* contextualize the tmpfs mount point mempolicy */
1865 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866 mpol_put(mpol); /* drop our ref on sb mpol */ 1908 if (IS_ERR(new)) {
1867 if (IS_ERR(new)) 1909 mpol_put(mpol); /* drop our ref on sb mpol */
1868 return; /* no valid nodemask intersection */ 1910 return; /* no valid nodemask intersection */
1911 }
1912
1913 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
1915 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) {
1918 mpol_put(new);
1919 return;
1920 }
1869 1921
1870 /* Create pseudo-vma that contains just the policy */ 1922 /* Create pseudo-vma that contains just the policy */
1871 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1923 memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2086 new = mpol_new(mode, mode_flags, &nodes); 2138 new = mpol_new(mode, mode_flags, &nodes);
2087 if (IS_ERR(new)) 2139 if (IS_ERR(new))
2088 err = 1; 2140 err = 1;
2089 else if (no_context) 2141 else {
2090 new->w.user_nodemask = nodes; /* save for contextualization */ 2142 int ret;
2143
2144 task_lock(current);
2145 ret = mpol_set_nodemask(new, &nodes);
2146 task_unlock(current);
2147 if (ret)
2148 err = 1;
2149 else if (no_context) {
2150 /* save for contextualization */
2151 new->w.user_nodemask = nodes;
2152 }
2153 }
2091 2154
2092out: 2155out:
2093 /* Restore string for error message */ 2156 /* Restore string for error message */
diff --git a/mm/migrate.c b/mm/migrate.c
index 068655d8f883..939888f9ddab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
802 802
803 *result = &pm->status; 803 *result = &pm->status;
804 804
805 return alloc_pages_node(pm->node, 805 return alloc_pages_exact_node(pm->node,
806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
807} 807}
808 808
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
820 struct page_to_node *pp; 820 struct page_to_node *pp;
821 LIST_HEAD(pagelist); 821 LIST_HEAD(pagelist);
822 822
823 migrate_prep();
824 down_read(&mm->mmap_sem); 823 down_read(&mm->mmap_sem);
825 824
826 /* 825 /*
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 906 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
908 if (!pm) 907 if (!pm)
909 goto out; 908 goto out;
909
910 migrate_prep();
911
910 /* 912 /*
911 * Store a chunk of page_to_node array in a page, 913 * Store a chunk of page_to_node array in a page,
912 * but keep the last one as a marker 914 * but keep the last one as a marker
diff --git a/mm/mlock.c b/mm/mlock.c
index cbe9e0581b75..45eb650b9654 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -31,7 +31,6 @@ int can_do_mlock(void)
31} 31}
32EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
33 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/* 34/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing 35 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate 36 * in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval)
261 return retval; 260 return retval;
262} 261}
263 262
264#else /* CONFIG_UNEVICTABLE_LRU */
265
266/*
267 * Just make pages present if VM_LOCKED. No-op if unlocking.
268 */
269static long __mlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end,
271 int mlock)
272{
273 if (mlock && (vma->vm_flags & VM_LOCKED))
274 return make_pages_present(start, end);
275 return 0;
276}
277
278static inline int __mlock_posix_error_return(long retval)
279{
280 return 0;
281}
282
283#endif /* CONFIG_UNEVICTABLE_LRU */
284
285/** 263/**
286 * mlock_vma_pages_range() - mlock pages in specified vma range. 264 * mlock_vma_pages_range() - mlock pages in specified vma range.
287 * @vma - the vma containing the specfied address range 265 * @vma - the vma containing the specfied address range
@@ -629,52 +607,43 @@ void user_shm_unlock(size_t size, struct user_struct *user)
629 free_uid(user); 607 free_uid(user);
630} 608}
631 609
632void *alloc_locked_buffer(size_t size) 610int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
611 size_t size)
633{ 612{
634 unsigned long rlim, vm, pgsz; 613 unsigned long lim, vm, pgsz;
635 void *buffer = NULL; 614 int error = -ENOMEM;
636 615
637 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 616 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
638 617
639 down_write(&current->mm->mmap_sem); 618 down_write(&mm->mmap_sem);
640 619
641 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 620 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
642 vm = current->mm->total_vm + pgsz; 621 vm = mm->total_vm + pgsz;
643 if (rlim < vm) 622 if (lim < vm)
644 goto out; 623 goto out;
645 624
646 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 625 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
647 vm = current->mm->locked_vm + pgsz; 626 vm = mm->locked_vm + pgsz;
648 if (rlim < vm) 627 if (lim < vm)
649 goto out; 628 goto out;
650 629
651 buffer = kzalloc(size, GFP_KERNEL); 630 mm->total_vm += pgsz;
652 if (!buffer) 631 mm->locked_vm += pgsz;
653 goto out;
654
655 current->mm->total_vm += pgsz;
656 current->mm->locked_vm += pgsz;
657 632
633 error = 0;
658 out: 634 out:
659 up_write(&current->mm->mmap_sem); 635 up_write(&mm->mmap_sem);
660 return buffer; 636 return error;
661} 637}
662 638
663void release_locked_buffer(void *buffer, size_t size) 639void refund_locked_memory(struct mm_struct *mm, size_t size)
664{ 640{
665 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; 641 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
666 642
667 down_write(&current->mm->mmap_sem); 643 down_write(&mm->mmap_sem);
668
669 current->mm->total_vm -= pgsz;
670 current->mm->locked_vm -= pgsz;
671
672 up_write(&current->mm->mmap_sem);
673}
674 644
675void free_locked_buffer(void *buffer, size_t size) 645 mm->total_vm -= pgsz;
676{ 646 mm->locked_vm -= pgsz;
677 release_locked_buffer(buffer, size);
678 647
679 kfree(buffer); 648 up_write(&mm->mmap_sem);
680} 649}
diff --git a/mm/mmap.c b/mm/mmap.c
index 6b7b1a95944b..34579b23ebd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -87,6 +88,9 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
87int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
88struct percpu_counter vm_committed_as; 89struct percpu_counter vm_committed_as;
89 90
91/* amount of vm to protect from userspace access */
92unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
93
90/* 94/*
91 * Check that a process has enough memory to allocate a new virtual 95 * Check that a process has enough memory to allocate a new virtual
92 * mapping. 0 means there is enough memory for the allocation to 96 * mapping. 0 means there is enough memory for the allocation to
@@ -1219,6 +1223,8 @@ munmap_back:
1219 if (correct_wcount) 1223 if (correct_wcount)
1220 atomic_inc(&inode->i_writecount); 1224 atomic_inc(&inode->i_writecount);
1221out: 1225out:
1226 perf_counter_mmap(vma);
1227
1222 mm->total_vm += len >> PAGE_SHIFT; 1228 mm->total_vm += len >> PAGE_SHIFT;
1223 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1229 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1224 if (vm_flags & VM_LOCKED) { 1230 if (vm_flags & VM_LOCKED) {
@@ -2305,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm,
2305 2311
2306 mm->total_vm += len >> PAGE_SHIFT; 2312 mm->total_vm += len >> PAGE_SHIFT;
2307 2313
2314 perf_counter_mmap(vma);
2315
2308 return 0; 2316 return 0;
2309} 2317}
2310 2318
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b76fb4..d80311baeb2d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/pgtable.h> 28#include <asm/pgtable.h>
28#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
299 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
300 if (error) 301 if (error)
301 goto out; 302 goto out;
303 perf_counter_mmap(vma);
302 nstart = tmp; 304 nstart = tmp;
303 305
304 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/nommu.c b/mm/nommu.c
index b571ef707428..2fd2ad5da98e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72/* amount of vm to protect from userspace access */
73unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
74
72atomic_long_t mmap_pages_allocated; 75atomic_long_t mmap_pages_allocated;
73 76
74EXPORT_SYMBOL(mem_map); 77EXPORT_SYMBOL(mem_map);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..175a67a78a99 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
61 62
62 task_lock(p); 63 task_lock(p);
63 mm = p->mm; 64 mm = p->mm;
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
65 task_unlock(p); 66 task_unlock(p);
66 return 0; 67 return 0;
67 } 68 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
68 74
69 /* 75 /*
70 * The memory size of the process is the basis for the badness. 76 * The memory size of the process is the basis for the badness.
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
148 points /= 8; 154 points /= 8;
149 155
150 /* 156 /*
151 * Adjust the score by oomkilladj. 157 * Adjust the score by oom_adj.
152 */ 158 */
153 if (p->oomkilladj) { 159 if (oom_adj) {
154 if (p->oomkilladj > 0) { 160 if (oom_adj > 0) {
155 if (!points) 161 if (!points)
156 points = 1; 162 points = 1;
157 points <<= p->oomkilladj; 163 points <<= oom_adj;
158 } else 164 } else
159 points >>= -(p->oomkilladj); 165 points >>= -(oom_adj);
160 } 166 }
161 167
162#ifdef DEBUG 168#ifdef DEBUG
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 257 *ppoints = ULONG_MAX;
252 } 258 }
253 259
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
257 points = badness(p, uptime.tv_sec); 260 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 261 if (points > *ppoints) {
259 chosen = p; 262 chosen = p;
260 *ppoints = points; 263 *ppoints = points;
261 } 264 }
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 307 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
308 p->comm);
309 task_unlock(p); 311 task_unlock(p);
310 } while_each_thread(g, p); 312 } while_each_thread(g, p);
311} 313}
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
323 return; 325 return;
324 } 326 }
325 327
326 if (!p->mm) { 328 if (!p->mm)
327 WARN_ON(1);
328 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 329 return;
330 }
331 330
332 if (verbose) 331 if (verbose)
333 printk(KERN_ERR "Killed process %d (%s)\n", 332 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p)
349 struct mm_struct *mm; 348 struct mm_struct *mm;
350 struct task_struct *g, *q; 349 struct task_struct *g, *q;
351 350
351 task_lock(p);
352 mm = p->mm; 352 mm = p->mm;
353 353 if (!mm || mm->oom_adj == OOM_DISABLE) {
354 /* WARNING: mm may not be dereferenced since we did not obtain its 354 task_unlock(p);
355 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below.
357 *
358 * Furthermore, even if mm contains a non-NULL value, p->mm may
359 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us.
361 */
362
363 if (mm == NULL)
364 return 1; 355 return 1;
365 356 }
366 /* 357 task_unlock(p);
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 358 __oom_kill_task(p, 1);
375 359
376 /* 360 /*
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
393 struct task_struct *c; 377 struct task_struct *c;
394 378
395 if (printk_ratelimit()) { 379 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj);
399 task_lock(current); 380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
383 current->comm, gfp_mask, order,
384 current->mm ? current->mm->oom_adj : OOM_DISABLE);
400 cpuset_print_task_mems_allowed(current); 385 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 386 task_unlock(current);
402 dump_stack(); 387 dump_stack();
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
409 /* 394 /*
410 * If the task is already exiting, don't alarm the sysadmin or kill 395 * If the task is already exiting, don't alarm the sysadmin or kill
411 * its children or threads, just set TIF_MEMDIE so it can die quickly 396 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
412 */ 398 */
413 if (p->flags & PF_EXITING) { 399 if (p->mm && (p->flags & PF_EXITING)) {
414 __oom_kill_task(p, 0); 400 __oom_kill_task(p, 0);
415 return 0; 401 return 0;
416 } 402 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bb553c3e955d..7b0dcea4935b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
265 * This avoids exceeding the total dirty_limit when the floating averages 265 * This avoids exceeding the total dirty_limit when the floating averages
266 * fluctuate too quickly. 266 * fluctuate too quickly.
267 */ 267 */
268static void 268static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
269clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) 269 unsigned long dirty, unsigned long *pbdi_dirty)
270{ 270{
271 long avail_dirty; 271 unsigned long avail_dirty;
272 272
273 avail_dirty = dirty - 273 avail_dirty = global_page_state(NR_FILE_DIRTY) +
274 (global_page_state(NR_FILE_DIRTY) +
275 global_page_state(NR_WRITEBACK) + 274 global_page_state(NR_WRITEBACK) +
276 global_page_state(NR_UNSTABLE_NFS) + 275 global_page_state(NR_UNSTABLE_NFS) +
277 global_page_state(NR_WRITEBACK_TEMP)); 276 global_page_state(NR_WRITEBACK_TEMP);
278 277
279 if (avail_dirty < 0) 278 if (avail_dirty < dirty)
279 avail_dirty = dirty - avail_dirty;
280 else
280 avail_dirty = 0; 281 avail_dirty = 0;
281 282
282 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + 283 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
299 * 300 *
300 * dirty -= (dirty/8) * p_{t} 301 * dirty -= (dirty/8) * p_{t}
301 */ 302 */
302static void task_dirty_limit(struct task_struct *tsk, long *pdirty) 303static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303{ 304{
304 long numerator, denominator; 305 long numerator, denominator;
305 long dirty = *pdirty; 306 unsigned long dirty = *pdirty;
306 u64 inv = dirty >> 3; 307 u64 inv = dirty >> 3;
307 308
308 task_dirties_fraction(tsk, &numerator, &denominator); 309 task_dirties_fraction(tsk, &numerator, &denominator);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe753ecf2aa5..5d714f8fb303 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -46,6 +47,7 @@
46#include <linux/page-isolation.h> 47#include <linux/page-isolation.h>
47#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h>
49 51
50#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
51#include <asm/div64.h> 53#include <asm/div64.h>
@@ -71,6 +73,7 @@ unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 73unsigned long totalreserve_pages __read_mostly;
72unsigned long highest_memmap_pfn __read_mostly; 74unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
74 77
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 78#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
76int pageblock_order __read_mostly; 79int pageblock_order __read_mostly;
@@ -149,10 +152,6 @@ static unsigned long __meminitdata dma_reserve;
149 static int __meminitdata nr_nodemap_entries; 152 static int __meminitdata nr_nodemap_entries;
150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 153 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 154 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
152#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 static unsigned long __initdata required_kernelcore; 155 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 156 static unsigned long __initdata required_movablecore;
158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 157 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -164,17 +163,25 @@ static unsigned long __meminitdata dma_reserve;
164 163
165#if MAX_NUMNODES > 1 164#if MAX_NUMNODES > 1
166int nr_node_ids __read_mostly = MAX_NUMNODES; 165int nr_node_ids __read_mostly = MAX_NUMNODES;
166int nr_online_nodes __read_mostly = 1;
167EXPORT_SYMBOL(nr_node_ids); 167EXPORT_SYMBOL(nr_node_ids);
168EXPORT_SYMBOL(nr_online_nodes);
168#endif 169#endif
169 170
170int page_group_by_mobility_disabled __read_mostly; 171int page_group_by_mobility_disabled __read_mostly;
171 172
172static void set_pageblock_migratetype(struct page *page, int migratetype) 173static void set_pageblock_migratetype(struct page *page, int migratetype)
173{ 174{
175
176 if (unlikely(page_group_by_mobility_disabled))
177 migratetype = MIGRATE_UNMOVABLE;
178
174 set_pageblock_flags_group(page, (unsigned long)migratetype, 179 set_pageblock_flags_group(page, (unsigned long)migratetype,
175 PB_migrate, PB_migrate_end); 180 PB_migrate, PB_migrate_end);
176} 181}
177 182
183bool oom_killer_disabled __read_mostly;
184
178#ifdef CONFIG_DEBUG_VM 185#ifdef CONFIG_DEBUG_VM
179static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 186static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
180{ 187{
@@ -297,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order)
297 } 304 }
298} 305}
299 306
300#ifdef CONFIG_HUGETLBFS
301void prep_compound_gigantic_page(struct page *page, unsigned long order)
302{
303 int i;
304 int nr_pages = 1 << order;
305 struct page *p = page + 1;
306
307 set_compound_page_dtor(page, free_compound_page);
308 set_compound_order(page, order);
309 __SetPageHead(page);
310 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
311 __SetPageTail(p);
312 p->first_page = page;
313 }
314}
315#endif
316
317static int destroy_compound_page(struct page *page, unsigned long order) 307static int destroy_compound_page(struct page *page, unsigned long order)
318{ 308{
319 int i; 309 int i;
@@ -420,7 +410,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
420 return 0; 410 return 0;
421 411
422 if (PageBuddy(buddy) && page_order(buddy) == order) { 412 if (PageBuddy(buddy) && page_order(buddy) == order) {
423 BUG_ON(page_count(buddy) != 0); 413 VM_BUG_ON(page_count(buddy) != 0);
424 return 1; 414 return 1;
425 } 415 }
426 return 0; 416 return 0;
@@ -451,22 +441,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
451 */ 441 */
452 442
453static inline void __free_one_page(struct page *page, 443static inline void __free_one_page(struct page *page,
454 struct zone *zone, unsigned int order) 444 struct zone *zone, unsigned int order,
445 int migratetype)
455{ 446{
456 unsigned long page_idx; 447 unsigned long page_idx;
457 int order_size = 1 << order;
458 int migratetype = get_pageblock_migratetype(page);
459 448
460 if (unlikely(PageCompound(page))) 449 if (unlikely(PageCompound(page)))
461 if (unlikely(destroy_compound_page(page, order))) 450 if (unlikely(destroy_compound_page(page, order)))
462 return; 451 return;
463 452
453 VM_BUG_ON(migratetype == -1);
454
464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 455 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
465 456
466 VM_BUG_ON(page_idx & (order_size - 1)); 457 VM_BUG_ON(page_idx & ((1 << order) - 1));
467 VM_BUG_ON(bad_range(zone, page)); 458 VM_BUG_ON(bad_range(zone, page));
468 459
469 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
470 while (order < MAX_ORDER-1) { 460 while (order < MAX_ORDER-1) {
471 unsigned long combined_idx; 461 unsigned long combined_idx;
472 struct page *buddy; 462 struct page *buddy;
@@ -490,12 +480,26 @@ static inline void __free_one_page(struct page *page,
490 zone->free_area[order].nr_free++; 480 zone->free_area[order].nr_free++;
491} 481}
492 482
483#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
484/*
485 * free_page_mlock() -- clean up attempts to free and mlocked() page.
486 * Page should not be on lru, so no need to fix that up.
487 * free_pages_check() will verify...
488 */
489static inline void free_page_mlock(struct page *page)
490{
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
493static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
494{ 499{
495 free_page_mlock(page);
496 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
497 (page->mapping != NULL) | 501 (page->mapping != NULL) |
498 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
500 bad_page(page); 504 bad_page(page);
501 return 1; 505 return 1;
@@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
522 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
523 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
524 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
525 while (count--) { 531 while (count--) {
526 struct page *page; 532 struct page *page;
527 533
@@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
529 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
530 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
531 list_del(&page->lru); 537 list_del(&page->lru);
532 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
533 } 539 }
534 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
535} 541}
536 542
537static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
538{ 545{
539 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
540 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
541 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
542 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
543 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
544} 553}
545 554
@@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
548 unsigned long flags; 557 unsigned long flags;
549 int i; 558 int i;
550 int bad = 0; 559 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page);
561
562 kmemcheck_free_shadow(page, order);
551 563
552 for (i = 0 ; i < (1 << order) ; ++i) 564 for (i = 0 ; i < (1 << order) ; ++i)
553 bad += free_pages_check(page + i); 565 bad += free_pages_check(page + i);
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
563 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
564 576
565 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(wasMlocked))
579 free_page_mlock(page);
566 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
567 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
568 local_irq_restore(flags); 583 local_irq_restore(flags);
569} 584}
570 585
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
635{ 650{
636 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 652 (page->mapping != NULL) |
638 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
640 bad_page(page); 655 bad_page(page);
641 return 1; 656 return 1;
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
660 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
661 * the smallest available page from the freelists 676 * the smallest available page from the freelists
662 */ 677 */
663static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
664 int migratetype) 680 int migratetype)
665{ 681{
666 unsigned int current_order; 682 unsigned int current_order;
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
678 list_del(&page->lru); 694 list_del(&page->lru);
679 rmv_page_order(page); 695 rmv_page_order(page);
680 area->nr_free--; 696 area->nr_free--;
681 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
682 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
683 return page; 698 return page;
684 } 699 }
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
769} 784}
770 785
771/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
772static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
773 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
774{ 789{
775 struct free_area * area; 790 struct free_area * area;
776 int current_order; 791 int current_order;
@@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
818 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
819 list_del(&page->lru); 834 list_del(&page->lru);
820 rmv_page_order(page); 835 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823 836
824 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
825 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
830 } 843 }
831 } 844 }
832 845
833 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835} 847}
836 848
837/* 849/*
@@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
843{ 855{
844 struct page *page; 856 struct page *page;
845 857
858retry_reserve:
846 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
847 860
848 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
849 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
850 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
851 return page; 875 return page;
852} 876}
853 877
@@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
881 set_page_private(page, migratetype); 905 set_page_private(page, migratetype);
882 list = &page->lru; 906 list = &page->lru;
883 } 907 }
908 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
884 spin_unlock(&zone->lock); 909 spin_unlock(&zone->lock);
885 return i; 910 return i;
886} 911}
@@ -996,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold)
996 struct zone *zone = page_zone(page); 1021 struct zone *zone = page_zone(page);
997 struct per_cpu_pages *pcp; 1022 struct per_cpu_pages *pcp;
998 unsigned long flags; 1023 unsigned long flags;
1024 int wasMlocked = TestClearPageMlocked(page);
1025
1026 kmemcheck_free_shadow(page, 0);
999 1027
1000 if (PageAnon(page)) 1028 if (PageAnon(page))
1001 page->mapping = NULL; 1029 page->mapping = NULL;
@@ -1010,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1010 kernel_map_pages(page, 1, 0); 1038 kernel_map_pages(page, 1, 0);
1011 1039
1012 pcp = &zone_pcp(zone, get_cpu())->pcp; 1040 pcp = &zone_pcp(zone, get_cpu())->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page));
1013 local_irq_save(flags); 1042 local_irq_save(flags);
1043 if (unlikely(wasMlocked))
1044 free_page_mlock(page);
1014 __count_vm_event(PGFREE); 1045 __count_vm_event(PGFREE);
1046
1015 if (cold) 1047 if (cold)
1016 list_add_tail(&page->lru, &pcp->list); 1048 list_add_tail(&page->lru, &pcp->list);
1017 else 1049 else
1018 list_add(&page->lru, &pcp->list); 1050 list_add(&page->lru, &pcp->list);
1019 set_page_private(page, get_pageblock_migratetype(page));
1020 pcp->count++; 1051 pcp->count++;
1021 if (pcp->count >= pcp->high) { 1052 if (pcp->count >= pcp->high) {
1022 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1050,6 +1081,16 @@ void split_page(struct page *page, unsigned int order)
1050 1081
1051 VM_BUG_ON(PageCompound(page)); 1082 VM_BUG_ON(PageCompound(page));
1052 VM_BUG_ON(!page_count(page)); 1083 VM_BUG_ON(!page_count(page));
1084
1085#ifdef CONFIG_KMEMCHECK
1086 /*
1087 * Split shadow pages too, because free(page[0]) would
1088 * otherwise free the whole shadow.
1089 */
1090 if (kmemcheck_page_is_tracked(page))
1091 split_page(virt_to_page(page[0].shadow), order);
1092#endif
1093
1053 for (i = 1; i < (1 << order); i++) 1094 for (i = 1; i < (1 << order); i++)
1054 set_page_refcounted(page + i); 1095 set_page_refcounted(page + i);
1055} 1096}
@@ -1059,14 +1100,15 @@ void split_page(struct page *page, unsigned int order)
1059 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1100 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1060 * or two. 1101 * or two.
1061 */ 1102 */
1062static struct page *buffered_rmqueue(struct zone *preferred_zone, 1103static inline
1063 struct zone *zone, int order, gfp_t gfp_flags) 1104struct page *buffered_rmqueue(struct zone *preferred_zone,
1105 struct zone *zone, int order, gfp_t gfp_flags,
1106 int migratetype)
1064{ 1107{
1065 unsigned long flags; 1108 unsigned long flags;
1066 struct page *page; 1109 struct page *page;
1067 int cold = !!(gfp_flags & __GFP_COLD); 1110 int cold = !!(gfp_flags & __GFP_COLD);
1068 int cpu; 1111 int cpu;
1069 int migratetype = allocflags_to_migratetype(gfp_flags);
1070 1112
1071again: 1113again:
1072 cpu = get_cpu(); 1114 cpu = get_cpu();
@@ -1103,8 +1145,22 @@ again:
1103 list_del(&page->lru); 1145 list_del(&page->lru);
1104 pcp->count--; 1146 pcp->count--;
1105 } else { 1147 } else {
1148 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1149 /*
1150 * __GFP_NOFAIL is not to be used in new code.
1151 *
1152 * All __GFP_NOFAIL callers should be fixed so that they
1153 * properly detect and handle allocation failures.
1154 *
1155 * We most definitely don't want callers attempting to
1156 * allocate greater than order-1 page units with
1157 * __GFP_NOFAIL.
1158 */
1159 WARN_ON_ONCE(order > 1);
1160 }
1106 spin_lock_irqsave(&zone->lock, flags); 1161 spin_lock_irqsave(&zone->lock, flags);
1107 page = __rmqueue(zone, order, migratetype); 1162 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1108 spin_unlock(&zone->lock); 1164 spin_unlock(&zone->lock);
1109 if (!page) 1165 if (!page)
1110 goto failed; 1166 goto failed;
@@ -1126,10 +1182,15 @@ failed:
1126 return NULL; 1182 return NULL;
1127} 1183}
1128 1184
1129#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1185/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1130#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1186#define ALLOC_WMARK_MIN WMARK_MIN
1131#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1187#define ALLOC_WMARK_LOW WMARK_LOW
1132#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1188#define ALLOC_WMARK_HIGH WMARK_HIGH
1189#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1190
1191/* Mask to get the watermark bits */
1192#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1193
1133#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1194#define ALLOC_HARDER 0x10 /* try to alloc harder */
1134#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1195#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1135#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1196#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -1387,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1387 */ 1448 */
1388static struct page * 1449static struct page *
1389get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1450get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1390 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1451 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1452 struct zone *preferred_zone, int migratetype)
1391{ 1453{
1392 struct zoneref *z; 1454 struct zoneref *z;
1393 struct page *page = NULL; 1455 struct page *page = NULL;
1394 int classzone_idx; 1456 int classzone_idx;
1395 struct zone *zone, *preferred_zone; 1457 struct zone *zone;
1396 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1458 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1397 int zlc_active = 0; /* set if using zonelist_cache */ 1459 int zlc_active = 0; /* set if using zonelist_cache */
1398 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1460 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1399 1461
1400 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1401 &preferred_zone);
1402 if (!preferred_zone)
1403 return NULL;
1404
1405 classzone_idx = zone_idx(preferred_zone); 1462 classzone_idx = zone_idx(preferred_zone);
1406
1407zonelist_scan: 1463zonelist_scan:
1408 /* 1464 /*
1409 * Scan zonelist, looking for a zone with enough free. 1465 * Scan zonelist, looking for a zone with enough free.
@@ -1418,31 +1474,49 @@ zonelist_scan:
1418 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1474 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1419 goto try_next_zone; 1475 goto try_next_zone;
1420 1476
1477 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1421 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1478 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1422 unsigned long mark; 1479 unsigned long mark;
1423 if (alloc_flags & ALLOC_WMARK_MIN) 1480 int ret;
1424 mark = zone->pages_min; 1481
1425 else if (alloc_flags & ALLOC_WMARK_LOW) 1482 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1426 mark = zone->pages_low; 1483 if (zone_watermark_ok(zone, order, mark,
1427 else 1484 classzone_idx, alloc_flags))
1428 mark = zone->pages_high; 1485 goto try_this_zone;
1429 if (!zone_watermark_ok(zone, order, mark, 1486
1430 classzone_idx, alloc_flags)) { 1487 if (zone_reclaim_mode == 0)
1431 if (!zone_reclaim_mode || 1488 goto this_zone_full;
1432 !zone_reclaim(zone, gfp_mask, order)) 1489
1490 ret = zone_reclaim(zone, gfp_mask, order);
1491 switch (ret) {
1492 case ZONE_RECLAIM_NOSCAN:
1493 /* did not scan */
1494 goto try_next_zone;
1495 case ZONE_RECLAIM_FULL:
1496 /* scanned but unreclaimable */
1497 goto this_zone_full;
1498 default:
1499 /* did we reclaim enough */
1500 if (!zone_watermark_ok(zone, order, mark,
1501 classzone_idx, alloc_flags))
1433 goto this_zone_full; 1502 goto this_zone_full;
1434 } 1503 }
1435 } 1504 }
1436 1505
1437 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1506try_this_zone:
1507 page = buffered_rmqueue(preferred_zone, zone, order,
1508 gfp_mask, migratetype);
1438 if (page) 1509 if (page)
1439 break; 1510 break;
1440this_zone_full: 1511this_zone_full:
1441 if (NUMA_BUILD) 1512 if (NUMA_BUILD)
1442 zlc_mark_zone_full(zonelist, z); 1513 zlc_mark_zone_full(zonelist, z);
1443try_next_zone: 1514try_next_zone:
1444 if (NUMA_BUILD && !did_zlc_setup) { 1515 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1445 /* we do zlc_setup after the first zone is tried */ 1516 /*
1517 * we do zlc_setup after the first zone is tried but only
1518 * if there are multiple nodes make it worthwhile
1519 */
1446 allowednodes = zlc_setup(zonelist, alloc_flags); 1520 allowednodes = zlc_setup(zonelist, alloc_flags);
1447 zlc_active = 1; 1521 zlc_active = 1;
1448 did_zlc_setup = 1; 1522 did_zlc_setup = 1;
@@ -1457,47 +1531,217 @@ try_next_zone:
1457 return page; 1531 return page;
1458} 1532}
1459 1533
1534static inline int
1535should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1536 unsigned long pages_reclaimed)
1537{
1538 /* Do not loop if specifically requested */
1539 if (gfp_mask & __GFP_NORETRY)
1540 return 0;
1541
1542 /*
1543 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1544 * means __GFP_NOFAIL, but that may not be true in other
1545 * implementations.
1546 */
1547 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1548 return 1;
1549
1550 /*
1551 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1552 * specified, then we retry until we no longer reclaim any pages
1553 * (above), or we've reclaimed an order of pages at least as
1554 * large as the allocation's order. In both cases, if the
1555 * allocation still fails, we stop retrying.
1556 */
1557 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1558 return 1;
1559
1560 /*
1561 * Don't let big-order allocations loop unless the caller
1562 * explicitly requests that.
1563 */
1564 if (gfp_mask & __GFP_NOFAIL)
1565 return 1;
1566
1567 return 0;
1568}
1569
1570static inline struct page *
1571__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, struct zone *preferred_zone,
1574 int migratetype)
1575{
1576 struct page *page;
1577
1578 /* Acquire the OOM killer lock for the zones in zonelist */
1579 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1580 schedule_timeout_uninterruptible(1);
1581 return NULL;
1582 }
1583
1584 /*
1585 * Go through the zonelist yet one more time, keep very high watermark
1586 * here, this is only to catch a parallel oom killing, we must fail if
1587 * we're still under heavy pressure.
1588 */
1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1590 order, zonelist, high_zoneidx,
1591 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1592 preferred_zone, migratetype);
1593 if (page)
1594 goto out;
1595
1596 /* The OOM killer will not help higher order allocs */
1597 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1598 goto out;
1599
1600 /* Exhausted what can be done so it's blamo time */
1601 out_of_memory(zonelist, gfp_mask, order);
1602
1603out:
1604 clear_zonelist_oom(zonelist, gfp_mask);
1605 return page;
1606}
1607
1608/* The really slow allocator path where we enter direct reclaim */
1609static inline struct page *
1610__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1611 struct zonelist *zonelist, enum zone_type high_zoneidx,
1612 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1613 int migratetype, unsigned long *did_some_progress)
1614{
1615 struct page *page = NULL;
1616 struct reclaim_state reclaim_state;
1617 struct task_struct *p = current;
1618
1619 cond_resched();
1620
1621 /* We now go into synchronous reclaim */
1622 cpuset_memory_pressure_bump();
1623
1624 /*
1625 * The task's cpuset might have expanded its set of allowable nodes
1626 */
1627 p->flags |= PF_MEMALLOC;
1628 lockdep_set_current_reclaim_state(gfp_mask);
1629 reclaim_state.reclaimed_slab = 0;
1630 p->reclaim_state = &reclaim_state;
1631
1632 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1633
1634 p->reclaim_state = NULL;
1635 lockdep_clear_current_reclaim_state();
1636 p->flags &= ~PF_MEMALLOC;
1637
1638 cond_resched();
1639
1640 if (order != 0)
1641 drain_all_pages();
1642
1643 if (likely(*did_some_progress))
1644 page = get_page_from_freelist(gfp_mask, nodemask, order,
1645 zonelist, high_zoneidx,
1646 alloc_flags, preferred_zone,
1647 migratetype);
1648 return page;
1649}
1650
1460/* 1651/*
1461 * This is the 'heart' of the zoned buddy allocator. 1652 * This is called in the allocator slow-path if the allocation request is of
1653 * sufficient urgency to ignore watermarks and take other desperate measures
1462 */ 1654 */
1463struct page * 1655static inline struct page *
1464__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1656__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1465 struct zonelist *zonelist, nodemask_t *nodemask) 1657 struct zonelist *zonelist, enum zone_type high_zoneidx,
1658 nodemask_t *nodemask, struct zone *preferred_zone,
1659 int migratetype)
1660{
1661 struct page *page;
1662
1663 do {
1664 page = get_page_from_freelist(gfp_mask, nodemask, order,
1665 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1666 preferred_zone, migratetype);
1667
1668 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671
1672 return page;
1673}
1674
1675static inline
1676void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1677 enum zone_type high_zoneidx)
1466{ 1678{
1467 const gfp_t wait = gfp_mask & __GFP_WAIT;
1468 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1469 struct zoneref *z; 1679 struct zoneref *z;
1470 struct zone *zone; 1680 struct zone *zone;
1471 struct page *page;
1472 struct reclaim_state reclaim_state;
1473 struct task_struct *p = current;
1474 int do_retry;
1475 int alloc_flags;
1476 unsigned long did_some_progress;
1477 unsigned long pages_reclaimed = 0;
1478 1681
1479 lockdep_trace_alloc(gfp_mask); 1682 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1683 wakeup_kswapd(zone, order);
1684}
1480 1685
1481 might_sleep_if(wait); 1686static inline int
1687gfp_to_alloc_flags(gfp_t gfp_mask)
1688{
1689 struct task_struct *p = current;
1690 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1691 const gfp_t wait = gfp_mask & __GFP_WAIT;
1482 1692
1483 if (should_fail_alloc_page(gfp_mask, order)) 1693 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1484 return NULL; 1694 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1485 1695
1486restart: 1696 /*
1487 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1697 * The caller may dip into page reserves a bit more if the caller
1698 * cannot run direct reclaim, or if the caller has realtime scheduling
1699 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1700 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1701 */
1702 alloc_flags |= (gfp_mask & __GFP_HIGH);
1488 1703
1489 if (unlikely(!z->zone)) { 1704 if (!wait) {
1705 alloc_flags |= ALLOC_HARDER;
1490 /* 1706 /*
1491 * Happens if we have an empty zonelist as a result of 1707 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1492 * GFP_THISNODE being used on a memoryless node 1708 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1493 */ 1709 */
1494 return NULL; 1710 alloc_flags &= ~ALLOC_CPUSET;
1711 } else if (unlikely(rt_task(p)))
1712 alloc_flags |= ALLOC_HARDER;
1713
1714 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1715 if (!in_interrupt() &&
1716 ((p->flags & PF_MEMALLOC) ||
1717 unlikely(test_thread_flag(TIF_MEMDIE))))
1718 alloc_flags |= ALLOC_NO_WATERMARKS;
1495 } 1719 }
1496 1720
1497 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1721 return alloc_flags;
1498 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1722}
1499 if (page) 1723
1500 goto got_pg; 1724static inline struct page *
1725__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1726 struct zonelist *zonelist, enum zone_type high_zoneidx,
1727 nodemask_t *nodemask, struct zone *preferred_zone,
1728 int migratetype)
1729{
1730 const gfp_t wait = gfp_mask & __GFP_WAIT;
1731 struct page *page = NULL;
1732 int alloc_flags;
1733 unsigned long pages_reclaimed = 0;
1734 unsigned long did_some_progress;
1735 struct task_struct *p = current;
1736
1737 /*
1738 * In the slowpath, we sanity check order to avoid ever trying to
1739 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1740 * be using allocators in order of preference for an area that is
1741 * too large.
1742 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER))
1744 return NULL;
1501 1745
1502 /* 1746 /*
1503 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,154 +1754,83 @@ restart:
1510 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1754 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1511 goto nopage; 1755 goto nopage;
1512 1756
1513 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1757 wake_all_kswapd(order, zonelist, high_zoneidx);
1514 wakeup_kswapd(zone, order);
1515 1758
1516 /* 1759 /*
1517 * OK, we're below the kswapd watermark and have kicked background 1760 * OK, we're below the kswapd watermark and have kicked background
1518 * reclaim. Now things get more complex, so set up alloc_flags according 1761 * reclaim. Now things get more complex, so set up alloc_flags according
1519 * to how we want to proceed. 1762 * to how we want to proceed.
1520 *
1521 * The caller may dip into page reserves a bit more if the caller
1522 * cannot run direct reclaim, or if the caller has realtime scheduling
1523 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1524 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1525 */ 1763 */
1526 alloc_flags = ALLOC_WMARK_MIN; 1764 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1527 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1528 alloc_flags |= ALLOC_HARDER;
1529 if (gfp_mask & __GFP_HIGH)
1530 alloc_flags |= ALLOC_HIGH;
1531 if (wait)
1532 alloc_flags |= ALLOC_CPUSET;
1533 1765
1534 /* 1766restart:
1535 * Go through the zonelist again. Let __GFP_HIGH and allocations 1767 /* This is the last chance, in general, before the goto nopage. */
1536 * coming from realtime tasks go deeper into reserves.
1537 *
1538 * This is the last chance, in general, before the goto nopage.
1539 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1540 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1541 */
1542 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1768 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1543 high_zoneidx, alloc_flags); 1769 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1770 preferred_zone, migratetype);
1544 if (page) 1771 if (page)
1545 goto got_pg; 1772 goto got_pg;
1546 1773
1547 /* This allocation should allow future memory freeing. */
1548
1549rebalance: 1774rebalance:
1550 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1775 /* Allocate without watermarks if the context allows */
1551 && !in_interrupt()) { 1776 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1552 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1777 page = __alloc_pages_high_priority(gfp_mask, order,
1553nofail_alloc: 1778 zonelist, high_zoneidx, nodemask,
1554 /* go through the zonelist yet again, ignoring mins */ 1779 preferred_zone, migratetype);
1555 page = get_page_from_freelist(gfp_mask, nodemask, order, 1780 if (page)
1556 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1781 goto got_pg;
1557 if (page)
1558 goto got_pg;
1559 if (gfp_mask & __GFP_NOFAIL) {
1560 congestion_wait(WRITE, HZ/50);
1561 goto nofail_alloc;
1562 }
1563 }
1564 goto nopage;
1565 } 1782 }
1566 1783
1567 /* Atomic allocations - we can't balance anything */ 1784 /* Atomic allocations - we can't balance anything */
1568 if (!wait) 1785 if (!wait)
1569 goto nopage; 1786 goto nopage;
1570 1787
1571 cond_resched(); 1788 /* Avoid recursion of direct reclaim */
1789 if (p->flags & PF_MEMALLOC)
1790 goto nopage;
1791
1792 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx,
1795 nodemask,
1796 alloc_flags, preferred_zone,
1797 migratetype, &did_some_progress);
1798 if (page)
1799 goto got_pg;
1572 1800
1573 /* We now go into synchronous reclaim */
1574 cpuset_memory_pressure_bump();
1575 /* 1801 /*
1576 * The task's cpuset might have expanded its set of allowable nodes 1802 * If we failed to make any progress reclaiming, then we are
1803 * running out of options and have to consider going OOM
1577 */ 1804 */
1578 cpuset_update_task_memory_state(); 1805 if (!did_some_progress) {
1579 p->flags |= PF_MEMALLOC; 1806 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1580 1807 if (oom_killer_disabled)
1581 lockdep_set_current_reclaim_state(gfp_mask); 1808 goto nopage;
1582 reclaim_state.reclaimed_slab = 0; 1809 page = __alloc_pages_may_oom(gfp_mask, order,
1583 p->reclaim_state = &reclaim_state; 1810 zonelist, high_zoneidx,
1584 1811 nodemask, preferred_zone,
1585 did_some_progress = try_to_free_pages(zonelist, order, 1812 migratetype);
1586 gfp_mask, nodemask); 1813 if (page)
1587 1814 goto got_pg;
1588 p->reclaim_state = NULL;
1589 lockdep_clear_current_reclaim_state();
1590 p->flags &= ~PF_MEMALLOC;
1591
1592 cond_resched();
1593 1815
1594 if (order != 0) 1816 /*
1595 drain_all_pages(); 1817 * The OOM killer does not trigger for high-order
1818 * ~__GFP_NOFAIL allocations so if no progress is being
1819 * made, there are no other options and retrying is
1820 * unlikely to help.
1821 */
1822 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1823 !(gfp_mask & __GFP_NOFAIL))
1824 goto nopage;
1596 1825
1597 if (likely(did_some_progress)) {
1598 page = get_page_from_freelist(gfp_mask, nodemask, order,
1599 zonelist, high_zoneidx, alloc_flags);
1600 if (page)
1601 goto got_pg;
1602 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1603 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1604 schedule_timeout_uninterruptible(1);
1605 goto restart; 1826 goto restart;
1606 } 1827 }
1607
1608 /*
1609 * Go through the zonelist yet one more time, keep
1610 * very high watermark here, this is only to catch
1611 * a parallel oom killing, we must fail if we're still
1612 * under heavy pressure.
1613 */
1614 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1615 order, zonelist, high_zoneidx,
1616 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1617 if (page) {
1618 clear_zonelist_oom(zonelist, gfp_mask);
1619 goto got_pg;
1620 }
1621
1622 /* The OOM killer will not help higher order allocs so fail */
1623 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1624 clear_zonelist_oom(zonelist, gfp_mask);
1625 goto nopage;
1626 }
1627
1628 out_of_memory(zonelist, gfp_mask, order);
1629 clear_zonelist_oom(zonelist, gfp_mask);
1630 goto restart;
1631 } 1828 }
1632 1829
1633 /* 1830 /* Check if we should retry the allocation */
1634 * Don't let big-order allocations loop unless the caller explicitly
1635 * requests that. Wait for some write requests to complete then retry.
1636 *
1637 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1638 * means __GFP_NOFAIL, but that may not be true in other
1639 * implementations.
1640 *
1641 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1642 * specified, then we retry until we no longer reclaim any pages
1643 * (above), or we've reclaimed an order of pages at least as
1644 * large as the allocation's order. In both cases, if the
1645 * allocation still fails, we stop retrying.
1646 */
1647 pages_reclaimed += did_some_progress; 1831 pages_reclaimed += did_some_progress;
1648 do_retry = 0; 1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1649 if (!(gfp_mask & __GFP_NORETRY)) { 1833 /* Wait for some write requests to complete then retry */
1650 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1651 do_retry = 1;
1652 } else {
1653 if (gfp_mask & __GFP_REPEAT &&
1654 pages_reclaimed < (1 << order))
1655 do_retry = 1;
1656 }
1657 if (gfp_mask & __GFP_NOFAIL)
1658 do_retry = 1;
1659 }
1660 if (do_retry) {
1661 congestion_wait(WRITE, HZ/50); 1834 congestion_wait(WRITE, HZ/50);
1662 goto rebalance; 1835 goto rebalance;
1663 } 1836 }
@@ -1670,10 +1843,60 @@ nopage:
1670 dump_stack(); 1843 dump_stack();
1671 show_mem(); 1844 show_mem();
1672 } 1845 }
1846 return page;
1673got_pg: 1847got_pg:
1848 if (kmemcheck_enabled)
1849 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1850 return page;
1851
1852}
1853
1854/*
1855 * This is the 'heart' of the zoned buddy allocator.
1856 */
1857struct page *
1858__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1859 struct zonelist *zonelist, nodemask_t *nodemask)
1860{
1861 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1862 struct zone *preferred_zone;
1863 struct page *page;
1864 int migratetype = allocflags_to_migratetype(gfp_mask);
1865
1866 gfp_mask &= gfp_allowed_mask;
1867
1868 lockdep_trace_alloc(gfp_mask);
1869
1870 might_sleep_if(gfp_mask & __GFP_WAIT);
1871
1872 if (should_fail_alloc_page(gfp_mask, order))
1873 return NULL;
1874
1875 /*
1876 * Check the zones suitable for the gfp_mask contain at least one
1877 * valid zone. It's possible to have an empty zonelist as a result
1878 * of GFP_THISNODE and a memoryless node
1879 */
1880 if (unlikely(!zonelist->_zonerefs->zone))
1881 return NULL;
1882
1883 /* The preferred zone is used for statistics later */
1884 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1885 if (!preferred_zone)
1886 return NULL;
1887
1888 /* First allocation attempt */
1889 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1890 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1891 preferred_zone, migratetype);
1892 if (unlikely(!page))
1893 page = __alloc_pages_slowpath(gfp_mask, order,
1894 zonelist, high_zoneidx, nodemask,
1895 preferred_zone, migratetype);
1896
1674 return page; 1897 return page;
1675} 1898}
1676EXPORT_SYMBOL(__alloc_pages_internal); 1899EXPORT_SYMBOL(__alloc_pages_nodemask);
1677 1900
1678/* 1901/*
1679 * Common helper functions. 1902 * Common helper functions.
@@ -1802,7 +2025,7 @@ static unsigned int nr_free_zone_pages(int offset)
1802 2025
1803 for_each_zone_zonelist(zone, z, zonelist, offset) { 2026 for_each_zone_zonelist(zone, z, zonelist, offset) {
1804 unsigned long size = zone->present_pages; 2027 unsigned long size = zone->present_pages;
1805 unsigned long high = zone->pages_high; 2028 unsigned long high = high_wmark_pages(zone);
1806 if (size > high) 2029 if (size > high)
1807 sum += size - high; 2030 sum += size - high;
1808 } 2031 }
@@ -1894,19 +2117,14 @@ void show_free_areas(void)
1894 2117
1895 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2118 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1896 " inactive_file:%lu" 2119 " inactive_file:%lu"
1897//TODO: check/adjust line lengths
1898#ifdef CONFIG_UNEVICTABLE_LRU
1899 " unevictable:%lu" 2120 " unevictable:%lu"
1900#endif
1901 " dirty:%lu writeback:%lu unstable:%lu\n" 2121 " dirty:%lu writeback:%lu unstable:%lu\n"
1902 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2122 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1903 global_page_state(NR_ACTIVE_ANON), 2123 global_page_state(NR_ACTIVE_ANON),
1904 global_page_state(NR_ACTIVE_FILE), 2124 global_page_state(NR_ACTIVE_FILE),
1905 global_page_state(NR_INACTIVE_ANON), 2125 global_page_state(NR_INACTIVE_ANON),
1906 global_page_state(NR_INACTIVE_FILE), 2126 global_page_state(NR_INACTIVE_FILE),
1907#ifdef CONFIG_UNEVICTABLE_LRU
1908 global_page_state(NR_UNEVICTABLE), 2127 global_page_state(NR_UNEVICTABLE),
1909#endif
1910 global_page_state(NR_FILE_DIRTY), 2128 global_page_state(NR_FILE_DIRTY),
1911 global_page_state(NR_WRITEBACK), 2129 global_page_state(NR_WRITEBACK),
1912 global_page_state(NR_UNSTABLE_NFS), 2130 global_page_state(NR_UNSTABLE_NFS),
@@ -1930,25 +2148,21 @@ void show_free_areas(void)
1930 " inactive_anon:%lukB" 2148 " inactive_anon:%lukB"
1931 " active_file:%lukB" 2149 " active_file:%lukB"
1932 " inactive_file:%lukB" 2150 " inactive_file:%lukB"
1933#ifdef CONFIG_UNEVICTABLE_LRU
1934 " unevictable:%lukB" 2151 " unevictable:%lukB"
1935#endif
1936 " present:%lukB" 2152 " present:%lukB"
1937 " pages_scanned:%lu" 2153 " pages_scanned:%lu"
1938 " all_unreclaimable? %s" 2154 " all_unreclaimable? %s"
1939 "\n", 2155 "\n",
1940 zone->name, 2156 zone->name,
1941 K(zone_page_state(zone, NR_FREE_PAGES)), 2157 K(zone_page_state(zone, NR_FREE_PAGES)),
1942 K(zone->pages_min), 2158 K(min_wmark_pages(zone)),
1943 K(zone->pages_low), 2159 K(low_wmark_pages(zone)),
1944 K(zone->pages_high), 2160 K(high_wmark_pages(zone)),
1945 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2161 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1946 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2162 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1947 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2163 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1948 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2164 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1949#ifdef CONFIG_UNEVICTABLE_LRU
1950 K(zone_page_state(zone, NR_UNEVICTABLE)), 2165 K(zone_page_state(zone, NR_UNEVICTABLE)),
1951#endif
1952 K(zone->present_pages), 2166 K(zone->present_pages),
1953 zone->pages_scanned, 2167 zone->pages_scanned,
1954 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2168 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2106,7 +2320,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2106} 2320}
2107 2321
2108 2322
2109#define MAX_NODE_LOAD (num_online_nodes()) 2323#define MAX_NODE_LOAD (nr_online_nodes)
2110static int node_load[MAX_NUMNODES]; 2324static int node_load[MAX_NUMNODES];
2111 2325
2112/** 2326/**
@@ -2315,7 +2529,7 @@ static void build_zonelists(pg_data_t *pgdat)
2315 2529
2316 /* NUMA-aware ordering of nodes */ 2530 /* NUMA-aware ordering of nodes */
2317 local_node = pgdat->node_id; 2531 local_node = pgdat->node_id;
2318 load = num_online_nodes(); 2532 load = nr_online_nodes;
2319 prev_node = local_node; 2533 prev_node = local_node;
2320 nodes_clear(used_mask); 2534 nodes_clear(used_mask);
2321 2535
@@ -2466,7 +2680,7 @@ void build_all_zonelists(void)
2466 2680
2467 printk("Built %i zonelists in %s order, mobility grouping %s. " 2681 printk("Built %i zonelists in %s order, mobility grouping %s. "
2468 "Total pages: %ld\n", 2682 "Total pages: %ld\n",
2469 num_online_nodes(), 2683 nr_online_nodes,
2470 zonelist_order_name[current_zonelist_order], 2684 zonelist_order_name[current_zonelist_order],
2471 page_group_by_mobility_disabled ? "off" : "on", 2685 page_group_by_mobility_disabled ? "off" : "on",
2472 vm_total_pages); 2686 vm_total_pages);
@@ -2545,8 +2759,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2545 2759
2546/* 2760/*
2547 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2761 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2548 * of blocks reserved is based on zone->pages_min. The memory within the 2762 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2549 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2763 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2550 * higher will lead to a bigger reserve which will get freed as contiguous 2764 * higher will lead to a bigger reserve which will get freed as contiguous
2551 * blocks as reclaim kicks in 2765 * blocks as reclaim kicks in
2552 */ 2766 */
@@ -2559,7 +2773,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2559 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2773 /* Get the start pfn, end pfn and the number of blocks to reserve */
2560 start_pfn = zone->zone_start_pfn; 2774 start_pfn = zone->zone_start_pfn;
2561 end_pfn = start_pfn + zone->spanned_pages; 2775 end_pfn = start_pfn + zone->spanned_pages;
2562 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2776 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2563 pageblock_order; 2777 pageblock_order;
2564 2778
2565 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2779 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -2812,7 +3026,7 @@ bad:
2812 if (dzone == zone) 3026 if (dzone == zone)
2813 break; 3027 break;
2814 kfree(zone_pcp(dzone, cpu)); 3028 kfree(zone_pcp(dzone, cpu));
2815 zone_pcp(dzone, cpu) = NULL; 3029 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
2816 } 3030 }
2817 return -ENOMEM; 3031 return -ENOMEM;
2818} 3032}
@@ -2827,7 +3041,7 @@ static inline void free_zone_pagesets(int cpu)
2827 /* Free per_cpu_pageset if it is slab allocated */ 3041 /* Free per_cpu_pageset if it is slab allocated */
2828 if (pset != &boot_pageset[cpu]) 3042 if (pset != &boot_pageset[cpu])
2829 kfree(pset); 3043 kfree(pset);
2830 zone_pcp(zone, cpu) = NULL; 3044 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2831 } 3045 }
2832} 3046}
2833 3047
@@ -3103,64 +3317,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
3103} 3317}
3104 3318
3105/** 3319/**
3106 * push_node_boundaries - Push node boundaries to at least the requested boundary
3107 * @nid: The nid of the node to push the boundary for
3108 * @start_pfn: The start pfn of the node
3109 * @end_pfn: The end pfn of the node
3110 *
3111 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
3112 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
3113 * be hotplugged even though no physical memory exists. This function allows
3114 * an arch to push out the node boundaries so mem_map is allocated that can
3115 * be used later.
3116 */
3117#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3118void __init push_node_boundaries(unsigned int nid,
3119 unsigned long start_pfn, unsigned long end_pfn)
3120{
3121 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3122 "Entering push_node_boundaries(%u, %lu, %lu)\n",
3123 nid, start_pfn, end_pfn);
3124
3125 /* Initialise the boundary for this node if necessary */
3126 if (node_boundary_end_pfn[nid] == 0)
3127 node_boundary_start_pfn[nid] = -1UL;
3128
3129 /* Update the boundaries */
3130 if (node_boundary_start_pfn[nid] > start_pfn)
3131 node_boundary_start_pfn[nid] = start_pfn;
3132 if (node_boundary_end_pfn[nid] < end_pfn)
3133 node_boundary_end_pfn[nid] = end_pfn;
3134}
3135
3136/* If necessary, push the node boundary out for reserve hotadd */
3137static void __meminit account_node_boundary(unsigned int nid,
3138 unsigned long *start_pfn, unsigned long *end_pfn)
3139{
3140 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3141 "Entering account_node_boundary(%u, %lu, %lu)\n",
3142 nid, *start_pfn, *end_pfn);
3143
3144 /* Return if boundary information has not been provided */
3145 if (node_boundary_end_pfn[nid] == 0)
3146 return;
3147
3148 /* Check the boundaries and update if necessary */
3149 if (node_boundary_start_pfn[nid] < *start_pfn)
3150 *start_pfn = node_boundary_start_pfn[nid];
3151 if (node_boundary_end_pfn[nid] > *end_pfn)
3152 *end_pfn = node_boundary_end_pfn[nid];
3153}
3154#else
3155void __init push_node_boundaries(unsigned int nid,
3156 unsigned long start_pfn, unsigned long end_pfn) {}
3157
3158static void __meminit account_node_boundary(unsigned int nid,
3159 unsigned long *start_pfn, unsigned long *end_pfn) {}
3160#endif
3161
3162
3163/**
3164 * get_pfn_range_for_nid - Return the start and end page frames for a node 3320 * get_pfn_range_for_nid - Return the start and end page frames for a node
3165 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3321 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3166 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3322 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3185,9 +3341,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3185 3341
3186 if (*start_pfn == -1UL) 3342 if (*start_pfn == -1UL)
3187 *start_pfn = 0; 3343 *start_pfn = 0;
3188
3189 /* Push the node boundaries out if requested */
3190 account_node_boundary(nid, start_pfn, end_pfn);
3191} 3344}
3192 3345
3193/* 3346/*
@@ -3552,7 +3705,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3552 zone_pcp_init(zone); 3705 zone_pcp_init(zone);
3553 for_each_lru(l) { 3706 for_each_lru(l) {
3554 INIT_LIST_HEAD(&zone->lru[l].list); 3707 INIT_LIST_HEAD(&zone->lru[l].list);
3555 zone->lru[l].nr_scan = 0; 3708 zone->lru[l].nr_saved_scan = 0;
3556 } 3709 }
3557 zone->reclaim_stat.recent_rotated[0] = 0; 3710 zone->reclaim_stat.recent_rotated[0] = 0;
3558 zone->reclaim_stat.recent_rotated[1] = 0; 3711 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -3793,10 +3946,6 @@ void __init remove_all_active_ranges(void)
3793{ 3946{
3794 memset(early_node_map, 0, sizeof(early_node_map)); 3947 memset(early_node_map, 0, sizeof(early_node_map));
3795 nr_nodemap_entries = 0; 3948 nr_nodemap_entries = 0;
3796#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3797 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3798 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3799#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3800} 3949}
3801 3950
3802/* Compare two active node_active_regions */ 3951/* Compare two active node_active_regions */
@@ -4093,6 +4242,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4093 early_node_map[i].start_pfn, 4242 early_node_map[i].start_pfn,
4094 early_node_map[i].end_pfn); 4243 early_node_map[i].end_pfn);
4095 4244
4245 /*
4246 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4247 * that node_mask, clear it at first
4248 */
4249 nodes_clear(node_states[N_HIGH_MEMORY]);
4096 /* Initialise every node */ 4250 /* Initialise every node */
4097 mminit_verify_pageflags_layout(); 4251 mminit_verify_pageflags_layout();
4098 setup_nr_node_ids(); 4252 setup_nr_node_ids();
@@ -4227,8 +4381,8 @@ static void calculate_totalreserve_pages(void)
4227 max = zone->lowmem_reserve[j]; 4381 max = zone->lowmem_reserve[j];
4228 } 4382 }
4229 4383
4230 /* we treat pages_high as reserved pages. */ 4384 /* we treat the high watermark as reserved pages. */
4231 max += zone->pages_high; 4385 max += high_wmark_pages(zone);
4232 4386
4233 if (max > zone->present_pages) 4387 if (max > zone->present_pages)
4234 max = zone->present_pages; 4388 max = zone->present_pages;
@@ -4278,12 +4432,13 @@ static void setup_per_zone_lowmem_reserve(void)
4278} 4432}
4279 4433
4280/** 4434/**
4281 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4435 * setup_per_zone_wmarks - called when min_free_kbytes changes
4436 * or when memory is hot-{added|removed}
4282 * 4437 *
4283 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4438 * Ensures that the watermark[min,low,high] values for each zone are set
4284 * with respect to min_free_kbytes. 4439 * correctly with respect to min_free_kbytes.
4285 */ 4440 */
4286void setup_per_zone_pages_min(void) 4441void setup_per_zone_wmarks(void)
4287{ 4442{
4288 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4443 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4289 unsigned long lowmem_pages = 0; 4444 unsigned long lowmem_pages = 0;
@@ -4308,7 +4463,7 @@ void setup_per_zone_pages_min(void)
4308 * need highmem pages, so cap pages_min to a small 4463 * need highmem pages, so cap pages_min to a small
4309 * value here. 4464 * value here.
4310 * 4465 *
4311 * The (pages_high-pages_low) and (pages_low-pages_min) 4466 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4312 * deltas controls asynch page reclaim, and so should 4467 * deltas controls asynch page reclaim, and so should
4313 * not be capped for highmem. 4468 * not be capped for highmem.
4314 */ 4469 */
@@ -4319,17 +4474,17 @@ void setup_per_zone_pages_min(void)
4319 min_pages = SWAP_CLUSTER_MAX; 4474 min_pages = SWAP_CLUSTER_MAX;
4320 if (min_pages > 128) 4475 if (min_pages > 128)
4321 min_pages = 128; 4476 min_pages = 128;
4322 zone->pages_min = min_pages; 4477 zone->watermark[WMARK_MIN] = min_pages;
4323 } else { 4478 } else {
4324 /* 4479 /*
4325 * If it's a lowmem zone, reserve a number of pages 4480 * If it's a lowmem zone, reserve a number of pages
4326 * proportionate to the zone's size. 4481 * proportionate to the zone's size.
4327 */ 4482 */
4328 zone->pages_min = tmp; 4483 zone->watermark[WMARK_MIN] = tmp;
4329 } 4484 }
4330 4485
4331 zone->pages_low = zone->pages_min + (tmp >> 2); 4486 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4332 zone->pages_high = zone->pages_min + (tmp >> 1); 4487 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4333 setup_zone_migrate_reserve(zone); 4488 setup_zone_migrate_reserve(zone);
4334 spin_unlock_irqrestore(&zone->lock, flags); 4489 spin_unlock_irqrestore(&zone->lock, flags);
4335 } 4490 }
@@ -4339,8 +4494,6 @@ void setup_per_zone_pages_min(void)
4339} 4494}
4340 4495
4341/** 4496/**
4342 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4343 *
4344 * The inactive anon list should be small enough that the VM never has to 4497 * The inactive anon list should be small enough that the VM never has to
4345 * do too much work, but large enough that each inactive page has a chance 4498 * do too much work, but large enough that each inactive page has a chance
4346 * to be referenced again before it is swapped out. 4499 * to be referenced again before it is swapped out.
@@ -4361,21 +4514,26 @@ void setup_per_zone_pages_min(void)
4361 * 1TB 101 10GB 4514 * 1TB 101 10GB
4362 * 10TB 320 32GB 4515 * 10TB 320 32GB
4363 */ 4516 */
4364static void setup_per_zone_inactive_ratio(void) 4517void calculate_zone_inactive_ratio(struct zone *zone)
4365{ 4518{
4366 struct zone *zone; 4519 unsigned int gb, ratio;
4367 4520
4368 for_each_zone(zone) { 4521 /* Zone size in gigabytes */
4369 unsigned int gb, ratio; 4522 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4370 4523 if (gb)
4371 /* Zone size in gigabytes */
4372 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4373 ratio = int_sqrt(10 * gb); 4524 ratio = int_sqrt(10 * gb);
4374 if (!ratio) 4525 else
4375 ratio = 1; 4526 ratio = 1;
4376 4527
4377 zone->inactive_ratio = ratio; 4528 zone->inactive_ratio = ratio;
4378 } 4529}
4530
4531static void __init setup_per_zone_inactive_ratio(void)
4532{
4533 struct zone *zone;
4534
4535 for_each_zone(zone)
4536 calculate_zone_inactive_ratio(zone);
4379} 4537}
4380 4538
4381/* 4539/*
@@ -4402,7 +4560,7 @@ static void setup_per_zone_inactive_ratio(void)
4402 * 8192MB: 11584k 4560 * 8192MB: 11584k
4403 * 16384MB: 16384k 4561 * 16384MB: 16384k
4404 */ 4562 */
4405static int __init init_per_zone_pages_min(void) 4563static int __init init_per_zone_wmark_min(void)
4406{ 4564{
4407 unsigned long lowmem_kbytes; 4565 unsigned long lowmem_kbytes;
4408 4566
@@ -4413,12 +4571,12 @@ static int __init init_per_zone_pages_min(void)
4413 min_free_kbytes = 128; 4571 min_free_kbytes = 128;
4414 if (min_free_kbytes > 65536) 4572 if (min_free_kbytes > 65536)
4415 min_free_kbytes = 65536; 4573 min_free_kbytes = 65536;
4416 setup_per_zone_pages_min(); 4574 setup_per_zone_wmarks();
4417 setup_per_zone_lowmem_reserve(); 4575 setup_per_zone_lowmem_reserve();
4418 setup_per_zone_inactive_ratio(); 4576 setup_per_zone_inactive_ratio();
4419 return 0; 4577 return 0;
4420} 4578}
4421module_init(init_per_zone_pages_min) 4579module_init(init_per_zone_wmark_min)
4422 4580
4423/* 4581/*
4424 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4582 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4430,7 +4588,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4430{ 4588{
4431 proc_dointvec(table, write, file, buffer, length, ppos); 4589 proc_dointvec(table, write, file, buffer, length, ppos);
4432 if (write) 4590 if (write)
4433 setup_per_zone_pages_min(); 4591 setup_per_zone_wmarks();
4434 return 0; 4592 return 0;
4435} 4593}
4436 4594
@@ -4474,7 +4632,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4474 * whenever sysctl_lowmem_reserve_ratio changes. 4632 * whenever sysctl_lowmem_reserve_ratio changes.
4475 * 4633 *
4476 * The reserve ratio obviously has absolutely no relation with the 4634 * The reserve ratio obviously has absolutely no relation with the
4477 * pages_min watermarks. The lowmem reserve ratio can only make sense 4635 * minimum watermarks. The lowmem reserve ratio can only make sense
4478 * if in function of the boot time zone sizes. 4636 * if in function of the boot time zone sizes.
4479 */ 4637 */
4480int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4638int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4501,7 +4659,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4501 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4659 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4502 if (!write || (ret == -EINVAL)) 4660 if (!write || (ret == -EINVAL))
4503 return ret; 4661 return ret;
4504 for_each_zone(zone) { 4662 for_each_populated_zone(zone) {
4505 for_each_online_cpu(cpu) { 4663 for_each_online_cpu(cpu) {
4506 unsigned long high; 4664 unsigned long high;
4507 high = zone->present_pages / percpu_pagelist_fraction; 4665 high = zone->present_pages / percpu_pagelist_fraction;
@@ -4581,23 +4739,13 @@ void *__init alloc_large_system_hash(const char *tablename,
4581 else if (hashdist) 4739 else if (hashdist)
4582 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4740 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4583 else { 4741 else {
4584 unsigned long order = get_order(size);
4585 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4586 /* 4742 /*
4587 * If bucketsize is not a power-of-two, we may free 4743 * If bucketsize is not a power-of-two, we may free
4588 * some pages at the end of hash table. 4744 * some pages at the end of hash table which
4745 * alloc_pages_exact() automatically does
4589 */ 4746 */
4590 if (table) { 4747 if (get_order(size) < MAX_ORDER)
4591 unsigned long alloc_end = (unsigned long)table + 4748 table = alloc_pages_exact(size, GFP_ATOMIC);
4592 (PAGE_SIZE << order);
4593 unsigned long used = (unsigned long)table +
4594 PAGE_ALIGN(size);
4595 split_page(virt_to_page(table), order);
4596 while (used < alloc_end) {
4597 free_page(used);
4598 used += PAGE_SIZE;
4599 }
4600 }
4601 } 4749 }
4602 } while (!table && size > PAGE_SIZE && --log2qty); 4750 } while (!table && size > PAGE_SIZE && --log2qty);
4603 4751
@@ -4615,6 +4763,16 @@ void *__init alloc_large_system_hash(const char *tablename,
4615 if (_hash_mask) 4763 if (_hash_mask)
4616 *_hash_mask = (1 << log2qty) - 1; 4764 *_hash_mask = (1 << log2qty) - 1;
4617 4765
4766 /*
4767 * If hashdist is set, the table allocation is done with __vmalloc()
4768 * which invokes the kmemleak_alloc() callback. This function may also
4769 * be called before the slab and kmemleak are initialised when
4770 * kmemleak simply buffers the request to be executed later
4771 * (GFP_ATOMIC flag ignored in this case).
4772 */
4773 if (!hashdist)
4774 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4775
4618 return table; 4776 return table;
4619} 4777}
4620 4778
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c991df..f22b4ebbd8dc 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid)
69 return 0; 69 return 0;
70} 70}
71 71
72void __init page_cgroup_init(void) 72void __init page_cgroup_init_flatmem(void)
73{ 73{
74 74
75 int nid, fail; 75 int nid, fail;
@@ -83,12 +83,12 @@ void __init page_cgroup_init(void)
83 goto fail; 83 goto fail;
84 } 84 }
85 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 85 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
86 printk(KERN_INFO "please try cgroup_disable=memory option if you" 86 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
87 " don't want\n"); 87 " don't want memory cgroups\n");
88 return; 88 return;
89fail: 89fail:
90 printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); 90 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
91 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); 91 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
92 panic("Out of memory"); 92 panic("Out of memory");
93} 93}
94 94
@@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
99 unsigned long pfn = page_to_pfn(page); 99 unsigned long pfn = page_to_pfn(page);
100 struct mem_section *section = __pfn_to_section(pfn); 100 struct mem_section *section = __pfn_to_section(pfn);
101 101
102 if (!section->page_cgroup)
103 return NULL;
102 return section->page_cgroup + pfn; 104 return section->page_cgroup + pfn;
103} 105}
104 106
@@ -113,16 +115,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
113 if (!section->page_cgroup) { 115 if (!section->page_cgroup) {
114 nid = page_to_nid(pfn_to_page(pfn)); 116 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
116 if (slab_is_available()) { 118 VM_BUG_ON(!slab_is_available());
117 base = kmalloc_node(table_size, 119 base = kmalloc_node(table_size,
118 GFP_KERNEL | __GFP_NOWARN, nid); 120 GFP_KERNEL | __GFP_NOWARN, nid);
119 if (!base) 121 if (!base)
120 base = vmalloc_node(table_size, nid); 122 base = vmalloc_node(table_size, nid);
121 } else {
122 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
123 table_size,
124 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
125 }
126 } else { 123 } else {
127 /* 124 /*
128 * We don't have to allocate page_cgroup again, but 125 * We don't have to allocate page_cgroup again, but
@@ -257,14 +254,14 @@ void __init page_cgroup_init(void)
257 fail = init_section_page_cgroup(pfn); 254 fail = init_section_page_cgroup(pfn);
258 } 255 }
259 if (fail) { 256 if (fail) {
260 printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); 257 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
261 panic("Out of memory"); 258 panic("Out of memory");
262 } else { 259 } else {
263 hotplug_memory_notifier(page_cgroup_callback, 0); 260 hotplug_memory_notifier(page_cgroup_callback, 0);
264 } 261 }
265 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 262 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
266 printk(KERN_INFO "please try cgroup_disable=memory option if you don't" 263 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
267 " want\n"); 264 " want memory cgroups\n");
268} 265}
269 266
270void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 267void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -314,8 +311,6 @@ static int swap_cgroup_prepare(int type)
314 struct swap_cgroup_ctrl *ctrl; 311 struct swap_cgroup_ctrl *ctrl;
315 unsigned long idx, max; 312 unsigned long idx, max;
316 313
317 if (!do_swap_account)
318 return 0;
319 ctrl = &swap_cgroup_ctrl[type]; 314 ctrl = &swap_cgroup_ctrl[type];
320 315
321 for (idx = 0; idx < ctrl->length; idx++) { 316 for (idx = 0; idx < ctrl->length; idx++) {
@@ -352,9 +347,6 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
352 struct swap_cgroup *sc; 347 struct swap_cgroup *sc;
353 unsigned short old; 348 unsigned short old;
354 349
355 if (!do_swap_account)
356 return 0;
357
358 ctrl = &swap_cgroup_ctrl[type]; 350 ctrl = &swap_cgroup_ctrl[type];
359 351
360 mappage = ctrl->map[idx]; 352 mappage = ctrl->map[idx];
@@ -383,9 +375,6 @@ unsigned short lookup_swap_cgroup(swp_entry_t ent)
383 struct swap_cgroup *sc; 375 struct swap_cgroup *sc;
384 unsigned short ret; 376 unsigned short ret;
385 377
386 if (!do_swap_account)
387 return 0;
388
389 ctrl = &swap_cgroup_ctrl[type]; 378 ctrl = &swap_cgroup_ctrl[type];
390 mappage = ctrl->map[idx]; 379 mappage = ctrl->map[idx];
391 sc = page_address(mappage); 380 sc = page_address(mappage);
diff --git a/mm/page_io.c b/mm/page_io.c
index 3023c475e041..c6f3e5071de3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -120,7 +120,7 @@ out:
120 return ret; 120 return ret;
121} 121}
122 122
123int swap_readpage(struct file *file, struct page *page) 123int swap_readpage(struct page *page)
124{ 124{
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8fbca12..c0b2c1a76e81 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -23,7 +23,7 @@
23 * Allocation is done in offset-size areas of single unit space. Ie, 23 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart. 26 * percpu base registers pcpu_unit_size apart.
27 * 27 *
28 * There are usually many small percpu allocations many of them as 28 * There are usually many small percpu allocations many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists 29 * small as 4 bytes. The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
38 * region and negative allocated. Allocation inside a chunk is done 38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching 39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator. 40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk 41 * Chunks can be determined from the address using the index field
42 * mapping during free. 42 * in the page struct. The index field contains a pointer to the chunk.
43 * 43 *
44 * To use this allocator, arch code should do the followings. 44 * To use this allocator, arch code should do the followings.
45 * 45 *
@@ -61,7 +61,6 @@
61#include <linux/mutex.h> 61#include <linux/mutex.h>
62#include <linux/percpu.h> 62#include <linux/percpu.h>
63#include <linux/pfn.h> 63#include <linux/pfn.h>
64#include <linux/rbtree.h>
65#include <linux/slab.h> 64#include <linux/slab.h>
66#include <linux/spinlock.h> 65#include <linux/spinlock.h>
67#include <linux/vmalloc.h> 66#include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
88 87
89struct pcpu_chunk { 88struct pcpu_chunk {
90 struct list_head list; /* linked to pcpu_slot lists */ 89 struct list_head list; /* linked to pcpu_slot lists */
91 struct rb_node rb_node; /* key is chunk->vm->addr */
92 int free_size; /* free bytes in the chunk */ 90 int free_size; /* free bytes in the chunk */
93 int contig_hint; /* max contiguous size hint */ 91 int contig_hint; /* max contiguous size hint */
94 struct vm_struct *vm; /* mapped vmalloc region */ 92 struct vm_struct *vm; /* mapped vmalloc region */
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
110void *pcpu_base_addr __read_mostly; 108void *pcpu_base_addr __read_mostly;
111EXPORT_SYMBOL_GPL(pcpu_base_addr); 109EXPORT_SYMBOL_GPL(pcpu_base_addr);
112 110
113/* optional reserved chunk, only accessible for reserved allocations */ 111/*
112 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different
114 * ways and thus often doesn't live in the vmalloc area.
115 */
116static struct pcpu_chunk *pcpu_first_chunk;
117
118/*
119 * Optional reserved chunk. This chunk reserves part of the first
120 * chunk and serves it for reserved allocations. The amount of
121 * reserved offset is in pcpu_reserved_chunk_limit. When reserved
122 * area doesn't exist, the following variables contain NULL and 0
123 * respectively.
124 */
114static struct pcpu_chunk *pcpu_reserved_chunk; 125static struct pcpu_chunk *pcpu_reserved_chunk;
115/* offset limit of the reserved chunk */
116static int pcpu_reserved_chunk_limit; 126static int pcpu_reserved_chunk_limit;
117 127
118/* 128/*
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
121 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
122 * protects allocation/reclaim paths, chunks and chunk->page arrays. 132 * protects allocation/reclaim paths, chunks and chunk->page arrays.
123 * The latter is a spinlock and protects the index data structures - 133 * The latter is a spinlock and protects the index data structures -
124 * chunk slots, rbtree, chunks and area maps in chunks. 134 * chunk slots, chunks and area maps in chunks.
125 * 135 *
126 * During allocation, pcpu_alloc_mutex is kept locked all the time and 136 * During allocation, pcpu_alloc_mutex is kept locked all the time and
127 * pcpu_lock is grabbed and released as necessary. All actual memory 137 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
140static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ 150static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
141 151
142static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 152static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
143static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
144 153
145/* reclaim work to release fully free chunks, scheduled from free path */ 154/* reclaim work to release fully free chunks, scheduled from free path */
146static void pcpu_reclaim(struct work_struct *work); 155static void pcpu_reclaim(struct work_struct *work);
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
191 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
192} 201}
193 202
203/* set the pointer to a chunk in a page struct */
204static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
205{
206 page->index = (unsigned long)pcpu;
207}
208
209/* obtain pointer to a chunk from a page struct */
210static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
211{
212 return (struct pcpu_chunk *)page->index;
213}
214
194/** 215/**
195 * pcpu_mem_alloc - allocate memory 216 * pcpu_mem_alloc - allocate memory
196 * @size: bytes to allocate 217 * @size: bytes to allocate
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
257 } 278 }
258} 279}
259 280
260static struct rb_node **pcpu_chunk_rb_search(void *addr,
261 struct rb_node **parentp)
262{
263 struct rb_node **p = &pcpu_addr_root.rb_node;
264 struct rb_node *parent = NULL;
265 struct pcpu_chunk *chunk;
266
267 while (*p) {
268 parent = *p;
269 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
270
271 if (addr < chunk->vm->addr)
272 p = &(*p)->rb_left;
273 else if (addr > chunk->vm->addr)
274 p = &(*p)->rb_right;
275 else
276 break;
277 }
278
279 if (parentp)
280 *parentp = parent;
281 return p;
282}
283
284/** 281/**
285 * pcpu_chunk_addr_search - search for chunk containing specified address 282 * pcpu_chunk_addr_search - determine chunk containing specified address
286 * @addr: address to search for 283 * @addr: address for which the chunk needs to be determined.
287 *
288 * Look for chunk which might contain @addr. More specifically, it
289 * searchs for the chunk with the highest start address which isn't
290 * beyond @addr.
291 *
292 * CONTEXT:
293 * pcpu_lock.
294 * 284 *
295 * RETURNS: 285 * RETURNS:
296 * The address of the found chunk. 286 * The address of the found chunk.
297 */ 287 */
298static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 288static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
299{ 289{
300 struct rb_node *n, *parent; 290 void *first_start = pcpu_first_chunk->vm->addr;
301 struct pcpu_chunk *chunk;
302 291
303 /* is it in the reserved chunk? */ 292 /* is it in the first chunk? */
304 if (pcpu_reserved_chunk) { 293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
305 void *start = pcpu_reserved_chunk->vm->addr; 294 /* is it in the reserved area? */
306 295 if (addr < first_start + pcpu_reserved_chunk_limit)
307 if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
308 return pcpu_reserved_chunk; 296 return pcpu_reserved_chunk;
297 return pcpu_first_chunk;
309 } 298 }
310 299
311 /* nah... search the regular ones */ 300 return pcpu_get_page_chunk(vmalloc_to_page(addr));
312 n = *pcpu_chunk_rb_search(addr, &parent);
313 if (!n) {
314 /* no exactly matching chunk, the parent is the closest */
315 n = parent;
316 BUG_ON(!n);
317 }
318 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
319
320 if (addr < chunk->vm->addr) {
321 /* the parent was the next one, look for the previous one */
322 n = rb_prev(n);
323 BUG_ON(!n);
324 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
325 }
326
327 return chunk;
328}
329
330/**
331 * pcpu_chunk_addr_insert - insert chunk into address rb tree
332 * @new: chunk to insert
333 *
334 * Insert @new into address rb tree.
335 *
336 * CONTEXT:
337 * pcpu_lock.
338 */
339static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
340{
341 struct rb_node **p, *parent;
342
343 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
344 BUG_ON(*p);
345 rb_link_node(&new->rb_node, parent, p);
346 rb_insert_color(&new->rb_node, &pcpu_addr_root);
347} 301}
348 302
349/** 303/**
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
755 alloc_mask, 0); 709 alloc_mask, 0);
756 if (!*pagep) 710 if (!*pagep)
757 goto err; 711 goto err;
712 pcpu_set_page_chunk(*pagep, chunk);
758 } 713 }
759 } 714 }
760 715
@@ -879,7 +834,6 @@ restart:
879 834
880 spin_lock_irq(&pcpu_lock); 835 spin_lock_irq(&pcpu_lock);
881 pcpu_chunk_relocate(chunk, -1); 836 pcpu_chunk_relocate(chunk, -1);
882 pcpu_chunk_addr_insert(chunk);
883 goto restart; 837 goto restart;
884 838
885area_found: 839area_found:
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
968 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 922 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
969 continue; 923 continue;
970 924
971 rb_erase(&chunk->rb_node, &pcpu_addr_root);
972 list_move(&chunk->list, &todo); 925 list_move(&chunk->list, &todo);
973 } 926 }
974 927
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1147 1100
1148 if (reserved_size) { 1101 if (reserved_size) {
1149 schunk->free_size = reserved_size; 1102 schunk->free_size = reserved_size;
1150 pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ 1103 pcpu_reserved_chunk = schunk;
1104 pcpu_reserved_chunk_limit = static_size + reserved_size;
1151 } else { 1105 } else {
1152 schunk->free_size = dyn_size; 1106 schunk->free_size = dyn_size;
1153 dyn_size = 0; /* dynamic area covered */ 1107 dyn_size = 0; /* dynamic area covered */
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1158 if (schunk->free_size) 1112 if (schunk->free_size)
1159 schunk->map[schunk->map_used++] = schunk->free_size; 1113 schunk->map[schunk->map_used++] = schunk->free_size;
1160 1114
1161 pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1162
1163 /* init dynamic chunk if necessary */ 1115 /* init dynamic chunk if necessary */
1164 if (dyn_size) { 1116 if (dyn_size) {
1165 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1117 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1226 } 1178 }
1227 1179
1228 /* link the first chunk in */ 1180 /* link the first chunk in */
1229 if (!dchunk) { 1181 pcpu_first_chunk = dchunk ?: schunk;
1230 pcpu_chunk_relocate(schunk, -1); 1182 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1231 pcpu_chunk_addr_insert(schunk);
1232 } else {
1233 pcpu_chunk_relocate(dchunk, -1);
1234 pcpu_chunk_addr_insert(dchunk);
1235 }
1236 1183
1237 /* we're done */ 1184 /* we're done */
1238 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1185 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d525513..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -133,15 +133,12 @@ out:
133} 133}
134 134
135/* 135/*
136 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 136 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
137 * the pages first, then submits them all for I/O. This avoids the very bad 137 * the pages first, then submits them all for I/O. This avoids the very bad
138 * behaviour which would occur if page allocations are causing VM writeback. 138 * behaviour which would occur if page allocations are causing VM writeback.
139 * We really don't want to intermingle reads and writes like that. 139 * We really don't want to intermingle reads and writes like that.
140 * 140 *
141 * Returns the number of pages requested, or the maximum amount of I/O allowed. 141 * Returns the number of pages requested, or the maximum amount of I/O allowed.
142 *
143 * do_page_cache_readahead() returns -1 if it encountered request queue
144 * congestion.
145 */ 142 */
146static int 143static int
147__do_page_cache_readahead(struct address_space *mapping, struct file *filp, 144__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 207 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
211 return -EINVAL; 208 return -EINVAL;
212 209
210 nr_to_read = max_sane_readahead(nr_to_read);
213 while (nr_to_read) { 211 while (nr_to_read) {
214 int err; 212 int err;
215 213
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
231} 229}
232 230
233/* 231/*
234 * This version skips the IO if the queue is read-congested, and will tell the
235 * block layer to abandon the readahead if request allocation would block.
236 *
237 * force_page_cache_readahead() will ignore queue congestion and will block on
238 * request queues.
239 */
240int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
241 pgoff_t offset, unsigned long nr_to_read)
242{
243 if (bdi_read_congested(mapping->backing_dev_info))
244 return -1;
245
246 return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
247}
248
249/*
250 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 232 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
251 * sensible upper limit. 233 * sensible upper limit.
252 */ 234 */
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
259/* 241/*
260 * Submit IO for the read-ahead request in file_ra_state. 242 * Submit IO for the read-ahead request in file_ra_state.
261 */ 243 */
262static unsigned long ra_submit(struct file_ra_state *ra, 244unsigned long ra_submit(struct file_ra_state *ra,
263 struct address_space *mapping, struct file *filp) 245 struct address_space *mapping, struct file *filp)
264{ 246{
265 int actual; 247 int actual;
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
348 */ 330 */
349 331
350/* 332/*
333 * Count contiguously cached pages from @offset-1 to @offset-@max,
334 * this count is a conservative estimation of
335 * - length of the sequential read sequence, or
336 * - thrashing threshold in memory tight systems
337 */
338static pgoff_t count_history_pages(struct address_space *mapping,
339 struct file_ra_state *ra,
340 pgoff_t offset, unsigned long max)
341{
342 pgoff_t head;
343
344 rcu_read_lock();
345 head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
346 rcu_read_unlock();
347
348 return offset - 1 - head;
349}
350
351/*
352 * page cache context based read-ahead
353 */
354static int try_context_readahead(struct address_space *mapping,
355 struct file_ra_state *ra,
356 pgoff_t offset,
357 unsigned long req_size,
358 unsigned long max)
359{
360 pgoff_t size;
361
362 size = count_history_pages(mapping, ra, offset, max);
363
364 /*
365 * no history pages:
366 * it could be a random read
367 */
368 if (!size)
369 return 0;
370
371 /*
372 * starts from beginning of file:
373 * it is a strong indication of long-run stream (or whole-file-read)
374 */
375 if (size >= offset)
376 size *= 2;
377
378 ra->start = offset;
379 ra->size = get_init_ra_size(size + req_size, max);
380 ra->async_size = ra->size;
381
382 return 1;
383}
384
385/*
351 * A minimal readahead algorithm for trivial sequential/random reads. 386 * A minimal readahead algorithm for trivial sequential/random reads.
352 */ 387 */
353static unsigned long 388static unsigned long
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
356 bool hit_readahead_marker, pgoff_t offset, 391 bool hit_readahead_marker, pgoff_t offset,
357 unsigned long req_size) 392 unsigned long req_size)
358{ 393{
359 int max = ra->ra_pages; /* max readahead pages */ 394 unsigned long max = max_sane_readahead(ra->ra_pages);
360 pgoff_t prev_offset; 395
361 int sequential; 396 /*
397 * start of file
398 */
399 if (!offset)
400 goto initial_readahead;
362 401
363 /* 402 /*
364 * It's the expected callback offset, assume sequential access. 403 * It's the expected callback offset, assume sequential access.
365 * Ramp up sizes, and push forward the readahead window. 404 * Ramp up sizes, and push forward the readahead window.
366 */ 405 */
367 if (offset && (offset == (ra->start + ra->size - ra->async_size) || 406 if ((offset == (ra->start + ra->size - ra->async_size) ||
368 offset == (ra->start + ra->size))) { 407 offset == (ra->start + ra->size))) {
369 ra->start += ra->size; 408 ra->start += ra->size;
370 ra->size = get_next_ra_size(ra, max); 409 ra->size = get_next_ra_size(ra, max);
371 ra->async_size = ra->size; 410 ra->async_size = ra->size;
372 goto readit; 411 goto readit;
373 } 412 }
374 413
375 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
376 sequential = offset - prev_offset <= 1UL || req_size > max;
377
378 /*
379 * Standalone, small read.
380 * Read as is, and do not pollute the readahead state.
381 */
382 if (!hit_readahead_marker && !sequential) {
383 return __do_page_cache_readahead(mapping, filp,
384 offset, req_size, 0);
385 }
386
387 /* 414 /*
388 * Hit a marked page without valid readahead state. 415 * Hit a marked page without valid readahead state.
389 * E.g. interleaved reads. 416 * E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
394 pgoff_t start; 421 pgoff_t start;
395 422
396 rcu_read_lock(); 423 rcu_read_lock();
397 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); 424 start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
398 rcu_read_unlock(); 425 rcu_read_unlock();
399 426
400 if (!start || start - offset > max) 427 if (!start || start - offset > max)
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
402 429
403 ra->start = start; 430 ra->start = start;
404 ra->size = start - offset; /* old async_size */ 431 ra->size = start - offset; /* old async_size */
432 ra->size += req_size;
405 ra->size = get_next_ra_size(ra, max); 433 ra->size = get_next_ra_size(ra, max);
406 ra->async_size = ra->size; 434 ra->async_size = ra->size;
407 goto readit; 435 goto readit;
408 } 436 }
409 437
410 /* 438 /*
411 * It may be one of 439 * oversize read
412 * - first read on start of file 440 */
413 * - sequential cache miss 441 if (req_size > max)
414 * - oversize random read 442 goto initial_readahead;
415 * Start readahead for it. 443
444 /*
445 * sequential cache miss
446 */
447 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
448 goto initial_readahead;
449
450 /*
451 * Query the page cache and look for the traces(cached history pages)
452 * that a sequential stream would leave behind.
453 */
454 if (try_context_readahead(mapping, ra, offset, req_size, max))
455 goto readit;
456
457 /*
458 * standalone, small random read
459 * Read as is, and do not pollute the readahead state.
416 */ 460 */
461 return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
462
463initial_readahead:
417 ra->start = offset; 464 ra->start = offset;
418 ra->size = get_init_ra_size(req_size, max); 465 ra->size = get_init_ra_size(req_size, max);
419 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 466 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
420 467
421readit: 468readit:
469 /*
470 * Will this read hit the readahead marker made by itself?
471 * If so, trigger the readahead marker hit now, and merge
472 * the resulted next readahead window into the current one.
473 */
474 if (offset == ra->start && ra->size == ra->async_size) {
475 ra->async_size = get_next_ra_size(ra, max);
476 ra->size += ra->async_size;
477 }
478
422 return ra_submit(ra, mapping, filp); 479 return ra_submit(ra, mapping, filp);
423} 480}
424 481
diff --git a/mm/rmap.c b/mm/rmap.c
index 23122af32611..836c6c63e1f2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
333 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
334 */ 334 */
335static int page_referenced_one(struct page *page, 335static int page_referenced_one(struct page *page,
336 struct vm_area_struct *vma, unsigned int *mapcount) 336 struct vm_area_struct *vma,
337 unsigned int *mapcount,
338 unsigned long *vm_flags)
337{ 339{
338 struct mm_struct *mm = vma->vm_mm; 340 struct mm_struct *mm = vma->vm_mm;
339 unsigned long address; 341 unsigned long address;
@@ -381,11 +383,14 @@ out_unmap:
381 (*mapcount)--; 383 (*mapcount)--;
382 pte_unmap_unlock(pte, ptl); 384 pte_unmap_unlock(pte, ptl);
383out: 385out:
386 if (referenced)
387 *vm_flags |= vma->vm_flags;
384 return referenced; 388 return referenced;
385} 389}
386 390
387static int page_referenced_anon(struct page *page, 391static int page_referenced_anon(struct page *page,
388 struct mem_cgroup *mem_cont) 392 struct mem_cgroup *mem_cont,
393 unsigned long *vm_flags)
389{ 394{
390 unsigned int mapcount; 395 unsigned int mapcount;
391 struct anon_vma *anon_vma; 396 struct anon_vma *anon_vma;
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page,
405 */ 410 */
406 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 411 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
407 continue; 412 continue;
408 referenced += page_referenced_one(page, vma, &mapcount); 413 referenced += page_referenced_one(page, vma,
414 &mapcount, vm_flags);
409 if (!mapcount) 415 if (!mapcount)
410 break; 416 break;
411 } 417 }
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page,
418 * page_referenced_file - referenced check for object-based rmap 424 * page_referenced_file - referenced check for object-based rmap
419 * @page: the page we're checking references on. 425 * @page: the page we're checking references on.
420 * @mem_cont: target memory controller 426 * @mem_cont: target memory controller
427 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
421 * 428 *
422 * For an object-based mapped page, find all the places it is mapped and 429 * For an object-based mapped page, find all the places it is mapped and
423 * check/clear the referenced flag. This is done by following the page->mapping 430 * check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page,
427 * This function is only called from page_referenced for object-based pages. 434 * This function is only called from page_referenced for object-based pages.
428 */ 435 */
429static int page_referenced_file(struct page *page, 436static int page_referenced_file(struct page *page,
430 struct mem_cgroup *mem_cont) 437 struct mem_cgroup *mem_cont,
438 unsigned long *vm_flags)
431{ 439{
432 unsigned int mapcount; 440 unsigned int mapcount;
433 struct address_space *mapping = page->mapping; 441 struct address_space *mapping = page->mapping;
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page,
467 */ 475 */
468 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 476 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
469 continue; 477 continue;
470 referenced += page_referenced_one(page, vma, &mapcount); 478 referenced += page_referenced_one(page, vma,
479 &mapcount, vm_flags);
471 if (!mapcount) 480 if (!mapcount)
472 break; 481 break;
473 } 482 }
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page,
481 * @page: the page to test 490 * @page: the page to test
482 * @is_locked: caller holds lock on the page 491 * @is_locked: caller holds lock on the page
483 * @mem_cont: target memory controller 492 * @mem_cont: target memory controller
493 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
484 * 494 *
485 * Quick test_and_clear_referenced for all mappings to a page, 495 * Quick test_and_clear_referenced for all mappings to a page,
486 * returns the number of ptes which referenced the page. 496 * returns the number of ptes which referenced the page.
487 */ 497 */
488int page_referenced(struct page *page, int is_locked, 498int page_referenced(struct page *page,
489 struct mem_cgroup *mem_cont) 499 int is_locked,
500 struct mem_cgroup *mem_cont,
501 unsigned long *vm_flags)
490{ 502{
491 int referenced = 0; 503 int referenced = 0;
492 504
493 if (TestClearPageReferenced(page)) 505 if (TestClearPageReferenced(page))
494 referenced++; 506 referenced++;
495 507
508 *vm_flags = 0;
496 if (page_mapped(page) && page->mapping) { 509 if (page_mapped(page) && page->mapping) {
497 if (PageAnon(page)) 510 if (PageAnon(page))
498 referenced += page_referenced_anon(page, mem_cont); 511 referenced += page_referenced_anon(page, mem_cont,
512 vm_flags);
499 else if (is_locked) 513 else if (is_locked)
500 referenced += page_referenced_file(page, mem_cont); 514 referenced += page_referenced_file(page, mem_cont,
515 vm_flags);
501 else if (!trylock_page(page)) 516 else if (!trylock_page(page))
502 referenced++; 517 referenced++;
503 else { 518 else {
504 if (page->mapping) 519 if (page->mapping)
505 referenced += 520 referenced += page_referenced_file(page,
506 page_referenced_file(page, mem_cont); 521 mem_cont, vm_flags);
507 unlock_page(page); 522 unlock_page(page);
508 } 523 }
509 } 524 }
@@ -688,8 +703,10 @@ void page_add_new_anon_rmap(struct page *page,
688 */ 703 */
689void page_add_file_rmap(struct page *page) 704void page_add_file_rmap(struct page *page)
690{ 705{
691 if (atomic_inc_and_test(&page->_mapcount)) 706 if (atomic_inc_and_test(&page->_mapcount)) {
692 __inc_zone_page_state(page, NR_FILE_MAPPED); 707 __inc_zone_page_state(page, NR_FILE_MAPPED);
708 mem_cgroup_update_mapped_file_stat(page, 1);
709 }
693} 710}
694 711
695#ifdef CONFIG_DEBUG_VM 712#ifdef CONFIG_DEBUG_VM
@@ -738,6 +755,7 @@ void page_remove_rmap(struct page *page)
738 mem_cgroup_uncharge_page(page); 755 mem_cgroup_uncharge_page(page);
739 __dec_zone_page_state(page, 756 __dec_zone_page_state(page,
740 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 757 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
758 mem_cgroup_update_mapped_file_stat(page, -1);
741 /* 759 /*
742 * It would be tidy to reset the PageAnon mapping here, 760 * It would be tidy to reset the PageAnon mapping here,
743 * but that might overwrite a racing page_add_anon_rmap 761 * but that might overwrite a racing page_add_anon_rmap
@@ -1202,7 +1220,6 @@ int try_to_unmap(struct page *page, int migration)
1202 return ret; 1220 return ret;
1203} 1221}
1204 1222
1205#ifdef CONFIG_UNEVICTABLE_LRU
1206/** 1223/**
1207 * try_to_munlock - try to munlock a page 1224 * try_to_munlock - try to munlock a page
1208 * @page: the page to be munlocked 1225 * @page: the page to be munlocked
@@ -1226,4 +1243,4 @@ int try_to_munlock(struct page *page)
1226 else 1243 else
1227 return try_to_unmap_file(page, 1, 0); 1244 return try_to_unmap_file(page, 1, 0);
1228} 1245}
1229#endif 1246
diff --git a/mm/shmem.c b/mm/shmem.c
index b25f95ce3db7..d713239ce2ce 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 swap_free(swap); 1100 swapcache_free(swap, NULL);
1101redirty: 1101redirty:
1102 set_page_dirty(page); 1102 set_page_dirty(page);
1103 if (wbc->for_reclaim) 1103 if (wbc->for_reclaim)
@@ -1558,6 +1558,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
1558 spin_lock_init(&info->lock); 1558 spin_lock_init(&info->lock);
1559 info->flags = flags & VM_NORESERVE; 1559 info->flags = flags & VM_NORESERVE;
1560 INIT_LIST_HEAD(&info->swaplist); 1560 INIT_LIST_HEAD(&info->swaplist);
1561 cache_no_acl(inode);
1561 1562
1562 switch (mode & S_IFMT) { 1563 switch (mode & S_IFMT) {
1563 default: 1564 default:
@@ -2388,7 +2389,6 @@ static void shmem_destroy_inode(struct inode *inode)
2388 /* only struct inode is valid if it's an inline symlink */ 2389 /* only struct inode is valid if it's an inline symlink */
2389 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2390 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2390 } 2391 }
2391 shmem_acl_destroy_inode(inode);
2392 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2392 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2393} 2393}
2394 2394
@@ -2397,10 +2397,6 @@ static void init_once(void *foo)
2397 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2397 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2398 2398
2399 inode_init_once(&p->vfs_inode); 2399 inode_init_once(&p->vfs_inode);
2400#ifdef CONFIG_TMPFS_POSIX_ACL
2401 p->i_acl = NULL;
2402 p->i_default_acl = NULL;
2403#endif
2404} 2400}
2405 2401
2406static int init_inodecache(void) 2402static int init_inodecache(void)
@@ -2612,7 +2608,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2612 * @size: size to be set for the file 2608 * @size: size to be set for the file
2613 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2609 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2614 */ 2610 */
2615struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) 2611struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2616{ 2612{
2617 int error; 2613 int error;
2618 struct file *file; 2614 struct file *file;
@@ -2659,6 +2655,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2659 if (error) 2655 if (error)
2660 goto close_file; 2656 goto close_file;
2661#endif 2657#endif
2658 ima_counts_get(file);
2662 return file; 2659 return file;
2663 2660
2664close_file: 2661close_file:
@@ -2684,7 +2681,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2684 if (IS_ERR(file)) 2681 if (IS_ERR(file))
2685 return PTR_ERR(file); 2682 return PTR_ERR(file);
2686 2683
2687 ima_shm_check(file);
2688 if (vma->vm_file) 2684 if (vma->vm_file)
2689 fput(vma->vm_file); 2685 fput(vma->vm_file);
2690 vma->vm_file = file; 2686 vma->vm_file = file;
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 8e5aadd7dcd6..606a8e757a42 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type)
22 spin_lock(&inode->i_lock); 22 spin_lock(&inode->i_lock);
23 switch(type) { 23 switch(type) {
24 case ACL_TYPE_ACCESS: 24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(SHMEM_I(inode)->i_acl); 25 acl = posix_acl_dup(inode->i_acl);
26 break; 26 break;
27 27
28 case ACL_TYPE_DEFAULT: 28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); 29 acl = posix_acl_dup(inode->i_default_acl);
30 break; 30 break;
31 } 31 }
32 spin_unlock(&inode->i_lock); 32 spin_unlock(&inode->i_lock);
@@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
45 spin_lock(&inode->i_lock); 45 spin_lock(&inode->i_lock);
46 switch(type) { 46 switch(type) {
47 case ACL_TYPE_ACCESS: 47 case ACL_TYPE_ACCESS:
48 free = SHMEM_I(inode)->i_acl; 48 free = inode->i_acl;
49 SHMEM_I(inode)->i_acl = posix_acl_dup(acl); 49 inode->i_acl = posix_acl_dup(acl);
50 break; 50 break;
51 51
52 case ACL_TYPE_DEFAULT: 52 case ACL_TYPE_DEFAULT:
53 free = SHMEM_I(inode)->i_default_acl; 53 free = inode->i_default_acl;
54 SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); 54 inode->i_default_acl = posix_acl_dup(acl);
55 break; 55 break;
56 } 56 }
57 spin_unlock(&inode->i_lock); 57 spin_unlock(&inode->i_lock);
@@ -155,23 +155,6 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
155} 155}
156 156
157/** 157/**
158 * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode
159 *
160 * This is done before destroying the actual inode.
161 */
162
163void
164shmem_acl_destroy_inode(struct inode *inode)
165{
166 if (SHMEM_I(inode)->i_acl)
167 posix_acl_release(SHMEM_I(inode)->i_acl);
168 SHMEM_I(inode)->i_acl = NULL;
169 if (SHMEM_I(inode)->i_default_acl)
170 posix_acl_release(SHMEM_I(inode)->i_default_acl);
171 SHMEM_I(inode)->i_default_acl = NULL;
172}
173
174/**
175 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
176 */ 159 */
177static int 160static int
diff --git a/mm/slab.c b/mm/slab.c
index 9a90b00d2f91..e74a16e4ced6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,17 +102,19 @@
102#include <linux/cpu.h> 102#include <linux/cpu.h>
103#include <linux/sysctl.h> 103#include <linux/sysctl.h>
104#include <linux/module.h> 104#include <linux/module.h>
105#include <trace/kmemtrace.h> 105#include <linux/kmemtrace.h>
106#include <linux/rcupdate.h> 106#include <linux/rcupdate.h>
107#include <linux/string.h> 107#include <linux/string.h>
108#include <linux/uaccess.h> 108#include <linux/uaccess.h>
109#include <linux/nodemask.h> 109#include <linux/nodemask.h>
110#include <linux/kmemleak.h>
110#include <linux/mempolicy.h> 111#include <linux/mempolicy.h>
111#include <linux/mutex.h> 112#include <linux/mutex.h>
112#include <linux/fault-inject.h> 113#include <linux/fault-inject.h>
113#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
114#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
115#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
116 118
117#include <asm/cacheflush.h> 119#include <asm/cacheflush.h>
118#include <asm/tlbflush.h> 120#include <asm/tlbflush.h>
@@ -178,13 +180,13 @@
178 SLAB_STORE_USER | \ 180 SLAB_STORE_USER | \
179 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
180 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
181 SLAB_DEBUG_OBJECTS) 183 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
182#else 184#else
183# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 185# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
184 SLAB_CACHE_DMA | \ 186 SLAB_CACHE_DMA | \
185 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 187 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
186 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 188 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
187 SLAB_DEBUG_OBJECTS) 189 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
188#endif 190#endif
189 191
190/* 192/*
@@ -315,7 +317,7 @@ static int drain_freelist(struct kmem_cache *cache,
315 struct kmem_list3 *l3, int tofree); 317 struct kmem_list3 *l3, int tofree);
316static void free_block(struct kmem_cache *cachep, void **objpp, int len, 318static void free_block(struct kmem_cache *cachep, void **objpp, int len,
317 int node); 319 int node);
318static int enable_cpucache(struct kmem_cache *cachep); 320static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
319static void cache_reap(struct work_struct *unused); 321static void cache_reap(struct work_struct *unused);
320 322
321/* 323/*
@@ -373,87 +375,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
373 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 375 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
374 } while (0) 376 } while (0)
375 377
376/*
377 * struct kmem_cache
378 *
379 * manages a cache.
380 */
381
382struct kmem_cache {
383/* 1) per-cpu data, touched during every alloc/free */
384 struct array_cache *array[NR_CPUS];
385/* 2) Cache tunables. Protected by cache_chain_mutex */
386 unsigned int batchcount;
387 unsigned int limit;
388 unsigned int shared;
389
390 unsigned int buffer_size;
391 u32 reciprocal_buffer_size;
392/* 3) touched by every alloc & free from the backend */
393
394 unsigned int flags; /* constant flags */
395 unsigned int num; /* # of objs per slab */
396
397/* 4) cache_grow/shrink */
398 /* order of pgs per slab (2^n) */
399 unsigned int gfporder;
400
401 /* force GFP flags, e.g. GFP_DMA */
402 gfp_t gfpflags;
403
404 size_t colour; /* cache colouring range */
405 unsigned int colour_off; /* colour offset */
406 struct kmem_cache *slabp_cache;
407 unsigned int slab_size;
408 unsigned int dflags; /* dynamic flags */
409
410 /* constructor func */
411 void (*ctor)(void *obj);
412
413/* 5) cache creation/removal */
414 const char *name;
415 struct list_head next;
416
417/* 6) statistics */
418#if STATS
419 unsigned long num_active;
420 unsigned long num_allocations;
421 unsigned long high_mark;
422 unsigned long grown;
423 unsigned long reaped;
424 unsigned long errors;
425 unsigned long max_freeable;
426 unsigned long node_allocs;
427 unsigned long node_frees;
428 unsigned long node_overflow;
429 atomic_t allochit;
430 atomic_t allocmiss;
431 atomic_t freehit;
432 atomic_t freemiss;
433#endif
434#if DEBUG
435 /*
436 * If debugging is enabled, then the allocator can add additional
437 * fields and/or padding to every object. buffer_size contains the total
438 * object size including these internal fields, the following two
439 * variables contain the offset to the user object and its size.
440 */
441 int obj_offset;
442 int obj_size;
443#endif
444 /*
445 * We put nodelists[] at the end of kmem_cache, because we want to size
446 * this array to nr_node_ids slots instead of MAX_NUMNODES
447 * (see kmem_cache_init())
448 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
449 * is statically defined, so we reserve the max number of nodes.
450 */
451 struct kmem_list3 *nodelists[MAX_NUMNODES];
452 /*
453 * Do not add fields after nodelists[]
454 */
455};
456
457#define CFLGS_OFF_SLAB (0x80000000UL) 378#define CFLGS_OFF_SLAB (0x80000000UL)
458#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 379#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
459 380
@@ -752,6 +673,7 @@ static enum {
752 NONE, 673 NONE,
753 PARTIAL_AC, 674 PARTIAL_AC,
754 PARTIAL_L3, 675 PARTIAL_L3,
676 EARLY,
755 FULL 677 FULL
756} g_cpucache_up; 678} g_cpucache_up;
757 679
@@ -760,7 +682,7 @@ static enum {
760 */ 682 */
761int slab_is_available(void) 683int slab_is_available(void)
762{ 684{
763 return g_cpucache_up == FULL; 685 return g_cpucache_up >= EARLY;
764} 686}
765 687
766static DEFINE_PER_CPU(struct delayed_work, reap_work); 688static DEFINE_PER_CPU(struct delayed_work, reap_work);
@@ -890,7 +812,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
890 */ 812 */
891 813
892static int use_alien_caches __read_mostly = 1; 814static int use_alien_caches __read_mostly = 1;
893static int numa_platform __read_mostly = 1;
894static int __init noaliencache_setup(char *s) 815static int __init noaliencache_setup(char *s)
895{ 816{
896 use_alien_caches = 0; 817 use_alien_caches = 0;
@@ -958,12 +879,20 @@ static void __cpuinit start_cpu_timer(int cpu)
958} 879}
959 880
960static struct array_cache *alloc_arraycache(int node, int entries, 881static struct array_cache *alloc_arraycache(int node, int entries,
961 int batchcount) 882 int batchcount, gfp_t gfp)
962{ 883{
963 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 884 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
964 struct array_cache *nc = NULL; 885 struct array_cache *nc = NULL;
965 886
966 nc = kmalloc_node(memsize, GFP_KERNEL, node); 887 nc = kmalloc_node(memsize, gfp, node);
888 /*
889 * The array_cache structures contain pointers to free object.
890 * However, when such objects are allocated or transfered to another
891 * cache the pointers are not cleared and they could be counted as
892 * valid references during a kmemleak scan. Therefore, kmemleak must
893 * not scan such objects.
894 */
895 kmemleak_no_scan(nc);
967 if (nc) { 896 if (nc) {
968 nc->avail = 0; 897 nc->avail = 0;
969 nc->limit = entries; 898 nc->limit = entries;
@@ -1003,7 +932,7 @@ static int transfer_objects(struct array_cache *to,
1003#define drain_alien_cache(cachep, alien) do { } while (0) 932#define drain_alien_cache(cachep, alien) do { } while (0)
1004#define reap_alien(cachep, l3) do { } while (0) 933#define reap_alien(cachep, l3) do { } while (0)
1005 934
1006static inline struct array_cache **alloc_alien_cache(int node, int limit) 935static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1007{ 936{
1008 return (struct array_cache **)BAD_ALIEN_MAGIC; 937 return (struct array_cache **)BAD_ALIEN_MAGIC;
1009} 938}
@@ -1034,7 +963,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1034static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 963static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1035static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 964static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1036 965
1037static struct array_cache **alloc_alien_cache(int node, int limit) 966static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
1038{ 967{
1039 struct array_cache **ac_ptr; 968 struct array_cache **ac_ptr;
1040 int memsize = sizeof(void *) * nr_node_ids; 969 int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +971,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
1042 971
1043 if (limit > 1) 972 if (limit > 1)
1044 limit = 12; 973 limit = 12;
1045 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 974 ac_ptr = kmalloc_node(memsize, gfp, node);
1046 if (ac_ptr) { 975 if (ac_ptr) {
1047 for_each_node(i) { 976 for_each_node(i) {
1048 if (i == node || !node_online(i)) { 977 if (i == node || !node_online(i)) {
1049 ac_ptr[i] = NULL; 978 ac_ptr[i] = NULL;
1050 continue; 979 continue;
1051 } 980 }
1052 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 981 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
1053 if (!ac_ptr[i]) { 982 if (!ac_ptr[i]) {
1054 for (i--; i >= 0; i--) 983 for (i--; i >= 0; i--)
1055 kfree(ac_ptr[i]); 984 kfree(ac_ptr[i]);
@@ -1282,20 +1211,20 @@ static int __cpuinit cpuup_prepare(long cpu)
1282 struct array_cache **alien = NULL; 1211 struct array_cache **alien = NULL;
1283 1212
1284 nc = alloc_arraycache(node, cachep->limit, 1213 nc = alloc_arraycache(node, cachep->limit,
1285 cachep->batchcount); 1214 cachep->batchcount, GFP_KERNEL);
1286 if (!nc) 1215 if (!nc)
1287 goto bad; 1216 goto bad;
1288 if (cachep->shared) { 1217 if (cachep->shared) {
1289 shared = alloc_arraycache(node, 1218 shared = alloc_arraycache(node,
1290 cachep->shared * cachep->batchcount, 1219 cachep->shared * cachep->batchcount,
1291 0xbaadf00d); 1220 0xbaadf00d, GFP_KERNEL);
1292 if (!shared) { 1221 if (!shared) {
1293 kfree(nc); 1222 kfree(nc);
1294 goto bad; 1223 goto bad;
1295 } 1224 }
1296 } 1225 }
1297 if (use_alien_caches) { 1226 if (use_alien_caches) {
1298 alien = alloc_alien_cache(node, cachep->limit); 1227 alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
1299 if (!alien) { 1228 if (!alien) {
1300 kfree(shared); 1229 kfree(shared);
1301 kfree(nc); 1230 kfree(nc);
@@ -1399,10 +1328,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1399{ 1328{
1400 struct kmem_list3 *ptr; 1329 struct kmem_list3 *ptr;
1401 1330
1402 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1331 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
1403 BUG_ON(!ptr); 1332 BUG_ON(!ptr);
1404 1333
1405 local_irq_disable();
1406 memcpy(ptr, list, sizeof(struct kmem_list3)); 1334 memcpy(ptr, list, sizeof(struct kmem_list3));
1407 /* 1335 /*
1408 * Do not assume that spinlocks can be initialized via memcpy: 1336 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1339,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1411 1339
1412 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1340 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1413 cachep->nodelists[nodeid] = ptr; 1341 cachep->nodelists[nodeid] = ptr;
1414 local_irq_enable();
1415} 1342}
1416 1343
1417/* 1344/*
@@ -1443,10 +1370,8 @@ void __init kmem_cache_init(void)
1443 int order; 1370 int order;
1444 int node; 1371 int node;
1445 1372
1446 if (num_possible_nodes() == 1) { 1373 if (num_possible_nodes() == 1)
1447 use_alien_caches = 0; 1374 use_alien_caches = 0;
1448 numa_platform = 0;
1449 }
1450 1375
1451 for (i = 0; i < NUM_INIT_LISTS; i++) { 1376 for (i = 0; i < NUM_INIT_LISTS; i++) {
1452 kmem_list3_init(&initkmem_list3[i]); 1377 kmem_list3_init(&initkmem_list3[i]);
@@ -1575,9 +1500,8 @@ void __init kmem_cache_init(void)
1575 { 1500 {
1576 struct array_cache *ptr; 1501 struct array_cache *ptr;
1577 1502
1578 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1503 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1579 1504
1580 local_irq_disable();
1581 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1505 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1582 memcpy(ptr, cpu_cache_get(&cache_cache), 1506 memcpy(ptr, cpu_cache_get(&cache_cache),
1583 sizeof(struct arraycache_init)); 1507 sizeof(struct arraycache_init));
@@ -1587,11 +1511,9 @@ void __init kmem_cache_init(void)
1587 spin_lock_init(&ptr->lock); 1511 spin_lock_init(&ptr->lock);
1588 1512
1589 cache_cache.array[smp_processor_id()] = ptr; 1513 cache_cache.array[smp_processor_id()] = ptr;
1590 local_irq_enable();
1591 1514
1592 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1515 ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
1593 1516
1594 local_irq_disable();
1595 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1517 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1596 != &initarray_generic.cache); 1518 != &initarray_generic.cache);
1597 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1519 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1525,6 @@ void __init kmem_cache_init(void)
1603 1525
1604 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1526 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1605 ptr; 1527 ptr;
1606 local_irq_enable();
1607 } 1528 }
1608 /* 5) Replace the bootstrap kmem_list3's */ 1529 /* 5) Replace the bootstrap kmem_list3's */
1609 { 1530 {
@@ -1622,19 +1543,22 @@ void __init kmem_cache_init(void)
1622 } 1543 }
1623 } 1544 }
1624 1545
1625 /* 6) resize the head arrays to their final sizes */ 1546 g_cpucache_up = EARLY;
1626 {
1627 struct kmem_cache *cachep;
1628 mutex_lock(&cache_chain_mutex);
1629 list_for_each_entry(cachep, &cache_chain, next)
1630 if (enable_cpucache(cachep))
1631 BUG();
1632 mutex_unlock(&cache_chain_mutex);
1633 }
1634 1547
1635 /* Annotate slab for lockdep -- annotate the malloc caches */ 1548 /* Annotate slab for lockdep -- annotate the malloc caches */
1636 init_lock_keys(); 1549 init_lock_keys();
1550}
1551
1552void __init kmem_cache_init_late(void)
1553{
1554 struct kmem_cache *cachep;
1637 1555
1556 /* 6) resize the head arrays to their final sizes */
1557 mutex_lock(&cache_chain_mutex);
1558 list_for_each_entry(cachep, &cache_chain, next)
1559 if (enable_cpucache(cachep, GFP_NOWAIT))
1560 BUG();
1561 mutex_unlock(&cache_chain_mutex);
1638 1562
1639 /* Done! */ 1563 /* Done! */
1640 g_cpucache_up = FULL; 1564 g_cpucache_up = FULL;
@@ -1689,7 +1613,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1689 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1613 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1690 flags |= __GFP_RECLAIMABLE; 1614 flags |= __GFP_RECLAIMABLE;
1691 1615
1692 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1616 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1693 if (!page) 1617 if (!page)
1694 return NULL; 1618 return NULL;
1695 1619
@@ -1702,6 +1626,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1702 NR_SLAB_UNRECLAIMABLE, nr_pages); 1626 NR_SLAB_UNRECLAIMABLE, nr_pages);
1703 for (i = 0; i < nr_pages; i++) 1627 for (i = 0; i < nr_pages; i++)
1704 __SetPageSlab(page + i); 1628 __SetPageSlab(page + i);
1629
1630 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1631 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1632
1633 if (cachep->ctor)
1634 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1635 else
1636 kmemcheck_mark_unallocated_pages(page, nr_pages);
1637 }
1638
1705 return page_address(page); 1639 return page_address(page);
1706} 1640}
1707 1641
@@ -1714,6 +1648,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1714 struct page *page = virt_to_page(addr); 1648 struct page *page = virt_to_page(addr);
1715 const unsigned long nr_freed = i; 1649 const unsigned long nr_freed = i;
1716 1650
1651 kmemcheck_free_shadow(page, cachep->gfporder);
1652
1717 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1653 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1718 sub_zone_page_state(page_zone(page), 1654 sub_zone_page_state(page_zone(page),
1719 NR_SLAB_RECLAIMABLE, nr_freed); 1655 NR_SLAB_RECLAIMABLE, nr_freed);
@@ -2064,10 +2000,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2064 return left_over; 2000 return left_over;
2065} 2001}
2066 2002
2067static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) 2003static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2068{ 2004{
2069 if (g_cpucache_up == FULL) 2005 if (g_cpucache_up == FULL)
2070 return enable_cpucache(cachep); 2006 return enable_cpucache(cachep, gfp);
2071 2007
2072 if (g_cpucache_up == NONE) { 2008 if (g_cpucache_up == NONE) {
2073 /* 2009 /*
@@ -2089,7 +2025,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2089 g_cpucache_up = PARTIAL_AC; 2025 g_cpucache_up = PARTIAL_AC;
2090 } else { 2026 } else {
2091 cachep->array[smp_processor_id()] = 2027 cachep->array[smp_processor_id()] =
2092 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 2028 kmalloc(sizeof(struct arraycache_init), gfp);
2093 2029
2094 if (g_cpucache_up == PARTIAL_AC) { 2030 if (g_cpucache_up == PARTIAL_AC) {
2095 set_up_list3s(cachep, SIZE_L3); 2031 set_up_list3s(cachep, SIZE_L3);
@@ -2099,7 +2035,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2099 for_each_online_node(node) { 2035 for_each_online_node(node) {
2100 cachep->nodelists[node] = 2036 cachep->nodelists[node] =
2101 kmalloc_node(sizeof(struct kmem_list3), 2037 kmalloc_node(sizeof(struct kmem_list3),
2102 GFP_KERNEL, node); 2038 gfp, node);
2103 BUG_ON(!cachep->nodelists[node]); 2039 BUG_ON(!cachep->nodelists[node]);
2104 kmem_list3_init(cachep->nodelists[node]); 2040 kmem_list3_init(cachep->nodelists[node]);
2105 } 2041 }
@@ -2153,6 +2089,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2153{ 2089{
2154 size_t left_over, slab_size, ralign; 2090 size_t left_over, slab_size, ralign;
2155 struct kmem_cache *cachep = NULL, *pc; 2091 struct kmem_cache *cachep = NULL, *pc;
2092 gfp_t gfp;
2156 2093
2157 /* 2094 /*
2158 * Sanity checks... these are all serious usage bugs. 2095 * Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2105,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2168 * We use cache_chain_mutex to ensure a consistent view of 2105 * We use cache_chain_mutex to ensure a consistent view of
2169 * cpu_online_mask as well. Please see cpuup_callback 2106 * cpu_online_mask as well. Please see cpuup_callback
2170 */ 2107 */
2171 get_online_cpus(); 2108 if (slab_is_available()) {
2172 mutex_lock(&cache_chain_mutex); 2109 get_online_cpus();
2110 mutex_lock(&cache_chain_mutex);
2111 }
2173 2112
2174 list_for_each_entry(pc, &cache_chain, next) { 2113 list_for_each_entry(pc, &cache_chain, next) {
2175 char tmp; 2114 char tmp;
@@ -2278,8 +2217,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2278 */ 2217 */
2279 align = ralign; 2218 align = ralign;
2280 2219
2220 if (slab_is_available())
2221 gfp = GFP_KERNEL;
2222 else
2223 gfp = GFP_NOWAIT;
2224
2281 /* Get cache's description obj. */ 2225 /* Get cache's description obj. */
2282 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); 2226 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2283 if (!cachep) 2227 if (!cachep)
2284 goto oops; 2228 goto oops;
2285 2229
@@ -2353,6 +2297,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2353 /* really off slab. No need for manual alignment */ 2297 /* really off slab. No need for manual alignment */
2354 slab_size = 2298 slab_size =
2355 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2299 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2300
2301#ifdef CONFIG_PAGE_POISONING
2302 /* If we're going to use the generic kernel_map_pages()
2303 * poisoning, then it's going to smash the contents of
2304 * the redzone and userword anyhow, so switch them off.
2305 */
2306 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2307 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2308#endif
2356 } 2309 }
2357 2310
2358 cachep->colour_off = cache_line_size(); 2311 cachep->colour_off = cache_line_size();
@@ -2382,7 +2335,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2382 cachep->ctor = ctor; 2335 cachep->ctor = ctor;
2383 cachep->name = name; 2336 cachep->name = name;
2384 2337
2385 if (setup_cpu_cache(cachep)) { 2338 if (setup_cpu_cache(cachep, gfp)) {
2386 __kmem_cache_destroy(cachep); 2339 __kmem_cache_destroy(cachep);
2387 cachep = NULL; 2340 cachep = NULL;
2388 goto oops; 2341 goto oops;
@@ -2394,8 +2347,10 @@ oops:
2394 if (!cachep && (flags & SLAB_PANIC)) 2347 if (!cachep && (flags & SLAB_PANIC))
2395 panic("kmem_cache_create(): failed to create slab `%s'\n", 2348 panic("kmem_cache_create(): failed to create slab `%s'\n",
2396 name); 2349 name);
2397 mutex_unlock(&cache_chain_mutex); 2350 if (slab_is_available()) {
2398 put_online_cpus(); 2351 mutex_unlock(&cache_chain_mutex);
2352 put_online_cpus();
2353 }
2399 return cachep; 2354 return cachep;
2400} 2355}
2401EXPORT_SYMBOL(kmem_cache_create); 2356EXPORT_SYMBOL(kmem_cache_create);
@@ -2621,6 +2576,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2621 /* Slab management obj is off-slab. */ 2576 /* Slab management obj is off-slab. */
2622 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2577 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2623 local_flags, nodeid); 2578 local_flags, nodeid);
2579 /*
2580 * If the first object in the slab is leaked (it's allocated
2581 * but no one has a reference to it), we want to make sure
2582 * kmemleak does not treat the ->s_mem pointer as a reference
2583 * to the object. Otherwise we will not report the leak.
2584 */
2585 kmemleak_scan_area(slabp, offsetof(struct slab, list),
2586 sizeof(struct list_head), local_flags);
2624 if (!slabp) 2587 if (!slabp)
2625 return NULL; 2588 return NULL;
2626 } else { 2589 } else {
@@ -3141,6 +3104,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3141 STATS_INC_ALLOCMISS(cachep); 3104 STATS_INC_ALLOCMISS(cachep);
3142 objp = cache_alloc_refill(cachep, flags); 3105 objp = cache_alloc_refill(cachep, flags);
3143 } 3106 }
3107 /*
3108 * To avoid a false negative, if an object that is in one of the
3109 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3110 * treat the array pointers as a reference to the object.
3111 */
3112 kmemleak_erase(&ac->entry[ac->avail]);
3144 return objp; 3113 return objp;
3145} 3114}
3146 3115
@@ -3219,7 +3188,7 @@ retry:
3219 if (local_flags & __GFP_WAIT) 3188 if (local_flags & __GFP_WAIT)
3220 local_irq_enable(); 3189 local_irq_enable();
3221 kmem_flagcheck(cache, flags); 3190 kmem_flagcheck(cache, flags);
3222 obj = kmem_getpages(cache, local_flags, -1); 3191 obj = kmem_getpages(cache, local_flags, numa_node_id());
3223 if (local_flags & __GFP_WAIT) 3192 if (local_flags & __GFP_WAIT)
3224 local_irq_disable(); 3193 local_irq_disable();
3225 if (obj) { 3194 if (obj) {
@@ -3327,6 +3296,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3327 unsigned long save_flags; 3296 unsigned long save_flags;
3328 void *ptr; 3297 void *ptr;
3329 3298
3299 flags &= gfp_allowed_mask;
3300
3330 lockdep_trace_alloc(flags); 3301 lockdep_trace_alloc(flags);
3331 3302
3332 if (slab_should_failslab(cachep, flags)) 3303 if (slab_should_failslab(cachep, flags))
@@ -3360,6 +3331,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3360 out: 3331 out:
3361 local_irq_restore(save_flags); 3332 local_irq_restore(save_flags);
3362 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3333 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3334 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3335 flags);
3336
3337 if (likely(ptr))
3338 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3363 3339
3364 if (unlikely((flags & __GFP_ZERO) && ptr)) 3340 if (unlikely((flags & __GFP_ZERO) && ptr))
3365 memset(ptr, 0, obj_size(cachep)); 3341 memset(ptr, 0, obj_size(cachep));
@@ -3405,6 +3381,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3405 unsigned long save_flags; 3381 unsigned long save_flags;
3406 void *objp; 3382 void *objp;
3407 3383
3384 flags &= gfp_allowed_mask;
3385
3408 lockdep_trace_alloc(flags); 3386 lockdep_trace_alloc(flags);
3409 3387
3410 if (slab_should_failslab(cachep, flags)) 3388 if (slab_should_failslab(cachep, flags))
@@ -3415,8 +3393,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3415 objp = __do_cache_alloc(cachep, flags); 3393 objp = __do_cache_alloc(cachep, flags);
3416 local_irq_restore(save_flags); 3394 local_irq_restore(save_flags);
3417 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3395 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3396 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
3397 flags);
3418 prefetchw(objp); 3398 prefetchw(objp);
3419 3399
3400 if (likely(objp))
3401 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3402
3420 if (unlikely((flags & __GFP_ZERO) && objp)) 3403 if (unlikely((flags & __GFP_ZERO) && objp))
3421 memset(objp, 0, obj_size(cachep)); 3404 memset(objp, 0, obj_size(cachep));
3422 3405
@@ -3530,8 +3513,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3530 struct array_cache *ac = cpu_cache_get(cachep); 3513 struct array_cache *ac = cpu_cache_get(cachep);
3531 3514
3532 check_irq_off(); 3515 check_irq_off();
3516 kmemleak_free_recursive(objp, cachep->flags);
3533 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3517 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3534 3518
3519 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3520
3535 /* 3521 /*
3536 * Skip calling cache_free_alien() when the platform is not numa. 3522 * Skip calling cache_free_alien() when the platform is not numa.
3537 * This will avoid cache misses that happen while accessing slabp (which 3523 * This will avoid cache misses that happen while accessing slabp (which
@@ -3539,7 +3525,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3539 * variable to skip the call, which is mostly likely to be present in 3525 * variable to skip the call, which is mostly likely to be present in
3540 * the cache. 3526 * the cache.
3541 */ 3527 */
3542 if (numa_platform && cache_free_alien(cachep, objp)) 3528 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3543 return; 3529 return;
3544 3530
3545 if (likely(ac->avail < ac->limit)) { 3531 if (likely(ac->avail < ac->limit)) {
@@ -3802,7 +3788,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3802/* 3788/*
3803 * This initializes kmem_list3 or resizes various caches for all nodes. 3789 * This initializes kmem_list3 or resizes various caches for all nodes.
3804 */ 3790 */
3805static int alloc_kmemlist(struct kmem_cache *cachep) 3791static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
3806{ 3792{
3807 int node; 3793 int node;
3808 struct kmem_list3 *l3; 3794 struct kmem_list3 *l3;
@@ -3812,7 +3798,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3812 for_each_online_node(node) { 3798 for_each_online_node(node) {
3813 3799
3814 if (use_alien_caches) { 3800 if (use_alien_caches) {
3815 new_alien = alloc_alien_cache(node, cachep->limit); 3801 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3816 if (!new_alien) 3802 if (!new_alien)
3817 goto fail; 3803 goto fail;
3818 } 3804 }
@@ -3821,7 +3807,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3821 if (cachep->shared) { 3807 if (cachep->shared) {
3822 new_shared = alloc_arraycache(node, 3808 new_shared = alloc_arraycache(node,
3823 cachep->shared*cachep->batchcount, 3809 cachep->shared*cachep->batchcount,
3824 0xbaadf00d); 3810 0xbaadf00d, gfp);
3825 if (!new_shared) { 3811 if (!new_shared) {
3826 free_alien_cache(new_alien); 3812 free_alien_cache(new_alien);
3827 goto fail; 3813 goto fail;
@@ -3850,7 +3836,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3850 free_alien_cache(new_alien); 3836 free_alien_cache(new_alien);
3851 continue; 3837 continue;
3852 } 3838 }
3853 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3839 l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
3854 if (!l3) { 3840 if (!l3) {
3855 free_alien_cache(new_alien); 3841 free_alien_cache(new_alien);
3856 kfree(new_shared); 3842 kfree(new_shared);
@@ -3906,18 +3892,18 @@ static void do_ccupdate_local(void *info)
3906 3892
3907/* Always called with the cache_chain_mutex held */ 3893/* Always called with the cache_chain_mutex held */
3908static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3894static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3909 int batchcount, int shared) 3895 int batchcount, int shared, gfp_t gfp)
3910{ 3896{
3911 struct ccupdate_struct *new; 3897 struct ccupdate_struct *new;
3912 int i; 3898 int i;
3913 3899
3914 new = kzalloc(sizeof(*new), GFP_KERNEL); 3900 new = kzalloc(sizeof(*new), gfp);
3915 if (!new) 3901 if (!new)
3916 return -ENOMEM; 3902 return -ENOMEM;
3917 3903
3918 for_each_online_cpu(i) { 3904 for_each_online_cpu(i) {
3919 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3905 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3920 batchcount); 3906 batchcount, gfp);
3921 if (!new->new[i]) { 3907 if (!new->new[i]) {
3922 for (i--; i >= 0; i--) 3908 for (i--; i >= 0; i--)
3923 kfree(new->new[i]); 3909 kfree(new->new[i]);
@@ -3944,11 +3930,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3944 kfree(ccold); 3930 kfree(ccold);
3945 } 3931 }
3946 kfree(new); 3932 kfree(new);
3947 return alloc_kmemlist(cachep); 3933 return alloc_kmemlist(cachep, gfp);
3948} 3934}
3949 3935
3950/* Called with cache_chain_mutex held always */ 3936/* Called with cache_chain_mutex held always */
3951static int enable_cpucache(struct kmem_cache *cachep) 3937static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3952{ 3938{
3953 int err; 3939 int err;
3954 int limit, shared; 3940 int limit, shared;
@@ -3994,7 +3980,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
3994 if (limit > 32) 3980 if (limit > 32)
3995 limit = 32; 3981 limit = 32;
3996#endif 3982#endif
3997 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3983 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
3998 if (err) 3984 if (err)
3999 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3985 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
4000 cachep->name, -err); 3986 cachep->name, -err);
@@ -4300,7 +4286,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4300 res = 0; 4286 res = 0;
4301 } else { 4287 } else {
4302 res = do_tune_cpucache(cachep, limit, 4288 res = do_tune_cpucache(cachep, limit,
4303 batchcount, shared); 4289 batchcount, shared,
4290 GFP_KERNEL);
4304 } 4291 }
4305 break; 4292 break;
4306 } 4293 }
diff --git a/mm/slob.c b/mm/slob.c
index f92e66d558bd..c78742defdc6 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -46,7 +46,7 @@
46 * NUMA support in SLOB is fairly simplistic, pushing most of the real 46 * NUMA support in SLOB is fairly simplistic, pushing most of the real
47 * logic down to the page allocator, and simply doing the node accounting 47 * logic down to the page allocator, and simply doing the node accounting
48 * on the upper levels. In the event that a node id is explicitly 48 * on the upper levels. In the event that a node id is explicitly
49 * provided, alloc_pages_node() with the specified node id is used 49 * provided, alloc_pages_exact_node() with the specified node id is used
50 * instead. The common case (or when the node id isn't explicitly provided) 50 * instead. The common case (or when the node id isn't explicitly provided)
51 * will default to the current node, as per numa_node_id(). 51 * will default to the current node, as per numa_node_id().
52 * 52 *
@@ -66,7 +66,8 @@
66#include <linux/module.h> 66#include <linux/module.h>
67#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
68#include <linux/list.h> 68#include <linux/list.h>
69#include <trace/kmemtrace.h> 69#include <linux/kmemtrace.h>
70#include <linux/kmemleak.h>
70#include <asm/atomic.h> 71#include <asm/atomic.h>
71 72
72/* 73/*
@@ -132,17 +133,17 @@ static LIST_HEAD(free_slob_large);
132 */ 133 */
133static inline int is_slob_page(struct slob_page *sp) 134static inline int is_slob_page(struct slob_page *sp)
134{ 135{
135 return PageSlobPage((struct page *)sp); 136 return PageSlab((struct page *)sp);
136} 137}
137 138
138static inline void set_slob_page(struct slob_page *sp) 139static inline void set_slob_page(struct slob_page *sp)
139{ 140{
140 __SetPageSlobPage((struct page *)sp); 141 __SetPageSlab((struct page *)sp);
141} 142}
142 143
143static inline void clear_slob_page(struct slob_page *sp) 144static inline void clear_slob_page(struct slob_page *sp)
144{ 145{
145 __ClearPageSlobPage((struct page *)sp); 146 __ClearPageSlab((struct page *)sp);
146} 147}
147 148
148static inline struct slob_page *slob_page(const void *addr) 149static inline struct slob_page *slob_page(const void *addr)
@@ -243,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
243 244
244#ifdef CONFIG_NUMA 245#ifdef CONFIG_NUMA
245 if (node != -1) 246 if (node != -1)
246 page = alloc_pages_node(node, gfp, order); 247 page = alloc_pages_exact_node(node, gfp, order);
247 else 248 else
248#endif 249#endif
249 page = alloc_pages(gfp, order); 250 page = alloc_pages(gfp, order);
@@ -509,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
509 size, PAGE_SIZE << order, gfp, node); 510 size, PAGE_SIZE << order, gfp, node);
510 } 511 }
511 512
513 kmemleak_alloc(ret, size, 1, gfp);
512 return ret; 514 return ret;
513} 515}
514EXPORT_SYMBOL(__kmalloc_node); 516EXPORT_SYMBOL(__kmalloc_node);
@@ -521,6 +523,7 @@ void kfree(const void *block)
521 523
522 if (unlikely(ZERO_OR_NULL_PTR(block))) 524 if (unlikely(ZERO_OR_NULL_PTR(block)))
523 return; 525 return;
526 kmemleak_free(block);
524 527
525 sp = slob_page(block); 528 sp = slob_page(block);
526 if (is_slob_page(sp)) { 529 if (is_slob_page(sp)) {
@@ -584,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
584 } else if (flags & SLAB_PANIC) 587 } else if (flags & SLAB_PANIC)
585 panic("Cannot create slab cache %s\n", name); 588 panic("Cannot create slab cache %s\n", name);
586 589
590 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
587 return c; 591 return c;
588} 592}
589EXPORT_SYMBOL(kmem_cache_create); 593EXPORT_SYMBOL(kmem_cache_create);
590 594
591void kmem_cache_destroy(struct kmem_cache *c) 595void kmem_cache_destroy(struct kmem_cache *c)
592{ 596{
597 kmemleak_free(c);
593 slob_free(c, sizeof(struct kmem_cache)); 598 slob_free(c, sizeof(struct kmem_cache));
594} 599}
595EXPORT_SYMBOL(kmem_cache_destroy); 600EXPORT_SYMBOL(kmem_cache_destroy);
@@ -613,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
613 if (c->ctor) 618 if (c->ctor)
614 c->ctor(b); 619 c->ctor(b);
615 620
621 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
616 return b; 622 return b;
617} 623}
618EXPORT_SYMBOL(kmem_cache_alloc_node); 624EXPORT_SYMBOL(kmem_cache_alloc_node);
@@ -635,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head)
635 641
636void kmem_cache_free(struct kmem_cache *c, void *b) 642void kmem_cache_free(struct kmem_cache *c, void *b)
637{ 643{
644 kmemleak_free_recursive(b, c->flags);
638 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 645 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
639 struct slob_rcu *slob_rcu; 646 struct slob_rcu *slob_rcu;
640 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 647 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
diff --git a/mm/slub.c b/mm/slub.c
index 65ffda5934b0..819f056b39c6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,9 +17,11 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <trace/kmemtrace.h> 20#include <linux/kmemtrace.h>
21#include <linux/kmemcheck.h>
21#include <linux/cpu.h> 22#include <linux/cpu.h>
22#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/kmemleak.h>
23#include <linux/mempolicy.h> 25#include <linux/mempolicy.h>
24#include <linux/ctype.h> 26#include <linux/ctype.h>
25#include <linux/debugobjects.h> 27#include <linux/debugobjects.h>
@@ -143,10 +145,10 @@
143 * Set of flags that will prevent slab merging 145 * Set of flags that will prevent slab merging
144 */ 146 */
145#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 147#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
146 SLAB_TRACE | SLAB_DESTROY_BY_RCU) 148 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
147 149
148#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 150#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
149 SLAB_CACHE_DMA) 151 SLAB_CACHE_DMA | SLAB_NOTRACK)
150 152
151#ifndef ARCH_KMALLOC_MINALIGN 153#ifndef ARCH_KMALLOC_MINALIGN
152#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 154#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
@@ -832,6 +834,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node)
832 return atomic_long_read(&n->nr_slabs); 834 return atomic_long_read(&n->nr_slabs);
833} 835}
834 836
837static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
838{
839 return atomic_long_read(&n->nr_slabs);
840}
841
835static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 842static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
836{ 843{
837 struct kmem_cache_node *n = get_node(s, node); 844 struct kmem_cache_node *n = get_node(s, node);
@@ -1050,6 +1057,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1050 1057
1051static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1058static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1052 { return 0; } 1059 { return 0; }
1060static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1061 { return 0; }
1053static inline void inc_slabs_node(struct kmem_cache *s, int node, 1062static inline void inc_slabs_node(struct kmem_cache *s, int node,
1054 int objects) {} 1063 int objects) {}
1055static inline void dec_slabs_node(struct kmem_cache *s, int node, 1064static inline void dec_slabs_node(struct kmem_cache *s, int node,
@@ -1064,6 +1073,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1064{ 1073{
1065 int order = oo_order(oo); 1074 int order = oo_order(oo);
1066 1075
1076 flags |= __GFP_NOTRACK;
1077
1067 if (node == -1) 1078 if (node == -1)
1068 return alloc_pages(flags, order); 1079 return alloc_pages(flags, order);
1069 else 1080 else
@@ -1074,11 +1085,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1074{ 1085{
1075 struct page *page; 1086 struct page *page;
1076 struct kmem_cache_order_objects oo = s->oo; 1087 struct kmem_cache_order_objects oo = s->oo;
1088 gfp_t alloc_gfp;
1077 1089
1078 flags |= s->allocflags; 1090 flags |= s->allocflags;
1079 1091
1080 page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, 1092 /*
1081 oo); 1093 * Let the initial higher-order allocation fail under memory pressure
1094 * so we fall-back to the minimum order allocation.
1095 */
1096 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1097
1098 page = alloc_slab_page(alloc_gfp, node, oo);
1082 if (unlikely(!page)) { 1099 if (unlikely(!page)) {
1083 oo = s->min; 1100 oo = s->min;
1084 /* 1101 /*
@@ -1091,6 +1108,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1091 1108
1092 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1109 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1093 } 1110 }
1111
1112 if (kmemcheck_enabled
1113 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
1114 {
1115 int pages = 1 << oo_order(oo);
1116
1117 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1118
1119 /*
1120 * Objects from caches that have a constructor don't get
1121 * cleared when they're allocated, so we need to do it here.
1122 */
1123 if (s->ctor)
1124 kmemcheck_mark_uninitialized_pages(page, pages);
1125 else
1126 kmemcheck_mark_unallocated_pages(page, pages);
1127 }
1128
1094 page->objects = oo_objects(oo); 1129 page->objects = oo_objects(oo);
1095 mod_zone_page_state(page_zone(page), 1130 mod_zone_page_state(page_zone(page),
1096 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1131 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1164,6 +1199,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1164 __ClearPageSlubDebug(page); 1199 __ClearPageSlubDebug(page);
1165 } 1200 }
1166 1201
1202 kmemcheck_free_shadow(page, compound_order(page));
1203
1167 mod_zone_page_state(page_zone(page), 1204 mod_zone_page_state(page_zone(page),
1168 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1205 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1169 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1206 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -1484,6 +1521,65 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
1484 return 1; 1521 return 1;
1485} 1522}
1486 1523
1524static int count_free(struct page *page)
1525{
1526 return page->objects - page->inuse;
1527}
1528
1529static unsigned long count_partial(struct kmem_cache_node *n,
1530 int (*get_count)(struct page *))
1531{
1532 unsigned long flags;
1533 unsigned long x = 0;
1534 struct page *page;
1535
1536 spin_lock_irqsave(&n->list_lock, flags);
1537 list_for_each_entry(page, &n->partial, lru)
1538 x += get_count(page);
1539 spin_unlock_irqrestore(&n->list_lock, flags);
1540 return x;
1541}
1542
1543static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1544{
1545#ifdef CONFIG_SLUB_DEBUG
1546 return atomic_long_read(&n->total_objects);
1547#else
1548 return 0;
1549#endif
1550}
1551
1552static noinline void
1553slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1554{
1555 int node;
1556
1557 printk(KERN_WARNING
1558 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1559 nid, gfpflags);
1560 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
1561 "default order: %d, min order: %d\n", s->name, s->objsize,
1562 s->size, oo_order(s->oo), oo_order(s->min));
1563
1564 for_each_online_node(node) {
1565 struct kmem_cache_node *n = get_node(s, node);
1566 unsigned long nr_slabs;
1567 unsigned long nr_objs;
1568 unsigned long nr_free;
1569
1570 if (!n)
1571 continue;
1572
1573 nr_free = count_partial(n, count_free);
1574 nr_slabs = node_nr_slabs(n);
1575 nr_objs = node_nr_objs(n);
1576
1577 printk(KERN_WARNING
1578 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
1579 node, nr_slabs, nr_objs, nr_free);
1580 }
1581}
1582
1487/* 1583/*
1488 * Slow path. The lockless freelist is empty or we need to perform 1584 * Slow path. The lockless freelist is empty or we need to perform
1489 * debugging duties. 1585 * debugging duties.
@@ -1565,6 +1661,8 @@ new_slab:
1565 c->page = new; 1661 c->page = new;
1566 goto load_freelist; 1662 goto load_freelist;
1567 } 1663 }
1664 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1665 slab_out_of_memory(s, gfpflags, node);
1568 return NULL; 1666 return NULL;
1569debug: 1667debug:
1570 if (!alloc_debug_processing(s, c->page, object, addr)) 1668 if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1594,6 +1692,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1594 unsigned long flags; 1692 unsigned long flags;
1595 unsigned int objsize; 1693 unsigned int objsize;
1596 1694
1695 gfpflags &= gfp_allowed_mask;
1696
1597 lockdep_trace_alloc(gfpflags); 1697 lockdep_trace_alloc(gfpflags);
1598 might_sleep_if(gfpflags & __GFP_WAIT); 1698 might_sleep_if(gfpflags & __GFP_WAIT);
1599 1699
@@ -1617,6 +1717,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1617 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1717 if (unlikely((gfpflags & __GFP_ZERO) && object))
1618 memset(object, 0, objsize); 1718 memset(object, 0, objsize);
1619 1719
1720 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
1721 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1722
1620 return object; 1723 return object;
1621} 1724}
1622 1725
@@ -1746,8 +1849,10 @@ static __always_inline void slab_free(struct kmem_cache *s,
1746 struct kmem_cache_cpu *c; 1849 struct kmem_cache_cpu *c;
1747 unsigned long flags; 1850 unsigned long flags;
1748 1851
1852 kmemleak_free_recursive(x, s->flags);
1749 local_irq_save(flags); 1853 local_irq_save(flags);
1750 c = get_cpu_slab(s, smp_processor_id()); 1854 c = get_cpu_slab(s, smp_processor_id());
1855 kmemcheck_slab_free(s, object, c->objsize);
1751 debug_check_no_locks_freed(object, c->objsize); 1856 debug_check_no_locks_freed(object, c->objsize);
1752 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1857 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1753 debug_check_no_obj_freed(object, c->objsize); 1858 debug_check_no_obj_freed(object, c->objsize);
@@ -2557,13 +2662,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
2557 if (gfp_flags & SLUB_DMA) 2662 if (gfp_flags & SLUB_DMA)
2558 flags = SLAB_CACHE_DMA; 2663 flags = SLAB_CACHE_DMA;
2559 2664
2560 down_write(&slub_lock); 2665 /*
2666 * This function is called with IRQs disabled during early-boot on
2667 * single CPU so there's no need to take slub_lock here.
2668 */
2561 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2669 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
2562 flags, NULL)) 2670 flags, NULL))
2563 goto panic; 2671 goto panic;
2564 2672
2565 list_add(&s->list, &slab_caches); 2673 list_add(&s->list, &slab_caches);
2566 up_write(&slub_lock); 2674
2567 if (sysfs_slab_add(s)) 2675 if (sysfs_slab_add(s))
2568 goto panic; 2676 goto panic;
2569 return s; 2677 return s;
@@ -2596,6 +2704,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2596 struct kmem_cache *s; 2704 struct kmem_cache *s;
2597 char *text; 2705 char *text;
2598 size_t realsize; 2706 size_t realsize;
2707 unsigned long slabflags;
2599 2708
2600 s = kmalloc_caches_dma[index]; 2709 s = kmalloc_caches_dma[index];
2601 if (s) 2710 if (s)
@@ -2617,9 +2726,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2617 (unsigned int)realsize); 2726 (unsigned int)realsize);
2618 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2727 s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2619 2728
2729 /*
2730 * Must defer sysfs creation to a workqueue because we don't know
2731 * what context we are called from. Before sysfs comes up, we don't
2732 * need to do anything because our sysfs initcall will start by
2733 * adding all existing slabs to sysfs.
2734 */
2735 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2736 if (slab_state >= SYSFS)
2737 slabflags |= __SYSFS_ADD_DEFERRED;
2738
2620 if (!s || !text || !kmem_cache_open(s, flags, text, 2739 if (!s || !text || !kmem_cache_open(s, flags, text,
2621 realsize, ARCH_KMALLOC_MINALIGN, 2740 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2622 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2623 kfree(s); 2741 kfree(s);
2624 kfree(text); 2742 kfree(text);
2625 goto unlock_out; 2743 goto unlock_out;
@@ -2628,7 +2746,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2628 list_add(&s->list, &slab_caches); 2746 list_add(&s->list, &slab_caches);
2629 kmalloc_caches_dma[index] = s; 2747 kmalloc_caches_dma[index] = s;
2630 2748
2631 schedule_work(&sysfs_add_work); 2749 if (slab_state >= SYSFS)
2750 schedule_work(&sysfs_add_work);
2632 2751
2633unlock_out: 2752unlock_out:
2634 up_write(&slub_lock); 2753 up_write(&slub_lock);
@@ -2713,9 +2832,10 @@ EXPORT_SYMBOL(__kmalloc);
2713 2832
2714static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2833static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2715{ 2834{
2716 struct page *page = alloc_pages_node(node, flags | __GFP_COMP, 2835 struct page *page;
2717 get_order(size));
2718 2836
2837 flags |= __GFP_COMP | __GFP_NOTRACK;
2838 page = alloc_pages_node(node, flags, get_order(size));
2719 if (page) 2839 if (page)
2720 return page_address(page); 2840 return page_address(page);
2721 else 2841 else
@@ -3021,7 +3141,7 @@ void __init kmem_cache_init(void)
3021 * kmem_cache_open for slab_state == DOWN. 3141 * kmem_cache_open for slab_state == DOWN.
3022 */ 3142 */
3023 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3143 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
3024 sizeof(struct kmem_cache_node), GFP_KERNEL); 3144 sizeof(struct kmem_cache_node), GFP_NOWAIT);
3025 kmalloc_caches[0].refcount = -1; 3145 kmalloc_caches[0].refcount = -1;
3026 caches++; 3146 caches++;
3027 3147
@@ -3034,16 +3154,16 @@ void __init kmem_cache_init(void)
3034 /* Caches that are not of the two-to-the-power-of size */ 3154 /* Caches that are not of the two-to-the-power-of size */
3035 if (KMALLOC_MIN_SIZE <= 64) { 3155 if (KMALLOC_MIN_SIZE <= 64) {
3036 create_kmalloc_cache(&kmalloc_caches[1], 3156 create_kmalloc_cache(&kmalloc_caches[1],
3037 "kmalloc-96", 96, GFP_KERNEL); 3157 "kmalloc-96", 96, GFP_NOWAIT);
3038 caches++; 3158 caches++;
3039 create_kmalloc_cache(&kmalloc_caches[2], 3159 create_kmalloc_cache(&kmalloc_caches[2],
3040 "kmalloc-192", 192, GFP_KERNEL); 3160 "kmalloc-192", 192, GFP_NOWAIT);
3041 caches++; 3161 caches++;
3042 } 3162 }
3043 3163
3044 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3164 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3045 create_kmalloc_cache(&kmalloc_caches[i], 3165 create_kmalloc_cache(&kmalloc_caches[i],
3046 "kmalloc", 1 << i, GFP_KERNEL); 3166 "kmalloc", 1 << i, GFP_NOWAIT);
3047 caches++; 3167 caches++;
3048 } 3168 }
3049 3169
@@ -3080,7 +3200,7 @@ void __init kmem_cache_init(void)
3080 /* Provide the correct kmalloc names now that the caches are up */ 3200 /* Provide the correct kmalloc names now that the caches are up */
3081 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) 3201 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3082 kmalloc_caches[i]. name = 3202 kmalloc_caches[i]. name =
3083 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3203 kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3084 3204
3085#ifdef CONFIG_SMP 3205#ifdef CONFIG_SMP
3086 register_cpu_notifier(&slab_notifier); 3206 register_cpu_notifier(&slab_notifier);
@@ -3098,6 +3218,10 @@ void __init kmem_cache_init(void)
3098 nr_cpu_ids, nr_node_ids); 3218 nr_cpu_ids, nr_node_ids);
3099} 3219}
3100 3220
3221void __init kmem_cache_init_late(void)
3222{
3223}
3224
3101/* 3225/*
3102 * Find a mergeable slab cache 3226 * Find a mergeable slab cache
3103 */ 3227 */
@@ -3318,20 +3442,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3318} 3442}
3319 3443
3320#ifdef CONFIG_SLUB_DEBUG 3444#ifdef CONFIG_SLUB_DEBUG
3321static unsigned long count_partial(struct kmem_cache_node *n,
3322 int (*get_count)(struct page *))
3323{
3324 unsigned long flags;
3325 unsigned long x = 0;
3326 struct page *page;
3327
3328 spin_lock_irqsave(&n->list_lock, flags);
3329 list_for_each_entry(page, &n->partial, lru)
3330 x += get_count(page);
3331 spin_unlock_irqrestore(&n->list_lock, flags);
3332 return x;
3333}
3334
3335static int count_inuse(struct page *page) 3445static int count_inuse(struct page *page)
3336{ 3446{
3337 return page->inuse; 3447 return page->inuse;
@@ -3342,11 +3452,6 @@ static int count_total(struct page *page)
3342 return page->objects; 3452 return page->objects;
3343} 3453}
3344 3454
3345static int count_free(struct page *page)
3346{
3347 return page->objects - page->inuse;
3348}
3349
3350static int validate_slab(struct kmem_cache *s, struct page *page, 3455static int validate_slab(struct kmem_cache *s, struct page *page,
3351 unsigned long *map) 3456 unsigned long *map)
3352{ 3457{
@@ -3715,7 +3820,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3715 to_cpumask(l->cpus)); 3820 to_cpumask(l->cpus));
3716 } 3821 }
3717 3822
3718 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3823 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3719 len < PAGE_SIZE - 60) { 3824 len < PAGE_SIZE - 60) {
3720 len += sprintf(buf + len, " nodes="); 3825 len += sprintf(buf + len, " nodes=");
3721 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3826 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
@@ -4390,6 +4495,8 @@ static char *create_unique_id(struct kmem_cache *s)
4390 *p++ = 'a'; 4495 *p++ = 'a';
4391 if (s->flags & SLAB_DEBUG_FREE) 4496 if (s->flags & SLAB_DEBUG_FREE)
4392 *p++ = 'F'; 4497 *p++ = 'F';
4498 if (!(s->flags & SLAB_NOTRACK))
4499 *p++ = 't';
4393 if (p != name + 1) 4500 if (p != name + 1)
4394 *p++ = '-'; 4501 *p++ = '-';
4395 p += sprintf(p, "%07d", s->size); 4502 p += sprintf(p, "%07d", s->size);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1416e7e9e02d..42cd38eba79f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page)
124/** 124/**
125 * add_to_swap - allocate swap space for a page 125 * add_to_swap - allocate swap space for a page
126 * @page: page we want to move to swap 126 * @page: page we want to move to swap
127 * @gfp_mask: memory allocation flags
128 * 127 *
129 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
@@ -162,11 +161,11 @@ int add_to_swap(struct page *page)
162 return 1; 161 return 1;
163 case -EEXIST: 162 case -EEXIST:
164 /* Raced with "speculative" read_swap_cache_async */ 163 /* Raced with "speculative" read_swap_cache_async */
165 swap_free(entry); 164 swapcache_free(entry, NULL);
166 continue; 165 continue;
167 default: 166 default:
168 /* -ENOMEM radix-tree allocation failure */ 167 /* -ENOMEM radix-tree allocation failure */
169 swap_free(entry); 168 swapcache_free(entry, NULL);
170 return 0; 169 return 0;
171 } 170 }
172 } 171 }
@@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page)
188 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
189 spin_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
190 189
191 mem_cgroup_uncharge_swapcache(page, entry); 190 swapcache_free(entry, page);
192 swap_free(entry);
193 page_cache_release(page); 191 page_cache_release(page);
194} 192}
195 193
@@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
293 /* 291 /*
294 * Swap entry may have been freed since our caller observed it. 292 * Swap entry may have been freed since our caller observed it.
295 */ 293 */
296 if (!swap_duplicate(entry)) 294 err = swapcache_prepare(entry);
295 if (err == -EEXIST) /* seems racy */
296 continue;
297 if (err) /* swp entry is obsolete ? */
297 break; 298 break;
298 299
299 /* 300 /*
@@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
312 * Initiate read into locked page and return. 313 * Initiate read into locked page and return.
313 */ 314 */
314 lru_cache_add_anon(new_page); 315 lru_cache_add_anon(new_page);
315 swap_readpage(NULL, new_page); 316 swap_readpage(new_page);
316 return new_page; 317 return new_page;
317 } 318 }
318 ClearPageSwapBacked(new_page); 319 ClearPageSwapBacked(new_page);
319 __clear_page_locked(new_page); 320 __clear_page_locked(new_page);
320 swap_free(entry); 321 swapcache_free(entry, NULL);
321 } while (err != -ENOMEM); 322 } while (err != -ENOMEM);
322 323
323 if (new_page) 324 if (new_page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..d1ade1a48ee7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54static DEFINE_MUTEX(swapon_mutex); 54static DEFINE_MUTEX(swapon_mutex);
55 55
56/* For reference count accounting in swap_map */
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
82/* returnes 1 if swap entry is freed */
83static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{
86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page;
89 int ret = 0;
90
91 page = find_get_page(&swapper_space, entry.val);
92 if (!page)
93 return 0;
94 /*
95 * This function is called from scan_swap_map() and it's called
96 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
97 * We have to use trylock for avoiding deadlock. This is a special
98 * case and you should use try_to_free_swap() with explicit lock_page()
99 * in usual operations.
100 */
101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page);
103 unlock_page(page);
104 }
105 page_cache_release(page);
106 return ret;
107}
108
56/* 109/*
57 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
58 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
167#define SWAPFILE_CLUSTER 256 220#define SWAPFILE_CLUSTER 256
168#define LATENCY_LIMIT 256 221#define LATENCY_LIMIT 256
169 222
170static inline unsigned long scan_swap_map(struct swap_info_struct *si) 223static inline unsigned long scan_swap_map(struct swap_info_struct *si,
224 int cache)
171{ 225{
172 unsigned long offset; 226 unsigned long offset;
173 unsigned long scan_base; 227 unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
273 goto no_page; 327 goto no_page;
274 if (offset > si->highest_bit) 328 if (offset > si->highest_bit)
275 scan_base = offset = si->lowest_bit; 329 scan_base = offset = si->lowest_bit;
330
331 /* reuse swap entry of cache-only swap if not busy. */
332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
333 int swap_was_freed;
334 spin_unlock(&swap_lock);
335 swap_was_freed = __try_to_reclaim_swap(si, offset);
336 spin_lock(&swap_lock);
337 /* entry was freed successfully, try to use this again */
338 if (swap_was_freed)
339 goto checks;
340 goto scan; /* check next one */
341 }
342
276 if (si->swap_map[offset]) 343 if (si->swap_map[offset])
277 goto scan; 344 goto scan;
278 345
@@ -285,7 +352,10 @@ checks:
285 si->lowest_bit = si->max; 352 si->lowest_bit = si->max;
286 si->highest_bit = 0; 353 si->highest_bit = 0;
287 } 354 }
288 si->swap_map[offset] = 1; 355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
356 si->swap_map[offset] = encode_swapmap(0, true);
357 else /* at suspend */
358 si->swap_map[offset] = encode_swapmap(1, false);
289 si->cluster_next = offset + 1; 359 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING; 360 si->flags -= SWP_SCANNING;
291 361
@@ -351,6 +421,10 @@ scan:
351 spin_lock(&swap_lock); 421 spin_lock(&swap_lock);
352 goto checks; 422 goto checks;
353 } 423 }
424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
425 spin_lock(&swap_lock);
426 goto checks;
427 }
354 if (unlikely(--latency_ration < 0)) { 428 if (unlikely(--latency_ration < 0)) {
355 cond_resched(); 429 cond_resched();
356 latency_ration = LATENCY_LIMIT; 430 latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
362 spin_lock(&swap_lock); 436 spin_lock(&swap_lock);
363 goto checks; 437 goto checks;
364 } 438 }
439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
440 spin_lock(&swap_lock);
441 goto checks;
442 }
365 if (unlikely(--latency_ration < 0)) { 443 if (unlikely(--latency_ration < 0)) {
366 cond_resched(); 444 cond_resched();
367 latency_ration = LATENCY_LIMIT; 445 latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
401 continue; 479 continue;
402 480
403 swap_list.next = next; 481 swap_list.next = next;
404 offset = scan_swap_map(si); 482 /* This is called for allocating swap entry for cache */
483 offset = scan_swap_map(si, SWAP_CACHE);
405 if (offset) { 484 if (offset) {
406 spin_unlock(&swap_lock); 485 spin_unlock(&swap_lock);
407 return swp_entry(type, offset); 486 return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
415 return (swp_entry_t) {0}; 494 return (swp_entry_t) {0};
416} 495}
417 496
497/* The only caller of this function is now susupend routine */
418swp_entry_t get_swap_page_of_type(int type) 498swp_entry_t get_swap_page_of_type(int type)
419{ 499{
420 struct swap_info_struct *si; 500 struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
424 si = swap_info + type; 504 si = swap_info + type;
425 if (si->flags & SWP_WRITEOK) { 505 if (si->flags & SWP_WRITEOK) {
426 nr_swap_pages--; 506 nr_swap_pages--;
427 offset = scan_swap_map(si); 507 /* This is called for allocating swap entry, not cache */
508 offset = scan_swap_map(si, SWAP_MAP);
428 if (offset) { 509 if (offset) {
429 spin_unlock(&swap_lock); 510 spin_unlock(&swap_lock);
430 return swp_entry(type, offset); 511 return swp_entry(type, offset);
@@ -471,26 +552,40 @@ out:
471 return NULL; 552 return NULL;
472} 553}
473 554
474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) 555static int swap_entry_free(struct swap_info_struct *p,
556 swp_entry_t ent, int cache)
475{ 557{
476 unsigned long offset = swp_offset(ent); 558 unsigned long offset = swp_offset(ent);
477 int count = p->swap_map[offset]; 559 int count = swap_count(p->swap_map[offset]);
478 560 bool has_cache;
479 if (count < SWAP_MAP_MAX) { 561
480 count--; 562 has_cache = swap_has_cache(p->swap_map[offset]);
481 p->swap_map[offset] = count; 563
482 if (!count) { 564 if (cache == SWAP_MAP) { /* dropping usage count of swap */
483 if (offset < p->lowest_bit) 565 if (count < SWAP_MAP_MAX) {
484 p->lowest_bit = offset; 566 count--;
485 if (offset > p->highest_bit) 567 p->swap_map[offset] = encode_swapmap(count, has_cache);
486 p->highest_bit = offset;
487 if (p->prio > swap_info[swap_list.next].prio)
488 swap_list.next = p - swap_info;
489 nr_swap_pages++;
490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
492 } 568 }
569 } else { /* dropping swap cache flag */
570 VM_BUG_ON(!has_cache);
571 p->swap_map[offset] = encode_swapmap(count, false);
572
573 }
574 /* return code. */
575 count = p->swap_map[offset];
576 /* free if no reference */
577 if (!count) {
578 if (offset < p->lowest_bit)
579 p->lowest_bit = offset;
580 if (offset > p->highest_bit)
581 p->highest_bit = offset;
582 if (p->prio > swap_info[swap_list.next].prio)
583 swap_list.next = p - swap_info;
584 nr_swap_pages++;
585 p->inuse_pages--;
493 } 586 }
587 if (!swap_count(count))
588 mem_cgroup_uncharge_swap(ent);
494 return count; 589 return count;
495} 590}
496 591
@@ -504,9 +599,33 @@ void swap_free(swp_entry_t entry)
504 599
505 p = swap_info_get(entry); 600 p = swap_info_get(entry);
506 if (p) { 601 if (p) {
507 swap_entry_free(p, entry); 602 swap_entry_free(p, entry, SWAP_MAP);
603 spin_unlock(&swap_lock);
604 }
605}
606
607/*
608 * Called after dropping swapcache to decrease refcnt to swap entries.
609 */
610void swapcache_free(swp_entry_t entry, struct page *page)
611{
612 struct swap_info_struct *p;
613 int ret;
614
615 p = swap_info_get(entry);
616 if (p) {
617 ret = swap_entry_free(p, entry, SWAP_CACHE);
618 if (page) {
619 bool swapout;
620 if (ret)
621 swapout = true; /* the end of swap out */
622 else
623 swapout = false; /* no more swap users! */
624 mem_cgroup_uncharge_swapcache(page, entry, swapout);
625 }
508 spin_unlock(&swap_lock); 626 spin_unlock(&swap_lock);
509 } 627 }
628 return;
510} 629}
511 630
512/* 631/*
@@ -521,8 +640,7 @@ static inline int page_swapcount(struct page *page)
521 entry.val = page_private(page); 640 entry.val = page_private(page);
522 p = swap_info_get(entry); 641 p = swap_info_get(entry);
523 if (p) { 642 if (p) {
524 /* Subtract the 1 for the swap cache itself */ 643 count = swap_count(p->swap_map[swp_offset(entry)]);
525 count = p->swap_map[swp_offset(entry)] - 1;
526 spin_unlock(&swap_lock); 644 spin_unlock(&swap_lock);
527 } 645 }
528 return count; 646 return count;
@@ -584,7 +702,7 @@ int free_swap_and_cache(swp_entry_t entry)
584 702
585 p = swap_info_get(entry); 703 p = swap_info_get(entry);
586 if (p) { 704 if (p) {
587 if (swap_entry_free(p, entry) == 1) { 705 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
588 page = find_get_page(&swapper_space, entry.val); 706 page = find_get_page(&swapper_space, entry.val);
589 if (page && !trylock_page(page)) { 707 if (page && !trylock_page(page)) {
590 page_cache_release(page); 708 page_cache_release(page);
@@ -891,7 +1009,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
891 i = 1; 1009 i = 1;
892 } 1010 }
893 count = si->swap_map[i]; 1011 count = si->swap_map[i];
894 if (count && count != SWAP_MAP_BAD) 1012 if (count && swap_count(count) != SWAP_MAP_BAD)
895 break; 1013 break;
896 } 1014 }
897 return i; 1015 return i;
@@ -995,13 +1113,13 @@ static int try_to_unuse(unsigned int type)
995 */ 1113 */
996 shmem = 0; 1114 shmem = 0;
997 swcount = *swap_map; 1115 swcount = *swap_map;
998 if (swcount > 1) { 1116 if (swap_count(swcount)) {
999 if (start_mm == &init_mm) 1117 if (start_mm == &init_mm)
1000 shmem = shmem_unuse(entry, page); 1118 shmem = shmem_unuse(entry, page);
1001 else 1119 else
1002 retval = unuse_mm(start_mm, entry, page); 1120 retval = unuse_mm(start_mm, entry, page);
1003 } 1121 }
1004 if (*swap_map > 1) { 1122 if (swap_count(*swap_map)) {
1005 int set_start_mm = (*swap_map >= swcount); 1123 int set_start_mm = (*swap_map >= swcount);
1006 struct list_head *p = &start_mm->mmlist; 1124 struct list_head *p = &start_mm->mmlist;
1007 struct mm_struct *new_start_mm = start_mm; 1125 struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1129,7 @@ static int try_to_unuse(unsigned int type)
1011 atomic_inc(&new_start_mm->mm_users); 1129 atomic_inc(&new_start_mm->mm_users);
1012 atomic_inc(&prev_mm->mm_users); 1130 atomic_inc(&prev_mm->mm_users);
1013 spin_lock(&mmlist_lock); 1131 spin_lock(&mmlist_lock);
1014 while (*swap_map > 1 && !retval && !shmem && 1132 while (swap_count(*swap_map) && !retval && !shmem &&
1015 (p = p->next) != &start_mm->mmlist) { 1133 (p = p->next) != &start_mm->mmlist) {
1016 mm = list_entry(p, struct mm_struct, mmlist); 1134 mm = list_entry(p, struct mm_struct, mmlist);
1017 if (!atomic_inc_not_zero(&mm->mm_users)) 1135 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1141,16 @@ static int try_to_unuse(unsigned int type)
1023 cond_resched(); 1141 cond_resched();
1024 1142
1025 swcount = *swap_map; 1143 swcount = *swap_map;
1026 if (swcount <= 1) 1144 if (!swap_count(swcount)) /* any usage ? */
1027 ; 1145 ;
1028 else if (mm == &init_mm) { 1146 else if (mm == &init_mm) {
1029 set_start_mm = 1; 1147 set_start_mm = 1;
1030 shmem = shmem_unuse(entry, page); 1148 shmem = shmem_unuse(entry, page);
1031 } else 1149 } else
1032 retval = unuse_mm(mm, entry, page); 1150 retval = unuse_mm(mm, entry, page);
1033 if (set_start_mm && *swap_map < swcount) { 1151
1152 if (set_start_mm &&
1153 swap_count(*swap_map) < swcount) {
1034 mmput(new_start_mm); 1154 mmput(new_start_mm);
1035 atomic_inc(&mm->mm_users); 1155 atomic_inc(&mm->mm_users);
1036 new_start_mm = mm; 1156 new_start_mm = mm;
@@ -1057,21 +1177,25 @@ static int try_to_unuse(unsigned int type)
1057 } 1177 }
1058 1178
1059 /* 1179 /*
1060 * How could swap count reach 0x7fff when the maximum 1180 * How could swap count reach 0x7ffe ?
1061 * pid is 0x7fff, and there's no way to repeat a swap 1181 * There's no way to repeat a swap page within an mm
1062 * page within an mm (except in shmem, where it's the 1182 * (except in shmem, where it's the shared object which takes
1063 * shared object which takes the reference count)? 1183 * the reference count)?
1064 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 1184 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
1065 * 1185 * short is too small....)
1066 * If that's wrong, then we should worry more about 1186 * If that's wrong, then we should worry more about
1067 * exit_mmap() and do_munmap() cases described above: 1187 * exit_mmap() and do_munmap() cases described above:
1068 * we might be resetting SWAP_MAP_MAX too early here. 1188 * we might be resetting SWAP_MAP_MAX too early here.
1069 * We know "Undead"s can happen, they're okay, so don't 1189 * We know "Undead"s can happen, they're okay, so don't
1070 * report them; but do report if we reset SWAP_MAP_MAX. 1190 * report them; but do report if we reset SWAP_MAP_MAX.
1071 */ 1191 */
1072 if (*swap_map == SWAP_MAP_MAX) { 1192 /* We might release the lock_page() in unuse_mm(). */
1193 if (!PageSwapCache(page) || page_private(page) != entry.val)
1194 goto retry;
1195
1196 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1073 spin_lock(&swap_lock); 1197 spin_lock(&swap_lock);
1074 *swap_map = 1; 1198 *swap_map = encode_swapmap(0, true);
1075 spin_unlock(&swap_lock); 1199 spin_unlock(&swap_lock);
1076 reset_overflow = 1; 1200 reset_overflow = 1;
1077 } 1201 }
@@ -1089,7 +1213,8 @@ static int try_to_unuse(unsigned int type)
1089 * pages would be incorrect if swap supported "shared 1213 * pages would be incorrect if swap supported "shared
1090 * private" pages, but they are handled by tmpfs files. 1214 * private" pages, but they are handled by tmpfs files.
1091 */ 1215 */
1092 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 1216 if (swap_count(*swap_map) &&
1217 PageDirty(page) && PageSwapCache(page)) {
1093 struct writeback_control wbc = { 1218 struct writeback_control wbc = {
1094 .sync_mode = WB_SYNC_NONE, 1219 .sync_mode = WB_SYNC_NONE,
1095 }; 1220 };
@@ -1116,6 +1241,7 @@ static int try_to_unuse(unsigned int type)
1116 * mark page dirty so shrink_page_list will preserve it. 1241 * mark page dirty so shrink_page_list will preserve it.
1117 */ 1242 */
1118 SetPageDirty(page); 1243 SetPageDirty(page);
1244retry:
1119 unlock_page(page); 1245 unlock_page(page);
1120 page_cache_release(page); 1246 page_cache_release(page);
1121 1247
@@ -1942,15 +2068,23 @@ void si_swapinfo(struct sysinfo *val)
1942 * 2068 *
1943 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2069 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1944 * "permanent", but will be reclaimed by the next swapoff. 2070 * "permanent", but will be reclaimed by the next swapoff.
2071 * Returns error code in following case.
2072 * - success -> 0
2073 * - swp_entry is invalid -> EINVAL
2074 * - swp_entry is migration entry -> EINVAL
2075 * - swap-cache reference is requested but there is already one. -> EEXIST
2076 * - swap-cache reference is requested but the entry is not used. -> ENOENT
1945 */ 2077 */
1946int swap_duplicate(swp_entry_t entry) 2078static int __swap_duplicate(swp_entry_t entry, bool cache)
1947{ 2079{
1948 struct swap_info_struct * p; 2080 struct swap_info_struct * p;
1949 unsigned long offset, type; 2081 unsigned long offset, type;
1950 int result = 0; 2082 int result = -EINVAL;
2083 int count;
2084 bool has_cache;
1951 2085
1952 if (is_migration_entry(entry)) 2086 if (is_migration_entry(entry))
1953 return 1; 2087 return -EINVAL;
1954 2088
1955 type = swp_type(entry); 2089 type = swp_type(entry);
1956 if (type >= nr_swapfiles) 2090 if (type >= nr_swapfiles)
@@ -1959,17 +2093,40 @@ int swap_duplicate(swp_entry_t entry)
1959 offset = swp_offset(entry); 2093 offset = swp_offset(entry);
1960 2094
1961 spin_lock(&swap_lock); 2095 spin_lock(&swap_lock);
1962 if (offset < p->max && p->swap_map[offset]) { 2096
1963 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 2097 if (unlikely(offset >= p->max))
1964 p->swap_map[offset]++; 2098 goto unlock_out;
1965 result = 1; 2099
1966 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 2100 count = swap_count(p->swap_map[offset]);
2101 has_cache = swap_has_cache(p->swap_map[offset]);
2102
2103 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
2104
2105 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2106 if (!has_cache && count) {
2107 p->swap_map[offset] = encode_swapmap(count, true);
2108 result = 0;
2109 } else if (has_cache) /* someone added cache */
2110 result = -EEXIST;
2111 else if (!count) /* no users */
2112 result = -ENOENT;
2113
2114 } else if (count || has_cache) {
2115 if (count < SWAP_MAP_MAX - 1) {
2116 p->swap_map[offset] = encode_swapmap(count + 1,
2117 has_cache);
2118 result = 0;
2119 } else if (count <= SWAP_MAP_MAX) {
1967 if (swap_overflow++ < 5) 2120 if (swap_overflow++ < 5)
1968 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 2121 printk(KERN_WARNING
1969 p->swap_map[offset] = SWAP_MAP_MAX; 2122 "swap_dup: swap entry overflow\n");
1970 result = 1; 2123 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
2124 has_cache);
2125 result = 0;
1971 } 2126 }
1972 } 2127 } else
2128 result = -ENOENT; /* unused swap entry */
2129unlock_out:
1973 spin_unlock(&swap_lock); 2130 spin_unlock(&swap_lock);
1974out: 2131out:
1975 return result; 2132 return result;
@@ -1978,6 +2135,27 @@ bad_file:
1978 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2135 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1979 goto out; 2136 goto out;
1980} 2137}
2138/*
2139 * increase reference count of swap entry by 1.
2140 */
2141void swap_duplicate(swp_entry_t entry)
2142{
2143 __swap_duplicate(entry, SWAP_MAP);
2144}
2145
2146/*
2147 * @entry: swap entry for which we allocate swap cache.
2148 *
2149 * Called when allocating swap cache for exising swap entry,
2150 * This can return error codes. Returns 0 at success.
2151 * -EBUSY means there is a swap cache.
2152 * Note: return code is different from swap_duplicate().
2153 */
2154int swapcache_prepare(swp_entry_t entry)
2155{
2156 return __swap_duplicate(entry, SWAP_CACHE);
2157}
2158
1981 2159
1982struct swap_info_struct * 2160struct swap_info_struct *
1983get_swap_info_struct(unsigned type) 2161get_swap_info_struct(unsigned type)
@@ -2016,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2016 /* Don't read in free or bad pages */ 2194 /* Don't read in free or bad pages */
2017 if (!si->swap_map[toff]) 2195 if (!si->swap_map[toff])
2018 break; 2196 break;
2019 if (si->swap_map[toff] == SWAP_MAP_BAD) 2197 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2020 break; 2198 break;
2021 } 2199 }
2022 /* Count contiguous allocated slots below our target */ 2200 /* Count contiguous allocated slots below our target */
@@ -2024,7 +2202,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2024 /* Don't read in free or bad pages */ 2202 /* Don't read in free or bad pages */
2025 if (!si->swap_map[toff]) 2203 if (!si->swap_map[toff])
2026 break; 2204 break;
2027 if (si->swap_map[toff] == SWAP_MAP_BAD) 2205 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2028 break; 2206 break;
2029 } 2207 }
2030 spin_unlock(&swap_lock); 2208 spin_unlock(&swap_lock);
diff --git a/mm/thrash.c b/mm/thrash.c
index c4c5205a9c35..2372d4ed5dd8 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 26struct mm_struct *swap_token_mm;
27static unsigned int global_faults; 27static unsigned int global_faults;
28 28
29void grab_swap_token(void) 29void grab_swap_token(struct mm_struct *mm)
30{ 30{
31 int current_interval; 31 int current_interval;
32 32
33 global_faults++; 33 global_faults++;
34 34
35 current_interval = global_faults - current->mm->faultstamp; 35 current_interval = global_faults - mm->faultstamp;
36 36
37 if (!spin_trylock(&swap_token_lock)) 37 if (!spin_trylock(&swap_token_lock))
38 return; 38 return;
39 39
40 /* First come first served */ 40 /* First come first served */
41 if (swap_token_mm == NULL) { 41 if (swap_token_mm == NULL) {
42 current->mm->token_priority = current->mm->token_priority + 2; 42 mm->token_priority = mm->token_priority + 2;
43 swap_token_mm = current->mm; 43 swap_token_mm = mm;
44 goto out; 44 goto out;
45 } 45 }
46 46
47 if (current->mm != swap_token_mm) { 47 if (mm != swap_token_mm) {
48 if (current_interval < current->mm->last_interval) 48 if (current_interval < mm->last_interval)
49 current->mm->token_priority++; 49 mm->token_priority++;
50 else { 50 else {
51 if (likely(current->mm->token_priority > 0)) 51 if (likely(mm->token_priority > 0))
52 current->mm->token_priority--; 52 mm->token_priority--;
53 } 53 }
54 /* Check if we deserve the token */ 54 /* Check if we deserve the token */
55 if (current->mm->token_priority > 55 if (mm->token_priority > swap_token_mm->token_priority) {
56 swap_token_mm->token_priority) { 56 mm->token_priority += 2;
57 current->mm->token_priority += 2; 57 swap_token_mm = mm;
58 swap_token_mm = current->mm;
59 } 58 }
60 } else { 59 } else {
61 /* Token holder came in again! */ 60 /* Token holder came in again! */
62 current->mm->token_priority += 2; 61 mm->token_priority += 2;
63 } 62 }
64 63
65out: 64out:
66 current->mm->faultstamp = global_faults; 65 mm->faultstamp = global_faults;
67 current->mm->last_interval = current_interval; 66 mm->last_interval = current_interval;
68 spin_unlock(&swap_token_lock); 67 spin_unlock(&swap_token_lock);
69return;
70} 68}
71 69
72/* Called on process exit. */ 70/* Called on process exit. */
diff --git a/mm/truncate.c b/mm/truncate.c
index 12e1579f9165..ccc3ecf7cb98 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
267} 267}
268EXPORT_SYMBOL(truncate_inode_pages); 268EXPORT_SYMBOL(truncate_inode_pages);
269 269
270unsigned long __invalidate_mapping_pages(struct address_space *mapping, 270/**
271 pgoff_t start, pgoff_t end, bool be_atomic) 271 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
272 * @mapping: the address_space which holds the pages to invalidate
273 * @start: the offset 'from' which to invalidate
274 * @end: the offset 'to' which to invalidate (inclusive)
275 *
276 * This function only removes the unlocked pages, if you want to
277 * remove all the pages of one inode, you must call truncate_inode_pages.
278 *
279 * invalidate_mapping_pages() will not block on IO activity. It will not
280 * invalidate pages which are dirty, locked, under writeback or mapped into
281 * pagetables.
282 */
283unsigned long invalidate_mapping_pages(struct address_space *mapping,
284 pgoff_t start, pgoff_t end)
272{ 285{
273 struct pagevec pvec; 286 struct pagevec pvec;
274 pgoff_t next = start; 287 pgoff_t next = start;
@@ -309,30 +322,10 @@ unlock:
309 break; 322 break;
310 } 323 }
311 pagevec_release(&pvec); 324 pagevec_release(&pvec);
312 if (likely(!be_atomic)) 325 cond_resched();
313 cond_resched();
314 } 326 }
315 return ret; 327 return ret;
316} 328}
317
318/**
319 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
320 * @mapping: the address_space which holds the pages to invalidate
321 * @start: the offset 'from' which to invalidate
322 * @end: the offset 'to' which to invalidate (inclusive)
323 *
324 * This function only removes the unlocked pages, if you want to
325 * remove all the pages of one inode, you must call truncate_inode_pages.
326 *
327 * invalidate_mapping_pages() will not block on IO activity. It will not
328 * invalidate pages which are dirty, locked, under writeback or mapped into
329 * pagetables.
330 */
331unsigned long invalidate_mapping_pages(struct address_space *mapping,
332 pgoff_t start, pgoff_t end)
333{
334 return __invalidate_mapping_pages(mapping, start, end, false);
335}
336EXPORT_SYMBOL(invalidate_mapping_pages); 329EXPORT_SYMBOL(invalidate_mapping_pages);
337 330
338/* 331/*
diff --git a/mm/util.c b/mm/util.c
index 55bef160b9f1..7c35ad95f927 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,9 +4,11 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/tracepoint.h>
8#include <asm/uaccess.h> 7#include <asm/uaccess.h>
9 8
9#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h>
11
10/** 12/**
11 * kstrdup - allocate space for and copy an existing string 13 * kstrdup - allocate space for and copy an existing string
12 * @s: the string to duplicate 14 * @s: the string to duplicate
@@ -166,6 +168,10 @@ EXPORT_SYMBOL(krealloc);
166 * 168 *
167 * The memory of the object @p points to is zeroed before freed. 169 * The memory of the object @p points to is zeroed before freed.
168 * If @p is %NULL, kzfree() does nothing. 170 * If @p is %NULL, kzfree() does nothing.
171 *
172 * Note: this function zeroes the whole allocated buffer which can be a good
173 * deal bigger than the requested buffer size passed to kmalloc(). So be
174 * careful when using this function in performance sensitive code.
169 */ 175 */
170void kzfree(const void *p) 176void kzfree(const void *p)
171{ 177{
@@ -231,13 +237,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
231 * @pages: array that receives pointers to the pages pinned. 237 * @pages: array that receives pointers to the pages pinned.
232 * Should be at least nr_pages long. 238 * Should be at least nr_pages long.
233 * 239 *
234 * Attempt to pin user pages in memory without taking mm->mmap_sem.
235 * If not successful, it will fall back to taking the lock and
236 * calling get_user_pages().
237 *
238 * Returns number of pages pinned. This may be fewer than the number 240 * Returns number of pages pinned. This may be fewer than the number
239 * requested. If nr_pages is 0 or negative, returns 0. If no pages 241 * requested. If nr_pages is 0 or negative, returns 0. If no pages
240 * were pinned, returns -errno. 242 * were pinned, returns -errno.
243 *
244 * get_user_pages_fast provides equivalent functionality to get_user_pages,
245 * operating on current and current->mm, with force=0 and vma=NULL. However
246 * unlike get_user_pages, it must be called without mmap_sem held.
247 *
248 * get_user_pages_fast may take mmap_sem and page table locks, so no
249 * assumptions can be made about lack of locking. get_user_pages_fast is to be
250 * implemented in a way that is advantageous (vs get_user_pages()) when the
251 * user memory area is already faulted in and present in ptes. However if the
252 * pages have to be faulted in, it may turn out to be slightly slower so
253 * callers need to carefully consider what to use. On many architectures,
254 * get_user_pages_fast simply falls back to get_user_pages.
241 */ 255 */
242int __attribute__((weak)) get_user_pages_fast(unsigned long start, 256int __attribute__((weak)) get_user_pages_fast(unsigned long start,
243 int nr_pages, int write, struct page **pages) 257 int nr_pages, int write, struct page **pages)
@@ -255,13 +269,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
255EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
256 270
257/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
258DEFINE_TRACE(kmalloc);
259DEFINE_TRACE(kmem_cache_alloc);
260DEFINE_TRACE(kmalloc_node);
261DEFINE_TRACE(kmem_cache_alloc_node);
262DEFINE_TRACE(kfree);
263DEFINE_TRACE(kmem_cache_free);
264
265EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
266EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
267EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); 274EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716ea38c9..f8189a4b3e13 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,8 +23,8 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
27#include <linux/pfn.h> 26#include <linux/pfn.h>
27#include <linux/kmemleak.h>
28 28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
@@ -1032,7 +1032,7 @@ void __init vmalloc_init(void)
1032 1032
1033 /* Import existing vmlist entries. */ 1033 /* Import existing vmlist entries. */
1034 for (tmp = vmlist; tmp; tmp = tmp->next) { 1034 for (tmp = vmlist; tmp; tmp = tmp->next) {
1035 va = alloc_bootmem(sizeof(struct vmap_area)); 1035 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1036 va->flags = tmp->flags | VM_VM_AREA; 1036 va->flags = tmp->flags | VM_VM_AREA;
1037 va->va_start = (unsigned long)tmp->addr; 1037 va->va_start = (unsigned long)tmp->addr;
1038 va->va_end = va->va_start + tmp->size; 1038 va->va_end = va->va_start + tmp->size;
@@ -1327,6 +1327,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1327void vfree(const void *addr) 1327void vfree(const void *addr)
1328{ 1328{
1329 BUG_ON(in_interrupt()); 1329 BUG_ON(in_interrupt());
1330
1331 kmemleak_free(addr);
1332
1330 __vunmap(addr, 1); 1333 __vunmap(addr, 1);
1331} 1334}
1332EXPORT_SYMBOL(vfree); 1335EXPORT_SYMBOL(vfree);
@@ -1439,8 +1442,17 @@ fail:
1439 1442
1440void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 1443void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1441{ 1444{
1442 return __vmalloc_area_node(area, gfp_mask, prot, -1, 1445 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1443 __builtin_return_address(0)); 1446 __builtin_return_address(0));
1447
1448 /*
1449 * A ref_count = 3 is needed because the vm_struct and vmap_area
1450 * structures allocated in the __get_vm_area_node() function contain
1451 * references to the virtual address of the vmalloc'ed block.
1452 */
1453 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1454
1455 return addr;
1444} 1456}
1445 1457
1446/** 1458/**
@@ -1459,6 +1471,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1459 int node, void *caller) 1471 int node, void *caller)
1460{ 1472{
1461 struct vm_struct *area; 1473 struct vm_struct *area;
1474 void *addr;
1475 unsigned long real_size = size;
1462 1476
1463 size = PAGE_ALIGN(size); 1477 size = PAGE_ALIGN(size);
1464 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1478 if (!size || (size >> PAGE_SHIFT) > num_physpages)
@@ -1470,7 +1484,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1470 if (!area) 1484 if (!area)
1471 return NULL; 1485 return NULL;
1472 1486
1473 return __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1487 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1488
1489 /*
1490 * A ref_count = 3 is needed because the vm_struct and vmap_area
1491 * structures allocated in the __get_vm_area_node() function contain
1492 * references to the virtual address of the vmalloc'ed block.
1493 */
1494 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1495
1496 return addr;
1474} 1497}
1475 1498
1476void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1499void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d254306562cd..54155268dfca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
470 swp_entry_t swap = { .val = page_private(page) }; 470 swp_entry_t swap = { .val = page_private(page) };
471 __delete_from_swap_cache(page); 471 __delete_from_swap_cache(page);
472 spin_unlock_irq(&mapping->tree_lock); 472 spin_unlock_irq(&mapping->tree_lock);
473 mem_cgroup_uncharge_swapcache(page, swap); 473 swapcache_free(swap, page);
474 swap_free(swap);
475 } else { 474 } else {
476 __remove_from_page_cache(page); 475 __remove_from_page_cache(page);
477 spin_unlock_irq(&mapping->tree_lock); 476 spin_unlock_irq(&mapping->tree_lock);
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
514 * 513 *
515 * lru_lock must not be held, interrupts must be enabled. 514 * lru_lock must not be held, interrupts must be enabled.
516 */ 515 */
517#ifdef CONFIG_UNEVICTABLE_LRU
518void putback_lru_page(struct page *page) 516void putback_lru_page(struct page *page)
519{ 517{
520 int lru; 518 int lru;
@@ -568,20 +566,6 @@ redo:
568 put_page(page); /* drop ref from isolate */ 566 put_page(page); /* drop ref from isolate */
569} 567}
570 568
571#else /* CONFIG_UNEVICTABLE_LRU */
572
573void putback_lru_page(struct page *page)
574{
575 int lru;
576 VM_BUG_ON(PageLRU(page));
577
578 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
579 lru_cache_add_lru(page, lru);
580 put_page(page);
581}
582#endif /* CONFIG_UNEVICTABLE_LRU */
583
584
585/* 569/*
586 * shrink_page_list() returns the number of reclaimed pages 570 * shrink_page_list() returns the number of reclaimed pages
587 */ 571 */
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
593 struct pagevec freed_pvec; 577 struct pagevec freed_pvec;
594 int pgactivate = 0; 578 int pgactivate = 0;
595 unsigned long nr_reclaimed = 0; 579 unsigned long nr_reclaimed = 0;
580 unsigned long vm_flags;
596 581
597 cond_resched(); 582 cond_resched();
598 583
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
643 goto keep_locked; 628 goto keep_locked;
644 } 629 }
645 630
646 referenced = page_referenced(page, 1, sc->mem_cgroup); 631 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags);
647 /* In active use or really unfreeable? Activate it. */ 633 /* In active use or really unfreeable? Activate it. */
648 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
649 referenced && page_mapping_inuse(page)) 635 referenced && page_mapping_inuse(page))
@@ -851,7 +837,6 @@ int __isolate_lru_page(struct page *page, int mode, int file)
851 */ 837 */
852 ClearPageLRU(page); 838 ClearPageLRU(page);
853 ret = 0; 839 ret = 0;
854 mem_cgroup_del_lru(page);
855 } 840 }
856 841
857 return ret; 842 return ret;
@@ -899,12 +884,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
899 switch (__isolate_lru_page(page, mode, file)) { 884 switch (__isolate_lru_page(page, mode, file)) {
900 case 0: 885 case 0:
901 list_move(&page->lru, dst); 886 list_move(&page->lru, dst);
887 mem_cgroup_del_lru(page);
902 nr_taken++; 888 nr_taken++;
903 break; 889 break;
904 890
905 case -EBUSY: 891 case -EBUSY:
906 /* else it is being freed elsewhere */ 892 /* else it is being freed elsewhere */
907 list_move(&page->lru, src); 893 list_move(&page->lru, src);
894 mem_cgroup_rotate_lru_list(page, page_lru(page));
908 continue; 895 continue;
909 896
910 default: 897 default:
@@ -943,18 +930,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
943 /* Check that we have not crossed a zone boundary. */ 930 /* Check that we have not crossed a zone boundary. */
944 if (unlikely(page_zone_id(cursor_page) != zone_id)) 931 if (unlikely(page_zone_id(cursor_page) != zone_id))
945 continue; 932 continue;
946 switch (__isolate_lru_page(cursor_page, mode, file)) { 933 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
947 case 0:
948 list_move(&cursor_page->lru, dst); 934 list_move(&cursor_page->lru, dst);
935 mem_cgroup_del_lru(cursor_page);
949 nr_taken++; 936 nr_taken++;
950 scan++; 937 scan++;
951 break;
952
953 case -EBUSY:
954 /* else it is being freed elsewhere */
955 list_move(&cursor_page->lru, src);
956 default:
957 break; /* ! on LRU or wrong list */
958 } 938 }
959 } 939 }
960 } 940 }
@@ -1061,6 +1041,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1061 unsigned long nr_scanned = 0; 1041 unsigned long nr_scanned = 0;
1062 unsigned long nr_reclaimed = 0; 1042 unsigned long nr_reclaimed = 0;
1063 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1043 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1044 int lumpy_reclaim = 0;
1045
1046 /*
1047 * If we need a large contiguous chunk of memory, or have
1048 * trouble getting a small set of contiguous pages, we
1049 * will reclaim both active and inactive pages.
1050 *
1051 * We use the same threshold as pageout congestion_wait below.
1052 */
1053 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1054 lumpy_reclaim = 1;
1055 else if (sc->order && priority < DEF_PRIORITY - 2)
1056 lumpy_reclaim = 1;
1064 1057
1065 pagevec_init(&pvec, 1); 1058 pagevec_init(&pvec, 1);
1066 1059
@@ -1073,19 +1066,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1073 unsigned long nr_freed; 1066 unsigned long nr_freed;
1074 unsigned long nr_active; 1067 unsigned long nr_active;
1075 unsigned int count[NR_LRU_LISTS] = { 0, }; 1068 unsigned int count[NR_LRU_LISTS] = { 0, };
1076 int mode = ISOLATE_INACTIVE; 1069 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1077
1078 /*
1079 * If we need a large contiguous chunk of memory, or have
1080 * trouble getting a small set of contiguous pages, we
1081 * will reclaim both active and inactive pages.
1082 *
1083 * We use the same threshold as pageout congestion_wait below.
1084 */
1085 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1086 mode = ISOLATE_BOTH;
1087 else if (sc->order && priority < DEF_PRIORITY - 2)
1088 mode = ISOLATE_BOTH;
1089 1070
1090 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1071 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1091 &page_list, &nr_scan, sc->order, mode, 1072 &page_list, &nr_scan, sc->order, mode,
@@ -1122,7 +1103,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1122 * but that should be acceptable to the caller 1103 * but that should be acceptable to the caller
1123 */ 1104 */
1124 if (nr_freed < nr_taken && !current_is_kswapd() && 1105 if (nr_freed < nr_taken && !current_is_kswapd() &&
1125 sc->order > PAGE_ALLOC_COSTLY_ORDER) { 1106 lumpy_reclaim) {
1126 congestion_wait(WRITE, HZ/10); 1107 congestion_wait(WRITE, HZ/10);
1127 1108
1128 /* 1109 /*
@@ -1217,18 +1198,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1217 * But we had to alter page->flags anyway. 1198 * But we had to alter page->flags anyway.
1218 */ 1199 */
1219 1200
1201static void move_active_pages_to_lru(struct zone *zone,
1202 struct list_head *list,
1203 enum lru_list lru)
1204{
1205 unsigned long pgmoved = 0;
1206 struct pagevec pvec;
1207 struct page *page;
1208
1209 pagevec_init(&pvec, 1);
1210
1211 while (!list_empty(list)) {
1212 page = lru_to_page(list);
1213 prefetchw_prev_lru_page(page, list, flags);
1214
1215 VM_BUG_ON(PageLRU(page));
1216 SetPageLRU(page);
1217
1218 VM_BUG_ON(!PageActive(page));
1219 if (!is_active_lru(lru))
1220 ClearPageActive(page); /* we are de-activating */
1221
1222 list_move(&page->lru, &zone->lru[lru].list);
1223 mem_cgroup_add_lru_list(page, lru);
1224 pgmoved++;
1225
1226 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1227 spin_unlock_irq(&zone->lru_lock);
1228 if (buffer_heads_over_limit)
1229 pagevec_strip(&pvec);
1230 __pagevec_release(&pvec);
1231 spin_lock_irq(&zone->lru_lock);
1232 }
1233 }
1234 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1235 if (!is_active_lru(lru))
1236 __count_vm_events(PGDEACTIVATE, pgmoved);
1237}
1220 1238
1221static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1239static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1222 struct scan_control *sc, int priority, int file) 1240 struct scan_control *sc, int priority, int file)
1223{ 1241{
1224 unsigned long pgmoved; 1242 unsigned long pgmoved;
1225 int pgdeactivate = 0;
1226 unsigned long pgscanned; 1243 unsigned long pgscanned;
1244 unsigned long vm_flags;
1227 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1245 LIST_HEAD(l_hold); /* The pages which were snipped off */
1246 LIST_HEAD(l_active);
1228 LIST_HEAD(l_inactive); 1247 LIST_HEAD(l_inactive);
1229 struct page *page; 1248 struct page *page;
1230 struct pagevec pvec;
1231 enum lru_list lru;
1232 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1249 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1233 1250
1234 lru_add_drain(); 1251 lru_add_drain();
@@ -1245,13 +1262,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1245 } 1262 }
1246 reclaim_stat->recent_scanned[!!file] += pgmoved; 1263 reclaim_stat->recent_scanned[!!file] += pgmoved;
1247 1264
1265 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1248 if (file) 1266 if (file)
1249 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1267 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1250 else 1268 else
1251 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1269 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1252 spin_unlock_irq(&zone->lru_lock); 1270 spin_unlock_irq(&zone->lru_lock);
1253 1271
1254 pgmoved = 0; 1272 pgmoved = 0; /* count referenced (mapping) mapped pages */
1255 while (!list_empty(&l_hold)) { 1273 while (!list_empty(&l_hold)) {
1256 cond_resched(); 1274 cond_resched();
1257 page = lru_to_page(&l_hold); 1275 page = lru_to_page(&l_hold);
@@ -1264,58 +1282,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1264 1282
1265 /* page_referenced clears PageReferenced */ 1283 /* page_referenced clears PageReferenced */
1266 if (page_mapping_inuse(page) && 1284 if (page_mapping_inuse(page) &&
1267 page_referenced(page, 0, sc->mem_cgroup)) 1285 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1268 pgmoved++; 1286 pgmoved++;
1287 /*
1288 * Identify referenced, file-backed active pages and
1289 * give them one more trip around the active list. So
1290 * that executable code get better chances to stay in
1291 * memory under moderate memory pressure. Anon pages
1292 * are not likely to be evicted by use-once streaming
1293 * IO, plus JVM can create lots of anon VM_EXEC pages,
1294 * so we ignore them here.
1295 */
1296 if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
1297 list_add(&page->lru, &l_active);
1298 continue;
1299 }
1300 }
1269 1301
1270 list_add(&page->lru, &l_inactive); 1302 list_add(&page->lru, &l_inactive);
1271 } 1303 }
1272 1304
1273 /* 1305 /*
1274 * Move the pages to the [file or anon] inactive list. 1306 * Move pages back to the lru list.
1275 */ 1307 */
1276 pagevec_init(&pvec, 1);
1277 lru = LRU_BASE + file * LRU_FILE;
1278
1279 spin_lock_irq(&zone->lru_lock); 1308 spin_lock_irq(&zone->lru_lock);
1280 /* 1309 /*
1281 * Count referenced pages from currently used mappings as 1310 * Count referenced pages from currently used mappings as rotated,
1282 * rotated, even though they are moved to the inactive list. 1311 * even though only some of them are actually re-activated. This
1283 * This helps balance scan pressure between file and anonymous 1312 * helps balance scan pressure between file and anonymous pages in
1284 * pages in get_scan_ratio. 1313 * get_scan_ratio.
1285 */ 1314 */
1286 reclaim_stat->recent_rotated[!!file] += pgmoved; 1315 reclaim_stat->recent_rotated[!!file] += pgmoved;
1287 1316
1288 pgmoved = 0; 1317 move_active_pages_to_lru(zone, &l_active,
1289 while (!list_empty(&l_inactive)) { 1318 LRU_ACTIVE + file * LRU_FILE);
1290 page = lru_to_page(&l_inactive); 1319 move_active_pages_to_lru(zone, &l_inactive,
1291 prefetchw_prev_lru_page(page, &l_inactive, flags); 1320 LRU_BASE + file * LRU_FILE);
1292 VM_BUG_ON(PageLRU(page));
1293 SetPageLRU(page);
1294 VM_BUG_ON(!PageActive(page));
1295 ClearPageActive(page);
1296 1321
1297 list_move(&page->lru, &zone->lru[lru].list);
1298 mem_cgroup_add_lru_list(page, lru);
1299 pgmoved++;
1300 if (!pagevec_add(&pvec, page)) {
1301 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1302 spin_unlock_irq(&zone->lru_lock);
1303 pgdeactivate += pgmoved;
1304 pgmoved = 0;
1305 if (buffer_heads_over_limit)
1306 pagevec_strip(&pvec);
1307 __pagevec_release(&pvec);
1308 spin_lock_irq(&zone->lru_lock);
1309 }
1310 }
1311 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1312 pgdeactivate += pgmoved;
1313 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1314 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1315 spin_unlock_irq(&zone->lru_lock); 1322 spin_unlock_irq(&zone->lru_lock);
1316 if (buffer_heads_over_limit)
1317 pagevec_strip(&pvec);
1318 pagevec_release(&pvec);
1319} 1323}
1320 1324
1321static int inactive_anon_is_low_global(struct zone *zone) 1325static int inactive_anon_is_low_global(struct zone *zone)
@@ -1350,12 +1354,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1350 return low; 1354 return low;
1351} 1355}
1352 1356
1357static int inactive_file_is_low_global(struct zone *zone)
1358{
1359 unsigned long active, inactive;
1360
1361 active = zone_page_state(zone, NR_ACTIVE_FILE);
1362 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1363
1364 return (active > inactive);
1365}
1366
1367/**
1368 * inactive_file_is_low - check if file pages need to be deactivated
1369 * @zone: zone to check
1370 * @sc: scan control of this context
1371 *
1372 * When the system is doing streaming IO, memory pressure here
1373 * ensures that active file pages get deactivated, until more
1374 * than half of the file pages are on the inactive list.
1375 *
1376 * Once we get to that situation, protect the system's working
1377 * set from being evicted by disabling active file page aging.
1378 *
1379 * This uses a different ratio than the anonymous pages, because
1380 * the page cache uses a use-once replacement algorithm.
1381 */
1382static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1383{
1384 int low;
1385
1386 if (scanning_global_lru(sc))
1387 low = inactive_file_is_low_global(zone);
1388 else
1389 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1390 return low;
1391}
1392
1353static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1393static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1354 struct zone *zone, struct scan_control *sc, int priority) 1394 struct zone *zone, struct scan_control *sc, int priority)
1355{ 1395{
1356 int file = is_file_lru(lru); 1396 int file = is_file_lru(lru);
1357 1397
1358 if (lru == LRU_ACTIVE_FILE) { 1398 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
1359 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1399 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1360 return 0; 1400 return 0;
1361 } 1401 }
@@ -1384,13 +1424,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1384 unsigned long ap, fp; 1424 unsigned long ap, fp;
1385 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1425 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1386 1426
1387 /* If we have no swap space, do not bother scanning anon pages. */
1388 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1389 percent[0] = 0;
1390 percent[1] = 100;
1391 return;
1392 }
1393
1394 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1427 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1395 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1428 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1396 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1429 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1400,7 +1433,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1400 free = zone_page_state(zone, NR_FREE_PAGES); 1433 free = zone_page_state(zone, NR_FREE_PAGES);
1401 /* If we have very few page cache pages, 1434 /* If we have very few page cache pages,
1402 force-scan anon pages. */ 1435 force-scan anon pages. */
1403 if (unlikely(file + free <= zone->pages_high)) { 1436 if (unlikely(file + free <= high_wmark_pages(zone))) {
1404 percent[0] = 100; 1437 percent[0] = 100;
1405 percent[1] = 0; 1438 percent[1] = 0;
1406 return; 1439 return;
@@ -1455,6 +1488,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1455 percent[1] = 100 - percent[0]; 1488 percent[1] = 100 - percent[0];
1456} 1489}
1457 1490
1491/*
1492 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1493 * until we collected @swap_cluster_max pages to scan.
1494 */
1495static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1496 unsigned long *nr_saved_scan,
1497 unsigned long swap_cluster_max)
1498{
1499 unsigned long nr;
1500
1501 *nr_saved_scan += nr_to_scan;
1502 nr = *nr_saved_scan;
1503
1504 if (nr >= swap_cluster_max)
1505 *nr_saved_scan = 0;
1506 else
1507 nr = 0;
1508
1509 return nr;
1510}
1458 1511
1459/* 1512/*
1460 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1513 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
@@ -1468,26 +1521,30 @@ static void shrink_zone(int priority, struct zone *zone,
1468 enum lru_list l; 1521 enum lru_list l;
1469 unsigned long nr_reclaimed = sc->nr_reclaimed; 1522 unsigned long nr_reclaimed = sc->nr_reclaimed;
1470 unsigned long swap_cluster_max = sc->swap_cluster_max; 1523 unsigned long swap_cluster_max = sc->swap_cluster_max;
1524 int noswap = 0;
1471 1525
1472 get_scan_ratio(zone, sc, percent); 1526 /* If we have no swap space, do not bother scanning anon pages. */
1527 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1528 noswap = 1;
1529 percent[0] = 0;
1530 percent[1] = 100;
1531 } else
1532 get_scan_ratio(zone, sc, percent);
1473 1533
1474 for_each_evictable_lru(l) { 1534 for_each_evictable_lru(l) {
1475 int file = is_file_lru(l); 1535 int file = is_file_lru(l);
1476 unsigned long scan; 1536 unsigned long scan;
1477 1537
1478 scan = zone_nr_pages(zone, sc, l); 1538 scan = zone_nr_pages(zone, sc, l);
1479 if (priority) { 1539 if (priority || noswap) {
1480 scan >>= priority; 1540 scan >>= priority;
1481 scan = (scan * percent[file]) / 100; 1541 scan = (scan * percent[file]) / 100;
1482 } 1542 }
1483 if (scanning_global_lru(sc)) { 1543 if (scanning_global_lru(sc))
1484 zone->lru[l].nr_scan += scan; 1544 nr[l] = nr_scan_try_batch(scan,
1485 nr[l] = zone->lru[l].nr_scan; 1545 &zone->lru[l].nr_saved_scan,
1486 if (nr[l] >= swap_cluster_max) 1546 swap_cluster_max);
1487 zone->lru[l].nr_scan = 0; 1547 else
1488 else
1489 nr[l] = 0;
1490 } else
1491 nr[l] = scan; 1548 nr[l] = scan;
1492 } 1549 }
1493 1550
@@ -1521,7 +1578,7 @@ static void shrink_zone(int priority, struct zone *zone,
1521 * Even if we did not try to evict anon pages at all, we want to 1578 * Even if we did not try to evict anon pages at all, we want to
1522 * rebalance the anon lru active/inactive ratio. 1579 * rebalance the anon lru active/inactive ratio.
1523 */ 1580 */
1524 if (inactive_anon_is_low(zone, sc)) 1581 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
1525 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1582 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1526 1583
1527 throttle_vm_writeout(sc->gfp_mask); 1584 throttle_vm_writeout(sc->gfp_mask);
@@ -1532,11 +1589,13 @@ static void shrink_zone(int priority, struct zone *zone,
1532 * try to reclaim pages from zones which will satisfy the caller's allocation 1589 * try to reclaim pages from zones which will satisfy the caller's allocation
1533 * request. 1590 * request.
1534 * 1591 *
1535 * We reclaim from a zone even if that zone is over pages_high. Because: 1592 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
1593 * Because:
1536 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 1594 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1537 * allocation or 1595 * allocation or
1538 * b) The zones may be over pages_high but they must go *over* pages_high to 1596 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
1539 * satisfy the `incremental min' zone defense algorithm. 1597 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
1598 * zone defense algorithm.
1540 * 1599 *
1541 * If a zone is deemed to be full of pinned pages then just give it a light 1600 * If a zone is deemed to be full of pinned pages then just give it a light
1542 * scan then give up on it. 1601 * scan then give up on it.
@@ -1742,7 +1801,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1742 1801
1743/* 1802/*
1744 * For kswapd, balance_pgdat() will work across all this node's zones until 1803 * For kswapd, balance_pgdat() will work across all this node's zones until
1745 * they are all at pages_high. 1804 * they are all at high_wmark_pages(zone).
1746 * 1805 *
1747 * Returns the number of pages which were actually freed. 1806 * Returns the number of pages which were actually freed.
1748 * 1807 *
@@ -1755,11 +1814,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1755 * the zone for when the problem goes away. 1814 * the zone for when the problem goes away.
1756 * 1815 *
1757 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1816 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1758 * zones which have free_pages > pages_high, but once a zone is found to have 1817 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
1759 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1818 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
1760 * of the number of free pages in the lower zones. This interoperates with 1819 * lower zones regardless of the number of free pages in the lower zones. This
1761 * the page allocator fallback scheme to ensure that aging of pages is balanced 1820 * interoperates with the page allocator fallback scheme to ensure that aging
1762 * across the zones. 1821 * of pages is balanced across the zones.
1763 */ 1822 */
1764static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1823static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1765{ 1824{
@@ -1780,7 +1839,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1780 }; 1839 };
1781 /* 1840 /*
1782 * temp_priority is used to remember the scanning priority at which 1841 * temp_priority is used to remember the scanning priority at which
1783 * this zone was successfully refilled to free_pages == pages_high. 1842 * this zone was successfully refilled to
1843 * free_pages == high_wmark_pages(zone).
1784 */ 1844 */
1785 int temp_priority[MAX_NR_ZONES]; 1845 int temp_priority[MAX_NR_ZONES];
1786 1846
@@ -1825,8 +1885,8 @@ loop_again:
1825 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1885 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1826 &sc, priority, 0); 1886 &sc, priority, 0);
1827 1887
1828 if (!zone_watermark_ok(zone, order, zone->pages_high, 1888 if (!zone_watermark_ok(zone, order,
1829 0, 0)) { 1889 high_wmark_pages(zone), 0, 0)) {
1830 end_zone = i; 1890 end_zone = i;
1831 break; 1891 break;
1832 } 1892 }
@@ -1860,8 +1920,8 @@ loop_again:
1860 priority != DEF_PRIORITY) 1920 priority != DEF_PRIORITY)
1861 continue; 1921 continue;
1862 1922
1863 if (!zone_watermark_ok(zone, order, zone->pages_high, 1923 if (!zone_watermark_ok(zone, order,
1864 end_zone, 0)) 1924 high_wmark_pages(zone), end_zone, 0))
1865 all_zones_ok = 0; 1925 all_zones_ok = 0;
1866 temp_priority[i] = priority; 1926 temp_priority[i] = priority;
1867 sc.nr_scanned = 0; 1927 sc.nr_scanned = 0;
@@ -1870,8 +1930,8 @@ loop_again:
1870 * We put equal pressure on every zone, unless one 1930 * We put equal pressure on every zone, unless one
1871 * zone has way too many pages free already. 1931 * zone has way too many pages free already.
1872 */ 1932 */
1873 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1933 if (!zone_watermark_ok(zone, order,
1874 end_zone, 0)) 1934 8*high_wmark_pages(zone), end_zone, 0))
1875 shrink_zone(priority, zone, &sc); 1935 shrink_zone(priority, zone, &sc);
1876 reclaim_state->reclaimed_slab = 0; 1936 reclaim_state->reclaimed_slab = 0;
1877 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1937 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2037,7 +2097,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2037 return; 2097 return;
2038 2098
2039 pgdat = zone->zone_pgdat; 2099 pgdat = zone->zone_pgdat;
2040 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 2100 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2041 return; 2101 return;
2042 if (pgdat->kswapd_max_order < order) 2102 if (pgdat->kswapd_max_order < order)
2043 pgdat->kswapd_max_order = order; 2103 pgdat->kswapd_max_order = order;
@@ -2056,7 +2116,7 @@ unsigned long global_lru_pages(void)
2056 + global_page_state(NR_INACTIVE_FILE); 2116 + global_page_state(NR_INACTIVE_FILE);
2057} 2117}
2058 2118
2059#ifdef CONFIG_PM 2119#ifdef CONFIG_HIBERNATION
2060/* 2120/*
2061 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2121 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
2062 * from LRU lists system-wide, for given pass and priority. 2122 * from LRU lists system-wide, for given pass and priority.
@@ -2084,11 +2144,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2084 l == LRU_ACTIVE_FILE)) 2144 l == LRU_ACTIVE_FILE))
2085 continue; 2145 continue;
2086 2146
2087 zone->lru[l].nr_scan += (lru_pages >> prio) + 1; 2147 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
2088 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { 2148 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
2089 unsigned long nr_to_scan; 2149 unsigned long nr_to_scan;
2090 2150
2091 zone->lru[l].nr_scan = 0; 2151 zone->lru[l].nr_saved_scan = 0;
2092 nr_to_scan = min(nr_pages, lru_pages); 2152 nr_to_scan = min(nr_pages, lru_pages);
2093 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2153 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2094 sc, prio); 2154 sc, prio);
@@ -2196,7 +2256,7 @@ out:
2196 2256
2197 return sc.nr_reclaimed; 2257 return sc.nr_reclaimed;
2198} 2258}
2199#endif 2259#endif /* CONFIG_HIBERNATION */
2200 2260
2201/* It's optimal to keep kswapds on the same CPUs as their memory, but 2261/* It's optimal to keep kswapds on the same CPUs as their memory, but
2202 not required for correctness. So if the last cpu in a node goes 2262 not required for correctness. So if the last cpu in a node goes
@@ -2290,6 +2350,48 @@ int sysctl_min_unmapped_ratio = 1;
2290 */ 2350 */
2291int sysctl_min_slab_ratio = 5; 2351int sysctl_min_slab_ratio = 5;
2292 2352
2353static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2354{
2355 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2356 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2357 zone_page_state(zone, NR_ACTIVE_FILE);
2358
2359 /*
2360 * It's possible for there to be more file mapped pages than
2361 * accounted for by the pages on the file LRU lists because
2362 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
2363 */
2364 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2365}
2366
2367/* Work out how many page cache pages we can reclaim in this reclaim_mode */
2368static long zone_pagecache_reclaimable(struct zone *zone)
2369{
2370 long nr_pagecache_reclaimable;
2371 long delta = 0;
2372
2373 /*
2374 * If RECLAIM_SWAP is set, then all file pages are considered
2375 * potentially reclaimable. Otherwise, we have to worry about
2376 * pages like swapcache and zone_unmapped_file_pages() provides
2377 * a better estimate
2378 */
2379 if (zone_reclaim_mode & RECLAIM_SWAP)
2380 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2381 else
2382 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2383
2384 /* If we can't clean pages, remove dirty pages from consideration */
2385 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2386 delta += zone_page_state(zone, NR_FILE_DIRTY);
2387
2388 /* Watch for any possible underflows due to delta */
2389 if (unlikely(delta > nr_pagecache_reclaimable))
2390 delta = nr_pagecache_reclaimable;
2391
2392 return nr_pagecache_reclaimable - delta;
2393}
2394
2293/* 2395/*
2294 * Try to free up some pages from this zone through reclaim. 2396 * Try to free up some pages from this zone through reclaim.
2295 */ 2397 */
@@ -2324,9 +2426,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2324 reclaim_state.reclaimed_slab = 0; 2426 reclaim_state.reclaimed_slab = 0;
2325 p->reclaim_state = &reclaim_state; 2427 p->reclaim_state = &reclaim_state;
2326 2428
2327 if (zone_page_state(zone, NR_FILE_PAGES) - 2429 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2328 zone_page_state(zone, NR_FILE_MAPPED) >
2329 zone->min_unmapped_pages) {
2330 /* 2430 /*
2331 * Free memory by calling shrink zone with increasing 2431 * Free memory by calling shrink zone with increasing
2332 * priorities until we have enough memory freed. 2432 * priorities until we have enough memory freed.
@@ -2384,20 +2484,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2384 * if less than a specified percentage of the zone is used by 2484 * if less than a specified percentage of the zone is used by
2385 * unmapped file backed pages. 2485 * unmapped file backed pages.
2386 */ 2486 */
2387 if (zone_page_state(zone, NR_FILE_PAGES) - 2487 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
2388 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages 2488 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2389 && zone_page_state(zone, NR_SLAB_RECLAIMABLE) 2489 return ZONE_RECLAIM_FULL;
2390 <= zone->min_slab_pages)
2391 return 0;
2392 2490
2393 if (zone_is_all_unreclaimable(zone)) 2491 if (zone_is_all_unreclaimable(zone))
2394 return 0; 2492 return ZONE_RECLAIM_FULL;
2395 2493
2396 /* 2494 /*
2397 * Do not scan if the allocation should not be delayed. 2495 * Do not scan if the allocation should not be delayed.
2398 */ 2496 */
2399 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 2497 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2400 return 0; 2498 return ZONE_RECLAIM_NOSCAN;
2401 2499
2402 /* 2500 /*
2403 * Only run zone reclaim on the local zone or on zones that do not 2501 * Only run zone reclaim on the local zone or on zones that do not
@@ -2407,18 +2505,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2407 */ 2505 */
2408 node_id = zone_to_nid(zone); 2506 node_id = zone_to_nid(zone);
2409 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 2507 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2410 return 0; 2508 return ZONE_RECLAIM_NOSCAN;
2411 2509
2412 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 2510 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2413 return 0; 2511 return ZONE_RECLAIM_NOSCAN;
2512
2414 ret = __zone_reclaim(zone, gfp_mask, order); 2513 ret = __zone_reclaim(zone, gfp_mask, order);
2415 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 2514 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2416 2515
2516 if (!ret)
2517 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
2518
2417 return ret; 2519 return ret;
2418} 2520}
2419#endif 2521#endif
2420 2522
2421#ifdef CONFIG_UNEVICTABLE_LRU
2422/* 2523/*
2423 * page_evictable - test whether a page is evictable 2524 * page_evictable - test whether a page is evictable
2424 * @page: the page to test 2525 * @page: the page to test
@@ -2665,4 +2766,3 @@ void scan_unevictable_unregister_node(struct node *node)
2665 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 2766 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2666} 2767}
2667 2768
2668#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 74d66dba0cbe..138bed53706e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = {
629 "nr_active_anon", 629 "nr_active_anon",
630 "nr_inactive_file", 630 "nr_inactive_file",
631 "nr_active_file", 631 "nr_active_file",
632#ifdef CONFIG_UNEVICTABLE_LRU
633 "nr_unevictable", 632 "nr_unevictable",
634 "nr_mlock", 633 "nr_mlock",
635#endif
636 "nr_anon_pages", 634 "nr_anon_pages",
637 "nr_mapped", 635 "nr_mapped",
638 "nr_file_pages", 636 "nr_file_pages",
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = {
675 TEXTS_FOR_ZONES("pgscan_kswapd") 673 TEXTS_FOR_ZONES("pgscan_kswapd")
676 TEXTS_FOR_ZONES("pgscan_direct") 674 TEXTS_FOR_ZONES("pgscan_direct")
677 675
676#ifdef CONFIG_NUMA
677 "zone_reclaim_failed",
678#endif
678 "pginodesteal", 679 "pginodesteal",
679 "slabs_scanned", 680 "slabs_scanned",
680 "kswapd_steal", 681 "kswapd_steal",
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = {
687 "htlb_buddy_alloc_success", 688 "htlb_buddy_alloc_success",
688 "htlb_buddy_alloc_fail", 689 "htlb_buddy_alloc_fail",
689#endif 690#endif
690#ifdef CONFIG_UNEVICTABLE_LRU
691 "unevictable_pgs_culled", 691 "unevictable_pgs_culled",
692 "unevictable_pgs_scanned", 692 "unevictable_pgs_scanned",
693 "unevictable_pgs_rescued", 693 "unevictable_pgs_rescued",
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = {
697 "unevictable_pgs_stranded", 697 "unevictable_pgs_stranded",
698 "unevictable_pgs_mlockfreed", 698 "unevictable_pgs_mlockfreed",
699#endif 699#endif
700#endif
701}; 700};
702 701
703static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 702static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
710 "\n min %lu" 709 "\n min %lu"
711 "\n low %lu" 710 "\n low %lu"
712 "\n high %lu" 711 "\n high %lu"
713 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" 712 "\n scanned %lu"
714 "\n spanned %lu" 713 "\n spanned %lu"
715 "\n present %lu", 714 "\n present %lu",
716 zone_page_state(zone, NR_FREE_PAGES), 715 zone_page_state(zone, NR_FREE_PAGES),
717 zone->pages_min, 716 min_wmark_pages(zone),
718 zone->pages_low, 717 low_wmark_pages(zone),
719 zone->pages_high, 718 high_wmark_pages(zone),
720 zone->pages_scanned, 719 zone->pages_scanned,
721 zone->lru[LRU_ACTIVE_ANON].nr_scan,
722 zone->lru[LRU_INACTIVE_ANON].nr_scan,
723 zone->lru[LRU_ACTIVE_FILE].nr_scan,
724 zone->lru[LRU_INACTIVE_FILE].nr_scan,
725 zone->spanned_pages, 720 zone->spanned_pages,
726 zone->present_pages); 721 zone->present_pages);
727 722