Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              18
-rw-r--r--  mm/Kconfig.debug         1
-rw-r--r--  mm/Makefile              2
-rw-r--r--  mm/bounce.c              1
-rw-r--r--  mm/fadvise.c             2
-rw-r--r--  mm/filemap.c           169
-rw-r--r--  mm/highmem.c             1
-rw-r--r--  mm/hugetlb.c           106
-rw-r--r--  mm/init-mm.c            20
-rw-r--r--  mm/internal.h           33
-rw-r--r--  mm/kmemcheck.c         122
-rw-r--r--  mm/kmemleak.c           27
-rw-r--r--  mm/madvise.c            26
-rw-r--r--  mm/memcontrol.c        137
-rw-r--r--  mm/memory.c            128
-rw-r--r--  mm/memory_hotplug.c      6
-rw-r--r--  mm/mempolicy.c         145
-rw-r--r--  mm/migrate.c             6
-rw-r--r--  mm/mlock.c              22
-rw-r--r--  mm/oom_kill.c           64
-rw-r--r--  mm/page-writeback.c     19
-rw-r--r--  mm/page_alloc.c        772
-rw-r--r--  mm/page_cgroup.c        24
-rw-r--r--  mm/page_io.c             2
-rw-r--r--  mm/readahead.c         145
-rw-r--r--  mm/rmap.c               45
-rw-r--r--  mm/shmem.c               4
-rw-r--r--  mm/slab.c              126
-rw-r--r--  mm/slob.c               10
-rw-r--r--  mm/slub.c              141
-rw-r--r--  mm/swap_state.c         17
-rw-r--r--  mm/swapfile.c          284
-rw-r--r--  mm/truncate.c           39
-rw-r--r--  mm/util.c               20
-rw-r--r--  mm/vmscan.c            376
-rw-r--r--  mm/vmstat.c             19
36 files changed, 2046 insertions, 1033 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 71830ba7b986..c948d4ca8bde 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP
128config MEMORY_HOTPLUG 128config MEMORY_HOTPLUG
129 bool "Allow for memory hot-add" 129 bool "Allow for memory hot-add"
130 depends on SPARSEMEM || X86_64_ACPI_NUMA 130 depends on SPARSEMEM || X86_64_ACPI_NUMA
131 depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG 131 depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
132 depends on (IA64 || X86 || PPC64 || SUPERH || S390) 132 depends on (IA64 || X86 || PPC64 || SUPERH || S390)
133 133
134comment "Memory hotplug is currently incompatible with Software Suspend" 134comment "Memory hotplug is currently incompatible with Software Suspend"
135 depends on SPARSEMEM && HOTPLUG && HIBERNATION 135 depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
136 136
137config MEMORY_HOTPLUG_SPARSE 137config MEMORY_HOTPLUG_SPARSE
138 def_bool y 138 def_bool y
@@ -203,25 +203,13 @@ config VIRT_TO_BUS
203 def_bool y 203 def_bool y
204 depends on !ARCH_NO_VIRT_TO_BUS 204 depends on !ARCH_NO_VIRT_TO_BUS
205 205
206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages"
208 default y
209 help
210 Keeps unevictable pages off of the active and inactive pageout
211 lists, so kswapd will not waste CPU time or have its balancing
212 algorithms thrown off by scanning these pages. Selecting this
213 will use one page flag and increase the code size a little,
214 say Y unless you know what you are doing.
215
216 See Documentation/vm/unevictable-lru.txt for more information.
217
218config HAVE_MLOCK 206config HAVE_MLOCK
219 bool 207 bool
220 default y if MMU=y 208 default y if MMU=y
221 209
222config HAVE_MLOCKED_PAGE_BIT 210config HAVE_MLOCKED_PAGE_BIT
223 bool 211 bool
224 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y 212 default y if HAVE_MLOCK=y
225 213
226config MMU_NOTIFIER 214config MMU_NOTIFIER
227 bool 215 bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f260..aa99fd1f7109 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || !PPC && !SPARC
5 depends on !KMEMCHECK
5 ---help--- 6 ---help---
6 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
diff --git a/mm/Makefile b/mm/Makefile
index e89acb090b4d..5e0bd6426693 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15obj-y += init-mm.o
15 16
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
31obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
30obj-$(CONFIG_FAILSLAB) += failslab.o 32obj-$(CONFIG_FAILSLAB) += failslab.o
31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
32obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 4ebe3ea83795..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/blktrace_api.h>
17#include <asm/tlbflush.h> 16#include <asm/tlbflush.h>
18 17
19#include <trace/events/block.h> 18#include <trace/events/block.h>
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
101 101
102 ret = force_page_cache_readahead(mapping, file, 102 ret = force_page_cache_readahead(mapping, file,
103 start_index, 103 start_index,
104 max_sane_readahead(nrpages)); 104 nrpages);
105 if (ret > 0) 105 if (ret > 0)
106 ret = 0; 106 ret = 0;
107 break; 107 break;
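
The fadvise hunk above (like the madvise and filemap hunks further down) drops the open-coded max_sane_readahead() clamp from the caller. A minimal sketch of the before/after call, on the assumption that the clamp now happens inside force_page_cache_readahead() itself; the mm/readahead.c hunks that would confirm this are not included in this excerpt:

    /* POSIX_FADV_WILLNEED path, before and after this change (sketch) */
    ret = force_page_cache_readahead(mapping, file, start_index,
                                     max_sane_readahead(nrpages)); /* old: caller clamps */
    ret = force_page_cache_readahead(mapping, file, start_index,
                                     nrpages);                     /* new: clamp applied internally */
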
diff --git a/mm/filemap.c b/mm/filemap.c
index 1b60f30cebfa..22396713feb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
521{ 521{
522 if (cpuset_do_page_mem_spread()) { 522 if (cpuset_do_page_mem_spread()) {
523 int n = cpuset_mem_spread_node(); 523 int n = cpuset_mem_spread_node();
524 return alloc_pages_node(n, gfp, 0); 524 return alloc_pages_exact_node(n, gfp, 0);
525 } 525 }
526 return alloc_pages(gfp, 0); 526 return alloc_pages(gfp, 0);
527} 527}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
1004static void shrink_readahead_size_eio(struct file *filp, 1004static void shrink_readahead_size_eio(struct file *filp,
1005 struct file_ra_state *ra) 1005 struct file_ra_state *ra)
1006{ 1006{
1007 if (!ra->ra_pages)
1008 return;
1009
1010 ra->ra_pages /= 4; 1007 ra->ra_pages /= 4;
1011} 1008}
1012 1009
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1390 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1387 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1391 return -EINVAL; 1388 return -EINVAL;
1392 1389
1393 force_page_cache_readahead(mapping, filp, index, 1390 force_page_cache_readahead(mapping, filp, index, nr);
1394 max_sane_readahead(nr));
1395 return 0; 1391 return 0;
1396} 1392}
1397 1393
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1457 1453
1458#define MMAP_LOTSAMISS (100) 1454#define MMAP_LOTSAMISS (100)
1459 1455
1456/*
1457 * Synchronous readahead happens when we don't even find
1458 * a page in the page cache at all.
1459 */
1460static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1461 struct file_ra_state *ra,
1462 struct file *file,
1463 pgoff_t offset)
1464{
1465 unsigned long ra_pages;
1466 struct address_space *mapping = file->f_mapping;
1467
1468 /* If we don't want any read-ahead, don't bother */
1469 if (VM_RandomReadHint(vma))
1470 return;
1471
1472 if (VM_SequentialReadHint(vma) ||
1473 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1474 page_cache_sync_readahead(mapping, ra, file, offset,
1475 ra->ra_pages);
1476 return;
1477 }
1478
1479 if (ra->mmap_miss < INT_MAX)
1480 ra->mmap_miss++;
1481
1482 /*
1483 * Do we miss much more than hit in this file? If so,
1484 * stop bothering with read-ahead. It will only hurt.
1485 */
1486 if (ra->mmap_miss > MMAP_LOTSAMISS)
1487 return;
1488
1489 /*
1490 * mmap read-around
1491 */
1492 ra_pages = max_sane_readahead(ra->ra_pages);
1493 if (ra_pages) {
1494 ra->start = max_t(long, 0, offset - ra_pages/2);
1495 ra->size = ra_pages;
1496 ra->async_size = 0;
1497 ra_submit(ra, mapping, file);
1498 }
1499}
1500
1501/*
1502 * Asynchronous readahead happens when we find the page and PG_readahead,
1503 * so we want to possibly extend the readahead further..
1504 */
1505static void do_async_mmap_readahead(struct vm_area_struct *vma,
1506 struct file_ra_state *ra,
1507 struct file *file,
1508 struct page *page,
1509 pgoff_t offset)
1510{
1511 struct address_space *mapping = file->f_mapping;
1512
1513 /* If we don't want any read-ahead, don't bother */
1514 if (VM_RandomReadHint(vma))
1515 return;
1516 if (ra->mmap_miss > 0)
1517 ra->mmap_miss--;
1518 if (PageReadahead(page))
1519 page_cache_async_readahead(mapping, ra, file,
1520 page, offset, ra->ra_pages);
1521}
1522
1460/** 1523/**
1461 * filemap_fault - read in file data for page fault handling 1524 * filemap_fault - read in file data for page fault handling
1462 * @vma: vma in which the fault was taken 1525 * @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1476 struct address_space *mapping = file->f_mapping; 1539 struct address_space *mapping = file->f_mapping;
1477 struct file_ra_state *ra = &file->f_ra; 1540 struct file_ra_state *ra = &file->f_ra;
1478 struct inode *inode = mapping->host; 1541 struct inode *inode = mapping->host;
1542 pgoff_t offset = vmf->pgoff;
1479 struct page *page; 1543 struct page *page;
1480 pgoff_t size; 1544 pgoff_t size;
1481 int did_readaround = 0;
1482 int ret = 0; 1545 int ret = 0;
1483 1546
1484 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1547 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1485 if (vmf->pgoff >= size) 1548 if (offset >= size)
1486 return VM_FAULT_SIGBUS; 1549 return VM_FAULT_SIGBUS;
1487 1550
1488 /* If we don't want any read-ahead, don't bother */
1489 if (VM_RandomReadHint(vma))
1490 goto no_cached_page;
1491
1492 /* 1551 /*
1493 * Do we have something in the page cache already? 1552 * Do we have something in the page cache already?
1494 */ 1553 */
1495retry_find: 1554 page = find_get_page(mapping, offset);
1496 page = find_lock_page(mapping, vmf->pgoff); 1555 if (likely(page)) {
1497 /*
1498 * For sequential accesses, we use the generic readahead logic.
1499 */
1500 if (VM_SequentialReadHint(vma)) {
1501 if (!page) {
1502 page_cache_sync_readahead(mapping, ra, file,
1503 vmf->pgoff, 1);
1504 page = find_lock_page(mapping, vmf->pgoff);
1505 if (!page)
1506 goto no_cached_page;
1507 }
1508 if (PageReadahead(page)) {
1509 page_cache_async_readahead(mapping, ra, file, page,
1510 vmf->pgoff, 1);
1511 }
1512 }
1513
1514 if (!page) {
1515 unsigned long ra_pages;
1516
1517 ra->mmap_miss++;
1518
1519 /* 1556 /*
1520 * Do we miss much more than hit in this file? If so, 1557 * We found the page, so try async readahead before
1521 * stop bothering with read-ahead. It will only hurt. 1558 * waiting for the lock.
1522 */ 1559 */
1523 if (ra->mmap_miss > MMAP_LOTSAMISS) 1560 do_async_mmap_readahead(vma, ra, file, page, offset);
1524 goto no_cached_page; 1561 lock_page(page);
1525 1562
1526 /* 1563 /* Did it get truncated? */
1527 * To keep the pgmajfault counter straight, we need to 1564 if (unlikely(page->mapping != mapping)) {
1528 * check did_readaround, as this is an inner loop. 1565 unlock_page(page);
1529 */ 1566 put_page(page);
1530 if (!did_readaround) { 1567 goto no_cached_page;
1531 ret = VM_FAULT_MAJOR;
1532 count_vm_event(PGMAJFAULT);
1533 }
1534 did_readaround = 1;
1535 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1536 if (ra_pages) {
1537 pgoff_t start = 0;
1538
1539 if (vmf->pgoff > ra_pages / 2)
1540 start = vmf->pgoff - ra_pages / 2;
1541 do_page_cache_readahead(mapping, file, start, ra_pages);
1542 } 1568 }
1543 page = find_lock_page(mapping, vmf->pgoff); 1569 } else {
1570 /* No page in the page cache at all */
1571 do_sync_mmap_readahead(vma, ra, file, offset);
1572 count_vm_event(PGMAJFAULT);
1573 ret = VM_FAULT_MAJOR;
1574retry_find:
1575 page = find_lock_page(mapping, offset);
1544 if (!page) 1576 if (!page)
1545 goto no_cached_page; 1577 goto no_cached_page;
1546 } 1578 }
1547 1579
1548 if (!did_readaround)
1549 ra->mmap_miss--;
1550
1551 /* 1580 /*
1552 * We have a locked page in the page cache, now we need to check 1581 * We have a locked page in the page cache, now we need to check
1553 * that it's up-to-date. If not, it is going to be due to an error. 1582 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
1555 if (unlikely(!PageUptodate(page))) 1584 if (unlikely(!PageUptodate(page)))
1556 goto page_not_uptodate; 1585 goto page_not_uptodate;
1557 1586
1558 /* Must recheck i_size under page lock */ 1587 /*
1588 * Found the page and have a reference on it.
1589 * We must recheck i_size under page lock.
1590 */
1559 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1591 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1560 if (unlikely(vmf->pgoff >= size)) { 1592 if (unlikely(offset >= size)) {
1561 unlock_page(page); 1593 unlock_page(page);
1562 page_cache_release(page); 1594 page_cache_release(page);
1563 return VM_FAULT_SIGBUS; 1595 return VM_FAULT_SIGBUS;
1564 } 1596 }
1565 1597
1566 /* 1598 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1567 * Found the page and have a reference on it.
1568 */
1569 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1570 vmf->page = page; 1599 vmf->page = page;
1571 return ret | VM_FAULT_LOCKED; 1600 return ret | VM_FAULT_LOCKED;
1572 1601
@@ -1575,7 +1604,7 @@ no_cached_page:
1575 * We're only likely to ever get here if MADV_RANDOM is in 1604 * We're only likely to ever get here if MADV_RANDOM is in
1576 * effect. 1605 * effect.
1577 */ 1606 */
1578 error = page_cache_read(file, vmf->pgoff); 1607 error = page_cache_read(file, offset);
1579 1608
1580 /* 1609 /*
1581 * The page we want has now been added to the page cache. 1610 * The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
1595 return VM_FAULT_SIGBUS; 1624 return VM_FAULT_SIGBUS;
1596 1625
1597page_not_uptodate: 1626page_not_uptodate:
1598 /* IO error path */
1599 if (!did_readaround) {
1600 ret = VM_FAULT_MAJOR;
1601 count_vm_event(PGMAJFAULT);
1602 }
1603
1604 /* 1627 /*
1605 * Umm, take care of errors if the page isn't up-to-date. 1628 * Umm, take care of errors if the page isn't up-to-date.
1606 * Try to re-read it _once_. We do this synchronously, 1629 * Try to re-read it _once_. We do this synchronously,
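
Taken together, the filemap.c hunks replace filemap_fault()'s old read-around loop with two helpers and a find_get_page() fast path. A reader's sketch of the resulting control flow, using the names introduced above (the truncation recheck and error paths are omitted for brevity):

    page = find_get_page(mapping, offset);
    if (likely(page)) {
            /* cache hit: kick off async readahead, then wait for the page lock */
            do_async_mmap_readahead(vma, ra, file, page, offset);
            lock_page(page);
    } else {
            /* cache miss: read around the faulting offset, count a major fault */
            do_sync_mmap_readahead(vma, ra, file, offset);
            count_vm_event(PGMAJFAULT);
            ret = VM_FAULT_MAJOR;
            page = find_lock_page(mapping, offset);
            if (!page)
                    goto no_cached_page;
    }

Note that with this flow a fault that finds the page already in the page cache is never accounted as a major fault.
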
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..25878cc49daa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/blktrace_api.h>
30#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
31 30
32/* 31/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e83ad2c9228c..a56e6f3ce979 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
578 hugetlb_put_quota(mapping, 1); 578 hugetlb_put_quota(mapping, 1);
579} 579}
580 580
581/*
582 * Increment or decrement surplus_huge_pages. Keep node-specific counters
583 * balanced by operating on them in a round-robin fashion.
584 * Returns 1 if an adjustment was made.
585 */
586static int adjust_pool_surplus(struct hstate *h, int delta)
587{
588 static int prev_nid;
589 int nid = prev_nid;
590 int ret = 0;
591
592 VM_BUG_ON(delta != -1 && delta != 1);
593 do {
594 nid = next_node(nid, node_online_map);
595 if (nid == MAX_NUMNODES)
596 nid = first_node(node_online_map);
597
598 /* To shrink on this node, there must be a surplus page */
599 if (delta < 0 && !h->surplus_huge_pages_node[nid])
600 continue;
601 /* Surplus cannot exceed the total number of pages */
602 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
603 h->nr_huge_pages_node[nid])
604 continue;
605
606 h->surplus_huge_pages += delta;
607 h->surplus_huge_pages_node[nid] += delta;
608 ret = 1;
609 break;
610 } while (nid != prev_nid);
611
612 prev_nid = nid;
613 return ret;
614}
615
616static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 581static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
617{ 582{
618 set_compound_page_dtor(page, free_huge_page); 583 set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
623 put_page(page); /* free it into the hugepage allocator */ 588 put_page(page); /* free it into the hugepage allocator */
624} 589}
625 590
591static void prep_compound_gigantic_page(struct page *page, unsigned long order)
592{
593 int i;
594 int nr_pages = 1 << order;
595 struct page *p = page + 1;
596
597 /* we rely on prep_new_huge_page to set the destructor */
598 set_compound_order(page, order);
599 __SetPageHead(page);
600 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
601 __SetPageTail(p);
602 p->first_page = page;
603 }
604}
605
606int PageHuge(struct page *page)
607{
608 compound_page_dtor *dtor;
609
610 if (!PageCompound(page))
611 return 0;
612
613 page = compound_head(page);
614 dtor = get_compound_page_dtor(page);
615
616 return dtor == free_huge_page;
617}
618
626static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 619static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
627{ 620{
628 struct page *page; 621 struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
630 if (h->order >= MAX_ORDER) 623 if (h->order >= MAX_ORDER)
631 return NULL; 624 return NULL;
632 625
633 page = alloc_pages_node(nid, 626 page = alloc_pages_exact_node(nid,
634 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 627 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
635 __GFP_REPEAT|__GFP_NOWARN, 628 __GFP_REPEAT|__GFP_NOWARN,
636 huge_page_order(h)); 629 huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 * Use a helper variable to find the next node and then 642 * Use a helper variable to find the next node and then
650 * copy it back to hugetlb_next_nid afterwards: 643 * copy it back to hugetlb_next_nid afterwards:
651 * otherwise there's a window in which a racer might 644 * otherwise there's a window in which a racer might
652 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 645 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
653 * But we don't need to use a spin_lock here: it really 646 * But we don't need to use a spin_lock here: it really
654 * doesn't matter if occasionally a racer chooses the 647 * doesn't matter if occasionally a racer chooses the
655 * same nid as we do. Move nid forward in the mask even 648 * same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
875 * can no longer free unreserved surplus pages. This occurs when 868 * can no longer free unreserved surplus pages. This occurs when
876 * the nodes with surplus pages have no free pages. 869 * the nodes with surplus pages have no free pages.
877 */ 870 */
878 unsigned long remaining_iterations = num_online_nodes(); 871 unsigned long remaining_iterations = nr_online_nodes;
879 872
880 /* Uncommit the reservation */ 873 /* Uncommit the reservation */
881 h->resv_huge_pages -= unused_resv_pages; 874 h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
904 h->surplus_huge_pages--; 897 h->surplus_huge_pages--;
905 h->surplus_huge_pages_node[nid]--; 898 h->surplus_huge_pages_node[nid]--;
906 nr_pages--; 899 nr_pages--;
907 remaining_iterations = num_online_nodes(); 900 remaining_iterations = nr_online_nodes;
908 } 901 }
909 } 902 }
910} 903}
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140} 1133}
1141#endif 1134#endif
1142 1135
1136/*
1137 * Increment or decrement surplus_huge_pages. Keep node-specific counters
1138 * balanced by operating on them in a round-robin fashion.
1139 * Returns 1 if an adjustment was made.
1140 */
1141static int adjust_pool_surplus(struct hstate *h, int delta)
1142{
1143 static int prev_nid;
1144 int nid = prev_nid;
1145 int ret = 0;
1146
1147 VM_BUG_ON(delta != -1 && delta != 1);
1148 do {
1149 nid = next_node(nid, node_online_map);
1150 if (nid == MAX_NUMNODES)
1151 nid = first_node(node_online_map);
1152
1153 /* To shrink on this node, there must be a surplus page */
1154 if (delta < 0 && !h->surplus_huge_pages_node[nid])
1155 continue;
1156 /* Surplus cannot exceed the total number of pages */
1157 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
1158 h->nr_huge_pages_node[nid])
1159 continue;
1160
1161 h->surplus_huge_pages += delta;
1162 h->surplus_huge_pages_node[nid] += delta;
1163 ret = 1;
1164 break;
1165 } while (nid != prev_nid);
1166
1167 prev_nid = nid;
1168 return ret;
1169}
1170
1143#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1171#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1144static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1172static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1145{ 1173{
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
1#include <linux/mm_types.h>
2#include <linux/rbtree.h>
3#include <linux/rwsem.h>
4#include <linux/spinlock.h>
5#include <linux/list.h>
6#include <linux/cpumask.h>
7
8#include <asm/atomic.h>
9#include <asm/pgtable.h>
10
11struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT,
13 .pgd = swapper_pg_dir,
14 .mm_users = ATOMIC_INIT(2),
15 .mm_count = ATOMIC_INIT(1),
16 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL,
20};
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..f290c4db528b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19extern void prep_compound_page(struct page *page, unsigned long order);
20extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
21
22static inline void set_page_count(struct page *page, int v) 19static inline void set_page_count(struct page *page, int v)
23{ 20{
24 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
51 */ 48 */
52extern unsigned long highest_memmap_pfn; 49extern unsigned long highest_memmap_pfn;
53extern void __free_pages_bootmem(struct page *page, unsigned int order); 50extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order);
52
54 53
55/* 54/*
56 * function for dealing with page's order in buddy system. 55 * function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
74} 73}
75#endif 74#endif
76 75
77#ifdef CONFIG_UNEVICTABLE_LRU
78/* 76/*
79 * unevictable_migrate_page() called only from migrate_page_copy() to 77 * unevictable_migrate_page() called only from migrate_page_copy() to
80 * migrate unevictable flag to new page. 78 * migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
86 if (TestClearPageUnevictable(old)) 84 if (TestClearPageUnevictable(old))
87 SetPageUnevictable(new); 85 SetPageUnevictable(new);
88} 86}
89#else
90static inline void unevictable_migrate_page(struct page *new, struct page *old)
91{
92}
93#endif
94 87
95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT 88#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
96/* 89/*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
150 } 143 }
151} 144}
152 145
153/*
154 * free_page_mlock() -- clean up attempts to free and mlocked() page.
155 * Page should not be on lru, so no need to fix that up.
156 * free_pages_check() will verify...
157 */
158static inline void free_page_mlock(struct page *page)
159{
160 if (unlikely(TestClearPageMlocked(page))) {
161 unsigned long flags;
162
163 local_irq_save(flags);
164 __dec_zone_page_state(page, NR_MLOCK);
165 __count_vm_event(UNEVICTABLE_MLOCKFREED);
166 local_irq_restore(flags);
167 }
168}
169
170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 147static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
172{ 148{
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
175static inline void clear_page_mlock(struct page *page) { } 151static inline void clear_page_mlock(struct page *page) { }
176static inline void mlock_vma_page(struct page *page) { } 152static inline void mlock_vma_page(struct page *page) { }
177static inline void mlock_migrate_page(struct page *new, struct page *old) { } 153static inline void mlock_migrate_page(struct page *new, struct page *old) { }
178static inline void free_page_mlock(struct page *page) { }
179 154
180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 155#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
181 156
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
284 unsigned long start, int len, int flags, 259 unsigned long start, int len, int flags,
285 struct page **pages, struct vm_area_struct **vmas); 260 struct page **pages, struct vm_area_struct **vmas);
286 261
262#define ZONE_RECLAIM_NOSCAN -2
263#define ZONE_RECLAIM_FULL -1
264#define ZONE_RECLAIM_SOME 0
265#define ZONE_RECLAIM_SUCCESS 1
287#endif 266#endif
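
The new ZONE_RECLAIM_* constants give zone_reclaim() a richer return value than the old boolean. A sketch of how a caller might act on them, assuming the page allocator wants to distinguish "did not scan" from "scanned but failed"; the mm/page_alloc.c and mm/vmscan.c hunks that actually use these are elsewhere in this diff and not shown here:

    switch (zone_reclaim(zone, gfp_mask, order)) {
    case ZONE_RECLAIM_NOSCAN:       /* reclaim was not attempted */
    case ZONE_RECLAIM_FULL:         /* zone scanned recently, nothing reclaimable */
    case ZONE_RECLAIM_SOME:         /* some progress, but not enough */
            break;                  /* treat the zone as full for this allocation */
    case ZONE_RECLAIM_SUCCESS:
            /* enough was reclaimed: recheck the zone watermark */
            break;
    }
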
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
1#include <linux/gfp.h>
2#include <linux/mm_types.h>
3#include <linux/mm.h>
4#include <linux/slab.h>
5#include <linux/kmemcheck.h>
6
7void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
8{
9 struct page *shadow;
10 int pages;
11 int i;
12
13 pages = 1 << order;
14
15 /*
16 * With kmemcheck enabled, we need to allocate a memory area for the
17 * shadow bits as well.
18 */
19 shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
20 if (!shadow) {
21 if (printk_ratelimit())
22 printk(KERN_ERR "kmemcheck: failed to allocate "
23 "shadow bitmap\n");
24 return;
25 }
26
27 for(i = 0; i < pages; ++i)
28 page[i].shadow = page_address(&shadow[i]);
29
30 /*
31 * Mark it as non-present for the MMU so that our accesses to
32 * this memory will trigger a page fault and let us analyze
33 * the memory accesses.
34 */
35 kmemcheck_hide_pages(page, pages);
36}
37
38void kmemcheck_free_shadow(struct page *page, int order)
39{
40 struct page *shadow;
41 int pages;
42 int i;
43
44 if (!kmemcheck_page_is_tracked(page))
45 return;
46
47 pages = 1 << order;
48
49 kmemcheck_show_pages(page, pages);
50
51 shadow = virt_to_page(page[0].shadow);
52
53 for(i = 0; i < pages; ++i)
54 page[i].shadow = NULL;
55
56 __free_pages(shadow, order);
57}
58
59void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
60 size_t size)
61{
62 /*
63 * Has already been memset(), which initializes the shadow for us
64 * as well.
65 */
66 if (gfpflags & __GFP_ZERO)
67 return;
68
69 /* No need to initialize the shadow of a non-tracked slab. */
70 if (s->flags & SLAB_NOTRACK)
71 return;
72
73 if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
74 /*
75 * Allow notracked objects to be allocated from
76 * tracked caches. Note however that these objects
77 * will still get page faults on access, they just
78 * won't ever be flagged as uninitialized. If page
79 * faults are not acceptable, the slab cache itself
80 * should be marked NOTRACK.
81 */
82 kmemcheck_mark_initialized(object, size);
83 } else if (!s->ctor) {
84 /*
85 * New objects should be marked uninitialized before
86 * they're returned to the called.
87 */
88 kmemcheck_mark_uninitialized(object, size);
89 }
90}
91
92void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
93{
94 /* TODO: RCU freeing is unsupported for now; hide false positives. */
95 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
96 kmemcheck_mark_freed(object, size);
97}
98
99void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
100 gfp_t gfpflags)
101{
102 int pages;
103
104 if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
105 return;
106
107 pages = 1 << order;
108
109 /*
110 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
111 * can become uninitialized by copying uninitialized memory
112 * into them.
113 */
114
115 /* XXX: Can use zone->node for node? */
116 kmemcheck_alloc_shadow(page, order, gfpflags, -1);
117
118 if (gfpflags & __GFP_ZERO)
119 kmemcheck_mark_initialized_pages(page, pages);
120 else
121 kmemcheck_mark_uninitialized_pages(page, pages);
122}
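
kmemcheck.c is new: it allocates an untracked shadow page for every tracked page and updates object state as the slab layer hands objects out and takes them back. A sketch of the expected call sites; cachep, objp and size are illustrative names, and the real mm/page_alloc.c, mm/slab.c and mm/slub.c hooks live in other hunks of this diff:

    /* page allocator: shadow pages set up at allocation time ... */
    kmemcheck_pagealloc_alloc(page, order, gfp_mask);
    /* ... and torn down before the pages go back to the buddy allocator */
    kmemcheck_free_shadow(page, order);

    /* slab layer: mark object state on alloc/free */
    kmemcheck_slab_alloc(cachep, flags, objp, size);
    kmemcheck_slab_free(cachep, objp, size);
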
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 58ec86c9e58a..ec759b60077a 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -109,6 +109,9 @@
109 109
110#define BYTES_PER_POINTER sizeof(void *) 110#define BYTES_PER_POINTER sizeof(void *)
111 111
112/* GFP bitmask for kmemleak internal allocations */
113#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC)
114
112/* scanning area inside a memory block */ 115/* scanning area inside a memory block */
113struct kmemleak_scan_area { 116struct kmemleak_scan_area {
114 struct hlist_node node; 117 struct hlist_node node;
@@ -199,9 +202,9 @@ static DEFINE_MUTEX(kmemleak_mutex);
199static int reported_leaks; 202static int reported_leaks;
200 203
201/* 204/*
202 * Early object allocation/freeing logging. Kkmemleak is initialized after the 205 * Early object allocation/freeing logging. Kmemleak is initialized after the
203 * kernel allocator. However, both the kernel allocator and kmemleak may 206 * kernel allocator. However, both the kernel allocator and kmemleak may
204 * allocate memory blocks which need to be tracked. Kkmemleak defines an 207 * allocate memory blocks which need to be tracked. Kmemleak defines an
205 * arbitrary buffer to hold the allocation/freeing information before it is 208 * arbitrary buffer to hold the allocation/freeing information before it is
206 * fully initialized. 209 * fully initialized.
207 */ 210 */
@@ -245,10 +248,10 @@ static void kmemleak_disable(void);
245 248
246/* 249/*
247 * Macro invoked when a serious kmemleak condition occured and cannot be 250 * Macro invoked when a serious kmemleak condition occured and cannot be
248 * recovered from. Kkmemleak will be disabled and further allocation/freeing 251 * recovered from. Kmemleak will be disabled and further allocation/freeing
249 * tracing no longer available. 252 * tracing no longer available.
250 */ 253 */
251#define kmemleak_panic(x...) do { \ 254#define kmemleak_stop(x...) do { \
252 kmemleak_warn(x); \ 255 kmemleak_warn(x); \
253 kmemleak_disable(); \ 256 kmemleak_disable(); \
254} while (0) 257} while (0)
@@ -462,10 +465,10 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
462 struct prio_tree_node *node; 465 struct prio_tree_node *node;
463 struct stack_trace trace; 466 struct stack_trace trace;
464 467
465 object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK); 468 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
466 if (!object) { 469 if (!object) {
467 kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object " 470 kmemleak_stop("kmemleak: Cannot allocate a kmemleak_object "
468 "structure\n"); 471 "structure\n");
469 return; 472 return;
470 } 473 }
471 474
@@ -524,8 +527,8 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
524 if (node != &object->tree_node) { 527 if (node != &object->tree_node) {
525 unsigned long flags; 528 unsigned long flags;
526 529
527 kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object " 530 kmemleak_stop("kmemleak: Cannot insert 0x%lx into the object "
528 "search tree (already existing)\n", ptr); 531 "search tree (already existing)\n", ptr);
529 object = lookup_object(ptr, 1); 532 object = lookup_object(ptr, 1);
530 spin_lock_irqsave(&object->lock, flags); 533 spin_lock_irqsave(&object->lock, flags);
531 dump_object_info(object); 534 dump_object_info(object);
@@ -636,7 +639,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
636 return; 639 return;
637 } 640 }
638 641
639 area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK); 642 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
640 if (!area) { 643 if (!area) {
641 kmemleak_warn("kmemleak: Cannot allocate a scan area\n"); 644 kmemleak_warn("kmemleak: Cannot allocate a scan area\n");
642 goto out; 645 goto out;
@@ -696,7 +699,7 @@ static void log_early(int op_type, const void *ptr, size_t size,
696 struct early_log *log; 699 struct early_log *log;
697 700
698 if (crt_early_log >= ARRAY_SIZE(early_log)) { 701 if (crt_early_log >= ARRAY_SIZE(early_log)) {
699 kmemleak_panic("kmemleak: Early log buffer exceeded\n"); 702 kmemleak_stop("kmemleak: Early log buffer exceeded\n");
700 return; 703 return;
701 } 704 }
702 705
@@ -1404,7 +1407,7 @@ static int kmemleak_boot_config(char *str)
1404early_param("kmemleak", kmemleak_boot_config); 1407early_param("kmemleak", kmemleak_boot_config);
1405 1408
1406/* 1409/*
1407 * Kkmemleak initialization. 1410 * Kmemleak initialization.
1408 */ 1411 */
1409void __init kmemleak_init(void) 1412void __init kmemleak_init(void)
1410{ 1413{
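
Besides the Kkmemleak -> Kmemleak typo fixes and the rename of kmemleak_panic() to kmemleak_stop(), the notable change here is GFP_KMEMLEAK_MASK: kmemleak's own metadata allocations are now masked down to the bits present in GFP_KERNEL | GFP_ATOMIC instead of merely clearing GFP_SLAB_BUG_MASK. A one-line sketch of the effect (caller_gfp is an illustrative name):

    /* flags such as __GFP_HIGHMEM or __GFP_NOFAIL passed by the traced caller
     * are dropped before kmemleak performs its own kmem_cache_alloc() */
    gfp_t kmemleak_gfp = caller_gfp & GFP_KMEMLEAK_MASK;
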
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..76eb4193acdd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
123 end = vma->vm_end; 123 end = vma->vm_end;
124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
125 125
126 force_page_cache_readahead(file->f_mapping, 126 force_page_cache_readahead(file->f_mapping, file, start, end - start);
127 file, start, max_sane_readahead(end - start));
128 return 0; 127 return 0;
129} 128}
130 129
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
239 break; 238 break;
240 239
241 default: 240 default:
242 error = -EINVAL; 241 BUG();
243 break; 242 break;
244 } 243 }
245 return error; 244 return error;
246} 245}
247 246
247static int
248madvise_behavior_valid(int behavior)
249{
250 switch (behavior) {
251 case MADV_DOFORK:
252 case MADV_DONTFORK:
253 case MADV_NORMAL:
254 case MADV_SEQUENTIAL:
255 case MADV_RANDOM:
256 case MADV_REMOVE:
257 case MADV_WILLNEED:
258 case MADV_DONTNEED:
259 return 1;
260
261 default:
262 return 0;
263 }
264}
248/* 265/*
249 * The madvise(2) system call. 266 * The madvise(2) system call.
250 * 267 *
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
290 int write; 307 int write;
291 size_t len; 308 size_t len;
292 309
310 if (!madvise_behavior_valid(behavior))
311 return error;
312
293 write = madvise_need_mmap_write(behavior); 313 write = madvise_need_mmap_write(behavior);
294 if (write) 314 if (write)
295 down_write(&current->mm->mmap_sem); 315 down_write(&current->mm->mmap_sem);
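
With madvise_behavior_valid() doing the filtering up front, an unknown advice value is rejected before mmap_sem is taken or any VMA is walked, which is also why madvise_vma()'s default case can become BUG(). The early "return error" presumably relies on error being initialised to -EINVAL above the hunk shown. A userspace-side illustration; addr and len refer to an existing mapping, and 12345 is just an arbitrary invalid advice value:

    if (madvise(addr, len, 12345) != 0 && errno == EINVAL)
            fprintf(stderr, "unknown madvise behavior rejected early\n");
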
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78eb8552818b..e2fa20dadf40 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,7 +45,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 45#define MEM_CGROUP_RECLAIM_RETRIES 5
46 46
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ 48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
49int do_swap_account __read_mostly; 49int do_swap_account __read_mostly;
50static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 50static int really_do_swap_account __initdata = 1; /* for remember boot option*/
51#else 51#else
@@ -62,7 +62,8 @@ enum mem_cgroup_stat_index {
62 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 62 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
63 */ 63 */
64 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 64 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
65 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ 65 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
66 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
67 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
68 69
@@ -176,6 +177,9 @@ struct mem_cgroup {
176 177
177 unsigned int swappiness; 178 unsigned int swappiness;
178 179
180 /* set when res.limit == memsw.limit */
181 bool memsw_is_minimum;
182
179 /* 183 /*
180 * statistics. This must be placed at the end of memcg. 184 * statistics. This must be placed at the end of memcg.
181 */ 185 */
@@ -188,6 +192,7 @@ enum charge_type {
188 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 192 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
189 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 193 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
190 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 194 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
195 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
191 NR_CHARGE_TYPE, 196 NR_CHARGE_TYPE,
192}; 197};
193 198
@@ -570,6 +575,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
570 return 0; 575 return 0;
571} 576}
572 577
578int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
579{
580 unsigned long active;
581 unsigned long inactive;
582
583 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
584 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
585
586 return (active > inactive);
587}
588
573unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 589unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
574 struct zone *zone, 590 struct zone *zone,
575 enum lru_list lru) 591 enum lru_list lru)
@@ -633,6 +649,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
633 int zid = zone_idx(z); 649 int zid = zone_idx(z);
634 struct mem_cgroup_per_zone *mz; 650 struct mem_cgroup_per_zone *mz;
635 int lru = LRU_FILE * !!file + !!active; 651 int lru = LRU_FILE * !!file + !!active;
652 int ret;
636 653
637 BUG_ON(!mem_cont); 654 BUG_ON(!mem_cont);
638 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 655 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
@@ -650,9 +667,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
650 continue; 667 continue;
651 668
652 scan++; 669 scan++;
653 if (__isolate_lru_page(page, mode, file) == 0) { 670 ret = __isolate_lru_page(page, mode, file);
671 switch (ret) {
672 case 0:
654 list_move(&page->lru, dst); 673 list_move(&page->lru, dst);
674 mem_cgroup_del_lru(page);
655 nr_taken++; 675 nr_taken++;
676 break;
677 case -EBUSY:
678 /* we don't affect global LRU but rotate in our LRU */
679 mem_cgroup_rotate_lru_list(page, page_lru(page));
680 break;
681 default:
682 break;
656 } 683 }
657 } 684 }
658 685
@@ -834,6 +861,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
834 int ret, total = 0; 861 int ret, total = 0;
835 int loop = 0; 862 int loop = 0;
836 863
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum)
866 noswap = true;
867
837 while (loop < 2) { 868 while (loop < 2) {
838 victim = mem_cgroup_select_victim(root_mem); 869 victim = mem_cgroup_select_victim(root_mem);
839 if (victim == root_mem) 870 if (victim == root_mem)
@@ -889,6 +920,44 @@ static void record_last_oom(struct mem_cgroup *mem)
889 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 920 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
890} 921}
891 922
923/*
924 * Currently used to update mapped file statistics, but the routine can be
925 * generalized to update other statistics as well.
926 */
927void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
928{
929 struct mem_cgroup *mem;
930 struct mem_cgroup_stat *stat;
931 struct mem_cgroup_stat_cpu *cpustat;
932 int cpu;
933 struct page_cgroup *pc;
934
935 if (!page_is_file_cache(page))
936 return;
937
938 pc = lookup_page_cgroup(page);
939 if (unlikely(!pc))
940 return;
941
942 lock_page_cgroup(pc);
943 mem = pc->mem_cgroup;
944 if (!mem)
945 goto done;
946
947 if (!PageCgroupUsed(pc))
948 goto done;
949
950 /*
951 * Preemption is already disabled, we don't need get_cpu()
952 */
953 cpu = smp_processor_id();
954 stat = &mem->stat;
955 cpustat = &stat->cpustat[cpu];
956
957 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
958done:
959 unlock_page_cgroup(pc);
960}
892 961
893/* 962/*
894 * Unlike exported interface, "oom" parameter is added. if oom==true, 963 * Unlike exported interface, "oom" parameter is added. if oom==true,
@@ -1087,6 +1156,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1087 struct mem_cgroup_per_zone *from_mz, *to_mz; 1156 struct mem_cgroup_per_zone *from_mz, *to_mz;
1088 int nid, zid; 1157 int nid, zid;
1089 int ret = -EBUSY; 1158 int ret = -EBUSY;
1159 struct page *page;
1160 int cpu;
1161 struct mem_cgroup_stat *stat;
1162 struct mem_cgroup_stat_cpu *cpustat;
1090 1163
1091 VM_BUG_ON(from == to); 1164 VM_BUG_ON(from == to);
1092 VM_BUG_ON(PageLRU(pc->page)); 1165 VM_BUG_ON(PageLRU(pc->page));
@@ -1107,6 +1180,23 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1107 1180
1108 res_counter_uncharge(&from->res, PAGE_SIZE); 1181 res_counter_uncharge(&from->res, PAGE_SIZE);
1109 mem_cgroup_charge_statistics(from, pc, false); 1182 mem_cgroup_charge_statistics(from, pc, false);
1183
1184 page = pc->page;
1185 if (page_is_file_cache(page) && page_mapped(page)) {
1186 cpu = smp_processor_id();
1187 /* Update mapped_file data for mem_cgroup "from" */
1188 stat = &from->stat;
1189 cpustat = &stat->cpustat[cpu];
1190 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1191 -1);
1192
1193 /* Update mapped_file data for mem_cgroup "to" */
1194 stat = &to->stat;
1195 cpustat = &stat->cpustat[cpu];
1196 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1197 1);
1198 }
1199
1110 if (do_swap_account) 1200 if (do_swap_account)
1111 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1201 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1112 css_put(&from->css); 1202 css_put(&from->css);
@@ -1422,6 +1512,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1422 1512
1423 switch (ctype) { 1513 switch (ctype) {
1424 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 1514 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1515 case MEM_CGROUP_CHARGE_TYPE_DROP:
1425 if (page_mapped(page)) 1516 if (page_mapped(page))
1426 goto unlock_out; 1517 goto unlock_out;
1427 break; 1518 break;
@@ -1485,18 +1576,23 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1485 * called after __delete_from_swap_cache() and drop "page" account. 1576 * called after __delete_from_swap_cache() and drop "page" account.
1486 * memcg information is recorded to swap_cgroup of "ent" 1577 * memcg information is recorded to swap_cgroup of "ent"
1487 */ 1578 */
1488void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) 1579void
1580mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
1489{ 1581{
1490 struct mem_cgroup *memcg; 1582 struct mem_cgroup *memcg;
1583 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
1584
1585 if (!swapout) /* this was a swap cache but the swap is unused ! */
1586 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
1587
1588 memcg = __mem_cgroup_uncharge_common(page, ctype);
1491 1589
1492 memcg = __mem_cgroup_uncharge_common(page,
1493 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1494 /* record memcg information */ 1590 /* record memcg information */
1495 if (do_swap_account && memcg) { 1591 if (do_swap_account && swapout && memcg) {
1496 swap_cgroup_record(ent, css_id(&memcg->css)); 1592 swap_cgroup_record(ent, css_id(&memcg->css));
1497 mem_cgroup_get(memcg); 1593 mem_cgroup_get(memcg);
1498 } 1594 }
1499 if (memcg) 1595 if (swapout && memcg)
1500 css_put(&memcg->css); 1596 css_put(&memcg->css);
1501} 1597}
1502#endif 1598#endif
@@ -1674,6 +1770,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1674 break; 1770 break;
1675 } 1771 }
1676 ret = res_counter_set_limit(&memcg->res, val); 1772 ret = res_counter_set_limit(&memcg->res, val);
1773 if (!ret) {
1774 if (memswlimit == val)
1775 memcg->memsw_is_minimum = true;
1776 else
1777 memcg->memsw_is_minimum = false;
1778 }
1677 mutex_unlock(&set_limit_mutex); 1779 mutex_unlock(&set_limit_mutex);
1678 1780
1679 if (!ret) 1781 if (!ret)
@@ -1692,16 +1794,14 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1692 return ret; 1794 return ret;
1693} 1795}
1694 1796
1695int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1797static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1696 unsigned long long val) 1798 unsigned long long val)
1697{ 1799{
1698 int retry_count; 1800 int retry_count;
1699 u64 memlimit, oldusage, curusage; 1801 u64 memlimit, oldusage, curusage;
1700 int children = mem_cgroup_count_children(memcg); 1802 int children = mem_cgroup_count_children(memcg);
1701 int ret = -EBUSY; 1803 int ret = -EBUSY;
1702 1804
1703 if (!do_swap_account)
1704 return -EINVAL;
1705 /* see mem_cgroup_resize_res_limit */ 1805 /* see mem_cgroup_resize_res_limit */
1706 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 1806 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1707 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1807 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -1723,6 +1823,12 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1723 break; 1823 break;
1724 } 1824 }
1725 ret = res_counter_set_limit(&memcg->memsw, val); 1825 ret = res_counter_set_limit(&memcg->memsw, val);
1826 if (!ret) {
1827 if (memlimit == val)
1828 memcg->memsw_is_minimum = true;
1829 else
1830 memcg->memsw_is_minimum = false;
1831 }
1726 mutex_unlock(&set_limit_mutex); 1832 mutex_unlock(&set_limit_mutex);
1727 1833
1728 if (!ret) 1834 if (!ret)
@@ -1936,8 +2042,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1936 val = res_counter_read_u64(&mem->res, name); 2042 val = res_counter_read_u64(&mem->res, name);
1937 break; 2043 break;
1938 case _MEMSWAP: 2044 case _MEMSWAP:
1939 if (do_swap_account) 2045 val = res_counter_read_u64(&mem->memsw, name);
1940 val = res_counter_read_u64(&mem->memsw, name);
1941 break; 2046 break;
1942 default: 2047 default:
1943 BUG(); 2048 BUG();
@@ -2035,6 +2140,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2035enum { 2140enum {
2036 MCS_CACHE, 2141 MCS_CACHE,
2037 MCS_RSS, 2142 MCS_RSS,
2143 MCS_MAPPED_FILE,
2038 MCS_PGPGIN, 2144 MCS_PGPGIN,
2039 MCS_PGPGOUT, 2145 MCS_PGPGOUT,
2040 MCS_INACTIVE_ANON, 2146 MCS_INACTIVE_ANON,
@@ -2055,6 +2161,7 @@ struct {
2055} memcg_stat_strings[NR_MCS_STAT] = { 2161} memcg_stat_strings[NR_MCS_STAT] = {
2056 {"cache", "total_cache"}, 2162 {"cache", "total_cache"},
2057 {"rss", "total_rss"}, 2163 {"rss", "total_rss"},
2164 {"mapped_file", "total_mapped_file"},
2058 {"pgpgin", "total_pgpgin"}, 2165 {"pgpgin", "total_pgpgin"},
2059 {"pgpgout", "total_pgpgout"}, 2166 {"pgpgout", "total_pgpgout"},
2060 {"inactive_anon", "total_inactive_anon"}, 2167 {"inactive_anon", "total_inactive_anon"},
@@ -2075,6 +2182,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2075 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2182 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2076 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2183 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2077 s->stat[MCS_RSS] += val * PAGE_SIZE; 2184 s->stat[MCS_RSS] += val * PAGE_SIZE;
2185 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
2186 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
2078 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2187 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2079 s->stat[MCS_PGPGIN] += val; 2188 s->stat[MCS_PGPGIN] += val;
2080 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2189 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
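
The memcontrol.c hunks add per-cgroup accounting of mapped file pages (MEM_CGROUP_STAT_MAPPED_FILE), keep it consistent when a charge moves between cgroups, and export it as the mapped_file / total_mapped_file lines of memory.stat. mem_cgroup_update_mapped_file_stat() itself is expected to be driven from the rmap side; a sketch of those call sites (the actual mm/rmap.c hunk is elsewhere in this diff and not shown here):

    mem_cgroup_update_mapped_file_stat(page, 1);    /* file page gets its first mapping,
                                                       e.g. from page_add_file_rmap() */
    mem_cgroup_update_mapped_file_stat(page, -1);   /* file page loses its last mapping,
                                                       e.g. from page_remove_rmap() */
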
diff --git a/mm/memory.c b/mm/memory.c
index 4126dd16778c..d5d1653d60a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1360 return i; 1360 return i;
1361} 1361}
1362 1362
1363/**
1364 * get_user_pages() - pin user pages in memory
1365 * @tsk: task_struct of target task
1366 * @mm: mm_struct of target mm
1367 * @start: starting user address
1368 * @len: number of pages from start to pin
1369 * @write: whether pages will be written to by the caller
1370 * @force: whether to force write access even if user mapping is
1371 * readonly. This will result in the page being COWed even
1372 * in MAP_SHARED mappings. You do not want this.
1373 * @pages: array that receives pointers to the pages pinned.
1374 * Should be at least nr_pages long. Or NULL, if caller
1375 * only intends to ensure the pages are faulted in.
1376 * @vmas: array of pointers to vmas corresponding to each page.
1377 * Or NULL if the caller does not require them.
1378 *
1379 * Returns number of pages pinned. This may be fewer than the number
1380 * requested. If len is 0 or negative, returns 0. If no pages
1381 * were pinned, returns -errno. Each page returned must be released
1382 * with a put_page() call when it is finished with. vmas will only
1383 * remain valid while mmap_sem is held.
1384 *
1385 * Must be called with mmap_sem held for read or write.
1386 *
1387 * get_user_pages walks a process's page tables and takes a reference to
1388 * each struct page that each user address corresponds to at a given
1389 * instant. That is, it takes the page that would be accessed if a user
1390 * thread accesses the given user virtual address at that instant.
1391 *
1392 * This does not guarantee that the page exists in the user mappings when
1393 * get_user_pages returns, and there may even be a completely different
1394 * page there in some cases (eg. if mmapped pagecache has been invalidated
1395 * and subsequently re faulted). However it does guarantee that the page
1396 * won't be freed completely. And mostly callers simply care that the page
1397 * contains data that was valid *at some point in time*. Typically, an IO
1398 * or similar operation cannot guarantee anything stronger anyway because
1399 * locks can't be held over the syscall boundary.
1400 *
1401 * If write=0, the page must not be written to. If the page is written to,
1402 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
1403 * after the page is finished with, and before put_page is called.
1404 *
1405 * get_user_pages is typically used for fewer-copy IO operations, to get a
1406 * handle on the memory by some means other than accesses via the user virtual
1407 * addresses. The pages may be submitted for DMA to devices or accessed via
1408 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1409 * use the correct cache flushing APIs.
1410 *
1411 * See also get_user_pages_fast, for performance critical applications.
1412 */
1363int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1364 unsigned long start, int len, int write, int force, 1414 unsigned long start, int len, int write, int force,
1365 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas)
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
3053 3103
3054#endif /* __HAVE_ARCH_GATE_AREA */ 3104#endif /* __HAVE_ARCH_GATE_AREA */
3055 3105
3056#ifdef CONFIG_HAVE_IOREMAP_PROT 3106static int follow_pte(struct mm_struct *mm, unsigned long address,
3057int follow_phys(struct vm_area_struct *vma, 3107 pte_t **ptepp, spinlock_t **ptlp)
3058 unsigned long address, unsigned int flags,
3059 unsigned long *prot, resource_size_t *phys)
3060{ 3108{
3061 pgd_t *pgd; 3109 pgd_t *pgd;
3062 pud_t *pud; 3110 pud_t *pud;
3063 pmd_t *pmd; 3111 pmd_t *pmd;
3064 pte_t *ptep, pte; 3112 pte_t *ptep;
3065 spinlock_t *ptl;
3066 resource_size_t phys_addr = 0;
3067 struct mm_struct *mm = vma->vm_mm;
3068 int ret = -EINVAL;
3069
3070 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3071 goto out;
3072 3113
3073 pgd = pgd_offset(mm, address); 3114 pgd = pgd_offset(mm, address);
3074 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3115 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
3086 if (pmd_huge(*pmd)) 3127 if (pmd_huge(*pmd))
3087 goto out; 3128 goto out;
3088 3129
3089 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 3130 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3090 if (!ptep) 3131 if (!ptep)
3091 goto out; 3132 goto out;
3133 if (!pte_present(*ptep))
3134 goto unlock;
3135 *ptepp = ptep;
3136 return 0;
3137unlock:
3138 pte_unmap_unlock(ptep, *ptlp);
3139out:
3140 return -EINVAL;
3141}
3092 3142
3143/**
3144 * follow_pfn - look up PFN at a user virtual address
3145 * @vma: memory mapping
3146 * @address: user virtual address
3147 * @pfn: location to store found PFN
3148 *
3149 * Only IO mappings and raw PFN mappings are allowed.
3150 *
3151 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3152 */
3153int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3154 unsigned long *pfn)
3155{
3156 int ret = -EINVAL;
3157 spinlock_t *ptl;
3158 pte_t *ptep;
3159
3160 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3161 return ret;
3162
3163 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3164 if (ret)
3165 return ret;
3166 *pfn = pte_pfn(*ptep);
3167 pte_unmap_unlock(ptep, ptl);
3168 return 0;
3169}
3170EXPORT_SYMBOL(follow_pfn);
3171
3172#ifdef CONFIG_HAVE_IOREMAP_PROT
3173int follow_phys(struct vm_area_struct *vma,
3174 unsigned long address, unsigned int flags,
3175 unsigned long *prot, resource_size_t *phys)
3176{
3177 int ret = -EINVAL;
3178 pte_t *ptep, pte;
3179 spinlock_t *ptl;
3180
3181 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3182 goto out;
3183
3184 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3185 goto out;
3093 pte = *ptep; 3186 pte = *ptep;
3094 if (!pte_present(pte)) 3187
3095 goto unlock;
3096 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3188 if ((flags & FOLL_WRITE) && !pte_write(pte))
3097 goto unlock; 3189 goto unlock;
3098 phys_addr = pte_pfn(pte);
3099 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
3100 3190
3101 *prot = pgprot_val(pte_pgprot(pte)); 3191 *prot = pgprot_val(pte_pgprot(pte));
3102 *phys = phys_addr; 3192 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3103 ret = 0;
3104 3193
3194 ret = 0;
3105unlock: 3195unlock:
3106 pte_unmap_unlock(ptep, ptl); 3196 pte_unmap_unlock(ptep, ptl);
3107out: 3197out:
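
The memory.c hunks document get_user_pages(), factor the page-table walk out into follow_pte(), and export a new follow_pfn() helper for VM_IO / VM_PFNMAP mappings. An illustrative driver-side use of follow_pfn(); addr is an assumed user virtual address inside such a mapping, and the VMA lookup and locking shown are the caller's responsibility:

    unsigned long pfn;
    struct vm_area_struct *vma;

    down_read(&current->mm->mmap_sem);
    vma = find_vma(current->mm, addr);
    if (vma && addr >= vma->vm_start && follow_pfn(vma, addr, &pfn) == 0)
            pr_info("virt %#lx maps to pfn %#lx\n", addr, pfn);
    up_read(&current->mm->mmap_sem);
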
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c083cf5fd6df..e4412a676c88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 422 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 423 zone->zone_pgdat->node_present_pages += onlined_pages;
424 424
425 setup_per_zone_pages_min(); 425 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone);
426 if (onlined_pages) { 427 if (onlined_pages) {
427 kswapd_run(zone_to_nid(zone)); 428 kswapd_run(zone_to_nid(zone));
428 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 429 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
832 totalram_pages -= offlined_pages; 833 totalram_pages -= offlined_pages;
833 num_physpages -= offlined_pages; 834 num_physpages -= offlined_pages;
834 835
836 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone);
838
835 vm_total_pages = nr_free_pagecache_pages(); 839 vm_total_pages = nr_free_pagecache_pages();
836 writeback_set_ratelimit(); 840 writeback_set_ratelimit();
837 841
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..e08e2c4da63a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
182 return 0; 182 return 0;
183} 183}
184 184
185/* Create a new policy */ 185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy. mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags. But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
195{
196 nodemask_t cpuset_context_nmask;
197 int ret;
198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL)
201 return 0;
202
203 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */
206 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
209 &cpuset_current_mems_allowed);
210 else
211 nodes_and(cpuset_context_nmask, *nodes,
212 cpuset_current_mems_allowed);
213 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes;
215 else
216 pol->w.cpuset_mems_allowed =
217 cpuset_current_mems_allowed;
218 }
219
220 ret = mpol_ops[pol->mode].create(pol,
221 nodes ? &cpuset_context_nmask : NULL);
222 return ret;
223}
224
225/*
226 * This function just creates a new policy, does some check and simple
227 * initialization. You must invoke mpol_set_nodemask() to set nodes.
228 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 229static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes) 230 nodemask_t *nodes)
188{ 231{
189 struct mempolicy *policy; 232 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192 233
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 234 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 235 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
210 if (((flags & MPOL_F_STATIC_NODES) || 251 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES))) 252 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL); 253 return ERR_PTR(-EINVAL);
213 nodes = NULL; /* flag local alloc */
214 } 254 }
215 } else if (nodes_empty(*nodes)) 255 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL); 256 return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
221 policy->mode = mode; 261 policy->mode = mode;
222 policy->flags = flags; 262 policy->flags = flags;
223 263
224 if (nodes) {
225 /*
226 * cpuset related setup doesn't apply to local allocation
227 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy; 264 return policy;
249} 265}
250 266
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
324/* 340/*
325 * Wrapper for mpol_rebind_policy() that just requires task 341 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy. 342 * pointer, and updates task mempolicy.
343 *
344 * Called with task's alloc_lock held.
327 */ 345 */
328 346
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 347void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
600static long do_set_mempolicy(unsigned short mode, unsigned short flags, 618static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes) 619 nodemask_t *nodes)
602{ 620{
603 struct mempolicy *new; 621 struct mempolicy *new, *old;
604 struct mm_struct *mm = current->mm; 622 struct mm_struct *mm = current->mm;
623 int ret;
605 624
606 new = mpol_new(mode, flags, nodes); 625 new = mpol_new(mode, flags, nodes);
607 if (IS_ERR(new)) 626 if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
615 */ 634 */
616 if (mm) 635 if (mm)
617 down_write(&mm->mmap_sem); 636 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy); 637 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes);
639 if (ret) {
640 task_unlock(current);
641 if (mm)
642 up_write(&mm->mmap_sem);
643 mpol_put(new);
644 return ret;
645 }
646 old = current->mempolicy;
619 current->mempolicy = new; 647 current->mempolicy = new;
620 mpol_set_task_struct_flag(); 648 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE && 649 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes)) 650 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes); 651 current->il_next = first_node(new->v.nodes);
652 task_unlock(current);
624 if (mm) 653 if (mm)
625 up_write(&mm->mmap_sem); 654 up_write(&mm->mmap_sem);
626 655
656 mpol_put(old);
627 return 0; 657 return 0;
628} 658}
629 659
630/* 660/*
631 * Return nodemask for policy for get_mempolicy() query 661 * Return nodemask for policy for get_mempolicy() query
662 *
663 * Called with task's alloc_lock held
632 */ 664 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 665static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{ 666{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 struct vm_area_struct *vma = NULL; 706 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy; 707 struct mempolicy *pol = current->mempolicy;
676 708
677 cpuset_update_task_memory_state();
678 if (flags & 709 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 710 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL; 711 return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 714 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL; 715 return -EINVAL;
685 *policy = 0; /* just so it's initialized */ 716 *policy = 0; /* just so it's initialized */
717 task_lock(current);
686 *nmask = cpuset_current_mems_allowed; 718 *nmask = cpuset_current_mems_allowed;
719 task_unlock(current);
687 return 0; 720 return 0;
688 } 721 }
689 722
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
738 } 771 }
739 772
740 err = 0; 773 err = 0;
741 if (nmask) 774 if (nmask) {
775 task_lock(current);
742 get_policy_nodemask(pol, nmask); 776 get_policy_nodemask(pol, nmask);
777 task_unlock(current);
778 }
743 779
744 out: 780 out:
745 mpol_cond_put(pol); 781 mpol_cond_put(pol);
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 803
768static struct page *new_node_page(struct page *page, unsigned long node, int **x) 804static struct page *new_node_page(struct page *page, unsigned long node, int **x)
769{ 805{
770 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); 806 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
771} 807}
772 808
773/* 809/*
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
979 return err; 1015 return err;
980 } 1016 }
981 down_write(&mm->mmap_sem); 1017 down_write(&mm->mmap_sem);
1018 task_lock(current);
1019 err = mpol_set_nodemask(new, nmask);
1020 task_unlock(current);
1021 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new);
1024 return err;
1025 }
982 vma = check_range(mm, start, end, nmask, 1026 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist); 1027 flags | MPOL_MF_INVERT, &pagelist);
984 1028
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1545 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1589 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 struct zonelist *zl; 1590 struct zonelist *zl;
1547 1591
1548 cpuset_update_task_memory_state();
1549
1550 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1592 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 unsigned nid; 1593 unsigned nid;
1552 1594
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1593{ 1635{
1594 struct mempolicy *pol = current->mempolicy; 1636 struct mempolicy *pol = current->mempolicy;
1595 1637
1596 if ((gfp & __GFP_WAIT) && !in_interrupt())
1597 cpuset_update_task_memory_state();
1598 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1638 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 pol = &default_policy; 1639 pol = &default_policy;
1600 1640
@@ -1854,6 +1894,8 @@ restart:
1854 */ 1894 */
1855void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856{ 1896{
1897 int ret;
1898
1857 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 1899 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 spin_lock_init(&sp->lock); 1900 spin_lock_init(&sp->lock);
1859 1901
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1863 1905
1864 /* contextualize the tmpfs mount point mempolicy */ 1906 /* contextualize the tmpfs mount point mempolicy */
1865 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866 mpol_put(mpol); /* drop our ref on sb mpol */ 1908 if (IS_ERR(new)) {
1867 if (IS_ERR(new)) 1909 mpol_put(mpol); /* drop our ref on sb mpol */
1868 return; /* no valid nodemask intersection */ 1910 return; /* no valid nodemask intersection */
1911 }
1912
1913 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
1915 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) {
1918 mpol_put(new);
1919 return;
1920 }
1869 1921
1870 /* Create pseudo-vma that contains just the policy */ 1922 /* Create pseudo-vma that contains just the policy */
1871 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1923 memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2086 new = mpol_new(mode, mode_flags, &nodes); 2138 new = mpol_new(mode, mode_flags, &nodes);
2087 if (IS_ERR(new)) 2139 if (IS_ERR(new))
2088 err = 1; 2140 err = 1;
2089 else if (no_context) 2141 else {
2090 new->w.user_nodemask = nodes; /* save for contextualization */ 2142 int ret;
2143
2144 task_lock(current);
2145 ret = mpol_set_nodemask(new, &nodes);
2146 task_unlock(current);
2147 if (ret)
2148 err = 1;
2149 else if (no_context) {
2150 /* save for contextualization */
2151 new->w.user_nodemask = nodes;
2152 }
2153 }
2091 2154
2092out: 2155out:
2093 /* Restore string for error message */ 2156 /* Restore string for error message */
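The recurring pattern in this file is now mpol_new() followed by mpol_set_nodemask() under task_lock(); a condensed sketch of that two-step create-then-contextualize flow (it would have to live inside mm/mempolicy.c, since mpol_set_nodemask() is static there, and the function name is illustrative):

static struct mempolicy *example_create_policy(unsigned short mode,
					       unsigned short flags,
					       nodemask_t *nodes)
{
	struct mempolicy *new;
	int err;

	new = mpol_new(mode, flags, nodes);	/* validates mode/flags only */
	if (IS_ERR(new))
		return new;

	task_lock(current);			/* protects mems_allowed */
	err = mpol_set_nodemask(new, nodes);	/* cpuset-contextualize nodes */
	task_unlock(current);
	if (err) {
		mpol_put(new);
		return ERR_PTR(err);
	}
	return new;	/* caller installs it, as in do_set_mempolicy() */
}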
diff --git a/mm/migrate.c b/mm/migrate.c
index 068655d8f883..939888f9ddab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
802 802
803 *result = &pm->status; 803 *result = &pm->status;
804 804
805 return alloc_pages_node(pm->node, 805 return alloc_pages_exact_node(pm->node,
806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
807} 807}
808 808
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
820 struct page_to_node *pp; 820 struct page_to_node *pp;
821 LIST_HEAD(pagelist); 821 LIST_HEAD(pagelist);
822 822
823 migrate_prep();
824 down_read(&mm->mmap_sem); 823 down_read(&mm->mmap_sem);
825 824
826 /* 825 /*
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 906 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
908 if (!pm) 907 if (!pm)
909 goto out; 908 goto out;
909
910 migrate_prep();
911
910 /* 912 /*
911 * Store a chunk of page_to_node array in a page, 913 * Store a chunk of page_to_node array in a page,
912 * but keep the last one as a marker 914 * but keep the last one as a marker
diff --git a/mm/mlock.c b/mm/mlock.c
index ac130433c7d3..45eb650b9654 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -31,7 +31,6 @@ int can_do_mlock(void)
31} 31}
32EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
33 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/* 34/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing 35 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate 36 * in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval)
261 return retval; 260 return retval;
262} 261}
263 262
264#else /* CONFIG_UNEVICTABLE_LRU */
265
266/*
267 * Just make pages present if VM_LOCKED. No-op if unlocking.
268 */
269static long __mlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end,
271 int mlock)
272{
273 if (mlock && (vma->vm_flags & VM_LOCKED))
274 return make_pages_present(start, end);
275 return 0;
276}
277
278static inline int __mlock_posix_error_return(long retval)
279{
280 return 0;
281}
282
283#endif /* CONFIG_UNEVICTABLE_LRU */
284
285/** 263/**
286 * mlock_vma_pages_range() - mlock pages in specified vma range. 264 * mlock_vma_pages_range() - mlock pages in specified vma range.
287 * @vma - the vma containing the specified address range 265 * @vma - the vma containing the specified address range
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..175a67a78a99 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
61 62
62 task_lock(p); 63 task_lock(p);
63 mm = p->mm; 64 mm = p->mm;
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
65 task_unlock(p); 66 task_unlock(p);
66 return 0; 67 return 0;
67 } 68 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
68 74
69 /* 75 /*
70 * The memory size of the process is the basis for the badness. 76 * The memory size of the process is the basis for the badness.
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
148 points /= 8; 154 points /= 8;
149 155
150 /* 156 /*
151 * Adjust the score by oomkilladj. 157 * Adjust the score by oom_adj.
152 */ 158 */
153 if (p->oomkilladj) { 159 if (oom_adj) {
154 if (p->oomkilladj > 0) { 160 if (oom_adj > 0) {
155 if (!points) 161 if (!points)
156 points = 1; 162 points = 1;
157 points <<= p->oomkilladj; 163 points <<= oom_adj;
158 } else 164 } else
159 points >>= -(p->oomkilladj); 165 points >>= -(oom_adj);
160 } 166 }
161 167
162#ifdef DEBUG 168#ifdef DEBUG
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 257 *ppoints = ULONG_MAX;
252 } 258 }
253 259
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
257 points = badness(p, uptime.tv_sec); 260 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 261 if (points > *ppoints) {
259 chosen = p; 262 chosen = p;
260 *ppoints = points; 263 *ppoints = points;
261 } 264 }
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 307 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
308 p->comm);
309 task_unlock(p); 311 task_unlock(p);
310 } while_each_thread(g, p); 312 } while_each_thread(g, p);
311} 313}
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
323 return; 325 return;
324 } 326 }
325 327
326 if (!p->mm) { 328 if (!p->mm)
327 WARN_ON(1);
328 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 329 return;
330 }
331 330
332 if (verbose) 331 if (verbose)
333 printk(KERN_ERR "Killed process %d (%s)\n", 332 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p)
349 struct mm_struct *mm; 348 struct mm_struct *mm;
350 struct task_struct *g, *q; 349 struct task_struct *g, *q;
351 350
351 task_lock(p);
352 mm = p->mm; 352 mm = p->mm;
353 353 if (!mm || mm->oom_adj == OOM_DISABLE) {
354 /* WARNING: mm may not be dereferenced since we did not obtain its 354 task_unlock(p);
355 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below.
357 *
358 * Furthermore, even if mm contains a non-NULL value, p->mm may
359 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us.
361 */
362
363 if (mm == NULL)
364 return 1; 355 return 1;
365 356 }
366 /* 357 task_unlock(p);
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 358 __oom_kill_task(p, 1);
375 359
376 /* 360 /*
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
393 struct task_struct *c; 377 struct task_struct *c;
394 378
395 if (printk_ratelimit()) { 379 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj);
399 task_lock(current); 380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
383 current->comm, gfp_mask, order,
384 current->mm ? current->mm->oom_adj : OOM_DISABLE);
400 cpuset_print_task_mems_allowed(current); 385 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 386 task_unlock(current);
402 dump_stack(); 387 dump_stack();
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
409 /* 394 /*
410 * If the task is already exiting, don't alarm the sysadmin or kill 395 * If the task is already exiting, don't alarm the sysadmin or kill
411 * its children or threads, just set TIF_MEMDIE so it can die quickly 396 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
412 */ 398 */
413 if (p->flags & PF_EXITING) { 399 if (p->mm && (p->flags & PF_EXITING)) {
414 __oom_kill_task(p, 0); 400 __oom_kill_task(p, 0);
415 return 0; 401 return 0;
416 } 402 }
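The score adjustment above is a plain bit shift of the badness value; a worked illustration with arbitrary numbers:

	/*
	 * points = 1000, oom_adj = +2  ->  points <<= 2  -> 4000 (preferred victim)
	 * points = 1000, oom_adj = -3  ->  points >>= 3  ->  125 (spared longer)
	 * oom_adj == OOM_DISABLE       ->  badness() returns 0 before the shift
	 */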
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bb553c3e955d..7b0dcea4935b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
265 * This avoids exceeding the total dirty_limit when the floating averages 265 * This avoids exceeding the total dirty_limit when the floating averages
266 * fluctuate too quickly. 266 * fluctuate too quickly.
267 */ 267 */
268static void 268static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
269clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) 269 unsigned long dirty, unsigned long *pbdi_dirty)
270{ 270{
271 long avail_dirty; 271 unsigned long avail_dirty;
272 272
273 avail_dirty = dirty - 273 avail_dirty = global_page_state(NR_FILE_DIRTY) +
274 (global_page_state(NR_FILE_DIRTY) +
275 global_page_state(NR_WRITEBACK) + 274 global_page_state(NR_WRITEBACK) +
276 global_page_state(NR_UNSTABLE_NFS) + 275 global_page_state(NR_UNSTABLE_NFS) +
277 global_page_state(NR_WRITEBACK_TEMP)); 276 global_page_state(NR_WRITEBACK_TEMP);
278 277
279 if (avail_dirty < 0) 278 if (avail_dirty < dirty)
279 avail_dirty = dirty - avail_dirty;
280 else
280 avail_dirty = 0; 281 avail_dirty = 0;
281 282
282 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + 283 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
299 * 300 *
300 * dirty -= (dirty/8) * p_{t} 301 * dirty -= (dirty/8) * p_{t}
301 */ 302 */
302static void task_dirty_limit(struct task_struct *tsk, long *pdirty) 303static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303{ 304{
304 long numerator, denominator; 305 long numerator, denominator;
305 long dirty = *pdirty; 306 unsigned long dirty = *pdirty;
306 u64 inv = dirty >> 3; 307 u64 inv = dirty >> 3;
307 308
308 task_dirties_fraction(tsk, &numerator, &denominator); 309 task_dirties_fraction(tsk, &numerator, &denominator);
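For reference, a worked example of the per-task limit computed by task_dirty_limit() with the new unsigned long types (numbers purely illustrative):

	/*
	 * With *pdirty = 80000 pages and a task owning half of the
	 * recently-dirtied pages (p_t = 1/2):
	 *
	 *   inv   = 80000 >> 3    = 10000   (dirty / 8)
	 *   inv   = 10000 * 1/2   =  5000   ((dirty / 8) * p_t)
	 *   dirty = 80000 - 5000  = 75000   (this task's dirty limit)
	 *
	 * so tasks that dirty pages heavily are throttled earlier than
	 * light dirtiers.
	 */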
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..a5f3c278c573 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -161,17 +162,25 @@ static unsigned long __meminitdata dma_reserve;
161 162
162#if MAX_NUMNODES > 1 163#if MAX_NUMNODES > 1
163int nr_node_ids __read_mostly = MAX_NUMNODES; 164int nr_node_ids __read_mostly = MAX_NUMNODES;
165int nr_online_nodes __read_mostly = 1;
164EXPORT_SYMBOL(nr_node_ids); 166EXPORT_SYMBOL(nr_node_ids);
167EXPORT_SYMBOL(nr_online_nodes);
165#endif 168#endif
166 169
167int page_group_by_mobility_disabled __read_mostly; 170int page_group_by_mobility_disabled __read_mostly;
168 171
169static void set_pageblock_migratetype(struct page *page, int migratetype) 172static void set_pageblock_migratetype(struct page *page, int migratetype)
170{ 173{
174
175 if (unlikely(page_group_by_mobility_disabled))
176 migratetype = MIGRATE_UNMOVABLE;
177
171 set_pageblock_flags_group(page, (unsigned long)migratetype, 178 set_pageblock_flags_group(page, (unsigned long)migratetype,
172 PB_migrate, PB_migrate_end); 179 PB_migrate, PB_migrate_end);
173} 180}
174 181
182bool oom_killer_disabled __read_mostly;
183
175#ifdef CONFIG_DEBUG_VM 184#ifdef CONFIG_DEBUG_VM
176static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 185static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
177{ 186{
@@ -294,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order)
294 } 303 }
295} 304}
296 305
297#ifdef CONFIG_HUGETLBFS
298void prep_compound_gigantic_page(struct page *page, unsigned long order)
299{
300 int i;
301 int nr_pages = 1 << order;
302 struct page *p = page + 1;
303
304 set_compound_page_dtor(page, free_compound_page);
305 set_compound_order(page, order);
306 __SetPageHead(page);
307 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
308 __SetPageTail(p);
309 p->first_page = page;
310 }
311}
312#endif
313
314static int destroy_compound_page(struct page *page, unsigned long order) 306static int destroy_compound_page(struct page *page, unsigned long order)
315{ 307{
316 int i; 308 int i;
@@ -417,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
417 return 0; 409 return 0;
418 410
419 if (PageBuddy(buddy) && page_order(buddy) == order) { 411 if (PageBuddy(buddy) && page_order(buddy) == order) {
420 BUG_ON(page_count(buddy) != 0); 412 VM_BUG_ON(page_count(buddy) != 0);
421 return 1; 413 return 1;
422 } 414 }
423 return 0; 415 return 0;
@@ -448,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
448 */ 440 */
449 441
450static inline void __free_one_page(struct page *page, 442static inline void __free_one_page(struct page *page,
451 struct zone *zone, unsigned int order) 443 struct zone *zone, unsigned int order,
444 int migratetype)
452{ 445{
453 unsigned long page_idx; 446 unsigned long page_idx;
454 int order_size = 1 << order;
455 int migratetype = get_pageblock_migratetype(page);
456 447
457 if (unlikely(PageCompound(page))) 448 if (unlikely(PageCompound(page)))
458 if (unlikely(destroy_compound_page(page, order))) 449 if (unlikely(destroy_compound_page(page, order)))
459 return; 450 return;
460 451
452 VM_BUG_ON(migratetype == -1);
453
461 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 454 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
462 455
463 VM_BUG_ON(page_idx & (order_size - 1)); 456 VM_BUG_ON(page_idx & ((1 << order) - 1));
464 VM_BUG_ON(bad_range(zone, page)); 457 VM_BUG_ON(bad_range(zone, page));
465 458
466 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
467 while (order < MAX_ORDER-1) { 459 while (order < MAX_ORDER-1) {
468 unsigned long combined_idx; 460 unsigned long combined_idx;
469 struct page *buddy; 461 struct page *buddy;
@@ -487,12 +479,27 @@ static inline void __free_one_page(struct page *page,
487 zone->free_area[order].nr_free++; 479 zone->free_area[order].nr_free++;
488} 480}
489 481
482#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
483/*
484 * free_page_mlock() -- clean up attempts to free a mlocked() page.
485 * Page should not be on lru, so no need to fix that up.
486 * free_pages_check() will verify...
487 */
488static inline void free_page_mlock(struct page *page)
489{
490 __ClearPageMlocked(page);
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
490static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
491{ 499{
492 free_page_mlock(page);
493 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
494 (page->mapping != NULL) | 501 (page->mapping != NULL) |
495 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
496 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
497 bad_page(page); 504 bad_page(page);
498 return 1; 505 return 1;
@@ -519,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
519 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
520 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
521 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
522 while (count--) { 531 while (count--) {
523 struct page *page; 532 struct page *page;
524 533
@@ -526,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
526 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
527 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
528 list_del(&page->lru); 537 list_del(&page->lru);
529 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
530 } 539 }
531 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
532} 541}
533 542
534static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
535{ 545{
536 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
537 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
538 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
539 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
540 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
541} 553}
542 554
@@ -545,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
545 unsigned long flags; 557 unsigned long flags;
546 int i; 558 int i;
547 int bad = 0; 559 int bad = 0;
560 int clearMlocked = PageMlocked(page);
561
562 kmemcheck_free_shadow(page, order);
548 563
549 for (i = 0 ; i < (1 << order) ; ++i) 564 for (i = 0 ; i < (1 << order) ; ++i)
550 bad += free_pages_check(page + i); 565 bad += free_pages_check(page + i);
@@ -560,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
560 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
561 576
562 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(clearMlocked))
579 free_page_mlock(page);
563 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
564 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
565 local_irq_restore(flags); 583 local_irq_restore(flags);
566} 584}
567 585
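Both free paths now sample PageMlocked() up front and only clear it with interrupts disabled; a sketch of the shared ordering (comments only; the IRQ requirement comes from the non-atomic __dec_zone_page_state()/__count_vm_event() used by free_page_mlock()):

	/*
	 *   int clearMlocked = PageMlocked(page);   sample while page is ours
	 *   ... debug checks, kernel_map_pages() ...
	 *   local_irq_save(flags);
	 *   if (unlikely(clearMlocked))
	 *           free_page_mlock(page);           NR_MLOCK and event accounting
	 *   __count_vm_events(PGFREE, 1 << order);
	 *   ... return page to buddy or per-cpu list ...
	 *   local_irq_restore(flags);
	 */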
@@ -632,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
632{ 650{
633 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
634 (page->mapping != NULL) | 652 (page->mapping != NULL) |
635 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
636 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
637 bad_page(page); 655 bad_page(page);
638 return 1; 656 return 1;
@@ -657,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
657 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
658 * the smallest available page from the freelists 676 * the smallest available page from the freelists
659 */ 677 */
660static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
661 int migratetype) 680 int migratetype)
662{ 681{
663 unsigned int current_order; 682 unsigned int current_order;
@@ -675,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
675 list_del(&page->lru); 694 list_del(&page->lru);
676 rmv_page_order(page); 695 rmv_page_order(page);
677 area->nr_free--; 696 area->nr_free--;
678 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
679 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
680 return page; 698 return page;
681 } 699 }
@@ -766,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
766} 784}
767 785
768/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
769static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
770 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
771{ 789{
772 struct free_area * area; 790 struct free_area * area;
773 int current_order; 791 int current_order;
@@ -815,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
815 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
816 list_del(&page->lru); 834 list_del(&page->lru);
817 rmv_page_order(page); 835 rmv_page_order(page);
818 __mod_zone_page_state(zone, NR_FREE_PAGES,
819 -(1UL << order));
820 836
821 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
822 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -827,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
827 } 843 }
828 } 844 }
829 845
830 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
831 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
832} 847}
833 848
834/* 849/*
@@ -840,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
840{ 855{
841 struct page *page; 856 struct page *page;
842 857
858retry_reserve:
843 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
844 860
845 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
846 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
847 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
848 return page; 875 return page;
849} 876}
850 877
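Spelled out without the retry_reserve goto, the allocation order __rmqueue() now follows is (equivalent control flow, not patch code):

	page = __rmqueue_smallest(zone, order, migratetype);
	if (!page && migratetype != MIGRATE_RESERVE) {
		page = __rmqueue_fallback(zone, order, migratetype);
		if (!page)	/* last resort: dip into the reserve blocks */
			page = __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
	}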
@@ -878,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
878 set_page_private(page, migratetype); 905 set_page_private(page, migratetype);
879 list = &page->lru; 906 list = &page->lru;
880 } 907 }
908 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
881 spin_unlock(&zone->lock); 909 spin_unlock(&zone->lock);
882 return i; 910 return i;
883} 911}
@@ -993,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold)
993 struct zone *zone = page_zone(page); 1021 struct zone *zone = page_zone(page);
994 struct per_cpu_pages *pcp; 1022 struct per_cpu_pages *pcp;
995 unsigned long flags; 1023 unsigned long flags;
1024 int clearMlocked = PageMlocked(page);
1025
1026 kmemcheck_free_shadow(page, 0);
996 1027
997 if (PageAnon(page)) 1028 if (PageAnon(page))
998 page->mapping = NULL; 1029 page->mapping = NULL;
@@ -1007,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1007 kernel_map_pages(page, 1, 0); 1038 kernel_map_pages(page, 1, 0);
1008 1039
1009 pcp = &zone_pcp(zone, get_cpu())->pcp; 1040 pcp = &zone_pcp(zone, get_cpu())->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page));
1010 local_irq_save(flags); 1042 local_irq_save(flags);
1043 if (unlikely(clearMlocked))
1044 free_page_mlock(page);
1011 __count_vm_event(PGFREE); 1045 __count_vm_event(PGFREE);
1046
1012 if (cold) 1047 if (cold)
1013 list_add_tail(&page->lru, &pcp->list); 1048 list_add_tail(&page->lru, &pcp->list);
1014 else 1049 else
1015 list_add(&page->lru, &pcp->list); 1050 list_add(&page->lru, &pcp->list);
1016 set_page_private(page, get_pageblock_migratetype(page));
1017 pcp->count++; 1051 pcp->count++;
1018 if (pcp->count >= pcp->high) { 1052 if (pcp->count >= pcp->high) {
1019 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1047,6 +1081,16 @@ void split_page(struct page *page, unsigned int order)
1047 1081
1048 VM_BUG_ON(PageCompound(page)); 1082 VM_BUG_ON(PageCompound(page));
1049 VM_BUG_ON(!page_count(page)); 1083 VM_BUG_ON(!page_count(page));
1084
1085#ifdef CONFIG_KMEMCHECK
1086 /*
1087 * Split shadow pages too, because free(page[0]) would
1088 * otherwise free the whole shadow.
1089 */
1090 if (kmemcheck_page_is_tracked(page))
1091 split_page(virt_to_page(page[0].shadow), order);
1092#endif
1093
1050 for (i = 1; i < (1 << order); i++) 1094 for (i = 1; i < (1 << order); i++)
1051 set_page_refcounted(page + i); 1095 set_page_refcounted(page + i);
1052} 1096}
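split_page() is typically used by callers that allocate one high-order block and then hand out (and free) the constituent pages individually, which is exactly why the shadow pages must be split too; a hedged usage sketch, with an illustrative helper name:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Sketch: grab four contiguous pages, then make each one individually
 * refcounted (and, under kmemcheck, individually shadowed) so callers
 * can release them one at a time with __free_page(). */
static struct page *example_alloc_split(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* order-2 block */

	if (!page)
		return NULL;
	split_page(page, 2);
	return page;		/* page[0..3] are now independent pages */
}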
@@ -1056,14 +1100,15 @@ void split_page(struct page *page, unsigned int order)
1056 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1100 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1057 * or two. 1101 * or two.
1058 */ 1102 */
1059static struct page *buffered_rmqueue(struct zone *preferred_zone, 1103static inline
1060 struct zone *zone, int order, gfp_t gfp_flags) 1104struct page *buffered_rmqueue(struct zone *preferred_zone,
1105 struct zone *zone, int order, gfp_t gfp_flags,
1106 int migratetype)
1061{ 1107{
1062 unsigned long flags; 1108 unsigned long flags;
1063 struct page *page; 1109 struct page *page;
1064 int cold = !!(gfp_flags & __GFP_COLD); 1110 int cold = !!(gfp_flags & __GFP_COLD);
1065 int cpu; 1111 int cpu;
1066 int migratetype = allocflags_to_migratetype(gfp_flags);
1067 1112
1068again: 1113again:
1069 cpu = get_cpu(); 1114 cpu = get_cpu();
@@ -1100,8 +1145,22 @@ again:
1100 list_del(&page->lru); 1145 list_del(&page->lru);
1101 pcp->count--; 1146 pcp->count--;
1102 } else { 1147 } else {
1148 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1149 /*
1150 * __GFP_NOFAIL is not to be used in new code.
1151 *
1152 * All __GFP_NOFAIL callers should be fixed so that they
1153 * properly detect and handle allocation failures.
1154 *
1155 * We most definitely don't want callers attempting to
1156 * allocate greater than single-page units with
1157 * __GFP_NOFAIL.
1158 */
1159 WARN_ON_ONCE(order > 0);
1160 }
1103 spin_lock_irqsave(&zone->lock, flags); 1161 spin_lock_irqsave(&zone->lock, flags);
1104 page = __rmqueue(zone, order, migratetype); 1162 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1105 spin_unlock(&zone->lock); 1164 spin_unlock(&zone->lock);
1106 if (!page) 1165 if (!page)
1107 goto failed; 1166 goto failed;
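The new WARN_ON_ONCE() fires for any order > 0 request carrying __GFP_NOFAIL, since such allocations can loop in the allocator indefinitely; for illustration:

	/* Triggers the one-time warning added above: */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 3);

	/* An order-0 __GFP_NOFAIL request stays on the per-cpu fast path and
	 * does not warn, though the comment in buffered_rmqueue() discourages
	 * adding any new __GFP_NOFAIL users at all. */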
@@ -1123,10 +1182,15 @@ failed:
1123 return NULL; 1182 return NULL;
1124} 1183}
1125 1184
1126#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1185/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1127#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1186#define ALLOC_WMARK_MIN WMARK_MIN
1128#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1187#define ALLOC_WMARK_LOW WMARK_LOW
1129#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1188#define ALLOC_WMARK_HIGH WMARK_HIGH
1189#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1190
1191/* Mask to get the watermark bits */
1192#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1193
1130#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1194#define ALLOC_HARDER 0x10 /* try to alloc harder */
1131#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1195#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1132#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1196#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
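Because ALLOC_WMARK_MIN/LOW/HIGH are now equal to WMARK_MIN/LOW/HIGH, the watermark selection becomes a direct array lookup instead of an if-ladder; a fragment showing how the bits are consumed (as in get_page_from_freelist() below):

	unsigned long mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];

	if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags))
		/* fall back to zone_reclaim() or skip this zone */ ;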
@@ -1384,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1384 */ 1448 */
1385static struct page * 1449static struct page *
1386get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1450get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1387 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1451 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1452 struct zone *preferred_zone, int migratetype)
1388{ 1453{
1389 struct zoneref *z; 1454 struct zoneref *z;
1390 struct page *page = NULL; 1455 struct page *page = NULL;
1391 int classzone_idx; 1456 int classzone_idx;
1392 struct zone *zone, *preferred_zone; 1457 struct zone *zone;
1393 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1458 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1394 int zlc_active = 0; /* set if using zonelist_cache */ 1459 int zlc_active = 0; /* set if using zonelist_cache */
1395 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1460 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1396 1461
1397 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1398 &preferred_zone);
1399 if (!preferred_zone)
1400 return NULL;
1401
1402 classzone_idx = zone_idx(preferred_zone); 1462 classzone_idx = zone_idx(preferred_zone);
1403
1404zonelist_scan: 1463zonelist_scan:
1405 /* 1464 /*
1406 * Scan zonelist, looking for a zone with enough free. 1465 * Scan zonelist, looking for a zone with enough free.
@@ -1415,31 +1474,49 @@ zonelist_scan:
1415 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1474 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1416 goto try_next_zone; 1475 goto try_next_zone;
1417 1476
1477 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1418 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1478 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1419 unsigned long mark; 1479 unsigned long mark;
1420 if (alloc_flags & ALLOC_WMARK_MIN) 1480 int ret;
1421 mark = zone->pages_min; 1481
1422 else if (alloc_flags & ALLOC_WMARK_LOW) 1482 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1423 mark = zone->pages_low; 1483 if (zone_watermark_ok(zone, order, mark,
1424 else 1484 classzone_idx, alloc_flags))
1425 mark = zone->pages_high; 1485 goto try_this_zone;
1426 if (!zone_watermark_ok(zone, order, mark, 1486
1427 classzone_idx, alloc_flags)) { 1487 if (zone_reclaim_mode == 0)
1428 if (!zone_reclaim_mode || 1488 goto this_zone_full;
1429 !zone_reclaim(zone, gfp_mask, order)) 1489
1490 ret = zone_reclaim(zone, gfp_mask, order);
1491 switch (ret) {
1492 case ZONE_RECLAIM_NOSCAN:
1493 /* did not scan */
1494 goto try_next_zone;
1495 case ZONE_RECLAIM_FULL:
1496 /* scanned but unreclaimable */
1497 goto this_zone_full;
1498 default:
1499 /* did we reclaim enough */
1500 if (!zone_watermark_ok(zone, order, mark,
1501 classzone_idx, alloc_flags))
1430 goto this_zone_full; 1502 goto this_zone_full;
1431 } 1503 }
1432 } 1504 }
1433 1505
1434 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1506try_this_zone:
1507 page = buffered_rmqueue(preferred_zone, zone, order,
1508 gfp_mask, migratetype);
1435 if (page) 1509 if (page)
1436 break; 1510 break;
1437this_zone_full: 1511this_zone_full:
1438 if (NUMA_BUILD) 1512 if (NUMA_BUILD)
1439 zlc_mark_zone_full(zonelist, z); 1513 zlc_mark_zone_full(zonelist, z);
1440try_next_zone: 1514try_next_zone:
1441 if (NUMA_BUILD && !did_zlc_setup) { 1515 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1442 /* we do zlc_setup after the first zone is tried */ 1516 /*
1517 * we do zlc_setup after the first zone is tried but only
 1518 * if there are multiple nodes to make it worthwhile
1519 */
1443 allowednodes = zlc_setup(zonelist, alloc_flags); 1520 allowednodes = zlc_setup(zonelist, alloc_flags);
1444 zlc_active = 1; 1521 zlc_active = 1;
1445 did_zlc_setup = 1; 1522 did_zlc_setup = 1;
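The switch above keys off the richer zone_reclaim() return values; summarized (the ZONE_RECLAIM_* names are those used in the hunk, defined elsewhere in this series):

	/*
	 *   ZONE_RECLAIM_NOSCAN - reclaim was not attempted at all
	 *                         -> try_next_zone (zone is not marked full)
	 *   ZONE_RECLAIM_FULL   - scanned, but nothing was reclaimable
	 *                         -> this_zone_full (cache that in the zlc)
	 *   anything else       - some pages were reclaimed; re-check the
	 *                         watermark before using or skipping the zone
	 */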
@@ -1454,47 +1531,217 @@ try_next_zone:
1454 return page; 1531 return page;
1455} 1532}
1456 1533
1534static inline int
1535should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1536 unsigned long pages_reclaimed)
1537{
1538 /* Do not loop if specifically requested */
1539 if (gfp_mask & __GFP_NORETRY)
1540 return 0;
1541
1542 /*
1543 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1544 * means __GFP_NOFAIL, but that may not be true in other
1545 * implementations.
1546 */
1547 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1548 return 1;
1549
1550 /*
1551 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1552 * specified, then we retry until we no longer reclaim any pages
1553 * (above), or we've reclaimed an order of pages at least as
1554 * large as the allocation's order. In both cases, if the
1555 * allocation still fails, we stop retrying.
1556 */
1557 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1558 return 1;
1559
1560 /*
1561 * Don't let big-order allocations loop unless the caller
1562 * explicitly requests that.
1563 */
1564 if (gfp_mask & __GFP_NOFAIL)
1565 return 1;
1566
1567 return 0;
1568}
1569
1570static inline struct page *
1571__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, struct zone *preferred_zone,
1574 int migratetype)
1575{
1576 struct page *page;
1577
1578 /* Acquire the OOM killer lock for the zones in zonelist */
1579 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1580 schedule_timeout_uninterruptible(1);
1581 return NULL;
1582 }
1583
1584 /*
1585 * Go through the zonelist yet one more time, keep very high watermark
1586 * here, this is only to catch a parallel oom killing, we must fail if
1587 * we're still under heavy pressure.
1588 */
1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1590 order, zonelist, high_zoneidx,
1591 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1592 preferred_zone, migratetype);
1593 if (page)
1594 goto out;
1595
1596 /* The OOM killer will not help higher order allocs */
1597 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1598 goto out;
1599
1600 /* Exhausted what can be done so it's blamo time */
1601 out_of_memory(zonelist, gfp_mask, order);
1602
1603out:
1604 clear_zonelist_oom(zonelist, gfp_mask);
1605 return page;
1606}
1607
1608/* The really slow allocator path where we enter direct reclaim */
1609static inline struct page *
1610__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1611 struct zonelist *zonelist, enum zone_type high_zoneidx,
1612 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1613 int migratetype, unsigned long *did_some_progress)
1614{
1615 struct page *page = NULL;
1616 struct reclaim_state reclaim_state;
1617 struct task_struct *p = current;
1618
1619 cond_resched();
1620
1621 /* We now go into synchronous reclaim */
1622 cpuset_memory_pressure_bump();
1623
1624 /*
1625 * The task's cpuset might have expanded its set of allowable nodes
1626 */
1627 p->flags |= PF_MEMALLOC;
1628 lockdep_set_current_reclaim_state(gfp_mask);
1629 reclaim_state.reclaimed_slab = 0;
1630 p->reclaim_state = &reclaim_state;
1631
1632 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1633
1634 p->reclaim_state = NULL;
1635 lockdep_clear_current_reclaim_state();
1636 p->flags &= ~PF_MEMALLOC;
1637
1638 cond_resched();
1639
1640 if (order != 0)
1641 drain_all_pages();
1642
1643 if (likely(*did_some_progress))
1644 page = get_page_from_freelist(gfp_mask, nodemask, order,
1645 zonelist, high_zoneidx,
1646 alloc_flags, preferred_zone,
1647 migratetype);
1648 return page;
1649}
1650
1457/* 1651/*
1458 * This is the 'heart' of the zoned buddy allocator. 1652 * This is called in the allocator slow-path if the allocation request is of
1653 * sufficient urgency to ignore watermarks and take other desperate measures
1459 */ 1654 */
1460struct page * 1655static inline struct page *
1461__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1656__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1462 struct zonelist *zonelist, nodemask_t *nodemask) 1657 struct zonelist *zonelist, enum zone_type high_zoneidx,
1658 nodemask_t *nodemask, struct zone *preferred_zone,
1659 int migratetype)
1660{
1661 struct page *page;
1662
1663 do {
1664 page = get_page_from_freelist(gfp_mask, nodemask, order,
1665 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1666 preferred_zone, migratetype);
1667
1668 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671
1672 return page;
1673}
1674
1675static inline
1676void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1677 enum zone_type high_zoneidx)
1463{ 1678{
1464 const gfp_t wait = gfp_mask & __GFP_WAIT;
1465 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1466 struct zoneref *z; 1679 struct zoneref *z;
1467 struct zone *zone; 1680 struct zone *zone;
1468 struct page *page;
1469 struct reclaim_state reclaim_state;
1470 struct task_struct *p = current;
1471 int do_retry;
1472 int alloc_flags;
1473 unsigned long did_some_progress;
1474 unsigned long pages_reclaimed = 0;
1475 1681
1476 lockdep_trace_alloc(gfp_mask); 1682 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1683 wakeup_kswapd(zone, order);
1684}
1477 1685
1478 might_sleep_if(wait); 1686static inline int
1687gfp_to_alloc_flags(gfp_t gfp_mask)
1688{
1689 struct task_struct *p = current;
1690 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1691 const gfp_t wait = gfp_mask & __GFP_WAIT;
1479 1692
1480 if (should_fail_alloc_page(gfp_mask, order)) 1693 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1481 return NULL; 1694 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1482 1695
1483restart: 1696 /*
1484 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1697 * The caller may dip into page reserves a bit more if the caller
1698 * cannot run direct reclaim, or if the caller has realtime scheduling
1699 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1700 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1701 */
1702 alloc_flags |= (gfp_mask & __GFP_HIGH);
1485 1703
1486 if (unlikely(!z->zone)) { 1704 if (!wait) {
1705 alloc_flags |= ALLOC_HARDER;
1487 /* 1706 /*
1488 * Happens if we have an empty zonelist as a result of 1707 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1489 * GFP_THISNODE being used on a memoryless node 1708 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1490 */ 1709 */
1491 return NULL; 1710 alloc_flags &= ~ALLOC_CPUSET;
1711 } else if (unlikely(rt_task(p)))
1712 alloc_flags |= ALLOC_HARDER;
1713
1714 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1715 if (!in_interrupt() &&
1716 ((p->flags & PF_MEMALLOC) ||
1717 unlikely(test_thread_flag(TIF_MEMDIE))))
1718 alloc_flags |= ALLOC_NO_WATERMARKS;
1492 } 1719 }
1493 1720
1494 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1721 return alloc_flags;
1495 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1722}
1496 if (page) 1723
1497 goto got_pg; 1724static inline struct page *
1725__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1726 struct zonelist *zonelist, enum zone_type high_zoneidx,
1727 nodemask_t *nodemask, struct zone *preferred_zone,
1728 int migratetype)
1729{
1730 const gfp_t wait = gfp_mask & __GFP_WAIT;
1731 struct page *page = NULL;
1732 int alloc_flags;
1733 unsigned long pages_reclaimed = 0;
1734 unsigned long did_some_progress;
1735 struct task_struct *p = current;
1736
1737 /*
1738 * In the slowpath, we sanity check order to avoid ever trying to
1739 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1740 * be using allocators in order of preference for an area that is
1741 * too large.
1742 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER))
1744 return NULL;
1498 1745
1499 /* 1746 /*
1500 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1507,154 +1754,83 @@ restart:
1507 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1754 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1508 goto nopage; 1755 goto nopage;
1509 1756
1510 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1757 wake_all_kswapd(order, zonelist, high_zoneidx);
1511 wakeup_kswapd(zone, order);
1512 1758
1513 /* 1759 /*
1514 * OK, we're below the kswapd watermark and have kicked background 1760 * OK, we're below the kswapd watermark and have kicked background
1515 * reclaim. Now things get more complex, so set up alloc_flags according 1761 * reclaim. Now things get more complex, so set up alloc_flags according
1516 * to how we want to proceed. 1762 * to how we want to proceed.
1517 *
1518 * The caller may dip into page reserves a bit more if the caller
1519 * cannot run direct reclaim, or if the caller has realtime scheduling
1520 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1521 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1522 */ 1763 */
1523 alloc_flags = ALLOC_WMARK_MIN; 1764 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1524 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1525 alloc_flags |= ALLOC_HARDER;
1526 if (gfp_mask & __GFP_HIGH)
1527 alloc_flags |= ALLOC_HIGH;
1528 if (wait)
1529 alloc_flags |= ALLOC_CPUSET;
1530 1765
1531 /* 1766restart:
1532 * Go through the zonelist again. Let __GFP_HIGH and allocations 1767 /* This is the last chance, in general, before the goto nopage. */
1533 * coming from realtime tasks go deeper into reserves.
1534 *
1535 * This is the last chance, in general, before the goto nopage.
1536 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1537 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1538 */
1539 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1768 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1540 high_zoneidx, alloc_flags); 1769 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1770 preferred_zone, migratetype);
1541 if (page) 1771 if (page)
1542 goto got_pg; 1772 goto got_pg;
1543 1773
1544 /* This allocation should allow future memory freeing. */
1545
1546rebalance: 1774rebalance:
1547 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1775 /* Allocate without watermarks if the context allows */
1548 && !in_interrupt()) { 1776 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1549 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1777 page = __alloc_pages_high_priority(gfp_mask, order,
1550nofail_alloc: 1778 zonelist, high_zoneidx, nodemask,
1551 /* go through the zonelist yet again, ignoring mins */ 1779 preferred_zone, migratetype);
1552 page = get_page_from_freelist(gfp_mask, nodemask, order, 1780 if (page)
1553 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1781 goto got_pg;
1554 if (page)
1555 goto got_pg;
1556 if (gfp_mask & __GFP_NOFAIL) {
1557 congestion_wait(WRITE, HZ/50);
1558 goto nofail_alloc;
1559 }
1560 }
1561 goto nopage;
1562 } 1782 }
1563 1783
1564 /* Atomic allocations - we can't balance anything */ 1784 /* Atomic allocations - we can't balance anything */
1565 if (!wait) 1785 if (!wait)
1566 goto nopage; 1786 goto nopage;
1567 1787
1568 cond_resched(); 1788 /* Avoid recursion of direct reclaim */
1789 if (p->flags & PF_MEMALLOC)
1790 goto nopage;
1791
1792 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx,
1795 nodemask,
1796 alloc_flags, preferred_zone,
1797 migratetype, &did_some_progress);
1798 if (page)
1799 goto got_pg;
1569 1800
1570 /* We now go into synchronous reclaim */
1571 cpuset_memory_pressure_bump();
1572 /* 1801 /*
1573 * The task's cpuset might have expanded its set of allowable nodes 1802 * If we failed to make any progress reclaiming, then we are
1803 * running out of options and have to consider going OOM
1574 */ 1804 */
1575 cpuset_update_task_memory_state(); 1805 if (!did_some_progress) {
1576 p->flags |= PF_MEMALLOC; 1806 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1577 1807 if (oom_killer_disabled)
1578 lockdep_set_current_reclaim_state(gfp_mask); 1808 goto nopage;
1579 reclaim_state.reclaimed_slab = 0; 1809 page = __alloc_pages_may_oom(gfp_mask, order,
1580 p->reclaim_state = &reclaim_state; 1810 zonelist, high_zoneidx,
1581 1811 nodemask, preferred_zone,
1582 did_some_progress = try_to_free_pages(zonelist, order, 1812 migratetype);
1583 gfp_mask, nodemask); 1813 if (page)
1584 1814 goto got_pg;
1585 p->reclaim_state = NULL;
1586 lockdep_clear_current_reclaim_state();
1587 p->flags &= ~PF_MEMALLOC;
1588
1589 cond_resched();
1590 1815
1591 if (order != 0) 1816 /*
1592 drain_all_pages(); 1817 * The OOM killer does not trigger for high-order
1818 * ~__GFP_NOFAIL allocations so if no progress is being
1819 * made, there are no other options and retrying is
1820 * unlikely to help.
1821 */
1822 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1823 !(gfp_mask & __GFP_NOFAIL))
1824 goto nopage;
1593 1825
1594 if (likely(did_some_progress)) {
1595 page = get_page_from_freelist(gfp_mask, nodemask, order,
1596 zonelist, high_zoneidx, alloc_flags);
1597 if (page)
1598 goto got_pg;
1599 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1600 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1601 schedule_timeout_uninterruptible(1);
1602 goto restart; 1826 goto restart;
1603 } 1827 }
1604
1605 /*
1606 * Go through the zonelist yet one more time, keep
1607 * very high watermark here, this is only to catch
1608 * a parallel oom killing, we must fail if we're still
1609 * under heavy pressure.
1610 */
1611 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1612 order, zonelist, high_zoneidx,
1613 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1614 if (page) {
1615 clear_zonelist_oom(zonelist, gfp_mask);
1616 goto got_pg;
1617 }
1618
1619 /* The OOM killer will not help higher order allocs so fail */
1620 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1621 clear_zonelist_oom(zonelist, gfp_mask);
1622 goto nopage;
1623 }
1624
1625 out_of_memory(zonelist, gfp_mask, order);
1626 clear_zonelist_oom(zonelist, gfp_mask);
1627 goto restart;
1628 } 1828 }
1629 1829
1630 /* 1830 /* Check if we should retry the allocation */
1631 * Don't let big-order allocations loop unless the caller explicitly
1632 * requests that. Wait for some write requests to complete then retry.
1633 *
1634 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1635 * means __GFP_NOFAIL, but that may not be true in other
1636 * implementations.
1637 *
1638 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1639 * specified, then we retry until we no longer reclaim any pages
1640 * (above), or we've reclaimed an order of pages at least as
1641 * large as the allocation's order. In both cases, if the
1642 * allocation still fails, we stop retrying.
1643 */
1644 pages_reclaimed += did_some_progress; 1831 pages_reclaimed += did_some_progress;
1645 do_retry = 0; 1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1646 if (!(gfp_mask & __GFP_NORETRY)) { 1833 /* Wait for some write requests to complete then retry */
1647 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1648 do_retry = 1;
1649 } else {
1650 if (gfp_mask & __GFP_REPEAT &&
1651 pages_reclaimed < (1 << order))
1652 do_retry = 1;
1653 }
1654 if (gfp_mask & __GFP_NOFAIL)
1655 do_retry = 1;
1656 }
1657 if (do_retry) {
1658 congestion_wait(WRITE, HZ/50); 1834 congestion_wait(WRITE, HZ/50);
1659 goto rebalance; 1835 goto rebalance;
1660 } 1836 }
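The inline do_retry logic removed above is what the new should_alloc_retry() helper is expected to encapsulate. As a standalone illustration (not the kernel code; the flag values and names below are invented for the sketch), the retry decision reduces to:

/* Sketch of the retry policy assumed to live in should_alloc_retry();
 * SKETCH_* values are illustrative, not the kernel's gfp definitions. */
#include <stdio.h>

#define SKETCH_NORETRY  0x1u
#define SKETCH_REPEAT   0x2u
#define SKETCH_NOFAIL   0x4u
#define COSTLY_ORDER    3

static int should_retry(unsigned int gfp, unsigned int order,
                        unsigned long pages_reclaimed)
{
    if (gfp & SKETCH_NORETRY)
        return 0;
    if (order <= COSTLY_ORDER)
        return 1;                 /* small orders retry by default */
    if ((gfp & SKETCH_REPEAT) && pages_reclaimed < (1UL << order))
        return 1;                 /* keep going until 1 << order pages reclaimed */
    if (gfp & SKETCH_NOFAIL)
        return 1;                 /* caller cannot tolerate failure */
    return 0;
}

int main(void)
{
    /* order-5 __GFP_REPEAT-style request, 16 of 32 pages reclaimed: retry */
    printf("%d\n", should_retry(SKETCH_REPEAT, 5, 16));
    /* same request once 32 pages have been reclaimed: give up */
    printf("%d\n", should_retry(SKETCH_REPEAT, 5, 32));
    return 0;
}

Compiled as ordinary C this prints 1 then 0, matching the removed branch structure above.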
@@ -1667,10 +1843,58 @@ nopage:
1667 dump_stack(); 1843 dump_stack();
1668 show_mem(); 1844 show_mem();
1669 } 1845 }
1846 return page;
1670got_pg: 1847got_pg:
1848 if (kmemcheck_enabled)
1849 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1850 return page;
1851
1852}
1853
1854/*
1855 * This is the 'heart' of the zoned buddy allocator.
1856 */
1857struct page *
1858__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1859 struct zonelist *zonelist, nodemask_t *nodemask)
1860{
1861 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1862 struct zone *preferred_zone;
1863 struct page *page;
1864 int migratetype = allocflags_to_migratetype(gfp_mask);
1865
1866 lockdep_trace_alloc(gfp_mask);
1867
1868 might_sleep_if(gfp_mask & __GFP_WAIT);
1869
1870 if (should_fail_alloc_page(gfp_mask, order))
1871 return NULL;
1872
1873 /*
1874 * Check the zones suitable for the gfp_mask contain at least one
1875 * valid zone. It's possible to have an empty zonelist as a result
1876 * of GFP_THISNODE and a memoryless node
1877 */
1878 if (unlikely(!zonelist->_zonerefs->zone))
1879 return NULL;
1880
1881 /* The preferred zone is used for statistics later */
1882 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1883 if (!preferred_zone)
1884 return NULL;
1885
1886 /* First allocation attempt */
1887 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1888 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1889 preferred_zone, migratetype);
1890 if (unlikely(!page))
1891 page = __alloc_pages_slowpath(gfp_mask, order,
1892 zonelist, high_zoneidx, nodemask,
1893 preferred_zone, migratetype);
1894
1671 return page; 1895 return page;
1672} 1896}
1673EXPORT_SYMBOL(__alloc_pages_internal); 1897EXPORT_SYMBOL(__alloc_pages_nodemask);
1674 1898
1675/* 1899/*
1676 * Common helper functions. 1900 * Common helper functions.
@@ -1799,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset)
1799 2023
1800 for_each_zone_zonelist(zone, z, zonelist, offset) { 2024 for_each_zone_zonelist(zone, z, zonelist, offset) {
1801 unsigned long size = zone->present_pages; 2025 unsigned long size = zone->present_pages;
1802 unsigned long high = zone->pages_high; 2026 unsigned long high = high_wmark_pages(zone);
1803 if (size > high) 2027 if (size > high)
1804 sum += size - high; 2028 sum += size - high;
1805 } 2029 }
@@ -1891,19 +2115,14 @@ void show_free_areas(void)
1891 2115
1892 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2116 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1893 " inactive_file:%lu" 2117 " inactive_file:%lu"
1894//TODO: check/adjust line lengths
1895#ifdef CONFIG_UNEVICTABLE_LRU
1896 " unevictable:%lu" 2118 " unevictable:%lu"
1897#endif
1898 " dirty:%lu writeback:%lu unstable:%lu\n" 2119 " dirty:%lu writeback:%lu unstable:%lu\n"
1899 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2120 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1900 global_page_state(NR_ACTIVE_ANON), 2121 global_page_state(NR_ACTIVE_ANON),
1901 global_page_state(NR_ACTIVE_FILE), 2122 global_page_state(NR_ACTIVE_FILE),
1902 global_page_state(NR_INACTIVE_ANON), 2123 global_page_state(NR_INACTIVE_ANON),
1903 global_page_state(NR_INACTIVE_FILE), 2124 global_page_state(NR_INACTIVE_FILE),
1904#ifdef CONFIG_UNEVICTABLE_LRU
1905 global_page_state(NR_UNEVICTABLE), 2125 global_page_state(NR_UNEVICTABLE),
1906#endif
1907 global_page_state(NR_FILE_DIRTY), 2126 global_page_state(NR_FILE_DIRTY),
1908 global_page_state(NR_WRITEBACK), 2127 global_page_state(NR_WRITEBACK),
1909 global_page_state(NR_UNSTABLE_NFS), 2128 global_page_state(NR_UNSTABLE_NFS),
@@ -1927,25 +2146,21 @@ void show_free_areas(void)
1927 " inactive_anon:%lukB" 2146 " inactive_anon:%lukB"
1928 " active_file:%lukB" 2147 " active_file:%lukB"
1929 " inactive_file:%lukB" 2148 " inactive_file:%lukB"
1930#ifdef CONFIG_UNEVICTABLE_LRU
1931 " unevictable:%lukB" 2149 " unevictable:%lukB"
1932#endif
1933 " present:%lukB" 2150 " present:%lukB"
1934 " pages_scanned:%lu" 2151 " pages_scanned:%lu"
1935 " all_unreclaimable? %s" 2152 " all_unreclaimable? %s"
1936 "\n", 2153 "\n",
1937 zone->name, 2154 zone->name,
1938 K(zone_page_state(zone, NR_FREE_PAGES)), 2155 K(zone_page_state(zone, NR_FREE_PAGES)),
1939 K(zone->pages_min), 2156 K(min_wmark_pages(zone)),
1940 K(zone->pages_low), 2157 K(low_wmark_pages(zone)),
1941 K(zone->pages_high), 2158 K(high_wmark_pages(zone)),
1942 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2159 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1943 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2160 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1944 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2161 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1945 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2162 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1946#ifdef CONFIG_UNEVICTABLE_LRU
1947 K(zone_page_state(zone, NR_UNEVICTABLE)), 2163 K(zone_page_state(zone, NR_UNEVICTABLE)),
1948#endif
1949 K(zone->present_pages), 2164 K(zone->present_pages),
1950 zone->pages_scanned, 2165 zone->pages_scanned,
1951 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2166 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2103,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2103} 2318}
2104 2319
2105 2320
2106#define MAX_NODE_LOAD (num_online_nodes()) 2321#define MAX_NODE_LOAD (nr_online_nodes)
2107static int node_load[MAX_NUMNODES]; 2322static int node_load[MAX_NUMNODES];
2108 2323
2109/** 2324/**
@@ -2312,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat)
2312 2527
2313 /* NUMA-aware ordering of nodes */ 2528 /* NUMA-aware ordering of nodes */
2314 local_node = pgdat->node_id; 2529 local_node = pgdat->node_id;
2315 load = num_online_nodes(); 2530 load = nr_online_nodes;
2316 prev_node = local_node; 2531 prev_node = local_node;
2317 nodes_clear(used_mask); 2532 nodes_clear(used_mask);
2318 2533
@@ -2463,7 +2678,7 @@ void build_all_zonelists(void)
2463 2678
2464 printk("Built %i zonelists in %s order, mobility grouping %s. " 2679 printk("Built %i zonelists in %s order, mobility grouping %s. "
2465 "Total pages: %ld\n", 2680 "Total pages: %ld\n",
2466 num_online_nodes(), 2681 nr_online_nodes,
2467 zonelist_order_name[current_zonelist_order], 2682 zonelist_order_name[current_zonelist_order],
2468 page_group_by_mobility_disabled ? "off" : "on", 2683 page_group_by_mobility_disabled ? "off" : "on",
2469 vm_total_pages); 2684 vm_total_pages);
@@ -2542,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2542 2757
2543/* 2758/*
2544 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2759 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2545 * of blocks reserved is based on zone->pages_min. The memory within the 2760 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2546 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2761 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2547 * higher will lead to a bigger reserve which will get freed as contiguous 2762 * higher will lead to a bigger reserve which will get freed as contiguous
2548 * blocks as reclaim kicks in 2763 * blocks as reclaim kicks in
2549 */ 2764 */
@@ -2556,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2556 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2771 /* Get the start pfn, end pfn and the number of blocks to reserve */
2557 start_pfn = zone->zone_start_pfn; 2772 start_pfn = zone->zone_start_pfn;
2558 end_pfn = start_pfn + zone->spanned_pages; 2773 end_pfn = start_pfn + zone->spanned_pages;
2559 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2774 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2560 pageblock_order; 2775 pageblock_order;
2561 2776
2562 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2777 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -3488,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3488 zone_pcp_init(zone); 3703 zone_pcp_init(zone);
3489 for_each_lru(l) { 3704 for_each_lru(l) {
3490 INIT_LIST_HEAD(&zone->lru[l].list); 3705 INIT_LIST_HEAD(&zone->lru[l].list);
3491 zone->lru[l].nr_scan = 0; 3706 zone->lru[l].nr_saved_scan = 0;
3492 } 3707 }
3493 zone->reclaim_stat.recent_rotated[0] = 0; 3708 zone->reclaim_stat.recent_rotated[0] = 0;
3494 zone->reclaim_stat.recent_rotated[1] = 0; 3709 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4025,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4025 early_node_map[i].start_pfn, 4240 early_node_map[i].start_pfn,
4026 early_node_map[i].end_pfn); 4241 early_node_map[i].end_pfn);
4027 4242
4243 /*
 4244 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages
 4245 * initialize node_states[N_HIGH_MEMORY], so clear it first
4246 */
4247 nodes_clear(node_states[N_HIGH_MEMORY]);
4028 /* Initialise every node */ 4248 /* Initialise every node */
4029 mminit_verify_pageflags_layout(); 4249 mminit_verify_pageflags_layout();
4030 setup_nr_node_ids(); 4250 setup_nr_node_ids();
@@ -4159,8 +4379,8 @@ static void calculate_totalreserve_pages(void)
4159 max = zone->lowmem_reserve[j]; 4379 max = zone->lowmem_reserve[j];
4160 } 4380 }
4161 4381
4162 /* we treat pages_high as reserved pages. */ 4382 /* we treat the high watermark as reserved pages. */
4163 max += zone->pages_high; 4383 max += high_wmark_pages(zone);
4164 4384
4165 if (max > zone->present_pages) 4385 if (max > zone->present_pages)
4166 max = zone->present_pages; 4386 max = zone->present_pages;
@@ -4210,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void)
4210} 4430}
4211 4431
4212/** 4432/**
4213 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4433 * setup_per_zone_wmarks - called when min_free_kbytes changes
4434 * or when memory is hot-{added|removed}
4214 * 4435 *
4215 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4436 * Ensures that the watermark[min,low,high] values for each zone are set
4216 * with respect to min_free_kbytes. 4437 * correctly with respect to min_free_kbytes.
4217 */ 4438 */
4218void setup_per_zone_pages_min(void) 4439void setup_per_zone_wmarks(void)
4219{ 4440{
4220 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4441 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4221 unsigned long lowmem_pages = 0; 4442 unsigned long lowmem_pages = 0;
@@ -4240,7 +4461,7 @@ void setup_per_zone_pages_min(void)
4240 * need highmem pages, so cap pages_min to a small 4461 * need highmem pages, so cap pages_min to a small
4241 * value here. 4462 * value here.
4242 * 4463 *
4243 * The (pages_high-pages_low) and (pages_low-pages_min) 4464 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4244 * deltas controls asynch page reclaim, and so should 4465 * deltas controls asynch page reclaim, and so should
4245 * not be capped for highmem. 4466 * not be capped for highmem.
4246 */ 4467 */
@@ -4251,17 +4472,17 @@ void setup_per_zone_pages_min(void)
4251 min_pages = SWAP_CLUSTER_MAX; 4472 min_pages = SWAP_CLUSTER_MAX;
4252 if (min_pages > 128) 4473 if (min_pages > 128)
4253 min_pages = 128; 4474 min_pages = 128;
4254 zone->pages_min = min_pages; 4475 zone->watermark[WMARK_MIN] = min_pages;
4255 } else { 4476 } else {
4256 /* 4477 /*
4257 * If it's a lowmem zone, reserve a number of pages 4478 * If it's a lowmem zone, reserve a number of pages
4258 * proportionate to the zone's size. 4479 * proportionate to the zone's size.
4259 */ 4480 */
4260 zone->pages_min = tmp; 4481 zone->watermark[WMARK_MIN] = tmp;
4261 } 4482 }
4262 4483
4263 zone->pages_low = zone->pages_min + (tmp >> 2); 4484 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4264 zone->pages_high = zone->pages_min + (tmp >> 1); 4485 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4265 setup_zone_migrate_reserve(zone); 4486 setup_zone_migrate_reserve(zone);
4266 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4267 } 4488 }
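The converted hunk above fixes the watermark relationships as WMARK_LOW = WMARK_MIN + tmp/4 and WMARK_HIGH = WMARK_MIN + tmp/2, where tmp is the zone's proportional share of pages_min (computed earlier in setup_per_zone_wmarks(), outside the quoted lines). A throwaway userspace sketch of that arithmetic, with made-up zone sizes and PAGE_SHIFT assumed to be 12:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
    unsigned long min_free_kbytes = 5752;     /* e.g. a machine with a few GB */
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 1048576;     /* all non-highmem pages, example */
    unsigned long zone_present = 786432;      /* this zone's pages, example */

    /* tmp: this zone's proportional slice of pages_min (the division is done
     * earlier in setup_per_zone_wmarks(), outside the quoted hunk) */
    unsigned long tmp = (unsigned long)((unsigned long long)pages_min *
                                        zone_present / lowmem_pages);

    unsigned long wmark_min  = tmp;           /* non-highmem zone case */
    unsigned long wmark_low  = wmark_min + (tmp >> 2);
    unsigned long wmark_high = wmark_min + (tmp >> 1);

    printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
    return 0;
}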
@@ -4271,8 +4492,6 @@ void setup_per_zone_pages_min(void)
4271} 4492}
4272 4493
4273/** 4494/**
4274 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4275 *
4276 * The inactive anon list should be small enough that the VM never has to 4495 * The inactive anon list should be small enough that the VM never has to
4277 * do too much work, but large enough that each inactive page has a chance 4496 * do too much work, but large enough that each inactive page has a chance
4278 * to be referenced again before it is swapped out. 4497 * to be referenced again before it is swapped out.
@@ -4293,21 +4512,26 @@ void setup_per_zone_pages_min(void)
4293 * 1TB 101 10GB 4512 * 1TB 101 10GB
4294 * 10TB 320 32GB 4513 * 10TB 320 32GB
4295 */ 4514 */
4296static void setup_per_zone_inactive_ratio(void) 4515void calculate_zone_inactive_ratio(struct zone *zone)
4297{ 4516{
4298 struct zone *zone; 4517 unsigned int gb, ratio;
4299
4300 for_each_zone(zone) {
4301 unsigned int gb, ratio;
4302 4518
4303 /* Zone size in gigabytes */ 4519 /* Zone size in gigabytes */
4304 gb = zone->present_pages >> (30 - PAGE_SHIFT); 4520 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4521 if (gb)
4305 ratio = int_sqrt(10 * gb); 4522 ratio = int_sqrt(10 * gb);
4306 if (!ratio) 4523 else
4307 ratio = 1; 4524 ratio = 1;
4308 4525
4309 zone->inactive_ratio = ratio; 4526 zone->inactive_ratio = ratio;
4310 } 4527}
4528
4529static void __init setup_per_zone_inactive_ratio(void)
4530{
4531 struct zone *zone;
4532
4533 for_each_zone(zone)
4534 calculate_zone_inactive_ratio(zone);
4311} 4535}
4312 4536
4313/* 4537/*
@@ -4334,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void)
4334 * 8192MB: 11584k 4558 * 8192MB: 11584k
4335 * 16384MB: 16384k 4559 * 16384MB: 16384k
4336 */ 4560 */
4337static int __init init_per_zone_pages_min(void) 4561static int __init init_per_zone_wmark_min(void)
4338{ 4562{
4339 unsigned long lowmem_kbytes; 4563 unsigned long lowmem_kbytes;
4340 4564
@@ -4345,12 +4569,12 @@ static int __init init_per_zone_pages_min(void)
4345 min_free_kbytes = 128; 4569 min_free_kbytes = 128;
4346 if (min_free_kbytes > 65536) 4570 if (min_free_kbytes > 65536)
4347 min_free_kbytes = 65536; 4571 min_free_kbytes = 65536;
4348 setup_per_zone_pages_min(); 4572 setup_per_zone_wmarks();
4349 setup_per_zone_lowmem_reserve(); 4573 setup_per_zone_lowmem_reserve();
4350 setup_per_zone_inactive_ratio(); 4574 setup_per_zone_inactive_ratio();
4351 return 0; 4575 return 0;
4352} 4576}
4353module_init(init_per_zone_pages_min) 4577module_init(init_per_zone_wmark_min)
4354 4578
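For reference, init_per_zone_wmark_min() derives min_free_kbytes from the lowmem size before the clamp shown in the hunk above; the sqrt(16 * lowmem_kbytes) step is assumed here from the surrounding comment table rather than quoted from this diff. A userspace sketch (link with -lm):

#include <stdio.h>
#include <math.h>

static unsigned long min_free_kbytes_for(unsigned long lowmem_kbytes)
{
    unsigned long v = (unsigned long)sqrt(16.0 * lowmem_kbytes);

    if (v < 128)
        v = 128;        /* lower clamp, as in the hunk above */
    if (v > 65536)
        v = 65536;      /* upper clamp, as in the hunk above */
    return v;
}

int main(void)
{
    unsigned long mb[] = { 16, 512, 8192, 16384 };

    for (unsigned i = 0; i < sizeof(mb) / sizeof(mb[0]); i++)
        printf("%6luMB lowmem -> min_free_kbytes ~%lu\n",
               mb[i], min_free_kbytes_for(mb[i] * 1024));
    return 0;
}

The output tracks the 16MB: 512k through 16384MB: 16384k table in the comment block quoted earlier, give or take integer-sqrt rounding.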
4355/* 4579/*
4356 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4580 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4362,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4362{ 4586{
4363 proc_dointvec(table, write, file, buffer, length, ppos); 4587 proc_dointvec(table, write, file, buffer, length, ppos);
4364 if (write) 4588 if (write)
4365 setup_per_zone_pages_min(); 4589 setup_per_zone_wmarks();
4366 return 0; 4590 return 0;
4367} 4591}
4368 4592
@@ -4406,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4406 * whenever sysctl_lowmem_reserve_ratio changes. 4630 * whenever sysctl_lowmem_reserve_ratio changes.
4407 * 4631 *
4408 * The reserve ratio obviously has absolutely no relation with the 4632 * The reserve ratio obviously has absolutely no relation with the
4409 * pages_min watermarks. The lowmem reserve ratio can only make sense 4633 * minimum watermarks. The lowmem reserve ratio can only make sense
4410 * if in function of the boot time zone sizes. 4634 * if in function of the boot time zone sizes.
4411 */ 4635 */
4412int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4636int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4513,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename,
4513 else if (hashdist) 4737 else if (hashdist)
4514 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4738 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4515 else { 4739 else {
4516 unsigned long order = get_order(size);
4517 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4518 /* 4740 /*
4519 * If bucketsize is not a power-of-two, we may free 4741 * If bucketsize is not a power-of-two, we may free
4520 * some pages at the end of hash table. 4742 * some pages at the end of hash table, which
4743 * alloc_pages_exact() automatically frees
4521 */ 4744 */
4522 if (table) { 4745 if (get_order(size) < MAX_ORDER)
4523 unsigned long alloc_end = (unsigned long)table + 4746 table = alloc_pages_exact(size, GFP_ATOMIC);
4524 (PAGE_SIZE << order);
4525 unsigned long used = (unsigned long)table +
4526 PAGE_ALIGN(size);
4527 split_page(virt_to_page(table), order);
4528 while (used < alloc_end) {
4529 free_page(used);
4530 used += PAGE_SIZE;
4531 }
4532 }
4533 } 4747 }
4534 } while (!table && size > PAGE_SIZE && --log2qty); 4748 } while (!table && size > PAGE_SIZE && --log2qty);
4535 4749
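The removed open-coded block above (split_page() plus a free_page() loop over the tail) is exactly the trimming that alloc_pages_exact() now performs internally. A userspace sketch of the arithmetic, using an illustrative 3MB hash table and 4KB pages:

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* smallest order whose power-of-two block covers size */
static unsigned int order_for(unsigned long size)
{
    unsigned int order = 0;

    while ((PAGE_SIZE << order) < size)
        order++;
    return order;
}

int main(void)
{
    unsigned long size  = 3 * 1024 * 1024;     /* e.g. a 3MB hash table */
    unsigned int  order = order_for(size);     /* rounds the block up to 4MB */
    unsigned long alloc = PAGE_SIZE << order;
    unsigned long used  = PAGE_ALIGN(size);

    printf("order %u block: %lu KB allocated, %lu KB kept, %lu tail pages freed\n",
           order, alloc >> 10, used >> 10, (alloc - used) / PAGE_SIZE);
    return 0;
}

For the 3MB case this shows an order-10 (4MB) block with 256 tail pages returned, which is the work the deleted loop used to do by hand.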
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 11a8a10a3909..f22b4ebbd8dc 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -83,12 +83,12 @@ void __init page_cgroup_init_flatmem(void)
83 goto fail; 83 goto fail;
84 } 84 }
85 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 85 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
86 printk(KERN_INFO "please try cgroup_disable=memory option if you" 86 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
87 " don't want\n"); 87 " don't want memory cgroups\n");
88 return; 88 return;
89fail: 89fail:
90 printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); 90 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
91 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); 91 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
92 panic("Out of memory"); 92 panic("Out of memory");
93} 93}
94 94
@@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
99 unsigned long pfn = page_to_pfn(page); 99 unsigned long pfn = page_to_pfn(page);
100 struct mem_section *section = __pfn_to_section(pfn); 100 struct mem_section *section = __pfn_to_section(pfn);
101 101
102 if (!section->page_cgroup)
103 return NULL;
102 return section->page_cgroup + pfn; 104 return section->page_cgroup + pfn;
103} 105}
104 106
@@ -252,14 +254,14 @@ void __init page_cgroup_init(void)
252 fail = init_section_page_cgroup(pfn); 254 fail = init_section_page_cgroup(pfn);
253 } 255 }
254 if (fail) { 256 if (fail) {
255 printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); 257 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
256 panic("Out of memory"); 258 panic("Out of memory");
257 } else { 259 } else {
258 hotplug_memory_notifier(page_cgroup_callback, 0); 260 hotplug_memory_notifier(page_cgroup_callback, 0);
259 } 261 }
260 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 262 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
261 printk(KERN_INFO "please try cgroup_disable=memory option if you don't" 263 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
262 " want\n"); 264 " want memory cgroups\n");
263} 265}
264 266
265void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 267void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -309,8 +311,6 @@ static int swap_cgroup_prepare(int type)
309 struct swap_cgroup_ctrl *ctrl; 311 struct swap_cgroup_ctrl *ctrl;
310 unsigned long idx, max; 312 unsigned long idx, max;
311 313
312 if (!do_swap_account)
313 return 0;
314 ctrl = &swap_cgroup_ctrl[type]; 314 ctrl = &swap_cgroup_ctrl[type];
315 315
316 for (idx = 0; idx < ctrl->length; idx++) { 316 for (idx = 0; idx < ctrl->length; idx++) {
@@ -347,9 +347,6 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
347 struct swap_cgroup *sc; 347 struct swap_cgroup *sc;
348 unsigned short old; 348 unsigned short old;
349 349
350 if (!do_swap_account)
351 return 0;
352
353 ctrl = &swap_cgroup_ctrl[type]; 350 ctrl = &swap_cgroup_ctrl[type];
354 351
355 mappage = ctrl->map[idx]; 352 mappage = ctrl->map[idx];
@@ -378,9 +375,6 @@ unsigned short lookup_swap_cgroup(swp_entry_t ent)
378 struct swap_cgroup *sc; 375 struct swap_cgroup *sc;
379 unsigned short ret; 376 unsigned short ret;
380 377
381 if (!do_swap_account)
382 return 0;
383
384 ctrl = &swap_cgroup_ctrl[type]; 378 ctrl = &swap_cgroup_ctrl[type];
385 mappage = ctrl->map[idx]; 379 mappage = ctrl->map[idx];
386 sc = page_address(mappage); 380 sc = page_address(mappage);
diff --git a/mm/page_io.c b/mm/page_io.c
index 3023c475e041..c6f3e5071de3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -120,7 +120,7 @@ out:
120 return ret; 120 return ret;
121} 121}
122 122
123int swap_readpage(struct file *file, struct page *page) 123int swap_readpage(struct page *page)
124{ 124{
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d525513..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -133,15 +133,12 @@ out:
133} 133}
134 134
135/* 135/*
136 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 136 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
137 * the pages first, then submits them all for I/O. This avoids the very bad 137 * the pages first, then submits them all for I/O. This avoids the very bad
138 * behaviour which would occur if page allocations are causing VM writeback. 138 * behaviour which would occur if page allocations are causing VM writeback.
139 * We really don't want to intermingle reads and writes like that. 139 * We really don't want to intermingle reads and writes like that.
140 * 140 *
141 * Returns the number of pages requested, or the maximum amount of I/O allowed. 141 * Returns the number of pages requested, or the maximum amount of I/O allowed.
142 *
143 * do_page_cache_readahead() returns -1 if it encountered request queue
144 * congestion.
145 */ 142 */
146static int 143static int
147__do_page_cache_readahead(struct address_space *mapping, struct file *filp, 144__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 207 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
211 return -EINVAL; 208 return -EINVAL;
212 209
210 nr_to_read = max_sane_readahead(nr_to_read);
213 while (nr_to_read) { 211 while (nr_to_read) {
214 int err; 212 int err;
215 213
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
231} 229}
232 230
233/* 231/*
234 * This version skips the IO if the queue is read-congested, and will tell the
235 * block layer to abandon the readahead if request allocation would block.
236 *
237 * force_page_cache_readahead() will ignore queue congestion and will block on
238 * request queues.
239 */
240int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
241 pgoff_t offset, unsigned long nr_to_read)
242{
243 if (bdi_read_congested(mapping->backing_dev_info))
244 return -1;
245
246 return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
247}
248
249/*
250 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 232 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
251 * sensible upper limit. 233 * sensible upper limit.
252 */ 234 */
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
259/* 241/*
260 * Submit IO for the read-ahead request in file_ra_state. 242 * Submit IO for the read-ahead request in file_ra_state.
261 */ 243 */
262static unsigned long ra_submit(struct file_ra_state *ra, 244unsigned long ra_submit(struct file_ra_state *ra,
263 struct address_space *mapping, struct file *filp) 245 struct address_space *mapping, struct file *filp)
264{ 246{
265 int actual; 247 int actual;
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
348 */ 330 */
349 331
350/* 332/*
333 * Count contiguously cached pages from @offset-1 to @offset-@max,
334 * this count is a conservative estimation of
335 * - length of the sequential read sequence, or
336 * - thrashing threshold in memory tight systems
337 */
338static pgoff_t count_history_pages(struct address_space *mapping,
339 struct file_ra_state *ra,
340 pgoff_t offset, unsigned long max)
341{
342 pgoff_t head;
343
344 rcu_read_lock();
345 head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
346 rcu_read_unlock();
347
348 return offset - 1 - head;
349}
350
351/*
352 * page cache context based read-ahead
353 */
354static int try_context_readahead(struct address_space *mapping,
355 struct file_ra_state *ra,
356 pgoff_t offset,
357 unsigned long req_size,
358 unsigned long max)
359{
360 pgoff_t size;
361
362 size = count_history_pages(mapping, ra, offset, max);
363
364 /*
365 * no history pages:
366 * it could be a random read
367 */
368 if (!size)
369 return 0;
370
371 /*
372 * starts from beginning of file:
373 * it is a strong indication of long-run stream (or whole-file-read)
374 */
375 if (size >= offset)
376 size *= 2;
377
378 ra->start = offset;
379 ra->size = get_init_ra_size(size + req_size, max);
380 ra->async_size = ra->size;
381
382 return 1;
383}
384
385/*
351 * A minimal readahead algorithm for trivial sequential/random reads. 386 * A minimal readahead algorithm for trivial sequential/random reads.
352 */ 387 */
353static unsigned long 388static unsigned long
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
356 bool hit_readahead_marker, pgoff_t offset, 391 bool hit_readahead_marker, pgoff_t offset,
357 unsigned long req_size) 392 unsigned long req_size)
358{ 393{
359 int max = ra->ra_pages; /* max readahead pages */ 394 unsigned long max = max_sane_readahead(ra->ra_pages);
360 pgoff_t prev_offset; 395
361 int sequential; 396 /*
397 * start of file
398 */
399 if (!offset)
400 goto initial_readahead;
362 401
363 /* 402 /*
364 * It's the expected callback offset, assume sequential access. 403 * It's the expected callback offset, assume sequential access.
365 * Ramp up sizes, and push forward the readahead window. 404 * Ramp up sizes, and push forward the readahead window.
366 */ 405 */
367 if (offset && (offset == (ra->start + ra->size - ra->async_size) || 406 if ((offset == (ra->start + ra->size - ra->async_size) ||
368 offset == (ra->start + ra->size))) { 407 offset == (ra->start + ra->size))) {
369 ra->start += ra->size; 408 ra->start += ra->size;
370 ra->size = get_next_ra_size(ra, max); 409 ra->size = get_next_ra_size(ra, max);
371 ra->async_size = ra->size; 410 ra->async_size = ra->size;
372 goto readit; 411 goto readit;
373 } 412 }
374 413
375 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
376 sequential = offset - prev_offset <= 1UL || req_size > max;
377
378 /*
379 * Standalone, small read.
380 * Read as is, and do not pollute the readahead state.
381 */
382 if (!hit_readahead_marker && !sequential) {
383 return __do_page_cache_readahead(mapping, filp,
384 offset, req_size, 0);
385 }
386
387 /* 414 /*
388 * Hit a marked page without valid readahead state. 415 * Hit a marked page without valid readahead state.
389 * E.g. interleaved reads. 416 * E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
394 pgoff_t start; 421 pgoff_t start;
395 422
396 rcu_read_lock(); 423 rcu_read_lock();
397 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); 424 start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
398 rcu_read_unlock(); 425 rcu_read_unlock();
399 426
400 if (!start || start - offset > max) 427 if (!start || start - offset > max)
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
402 429
403 ra->start = start; 430 ra->start = start;
404 ra->size = start - offset; /* old async_size */ 431 ra->size = start - offset; /* old async_size */
432 ra->size += req_size;
405 ra->size = get_next_ra_size(ra, max); 433 ra->size = get_next_ra_size(ra, max);
406 ra->async_size = ra->size; 434 ra->async_size = ra->size;
407 goto readit; 435 goto readit;
408 } 436 }
409 437
410 /* 438 /*
411 * It may be one of 439 * oversize read
412 * - first read on start of file 440 */
413 * - sequential cache miss 441 if (req_size > max)
414 * - oversize random read 442 goto initial_readahead;
415 * Start readahead for it. 443
444 /*
445 * sequential cache miss
446 */
447 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
448 goto initial_readahead;
449
450 /*
 451 * Query the page cache and look for the traces (cached history pages)
452 * that a sequential stream would leave behind.
453 */
454 if (try_context_readahead(mapping, ra, offset, req_size, max))
455 goto readit;
456
457 /*
458 * standalone, small random read
459 * Read as is, and do not pollute the readahead state.
416 */ 460 */
461 return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
462
463initial_readahead:
417 ra->start = offset; 464 ra->start = offset;
418 ra->size = get_init_ra_size(req_size, max); 465 ra->size = get_init_ra_size(req_size, max);
419 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 466 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
420 467
421readit: 468readit:
469 /*
470 * Will this read hit the readahead marker made by itself?
471 * If so, trigger the readahead marker hit now, and merge
 472 * the resulting next readahead window into the current one.
473 */
474 if (offset == ra->start && ra->size == ra->async_size) {
475 ra->async_size = get_next_ra_size(ra, max);
476 ra->size += ra->async_size;
477 }
478
422 return ra_submit(ra, mapping, filp); 479 return ra_submit(ra, mapping, filp);
423} 480}
424 481
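The new context readahead path above sizes the next window from how many pages immediately before the miss are already cached (count_history_pages() via radix_tree_prev_hole()). A userspace mock of that sizing; the array stands in for the page cache, and a plain clamp stands in for get_init_ra_size(), whose ramp-up logic is outside this hunk:

#include <stdio.h>
#include <string.h>

#define NR_PAGES 64

static int cached[NR_PAGES];

/* analogue of count_history_pages(): length of the contiguous cached run
 * ending at offset - 1, capped at max */
static unsigned long count_history(unsigned long offset, unsigned long max)
{
    unsigned long n = 0;

    while (n < max && offset >= n + 1 && cached[offset - 1 - n])
        n++;
    return n;
}

int main(void)
{
    unsigned long offset = 24, req_size = 4, max = 32;
    unsigned long history, size, ra_size;

    memset(cached, 0, sizeof(cached));
    for (unsigned long i = 0; i < offset; i++)
        cached[i] = 1;                  /* pages 0..23 already in the cache */

    history = count_history(offset, max);
    if (!history) {
        puts("no history: treat as a random read");
        return 0;
    }
    size = history;
    if (size >= offset)                 /* stream runs from the start of file */
        size *= 2;
    ra_size = size + req_size;
    if (ra_size > max)
        ra_size = max;                  /* stand-in for get_init_ra_size() */
    printf("history=%lu pages -> next window %lu pages\n", history, ra_size);
    return 0;
}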
diff --git a/mm/rmap.c b/mm/rmap.c
index 23122af32611..836c6c63e1f2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
333 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
334 */ 334 */
335static int page_referenced_one(struct page *page, 335static int page_referenced_one(struct page *page,
336 struct vm_area_struct *vma, unsigned int *mapcount) 336 struct vm_area_struct *vma,
337 unsigned int *mapcount,
338 unsigned long *vm_flags)
337{ 339{
338 struct mm_struct *mm = vma->vm_mm; 340 struct mm_struct *mm = vma->vm_mm;
339 unsigned long address; 341 unsigned long address;
@@ -381,11 +383,14 @@ out_unmap:
381 (*mapcount)--; 383 (*mapcount)--;
382 pte_unmap_unlock(pte, ptl); 384 pte_unmap_unlock(pte, ptl);
383out: 385out:
386 if (referenced)
387 *vm_flags |= vma->vm_flags;
384 return referenced; 388 return referenced;
385} 389}
386 390
387static int page_referenced_anon(struct page *page, 391static int page_referenced_anon(struct page *page,
388 struct mem_cgroup *mem_cont) 392 struct mem_cgroup *mem_cont,
393 unsigned long *vm_flags)
389{ 394{
390 unsigned int mapcount; 395 unsigned int mapcount;
391 struct anon_vma *anon_vma; 396 struct anon_vma *anon_vma;
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page,
405 */ 410 */
406 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 411 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
407 continue; 412 continue;
408 referenced += page_referenced_one(page, vma, &mapcount); 413 referenced += page_referenced_one(page, vma,
414 &mapcount, vm_flags);
409 if (!mapcount) 415 if (!mapcount)
410 break; 416 break;
411 } 417 }
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page,
418 * page_referenced_file - referenced check for object-based rmap 424 * page_referenced_file - referenced check for object-based rmap
419 * @page: the page we're checking references on. 425 * @page: the page we're checking references on.
420 * @mem_cont: target memory controller 426 * @mem_cont: target memory controller
 427 * @vm_flags: collects the vm_flags of those vmas which actually referenced the page
421 * 428 *
422 * For an object-based mapped page, find all the places it is mapped and 429 * For an object-based mapped page, find all the places it is mapped and
423 * check/clear the referenced flag. This is done by following the page->mapping 430 * check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page,
427 * This function is only called from page_referenced for object-based pages. 434 * This function is only called from page_referenced for object-based pages.
428 */ 435 */
429static int page_referenced_file(struct page *page, 436static int page_referenced_file(struct page *page,
430 struct mem_cgroup *mem_cont) 437 struct mem_cgroup *mem_cont,
438 unsigned long *vm_flags)
431{ 439{
432 unsigned int mapcount; 440 unsigned int mapcount;
433 struct address_space *mapping = page->mapping; 441 struct address_space *mapping = page->mapping;
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page,
467 */ 475 */
468 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 476 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
469 continue; 477 continue;
470 referenced += page_referenced_one(page, vma, &mapcount); 478 referenced += page_referenced_one(page, vma,
479 &mapcount, vm_flags);
471 if (!mapcount) 480 if (!mapcount)
472 break; 481 break;
473 } 482 }
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page,
481 * @page: the page to test 490 * @page: the page to test
482 * @is_locked: caller holds lock on the page 491 * @is_locked: caller holds lock on the page
483 * @mem_cont: target memory controller 492 * @mem_cont: target memory controller
 493 * @vm_flags: collects the vm_flags of those vmas which actually referenced the page
484 * 494 *
485 * Quick test_and_clear_referenced for all mappings to a page, 495 * Quick test_and_clear_referenced for all mappings to a page,
486 * returns the number of ptes which referenced the page. 496 * returns the number of ptes which referenced the page.
487 */ 497 */
488int page_referenced(struct page *page, int is_locked, 498int page_referenced(struct page *page,
489 struct mem_cgroup *mem_cont) 499 int is_locked,
500 struct mem_cgroup *mem_cont,
501 unsigned long *vm_flags)
490{ 502{
491 int referenced = 0; 503 int referenced = 0;
492 504
493 if (TestClearPageReferenced(page)) 505 if (TestClearPageReferenced(page))
494 referenced++; 506 referenced++;
495 507
508 *vm_flags = 0;
496 if (page_mapped(page) && page->mapping) { 509 if (page_mapped(page) && page->mapping) {
497 if (PageAnon(page)) 510 if (PageAnon(page))
498 referenced += page_referenced_anon(page, mem_cont); 511 referenced += page_referenced_anon(page, mem_cont,
512 vm_flags);
499 else if (is_locked) 513 else if (is_locked)
500 referenced += page_referenced_file(page, mem_cont); 514 referenced += page_referenced_file(page, mem_cont,
515 vm_flags);
501 else if (!trylock_page(page)) 516 else if (!trylock_page(page))
502 referenced++; 517 referenced++;
503 else { 518 else {
504 if (page->mapping) 519 if (page->mapping)
505 referenced += 520 referenced += page_referenced_file(page,
506 page_referenced_file(page, mem_cont); 521 mem_cont, vm_flags);
507 unlock_page(page); 522 unlock_page(page);
508 } 523 }
509 } 524 }
@@ -688,8 +703,10 @@ void page_add_new_anon_rmap(struct page *page,
688 */ 703 */
689void page_add_file_rmap(struct page *page) 704void page_add_file_rmap(struct page *page)
690{ 705{
691 if (atomic_inc_and_test(&page->_mapcount)) 706 if (atomic_inc_and_test(&page->_mapcount)) {
692 __inc_zone_page_state(page, NR_FILE_MAPPED); 707 __inc_zone_page_state(page, NR_FILE_MAPPED);
708 mem_cgroup_update_mapped_file_stat(page, 1);
709 }
693} 710}
694 711
695#ifdef CONFIG_DEBUG_VM 712#ifdef CONFIG_DEBUG_VM
@@ -738,6 +755,7 @@ void page_remove_rmap(struct page *page)
738 mem_cgroup_uncharge_page(page); 755 mem_cgroup_uncharge_page(page);
739 __dec_zone_page_state(page, 756 __dec_zone_page_state(page,
740 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 757 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
758 mem_cgroup_update_mapped_file_stat(page, -1);
741 /* 759 /*
742 * It would be tidy to reset the PageAnon mapping here, 760 * It would be tidy to reset the PageAnon mapping here,
743 * but that might overwrite a racing page_add_anon_rmap 761 * but that might overwrite a racing page_add_anon_rmap
@@ -1202,7 +1220,6 @@ int try_to_unmap(struct page *page, int migration)
1202 return ret; 1220 return ret;
1203} 1221}
1204 1222
1205#ifdef CONFIG_UNEVICTABLE_LRU
1206/** 1223/**
1207 * try_to_munlock - try to munlock a page 1224 * try_to_munlock - try to munlock a page
1208 * @page: the page to be munlocked 1225 * @page: the page to be munlocked
@@ -1226,4 +1243,4 @@ int try_to_munlock(struct page *page)
1226 else 1243 else
1227 return try_to_unmap_file(page, 1, 0); 1244 return try_to_unmap_file(page, 1, 0);
1228} 1245}
1229#endif 1246
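The rmap changes above add an out-parameter: page_referenced() zeroes *vm_flags and page_referenced_one() ORs in vma->vm_flags for every mapping that referenced the page, presumably so reclaim can react to flags such as VM_LOCKED or VM_EXEC. A userspace mock of that contract (the VMA list is invented; the two flag values follow the usual kernel definitions):

#include <stdio.h>

#define VM_EXEC   0x00000004UL
#define VM_LOCKED 0x00002000UL

struct vma_mock { unsigned long vm_flags; int referenced; };

static int page_referenced_mock(const struct vma_mock *vmas, int n,
                                unsigned long *vm_flags)
{
    int referenced = 0;

    *vm_flags = 0;                           /* as in page_referenced() */
    for (int i = 0; i < n; i++) {
        if (!vmas[i].referenced)
            continue;
        referenced++;
        *vm_flags |= vmas[i].vm_flags;       /* as in page_referenced_one() */
    }
    return referenced;
}

int main(void)
{
    const struct vma_mock vmas[] = {
        { VM_EXEC,   1 },                    /* e.g. a mapped executable */
        { 0,         0 },
        { VM_LOCKED, 1 },
    };
    unsigned long vm_flags;
    int n = page_referenced_mock(vmas, 3, &vm_flags);

    printf("referenced by %d mappings, flags:%s%s\n", n,
           (vm_flags & VM_EXEC)   ? " VM_EXEC"   : "",
           (vm_flags & VM_LOCKED) ? " VM_LOCKED" : "");
    return 0;
}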
diff --git a/mm/shmem.c b/mm/shmem.c
index 0132fbd45a23..e89d7ec18eda 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 swap_free(swap); 1100 swapcache_free(swap, NULL);
1101redirty: 1101redirty:
1102 set_page_dirty(page); 1102 set_page_dirty(page);
1103 if (wbc->for_reclaim) 1103 if (wbc->for_reclaim)
@@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2612 * @size: size to be set for the file 2612 * @size: size to be set for the file
2613 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2613 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2614 */ 2614 */
2615struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) 2615struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2616{ 2616{
2617 int error; 2617 int error;
2618 struct file *file; 2618 struct file *file;
diff --git a/mm/slab.c b/mm/slab.c
index 18e3164de09a..d08692303f6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,6 +114,7 @@
114#include <linux/rtmutex.h> 114#include <linux/rtmutex.h>
115#include <linux/reciprocal_div.h> 115#include <linux/reciprocal_div.h>
116#include <linux/debugobjects.h> 116#include <linux/debugobjects.h>
117#include <linux/kmemcheck.h>
117 118
118#include <asm/cacheflush.h> 119#include <asm/cacheflush.h>
119#include <asm/tlbflush.h> 120#include <asm/tlbflush.h>
@@ -179,13 +180,13 @@
179 SLAB_STORE_USER | \ 180 SLAB_STORE_USER | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
182 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) 183 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
183#else 184#else
184# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 185# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
185 SLAB_CACHE_DMA | \ 186 SLAB_CACHE_DMA | \
186 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 187 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
187 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ 188 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
188 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) 189 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
189#endif 190#endif
190 191
191/* 192/*
@@ -380,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
380 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 381 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
381 } while (0) 382 } while (0)
382 383
383/*
384 * struct kmem_cache
385 *
386 * manages a cache.
387 */
388
389struct kmem_cache {
390/* 1) per-cpu data, touched during every alloc/free */
391 struct array_cache *array[NR_CPUS];
392/* 2) Cache tunables. Protected by cache_chain_mutex */
393 unsigned int batchcount;
394 unsigned int limit;
395 unsigned int shared;
396
397 unsigned int buffer_size;
398 u32 reciprocal_buffer_size;
399/* 3) touched by every alloc & free from the backend */
400
401 unsigned int flags; /* constant flags */
402 unsigned int num; /* # of objs per slab */
403
404/* 4) cache_grow/shrink */
405 /* order of pgs per slab (2^n) */
406 unsigned int gfporder;
407
408 /* force GFP flags, e.g. GFP_DMA */
409 gfp_t gfpflags;
410
411 size_t colour; /* cache colouring range */
412 unsigned int colour_off; /* colour offset */
413 struct kmem_cache *slabp_cache;
414 unsigned int slab_size;
415 unsigned int dflags; /* dynamic flags */
416
417 /* constructor func */
418 void (*ctor)(void *obj);
419
420/* 5) cache creation/removal */
421 const char *name;
422 struct list_head next;
423
424/* 6) statistics */
425#if STATS
426 unsigned long num_active;
427 unsigned long num_allocations;
428 unsigned long high_mark;
429 unsigned long grown;
430 unsigned long reaped;
431 unsigned long errors;
432 unsigned long max_freeable;
433 unsigned long node_allocs;
434 unsigned long node_frees;
435 unsigned long node_overflow;
436 atomic_t allochit;
437 atomic_t allocmiss;
438 atomic_t freehit;
439 atomic_t freemiss;
440#endif
441#if DEBUG
442 /*
443 * If debugging is enabled, then the allocator can add additional
444 * fields and/or padding to every object. buffer_size contains the total
445 * object size including these internal fields, the following two
446 * variables contain the offset to the user object and its size.
447 */
448 int obj_offset;
449 int obj_size;
450#endif
451 /*
452 * We put nodelists[] at the end of kmem_cache, because we want to size
453 * this array to nr_node_ids slots instead of MAX_NUMNODES
454 * (see kmem_cache_init())
455 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
456 * is statically defined, so we reserve the max number of nodes.
457 */
458 struct kmem_list3 *nodelists[MAX_NUMNODES];
459 /*
460 * Do not add fields after nodelists[]
461 */
462};
463
464#define CFLGS_OFF_SLAB (0x80000000UL) 384#define CFLGS_OFF_SLAB (0x80000000UL)
465#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 385#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
466 386
@@ -898,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
898 */ 818 */
899 819
900static int use_alien_caches __read_mostly = 1; 820static int use_alien_caches __read_mostly = 1;
901static int numa_platform __read_mostly = 1;
902static int __init noaliencache_setup(char *s) 821static int __init noaliencache_setup(char *s)
903{ 822{
904 use_alien_caches = 0; 823 use_alien_caches = 0;
@@ -1457,10 +1376,8 @@ void __init kmem_cache_init(void)
1457 int order; 1376 int order;
1458 int node; 1377 int node;
1459 1378
1460 if (num_possible_nodes() == 1) { 1379 if (num_possible_nodes() == 1)
1461 use_alien_caches = 0; 1380 use_alien_caches = 0;
1462 numa_platform = 0;
1463 }
1464 1381
1465 for (i = 0; i < NUM_INIT_LISTS; i++) { 1382 for (i = 0; i < NUM_INIT_LISTS; i++) {
1466 kmem_list3_init(&initkmem_list3[i]); 1383 kmem_list3_init(&initkmem_list3[i]);
@@ -1707,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1707 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1624 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1708 flags |= __GFP_RECLAIMABLE; 1625 flags |= __GFP_RECLAIMABLE;
1709 1626
1710 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1627 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1711 if (!page) 1628 if (!page)
1712 return NULL; 1629 return NULL;
1713 1630
@@ -1720,6 +1637,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1720 NR_SLAB_UNRECLAIMABLE, nr_pages); 1637 NR_SLAB_UNRECLAIMABLE, nr_pages);
1721 for (i = 0; i < nr_pages; i++) 1638 for (i = 0; i < nr_pages; i++)
1722 __SetPageSlab(page + i); 1639 __SetPageSlab(page + i);
1640
1641 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1642 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1643
1644 if (cachep->ctor)
1645 kmemcheck_mark_uninitialized_pages(page, nr_pages);
1646 else
1647 kmemcheck_mark_unallocated_pages(page, nr_pages);
1648 }
1649
1723 return page_address(page); 1650 return page_address(page);
1724} 1651}
1725 1652
@@ -1732,6 +1659,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1732 struct page *page = virt_to_page(addr); 1659 struct page *page = virt_to_page(addr);
1733 const unsigned long nr_freed = i; 1660 const unsigned long nr_freed = i;
1734 1661
1662 kmemcheck_free_shadow(page, cachep->gfporder);
1663
1735 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1664 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1736 sub_zone_page_state(page_zone(page), 1665 sub_zone_page_state(page_zone(page),
1737 NR_SLAB_RECLAIMABLE, nr_freed); 1666 NR_SLAB_RECLAIMABLE, nr_freed);
@@ -2379,6 +2308,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2379 /* really off slab. No need for manual alignment */ 2308 /* really off slab. No need for manual alignment */
2380 slab_size = 2309 slab_size =
2381 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2310 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2311
2312#ifdef CONFIG_PAGE_POISONING
2313 /* If we're going to use the generic kernel_map_pages()
2314 * poisoning, then it's going to smash the contents of
2315 * the redzone and userword anyhow, so switch them off.
2316 */
2317 if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
2318 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2319#endif
2382 } 2320 }
2383 2321
2384 cachep->colour_off = cache_line_size(); 2322 cachep->colour_off = cache_line_size();
@@ -3261,7 +3199,7 @@ retry:
3261 if (local_flags & __GFP_WAIT) 3199 if (local_flags & __GFP_WAIT)
3262 local_irq_enable(); 3200 local_irq_enable();
3263 kmem_flagcheck(cache, flags); 3201 kmem_flagcheck(cache, flags);
3264 obj = kmem_getpages(cache, local_flags, -1); 3202 obj = kmem_getpages(cache, local_flags, numa_node_id());
3265 if (local_flags & __GFP_WAIT) 3203 if (local_flags & __GFP_WAIT)
3266 local_irq_disable(); 3204 local_irq_disable();
3267 if (obj) { 3205 if (obj) {
@@ -3407,6 +3345,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3407 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3345 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
3408 flags); 3346 flags);
3409 3347
3348 if (likely(ptr))
3349 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
3350
3410 if (unlikely((flags & __GFP_ZERO) && ptr)) 3351 if (unlikely((flags & __GFP_ZERO) && ptr))
3411 memset(ptr, 0, obj_size(cachep)); 3352 memset(ptr, 0, obj_size(cachep));
3412 3353
@@ -3467,6 +3408,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3467 flags); 3408 flags);
3468 prefetchw(objp); 3409 prefetchw(objp);
3469 3410
3411 if (likely(objp))
3412 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
3413
3470 if (unlikely((flags & __GFP_ZERO) && objp)) 3414 if (unlikely((flags & __GFP_ZERO) && objp))
3471 memset(objp, 0, obj_size(cachep)); 3415 memset(objp, 0, obj_size(cachep));
3472 3416
@@ -3583,6 +3527,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3583 kmemleak_free_recursive(objp, cachep->flags); 3527 kmemleak_free_recursive(objp, cachep->flags);
3584 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3528 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3585 3529
3530 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3531
3586 /* 3532 /*
3587 * Skip calling cache_free_alien() when the platform is not numa. 3533 * Skip calling cache_free_alien() when the platform is not numa.
3588 * This will avoid cache misses that happen while accessing slabp (which 3534 * This will avoid cache misses that happen while accessing slabp (which
@@ -3590,7 +3536,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3590 * variable to skip the call, which is mostly likely to be present in 3536 * variable to skip the call, which is mostly likely to be present in
3591 * the cache. 3537 * the cache.
3592 */ 3538 */
3593 if (numa_platform && cache_free_alien(cachep, objp)) 3539 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3594 return; 3540 return;
3595 3541
3596 if (likely(ac->avail < ac->limit)) { 3542 if (likely(ac->avail < ac->limit)) {
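The kmemcheck hooks added to kmem_getpages() above mark a cache's fresh pages "uninitialized" when the cache has a constructor (the ctor writes the objects, so reads are tracked per object) and "unallocated" otherwise, and skip tracking entirely for SLAB_NOTRACK caches or when kmemcheck is disabled. A compact mock of just that decision; the types and names here are inventions for the sketch, not kmemcheck's:

#include <stdio.h>

enum shadow { SHADOW_UNALLOCATED, SHADOW_UNINITIALIZED, SHADOW_UNTRACKED };

struct cache_mock {
    const char *name;
    int has_ctor;     /* does the cache have a constructor? */
    int notrack;      /* SLAB_NOTRACK analogue */
};

static enum shadow mark_new_pages(const struct cache_mock *c, int kmemcheck_on)
{
    if (!kmemcheck_on || c->notrack)
        return SHADOW_UNTRACKED;
    return c->has_ctor ? SHADOW_UNINITIALIZED : SHADOW_UNALLOCATED;
}

int main(void)
{
    const struct cache_mock caches[] = {
        { "with-ctor",    1, 0 },
        { "without-ctor", 0, 0 },
        { "notrack",      0, 1 },
    };
    const char *names[] = { "unallocated", "uninitialized", "untracked" };

    for (unsigned i = 0; i < 3; i++)
        printf("%-12s -> %s\n", caches[i].name,
               names[mark_new_pages(&caches[i], 1)]);
    return 0;
}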
diff --git a/mm/slob.c b/mm/slob.c
index 12f261499925..c78742defdc6 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -46,7 +46,7 @@
46 * NUMA support in SLOB is fairly simplistic, pushing most of the real 46 * NUMA support in SLOB is fairly simplistic, pushing most of the real
47 * logic down to the page allocator, and simply doing the node accounting 47 * logic down to the page allocator, and simply doing the node accounting
48 * on the upper levels. In the event that a node id is explicitly 48 * on the upper levels. In the event that a node id is explicitly
49 * provided, alloc_pages_node() with the specified node id is used 49 * provided, alloc_pages_exact_node() with the specified node id is used
50 * instead. The common case (or when the node id isn't explicitly provided) 50 * instead. The common case (or when the node id isn't explicitly provided)
51 * will default to the current node, as per numa_node_id(). 51 * will default to the current node, as per numa_node_id().
52 * 52 *
@@ -133,17 +133,17 @@ static LIST_HEAD(free_slob_large);
133 */ 133 */
134static inline int is_slob_page(struct slob_page *sp) 134static inline int is_slob_page(struct slob_page *sp)
135{ 135{
136 return PageSlobPage((struct page *)sp); 136 return PageSlab((struct page *)sp);
137} 137}
138 138
139static inline void set_slob_page(struct slob_page *sp) 139static inline void set_slob_page(struct slob_page *sp)
140{ 140{
141 __SetPageSlobPage((struct page *)sp); 141 __SetPageSlab((struct page *)sp);
142} 142}
143 143
144static inline void clear_slob_page(struct slob_page *sp) 144static inline void clear_slob_page(struct slob_page *sp)
145{ 145{
146 __ClearPageSlobPage((struct page *)sp); 146 __ClearPageSlab((struct page *)sp);
147} 147}
148 148
149static inline struct slob_page *slob_page(const void *addr) 149static inline struct slob_page *slob_page(const void *addr)
@@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
244 244
245#ifdef CONFIG_NUMA 245#ifdef CONFIG_NUMA
246 if (node != -1) 246 if (node != -1)
247 page = alloc_pages_node(node, gfp, order); 247 page = alloc_pages_exact_node(node, gfp, order);
248 else 248 else
249#endif 249#endif
250 page = alloc_pages(gfp, order); 250 page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 30354bfeb43d..4c6449310a0e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -18,6 +18,7 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/kmemtrace.h> 20#include <linux/kmemtrace.h>
21#include <linux/kmemcheck.h>
21#include <linux/cpu.h> 22#include <linux/cpu.h>
22#include <linux/cpuset.h> 23#include <linux/cpuset.h>
23#include <linux/kmemleak.h> 24#include <linux/kmemleak.h>
@@ -147,7 +148,7 @@
147 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 148 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
148 149
149#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 150#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
150 SLAB_CACHE_DMA) 151 SLAB_CACHE_DMA | SLAB_NOTRACK)
151 152
152#ifndef ARCH_KMALLOC_MINALIGN 153#ifndef ARCH_KMALLOC_MINALIGN
153#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 154#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
@@ -839,6 +840,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node)
839 return atomic_long_read(&n->nr_slabs); 840 return atomic_long_read(&n->nr_slabs);
840} 841}
841 842
843static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
844{
845 return atomic_long_read(&n->nr_slabs);
846}
847
842static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 848static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
843{ 849{
844 struct kmem_cache_node *n = get_node(s, node); 850 struct kmem_cache_node *n = get_node(s, node);
@@ -1057,6 +1063,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1057 1063
1058static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1064static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1059 { return 0; } 1065 { return 0; }
1066static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1067 { return 0; }
1060static inline void inc_slabs_node(struct kmem_cache *s, int node, 1068static inline void inc_slabs_node(struct kmem_cache *s, int node,
1061 int objects) {} 1069 int objects) {}
1062static inline void dec_slabs_node(struct kmem_cache *s, int node, 1070static inline void dec_slabs_node(struct kmem_cache *s, int node,
@@ -1071,6 +1079,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1071{ 1079{
1072 int order = oo_order(oo); 1080 int order = oo_order(oo);
1073 1081
1082 flags |= __GFP_NOTRACK;
1083
1074 if (node == -1) 1084 if (node == -1)
1075 return alloc_pages(flags, order); 1085 return alloc_pages(flags, order);
1076 else 1086 else
@@ -1098,6 +1108,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1098 1108
1099 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1109 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
1100 } 1110 }
1111
1112 if (kmemcheck_enabled
1113 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
1114 {
1115 int pages = 1 << oo_order(oo);
1116
1117 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
1118
1119 /*
1120 * Objects from caches that have a constructor don't get
1121 * cleared when they're allocated, so we need to do it here.
1122 */
1123 if (s->ctor)
1124 kmemcheck_mark_uninitialized_pages(page, pages);
1125 else
1126 kmemcheck_mark_unallocated_pages(page, pages);
1127 }
1128
1101 page->objects = oo_objects(oo); 1129 page->objects = oo_objects(oo);
1102 mod_zone_page_state(page_zone(page), 1130 mod_zone_page_state(page_zone(page),
1103 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1131 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
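Editor's note: the hunk above wires kmemcheck into allocate_slab(): new slab pages get a shadow region, and pages backing caches with a constructor are marked uninitialized (the ctor writes them immediately) while everything else is marked unallocated. Below is a minimal userspace sketch of that shadow-state idea, assuming a byte-granular shadow array and invented state names; it is not the kmemcheck API.

/* Toy model of kmemcheck-style shadow state tracking (not the kernel API). */
#include <stdio.h>

enum shadow { SHADOW_UNALLOCATED, SHADOW_UNINITIALIZED, SHADOW_INITIALIZED };

#define POOL_SIZE 16
static unsigned char pool[POOL_SIZE];
static enum shadow shadow[POOL_SIZE];

/* Corresponds to kmemcheck_mark_unallocated_pages() in the hunk above. */
static void mark_unallocated(size_t start, size_t len)
{
	for (size_t i = start; i < start + len; i++)
		shadow[i] = SHADOW_UNALLOCATED;
}

/* Corresponds to kmemcheck_mark_uninitialized_pages(): a cache with a
 * constructor writes its objects right away, so the pages cannot be treated
 * as unallocated, only as not-yet-valid data. */
static void mark_uninitialized(size_t start, size_t len)
{
	for (size_t i = start; i < start + len; i++)
		shadow[i] = SHADOW_UNINITIALIZED;
}

static void check_read(size_t idx)
{
	if (shadow[idx] != SHADOW_INITIALIZED)
		printf("WARNING: read of %s byte %zu\n",
		       shadow[idx] == SHADOW_UNALLOCATED ?
		       "unallocated" : "uninitialized", idx);
}

int main(void)
{
	int cache_has_ctor = 1;	/* mirrors the s->ctor test in allocate_slab() */

	if (cache_has_ctor)
		mark_uninitialized(0, POOL_SIZE);
	else
		mark_unallocated(0, POOL_SIZE);

	check_read(3);			/* would trip a kmemcheck-style report */
	pool[3] = 42;
	shadow[3] = SHADOW_INITIALIZED;
	check_read(3);			/* now silent */
	return 0;
}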
@@ -1171,6 +1199,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1171 __ClearPageSlubDebug(page); 1199 __ClearPageSlubDebug(page);
1172 } 1200 }
1173 1201
1202 kmemcheck_free_shadow(page, compound_order(page));
1203
1174 mod_zone_page_state(page_zone(page), 1204 mod_zone_page_state(page_zone(page),
1175 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1205 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1176 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1206 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -1491,6 +1521,65 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
1491 return 1; 1521 return 1;
1492} 1522}
1493 1523
1524static int count_free(struct page *page)
1525{
1526 return page->objects - page->inuse;
1527}
1528
1529static unsigned long count_partial(struct kmem_cache_node *n,
1530 int (*get_count)(struct page *))
1531{
1532 unsigned long flags;
1533 unsigned long x = 0;
1534 struct page *page;
1535
1536 spin_lock_irqsave(&n->list_lock, flags);
1537 list_for_each_entry(page, &n->partial, lru)
1538 x += get_count(page);
1539 spin_unlock_irqrestore(&n->list_lock, flags);
1540 return x;
1541}
1542
1543static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
1544{
1545#ifdef CONFIG_SLUB_DEBUG
1546 return atomic_long_read(&n->total_objects);
1547#else
1548 return 0;
1549#endif
1550}
1551
1552static noinline void
1553slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1554{
1555 int node;
1556
1557 printk(KERN_WARNING
1558 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1559 nid, gfpflags);
1560 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
1561 "default order: %d, min order: %d\n", s->name, s->objsize,
1562 s->size, oo_order(s->oo), oo_order(s->min));
1563
1564 for_each_online_node(node) {
1565 struct kmem_cache_node *n = get_node(s, node);
1566 unsigned long nr_slabs;
1567 unsigned long nr_objs;
1568 unsigned long nr_free;
1569
1570 if (!n)
1571 continue;
1572
1573 nr_free = count_partial(n, count_free);
1574 nr_slabs = node_nr_slabs(n);
1575 nr_objs = node_nr_objs(n);
1576
1577 printk(KERN_WARNING
1578 " node %d: slabs: %ld, objs: %ld, free: %ld\n",
1579 node, nr_slabs, nr_objs, nr_free);
1580 }
1581}
1582
1494/* 1583/*
1495 * Slow path. The lockless freelist is empty or we need to perform 1584 * Slow path. The lockless freelist is empty or we need to perform
1496 * debugging duties. 1585 * debugging duties.
@@ -1572,6 +1661,8 @@ new_slab:
1572 c->page = new; 1661 c->page = new;
1573 goto load_freelist; 1662 goto load_freelist;
1574 } 1663 }
1664 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1665 slab_out_of_memory(s, gfpflags, node);
1575 return NULL; 1666 return NULL;
1576debug: 1667debug:
1577 if (!alloc_debug_processing(s, c->page, object, addr)) 1668 if (!alloc_debug_processing(s, c->page, object, addr))
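Editor's note: the new slab_out_of_memory() report sums page->objects - page->inuse over each node's partial list through count_partial(), which was moved up so it is available outside CONFIG_SLUB_DEBUG. A hedged userspace sketch of that accounting follows, with an array standing in for the partial list and no list_lock; the names are illustrative, not SLUB's.

/* Sketch of the free-object accounting used by slab_out_of_memory(). */
#include <stdio.h>

struct slab_page {
	int objects;	/* capacity of the slab */
	int inuse;	/* allocated objects */
};

/* Mirrors count_free(): free slots on one slab. */
static int count_free(const struct slab_page *p)
{
	return p->objects - p->inuse;
}

/* Mirrors count_partial(): sum a per-slab counter over a node's partial list. */
static unsigned long count_partial(const struct slab_page *partial, int nr,
				   int (*get_count)(const struct slab_page *))
{
	unsigned long x = 0;

	for (int i = 0; i < nr; i++)
		x += get_count(&partial[i]);
	return x;
}

int main(void)
{
	struct slab_page node0_partial[] = { {32, 30}, {32, 12}, {32, 31} };
	int nr = sizeof(node0_partial) / sizeof(node0_partial[0]);

	printf("node 0: partial slabs: %d, free objects: %lu\n",
	       nr, count_partial(node0_partial, nr, count_free));
	return 0;
}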
@@ -1626,7 +1717,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1626 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1717 if (unlikely((gfpflags & __GFP_ZERO) && object))
1627 memset(object, 0, objsize); 1718 memset(object, 0, objsize);
1628 1719
1720 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
1629 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1721 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
1722
1630 return object; 1723 return object;
1631} 1724}
1632 1725
@@ -1759,6 +1852,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
1759 kmemleak_free_recursive(x, s->flags); 1852 kmemleak_free_recursive(x, s->flags);
1760 local_irq_save(flags); 1853 local_irq_save(flags);
1761 c = get_cpu_slab(s, smp_processor_id()); 1854 c = get_cpu_slab(s, smp_processor_id());
1855 kmemcheck_slab_free(s, object, c->objsize);
1762 debug_check_no_locks_freed(object, c->objsize); 1856 debug_check_no_locks_freed(object, c->objsize);
1763 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1857 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1764 debug_check_no_obj_freed(object, c->objsize); 1858 debug_check_no_obj_freed(object, c->objsize);
@@ -2610,6 +2704,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2610 struct kmem_cache *s; 2704 struct kmem_cache *s;
2611 char *text; 2705 char *text;
2612 size_t realsize; 2706 size_t realsize;
2707 unsigned long slabflags;
2613 2708
2614 s = kmalloc_caches_dma[index]; 2709 s = kmalloc_caches_dma[index];
2615 if (s) 2710 if (s)
@@ -2631,9 +2726,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2631 (unsigned int)realsize); 2726 (unsigned int)realsize);
2632 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2727 s = kmalloc(kmem_size, flags & ~SLUB_DMA);
2633 2728
2729 /*
2730 * Must defer sysfs creation to a workqueue because we don't know
2731 * what context we are called from. Before sysfs comes up, we don't
2732 * need to do anything because our sysfs initcall will start by
2733 * adding all existing slabs to sysfs.
2734 */
2735 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2736 if (slab_state >= SYSFS)
2737 slabflags |= __SYSFS_ADD_DEFERRED;
2738
2634 if (!s || !text || !kmem_cache_open(s, flags, text, 2739 if (!s || !text || !kmem_cache_open(s, flags, text,
2635 realsize, ARCH_KMALLOC_MINALIGN, 2740 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2636 SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
2637 kfree(s); 2741 kfree(s);
2638 kfree(text); 2742 kfree(text);
2639 goto unlock_out; 2743 goto unlock_out;
@@ -2642,7 +2746,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2642 list_add(&s->list, &slab_caches); 2746 list_add(&s->list, &slab_caches);
2643 kmalloc_caches_dma[index] = s; 2747 kmalloc_caches_dma[index] = s;
2644 2748
2645 schedule_work(&sysfs_add_work); 2749 if (slab_state >= SYSFS)
2750 schedule_work(&sysfs_add_work);
2646 2751
2647unlock_out: 2752unlock_out:
2648 up_write(&slub_lock); 2753 up_write(&slub_lock);
@@ -2727,9 +2832,10 @@ EXPORT_SYMBOL(__kmalloc);
2727 2832
2728static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2833static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2729{ 2834{
2730 struct page *page = alloc_pages_node(node, flags | __GFP_COMP, 2835 struct page *page;
2731 get_order(size));
2732 2836
2837 flags |= __GFP_COMP | __GFP_NOTRACK;
2838 page = alloc_pages_node(node, flags, get_order(size));
2733 if (page) 2839 if (page)
2734 return page_address(page); 2840 return page_address(page);
2735 else 2841 else
@@ -3340,20 +3446,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3340} 3446}
3341 3447
3342#ifdef CONFIG_SLUB_DEBUG 3448#ifdef CONFIG_SLUB_DEBUG
3343static unsigned long count_partial(struct kmem_cache_node *n,
3344 int (*get_count)(struct page *))
3345{
3346 unsigned long flags;
3347 unsigned long x = 0;
3348 struct page *page;
3349
3350 spin_lock_irqsave(&n->list_lock, flags);
3351 list_for_each_entry(page, &n->partial, lru)
3352 x += get_count(page);
3353 spin_unlock_irqrestore(&n->list_lock, flags);
3354 return x;
3355}
3356
3357static int count_inuse(struct page *page) 3449static int count_inuse(struct page *page)
3358{ 3450{
3359 return page->inuse; 3451 return page->inuse;
@@ -3364,11 +3456,6 @@ static int count_total(struct page *page)
3364 return page->objects; 3456 return page->objects;
3365} 3457}
3366 3458
3367static int count_free(struct page *page)
3368{
3369 return page->objects - page->inuse;
3370}
3371
3372static int validate_slab(struct kmem_cache *s, struct page *page, 3459static int validate_slab(struct kmem_cache *s, struct page *page,
3373 unsigned long *map) 3460 unsigned long *map)
3374{ 3461{
@@ -3737,7 +3824,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3737 to_cpumask(l->cpus)); 3824 to_cpumask(l->cpus));
3738 } 3825 }
3739 3826
3740 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3827 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3741 len < PAGE_SIZE - 60) { 3828 len < PAGE_SIZE - 60) {
3742 len += sprintf(buf + len, " nodes="); 3829 len += sprintf(buf + len, " nodes=");
3743 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3830 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
@@ -4412,6 +4499,8 @@ static char *create_unique_id(struct kmem_cache *s)
4412 *p++ = 'a'; 4499 *p++ = 'a';
4413 if (s->flags & SLAB_DEBUG_FREE) 4500 if (s->flags & SLAB_DEBUG_FREE)
4414 *p++ = 'F'; 4501 *p++ = 'F';
4502 if (!(s->flags & SLAB_NOTRACK))
4503 *p++ = 't';
4415 if (p != name + 1) 4504 if (p != name + 1)
4416 *p++ = '-'; 4505 *p++ = '-';
4417 p += sprintf(p, "%07d", s->size); 4506 p += sprintf(p, "%07d", s->size);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1416e7e9e02d..42cd38eba79f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page)
124/** 124/**
125 * add_to_swap - allocate swap space for a page 125 * add_to_swap - allocate swap space for a page
126 * @page: page we want to move to swap 126 * @page: page we want to move to swap
127 * @gfp_mask: memory allocation flags
128 * 127 *
129 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
@@ -162,11 +161,11 @@ int add_to_swap(struct page *page)
162 return 1; 161 return 1;
163 case -EEXIST: 162 case -EEXIST:
164 /* Raced with "speculative" read_swap_cache_async */ 163 /* Raced with "speculative" read_swap_cache_async */
165 swap_free(entry); 164 swapcache_free(entry, NULL);
166 continue; 165 continue;
167 default: 166 default:
168 /* -ENOMEM radix-tree allocation failure */ 167 /* -ENOMEM radix-tree allocation failure */
169 swap_free(entry); 168 swapcache_free(entry, NULL);
170 return 0; 169 return 0;
171 } 170 }
172 } 171 }
@@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page)
188 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
189 spin_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
190 189
191 mem_cgroup_uncharge_swapcache(page, entry); 190 swapcache_free(entry, page);
192 swap_free(entry);
193 page_cache_release(page); 191 page_cache_release(page);
194} 192}
195 193
@@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
293 /* 291 /*
294 * Swap entry may have been freed since our caller observed it. 292 * Swap entry may have been freed since our caller observed it.
295 */ 293 */
296 if (!swap_duplicate(entry)) 294 err = swapcache_prepare(entry);
295 if (err == -EEXIST) /* seems racy */
296 continue;
297 if (err) /* swp entry is obsolete ? */
297 break; 298 break;
298 299
299 /* 300 /*
@@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
312 * Initiate read into locked page and return. 313 * Initiate read into locked page and return.
313 */ 314 */
314 lru_cache_add_anon(new_page); 315 lru_cache_add_anon(new_page);
315 swap_readpage(NULL, new_page); 316 swap_readpage(new_page);
316 return new_page; 317 return new_page;
317 } 318 }
318 ClearPageSwapBacked(new_page); 319 ClearPageSwapBacked(new_page);
319 __clear_page_locked(new_page); 320 __clear_page_locked(new_page);
320 swap_free(entry); 321 swapcache_free(entry, NULL);
321 } while (err != -ENOMEM); 322 } while (err != -ENOMEM);
322 323
323 if (new_page) 324 if (new_page)
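Editor's note: read_swap_cache_async() now claims its swap-cache slot with swapcache_prepare(): -EEXIST means another reader raced us and the loop simply retries, while any other error means the entry has gone away and the loop gives up. A simplified model of just that retry policy is below; the errno-style return values follow the hunk, but the helper and its behaviour are invented.

/* Model of the swapcache_prepare() retry policy in read_swap_cache_async(). */
#include <errno.h>
#include <stdio.h>

/* Stand-in for swapcache_prepare(): returns 0 or -EEXIST here. */
static int fake_swapcache_prepare(int attempt)
{
	return attempt < 2 ? -EEXIST : 0;	/* two racy attempts, then success */
}

int main(void)
{
	for (int attempt = 0; attempt < 10; attempt++) {
		int err = fake_swapcache_prepare(attempt);

		if (err == -EEXIST) {		/* raced with another reader: retry */
			printf("attempt %d: raced, retrying\n", attempt);
			continue;
		}
		if (err) {			/* entry is gone: give up */
			printf("attempt %d: entry obsolete (err=%d)\n", attempt, err);
			break;
		}
		printf("attempt %d: slot claimed, would start readahead I/O\n", attempt);
		break;
	}
	return 0;
}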
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..d1ade1a48ee7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54static DEFINE_MUTEX(swapon_mutex); 54static DEFINE_MUTEX(swapon_mutex);
55 55
56/* For reference count accounting in swap_map */
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
 82/* returns 1 if swap entry is freed */
83static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{
86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page;
89 int ret = 0;
90
91 page = find_get_page(&swapper_space, entry.val);
92 if (!page)
93 return 0;
94 /*
95 * This function is called from scan_swap_map() and it's called
96 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
97 * We have to use trylock for avoiding deadlock. This is a special
98 * case and you should use try_to_free_swap() with explicit lock_page()
99 * in usual operations.
100 */
101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page);
103 unlock_page(page);
104 }
105 page_cache_release(page);
106 return ret;
107}
108
56/* 109/*
57 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
58 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
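Editor's note: the new helpers above pack two things into each 16-bit swap_map slot: a user reference count in the low bits and a SWAP_HAS_CACHE flag recording the swap-cache reference. Below is a standalone model of encode_swapmap()/swap_count()/swap_has_cache(); the two mask values are illustrative stand-ins for the real constants, which are defined in the swap headers rather than in this hunk.

/* Toy model of the swap_map encoding introduced above. */
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE  0x8000	/* illustrative: top bit flags a swap-cache ref */
#define SWAP_COUNT_MASK 0x7fff	/* illustrative: low bits hold the user count */

static inline int swap_count(unsigned short ent)
{
	return ent & SWAP_COUNT_MASK;
}

static inline bool swap_has_cache(unsigned short ent)
{
	return !!(ent & SWAP_HAS_CACHE);
}

static inline unsigned short encode_swapmap(int count, bool has_cache)
{
	unsigned short ret = count;

	if (has_cache)
		ret |= SWAP_HAS_CACHE;
	return ret;
}

int main(void)
{
	unsigned short ent = encode_swapmap(0, true);	/* vmscan: cache-only entry */

	printf("count=%d has_cache=%d\n", swap_count(ent), swap_has_cache(ent));
	/* swap_duplicate()-style bump of the user count, keeping the cache bit */
	ent = encode_swapmap(swap_count(ent) + 1, swap_has_cache(ent));
	printf("count=%d has_cache=%d\n", swap_count(ent), swap_has_cache(ent));
	return 0;
}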
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
167#define SWAPFILE_CLUSTER 256 220#define SWAPFILE_CLUSTER 256
168#define LATENCY_LIMIT 256 221#define LATENCY_LIMIT 256
169 222
170static inline unsigned long scan_swap_map(struct swap_info_struct *si) 223static inline unsigned long scan_swap_map(struct swap_info_struct *si,
224 int cache)
171{ 225{
172 unsigned long offset; 226 unsigned long offset;
173 unsigned long scan_base; 227 unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
273 goto no_page; 327 goto no_page;
274 if (offset > si->highest_bit) 328 if (offset > si->highest_bit)
275 scan_base = offset = si->lowest_bit; 329 scan_base = offset = si->lowest_bit;
330
331 /* reuse swap entry of cache-only swap if not busy. */
332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
333 int swap_was_freed;
334 spin_unlock(&swap_lock);
335 swap_was_freed = __try_to_reclaim_swap(si, offset);
336 spin_lock(&swap_lock);
337 /* entry was freed successfully, try to use this again */
338 if (swap_was_freed)
339 goto checks;
340 goto scan; /* check next one */
341 }
342
276 if (si->swap_map[offset]) 343 if (si->swap_map[offset])
277 goto scan; 344 goto scan;
278 345
@@ -285,7 +352,10 @@ checks:
285 si->lowest_bit = si->max; 352 si->lowest_bit = si->max;
286 si->highest_bit = 0; 353 si->highest_bit = 0;
287 } 354 }
288 si->swap_map[offset] = 1; 355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
356 si->swap_map[offset] = encode_swapmap(0, true);
357 else /* at suspend */
358 si->swap_map[offset] = encode_swapmap(1, false);
289 si->cluster_next = offset + 1; 359 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING; 360 si->flags -= SWP_SCANNING;
291 361
@@ -351,6 +421,10 @@ scan:
351 spin_lock(&swap_lock); 421 spin_lock(&swap_lock);
352 goto checks; 422 goto checks;
353 } 423 }
424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
425 spin_lock(&swap_lock);
426 goto checks;
427 }
354 if (unlikely(--latency_ration < 0)) { 428 if (unlikely(--latency_ration < 0)) {
355 cond_resched(); 429 cond_resched();
356 latency_ration = LATENCY_LIMIT; 430 latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
362 spin_lock(&swap_lock); 436 spin_lock(&swap_lock);
363 goto checks; 437 goto checks;
364 } 438 }
439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
440 spin_lock(&swap_lock);
441 goto checks;
442 }
365 if (unlikely(--latency_ration < 0)) { 443 if (unlikely(--latency_ration < 0)) {
366 cond_resched(); 444 cond_resched();
367 latency_ration = LATENCY_LIMIT; 445 latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
401 continue; 479 continue;
402 480
403 swap_list.next = next; 481 swap_list.next = next;
404 offset = scan_swap_map(si); 482 /* This is called for allocating swap entry for cache */
483 offset = scan_swap_map(si, SWAP_CACHE);
405 if (offset) { 484 if (offset) {
406 spin_unlock(&swap_lock); 485 spin_unlock(&swap_lock);
407 return swp_entry(type, offset); 486 return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
415 return (swp_entry_t) {0}; 494 return (swp_entry_t) {0};
416} 495}
417 496
 497/* The only caller of this function is now the suspend routine */
418swp_entry_t get_swap_page_of_type(int type) 498swp_entry_t get_swap_page_of_type(int type)
419{ 499{
420 struct swap_info_struct *si; 500 struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
424 si = swap_info + type; 504 si = swap_info + type;
425 if (si->flags & SWP_WRITEOK) { 505 if (si->flags & SWP_WRITEOK) {
426 nr_swap_pages--; 506 nr_swap_pages--;
427 offset = scan_swap_map(si); 507 /* This is called for allocating swap entry, not cache */
508 offset = scan_swap_map(si, SWAP_MAP);
428 if (offset) { 509 if (offset) {
429 spin_unlock(&swap_lock); 510 spin_unlock(&swap_lock);
430 return swp_entry(type, offset); 511 return swp_entry(type, offset);
@@ -471,26 +552,40 @@ out:
471 return NULL; 552 return NULL;
472} 553}
473 554
474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) 555static int swap_entry_free(struct swap_info_struct *p,
556 swp_entry_t ent, int cache)
475{ 557{
476 unsigned long offset = swp_offset(ent); 558 unsigned long offset = swp_offset(ent);
477 int count = p->swap_map[offset]; 559 int count = swap_count(p->swap_map[offset]);
478 560 bool has_cache;
479 if (count < SWAP_MAP_MAX) { 561
480 count--; 562 has_cache = swap_has_cache(p->swap_map[offset]);
481 p->swap_map[offset] = count; 563
482 if (!count) { 564 if (cache == SWAP_MAP) { /* dropping usage count of swap */
483 if (offset < p->lowest_bit) 565 if (count < SWAP_MAP_MAX) {
484 p->lowest_bit = offset; 566 count--;
485 if (offset > p->highest_bit) 567 p->swap_map[offset] = encode_swapmap(count, has_cache);
486 p->highest_bit = offset;
487 if (p->prio > swap_info[swap_list.next].prio)
488 swap_list.next = p - swap_info;
489 nr_swap_pages++;
490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
492 } 568 }
569 } else { /* dropping swap cache flag */
570 VM_BUG_ON(!has_cache);
571 p->swap_map[offset] = encode_swapmap(count, false);
572
573 }
574 /* return code. */
575 count = p->swap_map[offset];
576 /* free if no reference */
577 if (!count) {
578 if (offset < p->lowest_bit)
579 p->lowest_bit = offset;
580 if (offset > p->highest_bit)
581 p->highest_bit = offset;
582 if (p->prio > swap_info[swap_list.next].prio)
583 swap_list.next = p - swap_info;
584 nr_swap_pages++;
585 p->inuse_pages--;
493 } 586 }
587 if (!swap_count(count))
588 mem_cgroup_uncharge_swap(ent);
494 return count; 589 return count;
495} 590}
496 591
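Editor's note: swap_entry_free() now drops one of two kinds of references: SWAP_MAP decrements the user count, SWAP_CACHE clears the cache flag, and the slot only returns to the free pool once both are gone. A compact model of that rule follows, reusing the illustrative encoding from the previous sketch.

/* Model of swap_entry_free(): an entry is free only when the user count is
 * zero and the swap-cache flag is clear. Encoding constants are illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE  0x8000
#define SWAP_COUNT_MASK 0x7fff

enum { SWAP_MAP, SWAP_CACHE };	/* which reference we are dropping */

static unsigned short drop_ref(unsigned short ent, int which)
{
	int count = ent & SWAP_COUNT_MASK;
	bool has_cache = ent & SWAP_HAS_CACHE;

	if (which == SWAP_MAP) {
		if (count > 0)
			count--;
	} else {
		has_cache = false;
	}
	return (has_cache ? SWAP_HAS_CACHE : 0) | count;
}

int main(void)
{
	unsigned short ent = SWAP_HAS_CACHE | 1;	/* one user + swap cache */

	ent = drop_ref(ent, SWAP_MAP);		/* swap_free() from the last pte */
	printf("after swap_free: 0x%04x (still held by swap cache)\n", ent);
	ent = drop_ref(ent, SWAP_CACHE);	/* swapcache_free() on cache removal */
	printf("after swapcache_free: 0x%04x (%s)\n", ent,
	       ent == 0 ? "slot is free" : "still in use");
	return 0;
}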
@@ -504,9 +599,33 @@ void swap_free(swp_entry_t entry)
504 599
505 p = swap_info_get(entry); 600 p = swap_info_get(entry);
506 if (p) { 601 if (p) {
507 swap_entry_free(p, entry); 602 swap_entry_free(p, entry, SWAP_MAP);
603 spin_unlock(&swap_lock);
604 }
605}
606
607/*
608 * Called after dropping swapcache to decrease refcnt to swap entries.
609 */
610void swapcache_free(swp_entry_t entry, struct page *page)
611{
612 struct swap_info_struct *p;
613 int ret;
614
615 p = swap_info_get(entry);
616 if (p) {
617 ret = swap_entry_free(p, entry, SWAP_CACHE);
618 if (page) {
619 bool swapout;
620 if (ret)
621 swapout = true; /* the end of swap out */
622 else
623 swapout = false; /* no more swap users! */
624 mem_cgroup_uncharge_swapcache(page, entry, swapout);
625 }
508 spin_unlock(&swap_lock); 626 spin_unlock(&swap_lock);
509 } 627 }
628 return;
510} 629}
511 630
512/* 631/*
@@ -521,8 +640,7 @@ static inline int page_swapcount(struct page *page)
521 entry.val = page_private(page); 640 entry.val = page_private(page);
522 p = swap_info_get(entry); 641 p = swap_info_get(entry);
523 if (p) { 642 if (p) {
524 /* Subtract the 1 for the swap cache itself */ 643 count = swap_count(p->swap_map[swp_offset(entry)]);
525 count = p->swap_map[swp_offset(entry)] - 1;
526 spin_unlock(&swap_lock); 644 spin_unlock(&swap_lock);
527 } 645 }
528 return count; 646 return count;
@@ -584,7 +702,7 @@ int free_swap_and_cache(swp_entry_t entry)
584 702
585 p = swap_info_get(entry); 703 p = swap_info_get(entry);
586 if (p) { 704 if (p) {
587 if (swap_entry_free(p, entry) == 1) { 705 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
588 page = find_get_page(&swapper_space, entry.val); 706 page = find_get_page(&swapper_space, entry.val);
589 if (page && !trylock_page(page)) { 707 if (page && !trylock_page(page)) {
590 page_cache_release(page); 708 page_cache_release(page);
@@ -891,7 +1009,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
891 i = 1; 1009 i = 1;
892 } 1010 }
893 count = si->swap_map[i]; 1011 count = si->swap_map[i];
894 if (count && count != SWAP_MAP_BAD) 1012 if (count && swap_count(count) != SWAP_MAP_BAD)
895 break; 1013 break;
896 } 1014 }
897 return i; 1015 return i;
@@ -995,13 +1113,13 @@ static int try_to_unuse(unsigned int type)
995 */ 1113 */
996 shmem = 0; 1114 shmem = 0;
997 swcount = *swap_map; 1115 swcount = *swap_map;
998 if (swcount > 1) { 1116 if (swap_count(swcount)) {
999 if (start_mm == &init_mm) 1117 if (start_mm == &init_mm)
1000 shmem = shmem_unuse(entry, page); 1118 shmem = shmem_unuse(entry, page);
1001 else 1119 else
1002 retval = unuse_mm(start_mm, entry, page); 1120 retval = unuse_mm(start_mm, entry, page);
1003 } 1121 }
1004 if (*swap_map > 1) { 1122 if (swap_count(*swap_map)) {
1005 int set_start_mm = (*swap_map >= swcount); 1123 int set_start_mm = (*swap_map >= swcount);
1006 struct list_head *p = &start_mm->mmlist; 1124 struct list_head *p = &start_mm->mmlist;
1007 struct mm_struct *new_start_mm = start_mm; 1125 struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1129,7 @@ static int try_to_unuse(unsigned int type)
1011 atomic_inc(&new_start_mm->mm_users); 1129 atomic_inc(&new_start_mm->mm_users);
1012 atomic_inc(&prev_mm->mm_users); 1130 atomic_inc(&prev_mm->mm_users);
1013 spin_lock(&mmlist_lock); 1131 spin_lock(&mmlist_lock);
1014 while (*swap_map > 1 && !retval && !shmem && 1132 while (swap_count(*swap_map) && !retval && !shmem &&
1015 (p = p->next) != &start_mm->mmlist) { 1133 (p = p->next) != &start_mm->mmlist) {
1016 mm = list_entry(p, struct mm_struct, mmlist); 1134 mm = list_entry(p, struct mm_struct, mmlist);
1017 if (!atomic_inc_not_zero(&mm->mm_users)) 1135 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1141,16 @@ static int try_to_unuse(unsigned int type)
1023 cond_resched(); 1141 cond_resched();
1024 1142
1025 swcount = *swap_map; 1143 swcount = *swap_map;
1026 if (swcount <= 1) 1144 if (!swap_count(swcount)) /* any usage ? */
1027 ; 1145 ;
1028 else if (mm == &init_mm) { 1146 else if (mm == &init_mm) {
1029 set_start_mm = 1; 1147 set_start_mm = 1;
1030 shmem = shmem_unuse(entry, page); 1148 shmem = shmem_unuse(entry, page);
1031 } else 1149 } else
1032 retval = unuse_mm(mm, entry, page); 1150 retval = unuse_mm(mm, entry, page);
1033 if (set_start_mm && *swap_map < swcount) { 1151
1152 if (set_start_mm &&
1153 swap_count(*swap_map) < swcount) {
1034 mmput(new_start_mm); 1154 mmput(new_start_mm);
1035 atomic_inc(&mm->mm_users); 1155 atomic_inc(&mm->mm_users);
1036 new_start_mm = mm; 1156 new_start_mm = mm;
@@ -1057,21 +1177,25 @@ static int try_to_unuse(unsigned int type)
1057 } 1177 }
1058 1178
1059 /* 1179 /*
1060 * How could swap count reach 0x7fff when the maximum 1180 * How could swap count reach 0x7ffe ?
1061 * pid is 0x7fff, and there's no way to repeat a swap 1181 * There's no way to repeat a swap page within an mm
1062 * page within an mm (except in shmem, where it's the 1182 * (except in shmem, where it's the shared object which takes
1063 * shared object which takes the reference count)? 1183 * the reference count)?
 1064 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 1184 * We believe SWAP_MAP_MAX cannot occur (if it did, an unsigned
 1065 * 1185 * short would be too small).
1066 * If that's wrong, then we should worry more about 1186 * If that's wrong, then we should worry more about
1067 * exit_mmap() and do_munmap() cases described above: 1187 * exit_mmap() and do_munmap() cases described above:
1068 * we might be resetting SWAP_MAP_MAX too early here. 1188 * we might be resetting SWAP_MAP_MAX too early here.
1069 * We know "Undead"s can happen, they're okay, so don't 1189 * We know "Undead"s can happen, they're okay, so don't
1070 * report them; but do report if we reset SWAP_MAP_MAX. 1190 * report them; but do report if we reset SWAP_MAP_MAX.
1071 */ 1191 */
1072 if (*swap_map == SWAP_MAP_MAX) { 1192 /* We might release the lock_page() in unuse_mm(). */
1193 if (!PageSwapCache(page) || page_private(page) != entry.val)
1194 goto retry;
1195
1196 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1073 spin_lock(&swap_lock); 1197 spin_lock(&swap_lock);
1074 *swap_map = 1; 1198 *swap_map = encode_swapmap(0, true);
1075 spin_unlock(&swap_lock); 1199 spin_unlock(&swap_lock);
1076 reset_overflow = 1; 1200 reset_overflow = 1;
1077 } 1201 }
@@ -1089,7 +1213,8 @@ static int try_to_unuse(unsigned int type)
1089 * pages would be incorrect if swap supported "shared 1213 * pages would be incorrect if swap supported "shared
1090 * private" pages, but they are handled by tmpfs files. 1214 * private" pages, but they are handled by tmpfs files.
1091 */ 1215 */
1092 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 1216 if (swap_count(*swap_map) &&
1217 PageDirty(page) && PageSwapCache(page)) {
1093 struct writeback_control wbc = { 1218 struct writeback_control wbc = {
1094 .sync_mode = WB_SYNC_NONE, 1219 .sync_mode = WB_SYNC_NONE,
1095 }; 1220 };
@@ -1116,6 +1241,7 @@ static int try_to_unuse(unsigned int type)
1116 * mark page dirty so shrink_page_list will preserve it. 1241 * mark page dirty so shrink_page_list will preserve it.
1117 */ 1242 */
1118 SetPageDirty(page); 1243 SetPageDirty(page);
1244retry:
1119 unlock_page(page); 1245 unlock_page(page);
1120 page_cache_release(page); 1246 page_cache_release(page);
1121 1247
@@ -1942,15 +2068,23 @@ void si_swapinfo(struct sysinfo *val)
1942 * 2068 *
1943 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2069 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1944 * "permanent", but will be reclaimed by the next swapoff. 2070 * "permanent", but will be reclaimed by the next swapoff.
 2071 * Returns an error code in the following cases:
2072 * - success -> 0
2073 * - swp_entry is invalid -> EINVAL
2074 * - swp_entry is migration entry -> EINVAL
2075 * - swap-cache reference is requested but there is already one. -> EEXIST
2076 * - swap-cache reference is requested but the entry is not used. -> ENOENT
1945 */ 2077 */
1946int swap_duplicate(swp_entry_t entry) 2078static int __swap_duplicate(swp_entry_t entry, bool cache)
1947{ 2079{
1948 struct swap_info_struct * p; 2080 struct swap_info_struct * p;
1949 unsigned long offset, type; 2081 unsigned long offset, type;
1950 int result = 0; 2082 int result = -EINVAL;
2083 int count;
2084 bool has_cache;
1951 2085
1952 if (is_migration_entry(entry)) 2086 if (is_migration_entry(entry))
1953 return 1; 2087 return -EINVAL;
1954 2088
1955 type = swp_type(entry); 2089 type = swp_type(entry);
1956 if (type >= nr_swapfiles) 2090 if (type >= nr_swapfiles)
@@ -1959,17 +2093,40 @@ int swap_duplicate(swp_entry_t entry)
1959 offset = swp_offset(entry); 2093 offset = swp_offset(entry);
1960 2094
1961 spin_lock(&swap_lock); 2095 spin_lock(&swap_lock);
1962 if (offset < p->max && p->swap_map[offset]) { 2096
1963 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 2097 if (unlikely(offset >= p->max))
1964 p->swap_map[offset]++; 2098 goto unlock_out;
1965 result = 1; 2099
1966 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 2100 count = swap_count(p->swap_map[offset]);
2101 has_cache = swap_has_cache(p->swap_map[offset]);
2102
2103 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
2104
2105 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2106 if (!has_cache && count) {
2107 p->swap_map[offset] = encode_swapmap(count, true);
2108 result = 0;
2109 } else if (has_cache) /* someone added cache */
2110 result = -EEXIST;
2111 else if (!count) /* no users */
2112 result = -ENOENT;
2113
2114 } else if (count || has_cache) {
2115 if (count < SWAP_MAP_MAX - 1) {
2116 p->swap_map[offset] = encode_swapmap(count + 1,
2117 has_cache);
2118 result = 0;
2119 } else if (count <= SWAP_MAP_MAX) {
1967 if (swap_overflow++ < 5) 2120 if (swap_overflow++ < 5)
1968 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 2121 printk(KERN_WARNING
1969 p->swap_map[offset] = SWAP_MAP_MAX; 2122 "swap_dup: swap entry overflow\n");
1970 result = 1; 2123 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
2124 has_cache);
2125 result = 0;
1971 } 2126 }
1972 } 2127 } else
2128 result = -ENOENT; /* unused swap entry */
2129unlock_out:
1973 spin_unlock(&swap_lock); 2130 spin_unlock(&swap_lock);
1974out: 2131out:
1975 return result; 2132 return result;
@@ -1978,6 +2135,27 @@ bad_file:
1978 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2135 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1979 goto out; 2136 goto out;
1980} 2137}
2138/*
2139 * increase reference count of swap entry by 1.
2140 */
2141void swap_duplicate(swp_entry_t entry)
2142{
2143 __swap_duplicate(entry, SWAP_MAP);
2144}
2145
2146/*
2147 * @entry: swap entry for which we allocate swap cache.
2148 *
 2149 * Called when allocating swap cache for an existing swap entry.
 2150 * This can return error codes; it returns 0 on success.
 2151 * -EEXIST means there is already a swap cache for this entry.
2152 * Note: return code is different from swap_duplicate().
2153 */
2154int swapcache_prepare(swp_entry_t entry)
2155{
2156 return __swap_duplicate(entry, SWAP_CACHE);
2157}
2158
1981 2159
1982struct swap_info_struct * 2160struct swap_info_struct *
1983get_swap_info_struct(unsigned type) 2161get_swap_info_struct(unsigned type)
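Editor's note: __swap_duplicate() is the one place that takes either kind of reference; swap_duplicate() bumps the user count and swapcache_prepare() sets SWAP_HAS_CACHE, reporting -EEXIST or -ENOENT as documented above. A small model of that decision table follows (same illustrative encoding as before; the SWAP_MAP_MAX overflow handling is left out).

/* Decision table of __swap_duplicate() for the common cases. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE  0x8000
#define SWAP_COUNT_MASK 0x7fff

enum { SWAP_MAP, SWAP_CACHE };

static int dup(unsigned short *ent, int cache)
{
	int count = *ent & SWAP_COUNT_MASK;
	bool has_cache = *ent & SWAP_HAS_CACHE;

	if (cache == SWAP_CACHE) {		/* swapcache_prepare() */
		if (!has_cache && count)
			*ent = SWAP_HAS_CACHE | count;
		else if (has_cache)
			return -EEXIST;		/* someone already added the cache */
		else
			return -ENOENT;		/* nobody uses this entry */
	} else if (count || has_cache) {	/* swap_duplicate() */
		*ent = (has_cache ? SWAP_HAS_CACHE : 0) | (count + 1);
	} else {
		return -ENOENT;			/* unused swap entry */
	}
	return 0;
}

int main(void)
{
	unsigned short ent = 1;			/* one user, no cache */
	int r;

	printf("swapcache_prepare: %d\n", dup(&ent, SWAP_CACHE));	/* 0 */
	printf("swapcache_prepare again: %d\n", dup(&ent, SWAP_CACHE));	/* -EEXIST */
	r = dup(&ent, SWAP_MAP);
	printf("swap_duplicate: %d (count now %d)\n", r, ent & SWAP_COUNT_MASK);
	return 0;
}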
@@ -2016,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2016 /* Don't read in free or bad pages */ 2194 /* Don't read in free or bad pages */
2017 if (!si->swap_map[toff]) 2195 if (!si->swap_map[toff])
2018 break; 2196 break;
2019 if (si->swap_map[toff] == SWAP_MAP_BAD) 2197 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2020 break; 2198 break;
2021 } 2199 }
2022 /* Count contiguous allocated slots below our target */ 2200 /* Count contiguous allocated slots below our target */
@@ -2024,7 +2202,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2024 /* Don't read in free or bad pages */ 2202 /* Don't read in free or bad pages */
2025 if (!si->swap_map[toff]) 2203 if (!si->swap_map[toff])
2026 break; 2204 break;
2027 if (si->swap_map[toff] == SWAP_MAP_BAD) 2205 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2028 break; 2206 break;
2029 } 2207 }
2030 spin_unlock(&swap_lock); 2208 spin_unlock(&swap_lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index 12e1579f9165..ccc3ecf7cb98 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
267} 267}
268EXPORT_SYMBOL(truncate_inode_pages); 268EXPORT_SYMBOL(truncate_inode_pages);
269 269
270unsigned long __invalidate_mapping_pages(struct address_space *mapping, 270/**
271 pgoff_t start, pgoff_t end, bool be_atomic) 271 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
272 * @mapping: the address_space which holds the pages to invalidate
273 * @start: the offset 'from' which to invalidate
274 * @end: the offset 'to' which to invalidate (inclusive)
275 *
276 * This function only removes the unlocked pages, if you want to
277 * remove all the pages of one inode, you must call truncate_inode_pages.
278 *
279 * invalidate_mapping_pages() will not block on IO activity. It will not
280 * invalidate pages which are dirty, locked, under writeback or mapped into
281 * pagetables.
282 */
283unsigned long invalidate_mapping_pages(struct address_space *mapping,
284 pgoff_t start, pgoff_t end)
272{ 285{
273 struct pagevec pvec; 286 struct pagevec pvec;
274 pgoff_t next = start; 287 pgoff_t next = start;
@@ -309,30 +322,10 @@ unlock:
309 break; 322 break;
310 } 323 }
311 pagevec_release(&pvec); 324 pagevec_release(&pvec);
312 if (likely(!be_atomic)) 325 cond_resched();
313 cond_resched();
314 } 326 }
315 return ret; 327 return ret;
316} 328}
317
318/**
319 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
320 * @mapping: the address_space which holds the pages to invalidate
321 * @start: the offset 'from' which to invalidate
322 * @end: the offset 'to' which to invalidate (inclusive)
323 *
324 * This function only removes the unlocked pages, if you want to
325 * remove all the pages of one inode, you must call truncate_inode_pages.
326 *
327 * invalidate_mapping_pages() will not block on IO activity. It will not
328 * invalidate pages which are dirty, locked, under writeback or mapped into
329 * pagetables.
330 */
331unsigned long invalidate_mapping_pages(struct address_space *mapping,
332 pgoff_t start, pgoff_t end)
333{
334 return __invalidate_mapping_pages(mapping, start, end, false);
335}
336EXPORT_SYMBOL(invalidate_mapping_pages); 329EXPORT_SYMBOL(invalidate_mapping_pages);
337 330
338/* 331/*
diff --git a/mm/util.c b/mm/util.c
index abc65aa7cdfc..7c35ad95f927 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -168,6 +168,10 @@ EXPORT_SYMBOL(krealloc);
168 * 168 *
169 * The memory of the object @p points to is zeroed before freed. 169 * The memory of the object @p points to is zeroed before freed.
170 * If @p is %NULL, kzfree() does nothing. 170 * If @p is %NULL, kzfree() does nothing.
171 *
172 * Note: this function zeroes the whole allocated buffer which can be a good
173 * deal bigger than the requested buffer size passed to kmalloc(). So be
174 * careful when using this function in performance sensitive code.
171 */ 175 */
172void kzfree(const void *p) 176void kzfree(const void *p)
173{ 177{
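Editor's note: the new kzfree() note warns that the zeroing covers the whole allocation, which the allocator may have rounded up well beyond the requested size. A userspace analogue using glibc's malloc_usable_size() shows the same effect; the printed sizes depend on the allocator, so treat this as illustrative only.

/* Illustrates why "zero the whole allocation" can exceed the requested size.
 * malloc_usable_size() is glibc-specific; other allocators differ. */
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t requested = 5;
	char *p = malloc(requested);

	if (!p)
		return 1;
	printf("requested %zu bytes, usable size %zu bytes\n",
	       requested, malloc_usable_size(p));

	/* A kzfree()-style helper would memset the usable size, not 'requested'. */
	memset(p, 0, malloc_usable_size(p));
	free(p);
	return 0;
}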
@@ -233,13 +237,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
233 * @pages: array that receives pointers to the pages pinned. 237 * @pages: array that receives pointers to the pages pinned.
234 * Should be at least nr_pages long. 238 * Should be at least nr_pages long.
235 * 239 *
236 * Attempt to pin user pages in memory without taking mm->mmap_sem.
237 * If not successful, it will fall back to taking the lock and
238 * calling get_user_pages().
239 *
240 * Returns number of pages pinned. This may be fewer than the number 240 * Returns number of pages pinned. This may be fewer than the number
241 * requested. If nr_pages is 0 or negative, returns 0. If no pages 241 * requested. If nr_pages is 0 or negative, returns 0. If no pages
242 * were pinned, returns -errno. 242 * were pinned, returns -errno.
243 *
244 * get_user_pages_fast provides equivalent functionality to get_user_pages,
245 * operating on current and current->mm, with force=0 and vma=NULL. However
246 * unlike get_user_pages, it must be called without mmap_sem held.
247 *
248 * get_user_pages_fast may take mmap_sem and page table locks, so no
249 * assumptions can be made about lack of locking. get_user_pages_fast is to be
250 * implemented in a way that is advantageous (vs get_user_pages()) when the
251 * user memory area is already faulted in and present in ptes. However if the
252 * pages have to be faulted in, it may turn out to be slightly slower so
253 * callers need to carefully consider what to use. On many architectures,
254 * get_user_pages_fast simply falls back to get_user_pages.
243 */ 255 */
244int __attribute__((weak)) get_user_pages_fast(unsigned long start, 256int __attribute__((weak)) get_user_pages_fast(unsigned long start,
245 int nr_pages, int write, struct page **pages) 257 int nr_pages, int write, struct page **pages)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95c08a8cc2ba..e8fa2d9eb212 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
470 swp_entry_t swap = { .val = page_private(page) }; 470 swp_entry_t swap = { .val = page_private(page) };
471 __delete_from_swap_cache(page); 471 __delete_from_swap_cache(page);
472 spin_unlock_irq(&mapping->tree_lock); 472 spin_unlock_irq(&mapping->tree_lock);
473 mem_cgroup_uncharge_swapcache(page, swap); 473 swapcache_free(swap, page);
474 swap_free(swap);
475 } else { 474 } else {
476 __remove_from_page_cache(page); 475 __remove_from_page_cache(page);
477 spin_unlock_irq(&mapping->tree_lock); 476 spin_unlock_irq(&mapping->tree_lock);
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
514 * 513 *
515 * lru_lock must not be held, interrupts must be enabled. 514 * lru_lock must not be held, interrupts must be enabled.
516 */ 515 */
517#ifdef CONFIG_UNEVICTABLE_LRU
518void putback_lru_page(struct page *page) 516void putback_lru_page(struct page *page)
519{ 517{
520 int lru; 518 int lru;
@@ -568,20 +566,6 @@ redo:
568 put_page(page); /* drop ref from isolate */ 566 put_page(page); /* drop ref from isolate */
569} 567}
570 568
571#else /* CONFIG_UNEVICTABLE_LRU */
572
573void putback_lru_page(struct page *page)
574{
575 int lru;
576 VM_BUG_ON(PageLRU(page));
577
578 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
579 lru_cache_add_lru(page, lru);
580 put_page(page);
581}
582#endif /* CONFIG_UNEVICTABLE_LRU */
583
584
585/* 569/*
586 * shrink_page_list() returns the number of reclaimed pages 570 * shrink_page_list() returns the number of reclaimed pages
587 */ 571 */
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
593 struct pagevec freed_pvec; 577 struct pagevec freed_pvec;
594 int pgactivate = 0; 578 int pgactivate = 0;
595 unsigned long nr_reclaimed = 0; 579 unsigned long nr_reclaimed = 0;
580 unsigned long vm_flags;
596 581
597 cond_resched(); 582 cond_resched();
598 583
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
643 goto keep_locked; 628 goto keep_locked;
644 } 629 }
645 630
646 referenced = page_referenced(page, 1, sc->mem_cgroup); 631 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags);
647 /* In active use or really unfreeable? Activate it. */ 633 /* In active use or really unfreeable? Activate it. */
648 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
649 referenced && page_mapping_inuse(page)) 635 referenced && page_mapping_inuse(page))
@@ -851,7 +837,6 @@ int __isolate_lru_page(struct page *page, int mode, int file)
851 */ 837 */
852 ClearPageLRU(page); 838 ClearPageLRU(page);
853 ret = 0; 839 ret = 0;
854 mem_cgroup_del_lru(page);
855 } 840 }
856 841
857 return ret; 842 return ret;
@@ -899,12 +884,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
899 switch (__isolate_lru_page(page, mode, file)) { 884 switch (__isolate_lru_page(page, mode, file)) {
900 case 0: 885 case 0:
901 list_move(&page->lru, dst); 886 list_move(&page->lru, dst);
887 mem_cgroup_del_lru(page);
902 nr_taken++; 888 nr_taken++;
903 break; 889 break;
904 890
905 case -EBUSY: 891 case -EBUSY:
906 /* else it is being freed elsewhere */ 892 /* else it is being freed elsewhere */
907 list_move(&page->lru, src); 893 list_move(&page->lru, src);
894 mem_cgroup_rotate_lru_list(page, page_lru(page));
908 continue; 895 continue;
909 896
910 default: 897 default:
@@ -943,18 +930,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
943 /* Check that we have not crossed a zone boundary. */ 930 /* Check that we have not crossed a zone boundary. */
944 if (unlikely(page_zone_id(cursor_page) != zone_id)) 931 if (unlikely(page_zone_id(cursor_page) != zone_id))
945 continue; 932 continue;
946 switch (__isolate_lru_page(cursor_page, mode, file)) { 933 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
947 case 0:
948 list_move(&cursor_page->lru, dst); 934 list_move(&cursor_page->lru, dst);
935 mem_cgroup_del_lru(page);
949 nr_taken++; 936 nr_taken++;
950 scan++; 937 scan++;
951 break;
952
953 case -EBUSY:
954 /* else it is being freed elsewhere */
955 list_move(&cursor_page->lru, src);
956 default:
957 break; /* ! on LRU or wrong list */
958 } 938 }
959 } 939 }
960 } 940 }
@@ -1061,6 +1041,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1061 unsigned long nr_scanned = 0; 1041 unsigned long nr_scanned = 0;
1062 unsigned long nr_reclaimed = 0; 1042 unsigned long nr_reclaimed = 0;
1063 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1043 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1044 int lumpy_reclaim = 0;
1045
1046 /*
1047 * If we need a large contiguous chunk of memory, or have
1048 * trouble getting a small set of contiguous pages, we
1049 * will reclaim both active and inactive pages.
1050 *
1051 * We use the same threshold as pageout congestion_wait below.
1052 */
1053 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1054 lumpy_reclaim = 1;
1055 else if (sc->order && priority < DEF_PRIORITY - 2)
1056 lumpy_reclaim = 1;
1064 1057
1065 pagevec_init(&pvec, 1); 1058 pagevec_init(&pvec, 1);
1066 1059
@@ -1073,19 +1066,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1073 unsigned long nr_freed; 1066 unsigned long nr_freed;
1074 unsigned long nr_active; 1067 unsigned long nr_active;
1075 unsigned int count[NR_LRU_LISTS] = { 0, }; 1068 unsigned int count[NR_LRU_LISTS] = { 0, };
1076 int mode = ISOLATE_INACTIVE; 1069 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1077
1078 /*
1079 * If we need a large contiguous chunk of memory, or have
1080 * trouble getting a small set of contiguous pages, we
1081 * will reclaim both active and inactive pages.
1082 *
1083 * We use the same threshold as pageout congestion_wait below.
1084 */
1085 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1086 mode = ISOLATE_BOTH;
1087 else if (sc->order && priority < DEF_PRIORITY - 2)
1088 mode = ISOLATE_BOTH;
1089 1070
1090 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1071 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1091 &page_list, &nr_scan, sc->order, mode, 1072 &page_list, &nr_scan, sc->order, mode,
@@ -1122,7 +1103,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1122 * but that should be acceptable to the caller 1103 * but that should be acceptable to the caller
1123 */ 1104 */
1124 if (nr_freed < nr_taken && !current_is_kswapd() && 1105 if (nr_freed < nr_taken && !current_is_kswapd() &&
1125 sc->order > PAGE_ALLOC_COSTLY_ORDER) { 1106 lumpy_reclaim) {
1126 congestion_wait(WRITE, HZ/10); 1107 congestion_wait(WRITE, HZ/10);
1127 1108
1128 /* 1109 /*
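Editor's note: the hunk above hoists the lumpy-reclaim decision out of the scan loop. It is computed once per shrink_inactive_list() call from the allocation order and the current priority, then reused for both the isolation mode and the congestion_wait() fallback. A tiny sketch of that predicate follows, with the constants filled in from kernels of this vintage.

/* Sketch of the lumpy-reclaim predicate factored out in the hunk above. */
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3	/* kernel value of this era */
#define DEF_PRIORITY 12			/* kernel value of this era */

static int lumpy_reclaim(int order, int priority)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return 1;		/* large contiguous request */
	if (order && priority < DEF_PRIORITY - 2)
		return 1;		/* small order, but reclaim is struggling */
	return 0;
}

int main(void)
{
	printf("order=0, priority=12 -> %d\n", lumpy_reclaim(0, 12));	/* 0 */
	printf("order=4, priority=12 -> %d\n", lumpy_reclaim(4, 12));	/* 1 */
	printf("order=2, priority=5  -> %d\n", lumpy_reclaim(2, 5));	/* 1 */
	return 0;
}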
@@ -1217,18 +1198,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1217 * But we had to alter page->flags anyway. 1198 * But we had to alter page->flags anyway.
1218 */ 1199 */
1219 1200
1201static void move_active_pages_to_lru(struct zone *zone,
1202 struct list_head *list,
1203 enum lru_list lru)
1204{
1205 unsigned long pgmoved = 0;
1206 struct pagevec pvec;
1207 struct page *page;
1208
1209 pagevec_init(&pvec, 1);
1210
1211 while (!list_empty(list)) {
1212 page = lru_to_page(list);
1213 prefetchw_prev_lru_page(page, list, flags);
1214
1215 VM_BUG_ON(PageLRU(page));
1216 SetPageLRU(page);
1217
1218 VM_BUG_ON(!PageActive(page));
1219 if (!is_active_lru(lru))
1220 ClearPageActive(page); /* we are de-activating */
1221
1222 list_move(&page->lru, &zone->lru[lru].list);
1223 mem_cgroup_add_lru_list(page, lru);
1224 pgmoved++;
1225
1226 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1227 spin_unlock_irq(&zone->lru_lock);
1228 if (buffer_heads_over_limit)
1229 pagevec_strip(&pvec);
1230 __pagevec_release(&pvec);
1231 spin_lock_irq(&zone->lru_lock);
1232 }
1233 }
1234 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1235 if (!is_active_lru(lru))
1236 __count_vm_events(PGDEACTIVATE, pgmoved);
1237}
1220 1238
1221static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1239static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1222 struct scan_control *sc, int priority, int file) 1240 struct scan_control *sc, int priority, int file)
1223{ 1241{
1224 unsigned long pgmoved; 1242 unsigned long pgmoved;
1225 int pgdeactivate = 0;
1226 unsigned long pgscanned; 1243 unsigned long pgscanned;
1244 unsigned long vm_flags;
1227 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1245 LIST_HEAD(l_hold); /* The pages which were snipped off */
1246 LIST_HEAD(l_active);
1228 LIST_HEAD(l_inactive); 1247 LIST_HEAD(l_inactive);
1229 struct page *page; 1248 struct page *page;
1230 struct pagevec pvec;
1231 enum lru_list lru;
1232 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1249 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1233 1250
1234 lru_add_drain(); 1251 lru_add_drain();
@@ -1245,13 +1262,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1245 } 1262 }
1246 reclaim_stat->recent_scanned[!!file] += pgmoved; 1263 reclaim_stat->recent_scanned[!!file] += pgmoved;
1247 1264
1265 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1248 if (file) 1266 if (file)
1249 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1267 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1250 else 1268 else
1251 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1269 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1252 spin_unlock_irq(&zone->lru_lock); 1270 spin_unlock_irq(&zone->lru_lock);
1253 1271
1254 pgmoved = 0; 1272 pgmoved = 0; /* count referenced (mapping) mapped pages */
1255 while (!list_empty(&l_hold)) { 1273 while (!list_empty(&l_hold)) {
1256 cond_resched(); 1274 cond_resched();
1257 page = lru_to_page(&l_hold); 1275 page = lru_to_page(&l_hold);
@@ -1264,58 +1282,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1264 1282
1265 /* page_referenced clears PageReferenced */ 1283 /* page_referenced clears PageReferenced */
1266 if (page_mapping_inuse(page) && 1284 if (page_mapping_inuse(page) &&
1267 page_referenced(page, 0, sc->mem_cgroup)) 1285 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1268 pgmoved++; 1286 pgmoved++;
1287 /*
1288 * Identify referenced, file-backed active pages and
1289 * give them one more trip around the active list. So
1290 * that executable code get better chances to stay in
1291 * memory under moderate memory pressure. Anon pages
1292 * are not likely to be evicted by use-once streaming
1293 * IO, plus JVM can create lots of anon VM_EXEC pages,
1294 * so we ignore them here.
1295 */
1296 if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
1297 list_add(&page->lru, &l_active);
1298 continue;
1299 }
1300 }
1269 1301
1270 list_add(&page->lru, &l_inactive); 1302 list_add(&page->lru, &l_inactive);
1271 } 1303 }
1272 1304
1273 /* 1305 /*
1274 * Move the pages to the [file or anon] inactive list. 1306 * Move pages back to the lru list.
1275 */ 1307 */
1276 pagevec_init(&pvec, 1);
1277 lru = LRU_BASE + file * LRU_FILE;
1278
1279 spin_lock_irq(&zone->lru_lock); 1308 spin_lock_irq(&zone->lru_lock);
1280 /* 1309 /*
1281 * Count referenced pages from currently used mappings as 1310 * Count referenced pages from currently used mappings as rotated,
1282 * rotated, even though they are moved to the inactive list. 1311 * even though only some of them are actually re-activated. This
1283 * This helps balance scan pressure between file and anonymous 1312 * helps balance scan pressure between file and anonymous pages in
1284 * pages in get_scan_ratio. 1313 * get_scan_ratio.
1285 */ 1314 */
1286 reclaim_stat->recent_rotated[!!file] += pgmoved; 1315 reclaim_stat->recent_rotated[!!file] += pgmoved;
1287 1316
1288 pgmoved = 0; 1317 move_active_pages_to_lru(zone, &l_active,
1289 while (!list_empty(&l_inactive)) { 1318 LRU_ACTIVE + file * LRU_FILE);
1290 page = lru_to_page(&l_inactive); 1319 move_active_pages_to_lru(zone, &l_inactive,
1291 prefetchw_prev_lru_page(page, &l_inactive, flags); 1320 LRU_BASE + file * LRU_FILE);
1292 VM_BUG_ON(PageLRU(page));
1293 SetPageLRU(page);
1294 VM_BUG_ON(!PageActive(page));
1295 ClearPageActive(page);
1296 1321
1297 list_move(&page->lru, &zone->lru[lru].list);
1298 mem_cgroup_add_lru_list(page, lru);
1299 pgmoved++;
1300 if (!pagevec_add(&pvec, page)) {
1301 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1302 spin_unlock_irq(&zone->lru_lock);
1303 pgdeactivate += pgmoved;
1304 pgmoved = 0;
1305 if (buffer_heads_over_limit)
1306 pagevec_strip(&pvec);
1307 __pagevec_release(&pvec);
1308 spin_lock_irq(&zone->lru_lock);
1309 }
1310 }
1311 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1312 pgdeactivate += pgmoved;
1313 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1314 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1315 spin_unlock_irq(&zone->lru_lock); 1322 spin_unlock_irq(&zone->lru_lock);
1316 if (buffer_heads_over_limit)
1317 pagevec_strip(&pvec);
1318 pagevec_release(&pvec);
1319} 1323}
1320 1324
1321static int inactive_anon_is_low_global(struct zone *zone) 1325static int inactive_anon_is_low_global(struct zone *zone)
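Editor's note: the new branch in shrink_active_list() gives referenced, executable file-backed pages one more pass around the active list, while referenced anonymous pages are still deactivated as before. Below is a sketch of just that routing decision; the page structure and flag value are invented for the example.

/* Routing decision added to shrink_active_list() for referenced pages. */
#include <stdbool.h>
#include <stdio.h>

#define VM_EXEC 0x1	/* illustrative flag bit, not the kernel's value */

struct fake_page {
	bool referenced;	/* page_referenced() said a mapping used it */
	bool anon;		/* PageAnon() */
	unsigned long vm_flags;	/* flags of the mapping that referenced it */
};

static const char *destination(const struct fake_page *p)
{
	if (p->referenced && (p->vm_flags & VM_EXEC) && !p->anon)
		return "active list (executable file page, keep it hot)";
	return "inactive list";
}

int main(void)
{
	struct fake_page libc_text = { true, false, VM_EXEC };
	struct fake_page heap_page = { true, true, VM_EXEC };	/* JVM-style anon exec */
	struct fake_page cold_file = { false, false, 0 };

	printf("libc text -> %s\n", destination(&libc_text));
	printf("anon exec -> %s\n", destination(&heap_page));
	printf("cold file -> %s\n", destination(&cold_file));
	return 0;
}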
@@ -1350,12 +1354,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1350 return low; 1354 return low;
1351} 1355}
1352 1356
1357static int inactive_file_is_low_global(struct zone *zone)
1358{
1359 unsigned long active, inactive;
1360
1361 active = zone_page_state(zone, NR_ACTIVE_FILE);
1362 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1363
1364 return (active > inactive);
1365}
1366
1367/**
1368 * inactive_file_is_low - check if file pages need to be deactivated
1369 * @zone: zone to check
1370 * @sc: scan control of this context
1371 *
1372 * When the system is doing streaming IO, memory pressure here
1373 * ensures that active file pages get deactivated, until more
1374 * than half of the file pages are on the inactive list.
1375 *
1376 * Once we get to that situation, protect the system's working
1377 * set from being evicted by disabling active file page aging.
1378 *
1379 * This uses a different ratio than the anonymous pages, because
1380 * the page cache uses a use-once replacement algorithm.
1381 */
1382static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1383{
1384 int low;
1385
1386 if (scanning_global_lru(sc))
1387 low = inactive_file_is_low_global(zone);
1388 else
1389 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1390 return low;
1391}
1392
1353static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1393static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1354 struct zone *zone, struct scan_control *sc, int priority) 1394 struct zone *zone, struct scan_control *sc, int priority)
1355{ 1395{
1356 int file = is_file_lru(lru); 1396 int file = is_file_lru(lru);
1357 1397
1358 if (lru == LRU_ACTIVE_FILE) { 1398 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
1359 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1399 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1360 return 0; 1400 return 0;
1361 } 1401 }
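Editor's note: inactive_file_is_low() enables active-file aging only while active file pages outnumber inactive ones, so streaming IO deactivates pages until roughly half the file pages sit on the inactive list and the working set is then left alone. A one-function model of that check:

/* Model of inactive_file_is_low_global(): deactivate file pages only while
 * active file pages outnumber inactive ones. */
#include <stdio.h>

static int inactive_file_is_low(unsigned long active, unsigned long inactive)
{
	return active > inactive;
}

int main(void)
{
	/* Streaming IO keeps growing the inactive list... */
	printf("active=900 inactive=100 -> deactivate? %d\n",
	       inactive_file_is_low(900, 100));
	/* ...until the lists balance and the working set stops being aged. */
	printf("active=400 inactive=600 -> deactivate? %d\n",
	       inactive_file_is_low(400, 600));
	return 0;
}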
@@ -1384,13 +1424,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1384 unsigned long ap, fp; 1424 unsigned long ap, fp;
1385 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1425 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1386 1426
1387 /* If we have no swap space, do not bother scanning anon pages. */
1388 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1389 percent[0] = 0;
1390 percent[1] = 100;
1391 return;
1392 }
1393
1394 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1427 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1395 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1428 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1396 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1429 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1400,7 +1433,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1400 free = zone_page_state(zone, NR_FREE_PAGES); 1433 free = zone_page_state(zone, NR_FREE_PAGES);
1401 /* If we have very few page cache pages, 1434 /* If we have very few page cache pages,
1402 force-scan anon pages. */ 1435 force-scan anon pages. */
1403 if (unlikely(file + free <= zone->pages_high)) { 1436 if (unlikely(file + free <= high_wmark_pages(zone))) {
1404 percent[0] = 100; 1437 percent[0] = 100;
1405 percent[1] = 0; 1438 percent[1] = 0;
1406 return; 1439 return;
@@ -1455,6 +1488,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1455 percent[1] = 100 - percent[0]; 1488 percent[1] = 100 - percent[0];
1456} 1489}
1457 1490
1491/*
1492 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
 1493 * until we have collected @swap_cluster_max pages to scan.
1494 */
1495static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1496 unsigned long *nr_saved_scan,
1497 unsigned long swap_cluster_max)
1498{
1499 unsigned long nr;
1500
1501 *nr_saved_scan += nr_to_scan;
1502 nr = *nr_saved_scan;
1503
1504 if (nr >= swap_cluster_max)
1505 *nr_saved_scan = 0;
1506 else
1507 nr = 0;
1508
1509 return nr;
1510}
1458 1511
1459/* 1512/*
1460 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1513 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
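Editor's note: nr_scan_try_batch() accumulates the small per-priority scan targets in nr_saved_scan and only releases work once swap_cluster_max pages are pending, so short lists are no longer scanned a handful of pages at a time. Below is a standalone copy of the helper with a short driver showing the batching; 32 is SWAP_CLUSTER_MAX on common configurations.

/* Standalone copy of nr_scan_try_batch() with a demo driver. */
#include <stdio.h>

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= swap_cluster_max)
		*nr_saved_scan = 0;
	else
		nr = 0;

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	const unsigned long cluster = 32;

	/* Nine priority rounds asking for 7 pages each: work is released in
	 * batches of at least 'cluster' pages instead of dribbling out. */
	for (int round = 0; round < 9; round++) {
		unsigned long nr = nr_scan_try_batch(7, &saved, cluster);

		printf("round %d: scan %lu (saved %lu)\n", round, nr, saved);
	}
	return 0;
}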
@@ -1468,26 +1521,30 @@ static void shrink_zone(int priority, struct zone *zone,
1468 enum lru_list l; 1521 enum lru_list l;
1469 unsigned long nr_reclaimed = sc->nr_reclaimed; 1522 unsigned long nr_reclaimed = sc->nr_reclaimed;
1470 unsigned long swap_cluster_max = sc->swap_cluster_max; 1523 unsigned long swap_cluster_max = sc->swap_cluster_max;
1524 int noswap = 0;
1471 1525
1472 get_scan_ratio(zone, sc, percent); 1526 /* If we have no swap space, do not bother scanning anon pages. */
1527 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1528 noswap = 1;
1529 percent[0] = 0;
1530 percent[1] = 100;
1531 } else
1532 get_scan_ratio(zone, sc, percent);
1473 1533
1474 for_each_evictable_lru(l) { 1534 for_each_evictable_lru(l) {
1475 int file = is_file_lru(l); 1535 int file = is_file_lru(l);
1476 unsigned long scan; 1536 unsigned long scan;
1477 1537
1478 scan = zone_nr_pages(zone, sc, l); 1538 scan = zone_nr_pages(zone, sc, l);
1479 if (priority) { 1539 if (priority || noswap) {
1480 scan >>= priority; 1540 scan >>= priority;
1481 scan = (scan * percent[file]) / 100; 1541 scan = (scan * percent[file]) / 100;
1482 } 1542 }
1483 if (scanning_global_lru(sc)) { 1543 if (scanning_global_lru(sc))
1484 zone->lru[l].nr_scan += scan; 1544 nr[l] = nr_scan_try_batch(scan,
1485 nr[l] = zone->lru[l].nr_scan; 1545 &zone->lru[l].nr_saved_scan,
1486 if (nr[l] >= swap_cluster_max) 1546 swap_cluster_max);
1487 zone->lru[l].nr_scan = 0; 1547 else
1488 else
1489 nr[l] = 0;
1490 } else
1491 nr[l] = scan; 1548 nr[l] = scan;
1492 } 1549 }
1493 1550
@@ -1521,7 +1578,7 @@ static void shrink_zone(int priority, struct zone *zone,
1521 * Even if we did not try to evict anon pages at all, we want to 1578 * Even if we did not try to evict anon pages at all, we want to
1522 * rebalance the anon lru active/inactive ratio. 1579 * rebalance the anon lru active/inactive ratio.
1523 */ 1580 */
1524 if (inactive_anon_is_low(zone, sc)) 1581 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
1525 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1582 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1526 1583
1527 throttle_vm_writeout(sc->gfp_mask); 1584 throttle_vm_writeout(sc->gfp_mask);
@@ -1532,11 +1589,13 @@ static void shrink_zone(int priority, struct zone *zone,
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
  *
- * We reclaim from a zone even if that zone is over pages_high.  Because:
+ * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
+ * Because:
  * a) The caller may be trying to free *extra* pages to satisfy a higher-order
  *    allocation or
- * b) The zones may be over pages_high but they must go *over* pages_high to
- *    satisfy the `incremental min' zone defense algorithm.
+ * b) The target zone may be at high_wmark_pages(zone) but the lower zones
+ *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
+ *    zone defense algorithm.
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
@@ -1742,7 +1801,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,

 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at pages_high.
+ * they are all at high_wmark_pages(zone).
  *
  * Returns the number of pages which were actually freed.
  *
@@ -1755,11 +1814,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  * the zone for when the problem goes away.
  *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
- * zones which have free_pages > pages_high, but once a zone is found to have
- * free_pages <= pages_high, we scan that zone and the lower zones regardless
- * of the number of free pages in the lower zones. This interoperates with
- * the page allocator fallback scheme to ensure that aging of pages is balanced
- * across the zones.
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
+ * lower zones regardless of the number of free pages in the lower zones. This
+ * interoperates with the page allocator fallback scheme to ensure that aging
+ * of pages is balanced across the zones.
  */
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
@@ -1780,7 +1839,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to free_pages == pages_high.
+	 * this zone was successfully refilled to
+	 * free_pages == high_wmark_pages(zone).
 	 */
 	int temp_priority[MAX_NR_ZONES];

@@ -1825,8 +1885,8 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);

-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       0, 0)) {
+			if (!zone_watermark_ok(zone, order,
+			    high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
 			}
@@ -1860,8 +1920,8 @@ loop_again:
 			    priority != DEF_PRIORITY)
 				continue;

-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), end_zone, 0))
 				all_zones_ok = 0;
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
@@ -1870,8 +1930,8 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
-					       end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2037,7 +2097,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;

 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
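The hunks in this file only consume the new watermark accessors; their definitions live outside mm/vmscan.c. A minimal userspace sketch of the assumed shape, a per-zone watermark[] array indexed by WMARK_MIN/LOW/HIGH, purely as a reading aid (names and values here are assumptions, not quoted from the patch):

#include <stdio.h>

/* Assumed shape of the watermark storage that replaces the old
 * zone->pages_min/pages_low/pages_high fields. */
enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

struct zone {
	unsigned long watermark[NR_WMARK];
};

#define min_wmark_pages(z)  ((z)->watermark[WMARK_MIN])
#define low_wmark_pages(z)  ((z)->watermark[WMARK_LOW])
#define high_wmark_pages(z) ((z)->watermark[WMARK_HIGH])

int main(void)
{
	struct zone zone = { .watermark = { 128, 160, 192 } };	/* made-up values */

	/* kswapd is woken when free pages drop below the low watermark... */
	printf("wake kswapd below %lu free pages\n", low_wmark_pages(&zone));
	/* ...and balance_pgdat() keeps reclaiming until the high mark is met. */
	printf("balance until %lu free pages\n", high_wmark_pages(&zone));
	return 0;
}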
@@ -2084,11 +2144,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 						l == LRU_ACTIVE_FILE))
 				continue;

-			zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
-			if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+			zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
+			if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
 				unsigned long nr_to_scan;

-				zone->lru[l].nr_scan = 0;
+				zone->lru[l].nr_saved_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
 				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
@@ -2290,6 +2350,48 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;

+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+		zone_page_state(zone, NR_ACTIVE_FILE);
+
+	/*
+	 * It's possible for there to be more file mapped pages than
+	 * accounted for by the pages on the file LRU lists because
+	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+	 */
+	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+	long nr_pagecache_reclaimable;
+	long delta = 0;
+
+	/*
+	 * If RECLAIM_SWAP is set, then all file pages are considered
+	 * potentially reclaimable. Otherwise, we have to worry about
+	 * pages like swapcache and zone_unmapped_file_pages() provides
+	 * a better estimate
+	 */
+	if (zone_reclaim_mode & RECLAIM_SWAP)
+		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+	else
+		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+	/* If we can't clean pages, remove dirty pages from consideration */
+	if (!(zone_reclaim_mode & RECLAIM_WRITE))
+		delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+	/* Watch for any possible underflows due to delta */
+	if (unlikely(delta > nr_pagecache_reclaimable))
+		delta = nr_pagecache_reclaimable;
+
+	return nr_pagecache_reclaimable - delta;
+}
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
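zone_pagecache_reclaimable() above estimates how much page cache zone_reclaim() can actually free under the current zone_reclaim_mode: mapped pages only count when RECLAIM_SWAP allows unmapping, and dirty pages are discounted unless RECLAIM_WRITE allows writeback. A userspace sketch with stubbed counters follows; the counter values and the RECLAIM_* bit values are assumptions for illustration.

#include <stdio.h>

/* Reclaim mode bits as consumed by the helpers; values assumed here. */
#define RECLAIM_WRITE	(1 << 1)	/* writeback of dirty pages allowed */
#define RECLAIM_SWAP	(1 << 2)	/* unmapping/swapping allowed */

/* Stub vmstat counters for one zone (illustrative numbers, in pages). */
static long nr_file_pages  = 10000;
static long nr_file_mapped = 3000;
static long nr_file_lru    = 7000;	/* inactive_file + active_file */
static long nr_file_dirty  = 2500;

static long zone_unmapped_file_pages(void)
{
	/* tmpfs pages can be FILE_MAPPED while living on the anon LRU,
	 * so clamp at zero rather than underflow. */
	return (nr_file_lru > nr_file_mapped) ? (nr_file_lru - nr_file_mapped) : 0;
}

static long zone_pagecache_reclaimable(int zone_reclaim_mode)
{
	long nr, delta = 0;

	if (zone_reclaim_mode & RECLAIM_SWAP)
		nr = nr_file_pages;
	else
		nr = zone_unmapped_file_pages();

	if (!(zone_reclaim_mode & RECLAIM_WRITE))
		delta += nr_file_dirty;

	if (delta > nr)	/* guard against underflow */
		delta = nr;

	return nr - delta;
}

int main(void)
{
	printf("mode 0:                  %ld reclaimable\n",
	       zone_pagecache_reclaimable(0));
	printf("mode RECLAIM_WRITE:      %ld reclaimable\n",
	       zone_pagecache_reclaimable(RECLAIM_WRITE));
	printf("mode RECLAIM_WRITE|SWAP: %ld reclaimable\n",
	       zone_pagecache_reclaimable(RECLAIM_WRITE | RECLAIM_SWAP));
	return 0;
}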
@@ -2324,9 +2426,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;

-	if (zone_page_state(zone, NR_FILE_PAGES) -
-		zone_page_state(zone, NR_FILE_MAPPED) >
-		zone->min_unmapped_pages) {
+	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
@@ -2384,20 +2484,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * if less than a specified percentage of the zone is used by
 	 * unmapped file backed pages.
 	 */
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
-	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
-	    <= zone->min_slab_pages)
-		return 0;
+	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+		return ZONE_RECLAIM_FULL;

 	if (zone_is_all_unreclaimable(zone))
-		return 0;
+		return ZONE_RECLAIM_FULL;

 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
 	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;

 	/*
 	 * Only run zone reclaim on the local zone or on zones that do not
@@ -2407,18 +2505,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	node_id = zone_to_nid(zone);
 	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;

 	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
+
 	ret = __zone_reclaim(zone, gfp_mask, order);
 	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

+	if (!ret)
+		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
 	return ret;
 }
 #endif

-#ifdef CONFIG_UNEVICTABLE_LRU
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
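zone_reclaim() now reports why it gave up instead of returning a bare 0, so a caller can distinguish "did not scan" from "scanned and found nothing". ZONE_RECLAIM_NOSCAN and ZONE_RECLAIM_FULL are the names used in the hunks above; the remaining names, the numeric values, and the caller below are assumptions sketched for illustration, since the definitions sit outside this hunk.

#include <stdio.h>

/* Assumed encoding of the zone_reclaim() return codes. */
#define ZONE_RECLAIM_NOSCAN	-2	/* did not scan at all */
#define ZONE_RECLAIM_FULL	-1	/* scanned but found nothing reclaimable */
#define ZONE_RECLAIM_SOME	0	/* reclaimed some pages, not enough */
#define ZONE_RECLAIM_SUCCESS	1	/* reclaimed enough pages */

/* Hypothetical caller policy: only treat a zone as full when it was
 * genuinely scanned and came up empty. */
static const char *classify(int ret)
{
	switch (ret) {
	case ZONE_RECLAIM_NOSCAN:
		return "skip zone without marking it full";
	case ZONE_RECLAIM_FULL:
		return "mark zone full for this allocation";
	case ZONE_RECLAIM_SOME:
		return "recheck the watermark";
	default:
		return "watermark should now be met";
	}
}

int main(void)
{
	int codes[] = { ZONE_RECLAIM_NOSCAN, ZONE_RECLAIM_FULL,
			ZONE_RECLAIM_SOME, ZONE_RECLAIM_SUCCESS };
	unsigned int i;

	for (i = 0; i < sizeof(codes) / sizeof(codes[0]); i++)
		printf("%2d -> %s\n", codes[i], classify(codes[i]));
	return 0;
}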
@@ -2665,4 +2766,3 @@ void scan_unevictable_unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
 }

-#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 74d66dba0cbe..138bed53706e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = {
629 "nr_active_anon", 629 "nr_active_anon",
630 "nr_inactive_file", 630 "nr_inactive_file",
631 "nr_active_file", 631 "nr_active_file",
632#ifdef CONFIG_UNEVICTABLE_LRU
633 "nr_unevictable", 632 "nr_unevictable",
634 "nr_mlock", 633 "nr_mlock",
635#endif
636 "nr_anon_pages", 634 "nr_anon_pages",
637 "nr_mapped", 635 "nr_mapped",
638 "nr_file_pages", 636 "nr_file_pages",
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = {
 	TEXTS_FOR_ZONES("pgscan_kswapd")
 	TEXTS_FOR_ZONES("pgscan_direct")

+#ifdef CONFIG_NUMA
+	"zone_reclaim_failed",
+#endif
 	"pginodesteal",
 	"slabs_scanned",
 	"kswapd_steal",
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = {
687 "htlb_buddy_alloc_success", 688 "htlb_buddy_alloc_success",
688 "htlb_buddy_alloc_fail", 689 "htlb_buddy_alloc_fail",
689#endif 690#endif
690#ifdef CONFIG_UNEVICTABLE_LRU
691 "unevictable_pgs_culled", 691 "unevictable_pgs_culled",
692 "unevictable_pgs_scanned", 692 "unevictable_pgs_scanned",
693 "unevictable_pgs_rescued", 693 "unevictable_pgs_rescued",
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = {
697 "unevictable_pgs_stranded", 697 "unevictable_pgs_stranded",
698 "unevictable_pgs_mlockfreed", 698 "unevictable_pgs_mlockfreed",
699#endif 699#endif
700#endif
701}; 700};
702 701
703static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 702static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
710 "\n min %lu" 709 "\n min %lu"
711 "\n low %lu" 710 "\n low %lu"
712 "\n high %lu" 711 "\n high %lu"
713 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" 712 "\n scanned %lu"
714 "\n spanned %lu" 713 "\n spanned %lu"
715 "\n present %lu", 714 "\n present %lu",
716 zone_page_state(zone, NR_FREE_PAGES), 715 zone_page_state(zone, NR_FREE_PAGES),
717 zone->pages_min, 716 min_wmark_pages(zone),
718 zone->pages_low, 717 low_wmark_pages(zone),
719 zone->pages_high, 718 high_wmark_pages(zone),
720 zone->pages_scanned, 719 zone->pages_scanned,
721 zone->lru[LRU_ACTIVE_ANON].nr_scan,
722 zone->lru[LRU_INACTIVE_ANON].nr_scan,
723 zone->lru[LRU_ACTIVE_FILE].nr_scan,
724 zone->lru[LRU_INACTIVE_FILE].nr_scan,
725 zone->spanned_pages, 720 zone->spanned_pages,
726 zone->present_pages); 721 zone->present_pages);
727 722
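With the per-LRU nr_scan fields gone, the scanned line in /proc/zoneinfo carries a single value and the watermarks are printed through the accessors. A minimal reader that pulls each zone's min/low/high lines, assuming the layout produced by the format string above:

#include <stdio.h>
#include <string.h>

/* Walk /proc/zoneinfo and print each zone's min/low/high watermarks,
 * matching the lines emitted by zoneinfo_show_print(). */
int main(void)
{
	char line[256], zone[64] = "?";
	FILE *fp = fopen("/proc/zoneinfo", "r");

	if (!fp) {
		perror("/proc/zoneinfo");
		return 1;
	}

	while (fgets(line, sizeof(line), fp)) {
		unsigned long val;

		if (sscanf(line, "Node %*d, zone %63s", zone) == 1)
			continue;
		if (sscanf(line, " min %lu", &val) == 1)
			printf("%s: min  %lu\n", zone, val);
		else if (sscanf(line, " low %lu", &val) == 1)
			printf("%s: low  %lu\n", zone, val);
		else if (sscanf(line, " high %lu", &val) == 1)
			printf("%s: high %lu\n", zone, val);
	}

	fclose(fp);
	return 0;
}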