aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig14
-rw-r--r--mm/Makefile1
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c169
-rw-r--r--mm/hugetlb.c106
-rw-r--r--mm/init-mm.c20
-rw-r--r--mm/internal.h33
-rw-r--r--mm/madvise.c26
-rw-r--r--mm/memcontrol.c11
-rw-r--r--mm/memory.c128
-rw-r--r--mm/memory_hotplug.c6
-rw-r--r--mm/mempolicy.c145
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/mlock.c22
-rw-r--r--mm/oom_kill.c64
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c754
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/readahead.c145
-rw-r--r--mm/rmap.c40
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c11
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swap_state.c17
-rw-r--r--mm/swapfile.c276
-rw-r--r--mm/truncate.c39
-rw-r--r--mm/util.c16
-rw-r--r--mm/vmscan.c372
-rw-r--r--mm/vmstat.c19
30 files changed, 1598 insertions, 875 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 6f4610a9ce55..c948d4ca8bde 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -203,25 +203,13 @@ config VIRT_TO_BUS
203 def_bool y 203 def_bool y
204 depends on !ARCH_NO_VIRT_TO_BUS 204 depends on !ARCH_NO_VIRT_TO_BUS
205 205
206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages"
208 default y
209 help
210 Keeps unevictable pages off of the active and inactive pageout
211 lists, so kswapd will not waste CPU time or have its balancing
212 algorithms thrown off by scanning these pages. Selecting this
213 will use one page flag and increase the code size a little,
214 say Y unless you know what you are doing.
215
216 See Documentation/vm/unevictable-lru.txt for more information.
217
218config HAVE_MLOCK 206config HAVE_MLOCK
219 bool 207 bool
220 default y if MMU=y 208 default y if MMU=y
221 209
222config HAVE_MLOCKED_PAGE_BIT 210config HAVE_MLOCKED_PAGE_BIT
223 bool 211 bool
224 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y 212 default y if HAVE_MLOCK=y
225 213
226config MMU_NOTIFIER 214config MMU_NOTIFIER
227 bool 215 bool
diff --git a/mm/Makefile b/mm/Makefile
index c379ce08354a..5e0bd6426693 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15obj-y += init-mm.o
15 16
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
101 101
102 ret = force_page_cache_readahead(mapping, file, 102 ret = force_page_cache_readahead(mapping, file,
103 start_index, 103 start_index,
104 max_sane_readahead(nrpages)); 104 nrpages);
105 if (ret > 0) 105 if (ret > 0)
106 ret = 0; 106 ret = 0;
107 break; 107 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1b60f30cebfa..22396713feb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
521{ 521{
522 if (cpuset_do_page_mem_spread()) { 522 if (cpuset_do_page_mem_spread()) {
523 int n = cpuset_mem_spread_node(); 523 int n = cpuset_mem_spread_node();
524 return alloc_pages_node(n, gfp, 0); 524 return alloc_pages_exact_node(n, gfp, 0);
525 } 525 }
526 return alloc_pages(gfp, 0); 526 return alloc_pages(gfp, 0);
527} 527}
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
1004static void shrink_readahead_size_eio(struct file *filp, 1004static void shrink_readahead_size_eio(struct file *filp,
1005 struct file_ra_state *ra) 1005 struct file_ra_state *ra)
1006{ 1006{
1007 if (!ra->ra_pages)
1008 return;
1009
1010 ra->ra_pages /= 4; 1007 ra->ra_pages /= 4;
1011} 1008}
1012 1009
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1390 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1387 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1391 return -EINVAL; 1388 return -EINVAL;
1392 1389
1393 force_page_cache_readahead(mapping, filp, index, 1390 force_page_cache_readahead(mapping, filp, index, nr);
1394 max_sane_readahead(nr));
1395 return 0; 1391 return 0;
1396} 1392}
1397 1393
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
1457 1453
1458#define MMAP_LOTSAMISS (100) 1454#define MMAP_LOTSAMISS (100)
1459 1455
1456/*
1457 * Synchronous readahead happens when we don't even find
1458 * a page in the page cache at all.
1459 */
1460static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1461 struct file_ra_state *ra,
1462 struct file *file,
1463 pgoff_t offset)
1464{
1465 unsigned long ra_pages;
1466 struct address_space *mapping = file->f_mapping;
1467
1468 /* If we don't want any read-ahead, don't bother */
1469 if (VM_RandomReadHint(vma))
1470 return;
1471
1472 if (VM_SequentialReadHint(vma) ||
1473 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1474 page_cache_sync_readahead(mapping, ra, file, offset,
1475 ra->ra_pages);
1476 return;
1477 }
1478
1479 if (ra->mmap_miss < INT_MAX)
1480 ra->mmap_miss++;
1481
1482 /*
1483 * Do we miss much more than hit in this file? If so,
1484 * stop bothering with read-ahead. It will only hurt.
1485 */
1486 if (ra->mmap_miss > MMAP_LOTSAMISS)
1487 return;
1488
1489 /*
1490 * mmap read-around
1491 */
1492 ra_pages = max_sane_readahead(ra->ra_pages);
1493 if (ra_pages) {
1494 ra->start = max_t(long, 0, offset - ra_pages/2);
1495 ra->size = ra_pages;
1496 ra->async_size = 0;
1497 ra_submit(ra, mapping, file);
1498 }
1499}
1500
1501/*
1502 * Asynchronous readahead happens when we find the page and PG_readahead,
1503 * so we want to possibly extend the readahead further.
1504 */
1505static void do_async_mmap_readahead(struct vm_area_struct *vma,
1506 struct file_ra_state *ra,
1507 struct file *file,
1508 struct page *page,
1509 pgoff_t offset)
1510{
1511 struct address_space *mapping = file->f_mapping;
1512
1513 /* If we don't want any read-ahead, don't bother */
1514 if (VM_RandomReadHint(vma))
1515 return;
1516 if (ra->mmap_miss > 0)
1517 ra->mmap_miss--;
1518 if (PageReadahead(page))
1519 page_cache_async_readahead(mapping, ra, file,
1520 page, offset, ra->ra_pages);
1521}
1522
1460/** 1523/**
1461 * filemap_fault - read in file data for page fault handling 1524 * filemap_fault - read in file data for page fault handling
1462 * @vma: vma in which the fault was taken 1525 * @vma: vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1476 struct address_space *mapping = file->f_mapping; 1539 struct address_space *mapping = file->f_mapping;
1477 struct file_ra_state *ra = &file->f_ra; 1540 struct file_ra_state *ra = &file->f_ra;
1478 struct inode *inode = mapping->host; 1541 struct inode *inode = mapping->host;
1542 pgoff_t offset = vmf->pgoff;
1479 struct page *page; 1543 struct page *page;
1480 pgoff_t size; 1544 pgoff_t size;
1481 int did_readaround = 0;
1482 int ret = 0; 1545 int ret = 0;
1483 1546
1484 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1547 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1485 if (vmf->pgoff >= size) 1548 if (offset >= size)
1486 return VM_FAULT_SIGBUS; 1549 return VM_FAULT_SIGBUS;
1487 1550
1488 /* If we don't want any read-ahead, don't bother */
1489 if (VM_RandomReadHint(vma))
1490 goto no_cached_page;
1491
1492 /* 1551 /*
1493 * Do we have something in the page cache already? 1552 * Do we have something in the page cache already?
1494 */ 1553 */
1495retry_find: 1554 page = find_get_page(mapping, offset);
1496 page = find_lock_page(mapping, vmf->pgoff); 1555 if (likely(page)) {
1497 /*
1498 * For sequential accesses, we use the generic readahead logic.
1499 */
1500 if (VM_SequentialReadHint(vma)) {
1501 if (!page) {
1502 page_cache_sync_readahead(mapping, ra, file,
1503 vmf->pgoff, 1);
1504 page = find_lock_page(mapping, vmf->pgoff);
1505 if (!page)
1506 goto no_cached_page;
1507 }
1508 if (PageReadahead(page)) {
1509 page_cache_async_readahead(mapping, ra, file, page,
1510 vmf->pgoff, 1);
1511 }
1512 }
1513
1514 if (!page) {
1515 unsigned long ra_pages;
1516
1517 ra->mmap_miss++;
1518
1519 /* 1556 /*
1520 * Do we miss much more than hit in this file? If so, 1557 * We found the page, so try async readahead before
1521 * stop bothering with read-ahead. It will only hurt. 1558 * waiting for the lock.
1522 */ 1559 */
1523 if (ra->mmap_miss > MMAP_LOTSAMISS) 1560 do_async_mmap_readahead(vma, ra, file, page, offset);
1524 goto no_cached_page; 1561 lock_page(page);
1525 1562
1526 /* 1563 /* Did it get truncated? */
1527 * To keep the pgmajfault counter straight, we need to 1564 if (unlikely(page->mapping != mapping)) {
1528 * check did_readaround, as this is an inner loop. 1565 unlock_page(page);
1529 */ 1566 put_page(page);
1530 if (!did_readaround) { 1567 goto no_cached_page;
1531 ret = VM_FAULT_MAJOR;
1532 count_vm_event(PGMAJFAULT);
1533 }
1534 did_readaround = 1;
1535 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1536 if (ra_pages) {
1537 pgoff_t start = 0;
1538
1539 if (vmf->pgoff > ra_pages / 2)
1540 start = vmf->pgoff - ra_pages / 2;
1541 do_page_cache_readahead(mapping, file, start, ra_pages);
1542 } 1568 }
1543 page = find_lock_page(mapping, vmf->pgoff); 1569 } else {
1570 /* No page in the page cache at all */
1571 do_sync_mmap_readahead(vma, ra, file, offset);
1572 count_vm_event(PGMAJFAULT);
1573 ret = VM_FAULT_MAJOR;
1574retry_find:
1575 page = find_lock_page(mapping, offset);
1544 if (!page) 1576 if (!page)
1545 goto no_cached_page; 1577 goto no_cached_page;
1546 } 1578 }
1547 1579
1548 if (!did_readaround)
1549 ra->mmap_miss--;
1550
1551 /* 1580 /*
1552 * We have a locked page in the page cache, now we need to check 1581 * We have a locked page in the page cache, now we need to check
1553 * that it's up-to-date. If not, it is going to be due to an error. 1582 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
1555 if (unlikely(!PageUptodate(page))) 1584 if (unlikely(!PageUptodate(page)))
1556 goto page_not_uptodate; 1585 goto page_not_uptodate;
1557 1586
1558 /* Must recheck i_size under page lock */ 1587 /*
1588 * Found the page and have a reference on it.
1589 * We must recheck i_size under page lock.
1590 */
1559 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1591 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1560 if (unlikely(vmf->pgoff >= size)) { 1592 if (unlikely(offset >= size)) {
1561 unlock_page(page); 1593 unlock_page(page);
1562 page_cache_release(page); 1594 page_cache_release(page);
1563 return VM_FAULT_SIGBUS; 1595 return VM_FAULT_SIGBUS;
1564 } 1596 }
1565 1597
1566 /* 1598 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1567 * Found the page and have a reference on it.
1568 */
1569 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1570 vmf->page = page; 1599 vmf->page = page;
1571 return ret | VM_FAULT_LOCKED; 1600 return ret | VM_FAULT_LOCKED;
1572 1601
@@ -1575,7 +1604,7 @@ no_cached_page:
1575 * We're only likely to ever get here if MADV_RANDOM is in 1604 * We're only likely to ever get here if MADV_RANDOM is in
1576 * effect. 1605 * effect.
1577 */ 1606 */
1578 error = page_cache_read(file, vmf->pgoff); 1607 error = page_cache_read(file, offset);
1579 1608
1580 /* 1609 /*
1581 * The page we want has now been added to the page cache. 1610 * The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
1595 return VM_FAULT_SIGBUS; 1624 return VM_FAULT_SIGBUS;
1596 1625
1597page_not_uptodate: 1626page_not_uptodate:
1598 /* IO error path */
1599 if (!did_readaround) {
1600 ret = VM_FAULT_MAJOR;
1601 count_vm_event(PGMAJFAULT);
1602 }
1603
1604 /* 1627 /*
1605 * Umm, take care of errors if the page isn't up-to-date. 1628 * Umm, take care of errors if the page isn't up-to-date.
1606 * Try to re-read it _once_. We do this synchronously, 1629 * Try to re-read it _once_. We do this synchronously,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e83ad2c9228c..a56e6f3ce979 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
578 hugetlb_put_quota(mapping, 1); 578 hugetlb_put_quota(mapping, 1);
579} 579}
580 580
581/*
582 * Increment or decrement surplus_huge_pages. Keep node-specific counters
583 * balanced by operating on them in a round-robin fashion.
584 * Returns 1 if an adjustment was made.
585 */
586static int adjust_pool_surplus(struct hstate *h, int delta)
587{
588 static int prev_nid;
589 int nid = prev_nid;
590 int ret = 0;
591
592 VM_BUG_ON(delta != -1 && delta != 1);
593 do {
594 nid = next_node(nid, node_online_map);
595 if (nid == MAX_NUMNODES)
596 nid = first_node(node_online_map);
597
598 /* To shrink on this node, there must be a surplus page */
599 if (delta < 0 && !h->surplus_huge_pages_node[nid])
600 continue;
601 /* Surplus cannot exceed the total number of pages */
602 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
603 h->nr_huge_pages_node[nid])
604 continue;
605
606 h->surplus_huge_pages += delta;
607 h->surplus_huge_pages_node[nid] += delta;
608 ret = 1;
609 break;
610 } while (nid != prev_nid);
611
612 prev_nid = nid;
613 return ret;
614}
615
616static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 581static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
617{ 582{
618 set_compound_page_dtor(page, free_huge_page); 583 set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
623 put_page(page); /* free it into the hugepage allocator */ 588 put_page(page); /* free it into the hugepage allocator */
624} 589}
625 590
591static void prep_compound_gigantic_page(struct page *page, unsigned long order)
592{
593 int i;
594 int nr_pages = 1 << order;
595 struct page *p = page + 1;
596
597 /* we rely on prep_new_huge_page to set the destructor */
598 set_compound_order(page, order);
599 __SetPageHead(page);
600 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
601 __SetPageTail(p);
602 p->first_page = page;
603 }
604}
605
606int PageHuge(struct page *page)
607{
608 compound_page_dtor *dtor;
609
610 if (!PageCompound(page))
611 return 0;
612
613 page = compound_head(page);
614 dtor = get_compound_page_dtor(page);
615
616 return dtor == free_huge_page;
617}
618
626static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 619static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
627{ 620{
628 struct page *page; 621 struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
630 if (h->order >= MAX_ORDER) 623 if (h->order >= MAX_ORDER)
631 return NULL; 624 return NULL;
632 625
633 page = alloc_pages_node(nid, 626 page = alloc_pages_exact_node(nid,
634 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 627 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
635 __GFP_REPEAT|__GFP_NOWARN, 628 __GFP_REPEAT|__GFP_NOWARN,
636 huge_page_order(h)); 629 huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 * Use a helper variable to find the next node and then 642 * Use a helper variable to find the next node and then
650 * copy it back to hugetlb_next_nid afterwards: 643 * copy it back to hugetlb_next_nid afterwards:
651 * otherwise there's a window in which a racer might 644 * otherwise there's a window in which a racer might
652 * pass invalid nid MAX_NUMNODES to alloc_pages_node. 645 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
653 * But we don't need to use a spin_lock here: it really 646 * But we don't need to use a spin_lock here: it really
654 * doesn't matter if occasionally a racer chooses the 647 * doesn't matter if occasionally a racer chooses the
655 * same nid as we do. Move nid forward in the mask even 648 * same nid as we do. Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
875 * can no longer free unreserved surplus pages. This occurs when 868 * can no longer free unreserved surplus pages. This occurs when
876 * the nodes with surplus pages have no free pages. 869 * the nodes with surplus pages have no free pages.
877 */ 870 */
878 unsigned long remaining_iterations = num_online_nodes(); 871 unsigned long remaining_iterations = nr_online_nodes;
879 872
880 /* Uncommit the reservation */ 873 /* Uncommit the reservation */
881 h->resv_huge_pages -= unused_resv_pages; 874 h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
904 h->surplus_huge_pages--; 897 h->surplus_huge_pages--;
905 h->surplus_huge_pages_node[nid]--; 898 h->surplus_huge_pages_node[nid]--;
906 nr_pages--; 899 nr_pages--;
907 remaining_iterations = num_online_nodes(); 900 remaining_iterations = nr_online_nodes;
908 } 901 }
909 } 902 }
910} 903}
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140} 1133}
1141#endif 1134#endif
1142 1135
1136/*
1137 * Increment or decrement surplus_huge_pages. Keep node-specific counters
1138 * balanced by operating on them in a round-robin fashion.
1139 * Returns 1 if an adjustment was made.
1140 */
1141static int adjust_pool_surplus(struct hstate *h, int delta)
1142{
1143 static int prev_nid;
1144 int nid = prev_nid;
1145 int ret = 0;
1146
1147 VM_BUG_ON(delta != -1 && delta != 1);
1148 do {
1149 nid = next_node(nid, node_online_map);
1150 if (nid == MAX_NUMNODES)
1151 nid = first_node(node_online_map);
1152
1153 /* To shrink on this node, there must be a surplus page */
1154 if (delta < 0 && !h->surplus_huge_pages_node[nid])
1155 continue;
1156 /* Surplus cannot exceed the total number of pages */
1157 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
1158 h->nr_huge_pages_node[nid])
1159 continue;
1160
1161 h->surplus_huge_pages += delta;
1162 h->surplus_huge_pages_node[nid] += delta;
1163 ret = 1;
1164 break;
1165 } while (nid != prev_nid);
1166
1167 prev_nid = nid;
1168 return ret;
1169}
1170
1143#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1171#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1144static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1172static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1145{ 1173{
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
1#include <linux/mm_types.h>
2#include <linux/rbtree.h>
3#include <linux/rwsem.h>
4#include <linux/spinlock.h>
5#include <linux/list.h>
6#include <linux/cpumask.h>
7
8#include <asm/atomic.h>
9#include <asm/pgtable.h>
10
11struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT,
13 .pgd = swapper_pg_dir,
14 .mm_users = ATOMIC_INIT(2),
15 .mm_count = ATOMIC_INIT(1),
16 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL,
20};
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..f290c4db528b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19extern void prep_compound_page(struct page *page, unsigned long order);
20extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
21
22static inline void set_page_count(struct page *page, int v) 19static inline void set_page_count(struct page *page, int v)
23{ 20{
24 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
51 */ 48 */
52extern unsigned long highest_memmap_pfn; 49extern unsigned long highest_memmap_pfn;
53extern void __free_pages_bootmem(struct page *page, unsigned int order); 50extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order);
52
54 53
55/* 54/*
56 * function for dealing with page's order in buddy system. 55 * function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
74} 73}
75#endif 74#endif
76 75
77#ifdef CONFIG_UNEVICTABLE_LRU
78/* 76/*
79 * unevictable_migrate_page() called only from migrate_page_copy() to 77 * unevictable_migrate_page() called only from migrate_page_copy() to
80 * migrate unevictable flag to new page. 78 * migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
86 if (TestClearPageUnevictable(old)) 84 if (TestClearPageUnevictable(old))
87 SetPageUnevictable(new); 85 SetPageUnevictable(new);
88} 86}
89#else
90static inline void unevictable_migrate_page(struct page *new, struct page *old)
91{
92}
93#endif
94 87
95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT 88#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
96/* 89/*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
150 } 143 }
151} 144}
152 145
153/*
154 * free_page_mlock() -- clean up attempts to free and mlocked() page.
155 * Page should not be on lru, so no need to fix that up.
156 * free_pages_check() will verify...
157 */
158static inline void free_page_mlock(struct page *page)
159{
160 if (unlikely(TestClearPageMlocked(page))) {
161 unsigned long flags;
162
163 local_irq_save(flags);
164 __dec_zone_page_state(page, NR_MLOCK);
165 __count_vm_event(UNEVICTABLE_MLOCKFREED);
166 local_irq_restore(flags);
167 }
168}
169
170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 147static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
172{ 148{
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
175static inline void clear_page_mlock(struct page *page) { } 151static inline void clear_page_mlock(struct page *page) { }
176static inline void mlock_vma_page(struct page *page) { } 152static inline void mlock_vma_page(struct page *page) { }
177static inline void mlock_migrate_page(struct page *new, struct page *old) { } 153static inline void mlock_migrate_page(struct page *new, struct page *old) { }
178static inline void free_page_mlock(struct page *page) { }
179 154
180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 155#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
181 156
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
284 unsigned long start, int len, int flags, 259 unsigned long start, int len, int flags,
285 struct page **pages, struct vm_area_struct **vmas); 260 struct page **pages, struct vm_area_struct **vmas);
286 261
262#define ZONE_RECLAIM_NOSCAN -2
263#define ZONE_RECLAIM_FULL -1
264#define ZONE_RECLAIM_SOME 0
265#define ZONE_RECLAIM_SUCCESS 1
287#endif 266#endif
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..76eb4193acdd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
123 end = vma->vm_end; 123 end = vma->vm_end;
124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 124 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
125 125
126 force_page_cache_readahead(file->f_mapping, 126 force_page_cache_readahead(file->f_mapping, file, start, end - start);
127 file, start, max_sane_readahead(end - start));
128 return 0; 127 return 0;
129} 128}
130 129
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
239 break; 238 break;
240 239
241 default: 240 default:
242 error = -EINVAL; 241 BUG();
243 break; 242 break;
244 } 243 }
245 return error; 244 return error;
246} 245}
247 246
247static int
248madvise_behavior_valid(int behavior)
249{
250 switch (behavior) {
251 case MADV_DOFORK:
252 case MADV_DONTFORK:
253 case MADV_NORMAL:
254 case MADV_SEQUENTIAL:
255 case MADV_RANDOM:
256 case MADV_REMOVE:
257 case MADV_WILLNEED:
258 case MADV_DONTNEED:
259 return 1;
260
261 default:
262 return 0;
263 }
264}
248/* 265/*
249 * The madvise(2) system call. 266 * The madvise(2) system call.
250 * 267 *
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
290 int write; 307 int write;
291 size_t len; 308 size_t len;
292 309
310 if (!madvise_behavior_valid(behavior))
311 return error;
312
293 write = madvise_need_mmap_write(behavior); 313 write = madvise_need_mmap_write(behavior);
294 if (write) 314 if (write)
295 down_write(&current->mm->mmap_sem); 315 down_write(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78eb8552818b..70db6e0a5eec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
570 return 0; 570 return 0;
571} 571}
572 572
573int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
574{
575 unsigned long active;
576 unsigned long inactive;
577
578 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
579 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
580
581 return (active > inactive);
582}
583
573unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 584unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
574 struct zone *zone, 585 struct zone *zone,
575 enum lru_list lru) 586 enum lru_list lru)
diff --git a/mm/memory.c b/mm/memory.c
index 4126dd16778c..d5d1653d60a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1360 return i; 1360 return i;
1361} 1361}
1362 1362
1363/**
1364 * get_user_pages() - pin user pages in memory
1365 * @tsk: task_struct of target task
1366 * @mm: mm_struct of target mm
1367 * @start: starting user address
1368 * @len: number of pages from start to pin
1369 * @write: whether pages will be written to by the caller
1370 * @force: whether to force write access even if user mapping is
1371 * readonly. This will result in the page being COWed even
1372 * in MAP_SHARED mappings. You do not want this.
1373 * @pages: array that receives pointers to the pages pinned.
1374 * Should be at least nr_pages long. Or NULL, if caller
1375 * only intends to ensure the pages are faulted in.
1376 * @vmas: array of pointers to vmas corresponding to each page.
1377 * Or NULL if the caller does not require them.
1378 *
1379 * Returns number of pages pinned. This may be fewer than the number
1380 * requested. If len is 0 or negative, returns 0. If no pages
1381 * were pinned, returns -errno. Each page returned must be released
1382 * with a put_page() call when it is finished with. vmas will only
1383 * remain valid while mmap_sem is held.
1384 *
1385 * Must be called with mmap_sem held for read or write.
1386 *
1387 * get_user_pages walks a process's page tables and takes a reference to
1388 * each struct page that each user address corresponds to at a given
1389 * instant. That is, it takes the page that would be accessed if a user
1390 * thread accesses the given user virtual address at that instant.
1391 *
1392 * This does not guarantee that the page exists in the user mappings when
1393 * get_user_pages returns, and there may even be a completely different
1394 * page there in some cases (e.g. if mmapped pagecache has been invalidated
1395 * and subsequently re-faulted). However it does guarantee that the page
1396 * won't be freed completely. And mostly callers simply care that the page
1397 * contains data that was valid *at some point in time*. Typically, an IO
1398 * or similar operation cannot guarantee anything stronger anyway because
1399 * locks can't be held over the syscall boundary.
1400 *
1401 * If write=0, the page must not be written to. If the page is written to,
1402 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
1403 * after the page is finished with, and before put_page is called.
1404 *
1405 * get_user_pages is typically used for fewer-copy IO operations, to get a
1406 * handle on the memory by some means other than accesses via the user virtual
1407 * addresses. The pages may be submitted for DMA to devices or accessed via
1408 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1409 * use the correct cache flushing APIs.
1410 *
1411 * See also get_user_pages_fast, for performance critical applications.
1412 */
1363int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1364 unsigned long start, int len, int write, int force, 1414 unsigned long start, int len, int write, int force,
1365 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas)
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
3053 3103
3054#endif /* __HAVE_ARCH_GATE_AREA */ 3104#endif /* __HAVE_ARCH_GATE_AREA */
3055 3105
3056#ifdef CONFIG_HAVE_IOREMAP_PROT 3106static int follow_pte(struct mm_struct *mm, unsigned long address,
3057int follow_phys(struct vm_area_struct *vma, 3107 pte_t **ptepp, spinlock_t **ptlp)
3058 unsigned long address, unsigned int flags,
3059 unsigned long *prot, resource_size_t *phys)
3060{ 3108{
3061 pgd_t *pgd; 3109 pgd_t *pgd;
3062 pud_t *pud; 3110 pud_t *pud;
3063 pmd_t *pmd; 3111 pmd_t *pmd;
3064 pte_t *ptep, pte; 3112 pte_t *ptep;
3065 spinlock_t *ptl;
3066 resource_size_t phys_addr = 0;
3067 struct mm_struct *mm = vma->vm_mm;
3068 int ret = -EINVAL;
3069
3070 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3071 goto out;
3072 3113
3073 pgd = pgd_offset(mm, address); 3114 pgd = pgd_offset(mm, address);
3074 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3115 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
3086 if (pmd_huge(*pmd)) 3127 if (pmd_huge(*pmd))
3087 goto out; 3128 goto out;
3088 3129
3089 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 3130 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3090 if (!ptep) 3131 if (!ptep)
3091 goto out; 3132 goto out;
3133 if (!pte_present(*ptep))
3134 goto unlock;
3135 *ptepp = ptep;
3136 return 0;
3137unlock:
3138 pte_unmap_unlock(ptep, *ptlp);
3139out:
3140 return -EINVAL;
3141}
3092 3142
3143/**
3144 * follow_pfn - look up PFN at a user virtual address
3145 * @vma: memory mapping
3146 * @address: user virtual address
3147 * @pfn: location to store found PFN
3148 *
3149 * Only IO mappings and raw PFN mappings are allowed.
3150 *
3151 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3152 */
3153int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3154 unsigned long *pfn)
3155{
3156 int ret = -EINVAL;
3157 spinlock_t *ptl;
3158 pte_t *ptep;
3159
3160 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3161 return ret;
3162
3163 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3164 if (ret)
3165 return ret;
3166 *pfn = pte_pfn(*ptep);
3167 pte_unmap_unlock(ptep, ptl);
3168 return 0;
3169}
3170EXPORT_SYMBOL(follow_pfn);
3171
3172#ifdef CONFIG_HAVE_IOREMAP_PROT
3173int follow_phys(struct vm_area_struct *vma,
3174 unsigned long address, unsigned int flags,
3175 unsigned long *prot, resource_size_t *phys)
3176{
3177 int ret = -EINVAL;
3178 pte_t *ptep, pte;
3179 spinlock_t *ptl;
3180
3181 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3182 goto out;
3183
3184 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3185 goto out;
3093 pte = *ptep; 3186 pte = *ptep;
3094 if (!pte_present(pte)) 3187
3095 goto unlock;
3096 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3188 if ((flags & FOLL_WRITE) && !pte_write(pte))
3097 goto unlock; 3189 goto unlock;
3098 phys_addr = pte_pfn(pte);
3099 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
3100 3190
3101 *prot = pgprot_val(pte_pgprot(pte)); 3191 *prot = pgprot_val(pte_pgprot(pte));
3102 *phys = phys_addr; 3192 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3103 ret = 0;
3104 3193
3194 ret = 0;
3105unlock: 3195unlock:
3106 pte_unmap_unlock(ptep, ptl); 3196 pte_unmap_unlock(ptep, ptl);
3107out: 3197out:
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c083cf5fd6df..e4412a676c88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 422 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 423 zone->zone_pgdat->node_present_pages += onlined_pages;
424 424
425 setup_per_zone_pages_min(); 425 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone);
426 if (onlined_pages) { 427 if (onlined_pages) {
427 kswapd_run(zone_to_nid(zone)); 428 kswapd_run(zone_to_nid(zone));
428 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 429 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
832 totalram_pages -= offlined_pages; 833 totalram_pages -= offlined_pages;
833 num_physpages -= offlined_pages; 834 num_physpages -= offlined_pages;
834 835
836 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone);
838
835 vm_total_pages = nr_free_pagecache_pages(); 839 vm_total_pages = nr_free_pagecache_pages();
836 writeback_set_ratelimit(); 840 writeback_set_ratelimit();
837 841
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..e08e2c4da63a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
182 return 0; 182 return 0;
183} 183}
184 184
185/* Create a new policy */ 185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy. mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags. But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
195{
196 nodemask_t cpuset_context_nmask;
197 int ret;
198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL)
201 return 0;
202
203 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */
206 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
209 &cpuset_current_mems_allowed);
210 else
211 nodes_and(cpuset_context_nmask, *nodes,
212 cpuset_current_mems_allowed);
213 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes;
215 else
216 pol->w.cpuset_mems_allowed =
217 cpuset_current_mems_allowed;
218 }
219
220 ret = mpol_ops[pol->mode].create(pol,
221 nodes ? &cpuset_context_nmask : NULL);
222 return ret;
223}
224
225/*
226 * This function just creates a new policy, does some check and simple
227 * initialization. You must invoke mpol_set_nodemask() to set nodes.
228 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 229static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes) 230 nodemask_t *nodes)
188{ 231{
189 struct mempolicy *policy; 232 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192 233
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 234 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 235 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
210 if (((flags & MPOL_F_STATIC_NODES) || 251 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES))) 252 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL); 253 return ERR_PTR(-EINVAL);
213 nodes = NULL; /* flag local alloc */
214 } 254 }
215 } else if (nodes_empty(*nodes)) 255 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL); 256 return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
221 policy->mode = mode; 261 policy->mode = mode;
222 policy->flags = flags; 262 policy->flags = flags;
223 263
224 if (nodes) {
225 /*
226 * cpuset related setup doesn't apply to local allocation
227 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy; 264 return policy;
249} 265}
250 266
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
324/* 340/*
325 * Wrapper for mpol_rebind_policy() that just requires task 341 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy. 342 * pointer, and updates task mempolicy.
343 *
344 * Called with task's alloc_lock held.
327 */ 345 */
328 346
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 347void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
600static long do_set_mempolicy(unsigned short mode, unsigned short flags, 618static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes) 619 nodemask_t *nodes)
602{ 620{
603 struct mempolicy *new; 621 struct mempolicy *new, *old;
604 struct mm_struct *mm = current->mm; 622 struct mm_struct *mm = current->mm;
623 int ret;
605 624
606 new = mpol_new(mode, flags, nodes); 625 new = mpol_new(mode, flags, nodes);
607 if (IS_ERR(new)) 626 if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
615 */ 634 */
616 if (mm) 635 if (mm)
617 down_write(&mm->mmap_sem); 636 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy); 637 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes);
639 if (ret) {
640 task_unlock(current);
641 if (mm)
642 up_write(&mm->mmap_sem);
643 mpol_put(new);
644 return ret;
645 }
646 old = current->mempolicy;
619 current->mempolicy = new; 647 current->mempolicy = new;
620 mpol_set_task_struct_flag(); 648 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE && 649 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes)) 650 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes); 651 current->il_next = first_node(new->v.nodes);
652 task_unlock(current);
624 if (mm) 653 if (mm)
625 up_write(&mm->mmap_sem); 654 up_write(&mm->mmap_sem);
626 655
656 mpol_put(old);
627 return 0; 657 return 0;
628} 658}
629 659
630/* 660/*
631 * Return nodemask for policy for get_mempolicy() query 661 * Return nodemask for policy for get_mempolicy() query
662 *
663 * Called with task's alloc_lock held
632 */ 664 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 665static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{ 666{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 struct vm_area_struct *vma = NULL; 706 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy; 707 struct mempolicy *pol = current->mempolicy;
676 708
677 cpuset_update_task_memory_state();
678 if (flags & 709 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 710 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL; 711 return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 714 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL; 715 return -EINVAL;
685 *policy = 0; /* just so it's initialized */ 716 *policy = 0; /* just so it's initialized */
717 task_lock(current);
686 *nmask = cpuset_current_mems_allowed; 718 *nmask = cpuset_current_mems_allowed;
719 task_unlock(current);
687 return 0; 720 return 0;
688 } 721 }
689 722
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
738 } 771 }
739 772
740 err = 0; 773 err = 0;
741 if (nmask) 774 if (nmask) {
775 task_lock(current);
742 get_policy_nodemask(pol, nmask); 776 get_policy_nodemask(pol, nmask);
777 task_unlock(current);
778 }
743 779
744 out: 780 out:
745 mpol_cond_put(pol); 781 mpol_cond_put(pol);
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
767 803
768static struct page *new_node_page(struct page *page, unsigned long node, int **x) 804static struct page *new_node_page(struct page *page, unsigned long node, int **x)
769{ 805{
770 return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); 806 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
771} 807}
772 808
773/* 809/*
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
979 return err; 1015 return err;
980 } 1016 }
981 down_write(&mm->mmap_sem); 1017 down_write(&mm->mmap_sem);
1018 task_lock(current);
1019 err = mpol_set_nodemask(new, nmask);
1020 task_unlock(current);
1021 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new);
1024 return err;
1025 }
982 vma = check_range(mm, start, end, nmask, 1026 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist); 1027 flags | MPOL_MF_INVERT, &pagelist);
984 1028
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1545 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1589 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 struct zonelist *zl; 1590 struct zonelist *zl;
1547 1591
1548 cpuset_update_task_memory_state();
1549
1550 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1592 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 unsigned nid; 1593 unsigned nid;
1552 1594
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1593{ 1635{
1594 struct mempolicy *pol = current->mempolicy; 1636 struct mempolicy *pol = current->mempolicy;
1595 1637
1596 if ((gfp & __GFP_WAIT) && !in_interrupt())
1597 cpuset_update_task_memory_state();
1598 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1638 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 pol = &default_policy; 1639 pol = &default_policy;
1600 1640
@@ -1854,6 +1894,8 @@ restart:
1854 */ 1894 */
1855void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856{ 1896{
1897 int ret;
1898
1857 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 1899 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 spin_lock_init(&sp->lock); 1900 spin_lock_init(&sp->lock);
1859 1901
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1863 1905
1864 /* contextualize the tmpfs mount point mempolicy */ 1906 /* contextualize the tmpfs mount point mempolicy */
1865 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866 mpol_put(mpol); /* drop our ref on sb mpol */ 1908 if (IS_ERR(new)) {
1867 if (IS_ERR(new)) 1909 mpol_put(mpol); /* drop our ref on sb mpol */
1868 return; /* no valid nodemask intersection */ 1910 return; /* no valid nodemask intersection */
1911 }
1912
1913 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
1915 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) {
1918 mpol_put(new);
1919 return;
1920 }
1869 1921
1870 /* Create pseudo-vma that contains just the policy */ 1922 /* Create pseudo-vma that contains just the policy */
1871 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1923 memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2086 new = mpol_new(mode, mode_flags, &nodes); 2138 new = mpol_new(mode, mode_flags, &nodes);
2087 if (IS_ERR(new)) 2139 if (IS_ERR(new))
2088 err = 1; 2140 err = 1;
2089 else if (no_context) 2141 else {
2090 new->w.user_nodemask = nodes; /* save for contextualization */ 2142 int ret;
2143
2144 task_lock(current);
2145 ret = mpol_set_nodemask(new, &nodes);
2146 task_unlock(current);
2147 if (ret)
2148 err = 1;
2149 else if (no_context) {
2150 /* save for contextualization */
2151 new->w.user_nodemask = nodes;
2152 }
2153 }
2091 2154
2092out: 2155out:
2093 /* Restore string for error message */ 2156 /* Restore string for error message */
diff --git a/mm/migrate.c b/mm/migrate.c
index 068655d8f883..939888f9ddab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
802 802
803 *result = &pm->status; 803 *result = &pm->status;
804 804
805 return alloc_pages_node(pm->node, 805 return alloc_pages_exact_node(pm->node,
806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 806 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
807} 807}
808 808
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
820 struct page_to_node *pp; 820 struct page_to_node *pp;
821 LIST_HEAD(pagelist); 821 LIST_HEAD(pagelist);
822 822
823 migrate_prep();
824 down_read(&mm->mmap_sem); 823 down_read(&mm->mmap_sem);
825 824
826 /* 825 /*
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 906 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
908 if (!pm) 907 if (!pm)
909 goto out; 908 goto out;
909
910 migrate_prep();
911
910 /* 912 /*
911 * Store a chunk of page_to_node array in a page, 913 * Store a chunk of page_to_node array in a page,
912 * but keep the last one as a marker 914 * but keep the last one as a marker
diff --git a/mm/mlock.c b/mm/mlock.c
index ac130433c7d3..45eb650b9654 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -31,7 +31,6 @@ int can_do_mlock(void)
31} 31}
32EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
33 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/* 34/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing 35 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate 36 * in vmscan and, possibly, the fault path; and to support semi-accurate
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval)
261 return retval; 260 return retval;
262} 261}
263 262
264#else /* CONFIG_UNEVICTABLE_LRU */
265
266/*
267 * Just make pages present if VM_LOCKED. No-op if unlocking.
268 */
269static long __mlock_vma_pages_range(struct vm_area_struct *vma,
270 unsigned long start, unsigned long end,
271 int mlock)
272{
273 if (mlock && (vma->vm_flags & VM_LOCKED))
274 return make_pages_present(start, end);
275 return 0;
276}
277
278static inline int __mlock_posix_error_return(long retval)
279{
280 return 0;
281}
282
283#endif /* CONFIG_UNEVICTABLE_LRU */
284
285/** 263/**
286 * mlock_vma_pages_range() - mlock pages in specified vma range. 264 * mlock_vma_pages_range() - mlock pages in specified vma range.
287 * @vma - the vma containing the specfied address range 265 * @vma - the vma containing the specfied address range
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..175a67a78a99 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
61 62
62 task_lock(p); 63 task_lock(p);
63 mm = p->mm; 64 mm = p->mm;
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
65 task_unlock(p); 66 task_unlock(p);
66 return 0; 67 return 0;
67 } 68 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
68 74
69 /* 75 /*
70 * The memory size of the process is the basis for the badness. 76 * The memory size of the process is the basis for the badness.
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
148 points /= 8; 154 points /= 8;
149 155
150 /* 156 /*
151 * Adjust the score by oomkilladj. 157 * Adjust the score by oom_adj.
152 */ 158 */
153 if (p->oomkilladj) { 159 if (oom_adj) {
154 if (p->oomkilladj > 0) { 160 if (oom_adj > 0) {
155 if (!points) 161 if (!points)
156 points = 1; 162 points = 1;
157 points <<= p->oomkilladj; 163 points <<= oom_adj;
158 } else 164 } else
159 points >>= -(p->oomkilladj); 165 points >>= -(oom_adj);
160 } 166 }
161 167
162#ifdef DEBUG 168#ifdef DEBUG
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 257 *ppoints = ULONG_MAX;
252 } 258 }
253 259
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
257 points = badness(p, uptime.tv_sec); 260 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 261 if (points > *ppoints) {
259 chosen = p; 262 chosen = p;
260 *ppoints = points; 263 *ppoints = points;
261 } 264 }
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 307 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
308 p->comm);
309 task_unlock(p); 311 task_unlock(p);
310 } while_each_thread(g, p); 312 } while_each_thread(g, p);
311} 313}
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
323 return; 325 return;
324 } 326 }
325 327
326 if (!p->mm) { 328 if (!p->mm)
327 WARN_ON(1);
328 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 329 return;
330 }
331 330
332 if (verbose) 331 if (verbose)
333 printk(KERN_ERR "Killed process %d (%s)\n", 332 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p)
349 struct mm_struct *mm; 348 struct mm_struct *mm;
350 struct task_struct *g, *q; 349 struct task_struct *g, *q;
351 350
351 task_lock(p);
352 mm = p->mm; 352 mm = p->mm;
353 353 if (!mm || mm->oom_adj == OOM_DISABLE) {
354 /* WARNING: mm may not be dereferenced since we did not obtain its 354 task_unlock(p);
355 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below.
357 *
358 * Furthermore, even if mm contains a non-NULL value, p->mm may
359 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us.
361 */
362
363 if (mm == NULL)
364 return 1; 355 return 1;
365 356 }
366 /* 357 task_unlock(p);
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 358 __oom_kill_task(p, 1);
375 359
376 /* 360 /*
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
393 struct task_struct *c; 377 struct task_struct *c;
394 378
395 if (printk_ratelimit()) { 379 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj);
399 task_lock(current); 380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
383 current->comm, gfp_mask, order,
384 current->mm ? current->mm->oom_adj : OOM_DISABLE);
400 cpuset_print_task_mems_allowed(current); 385 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 386 task_unlock(current);
402 dump_stack(); 387 dump_stack();
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
409 /* 394 /*
410 * If the task is already exiting, don't alarm the sysadmin or kill 395 * If the task is already exiting, don't alarm the sysadmin or kill
411 * its children or threads, just set TIF_MEMDIE so it can die quickly 396 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
412 */ 398 */
413 if (p->flags & PF_EXITING) { 399 if (p->mm && (p->flags & PF_EXITING)) {
414 __oom_kill_task(p, 0); 400 __oom_kill_task(p, 0);
415 return 0; 401 return 0;
416 } 402 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bb553c3e955d..7b0dcea4935b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
265 * This avoids exceeding the total dirty_limit when the floating averages 265 * This avoids exceeding the total dirty_limit when the floating averages
266 * fluctuate too quickly. 266 * fluctuate too quickly.
267 */ 267 */
268static void 268static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
269clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) 269 unsigned long dirty, unsigned long *pbdi_dirty)
270{ 270{
271 long avail_dirty; 271 unsigned long avail_dirty;
272 272
273 avail_dirty = dirty - 273 avail_dirty = global_page_state(NR_FILE_DIRTY) +
274 (global_page_state(NR_FILE_DIRTY) +
275 global_page_state(NR_WRITEBACK) + 274 global_page_state(NR_WRITEBACK) +
276 global_page_state(NR_UNSTABLE_NFS) + 275 global_page_state(NR_UNSTABLE_NFS) +
277 global_page_state(NR_WRITEBACK_TEMP)); 276 global_page_state(NR_WRITEBACK_TEMP);
278 277
279 if (avail_dirty < 0) 278 if (avail_dirty < dirty)
279 avail_dirty = dirty - avail_dirty;
280 else
280 avail_dirty = 0; 281 avail_dirty = 0;
281 282
282 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + 283 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
299 * 300 *
300 * dirty -= (dirty/8) * p_{t} 301 * dirty -= (dirty/8) * p_{t}
301 */ 302 */
302static void task_dirty_limit(struct task_struct *tsk, long *pdirty) 303static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303{ 304{
304 long numerator, denominator; 305 long numerator, denominator;
305 long dirty = *pdirty; 306 unsigned long dirty = *pdirty;
306 u64 inv = dirty >> 3; 307 u64 inv = dirty >> 3;
307 308
308 task_dirties_fraction(tsk, &numerator, &denominator); 309 task_dirties_fraction(tsk, &numerator, &denominator);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0727896a88ac..a5f3c278c573 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -162,17 +162,25 @@ static unsigned long __meminitdata dma_reserve;
162 162
163#if MAX_NUMNODES > 1 163#if MAX_NUMNODES > 1
164int nr_node_ids __read_mostly = MAX_NUMNODES; 164int nr_node_ids __read_mostly = MAX_NUMNODES;
165int nr_online_nodes __read_mostly = 1;
165EXPORT_SYMBOL(nr_node_ids); 166EXPORT_SYMBOL(nr_node_ids);
167EXPORT_SYMBOL(nr_online_nodes);
166#endif 168#endif
167 169
168int page_group_by_mobility_disabled __read_mostly; 170int page_group_by_mobility_disabled __read_mostly;
169 171
170static void set_pageblock_migratetype(struct page *page, int migratetype) 172static void set_pageblock_migratetype(struct page *page, int migratetype)
171{ 173{
174
175 if (unlikely(page_group_by_mobility_disabled))
176 migratetype = MIGRATE_UNMOVABLE;
177
172 set_pageblock_flags_group(page, (unsigned long)migratetype, 178 set_pageblock_flags_group(page, (unsigned long)migratetype,
173 PB_migrate, PB_migrate_end); 179 PB_migrate, PB_migrate_end);
174} 180}
175 181
182bool oom_killer_disabled __read_mostly;
183
176#ifdef CONFIG_DEBUG_VM 184#ifdef CONFIG_DEBUG_VM
177static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 185static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
178{ 186{
@@ -295,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order)
295 } 303 }
296} 304}
297 305
298#ifdef CONFIG_HUGETLBFS
299void prep_compound_gigantic_page(struct page *page, unsigned long order)
300{
301 int i;
302 int nr_pages = 1 << order;
303 struct page *p = page + 1;
304
305 set_compound_page_dtor(page, free_compound_page);
306 set_compound_order(page, order);
307 __SetPageHead(page);
308 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
309 __SetPageTail(p);
310 p->first_page = page;
311 }
312}
313#endif
314
315static int destroy_compound_page(struct page *page, unsigned long order) 306static int destroy_compound_page(struct page *page, unsigned long order)
316{ 307{
317 int i; 308 int i;
@@ -418,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
418 return 0; 409 return 0;
419 410
420 if (PageBuddy(buddy) && page_order(buddy) == order) { 411 if (PageBuddy(buddy) && page_order(buddy) == order) {
421 BUG_ON(page_count(buddy) != 0); 412 VM_BUG_ON(page_count(buddy) != 0);
422 return 1; 413 return 1;
423 } 414 }
424 return 0; 415 return 0;
@@ -449,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
449 */ 440 */
450 441
451static inline void __free_one_page(struct page *page, 442static inline void __free_one_page(struct page *page,
452 struct zone *zone, unsigned int order) 443 struct zone *zone, unsigned int order,
444 int migratetype)
453{ 445{
454 unsigned long page_idx; 446 unsigned long page_idx;
455 int order_size = 1 << order;
456 int migratetype = get_pageblock_migratetype(page);
457 447
458 if (unlikely(PageCompound(page))) 448 if (unlikely(PageCompound(page)))
459 if (unlikely(destroy_compound_page(page, order))) 449 if (unlikely(destroy_compound_page(page, order)))
460 return; 450 return;
461 451
452 VM_BUG_ON(migratetype == -1);
453
462 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 454 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
463 455
464 VM_BUG_ON(page_idx & (order_size - 1)); 456 VM_BUG_ON(page_idx & ((1 << order) - 1));
465 VM_BUG_ON(bad_range(zone, page)); 457 VM_BUG_ON(bad_range(zone, page));
466 458
467 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
468 while (order < MAX_ORDER-1) { 459 while (order < MAX_ORDER-1) {
469 unsigned long combined_idx; 460 unsigned long combined_idx;
470 struct page *buddy; 461 struct page *buddy;
@@ -488,12 +479,27 @@ static inline void __free_one_page(struct page *page,
488 zone->free_area[order].nr_free++; 479 zone->free_area[order].nr_free++;
489} 480}
490 481
482#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
483/*
484 * free_page_mlock() -- clean up attempts to free and mlocked() page.
485 * Page should not be on lru, so no need to fix that up.
486 * free_pages_check() will verify...
487 */
488static inline void free_page_mlock(struct page *page)
489{
490 __ClearPageMlocked(page);
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
491static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
492{ 499{
493 free_page_mlock(page);
494 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
495 (page->mapping != NULL) | 501 (page->mapping != NULL) |
496 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
497 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
498 bad_page(page); 504 bad_page(page);
499 return 1; 505 return 1;
@@ -520,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
520 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
521 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
522 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
523 while (count--) { 531 while (count--) {
524 struct page *page; 532 struct page *page;
525 533
@@ -527,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
527 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
528 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
529 list_del(&page->lru); 537 list_del(&page->lru);
530 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
531 } 539 }
532 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
533} 541}
534 542
535static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
536{ 545{
537 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
538 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
539 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
540 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
541 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
542} 553}
543 554
@@ -546,6 +557,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
546 unsigned long flags; 557 unsigned long flags;
547 int i; 558 int i;
548 int bad = 0; 559 int bad = 0;
560 int clearMlocked = PageMlocked(page);
549 561
550 kmemcheck_free_shadow(page, order); 562 kmemcheck_free_shadow(page, order);
551 563
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
563 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
564 576
565 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(clearMlocked))
579 free_page_mlock(page);
566 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
567 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
568 local_irq_restore(flags); 583 local_irq_restore(flags);
569} 584}
570 585
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
635{ 650{
636 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 652 (page->mapping != NULL) |
638 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
640 bad_page(page); 655 bad_page(page);
641 return 1; 656 return 1;
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
660 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
661 * the smallest available page from the freelists 676 * the smallest available page from the freelists
662 */ 677 */
663static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
664 int migratetype) 680 int migratetype)
665{ 681{
666 unsigned int current_order; 682 unsigned int current_order;
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
678 list_del(&page->lru); 694 list_del(&page->lru);
679 rmv_page_order(page); 695 rmv_page_order(page);
680 area->nr_free--; 696 area->nr_free--;
681 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
682 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
683 return page; 698 return page;
684 } 699 }
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
769} 784}
770 785
771/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
772static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
773 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
774{ 789{
775 struct free_area * area; 790 struct free_area * area;
776 int current_order; 791 int current_order;
@@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
818 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
819 list_del(&page->lru); 834 list_del(&page->lru);
820 rmv_page_order(page); 835 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823 836
824 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
825 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
830 } 843 }
831 } 844 }
832 845
833 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835} 847}
836 848
837/* 849/*
@@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
843{ 855{
844 struct page *page; 856 struct page *page;
845 857
858retry_reserve:
846 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
847 860
848 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
849 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
850 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
851 return page; 875 return page;
852} 876}
853 877
@@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
881 set_page_private(page, migratetype); 905 set_page_private(page, migratetype);
882 list = &page->lru; 906 list = &page->lru;
883 } 907 }
908 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
884 spin_unlock(&zone->lock); 909 spin_unlock(&zone->lock);
885 return i; 910 return i;
886} 911}
@@ -996,6 +1021,7 @@ static void free_hot_cold_page(struct page *page, int cold)
996 struct zone *zone = page_zone(page); 1021 struct zone *zone = page_zone(page);
997 struct per_cpu_pages *pcp; 1022 struct per_cpu_pages *pcp;
998 unsigned long flags; 1023 unsigned long flags;
1024 int clearMlocked = PageMlocked(page);
999 1025
1000 kmemcheck_free_shadow(page, 0); 1026 kmemcheck_free_shadow(page, 0);
1001 1027
@@ -1012,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1012 kernel_map_pages(page, 1, 0); 1038 kernel_map_pages(page, 1, 0);
1013 1039
1014 pcp = &zone_pcp(zone, get_cpu())->pcp; 1040 pcp = &zone_pcp(zone, get_cpu())->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page));
1015 local_irq_save(flags); 1042 local_irq_save(flags);
1043 if (unlikely(clearMlocked))
1044 free_page_mlock(page);
1016 __count_vm_event(PGFREE); 1045 __count_vm_event(PGFREE);
1046
1017 if (cold) 1047 if (cold)
1018 list_add_tail(&page->lru, &pcp->list); 1048 list_add_tail(&page->lru, &pcp->list);
1019 else 1049 else
1020 list_add(&page->lru, &pcp->list); 1050 list_add(&page->lru, &pcp->list);
1021 set_page_private(page, get_pageblock_migratetype(page));
1022 pcp->count++; 1051 pcp->count++;
1023 if (pcp->count >= pcp->high) { 1052 if (pcp->count >= pcp->high) {
1024 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1071,14 +1100,15 @@ void split_page(struct page *page, unsigned int order)
1071 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1100 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1072 * or two. 1101 * or two.
1073 */ 1102 */
1074static struct page *buffered_rmqueue(struct zone *preferred_zone, 1103static inline
1075 struct zone *zone, int order, gfp_t gfp_flags) 1104struct page *buffered_rmqueue(struct zone *preferred_zone,
1105 struct zone *zone, int order, gfp_t gfp_flags,
1106 int migratetype)
1076{ 1107{
1077 unsigned long flags; 1108 unsigned long flags;
1078 struct page *page; 1109 struct page *page;
1079 int cold = !!(gfp_flags & __GFP_COLD); 1110 int cold = !!(gfp_flags & __GFP_COLD);
1080 int cpu; 1111 int cpu;
1081 int migratetype = allocflags_to_migratetype(gfp_flags);
1082 1112
1083again: 1113again:
1084 cpu = get_cpu(); 1114 cpu = get_cpu();
@@ -1115,8 +1145,22 @@ again:
1115 list_del(&page->lru); 1145 list_del(&page->lru);
1116 pcp->count--; 1146 pcp->count--;
1117 } else { 1147 } else {
1148 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1149 /*
1150 * __GFP_NOFAIL is not to be used in new code.
1151 *
1152 * All __GFP_NOFAIL callers should be fixed so that they
1153 * properly detect and handle allocation failures.
1154 *
1155 * We most definitely don't want callers attempting to
1156 * allocate greater than single-page units with
1157 * __GFP_NOFAIL.
1158 */
1159 WARN_ON_ONCE(order > 0);
1160 }
1118 spin_lock_irqsave(&zone->lock, flags); 1161 spin_lock_irqsave(&zone->lock, flags);
1119 page = __rmqueue(zone, order, migratetype); 1162 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1120 spin_unlock(&zone->lock); 1164 spin_unlock(&zone->lock);
1121 if (!page) 1165 if (!page)
1122 goto failed; 1166 goto failed;
@@ -1138,10 +1182,15 @@ failed:
1138 return NULL; 1182 return NULL;
1139} 1183}
1140 1184
1141#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1185/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1142#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1186#define ALLOC_WMARK_MIN WMARK_MIN
1143#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1187#define ALLOC_WMARK_LOW WMARK_LOW
1144#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1188#define ALLOC_WMARK_HIGH WMARK_HIGH
1189#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1190
1191/* Mask to get the watermark bits */
1192#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1193
1145#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1194#define ALLOC_HARDER 0x10 /* try to alloc harder */
1146#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1195#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1147#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1196#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -1399,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1399 */ 1448 */
1400static struct page * 1449static struct page *
1401get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1450get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1402 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1451 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1452 struct zone *preferred_zone, int migratetype)
1403{ 1453{
1404 struct zoneref *z; 1454 struct zoneref *z;
1405 struct page *page = NULL; 1455 struct page *page = NULL;
1406 int classzone_idx; 1456 int classzone_idx;
1407 struct zone *zone, *preferred_zone; 1457 struct zone *zone;
1408 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1458 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1409 int zlc_active = 0; /* set if using zonelist_cache */ 1459 int zlc_active = 0; /* set if using zonelist_cache */
1410 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1460 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1411 1461
1412 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1413 &preferred_zone);
1414 if (!preferred_zone)
1415 return NULL;
1416
1417 classzone_idx = zone_idx(preferred_zone); 1462 classzone_idx = zone_idx(preferred_zone);
1418
1419zonelist_scan: 1463zonelist_scan:
1420 /* 1464 /*
1421 * Scan zonelist, looking for a zone with enough free. 1465 * Scan zonelist, looking for a zone with enough free.
@@ -1430,31 +1474,49 @@ zonelist_scan:
1430 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1474 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1431 goto try_next_zone; 1475 goto try_next_zone;
1432 1476
1477 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1433 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1478 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1434 unsigned long mark; 1479 unsigned long mark;
1435 if (alloc_flags & ALLOC_WMARK_MIN) 1480 int ret;
1436 mark = zone->pages_min; 1481
1437 else if (alloc_flags & ALLOC_WMARK_LOW) 1482 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1438 mark = zone->pages_low; 1483 if (zone_watermark_ok(zone, order, mark,
1439 else 1484 classzone_idx, alloc_flags))
1440 mark = zone->pages_high; 1485 goto try_this_zone;
1441 if (!zone_watermark_ok(zone, order, mark, 1486
1442 classzone_idx, alloc_flags)) { 1487 if (zone_reclaim_mode == 0)
1443 if (!zone_reclaim_mode || 1488 goto this_zone_full;
1444 !zone_reclaim(zone, gfp_mask, order)) 1489
1490 ret = zone_reclaim(zone, gfp_mask, order);
1491 switch (ret) {
1492 case ZONE_RECLAIM_NOSCAN:
1493 /* did not scan */
1494 goto try_next_zone;
1495 case ZONE_RECLAIM_FULL:
1496 /* scanned but unreclaimable */
1497 goto this_zone_full;
1498 default:
1499 /* did we reclaim enough */
1500 if (!zone_watermark_ok(zone, order, mark,
1501 classzone_idx, alloc_flags))
1445 goto this_zone_full; 1502 goto this_zone_full;
1446 } 1503 }
1447 } 1504 }
1448 1505
1449 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1506try_this_zone:
1507 page = buffered_rmqueue(preferred_zone, zone, order,
1508 gfp_mask, migratetype);
1450 if (page) 1509 if (page)
1451 break; 1510 break;
1452this_zone_full: 1511this_zone_full:
1453 if (NUMA_BUILD) 1512 if (NUMA_BUILD)
1454 zlc_mark_zone_full(zonelist, z); 1513 zlc_mark_zone_full(zonelist, z);
1455try_next_zone: 1514try_next_zone:
1456 if (NUMA_BUILD && !did_zlc_setup) { 1515 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1457 /* we do zlc_setup after the first zone is tried */ 1516 /*
1517 * we do zlc_setup after the first zone is tried but only
1518 * if there are multiple nodes make it worthwhile
1519 */
1458 allowednodes = zlc_setup(zonelist, alloc_flags); 1520 allowednodes = zlc_setup(zonelist, alloc_flags);
1459 zlc_active = 1; 1521 zlc_active = 1;
1460 did_zlc_setup = 1; 1522 did_zlc_setup = 1;
@@ -1469,47 +1531,217 @@ try_next_zone:
1469 return page; 1531 return page;
1470} 1532}
1471 1533
1534static inline int
1535should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1536 unsigned long pages_reclaimed)
1537{
1538 /* Do not loop if specifically requested */
1539 if (gfp_mask & __GFP_NORETRY)
1540 return 0;
1541
1542 /*
1543 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1544 * means __GFP_NOFAIL, but that may not be true in other
1545 * implementations.
1546 */
1547 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1548 return 1;
1549
1550 /*
1551 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1552 * specified, then we retry until we no longer reclaim any pages
1553 * (above), or we've reclaimed an order of pages at least as
1554 * large as the allocation's order. In both cases, if the
1555 * allocation still fails, we stop retrying.
1556 */
1557 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1558 return 1;
1559
1560 /*
1561 * Don't let big-order allocations loop unless the caller
1562 * explicitly requests that.
1563 */
1564 if (gfp_mask & __GFP_NOFAIL)
1565 return 1;
1566
1567 return 0;
1568}
1569
1570static inline struct page *
1571__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, struct zone *preferred_zone,
1574 int migratetype)
1575{
1576 struct page *page;
1577
1578 /* Acquire the OOM killer lock for the zones in zonelist */
1579 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1580 schedule_timeout_uninterruptible(1);
1581 return NULL;
1582 }
1583
1584 /*
1585 * Go through the zonelist yet one more time, keep very high watermark
1586 * here, this is only to catch a parallel oom killing, we must fail if
1587 * we're still under heavy pressure.
1588 */
1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1590 order, zonelist, high_zoneidx,
1591 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1592 preferred_zone, migratetype);
1593 if (page)
1594 goto out;
1595
1596 /* The OOM killer will not help higher order allocs */
1597 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1598 goto out;
1599
1600 /* Exhausted what can be done so it's blamo time */
1601 out_of_memory(zonelist, gfp_mask, order);
1602
1603out:
1604 clear_zonelist_oom(zonelist, gfp_mask);
1605 return page;
1606}
1607
1608/* The really slow allocator path where we enter direct reclaim */
1609static inline struct page *
1610__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1611 struct zonelist *zonelist, enum zone_type high_zoneidx,
1612 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1613 int migratetype, unsigned long *did_some_progress)
1614{
1615 struct page *page = NULL;
1616 struct reclaim_state reclaim_state;
1617 struct task_struct *p = current;
1618
1619 cond_resched();
1620
1621 /* We now go into synchronous reclaim */
1622 cpuset_memory_pressure_bump();
1623
1624 /*
1625 * The task's cpuset might have expanded its set of allowable nodes
1626 */
1627 p->flags |= PF_MEMALLOC;
1628 lockdep_set_current_reclaim_state(gfp_mask);
1629 reclaim_state.reclaimed_slab = 0;
1630 p->reclaim_state = &reclaim_state;
1631
1632 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1633
1634 p->reclaim_state = NULL;
1635 lockdep_clear_current_reclaim_state();
1636 p->flags &= ~PF_MEMALLOC;
1637
1638 cond_resched();
1639
1640 if (order != 0)
1641 drain_all_pages();
1642
1643 if (likely(*did_some_progress))
1644 page = get_page_from_freelist(gfp_mask, nodemask, order,
1645 zonelist, high_zoneidx,
1646 alloc_flags, preferred_zone,
1647 migratetype);
1648 return page;
1649}
1650
1472/* 1651/*
1473 * This is the 'heart' of the zoned buddy allocator. 1652 * This is called in the allocator slow-path if the allocation request is of
1653 * sufficient urgency to ignore watermarks and take other desperate measures
1474 */ 1654 */
1475struct page * 1655static inline struct page *
1476__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1656__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1477 struct zonelist *zonelist, nodemask_t *nodemask) 1657 struct zonelist *zonelist, enum zone_type high_zoneidx,
1658 nodemask_t *nodemask, struct zone *preferred_zone,
1659 int migratetype)
1660{
1661 struct page *page;
1662
1663 do {
1664 page = get_page_from_freelist(gfp_mask, nodemask, order,
1665 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1666 preferred_zone, migratetype);
1667
1668 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671
1672 return page;
1673}
1674
1675static inline
1676void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1677 enum zone_type high_zoneidx)
1478{ 1678{
1479 const gfp_t wait = gfp_mask & __GFP_WAIT;
1480 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1481 struct zoneref *z; 1679 struct zoneref *z;
1482 struct zone *zone; 1680 struct zone *zone;
1483 struct page *page;
1484 struct reclaim_state reclaim_state;
1485 struct task_struct *p = current;
1486 int do_retry;
1487 int alloc_flags;
1488 unsigned long did_some_progress;
1489 unsigned long pages_reclaimed = 0;
1490 1681
1491 lockdep_trace_alloc(gfp_mask); 1682 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1683 wakeup_kswapd(zone, order);
1684}
1492 1685
1493 might_sleep_if(wait); 1686static inline int
1687gfp_to_alloc_flags(gfp_t gfp_mask)
1688{
1689 struct task_struct *p = current;
1690 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1691 const gfp_t wait = gfp_mask & __GFP_WAIT;
1494 1692
1495 if (should_fail_alloc_page(gfp_mask, order)) 1693 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1496 return NULL; 1694 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1497 1695
1498restart: 1696 /*
1499 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1697 * The caller may dip into page reserves a bit more if the caller
1698 * cannot run direct reclaim, or if the caller has realtime scheduling
1699 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1700 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1701 */
1702 alloc_flags |= (gfp_mask & __GFP_HIGH);
1500 1703
1501 if (unlikely(!z->zone)) { 1704 if (!wait) {
1705 alloc_flags |= ALLOC_HARDER;
1502 /* 1706 /*
1503 * Happens if we have an empty zonelist as a result of 1707 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1504 * GFP_THISNODE being used on a memoryless node 1708 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1505 */ 1709 */
1506 return NULL; 1710 alloc_flags &= ~ALLOC_CPUSET;
1711 } else if (unlikely(rt_task(p)))
1712 alloc_flags |= ALLOC_HARDER;
1713
1714 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1715 if (!in_interrupt() &&
1716 ((p->flags & PF_MEMALLOC) ||
1717 unlikely(test_thread_flag(TIF_MEMDIE))))
1718 alloc_flags |= ALLOC_NO_WATERMARKS;
1507 } 1719 }
1508 1720
1509 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1721 return alloc_flags;
1510 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1722}
1511 if (page) 1723
1512 goto got_pg; 1724static inline struct page *
1725__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1726 struct zonelist *zonelist, enum zone_type high_zoneidx,
1727 nodemask_t *nodemask, struct zone *preferred_zone,
1728 int migratetype)
1729{
1730 const gfp_t wait = gfp_mask & __GFP_WAIT;
1731 struct page *page = NULL;
1732 int alloc_flags;
1733 unsigned long pages_reclaimed = 0;
1734 unsigned long did_some_progress;
1735 struct task_struct *p = current;
1736
1737 /*
1738 * In the slowpath, we sanity check order to avoid ever trying to
1739 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1740 * be using allocators in order of preference for an area that is
1741 * too large.
1742 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER))
1744 return NULL;
1513 1745
1514 /* 1746 /*
1515 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1522,154 +1754,83 @@ restart:
1522 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1754 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1523 goto nopage; 1755 goto nopage;
1524 1756
1525 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1757 wake_all_kswapd(order, zonelist, high_zoneidx);
1526 wakeup_kswapd(zone, order);
1527 1758
1528 /* 1759 /*
1529 * OK, we're below the kswapd watermark and have kicked background 1760 * OK, we're below the kswapd watermark and have kicked background
1530 * reclaim. Now things get more complex, so set up alloc_flags according 1761 * reclaim. Now things get more complex, so set up alloc_flags according
1531 * to how we want to proceed. 1762 * to how we want to proceed.
1532 *
1533 * The caller may dip into page reserves a bit more if the caller
1534 * cannot run direct reclaim, or if the caller has realtime scheduling
1535 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1536 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1537 */ 1763 */
1538 alloc_flags = ALLOC_WMARK_MIN; 1764 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1539 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1540 alloc_flags |= ALLOC_HARDER;
1541 if (gfp_mask & __GFP_HIGH)
1542 alloc_flags |= ALLOC_HIGH;
1543 if (wait)
1544 alloc_flags |= ALLOC_CPUSET;
1545 1765
1546 /* 1766restart:
1547 * Go through the zonelist again. Let __GFP_HIGH and allocations 1767 /* This is the last chance, in general, before the goto nopage. */
1548 * coming from realtime tasks go deeper into reserves.
1549 *
1550 * This is the last chance, in general, before the goto nopage.
1551 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1552 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1553 */
1554 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1768 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1555 high_zoneidx, alloc_flags); 1769 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1770 preferred_zone, migratetype);
1556 if (page) 1771 if (page)
1557 goto got_pg; 1772 goto got_pg;
1558 1773
1559 /* This allocation should allow future memory freeing. */
1560
1561rebalance: 1774rebalance:
1562 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1775 /* Allocate without watermarks if the context allows */
1563 && !in_interrupt()) { 1776 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1564 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1777 page = __alloc_pages_high_priority(gfp_mask, order,
1565nofail_alloc: 1778 zonelist, high_zoneidx, nodemask,
1566 /* go through the zonelist yet again, ignoring mins */ 1779 preferred_zone, migratetype);
1567 page = get_page_from_freelist(gfp_mask, nodemask, order, 1780 if (page)
1568 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1781 goto got_pg;
1569 if (page)
1570 goto got_pg;
1571 if (gfp_mask & __GFP_NOFAIL) {
1572 congestion_wait(WRITE, HZ/50);
1573 goto nofail_alloc;
1574 }
1575 }
1576 goto nopage;
1577 } 1782 }
1578 1783
1579 /* Atomic allocations - we can't balance anything */ 1784 /* Atomic allocations - we can't balance anything */
1580 if (!wait) 1785 if (!wait)
1581 goto nopage; 1786 goto nopage;
1582 1787
1583 cond_resched(); 1788 /* Avoid recursion of direct reclaim */
1789 if (p->flags & PF_MEMALLOC)
1790 goto nopage;
1791
1792 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx,
1795 nodemask,
1796 alloc_flags, preferred_zone,
1797 migratetype, &did_some_progress);
1798 if (page)
1799 goto got_pg;
1584 1800
1585 /* We now go into synchronous reclaim */
1586 cpuset_memory_pressure_bump();
1587 /* 1801 /*
1588 * The task's cpuset might have expanded its set of allowable nodes 1802 * If we failed to make any progress reclaiming, then we are
1803 * running out of options and have to consider going OOM
1589 */ 1804 */
1590 cpuset_update_task_memory_state(); 1805 if (!did_some_progress) {
1591 p->flags |= PF_MEMALLOC; 1806 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1592 1807 if (oom_killer_disabled)
1593 lockdep_set_current_reclaim_state(gfp_mask); 1808 goto nopage;
1594 reclaim_state.reclaimed_slab = 0; 1809 page = __alloc_pages_may_oom(gfp_mask, order,
1595 p->reclaim_state = &reclaim_state; 1810 zonelist, high_zoneidx,
1596 1811 nodemask, preferred_zone,
1597 did_some_progress = try_to_free_pages(zonelist, order, 1812 migratetype);
1598 gfp_mask, nodemask); 1813 if (page)
1599 1814 goto got_pg;
1600 p->reclaim_state = NULL;
1601 lockdep_clear_current_reclaim_state();
1602 p->flags &= ~PF_MEMALLOC;
1603
1604 cond_resched();
1605 1815
1606 if (order != 0) 1816 /*
1607 drain_all_pages(); 1817 * The OOM killer does not trigger for high-order
1818 * ~__GFP_NOFAIL allocations so if no progress is being
1819 * made, there are no other options and retrying is
1820 * unlikely to help.
1821 */
1822 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1823 !(gfp_mask & __GFP_NOFAIL))
1824 goto nopage;
1608 1825
1609 if (likely(did_some_progress)) {
1610 page = get_page_from_freelist(gfp_mask, nodemask, order,
1611 zonelist, high_zoneidx, alloc_flags);
1612 if (page)
1613 goto got_pg;
1614 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1615 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1616 schedule_timeout_uninterruptible(1);
1617 goto restart; 1826 goto restart;
1618 } 1827 }
1619
1620 /*
1621 * Go through the zonelist yet one more time, keep
1622 * very high watermark here, this is only to catch
1623 * a parallel oom killing, we must fail if we're still
1624 * under heavy pressure.
1625 */
1626 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1627 order, zonelist, high_zoneidx,
1628 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1629 if (page) {
1630 clear_zonelist_oom(zonelist, gfp_mask);
1631 goto got_pg;
1632 }
1633
1634 /* The OOM killer will not help higher order allocs so fail */
1635 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1636 clear_zonelist_oom(zonelist, gfp_mask);
1637 goto nopage;
1638 }
1639
1640 out_of_memory(zonelist, gfp_mask, order);
1641 clear_zonelist_oom(zonelist, gfp_mask);
1642 goto restart;
1643 } 1828 }
1644 1829
1645 /* 1830 /* Check if we should retry the allocation */
1646 * Don't let big-order allocations loop unless the caller explicitly
1647 * requests that. Wait for some write requests to complete then retry.
1648 *
1649 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1650 * means __GFP_NOFAIL, but that may not be true in other
1651 * implementations.
1652 *
1653 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1654 * specified, then we retry until we no longer reclaim any pages
1655 * (above), or we've reclaimed an order of pages at least as
1656 * large as the allocation's order. In both cases, if the
1657 * allocation still fails, we stop retrying.
1658 */
1659 pages_reclaimed += did_some_progress; 1831 pages_reclaimed += did_some_progress;
1660 do_retry = 0; 1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1661 if (!(gfp_mask & __GFP_NORETRY)) { 1833 /* Wait for some write requests to complete then retry */
1662 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1663 do_retry = 1;
1664 } else {
1665 if (gfp_mask & __GFP_REPEAT &&
1666 pages_reclaimed < (1 << order))
1667 do_retry = 1;
1668 }
1669 if (gfp_mask & __GFP_NOFAIL)
1670 do_retry = 1;
1671 }
1672 if (do_retry) {
1673 congestion_wait(WRITE, HZ/50); 1834 congestion_wait(WRITE, HZ/50);
1674 goto rebalance; 1835 goto rebalance;
1675 } 1836 }
@@ -1687,8 +1848,53 @@ got_pg:
1687 if (kmemcheck_enabled) 1848 if (kmemcheck_enabled)
1688 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 1849 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1689 return page; 1850 return page;
1851
1852}
1853
1854/*
1855 * This is the 'heart' of the zoned buddy allocator.
1856 */
1857struct page *
1858__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1859 struct zonelist *zonelist, nodemask_t *nodemask)
1860{
1861 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1862 struct zone *preferred_zone;
1863 struct page *page;
1864 int migratetype = allocflags_to_migratetype(gfp_mask);
1865
1866 lockdep_trace_alloc(gfp_mask);
1867
1868 might_sleep_if(gfp_mask & __GFP_WAIT);
1869
1870 if (should_fail_alloc_page(gfp_mask, order))
1871 return NULL;
1872
1873 /*
1874 * Check the zones suitable for the gfp_mask contain at least one
1875 * valid zone. It's possible to have an empty zonelist as a result
1876 * of GFP_THISNODE and a memoryless node
1877 */
1878 if (unlikely(!zonelist->_zonerefs->zone))
1879 return NULL;
1880
1881 /* The preferred zone is used for statistics later */
1882 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1883 if (!preferred_zone)
1884 return NULL;
1885
1886 /* First allocation attempt */
1887 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1888 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1889 preferred_zone, migratetype);
1890 if (unlikely(!page))
1891 page = __alloc_pages_slowpath(gfp_mask, order,
1892 zonelist, high_zoneidx, nodemask,
1893 preferred_zone, migratetype);
1894
1895 return page;
1690} 1896}
1691EXPORT_SYMBOL(__alloc_pages_internal); 1897EXPORT_SYMBOL(__alloc_pages_nodemask);
1692 1898
1693/* 1899/*
1694 * Common helper functions. 1900 * Common helper functions.
@@ -1817,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset)
1817 2023
1818 for_each_zone_zonelist(zone, z, zonelist, offset) { 2024 for_each_zone_zonelist(zone, z, zonelist, offset) {
1819 unsigned long size = zone->present_pages; 2025 unsigned long size = zone->present_pages;
1820 unsigned long high = zone->pages_high; 2026 unsigned long high = high_wmark_pages(zone);
1821 if (size > high) 2027 if (size > high)
1822 sum += size - high; 2028 sum += size - high;
1823 } 2029 }
@@ -1909,19 +2115,14 @@ void show_free_areas(void)
1909 2115
1910 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2116 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1911 " inactive_file:%lu" 2117 " inactive_file:%lu"
1912//TODO: check/adjust line lengths
1913#ifdef CONFIG_UNEVICTABLE_LRU
1914 " unevictable:%lu" 2118 " unevictable:%lu"
1915#endif
1916 " dirty:%lu writeback:%lu unstable:%lu\n" 2119 " dirty:%lu writeback:%lu unstable:%lu\n"
1917 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2120 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1918 global_page_state(NR_ACTIVE_ANON), 2121 global_page_state(NR_ACTIVE_ANON),
1919 global_page_state(NR_ACTIVE_FILE), 2122 global_page_state(NR_ACTIVE_FILE),
1920 global_page_state(NR_INACTIVE_ANON), 2123 global_page_state(NR_INACTIVE_ANON),
1921 global_page_state(NR_INACTIVE_FILE), 2124 global_page_state(NR_INACTIVE_FILE),
1922#ifdef CONFIG_UNEVICTABLE_LRU
1923 global_page_state(NR_UNEVICTABLE), 2125 global_page_state(NR_UNEVICTABLE),
1924#endif
1925 global_page_state(NR_FILE_DIRTY), 2126 global_page_state(NR_FILE_DIRTY),
1926 global_page_state(NR_WRITEBACK), 2127 global_page_state(NR_WRITEBACK),
1927 global_page_state(NR_UNSTABLE_NFS), 2128 global_page_state(NR_UNSTABLE_NFS),
@@ -1945,25 +2146,21 @@ void show_free_areas(void)
1945 " inactive_anon:%lukB" 2146 " inactive_anon:%lukB"
1946 " active_file:%lukB" 2147 " active_file:%lukB"
1947 " inactive_file:%lukB" 2148 " inactive_file:%lukB"
1948#ifdef CONFIG_UNEVICTABLE_LRU
1949 " unevictable:%lukB" 2149 " unevictable:%lukB"
1950#endif
1951 " present:%lukB" 2150 " present:%lukB"
1952 " pages_scanned:%lu" 2151 " pages_scanned:%lu"
1953 " all_unreclaimable? %s" 2152 " all_unreclaimable? %s"
1954 "\n", 2153 "\n",
1955 zone->name, 2154 zone->name,
1956 K(zone_page_state(zone, NR_FREE_PAGES)), 2155 K(zone_page_state(zone, NR_FREE_PAGES)),
1957 K(zone->pages_min), 2156 K(min_wmark_pages(zone)),
1958 K(zone->pages_low), 2157 K(low_wmark_pages(zone)),
1959 K(zone->pages_high), 2158 K(high_wmark_pages(zone)),
1960 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2159 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1961 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2160 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1962 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2161 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1963 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2162 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1964#ifdef CONFIG_UNEVICTABLE_LRU
1965 K(zone_page_state(zone, NR_UNEVICTABLE)), 2163 K(zone_page_state(zone, NR_UNEVICTABLE)),
1966#endif
1967 K(zone->present_pages), 2164 K(zone->present_pages),
1968 zone->pages_scanned, 2165 zone->pages_scanned,
1969 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2166 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2121,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2121} 2318}
2122 2319
2123 2320
2124#define MAX_NODE_LOAD (num_online_nodes()) 2321#define MAX_NODE_LOAD (nr_online_nodes)
2125static int node_load[MAX_NUMNODES]; 2322static int node_load[MAX_NUMNODES];
2126 2323
2127/** 2324/**
@@ -2330,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat)
2330 2527
2331 /* NUMA-aware ordering of nodes */ 2528 /* NUMA-aware ordering of nodes */
2332 local_node = pgdat->node_id; 2529 local_node = pgdat->node_id;
2333 load = num_online_nodes(); 2530 load = nr_online_nodes;
2334 prev_node = local_node; 2531 prev_node = local_node;
2335 nodes_clear(used_mask); 2532 nodes_clear(used_mask);
2336 2533
@@ -2481,7 +2678,7 @@ void build_all_zonelists(void)
2481 2678
2482 printk("Built %i zonelists in %s order, mobility grouping %s. " 2679 printk("Built %i zonelists in %s order, mobility grouping %s. "
2483 "Total pages: %ld\n", 2680 "Total pages: %ld\n",
2484 num_online_nodes(), 2681 nr_online_nodes,
2485 zonelist_order_name[current_zonelist_order], 2682 zonelist_order_name[current_zonelist_order],
2486 page_group_by_mobility_disabled ? "off" : "on", 2683 page_group_by_mobility_disabled ? "off" : "on",
2487 vm_total_pages); 2684 vm_total_pages);
@@ -2560,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2560 2757
2561/* 2758/*
2562 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2759 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2563 * of blocks reserved is based on zone->pages_min. The memory within the 2760 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2564 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2761 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2565 * higher will lead to a bigger reserve which will get freed as contiguous 2762 * higher will lead to a bigger reserve which will get freed as contiguous
2566 * blocks as reclaim kicks in 2763 * blocks as reclaim kicks in
2567 */ 2764 */
@@ -2574,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2574 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2771 /* Get the start pfn, end pfn and the number of blocks to reserve */
2575 start_pfn = zone->zone_start_pfn; 2772 start_pfn = zone->zone_start_pfn;
2576 end_pfn = start_pfn + zone->spanned_pages; 2773 end_pfn = start_pfn + zone->spanned_pages;
2577 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2774 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2578 pageblock_order; 2775 pageblock_order;
2579 2776
2580 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2777 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -3506,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3506 zone_pcp_init(zone); 3703 zone_pcp_init(zone);
3507 for_each_lru(l) { 3704 for_each_lru(l) {
3508 INIT_LIST_HEAD(&zone->lru[l].list); 3705 INIT_LIST_HEAD(&zone->lru[l].list);
3509 zone->lru[l].nr_scan = 0; 3706 zone->lru[l].nr_saved_scan = 0;
3510 } 3707 }
3511 zone->reclaim_stat.recent_rotated[0] = 0; 3708 zone->reclaim_stat.recent_rotated[0] = 0;
3512 zone->reclaim_stat.recent_rotated[1] = 0; 3709 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4043,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4043 early_node_map[i].start_pfn, 4240 early_node_map[i].start_pfn,
4044 early_node_map[i].end_pfn); 4241 early_node_map[i].end_pfn);
4045 4242
4243 /*
4244 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4245 * that node_mask, clear it at first
4246 */
4247 nodes_clear(node_states[N_HIGH_MEMORY]);
4046 /* Initialise every node */ 4248 /* Initialise every node */
4047 mminit_verify_pageflags_layout(); 4249 mminit_verify_pageflags_layout();
4048 setup_nr_node_ids(); 4250 setup_nr_node_ids();
@@ -4177,8 +4379,8 @@ static void calculate_totalreserve_pages(void)
4177 max = zone->lowmem_reserve[j]; 4379 max = zone->lowmem_reserve[j];
4178 } 4380 }
4179 4381
4180 /* we treat pages_high as reserved pages. */ 4382 /* we treat the high watermark as reserved pages. */
4181 max += zone->pages_high; 4383 max += high_wmark_pages(zone);
4182 4384
4183 if (max > zone->present_pages) 4385 if (max > zone->present_pages)
4184 max = zone->present_pages; 4386 max = zone->present_pages;
@@ -4228,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void)
4228} 4430}
4229 4431
4230/** 4432/**
4231 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4433 * setup_per_zone_wmarks - called when min_free_kbytes changes
4434 * or when memory is hot-{added|removed}
4232 * 4435 *
4233 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4436 * Ensures that the watermark[min,low,high] values for each zone are set
4234 * with respect to min_free_kbytes. 4437 * correctly with respect to min_free_kbytes.
4235 */ 4438 */
4236void setup_per_zone_pages_min(void) 4439void setup_per_zone_wmarks(void)
4237{ 4440{
4238 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4441 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4239 unsigned long lowmem_pages = 0; 4442 unsigned long lowmem_pages = 0;
@@ -4258,7 +4461,7 @@ void setup_per_zone_pages_min(void)
4258 * need highmem pages, so cap pages_min to a small 4461 * need highmem pages, so cap pages_min to a small
4259 * value here. 4462 * value here.
4260 * 4463 *
4261 * The (pages_high-pages_low) and (pages_low-pages_min) 4464 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4262 * deltas controls asynch page reclaim, and so should 4465 * deltas controls asynch page reclaim, and so should
4263 * not be capped for highmem. 4466 * not be capped for highmem.
4264 */ 4467 */
@@ -4269,17 +4472,17 @@ void setup_per_zone_pages_min(void)
4269 min_pages = SWAP_CLUSTER_MAX; 4472 min_pages = SWAP_CLUSTER_MAX;
4270 if (min_pages > 128) 4473 if (min_pages > 128)
4271 min_pages = 128; 4474 min_pages = 128;
4272 zone->pages_min = min_pages; 4475 zone->watermark[WMARK_MIN] = min_pages;
4273 } else { 4476 } else {
4274 /* 4477 /*
4275 * If it's a lowmem zone, reserve a number of pages 4478 * If it's a lowmem zone, reserve a number of pages
4276 * proportionate to the zone's size. 4479 * proportionate to the zone's size.
4277 */ 4480 */
4278 zone->pages_min = tmp; 4481 zone->watermark[WMARK_MIN] = tmp;
4279 } 4482 }
4280 4483
4281 zone->pages_low = zone->pages_min + (tmp >> 2); 4484 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4282 zone->pages_high = zone->pages_min + (tmp >> 1); 4485 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4283 setup_zone_migrate_reserve(zone); 4486 setup_zone_migrate_reserve(zone);
4284 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4285 } 4488 }
@@ -4289,8 +4492,6 @@ void setup_per_zone_pages_min(void)
4289} 4492}
4290 4493
4291/** 4494/**
4292 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4293 *
4294 * The inactive anon list should be small enough that the VM never has to 4495 * The inactive anon list should be small enough that the VM never has to
4295 * do too much work, but large enough that each inactive page has a chance 4496 * do too much work, but large enough that each inactive page has a chance
4296 * to be referenced again before it is swapped out. 4497 * to be referenced again before it is swapped out.
@@ -4311,21 +4512,26 @@ void setup_per_zone_pages_min(void)
4311 * 1TB 101 10GB 4512 * 1TB 101 10GB
4312 * 10TB 320 32GB 4513 * 10TB 320 32GB
4313 */ 4514 */
4314static void setup_per_zone_inactive_ratio(void) 4515void calculate_zone_inactive_ratio(struct zone *zone)
4315{ 4516{
4316 struct zone *zone; 4517 unsigned int gb, ratio;
4317
4318 for_each_zone(zone) {
4319 unsigned int gb, ratio;
4320 4518
4321 /* Zone size in gigabytes */ 4519 /* Zone size in gigabytes */
4322 gb = zone->present_pages >> (30 - PAGE_SHIFT); 4520 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4521 if (gb)
4323 ratio = int_sqrt(10 * gb); 4522 ratio = int_sqrt(10 * gb);
4324 if (!ratio) 4523 else
4325 ratio = 1; 4524 ratio = 1;
4326 4525
4327 zone->inactive_ratio = ratio; 4526 zone->inactive_ratio = ratio;
4328 } 4527}
4528
4529static void __init setup_per_zone_inactive_ratio(void)
4530{
4531 struct zone *zone;
4532
4533 for_each_zone(zone)
4534 calculate_zone_inactive_ratio(zone);
4329} 4535}
4330 4536
4331/* 4537/*
@@ -4352,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void)
4352 * 8192MB: 11584k 4558 * 8192MB: 11584k
4353 * 16384MB: 16384k 4559 * 16384MB: 16384k
4354 */ 4560 */
4355static int __init init_per_zone_pages_min(void) 4561static int __init init_per_zone_wmark_min(void)
4356{ 4562{
4357 unsigned long lowmem_kbytes; 4563 unsigned long lowmem_kbytes;
4358 4564
@@ -4363,12 +4569,12 @@ static int __init init_per_zone_pages_min(void)
4363 min_free_kbytes = 128; 4569 min_free_kbytes = 128;
4364 if (min_free_kbytes > 65536) 4570 if (min_free_kbytes > 65536)
4365 min_free_kbytes = 65536; 4571 min_free_kbytes = 65536;
4366 setup_per_zone_pages_min(); 4572 setup_per_zone_wmarks();
4367 setup_per_zone_lowmem_reserve(); 4573 setup_per_zone_lowmem_reserve();
4368 setup_per_zone_inactive_ratio(); 4574 setup_per_zone_inactive_ratio();
4369 return 0; 4575 return 0;
4370} 4576}
4371module_init(init_per_zone_pages_min) 4577module_init(init_per_zone_wmark_min)
4372 4578
4373/* 4579/*
4374 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4580 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4380,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4380{ 4586{
4381 proc_dointvec(table, write, file, buffer, length, ppos); 4587 proc_dointvec(table, write, file, buffer, length, ppos);
4382 if (write) 4588 if (write)
4383 setup_per_zone_pages_min(); 4589 setup_per_zone_wmarks();
4384 return 0; 4590 return 0;
4385} 4591}
4386 4592
@@ -4424,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4424 * whenever sysctl_lowmem_reserve_ratio changes. 4630 * whenever sysctl_lowmem_reserve_ratio changes.
4425 * 4631 *
4426 * The reserve ratio obviously has absolutely no relation with the 4632 * The reserve ratio obviously has absolutely no relation with the
4427 * pages_min watermarks. The lowmem reserve ratio can only make sense 4633 * minimum watermarks. The lowmem reserve ratio can only make sense
4428 * if in function of the boot time zone sizes. 4634 * if in function of the boot time zone sizes.
4429 */ 4635 */
4430int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4636int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4531,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename,
4531 else if (hashdist) 4737 else if (hashdist)
4532 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4738 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4533 else { 4739 else {
4534 unsigned long order = get_order(size);
4535 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4536 /* 4740 /*
4537 * If bucketsize is not a power-of-two, we may free 4741 * If bucketsize is not a power-of-two, we may free
4538 * some pages at the end of hash table. 4742 * some pages at the end of hash table which
4743 * alloc_pages_exact() automatically does
4539 */ 4744 */
4540 if (table) { 4745 if (get_order(size) < MAX_ORDER)
4541 unsigned long alloc_end = (unsigned long)table + 4746 table = alloc_pages_exact(size, GFP_ATOMIC);
4542 (PAGE_SIZE << order);
4543 unsigned long used = (unsigned long)table +
4544 PAGE_ALIGN(size);
4545 split_page(virt_to_page(table), order);
4546 while (used < alloc_end) {
4547 free_page(used);
4548 used += PAGE_SIZE;
4549 }
4550 }
4551 } 4747 }
4552 } while (!table && size > PAGE_SIZE && --log2qty); 4748 } while (!table && size > PAGE_SIZE && --log2qty);
4553 4749
diff --git a/mm/page_io.c b/mm/page_io.c
index 3023c475e041..c6f3e5071de3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -120,7 +120,7 @@ out:
120 return ret; 120 return ret;
121} 121}
122 122
123int swap_readpage(struct file *file, struct page *page) 123int swap_readpage(struct page *page)
124{ 124{
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
diff --git a/mm/readahead.c b/mm/readahead.c
index 133b6d525513..aa1aa2345235 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -133,15 +133,12 @@ out:
133} 133}
134 134
135/* 135/*
136 * do_page_cache_readahead actually reads a chunk of disk. It allocates all 136 * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
137 * the pages first, then submits them all for I/O. This avoids the very bad 137 * the pages first, then submits them all for I/O. This avoids the very bad
138 * behaviour which would occur if page allocations are causing VM writeback. 138 * behaviour which would occur if page allocations are causing VM writeback.
139 * We really don't want to intermingle reads and writes like that. 139 * We really don't want to intermingle reads and writes like that.
140 * 140 *
141 * Returns the number of pages requested, or the maximum amount of I/O allowed. 141 * Returns the number of pages requested, or the maximum amount of I/O allowed.
142 *
143 * do_page_cache_readahead() returns -1 if it encountered request queue
144 * congestion.
145 */ 142 */
146static int 143static int
147__do_page_cache_readahead(struct address_space *mapping, struct file *filp, 144__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 207 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
211 return -EINVAL; 208 return -EINVAL;
212 209
210 nr_to_read = max_sane_readahead(nr_to_read);
213 while (nr_to_read) { 211 while (nr_to_read) {
214 int err; 212 int err;
215 213
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
231} 229}
232 230
233/* 231/*
234 * This version skips the IO if the queue is read-congested, and will tell the
235 * block layer to abandon the readahead if request allocation would block.
236 *
237 * force_page_cache_readahead() will ignore queue congestion and will block on
238 * request queues.
239 */
240int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
241 pgoff_t offset, unsigned long nr_to_read)
242{
243 if (bdi_read_congested(mapping->backing_dev_info))
244 return -1;
245
246 return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
247}
248
249/*
250 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 232 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
251 * sensible upper limit. 233 * sensible upper limit.
252 */ 234 */
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr)
259/* 241/*
260 * Submit IO for the read-ahead request in file_ra_state. 242 * Submit IO for the read-ahead request in file_ra_state.
261 */ 243 */
262static unsigned long ra_submit(struct file_ra_state *ra, 244unsigned long ra_submit(struct file_ra_state *ra,
263 struct address_space *mapping, struct file *filp) 245 struct address_space *mapping, struct file *filp)
264{ 246{
265 int actual; 247 int actual;
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
348 */ 330 */
349 331
350/* 332/*
333 * Count contiguously cached pages from @offset-1 to @offset-@max,
334 * this count is a conservative estimation of
335 * - length of the sequential read sequence, or
336 * - thrashing threshold in memory tight systems
337 */
338static pgoff_t count_history_pages(struct address_space *mapping,
339 struct file_ra_state *ra,
340 pgoff_t offset, unsigned long max)
341{
342 pgoff_t head;
343
344 rcu_read_lock();
345 head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
346 rcu_read_unlock();
347
348 return offset - 1 - head;
349}
350
351/*
352 * page cache context based read-ahead
353 */
354static int try_context_readahead(struct address_space *mapping,
355 struct file_ra_state *ra,
356 pgoff_t offset,
357 unsigned long req_size,
358 unsigned long max)
359{
360 pgoff_t size;
361
362 size = count_history_pages(mapping, ra, offset, max);
363
364 /*
365 * no history pages:
366 * it could be a random read
367 */
368 if (!size)
369 return 0;
370
371 /*
372 * starts from beginning of file:
373 * it is a strong indication of long-run stream (or whole-file-read)
374 */
375 if (size >= offset)
376 size *= 2;
377
378 ra->start = offset;
379 ra->size = get_init_ra_size(size + req_size, max);
380 ra->async_size = ra->size;
381
382 return 1;
383}
384
385/*
351 * A minimal readahead algorithm for trivial sequential/random reads. 386 * A minimal readahead algorithm for trivial sequential/random reads.
352 */ 387 */
353static unsigned long 388static unsigned long
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping,
356 bool hit_readahead_marker, pgoff_t offset, 391 bool hit_readahead_marker, pgoff_t offset,
357 unsigned long req_size) 392 unsigned long req_size)
358{ 393{
359 int max = ra->ra_pages; /* max readahead pages */ 394 unsigned long max = max_sane_readahead(ra->ra_pages);
360 pgoff_t prev_offset; 395
361 int sequential; 396 /*
397 * start of file
398 */
399 if (!offset)
400 goto initial_readahead;
362 401
363 /* 402 /*
364 * It's the expected callback offset, assume sequential access. 403 * It's the expected callback offset, assume sequential access.
365 * Ramp up sizes, and push forward the readahead window. 404 * Ramp up sizes, and push forward the readahead window.
366 */ 405 */
367 if (offset && (offset == (ra->start + ra->size - ra->async_size) || 406 if ((offset == (ra->start + ra->size - ra->async_size) ||
368 offset == (ra->start + ra->size))) { 407 offset == (ra->start + ra->size))) {
369 ra->start += ra->size; 408 ra->start += ra->size;
370 ra->size = get_next_ra_size(ra, max); 409 ra->size = get_next_ra_size(ra, max);
371 ra->async_size = ra->size; 410 ra->async_size = ra->size;
372 goto readit; 411 goto readit;
373 } 412 }
374 413
375 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
376 sequential = offset - prev_offset <= 1UL || req_size > max;
377
378 /*
379 * Standalone, small read.
380 * Read as is, and do not pollute the readahead state.
381 */
382 if (!hit_readahead_marker && !sequential) {
383 return __do_page_cache_readahead(mapping, filp,
384 offset, req_size, 0);
385 }
386
387 /* 414 /*
388 * Hit a marked page without valid readahead state. 415 * Hit a marked page without valid readahead state.
389 * E.g. interleaved reads. 416 * E.g. interleaved reads.
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping,
394 pgoff_t start; 421 pgoff_t start;
395 422
396 rcu_read_lock(); 423 rcu_read_lock();
397 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); 424 start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
398 rcu_read_unlock(); 425 rcu_read_unlock();
399 426
400 if (!start || start - offset > max) 427 if (!start || start - offset > max)
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping,
402 429
403 ra->start = start; 430 ra->start = start;
404 ra->size = start - offset; /* old async_size */ 431 ra->size = start - offset; /* old async_size */
432 ra->size += req_size;
405 ra->size = get_next_ra_size(ra, max); 433 ra->size = get_next_ra_size(ra, max);
406 ra->async_size = ra->size; 434 ra->async_size = ra->size;
407 goto readit; 435 goto readit;
408 } 436 }
409 437
410 /* 438 /*
411 * It may be one of 439 * oversize read
412 * - first read on start of file 440 */
413 * - sequential cache miss 441 if (req_size > max)
414 * - oversize random read 442 goto initial_readahead;
415 * Start readahead for it. 443
444 /*
445 * sequential cache miss
446 */
447 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
448 goto initial_readahead;
449
450 /*
451 * Query the page cache and look for the traces(cached history pages)
452 * that a sequential stream would leave behind.
453 */
454 if (try_context_readahead(mapping, ra, offset, req_size, max))
455 goto readit;
456
457 /*
458 * standalone, small random read
459 * Read as is, and do not pollute the readahead state.
416 */ 460 */
461 return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
462
463initial_readahead:
417 ra->start = offset; 464 ra->start = offset;
418 ra->size = get_init_ra_size(req_size, max); 465 ra->size = get_init_ra_size(req_size, max);
419 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 466 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
420 467
421readit: 468readit:
469 /*
470 * Will this read hit the readahead marker made by itself?
471 * If so, trigger the readahead marker hit now, and merge
472 * the resulted next readahead window into the current one.
473 */
474 if (offset == ra->start && ra->size == ra->async_size) {
475 ra->async_size = get_next_ra_size(ra, max);
476 ra->size += ra->async_size;
477 }
478
422 return ra_submit(ra, mapping, filp); 479 return ra_submit(ra, mapping, filp);
423} 480}
424 481
diff --git a/mm/rmap.c b/mm/rmap.c
index 23122af32611..c9ccc1a72dc3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
333 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
334 */ 334 */
335static int page_referenced_one(struct page *page, 335static int page_referenced_one(struct page *page,
336 struct vm_area_struct *vma, unsigned int *mapcount) 336 struct vm_area_struct *vma,
337 unsigned int *mapcount,
338 unsigned long *vm_flags)
337{ 339{
338 struct mm_struct *mm = vma->vm_mm; 340 struct mm_struct *mm = vma->vm_mm;
339 unsigned long address; 341 unsigned long address;
@@ -381,11 +383,14 @@ out_unmap:
381 (*mapcount)--; 383 (*mapcount)--;
382 pte_unmap_unlock(pte, ptl); 384 pte_unmap_unlock(pte, ptl);
383out: 385out:
386 if (referenced)
387 *vm_flags |= vma->vm_flags;
384 return referenced; 388 return referenced;
385} 389}
386 390
387static int page_referenced_anon(struct page *page, 391static int page_referenced_anon(struct page *page,
388 struct mem_cgroup *mem_cont) 392 struct mem_cgroup *mem_cont,
393 unsigned long *vm_flags)
389{ 394{
390 unsigned int mapcount; 395 unsigned int mapcount;
391 struct anon_vma *anon_vma; 396 struct anon_vma *anon_vma;
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page,
405 */ 410 */
406 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 411 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
407 continue; 412 continue;
408 referenced += page_referenced_one(page, vma, &mapcount); 413 referenced += page_referenced_one(page, vma,
414 &mapcount, vm_flags);
409 if (!mapcount) 415 if (!mapcount)
410 break; 416 break;
411 } 417 }
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page,
418 * page_referenced_file - referenced check for object-based rmap 424 * page_referenced_file - referenced check for object-based rmap
419 * @page: the page we're checking references on. 425 * @page: the page we're checking references on.
420 * @mem_cont: target memory controller 426 * @mem_cont: target memory controller
427 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
421 * 428 *
422 * For an object-based mapped page, find all the places it is mapped and 429 * For an object-based mapped page, find all the places it is mapped and
423 * check/clear the referenced flag. This is done by following the page->mapping 430 * check/clear the referenced flag. This is done by following the page->mapping
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page,
427 * This function is only called from page_referenced for object-based pages. 434 * This function is only called from page_referenced for object-based pages.
428 */ 435 */
429static int page_referenced_file(struct page *page, 436static int page_referenced_file(struct page *page,
430 struct mem_cgroup *mem_cont) 437 struct mem_cgroup *mem_cont,
438 unsigned long *vm_flags)
431{ 439{
432 unsigned int mapcount; 440 unsigned int mapcount;
433 struct address_space *mapping = page->mapping; 441 struct address_space *mapping = page->mapping;
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page,
467 */ 475 */
468 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 476 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
469 continue; 477 continue;
470 referenced += page_referenced_one(page, vma, &mapcount); 478 referenced += page_referenced_one(page, vma,
479 &mapcount, vm_flags);
471 if (!mapcount) 480 if (!mapcount)
472 break; 481 break;
473 } 482 }
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page,
481 * @page: the page to test 490 * @page: the page to test
482 * @is_locked: caller holds lock on the page 491 * @is_locked: caller holds lock on the page
483 * @mem_cont: target memory controller 492 * @mem_cont: target memory controller
493 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
484 * 494 *
485 * Quick test_and_clear_referenced for all mappings to a page, 495 * Quick test_and_clear_referenced for all mappings to a page,
486 * returns the number of ptes which referenced the page. 496 * returns the number of ptes which referenced the page.
487 */ 497 */
488int page_referenced(struct page *page, int is_locked, 498int page_referenced(struct page *page,
489 struct mem_cgroup *mem_cont) 499 int is_locked,
500 struct mem_cgroup *mem_cont,
501 unsigned long *vm_flags)
490{ 502{
491 int referenced = 0; 503 int referenced = 0;
492 504
493 if (TestClearPageReferenced(page)) 505 if (TestClearPageReferenced(page))
494 referenced++; 506 referenced++;
495 507
508 *vm_flags = 0;
496 if (page_mapped(page) && page->mapping) { 509 if (page_mapped(page) && page->mapping) {
497 if (PageAnon(page)) 510 if (PageAnon(page))
498 referenced += page_referenced_anon(page, mem_cont); 511 referenced += page_referenced_anon(page, mem_cont,
512 vm_flags);
499 else if (is_locked) 513 else if (is_locked)
500 referenced += page_referenced_file(page, mem_cont); 514 referenced += page_referenced_file(page, mem_cont,
515 vm_flags);
501 else if (!trylock_page(page)) 516 else if (!trylock_page(page))
502 referenced++; 517 referenced++;
503 else { 518 else {
504 if (page->mapping) 519 if (page->mapping)
505 referenced += 520 referenced += page_referenced_file(page,
506 page_referenced_file(page, mem_cont); 521 mem_cont, vm_flags);
507 unlock_page(page); 522 unlock_page(page);
508 } 523 }
509 } 524 }
@@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration)
1202 return ret; 1217 return ret;
1203} 1218}
1204 1219
1205#ifdef CONFIG_UNEVICTABLE_LRU
1206/** 1220/**
1207 * try_to_munlock - try to munlock a page 1221 * try_to_munlock - try to munlock a page
1208 * @page: the page to be munlocked 1222 * @page: the page to be munlocked
@@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page)
1226 else 1240 else
1227 return try_to_unmap_file(page, 1, 0); 1241 return try_to_unmap_file(page, 1, 0);
1228} 1242}
1229#endif 1243
diff --git a/mm/shmem.c b/mm/shmem.c
index 0132fbd45a23..e89d7ec18eda 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 swap_free(swap); 1100 swapcache_free(swap, NULL);
1101redirty: 1101redirty:
1102 set_page_dirty(page); 1102 set_page_dirty(page);
1103 if (wbc->for_reclaim) 1103 if (wbc->for_reclaim)
@@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2612 * @size: size to be set for the file 2612 * @size: size to be set for the file
2613 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2613 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2614 */ 2614 */
2615struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) 2615struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2616{ 2616{
2617 int error; 2617 int error;
2618 struct file *file; 2618 struct file *file;
diff --git a/mm/slab.c b/mm/slab.c
index af3376d0a833..f257d4dd474d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -818,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
818 */ 818 */
819 819
820static int use_alien_caches __read_mostly = 1; 820static int use_alien_caches __read_mostly = 1;
821static int numa_platform __read_mostly = 1;
822static int __init noaliencache_setup(char *s) 821static int __init noaliencache_setup(char *s)
823{ 822{
824 use_alien_caches = 0; 823 use_alien_caches = 0;
@@ -1377,10 +1376,8 @@ void __init kmem_cache_init(void)
1377 int order; 1376 int order;
1378 int node; 1377 int node;
1379 1378
1380 if (num_possible_nodes() == 1) { 1379 if (num_possible_nodes() == 1)
1381 use_alien_caches = 0; 1380 use_alien_caches = 0;
1382 numa_platform = 0;
1383 }
1384 1381
1385 for (i = 0; i < NUM_INIT_LISTS; i++) { 1382 for (i = 0; i < NUM_INIT_LISTS; i++) {
1386 kmem_list3_init(&initkmem_list3[i]); 1383 kmem_list3_init(&initkmem_list3[i]);
@@ -1627,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1627 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1624 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1628 flags |= __GFP_RECLAIMABLE; 1625 flags |= __GFP_RECLAIMABLE;
1629 1626
1630 page = alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); 1627 page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
1631 if (!page) 1628 if (!page)
1632 return NULL; 1629 return NULL;
1633 1630
@@ -3193,7 +3190,7 @@ retry:
3193 if (local_flags & __GFP_WAIT) 3190 if (local_flags & __GFP_WAIT)
3194 local_irq_enable(); 3191 local_irq_enable();
3195 kmem_flagcheck(cache, flags); 3192 kmem_flagcheck(cache, flags);
3196 obj = kmem_getpages(cache, local_flags, -1); 3193 obj = kmem_getpages(cache, local_flags, numa_node_id());
3197 if (local_flags & __GFP_WAIT) 3194 if (local_flags & __GFP_WAIT)
3198 local_irq_disable(); 3195 local_irq_disable();
3199 if (obj) { 3196 if (obj) {
@@ -3530,7 +3527,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3530 * variable to skip the call, which is mostly likely to be present in 3527 * variable to skip the call, which is mostly likely to be present in
3531 * the cache. 3528 * the cache.
3532 */ 3529 */
3533 if (numa_platform && cache_free_alien(cachep, objp)) 3530 if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
3534 return; 3531 return;
3535 3532
3536 if (likely(ac->avail < ac->limit)) { 3533 if (likely(ac->avail < ac->limit)) {
diff --git a/mm/slob.c b/mm/slob.c
index 12f261499925..64f6db1943bf 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -46,7 +46,7 @@
46 * NUMA support in SLOB is fairly simplistic, pushing most of the real 46 * NUMA support in SLOB is fairly simplistic, pushing most of the real
47 * logic down to the page allocator, and simply doing the node accounting 47 * logic down to the page allocator, and simply doing the node accounting
48 * on the upper levels. In the event that a node id is explicitly 48 * on the upper levels. In the event that a node id is explicitly
49 * provided, alloc_pages_node() with the specified node id is used 49 * provided, alloc_pages_exact_node() with the specified node id is used
50 * instead. The common case (or when the node id isn't explicitly provided) 50 * instead. The common case (or when the node id isn't explicitly provided)
51 * will default to the current node, as per numa_node_id(). 51 * will default to the current node, as per numa_node_id().
52 * 52 *
@@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
244 244
245#ifdef CONFIG_NUMA 245#ifdef CONFIG_NUMA
246 if (node != -1) 246 if (node != -1)
247 page = alloc_pages_node(node, gfp, order); 247 page = alloc_pages_exact_node(node, gfp, order);
248 else 248 else
249#endif 249#endif
250 page = alloc_pages(gfp, order); 250 page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 15960a09abb1..2701419b0adc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3765,7 +3765,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3765 to_cpumask(l->cpus)); 3765 to_cpumask(l->cpus));
3766 } 3766 }
3767 3767
3768 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3768 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
3769 len < PAGE_SIZE - 60) { 3769 len < PAGE_SIZE - 60) {
3770 len += sprintf(buf + len, " nodes="); 3770 len += sprintf(buf + len, " nodes=");
3771 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3771 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1416e7e9e02d..42cd38eba79f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page)
124/** 124/**
125 * add_to_swap - allocate swap space for a page 125 * add_to_swap - allocate swap space for a page
126 * @page: page we want to move to swap 126 * @page: page we want to move to swap
127 * @gfp_mask: memory allocation flags
128 * 127 *
129 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
@@ -162,11 +161,11 @@ int add_to_swap(struct page *page)
162 return 1; 161 return 1;
163 case -EEXIST: 162 case -EEXIST:
164 /* Raced with "speculative" read_swap_cache_async */ 163 /* Raced with "speculative" read_swap_cache_async */
165 swap_free(entry); 164 swapcache_free(entry, NULL);
166 continue; 165 continue;
167 default: 166 default:
168 /* -ENOMEM radix-tree allocation failure */ 167 /* -ENOMEM radix-tree allocation failure */
169 swap_free(entry); 168 swapcache_free(entry, NULL);
170 return 0; 169 return 0;
171 } 170 }
172 } 171 }
@@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page)
188 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
189 spin_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
190 189
191 mem_cgroup_uncharge_swapcache(page, entry); 190 swapcache_free(entry, page);
192 swap_free(entry);
193 page_cache_release(page); 191 page_cache_release(page);
194} 192}
195 193
@@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
293 /* 291 /*
294 * Swap entry may have been freed since our caller observed it. 292 * Swap entry may have been freed since our caller observed it.
295 */ 293 */
296 if (!swap_duplicate(entry)) 294 err = swapcache_prepare(entry);
295 if (err == -EEXIST) /* seems racy */
296 continue;
297 if (err) /* swp entry is obsolete ? */
297 break; 298 break;
298 299
299 /* 300 /*
@@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
312 * Initiate read into locked page and return. 313 * Initiate read into locked page and return.
313 */ 314 */
314 lru_cache_add_anon(new_page); 315 lru_cache_add_anon(new_page);
315 swap_readpage(NULL, new_page); 316 swap_readpage(new_page);
316 return new_page; 317 return new_page;
317 } 318 }
318 ClearPageSwapBacked(new_page); 319 ClearPageSwapBacked(new_page);
319 __clear_page_locked(new_page); 320 __clear_page_locked(new_page);
320 swap_free(entry); 321 swapcache_free(entry, NULL);
321 } while (err != -ENOMEM); 322 } while (err != -ENOMEM);
322 323
323 if (new_page) 324 if (new_page)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 312fafe0ab6e..28faa01cf578 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES];
53 53
54static DEFINE_MUTEX(swapon_mutex); 54static DEFINE_MUTEX(swapon_mutex);
55 55
56/* For reference count accounting in swap_map */
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
82/* returns 1 if swap entry is freed */
83static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{
86 int type = si - swap_info;
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page;
89 int ret = 0;
90
91 page = find_get_page(&swapper_space, entry.val);
92 if (!page)
93 return 0;
94 /*
95 * This function is called from scan_swap_map() and it's called
96 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
97 * We have to use trylock for avoiding deadlock. This is a special
98 * case and you should use try_to_free_swap() with explicit lock_page()
99 * in usual operations.
100 */
101 if (trylock_page(page)) {
102 ret = try_to_free_swap(page);
103 unlock_page(page);
104 }
105 page_cache_release(page);
106 return ret;
107}
108
56/* 109/*
57 * We need this because the bdev->unplug_fn can sleep and we cannot 110 * We need this because the bdev->unplug_fn can sleep and we cannot
58 * hold swap_lock while calling the unplug_fn. And swap_lock 111 * hold swap_lock while calling the unplug_fn. And swap_lock
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word)
167#define SWAPFILE_CLUSTER 256 220#define SWAPFILE_CLUSTER 256
168#define LATENCY_LIMIT 256 221#define LATENCY_LIMIT 256
169 222
170static inline unsigned long scan_swap_map(struct swap_info_struct *si) 223static inline unsigned long scan_swap_map(struct swap_info_struct *si,
224 int cache)
171{ 225{
172 unsigned long offset; 226 unsigned long offset;
173 unsigned long scan_base; 227 unsigned long scan_base;
@@ -273,6 +327,19 @@ checks:
273 goto no_page; 327 goto no_page;
274 if (offset > si->highest_bit) 328 if (offset > si->highest_bit)
275 scan_base = offset = si->lowest_bit; 329 scan_base = offset = si->lowest_bit;
330
331 /* reuse swap entry of cache-only swap if not busy. */
332 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
333 int swap_was_freed;
334 spin_unlock(&swap_lock);
335 swap_was_freed = __try_to_reclaim_swap(si, offset);
336 spin_lock(&swap_lock);
337 /* entry was freed successfully, try to use this again */
338 if (swap_was_freed)
339 goto checks;
340 goto scan; /* check next one */
341 }
342
276 if (si->swap_map[offset]) 343 if (si->swap_map[offset])
277 goto scan; 344 goto scan;
278 345
@@ -285,7 +352,10 @@ checks:
285 si->lowest_bit = si->max; 352 si->lowest_bit = si->max;
286 si->highest_bit = 0; 353 si->highest_bit = 0;
287 } 354 }
288 si->swap_map[offset] = 1; 355 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
356 si->swap_map[offset] = encode_swapmap(0, true);
357 else /* at suspend */
358 si->swap_map[offset] = encode_swapmap(1, false);
289 si->cluster_next = offset + 1; 359 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING; 360 si->flags -= SWP_SCANNING;
291 361
@@ -351,6 +421,10 @@ scan:
351 spin_lock(&swap_lock); 421 spin_lock(&swap_lock);
352 goto checks; 422 goto checks;
353 } 423 }
424 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
425 spin_lock(&swap_lock);
426 goto checks;
427 }
354 if (unlikely(--latency_ration < 0)) { 428 if (unlikely(--latency_ration < 0)) {
355 cond_resched(); 429 cond_resched();
356 latency_ration = LATENCY_LIMIT; 430 latency_ration = LATENCY_LIMIT;
@@ -362,6 +436,10 @@ scan:
362 spin_lock(&swap_lock); 436 spin_lock(&swap_lock);
363 goto checks; 437 goto checks;
364 } 438 }
439 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
440 spin_lock(&swap_lock);
441 goto checks;
442 }
365 if (unlikely(--latency_ration < 0)) { 443 if (unlikely(--latency_ration < 0)) {
366 cond_resched(); 444 cond_resched();
367 latency_ration = LATENCY_LIMIT; 445 latency_ration = LATENCY_LIMIT;
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void)
401 continue; 479 continue;
402 480
403 swap_list.next = next; 481 swap_list.next = next;
404 offset = scan_swap_map(si); 482 /* This is called for allocating swap entry for cache */
483 offset = scan_swap_map(si, SWAP_CACHE);
405 if (offset) { 484 if (offset) {
406 spin_unlock(&swap_lock); 485 spin_unlock(&swap_lock);
407 return swp_entry(type, offset); 486 return swp_entry(type, offset);
@@ -415,6 +494,7 @@ noswap:
415 return (swp_entry_t) {0}; 494 return (swp_entry_t) {0};
416} 495}
417 496
497/* The only caller of this function is now suspend routine */
418swp_entry_t get_swap_page_of_type(int type) 498swp_entry_t get_swap_page_of_type(int type)
419{ 499{
420 struct swap_info_struct *si; 500 struct swap_info_struct *si;
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type)
424 si = swap_info + type; 504 si = swap_info + type;
425 if (si->flags & SWP_WRITEOK) { 505 if (si->flags & SWP_WRITEOK) {
426 nr_swap_pages--; 506 nr_swap_pages--;
427 offset = scan_swap_map(si); 507 /* This is called for allocating swap entry, not cache */
508 offset = scan_swap_map(si, SWAP_MAP);
428 if (offset) { 509 if (offset) {
429 spin_unlock(&swap_lock); 510 spin_unlock(&swap_lock);
430 return swp_entry(type, offset); 511 return swp_entry(type, offset);
@@ -471,25 +552,38 @@ out:
471 return NULL; 552 return NULL;
472} 553}
473 554
474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) 555static int swap_entry_free(struct swap_info_struct *p,
556 swp_entry_t ent, int cache)
475{ 557{
476 unsigned long offset = swp_offset(ent); 558 unsigned long offset = swp_offset(ent);
477 int count = p->swap_map[offset]; 559 int count = swap_count(p->swap_map[offset]);
478 560 bool has_cache;
479 if (count < SWAP_MAP_MAX) { 561
480 count--; 562 has_cache = swap_has_cache(p->swap_map[offset]);
481 p->swap_map[offset] = count; 563
482 if (!count) { 564 if (cache == SWAP_MAP) { /* dropping usage count of swap */
483 if (offset < p->lowest_bit) 565 if (count < SWAP_MAP_MAX) {
484 p->lowest_bit = offset; 566 count--;
485 if (offset > p->highest_bit) 567 p->swap_map[offset] = encode_swapmap(count, has_cache);
486 p->highest_bit = offset;
487 if (p->prio > swap_info[swap_list.next].prio)
488 swap_list.next = p - swap_info;
489 nr_swap_pages++;
490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
492 } 568 }
569 } else { /* dropping swap cache flag */
570 VM_BUG_ON(!has_cache);
571 p->swap_map[offset] = encode_swapmap(count, false);
572
573 }
574 /* return code. */
575 count = p->swap_map[offset];
576 /* free if no reference */
577 if (!count) {
578 if (offset < p->lowest_bit)
579 p->lowest_bit = offset;
580 if (offset > p->highest_bit)
581 p->highest_bit = offset;
582 if (p->prio > swap_info[swap_list.next].prio)
583 swap_list.next = p - swap_info;
584 nr_swap_pages++;
585 p->inuse_pages--;
586 mem_cgroup_uncharge_swap(ent);
493 } 587 }
494 return count; 588 return count;
495} 589}
@@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry)
504 598
505 p = swap_info_get(entry); 599 p = swap_info_get(entry);
506 if (p) { 600 if (p) {
507 swap_entry_free(p, entry); 601 swap_entry_free(p, entry, SWAP_MAP);
602 spin_unlock(&swap_lock);
603 }
604}
605
606/*
607 * Called after dropping swapcache to decrease refcnt to swap entries.
608 */
609void swapcache_free(swp_entry_t entry, struct page *page)
610{
611 struct swap_info_struct *p;
612
613 if (page)
614 mem_cgroup_uncharge_swapcache(page, entry);
615 p = swap_info_get(entry);
616 if (p) {
617 swap_entry_free(p, entry, SWAP_CACHE);
508 spin_unlock(&swap_lock); 618 spin_unlock(&swap_lock);
509 } 619 }
620 return;
510} 621}
511 622
512/* 623/*
@@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page)
521 entry.val = page_private(page); 632 entry.val = page_private(page);
522 p = swap_info_get(entry); 633 p = swap_info_get(entry);
523 if (p) { 634 if (p) {
524 /* Subtract the 1 for the swap cache itself */ 635 count = swap_count(p->swap_map[swp_offset(entry)]);
525 count = p->swap_map[swp_offset(entry)] - 1;
526 spin_unlock(&swap_lock); 636 spin_unlock(&swap_lock);
527 } 637 }
528 return count; 638 return count;
@@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry)
584 694
585 p = swap_info_get(entry); 695 p = swap_info_get(entry);
586 if (p) { 696 if (p) {
587 if (swap_entry_free(p, entry) == 1) { 697 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
588 page = find_get_page(&swapper_space, entry.val); 698 page = find_get_page(&swapper_space, entry.val);
589 if (page && !trylock_page(page)) { 699 if (page && !trylock_page(page)) {
590 page_cache_release(page); 700 page_cache_release(page);
@@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
891 i = 1; 1001 i = 1;
892 } 1002 }
893 count = si->swap_map[i]; 1003 count = si->swap_map[i];
894 if (count && count != SWAP_MAP_BAD) 1004 if (count && swap_count(count) != SWAP_MAP_BAD)
895 break; 1005 break;
896 } 1006 }
897 return i; 1007 return i;
@@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type)
995 */ 1105 */
996 shmem = 0; 1106 shmem = 0;
997 swcount = *swap_map; 1107 swcount = *swap_map;
998 if (swcount > 1) { 1108 if (swap_count(swcount)) {
999 if (start_mm == &init_mm) 1109 if (start_mm == &init_mm)
1000 shmem = shmem_unuse(entry, page); 1110 shmem = shmem_unuse(entry, page);
1001 else 1111 else
1002 retval = unuse_mm(start_mm, entry, page); 1112 retval = unuse_mm(start_mm, entry, page);
1003 } 1113 }
1004 if (*swap_map > 1) { 1114 if (swap_count(*swap_map)) {
1005 int set_start_mm = (*swap_map >= swcount); 1115 int set_start_mm = (*swap_map >= swcount);
1006 struct list_head *p = &start_mm->mmlist; 1116 struct list_head *p = &start_mm->mmlist;
1007 struct mm_struct *new_start_mm = start_mm; 1117 struct mm_struct *new_start_mm = start_mm;
@@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type)
1011 atomic_inc(&new_start_mm->mm_users); 1121 atomic_inc(&new_start_mm->mm_users);
1012 atomic_inc(&prev_mm->mm_users); 1122 atomic_inc(&prev_mm->mm_users);
1013 spin_lock(&mmlist_lock); 1123 spin_lock(&mmlist_lock);
1014 while (*swap_map > 1 && !retval && !shmem && 1124 while (swap_count(*swap_map) && !retval && !shmem &&
1015 (p = p->next) != &start_mm->mmlist) { 1125 (p = p->next) != &start_mm->mmlist) {
1016 mm = list_entry(p, struct mm_struct, mmlist); 1126 mm = list_entry(p, struct mm_struct, mmlist);
1017 if (!atomic_inc_not_zero(&mm->mm_users)) 1127 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type)
1023 cond_resched(); 1133 cond_resched();
1024 1134
1025 swcount = *swap_map; 1135 swcount = *swap_map;
1026 if (swcount <= 1) 1136 if (!swap_count(swcount)) /* any usage ? */
1027 ; 1137 ;
1028 else if (mm == &init_mm) { 1138 else if (mm == &init_mm) {
1029 set_start_mm = 1; 1139 set_start_mm = 1;
1030 shmem = shmem_unuse(entry, page); 1140 shmem = shmem_unuse(entry, page);
1031 } else 1141 } else
1032 retval = unuse_mm(mm, entry, page); 1142 retval = unuse_mm(mm, entry, page);
1033 if (set_start_mm && *swap_map < swcount) { 1143
1144 if (set_start_mm &&
1145 swap_count(*swap_map) < swcount) {
1034 mmput(new_start_mm); 1146 mmput(new_start_mm);
1035 atomic_inc(&mm->mm_users); 1147 atomic_inc(&mm->mm_users);
1036 new_start_mm = mm; 1148 new_start_mm = mm;
@@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type)
1057 } 1169 }
1058 1170
1059 /* 1171 /*
1060 * How could swap count reach 0x7fff when the maximum 1172 * How could swap count reach 0x7ffe ?
1061 * pid is 0x7fff, and there's no way to repeat a swap 1173 * There's no way to repeat a swap page within an mm
1062 * page within an mm (except in shmem, where it's the 1174 * (except in shmem, where it's the shared object which takes
1063 * shared object which takes the reference count)? 1175 * the reference count)?
1064 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. 1176 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
1065 * 1177 * short is too small....)
1066 * If that's wrong, then we should worry more about 1178 * If that's wrong, then we should worry more about
1067 * exit_mmap() and do_munmap() cases described above: 1179 * exit_mmap() and do_munmap() cases described above:
1068 * we might be resetting SWAP_MAP_MAX too early here. 1180 * we might be resetting SWAP_MAP_MAX too early here.
1069 * We know "Undead"s can happen, they're okay, so don't 1181 * We know "Undead"s can happen, they're okay, so don't
1070 * report them; but do report if we reset SWAP_MAP_MAX. 1182 * report them; but do report if we reset SWAP_MAP_MAX.
1071 */ 1183 */
1072 if (*swap_map == SWAP_MAP_MAX) { 1184 /* We might release the lock_page() in unuse_mm(). */
1185 if (!PageSwapCache(page) || page_private(page) != entry.val)
1186 goto retry;
1187
1188 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1073 spin_lock(&swap_lock); 1189 spin_lock(&swap_lock);
1074 *swap_map = 1; 1190 *swap_map = encode_swapmap(0, true);
1075 spin_unlock(&swap_lock); 1191 spin_unlock(&swap_lock);
1076 reset_overflow = 1; 1192 reset_overflow = 1;
1077 } 1193 }
@@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type)
1089 * pages would be incorrect if swap supported "shared 1205 * pages would be incorrect if swap supported "shared
1090 * private" pages, but they are handled by tmpfs files. 1206 * private" pages, but they are handled by tmpfs files.
1091 */ 1207 */
1092 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 1208 if (swap_count(*swap_map) &&
1209 PageDirty(page) && PageSwapCache(page)) {
1093 struct writeback_control wbc = { 1210 struct writeback_control wbc = {
1094 .sync_mode = WB_SYNC_NONE, 1211 .sync_mode = WB_SYNC_NONE,
1095 }; 1212 };
@@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type)
1116 * mark page dirty so shrink_page_list will preserve it. 1233 * mark page dirty so shrink_page_list will preserve it.
1117 */ 1234 */
1118 SetPageDirty(page); 1235 SetPageDirty(page);
1236retry:
1119 unlock_page(page); 1237 unlock_page(page);
1120 page_cache_release(page); 1238 page_cache_release(page);
1121 1239
@@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val)
1942 * 2060 *
1943 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as 2061 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1944 * "permanent", but will be reclaimed by the next swapoff. 2062 * "permanent", but will be reclaimed by the next swapoff.
2063 * Returns error code in following case.
2064 * - success -> 0
2065 * - swp_entry is invalid -> EINVAL
2066 * - swp_entry is migration entry -> EINVAL
2067 * - swap-cache reference is requested but there is already one. -> EEXIST
2068 * - swap-cache reference is requested but the entry is not used. -> ENOENT
1945 */ 2069 */
1946int swap_duplicate(swp_entry_t entry) 2070static int __swap_duplicate(swp_entry_t entry, bool cache)
1947{ 2071{
1948 struct swap_info_struct * p; 2072 struct swap_info_struct * p;
1949 unsigned long offset, type; 2073 unsigned long offset, type;
1950 int result = 0; 2074 int result = -EINVAL;
2075 int count;
2076 bool has_cache;
1951 2077
1952 if (is_migration_entry(entry)) 2078 if (is_migration_entry(entry))
1953 return 1; 2079 return -EINVAL;
1954 2080
1955 type = swp_type(entry); 2081 type = swp_type(entry);
1956 if (type >= nr_swapfiles) 2082 if (type >= nr_swapfiles)
@@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry)
1959 offset = swp_offset(entry); 2085 offset = swp_offset(entry);
1960 2086
1961 spin_lock(&swap_lock); 2087 spin_lock(&swap_lock);
1962 if (offset < p->max && p->swap_map[offset]) { 2088
1963 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { 2089 if (unlikely(offset >= p->max))
1964 p->swap_map[offset]++; 2090 goto unlock_out;
1965 result = 1; 2091
1966 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { 2092 count = swap_count(p->swap_map[offset]);
2093 has_cache = swap_has_cache(p->swap_map[offset]);
2094
2095 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
2096
2097 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2098 if (!has_cache && count) {
2099 p->swap_map[offset] = encode_swapmap(count, true);
2100 result = 0;
2101 } else if (has_cache) /* someone added cache */
2102 result = -EEXIST;
2103 else if (!count) /* no users */
2104 result = -ENOENT;
2105
2106 } else if (count || has_cache) {
2107 if (count < SWAP_MAP_MAX - 1) {
2108 p->swap_map[offset] = encode_swapmap(count + 1,
2109 has_cache);
2110 result = 0;
2111 } else if (count <= SWAP_MAP_MAX) {
1967 if (swap_overflow++ < 5) 2112 if (swap_overflow++ < 5)
1968 printk(KERN_WARNING "swap_dup: swap entry overflow\n"); 2113 printk(KERN_WARNING
1969 p->swap_map[offset] = SWAP_MAP_MAX; 2114 "swap_dup: swap entry overflow\n");
1970 result = 1; 2115 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
2116 has_cache);
2117 result = 0;
1971 } 2118 }
1972 } 2119 } else
2120 result = -ENOENT; /* unused swap entry */
2121unlock_out:
1973 spin_unlock(&swap_lock); 2122 spin_unlock(&swap_lock);
1974out: 2123out:
1975 return result; 2124 return result;
@@ -1978,6 +2127,27 @@ bad_file:
1978 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2127 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1979 goto out; 2128 goto out;
1980} 2129}
2130/*
2131 * increase reference count of swap entry by 1.
2132 */
2133void swap_duplicate(swp_entry_t entry)
2134{
2135 __swap_duplicate(entry, SWAP_MAP);
2136}
2137
2138/*
2139 * @entry: swap entry for which we allocate swap cache.
2140 *
2141 * Called when allocating swap cache for existing swap entry,
2142 * This can return error codes. Returns 0 at success.
2143 * -EBUSY means there is a swap cache.
2144 * Note: return code is different from swap_duplicate().
2145 */
2146int swapcache_prepare(swp_entry_t entry)
2147{
2148 return __swap_duplicate(entry, SWAP_CACHE);
2149}
2150
1981 2151
1982struct swap_info_struct * 2152struct swap_info_struct *
1983get_swap_info_struct(unsigned type) 2153get_swap_info_struct(unsigned type)
@@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2016 /* Don't read in free or bad pages */ 2186 /* Don't read in free or bad pages */
2017 if (!si->swap_map[toff]) 2187 if (!si->swap_map[toff])
2018 break; 2188 break;
2019 if (si->swap_map[toff] == SWAP_MAP_BAD) 2189 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2020 break; 2190 break;
2021 } 2191 }
2022 /* Count contiguous allocated slots below our target */ 2192 /* Count contiguous allocated slots below our target */
@@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2024 /* Don't read in free or bad pages */ 2194 /* Don't read in free or bad pages */
2025 if (!si->swap_map[toff]) 2195 if (!si->swap_map[toff])
2026 break; 2196 break;
2027 if (si->swap_map[toff] == SWAP_MAP_BAD) 2197 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2028 break; 2198 break;
2029 } 2199 }
2030 spin_unlock(&swap_lock); 2200 spin_unlock(&swap_lock);
diff --git a/mm/truncate.c b/mm/truncate.c
index 12e1579f9165..ccc3ecf7cb98 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
267} 267}
268EXPORT_SYMBOL(truncate_inode_pages); 268EXPORT_SYMBOL(truncate_inode_pages);
269 269
270unsigned long __invalidate_mapping_pages(struct address_space *mapping, 270/**
271 pgoff_t start, pgoff_t end, bool be_atomic) 271 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
272 * @mapping: the address_space which holds the pages to invalidate
273 * @start: the offset 'from' which to invalidate
274 * @end: the offset 'to' which to invalidate (inclusive)
275 *
276 * This function only removes the unlocked pages, if you want to
277 * remove all the pages of one inode, you must call truncate_inode_pages.
278 *
279 * invalidate_mapping_pages() will not block on IO activity. It will not
280 * invalidate pages which are dirty, locked, under writeback or mapped into
281 * pagetables.
282 */
283unsigned long invalidate_mapping_pages(struct address_space *mapping,
284 pgoff_t start, pgoff_t end)
272{ 285{
273 struct pagevec pvec; 286 struct pagevec pvec;
274 pgoff_t next = start; 287 pgoff_t next = start;
@@ -309,30 +322,10 @@ unlock:
309 break; 322 break;
310 } 323 }
311 pagevec_release(&pvec); 324 pagevec_release(&pvec);
312 if (likely(!be_atomic)) 325 cond_resched();
313 cond_resched();
314 } 326 }
315 return ret; 327 return ret;
316} 328}
317
318/**
319 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
320 * @mapping: the address_space which holds the pages to invalidate
321 * @start: the offset 'from' which to invalidate
322 * @end: the offset 'to' which to invalidate (inclusive)
323 *
324 * This function only removes the unlocked pages, if you want to
325 * remove all the pages of one inode, you must call truncate_inode_pages.
326 *
327 * invalidate_mapping_pages() will not block on IO activity. It will not
328 * invalidate pages which are dirty, locked, under writeback or mapped into
329 * pagetables.
330 */
331unsigned long invalidate_mapping_pages(struct address_space *mapping,
332 pgoff_t start, pgoff_t end)
333{
334 return __invalidate_mapping_pages(mapping, start, end, false);
335}
336EXPORT_SYMBOL(invalidate_mapping_pages); 329EXPORT_SYMBOL(invalidate_mapping_pages);
337 330
338/* 331/*
diff --git a/mm/util.c b/mm/util.c
index abc65aa7cdfc..d5d2213728c5 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
233 * @pages: array that receives pointers to the pages pinned. 233 * @pages: array that receives pointers to the pages pinned.
234 * Should be at least nr_pages long. 234 * Should be at least nr_pages long.
235 * 235 *
236 * Attempt to pin user pages in memory without taking mm->mmap_sem.
237 * If not successful, it will fall back to taking the lock and
238 * calling get_user_pages().
239 *
240 * Returns number of pages pinned. This may be fewer than the number 236 * Returns number of pages pinned. This may be fewer than the number
241 * requested. If nr_pages is 0 or negative, returns 0. If no pages 237 * requested. If nr_pages is 0 or negative, returns 0. If no pages
242 * were pinned, returns -errno. 238 * were pinned, returns -errno.
239 *
240 * get_user_pages_fast provides equivalent functionality to get_user_pages,
241 * operating on current and current->mm, with force=0 and vma=NULL. However
242 * unlike get_user_pages, it must be called without mmap_sem held.
243 *
244 * get_user_pages_fast may take mmap_sem and page table locks, so no
245 * assumptions can be made about lack of locking. get_user_pages_fast is to be
246 * implemented in a way that is advantageous (vs get_user_pages()) when the
247 * user memory area is already faulted in and present in ptes. However if the
248 * pages have to be faulted in, it may turn out to be slightly slower so
249 * callers need to carefully consider what to use. On many architectures,
250 * get_user_pages_fast simply falls back to get_user_pages.
243 */ 251 */
244int __attribute__((weak)) get_user_pages_fast(unsigned long start, 252int __attribute__((weak)) get_user_pages_fast(unsigned long start,
245 int nr_pages, int write, struct page **pages) 253 int nr_pages, int write, struct page **pages)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95c08a8cc2ba..4139aa52b941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
470 swp_entry_t swap = { .val = page_private(page) }; 470 swp_entry_t swap = { .val = page_private(page) };
471 __delete_from_swap_cache(page); 471 __delete_from_swap_cache(page);
472 spin_unlock_irq(&mapping->tree_lock); 472 spin_unlock_irq(&mapping->tree_lock);
473 mem_cgroup_uncharge_swapcache(page, swap); 473 swapcache_free(swap, page);
474 swap_free(swap);
475 } else { 474 } else {
476 __remove_from_page_cache(page); 475 __remove_from_page_cache(page);
477 spin_unlock_irq(&mapping->tree_lock); 476 spin_unlock_irq(&mapping->tree_lock);
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
514 * 513 *
515 * lru_lock must not be held, interrupts must be enabled. 514 * lru_lock must not be held, interrupts must be enabled.
516 */ 515 */
517#ifdef CONFIG_UNEVICTABLE_LRU
518void putback_lru_page(struct page *page) 516void putback_lru_page(struct page *page)
519{ 517{
520 int lru; 518 int lru;
@@ -568,20 +566,6 @@ redo:
568 put_page(page); /* drop ref from isolate */ 566 put_page(page); /* drop ref from isolate */
569} 567}
570 568
571#else /* CONFIG_UNEVICTABLE_LRU */
572
573void putback_lru_page(struct page *page)
574{
575 int lru;
576 VM_BUG_ON(PageLRU(page));
577
578 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
579 lru_cache_add_lru(page, lru);
580 put_page(page);
581}
582#endif /* CONFIG_UNEVICTABLE_LRU */
583
584
585/* 569/*
586 * shrink_page_list() returns the number of reclaimed pages 570 * shrink_page_list() returns the number of reclaimed pages
587 */ 571 */
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
593 struct pagevec freed_pvec; 577 struct pagevec freed_pvec;
594 int pgactivate = 0; 578 int pgactivate = 0;
595 unsigned long nr_reclaimed = 0; 579 unsigned long nr_reclaimed = 0;
580 unsigned long vm_flags;
596 581
597 cond_resched(); 582 cond_resched();
598 583
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
643 goto keep_locked; 628 goto keep_locked;
644 } 629 }
645 630
646 referenced = page_referenced(page, 1, sc->mem_cgroup); 631 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags);
647 /* In active use or really unfreeable? Activate it. */ 633 /* In active use or really unfreeable? Activate it. */
648 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
649 referenced && page_mapping_inuse(page)) 635 referenced && page_mapping_inuse(page))
@@ -943,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
943 /* Check that we have not crossed a zone boundary. */ 929 /* Check that we have not crossed a zone boundary. */
944 if (unlikely(page_zone_id(cursor_page) != zone_id)) 930 if (unlikely(page_zone_id(cursor_page) != zone_id))
945 continue; 931 continue;
946 switch (__isolate_lru_page(cursor_page, mode, file)) { 932 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
947 case 0:
948 list_move(&cursor_page->lru, dst); 933 list_move(&cursor_page->lru, dst);
949 nr_taken++; 934 nr_taken++;
950 scan++; 935 scan++;
951 break;
952
953 case -EBUSY:
954 /* else it is being freed elsewhere */
955 list_move(&cursor_page->lru, src);
956 default:
957 break; /* ! on LRU or wrong list */
958 } 936 }
959 } 937 }
960 } 938 }
@@ -1061,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1061 unsigned long nr_scanned = 0; 1039 unsigned long nr_scanned = 0;
1062 unsigned long nr_reclaimed = 0; 1040 unsigned long nr_reclaimed = 0;
1063 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1041 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1042 int lumpy_reclaim = 0;
1043
1044 /*
1045 * If we need a large contiguous chunk of memory, or have
1046 * trouble getting a small set of contiguous pages, we
1047 * will reclaim both active and inactive pages.
1048 *
1049 * We use the same threshold as pageout congestion_wait below.
1050 */
1051 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1052 lumpy_reclaim = 1;
1053 else if (sc->order && priority < DEF_PRIORITY - 2)
1054 lumpy_reclaim = 1;
1064 1055
1065 pagevec_init(&pvec, 1); 1056 pagevec_init(&pvec, 1);
1066 1057
@@ -1073,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1073 unsigned long nr_freed; 1064 unsigned long nr_freed;
1074 unsigned long nr_active; 1065 unsigned long nr_active;
1075 unsigned int count[NR_LRU_LISTS] = { 0, }; 1066 unsigned int count[NR_LRU_LISTS] = { 0, };
1076 int mode = ISOLATE_INACTIVE; 1067 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1077
1078 /*
1079 * If we need a large contiguous chunk of memory, or have
1080 * trouble getting a small set of contiguous pages, we
1081 * will reclaim both active and inactive pages.
1082 *
1083 * We use the same threshold as pageout congestion_wait below.
1084 */
1085 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1086 mode = ISOLATE_BOTH;
1087 else if (sc->order && priority < DEF_PRIORITY - 2)
1088 mode = ISOLATE_BOTH;
1089 1068
1090 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1069 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1091 &page_list, &nr_scan, sc->order, mode, 1070 &page_list, &nr_scan, sc->order, mode,
@@ -1122,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1122 * but that should be acceptable to the caller 1101 * but that should be acceptable to the caller
1123 */ 1102 */
1124 if (nr_freed < nr_taken && !current_is_kswapd() && 1103 if (nr_freed < nr_taken && !current_is_kswapd() &&
1125 sc->order > PAGE_ALLOC_COSTLY_ORDER) { 1104 lumpy_reclaim) {
1126 congestion_wait(WRITE, HZ/10); 1105 congestion_wait(WRITE, HZ/10);
1127 1106
1128 /* 1107 /*
@@ -1217,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1217 * But we had to alter page->flags anyway. 1196 * But we had to alter page->flags anyway.
1218 */ 1197 */
1219 1198
1199static void move_active_pages_to_lru(struct zone *zone,
1200 struct list_head *list,
1201 enum lru_list lru)
1202{
1203 unsigned long pgmoved = 0;
1204 struct pagevec pvec;
1205 struct page *page;
1206
1207 pagevec_init(&pvec, 1);
1208
1209 while (!list_empty(list)) {
1210 page = lru_to_page(list);
1211 prefetchw_prev_lru_page(page, list, flags);
1212
1213 VM_BUG_ON(PageLRU(page));
1214 SetPageLRU(page);
1215
1216 VM_BUG_ON(!PageActive(page));
1217 if (!is_active_lru(lru))
1218 ClearPageActive(page); /* we are de-activating */
1219
1220 list_move(&page->lru, &zone->lru[lru].list);
1221 mem_cgroup_add_lru_list(page, lru);
1222 pgmoved++;
1223
1224 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1225 spin_unlock_irq(&zone->lru_lock);
1226 if (buffer_heads_over_limit)
1227 pagevec_strip(&pvec);
1228 __pagevec_release(&pvec);
1229 spin_lock_irq(&zone->lru_lock);
1230 }
1231 }
1232 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1233 if (!is_active_lru(lru))
1234 __count_vm_events(PGDEACTIVATE, pgmoved);
1235}
1220 1236
1221static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1237static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1222 struct scan_control *sc, int priority, int file) 1238 struct scan_control *sc, int priority, int file)
1223{ 1239{
1224 unsigned long pgmoved; 1240 unsigned long pgmoved;
1225 int pgdeactivate = 0;
1226 unsigned long pgscanned; 1241 unsigned long pgscanned;
1242 unsigned long vm_flags;
1227 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1243 LIST_HEAD(l_hold); /* The pages which were snipped off */
1244 LIST_HEAD(l_active);
1228 LIST_HEAD(l_inactive); 1245 LIST_HEAD(l_inactive);
1229 struct page *page; 1246 struct page *page;
1230 struct pagevec pvec;
1231 enum lru_list lru;
1232 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1247 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1233 1248
1234 lru_add_drain(); 1249 lru_add_drain();
@@ -1245,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1245 } 1260 }
1246 reclaim_stat->recent_scanned[!!file] += pgmoved; 1261 reclaim_stat->recent_scanned[!!file] += pgmoved;
1247 1262
1263 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1248 if (file) 1264 if (file)
1249 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1265 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1250 else 1266 else
1251 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1267 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1252 spin_unlock_irq(&zone->lru_lock); 1268 spin_unlock_irq(&zone->lru_lock);
1253 1269
1254 pgmoved = 0; 1270 pgmoved = 0; /* count referenced (mapping) mapped pages */
1255 while (!list_empty(&l_hold)) { 1271 while (!list_empty(&l_hold)) {
1256 cond_resched(); 1272 cond_resched();
1257 page = lru_to_page(&l_hold); 1273 page = lru_to_page(&l_hold);
@@ -1264,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1264 1280
1265 /* page_referenced clears PageReferenced */ 1281 /* page_referenced clears PageReferenced */
1266 if (page_mapping_inuse(page) && 1282 if (page_mapping_inuse(page) &&
1267 page_referenced(page, 0, sc->mem_cgroup)) 1283 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1268 pgmoved++; 1284 pgmoved++;
1285 /*
1286 * Identify referenced, file-backed active pages and
1287 * give them one more trip around the active list. So
1288 * that executable code get better chances to stay in
1289 * memory under moderate memory pressure. Anon pages
1290 * are not likely to be evicted by use-once streaming
1291 * IO, plus JVM can create lots of anon VM_EXEC pages,
1292 * so we ignore them here.
1293 */
1294 if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
1295 list_add(&page->lru, &l_active);
1296 continue;
1297 }
1298 }
1269 1299
1270 list_add(&page->lru, &l_inactive); 1300 list_add(&page->lru, &l_inactive);
1271 } 1301 }
1272 1302
1273 /* 1303 /*
1274 * Move the pages to the [file or anon] inactive list. 1304 * Move pages back to the lru list.
1275 */ 1305 */
1276 pagevec_init(&pvec, 1);
1277 lru = LRU_BASE + file * LRU_FILE;
1278
1279 spin_lock_irq(&zone->lru_lock); 1306 spin_lock_irq(&zone->lru_lock);
1280 /* 1307 /*
1281 * Count referenced pages from currently used mappings as 1308 * Count referenced pages from currently used mappings as rotated,
1282 * rotated, even though they are moved to the inactive list. 1309 * even though only some of them are actually re-activated. This
1283 * This helps balance scan pressure between file and anonymous 1310 * helps balance scan pressure between file and anonymous pages in
1284 * pages in get_scan_ratio. 1311 * get_scan_ratio.
1285 */ 1312 */
1286 reclaim_stat->recent_rotated[!!file] += pgmoved; 1313 reclaim_stat->recent_rotated[!!file] += pgmoved;
1287 1314
1288 pgmoved = 0; 1315 move_active_pages_to_lru(zone, &l_active,
1289 while (!list_empty(&l_inactive)) { 1316 LRU_ACTIVE + file * LRU_FILE);
1290 page = lru_to_page(&l_inactive); 1317 move_active_pages_to_lru(zone, &l_inactive,
1291 prefetchw_prev_lru_page(page, &l_inactive, flags); 1318 LRU_BASE + file * LRU_FILE);
1292 VM_BUG_ON(PageLRU(page));
1293 SetPageLRU(page);
1294 VM_BUG_ON(!PageActive(page));
1295 ClearPageActive(page);
1296 1319
1297 list_move(&page->lru, &zone->lru[lru].list);
1298 mem_cgroup_add_lru_list(page, lru);
1299 pgmoved++;
1300 if (!pagevec_add(&pvec, page)) {
1301 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1302 spin_unlock_irq(&zone->lru_lock);
1303 pgdeactivate += pgmoved;
1304 pgmoved = 0;
1305 if (buffer_heads_over_limit)
1306 pagevec_strip(&pvec);
1307 __pagevec_release(&pvec);
1308 spin_lock_irq(&zone->lru_lock);
1309 }
1310 }
1311 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1312 pgdeactivate += pgmoved;
1313 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1314 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1315 spin_unlock_irq(&zone->lru_lock); 1320 spin_unlock_irq(&zone->lru_lock);
1316 if (buffer_heads_over_limit)
1317 pagevec_strip(&pvec);
1318 pagevec_release(&pvec);
1319} 1321}
1320 1322
1321static int inactive_anon_is_low_global(struct zone *zone) 1323static int inactive_anon_is_low_global(struct zone *zone)
@@ -1350,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1350 return low; 1352 return low;
1351} 1353}
1352 1354
1355static int inactive_file_is_low_global(struct zone *zone)
1356{
1357 unsigned long active, inactive;
1358
1359 active = zone_page_state(zone, NR_ACTIVE_FILE);
1360 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1361
1362 return (active > inactive);
1363}
1364
1365/**
1366 * inactive_file_is_low - check if file pages need to be deactivated
1367 * @zone: zone to check
1368 * @sc: scan control of this context
1369 *
1370 * When the system is doing streaming IO, memory pressure here
1371 * ensures that active file pages get deactivated, until more
1372 * than half of the file pages are on the inactive list.
1373 *
1374 * Once we get to that situation, protect the system's working
1375 * set from being evicted by disabling active file page aging.
1376 *
1377 * This uses a different ratio than the anonymous pages, because
1378 * the page cache uses a use-once replacement algorithm.
1379 */
1380static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1381{
1382 int low;
1383
1384 if (scanning_global_lru(sc))
1385 low = inactive_file_is_low_global(zone);
1386 else
1387 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1388 return low;
1389}
1390
1353static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1391static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1354 struct zone *zone, struct scan_control *sc, int priority) 1392 struct zone *zone, struct scan_control *sc, int priority)
1355{ 1393{
1356 int file = is_file_lru(lru); 1394 int file = is_file_lru(lru);
1357 1395
1358 if (lru == LRU_ACTIVE_FILE) { 1396 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
1359 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1397 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1360 return 0; 1398 return 0;
1361 } 1399 }
@@ -1384,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1384 unsigned long ap, fp; 1422 unsigned long ap, fp;
1385 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1423 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1386 1424
1387 /* If we have no swap space, do not bother scanning anon pages. */
1388 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1389 percent[0] = 0;
1390 percent[1] = 100;
1391 return;
1392 }
1393
1394 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1425 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1395 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1426 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1396 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1427 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1400,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1400 free = zone_page_state(zone, NR_FREE_PAGES); 1431 free = zone_page_state(zone, NR_FREE_PAGES);
1401 /* If we have very few page cache pages, 1432 /* If we have very few page cache pages,
1402 force-scan anon pages. */ 1433 force-scan anon pages. */
1403 if (unlikely(file + free <= zone->pages_high)) { 1434 if (unlikely(file + free <= high_wmark_pages(zone))) {
1404 percent[0] = 100; 1435 percent[0] = 100;
1405 percent[1] = 0; 1436 percent[1] = 0;
1406 return; 1437 return;
@@ -1455,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1455 percent[1] = 100 - percent[0]; 1486 percent[1] = 100 - percent[0];
1456} 1487}
1457 1488
1489/*
1490 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1491 * until we collected @swap_cluster_max pages to scan.
1492 */
1493static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1494 unsigned long *nr_saved_scan,
1495 unsigned long swap_cluster_max)
1496{
1497 unsigned long nr;
1498
1499 *nr_saved_scan += nr_to_scan;
1500 nr = *nr_saved_scan;
1501
1502 if (nr >= swap_cluster_max)
1503 *nr_saved_scan = 0;
1504 else
1505 nr = 0;
1506
1507 return nr;
1508}
1458 1509
1459/* 1510/*
1460 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1511 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
@@ -1468,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone,
1468 enum lru_list l; 1519 enum lru_list l;
1469 unsigned long nr_reclaimed = sc->nr_reclaimed; 1520 unsigned long nr_reclaimed = sc->nr_reclaimed;
1470 unsigned long swap_cluster_max = sc->swap_cluster_max; 1521 unsigned long swap_cluster_max = sc->swap_cluster_max;
1522 int noswap = 0;
1471 1523
1472 get_scan_ratio(zone, sc, percent); 1524 /* If we have no swap space, do not bother scanning anon pages. */
1525 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1526 noswap = 1;
1527 percent[0] = 0;
1528 percent[1] = 100;
1529 } else
1530 get_scan_ratio(zone, sc, percent);
1473 1531
1474 for_each_evictable_lru(l) { 1532 for_each_evictable_lru(l) {
1475 int file = is_file_lru(l); 1533 int file = is_file_lru(l);
1476 unsigned long scan; 1534 unsigned long scan;
1477 1535
1478 scan = zone_nr_pages(zone, sc, l); 1536 scan = zone_nr_pages(zone, sc, l);
1479 if (priority) { 1537 if (priority || noswap) {
1480 scan >>= priority; 1538 scan >>= priority;
1481 scan = (scan * percent[file]) / 100; 1539 scan = (scan * percent[file]) / 100;
1482 } 1540 }
1483 if (scanning_global_lru(sc)) { 1541 if (scanning_global_lru(sc))
1484 zone->lru[l].nr_scan += scan; 1542 nr[l] = nr_scan_try_batch(scan,
1485 nr[l] = zone->lru[l].nr_scan; 1543 &zone->lru[l].nr_saved_scan,
1486 if (nr[l] >= swap_cluster_max) 1544 swap_cluster_max);
1487 zone->lru[l].nr_scan = 0; 1545 else
1488 else
1489 nr[l] = 0;
1490 } else
1491 nr[l] = scan; 1546 nr[l] = scan;
1492 } 1547 }
1493 1548
@@ -1521,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone,
1521 * Even if we did not try to evict anon pages at all, we want to 1576 * Even if we did not try to evict anon pages at all, we want to
1522 * rebalance the anon lru active/inactive ratio. 1577 * rebalance the anon lru active/inactive ratio.
1523 */ 1578 */
1524 if (inactive_anon_is_low(zone, sc)) 1579 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
1525 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1580 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1526 1581
1527 throttle_vm_writeout(sc->gfp_mask); 1582 throttle_vm_writeout(sc->gfp_mask);
@@ -1532,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone,
1532 * try to reclaim pages from zones which will satisfy the caller's allocation 1587 * try to reclaim pages from zones which will satisfy the caller's allocation
1533 * request. 1588 * request.
1534 * 1589 *
1535 * We reclaim from a zone even if that zone is over pages_high. Because: 1590 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
1591 * Because:
1536 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 1592 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1537 * allocation or 1593 * allocation or
1538 * b) The zones may be over pages_high but they must go *over* pages_high to 1594 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
1539 * satisfy the `incremental min' zone defense algorithm. 1595 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
1596 * zone defense algorithm.
1540 * 1597 *
1541 * If a zone is deemed to be full of pinned pages then just give it a light 1598 * If a zone is deemed to be full of pinned pages then just give it a light
1542 * scan then give up on it. 1599 * scan then give up on it.
@@ -1742,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1742 1799
1743/* 1800/*
1744 * For kswapd, balance_pgdat() will work across all this node's zones until 1801 * For kswapd, balance_pgdat() will work across all this node's zones until
1745 * they are all at pages_high. 1802 * they are all at high_wmark_pages(zone).
1746 * 1803 *
1747 * Returns the number of pages which were actually freed. 1804 * Returns the number of pages which were actually freed.
1748 * 1805 *
@@ -1755,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1755 * the zone for when the problem goes away. 1812 * the zone for when the problem goes away.
1756 * 1813 *
1757 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1814 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1758 * zones which have free_pages > pages_high, but once a zone is found to have 1815 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
1759 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1816 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
1760 * of the number of free pages in the lower zones. This interoperates with 1817 * lower zones regardless of the number of free pages in the lower zones. This
1761 * the page allocator fallback scheme to ensure that aging of pages is balanced 1818 * interoperates with the page allocator fallback scheme to ensure that aging
1762 * across the zones. 1819 * of pages is balanced across the zones.
1763 */ 1820 */
1764static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1821static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1765{ 1822{
@@ -1780,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1780 }; 1837 };
1781 /* 1838 /*
1782 * temp_priority is used to remember the scanning priority at which 1839 * temp_priority is used to remember the scanning priority at which
1783 * this zone was successfully refilled to free_pages == pages_high. 1840 * this zone was successfully refilled to
1841 * free_pages == high_wmark_pages(zone).
1784 */ 1842 */
1785 int temp_priority[MAX_NR_ZONES]; 1843 int temp_priority[MAX_NR_ZONES];
1786 1844
@@ -1825,8 +1883,8 @@ loop_again:
1825 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1883 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1826 &sc, priority, 0); 1884 &sc, priority, 0);
1827 1885
1828 if (!zone_watermark_ok(zone, order, zone->pages_high, 1886 if (!zone_watermark_ok(zone, order,
1829 0, 0)) { 1887 high_wmark_pages(zone), 0, 0)) {
1830 end_zone = i; 1888 end_zone = i;
1831 break; 1889 break;
1832 } 1890 }
@@ -1860,8 +1918,8 @@ loop_again:
1860 priority != DEF_PRIORITY) 1918 priority != DEF_PRIORITY)
1861 continue; 1919 continue;
1862 1920
1863 if (!zone_watermark_ok(zone, order, zone->pages_high, 1921 if (!zone_watermark_ok(zone, order,
1864 end_zone, 0)) 1922 high_wmark_pages(zone), end_zone, 0))
1865 all_zones_ok = 0; 1923 all_zones_ok = 0;
1866 temp_priority[i] = priority; 1924 temp_priority[i] = priority;
1867 sc.nr_scanned = 0; 1925 sc.nr_scanned = 0;
@@ -1870,8 +1928,8 @@ loop_again:
1870 * We put equal pressure on every zone, unless one 1928 * We put equal pressure on every zone, unless one
1871 * zone has way too many pages free already. 1929 * zone has way too many pages free already.
1872 */ 1930 */
1873 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1931 if (!zone_watermark_ok(zone, order,
1874 end_zone, 0)) 1932 8*high_wmark_pages(zone), end_zone, 0))
1875 shrink_zone(priority, zone, &sc); 1933 shrink_zone(priority, zone, &sc);
1876 reclaim_state->reclaimed_slab = 0; 1934 reclaim_state->reclaimed_slab = 0;
1877 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1935 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2037,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2037 return; 2095 return;
2038 2096
2039 pgdat = zone->zone_pgdat; 2097 pgdat = zone->zone_pgdat;
2040 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 2098 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2041 return; 2099 return;
2042 if (pgdat->kswapd_max_order < order) 2100 if (pgdat->kswapd_max_order < order)
2043 pgdat->kswapd_max_order = order; 2101 pgdat->kswapd_max_order = order;
@@ -2084,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2084 l == LRU_ACTIVE_FILE)) 2142 l == LRU_ACTIVE_FILE))
2085 continue; 2143 continue;
2086 2144
2087 zone->lru[l].nr_scan += (lru_pages >> prio) + 1; 2145 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
2088 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { 2146 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
2089 unsigned long nr_to_scan; 2147 unsigned long nr_to_scan;
2090 2148
2091 zone->lru[l].nr_scan = 0; 2149 zone->lru[l].nr_saved_scan = 0;
2092 nr_to_scan = min(nr_pages, lru_pages); 2150 nr_to_scan = min(nr_pages, lru_pages);
2093 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2151 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2094 sc, prio); 2152 sc, prio);
@@ -2290,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1;
2290 */ 2348 */
2291int sysctl_min_slab_ratio = 5; 2349int sysctl_min_slab_ratio = 5;
2292 2350
2351static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2352{
2353 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2354 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2355 zone_page_state(zone, NR_ACTIVE_FILE);
2356
2357 /*
2358 * It's possible for there to be more file mapped pages than
2359 * accounted for by the pages on the file LRU lists because
2360 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
2361 */
2362 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2363}
2364
2365/* Work out how many page cache pages we can reclaim in this reclaim_mode */
2366static long zone_pagecache_reclaimable(struct zone *zone)
2367{
2368 long nr_pagecache_reclaimable;
2369 long delta = 0;
2370
2371 /*
2372 * If RECLAIM_SWAP is set, then all file pages are considered
2373 * potentially reclaimable. Otherwise, we have to worry about
2374 * pages like swapcache and zone_unmapped_file_pages() provides
2375 * a better estimate
2376 */
2377 if (zone_reclaim_mode & RECLAIM_SWAP)
2378 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2379 else
2380 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2381
2382 /* If we can't clean pages, remove dirty pages from consideration */
2383 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2384 delta += zone_page_state(zone, NR_FILE_DIRTY);
2385
2386 /* Watch for any possible underflows due to delta */
2387 if (unlikely(delta > nr_pagecache_reclaimable))
2388 delta = nr_pagecache_reclaimable;
2389
2390 return nr_pagecache_reclaimable - delta;
2391}
2392
2293/* 2393/*
2294 * Try to free up some pages from this zone through reclaim. 2394 * Try to free up some pages from this zone through reclaim.
2295 */ 2395 */
@@ -2324,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2324 reclaim_state.reclaimed_slab = 0; 2424 reclaim_state.reclaimed_slab = 0;
2325 p->reclaim_state = &reclaim_state; 2425 p->reclaim_state = &reclaim_state;
2326 2426
2327 if (zone_page_state(zone, NR_FILE_PAGES) - 2427 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2328 zone_page_state(zone, NR_FILE_MAPPED) >
2329 zone->min_unmapped_pages) {
2330 /* 2428 /*
2331 * Free memory by calling shrink zone with increasing 2429 * Free memory by calling shrink zone with increasing
2332 * priorities until we have enough memory freed. 2430 * priorities until we have enough memory freed.
@@ -2384,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2384 * if less than a specified percentage of the zone is used by 2482 * if less than a specified percentage of the zone is used by
2385 * unmapped file backed pages. 2483 * unmapped file backed pages.
2386 */ 2484 */
2387 if (zone_page_state(zone, NR_FILE_PAGES) - 2485 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
2388 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages 2486 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2389 && zone_page_state(zone, NR_SLAB_RECLAIMABLE) 2487 return ZONE_RECLAIM_FULL;
2390 <= zone->min_slab_pages)
2391 return 0;
2392 2488
2393 if (zone_is_all_unreclaimable(zone)) 2489 if (zone_is_all_unreclaimable(zone))
2394 return 0; 2490 return ZONE_RECLAIM_FULL;
2395 2491
2396 /* 2492 /*
2397 * Do not scan if the allocation should not be delayed. 2493 * Do not scan if the allocation should not be delayed.
2398 */ 2494 */
2399 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 2495 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
2400 return 0; 2496 return ZONE_RECLAIM_NOSCAN;
2401 2497
2402 /* 2498 /*
2403 * Only run zone reclaim on the local zone or on zones that do not 2499 * Only run zone reclaim on the local zone or on zones that do not
@@ -2407,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2407 */ 2503 */
2408 node_id = zone_to_nid(zone); 2504 node_id = zone_to_nid(zone);
2409 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 2505 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
2410 return 0; 2506 return ZONE_RECLAIM_NOSCAN;
2411 2507
2412 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 2508 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
2413 return 0; 2509 return ZONE_RECLAIM_NOSCAN;
2510
2414 ret = __zone_reclaim(zone, gfp_mask, order); 2511 ret = __zone_reclaim(zone, gfp_mask, order);
2415 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 2512 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
2416 2513
2514 if (!ret)
2515 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
2516
2417 return ret; 2517 return ret;
2418} 2518}
2419#endif 2519#endif
2420 2520
2421#ifdef CONFIG_UNEVICTABLE_LRU
2422/* 2521/*
2423 * page_evictable - test whether a page is evictable 2522 * page_evictable - test whether a page is evictable
2424 * @page: the page to test 2523 * @page: the page to test
@@ -2665,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node)
2665 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 2764 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2666} 2765}
2667 2766
2668#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 74d66dba0cbe..138bed53706e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = {
629 "nr_active_anon", 629 "nr_active_anon",
630 "nr_inactive_file", 630 "nr_inactive_file",
631 "nr_active_file", 631 "nr_active_file",
632#ifdef CONFIG_UNEVICTABLE_LRU
633 "nr_unevictable", 632 "nr_unevictable",
634 "nr_mlock", 633 "nr_mlock",
635#endif
636 "nr_anon_pages", 634 "nr_anon_pages",
637 "nr_mapped", 635 "nr_mapped",
638 "nr_file_pages", 636 "nr_file_pages",
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = {
675 TEXTS_FOR_ZONES("pgscan_kswapd") 673 TEXTS_FOR_ZONES("pgscan_kswapd")
676 TEXTS_FOR_ZONES("pgscan_direct") 674 TEXTS_FOR_ZONES("pgscan_direct")
677 675
676#ifdef CONFIG_NUMA
677 "zone_reclaim_failed",
678#endif
678 "pginodesteal", 679 "pginodesteal",
679 "slabs_scanned", 680 "slabs_scanned",
680 "kswapd_steal", 681 "kswapd_steal",
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = {
687 "htlb_buddy_alloc_success", 688 "htlb_buddy_alloc_success",
688 "htlb_buddy_alloc_fail", 689 "htlb_buddy_alloc_fail",
689#endif 690#endif
690#ifdef CONFIG_UNEVICTABLE_LRU
691 "unevictable_pgs_culled", 691 "unevictable_pgs_culled",
692 "unevictable_pgs_scanned", 692 "unevictable_pgs_scanned",
693 "unevictable_pgs_rescued", 693 "unevictable_pgs_rescued",
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = {
697 "unevictable_pgs_stranded", 697 "unevictable_pgs_stranded",
698 "unevictable_pgs_mlockfreed", 698 "unevictable_pgs_mlockfreed",
699#endif 699#endif
700#endif
701}; 700};
702 701
703static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 702static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
710 "\n min %lu" 709 "\n min %lu"
711 "\n low %lu" 710 "\n low %lu"
712 "\n high %lu" 711 "\n high %lu"
713 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" 712 "\n scanned %lu"
714 "\n spanned %lu" 713 "\n spanned %lu"
715 "\n present %lu", 714 "\n present %lu",
716 zone_page_state(zone, NR_FREE_PAGES), 715 zone_page_state(zone, NR_FREE_PAGES),
717 zone->pages_min, 716 min_wmark_pages(zone),
718 zone->pages_low, 717 low_wmark_pages(zone),
719 zone->pages_high, 718 high_wmark_pages(zone),
720 zone->pages_scanned, 719 zone->pages_scanned,
721 zone->lru[LRU_ACTIVE_ANON].nr_scan,
722 zone->lru[LRU_INACTIVE_ANON].nr_scan,
723 zone->lru[LRU_ACTIVE_FILE].nr_scan,
724 zone->lru[LRU_INACTIVE_FILE].nr_scan,
725 zone->spanned_pages, 720 zone->spanned_pages,
726 zone->present_pages); 721 zone->present_pages);
727 722