diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 18 | ||||
-rw-r--r-- | mm/Kconfig.debug | 1 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/bounce.c | 1 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 169 | ||||
-rw-r--r-- | mm/highmem.c | 1 | ||||
-rw-r--r-- | mm/hugetlb.c | 106 | ||||
-rw-r--r-- | mm/init-mm.c | 20 | ||||
-rw-r--r-- | mm/internal.h | 33 | ||||
-rw-r--r-- | mm/kmemcheck.c | 122 | ||||
-rw-r--r-- | mm/madvise.c | 26 | ||||
-rw-r--r-- | mm/memcontrol.c | 11 | ||||
-rw-r--r-- | mm/memory.c | 128 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 145 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/mlock.c | 22 | ||||
-rw-r--r-- | mm/oom_kill.c | 64 | ||||
-rw-r--r-- | mm/page-writeback.c | 19 | ||||
-rw-r--r-- | mm/page_alloc.c | 772 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 145 | ||||
-rw-r--r-- | mm/rmap.c | 40 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 117 | ||||
-rw-r--r-- | mm/slob.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 40 | ||||
-rw-r--r-- | mm/swap_state.c | 17 | ||||
-rw-r--r-- | mm/swapfile.c | 276 | ||||
-rw-r--r-- | mm/truncate.c | 39 | ||||
-rw-r--r-- | mm/util.c | 16 | ||||
-rw-r--r-- | mm/vmscan.c | 372 | ||||
-rw-r--r-- | mm/vmstat.c | 19 |
34 files changed, 1799 insertions, 966 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba7b986..c948d4ca8bde 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP | |||
128 | config MEMORY_HOTPLUG | 128 | config MEMORY_HOTPLUG |
129 | bool "Allow for memory hot-add" | 129 | bool "Allow for memory hot-add" |
130 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 130 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
131 | depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG | 131 | depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG |
132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) | 132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) |
133 | 133 | ||
134 | comment "Memory hotplug is currently incompatible with Software Suspend" | 134 | comment "Memory hotplug is currently incompatible with Software Suspend" |
135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION | 135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 |
136 | 136 | ||
137 | config MEMORY_HOTPLUG_SPARSE | 137 | config MEMORY_HOTPLUG_SPARSE |
138 | def_bool y | 138 | def_bool y |
@@ -203,25 +203,13 @@ config VIRT_TO_BUS | |||
203 | def_bool y | 203 | def_bool y |
204 | depends on !ARCH_NO_VIRT_TO_BUS | 204 | depends on !ARCH_NO_VIRT_TO_BUS |
205 | 205 | ||
206 | config UNEVICTABLE_LRU | ||
207 | bool "Add LRU list to track non-evictable pages" | ||
208 | default y | ||
209 | help | ||
210 | Keeps unevictable pages off of the active and inactive pageout | ||
211 | lists, so kswapd will not waste CPU time or have its balancing | ||
212 | algorithms thrown off by scanning these pages. Selecting this | ||
213 | will use one page flag and increase the code size a little, | ||
214 | say Y unless you know what you are doing. | ||
215 | |||
216 | See Documentation/vm/unevictable-lru.txt for more information. | ||
217 | |||
218 | config HAVE_MLOCK | 206 | config HAVE_MLOCK |
219 | bool | 207 | bool |
220 | default y if MMU=y | 208 | default y if MMU=y |
221 | 209 | ||
222 | config HAVE_MLOCKED_PAGE_BIT | 210 | config HAVE_MLOCKED_PAGE_BIT |
223 | bool | 211 | bool |
224 | default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y | 212 | default y if HAVE_MLOCK=y |
225 | 213 | ||
226 | config MMU_NOTIFIER | 214 | config MMU_NOTIFIER |
227 | bool | 215 | bool |
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index bb01e298f260..aa99fd1f7109 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC | |||
2 | bool "Debug page memory allocations" | 2 | bool "Debug page memory allocations" |
3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC | 3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC |
4 | depends on !HIBERNATION || !PPC && !SPARC | 4 | depends on !HIBERNATION || !PPC && !SPARC |
5 | depends on !KMEMCHECK | ||
5 | ---help--- | 6 | ---help--- |
6 | Unmap pages from the kernel linear mapping after free_pages(). | 7 | Unmap pages from the kernel linear mapping after free_pages(). |
7 | This results in a large slowdown, but helps to find certain types | 8 | This results in a large slowdown, but helps to find certain types |
diff --git a/mm/Makefile b/mm/Makefile index e89acb090b4d..5e0bd6426693 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o $(mmu-y) |
15 | obj-y += init-mm.o | ||
15 | 16 | ||
16 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | 17 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o |
17 | obj-$(CONFIG_BOUNCE) += bounce.o | 18 | obj-$(CONFIG_BOUNCE) += bounce.o |
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | |||
27 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 28 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
28 | obj-$(CONFIG_SLAB) += slab.o | 29 | obj-$(CONFIG_SLAB) += slab.o |
29 | obj-$(CONFIG_SLUB) += slub.o | 30 | obj-$(CONFIG_SLUB) += slub.o |
31 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | ||
30 | obj-$(CONFIG_FAILSLAB) += failslab.o | 32 | obj-$(CONFIG_FAILSLAB) += failslab.o |
31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
32 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
diff --git a/mm/bounce.c b/mm/bounce.c index 4ebe3ea83795..a2b76a588e34 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/hash.h> | 14 | #include <linux/hash.h> |
15 | #include <linux/highmem.h> | 15 | #include <linux/highmem.h> |
16 | #include <linux/blktrace_api.h> | ||
17 | #include <asm/tlbflush.h> | 16 | #include <asm/tlbflush.h> |
18 | 17 | ||
19 | #include <trace/events/block.h> | 18 | #include <trace/events/block.h> |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 54a0f8040afa..e43359214f6f 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
101 | 101 | ||
102 | ret = force_page_cache_readahead(mapping, file, | 102 | ret = force_page_cache_readahead(mapping, file, |
103 | start_index, | 103 | start_index, |
104 | max_sane_readahead(nrpages)); | 104 | nrpages); |
105 | if (ret > 0) | 105 | if (ret > 0) |
106 | ret = 0; | 106 | ret = 0; |
107 | break; | 107 | break; |
diff --git a/mm/filemap.c b/mm/filemap.c index 1b60f30cebfa..22396713feb9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
521 | { | 521 | { |
522 | if (cpuset_do_page_mem_spread()) { | 522 | if (cpuset_do_page_mem_spread()) { |
523 | int n = cpuset_mem_spread_node(); | 523 | int n = cpuset_mem_spread_node(); |
524 | return alloc_pages_node(n, gfp, 0); | 524 | return alloc_pages_exact_node(n, gfp, 0); |
525 | } | 525 | } |
526 | return alloc_pages(gfp, 0); | 526 | return alloc_pages(gfp, 0); |
527 | } | 527 | } |
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); | |||
1004 | static void shrink_readahead_size_eio(struct file *filp, | 1004 | static void shrink_readahead_size_eio(struct file *filp, |
1005 | struct file_ra_state *ra) | 1005 | struct file_ra_state *ra) |
1006 | { | 1006 | { |
1007 | if (!ra->ra_pages) | ||
1008 | return; | ||
1009 | |||
1010 | ra->ra_pages /= 4; | 1007 | ra->ra_pages /= 4; |
1011 | } | 1008 | } |
1012 | 1009 | ||
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
1390 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1387 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
1391 | return -EINVAL; | 1388 | return -EINVAL; |
1392 | 1389 | ||
1393 | force_page_cache_readahead(mapping, filp, index, | 1390 | force_page_cache_readahead(mapping, filp, index, nr); |
1394 | max_sane_readahead(nr)); | ||
1395 | return 0; | 1391 | return 0; |
1396 | } | 1392 | } |
1397 | 1393 | ||
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) | |||
1457 | 1453 | ||
1458 | #define MMAP_LOTSAMISS (100) | 1454 | #define MMAP_LOTSAMISS (100) |
1459 | 1455 | ||
1456 | /* | ||
1457 | * Synchronous readahead happens when we don't even find | ||
1458 | * a page in the page cache at all. | ||
1459 | */ | ||
1460 | static void do_sync_mmap_readahead(struct vm_area_struct *vma, | ||
1461 | struct file_ra_state *ra, | ||
1462 | struct file *file, | ||
1463 | pgoff_t offset) | ||
1464 | { | ||
1465 | unsigned long ra_pages; | ||
1466 | struct address_space *mapping = file->f_mapping; | ||
1467 | |||
1468 | /* If we don't want any read-ahead, don't bother */ | ||
1469 | if (VM_RandomReadHint(vma)) | ||
1470 | return; | ||
1471 | |||
1472 | if (VM_SequentialReadHint(vma) || | ||
1473 | offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { | ||
1474 | page_cache_sync_readahead(mapping, ra, file, offset, | ||
1475 | ra->ra_pages); | ||
1476 | return; | ||
1477 | } | ||
1478 | |||
1479 | if (ra->mmap_miss < INT_MAX) | ||
1480 | ra->mmap_miss++; | ||
1481 | |||
1482 | /* | ||
1483 | * Do we miss much more than hit in this file? If so, | ||
1484 | * stop bothering with read-ahead. It will only hurt. | ||
1485 | */ | ||
1486 | if (ra->mmap_miss > MMAP_LOTSAMISS) | ||
1487 | return; | ||
1488 | |||
1489 | /* | ||
1490 | * mmap read-around | ||
1491 | */ | ||
1492 | ra_pages = max_sane_readahead(ra->ra_pages); | ||
1493 | if (ra_pages) { | ||
1494 | ra->start = max_t(long, 0, offset - ra_pages/2); | ||
1495 | ra->size = ra_pages; | ||
1496 | ra->async_size = 0; | ||
1497 | ra_submit(ra, mapping, file); | ||
1498 | } | ||
1499 | } | ||
1500 | |||
1501 | /* | ||
1502 | * Asynchronous readahead happens when we find the page and PG_readahead, | ||
1503 | * so we want to possibly extend the readahead further.. | ||
1504 | */ | ||
1505 | static void do_async_mmap_readahead(struct vm_area_struct *vma, | ||
1506 | struct file_ra_state *ra, | ||
1507 | struct file *file, | ||
1508 | struct page *page, | ||
1509 | pgoff_t offset) | ||
1510 | { | ||
1511 | struct address_space *mapping = file->f_mapping; | ||
1512 | |||
1513 | /* If we don't want any read-ahead, don't bother */ | ||
1514 | if (VM_RandomReadHint(vma)) | ||
1515 | return; | ||
1516 | if (ra->mmap_miss > 0) | ||
1517 | ra->mmap_miss--; | ||
1518 | if (PageReadahead(page)) | ||
1519 | page_cache_async_readahead(mapping, ra, file, | ||
1520 | page, offset, ra->ra_pages); | ||
1521 | } | ||
1522 | |||
1460 | /** | 1523 | /** |
1461 | * filemap_fault - read in file data for page fault handling | 1524 | * filemap_fault - read in file data for page fault handling |
1462 | * @vma: vma in which the fault was taken | 1525 | * @vma: vma in which the fault was taken |
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1476 | struct address_space *mapping = file->f_mapping; | 1539 | struct address_space *mapping = file->f_mapping; |
1477 | struct file_ra_state *ra = &file->f_ra; | 1540 | struct file_ra_state *ra = &file->f_ra; |
1478 | struct inode *inode = mapping->host; | 1541 | struct inode *inode = mapping->host; |
1542 | pgoff_t offset = vmf->pgoff; | ||
1479 | struct page *page; | 1543 | struct page *page; |
1480 | pgoff_t size; | 1544 | pgoff_t size; |
1481 | int did_readaround = 0; | ||
1482 | int ret = 0; | 1545 | int ret = 0; |
1483 | 1546 | ||
1484 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1547 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1485 | if (vmf->pgoff >= size) | 1548 | if (offset >= size) |
1486 | return VM_FAULT_SIGBUS; | 1549 | return VM_FAULT_SIGBUS; |
1487 | 1550 | ||
1488 | /* If we don't want any read-ahead, don't bother */ | ||
1489 | if (VM_RandomReadHint(vma)) | ||
1490 | goto no_cached_page; | ||
1491 | |||
1492 | /* | 1551 | /* |
1493 | * Do we have something in the page cache already? | 1552 | * Do we have something in the page cache already? |
1494 | */ | 1553 | */ |
1495 | retry_find: | 1554 | page = find_get_page(mapping, offset); |
1496 | page = find_lock_page(mapping, vmf->pgoff); | 1555 | if (likely(page)) { |
1497 | /* | ||
1498 | * For sequential accesses, we use the generic readahead logic. | ||
1499 | */ | ||
1500 | if (VM_SequentialReadHint(vma)) { | ||
1501 | if (!page) { | ||
1502 | page_cache_sync_readahead(mapping, ra, file, | ||
1503 | vmf->pgoff, 1); | ||
1504 | page = find_lock_page(mapping, vmf->pgoff); | ||
1505 | if (!page) | ||
1506 | goto no_cached_page; | ||
1507 | } | ||
1508 | if (PageReadahead(page)) { | ||
1509 | page_cache_async_readahead(mapping, ra, file, page, | ||
1510 | vmf->pgoff, 1); | ||
1511 | } | ||
1512 | } | ||
1513 | |||
1514 | if (!page) { | ||
1515 | unsigned long ra_pages; | ||
1516 | |||
1517 | ra->mmap_miss++; | ||
1518 | |||
1519 | /* | 1556 | /* |
1520 | * Do we miss much more than hit in this file? If so, | 1557 | * We found the page, so try async readahead before |
1521 | * stop bothering with read-ahead. It will only hurt. | 1558 | * waiting for the lock. |
1522 | */ | 1559 | */ |
1523 | if (ra->mmap_miss > MMAP_LOTSAMISS) | 1560 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1524 | goto no_cached_page; | 1561 | lock_page(page); |
1525 | 1562 | ||
1526 | /* | 1563 | /* Did it get truncated? */ |
1527 | * To keep the pgmajfault counter straight, we need to | 1564 | if (unlikely(page->mapping != mapping)) { |
1528 | * check did_readaround, as this is an inner loop. | 1565 | unlock_page(page); |
1529 | */ | 1566 | put_page(page); |
1530 | if (!did_readaround) { | 1567 | goto no_cached_page; |
1531 | ret = VM_FAULT_MAJOR; | ||
1532 | count_vm_event(PGMAJFAULT); | ||
1533 | } | ||
1534 | did_readaround = 1; | ||
1535 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | ||
1536 | if (ra_pages) { | ||
1537 | pgoff_t start = 0; | ||
1538 | |||
1539 | if (vmf->pgoff > ra_pages / 2) | ||
1540 | start = vmf->pgoff - ra_pages / 2; | ||
1541 | do_page_cache_readahead(mapping, file, start, ra_pages); | ||
1542 | } | 1568 | } |
1543 | page = find_lock_page(mapping, vmf->pgoff); | 1569 | } else { |
1570 | /* No page in the page cache at all */ | ||
1571 | do_sync_mmap_readahead(vma, ra, file, offset); | ||
1572 | count_vm_event(PGMAJFAULT); | ||
1573 | ret = VM_FAULT_MAJOR; | ||
1574 | retry_find: | ||
1575 | page = find_lock_page(mapping, offset); | ||
1544 | if (!page) | 1576 | if (!page) |
1545 | goto no_cached_page; | 1577 | goto no_cached_page; |
1546 | } | 1578 | } |
1547 | 1579 | ||
1548 | if (!did_readaround) | ||
1549 | ra->mmap_miss--; | ||
1550 | |||
1551 | /* | 1580 | /* |
1552 | * We have a locked page in the page cache, now we need to check | 1581 | * We have a locked page in the page cache, now we need to check |
1553 | * that it's up-to-date. If not, it is going to be due to an error. | 1582 | * that it's up-to-date. If not, it is going to be due to an error. |
@@ -1555,18 +1584,18 @@ retry_find: | |||
1555 | if (unlikely(!PageUptodate(page))) | 1584 | if (unlikely(!PageUptodate(page))) |
1556 | goto page_not_uptodate; | 1585 | goto page_not_uptodate; |
1557 | 1586 | ||
1558 | /* Must recheck i_size under page lock */ | 1587 | /* |
1588 | * Found the page and have a reference on it. | ||
1589 | * We must recheck i_size under page lock. | ||
1590 | */ | ||
1559 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1591 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1560 | if (unlikely(vmf->pgoff >= size)) { | 1592 | if (unlikely(offset >= size)) { |
1561 | unlock_page(page); | 1593 | unlock_page(page); |
1562 | page_cache_release(page); | 1594 | page_cache_release(page); |
1563 | return VM_FAULT_SIGBUS; | 1595 | return VM_FAULT_SIGBUS; |
1564 | } | 1596 | } |
1565 | 1597 | ||
1566 | /* | 1598 | ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; |
1567 | * Found the page and have a reference on it. | ||
1568 | */ | ||
1569 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | ||
1570 | vmf->page = page; | 1599 | vmf->page = page; |
1571 | return ret | VM_FAULT_LOCKED; | 1600 | return ret | VM_FAULT_LOCKED; |
1572 | 1601 | ||
@@ -1575,7 +1604,7 @@ no_cached_page: | |||
1575 | * We're only likely to ever get here if MADV_RANDOM is in | 1604 | * We're only likely to ever get here if MADV_RANDOM is in |
1576 | * effect. | 1605 | * effect. |
1577 | */ | 1606 | */ |
1578 | error = page_cache_read(file, vmf->pgoff); | 1607 | error = page_cache_read(file, offset); |
1579 | 1608 | ||
1580 | /* | 1609 | /* |
1581 | * The page we want has now been added to the page cache. | 1610 | * The page we want has now been added to the page cache. |
@@ -1595,12 +1624,6 @@ no_cached_page: | |||
1595 | return VM_FAULT_SIGBUS; | 1624 | return VM_FAULT_SIGBUS; |
1596 | 1625 | ||
1597 | page_not_uptodate: | 1626 | page_not_uptodate: |
1598 | /* IO error path */ | ||
1599 | if (!did_readaround) { | ||
1600 | ret = VM_FAULT_MAJOR; | ||
1601 | count_vm_event(PGMAJFAULT); | ||
1602 | } | ||
1603 | |||
1604 | /* | 1627 | /* |
1605 | * Umm, take care of errors if the page isn't up-to-date. | 1628 | * Umm, take care of errors if the page isn't up-to-date. |
1606 | * Try to re-read it _once_. We do this synchronously, | 1629 | * Try to re-read it _once_. We do this synchronously, |
diff --git a/mm/highmem.c b/mm/highmem.c index 68eb1d9b63fa..25878cc49daa 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -26,7 +26,6 @@ | |||
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/hash.h> | 27 | #include <linux/hash.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/blktrace_api.h> | ||
30 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
31 | 30 | ||
32 | /* | 31 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228c..a56e6f3ce979 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) | |||
578 | hugetlb_put_quota(mapping, 1); | 578 | hugetlb_put_quota(mapping, 1); |
579 | } | 579 | } |
580 | 580 | ||
581 | /* | ||
582 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
583 | * balanced by operating on them in a round-robin fashion. | ||
584 | * Returns 1 if an adjustment was made. | ||
585 | */ | ||
586 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
587 | { | ||
588 | static int prev_nid; | ||
589 | int nid = prev_nid; | ||
590 | int ret = 0; | ||
591 | |||
592 | VM_BUG_ON(delta != -1 && delta != 1); | ||
593 | do { | ||
594 | nid = next_node(nid, node_online_map); | ||
595 | if (nid == MAX_NUMNODES) | ||
596 | nid = first_node(node_online_map); | ||
597 | |||
598 | /* To shrink on this node, there must be a surplus page */ | ||
599 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
600 | continue; | ||
601 | /* Surplus cannot exceed the total number of pages */ | ||
602 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
603 | h->nr_huge_pages_node[nid]) | ||
604 | continue; | ||
605 | |||
606 | h->surplus_huge_pages += delta; | ||
607 | h->surplus_huge_pages_node[nid] += delta; | ||
608 | ret = 1; | ||
609 | break; | ||
610 | } while (nid != prev_nid); | ||
611 | |||
612 | prev_nid = nid; | ||
613 | return ret; | ||
614 | } | ||
615 | |||
616 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 581 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
617 | { | 582 | { |
618 | set_compound_page_dtor(page, free_huge_page); | 583 | set_compound_page_dtor(page, free_huge_page); |
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |||
623 | put_page(page); /* free it into the hugepage allocator */ | 588 | put_page(page); /* free it into the hugepage allocator */ |
624 | } | 589 | } |
625 | 590 | ||
591 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
592 | { | ||
593 | int i; | ||
594 | int nr_pages = 1 << order; | ||
595 | struct page *p = page + 1; | ||
596 | |||
597 | /* we rely on prep_new_huge_page to set the destructor */ | ||
598 | set_compound_order(page, order); | ||
599 | __SetPageHead(page); | ||
600 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
601 | __SetPageTail(p); | ||
602 | p->first_page = page; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | int PageHuge(struct page *page) | ||
607 | { | ||
608 | compound_page_dtor *dtor; | ||
609 | |||
610 | if (!PageCompound(page)) | ||
611 | return 0; | ||
612 | |||
613 | page = compound_head(page); | ||
614 | dtor = get_compound_page_dtor(page); | ||
615 | |||
616 | return dtor == free_huge_page; | ||
617 | } | ||
618 | |||
626 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 619 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
627 | { | 620 | { |
628 | struct page *page; | 621 | struct page *page; |
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
630 | if (h->order >= MAX_ORDER) | 623 | if (h->order >= MAX_ORDER) |
631 | return NULL; | 624 | return NULL; |
632 | 625 | ||
633 | page = alloc_pages_node(nid, | 626 | page = alloc_pages_exact_node(nid, |
634 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 627 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
635 | __GFP_REPEAT|__GFP_NOWARN, | 628 | __GFP_REPEAT|__GFP_NOWARN, |
636 | huge_page_order(h)); | 629 | huge_page_order(h)); |
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
649 | * Use a helper variable to find the next node and then | 642 | * Use a helper variable to find the next node and then |
650 | * copy it back to hugetlb_next_nid afterwards: | 643 | * copy it back to hugetlb_next_nid afterwards: |
651 | * otherwise there's a window in which a racer might | 644 | * otherwise there's a window in which a racer might |
652 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | 645 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. |
653 | * But we don't need to use a spin_lock here: it really | 646 | * But we don't need to use a spin_lock here: it really |
654 | * doesn't matter if occasionally a racer chooses the | 647 | * doesn't matter if occasionally a racer chooses the |
655 | * same nid as we do. Move nid forward in the mask even | 648 | * same nid as we do. Move nid forward in the mask even |
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
875 | * can no longer free unreserved surplus pages. This occurs when | 868 | * can no longer free unreserved surplus pages. This occurs when |
876 | * the nodes with surplus pages have no free pages. | 869 | * the nodes with surplus pages have no free pages. |
877 | */ | 870 | */ |
878 | unsigned long remaining_iterations = num_online_nodes(); | 871 | unsigned long remaining_iterations = nr_online_nodes; |
879 | 872 | ||
880 | /* Uncommit the reservation */ | 873 | /* Uncommit the reservation */ |
881 | h->resv_huge_pages -= unused_resv_pages; | 874 | h->resv_huge_pages -= unused_resv_pages; |
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
904 | h->surplus_huge_pages--; | 897 | h->surplus_huge_pages--; |
905 | h->surplus_huge_pages_node[nid]--; | 898 | h->surplus_huge_pages_node[nid]--; |
906 | nr_pages--; | 899 | nr_pages--; |
907 | remaining_iterations = num_online_nodes(); | 900 | remaining_iterations = nr_online_nodes; |
908 | } | 901 | } |
909 | } | 902 | } |
910 | } | 903 | } |
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1140 | } | 1133 | } |
1141 | #endif | 1134 | #endif |
1142 | 1135 | ||
1136 | /* | ||
1137 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
1138 | * balanced by operating on them in a round-robin fashion. | ||
1139 | * Returns 1 if an adjustment was made. | ||
1140 | */ | ||
1141 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
1142 | { | ||
1143 | static int prev_nid; | ||
1144 | int nid = prev_nid; | ||
1145 | int ret = 0; | ||
1146 | |||
1147 | VM_BUG_ON(delta != -1 && delta != 1); | ||
1148 | do { | ||
1149 | nid = next_node(nid, node_online_map); | ||
1150 | if (nid == MAX_NUMNODES) | ||
1151 | nid = first_node(node_online_map); | ||
1152 | |||
1153 | /* To shrink on this node, there must be a surplus page */ | ||
1154 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
1155 | continue; | ||
1156 | /* Surplus cannot exceed the total number of pages */ | ||
1157 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
1158 | h->nr_huge_pages_node[nid]) | ||
1159 | continue; | ||
1160 | |||
1161 | h->surplus_huge_pages += delta; | ||
1162 | h->surplus_huge_pages_node[nid] += delta; | ||
1163 | ret = 1; | ||
1164 | break; | ||
1165 | } while (nid != prev_nid); | ||
1166 | |||
1167 | prev_nid = nid; | ||
1168 | return ret; | ||
1169 | } | ||
1170 | |||
1143 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1171 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
1144 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1172 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) |
1145 | { | 1173 | { |
diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 000000000000..57aba0da9668 --- /dev/null +++ b/mm/init-mm.c | |||
@@ -0,0 +1,20 @@ | |||
1 | #include <linux/mm_types.h> | ||
2 | #include <linux/rbtree.h> | ||
3 | #include <linux/rwsem.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/list.h> | ||
6 | #include <linux/cpumask.h> | ||
7 | |||
8 | #include <asm/atomic.h> | ||
9 | #include <asm/pgtable.h> | ||
10 | |||
11 | struct mm_struct init_mm = { | ||
12 | .mm_rb = RB_ROOT, | ||
13 | .pgd = swapper_pg_dir, | ||
14 | .mm_users = ATOMIC_INIT(2), | ||
15 | .mm_count = ATOMIC_INIT(1), | ||
16 | .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), | ||
17 | .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), | ||
18 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), | ||
19 | .cpu_vm_mask = CPU_MASK_ALL, | ||
20 | }; | ||
diff --git a/mm/internal.h b/mm/internal.h index 987bb03fbdd8..f290c4db528b 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -16,9 +16,6 @@ | |||
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
17 | unsigned long floor, unsigned long ceiling); | 17 | unsigned long floor, unsigned long ceiling); |
18 | 18 | ||
19 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
20 | extern void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
21 | |||
22 | static inline void set_page_count(struct page *page, int v) | 19 | static inline void set_page_count(struct page *page, int v) |
23 | { | 20 | { |
24 | atomic_set(&page->_count, v); | 21 | atomic_set(&page->_count, v); |
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); | |||
51 | */ | 48 | */ |
52 | extern unsigned long highest_memmap_pfn; | 49 | extern unsigned long highest_memmap_pfn; |
53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 50 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
51 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
52 | |||
54 | 53 | ||
55 | /* | 54 | /* |
56 | * function for dealing with page's order in buddy system. | 55 | * function for dealing with page's order in buddy system. |
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
74 | } | 73 | } |
75 | #endif | 74 | #endif |
76 | 75 | ||
77 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
78 | /* | 76 | /* |
79 | * unevictable_migrate_page() called only from migrate_page_copy() to | 77 | * unevictable_migrate_page() called only from migrate_page_copy() to |
80 | * migrate unevictable flag to new page. | 78 | * migrate unevictable flag to new page. |
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) | |||
86 | if (TestClearPageUnevictable(old)) | 84 | if (TestClearPageUnevictable(old)) |
87 | SetPageUnevictable(new); | 85 | SetPageUnevictable(new); |
88 | } | 86 | } |
89 | #else | ||
90 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
91 | { | ||
92 | } | ||
93 | #endif | ||
94 | 87 | ||
95 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | 88 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT |
96 | /* | 89 | /* |
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
150 | } | 143 | } |
151 | } | 144 | } |
152 | 145 | ||
153 | /* | ||
154 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
155 | * Page should not be on lru, so no need to fix that up. | ||
156 | * free_pages_check() will verify... | ||
157 | */ | ||
158 | static inline void free_page_mlock(struct page *page) | ||
159 | { | ||
160 | if (unlikely(TestClearPageMlocked(page))) { | ||
161 | unsigned long flags; | ||
162 | |||
163 | local_irq_save(flags); | ||
164 | __dec_zone_page_state(page, NR_MLOCK); | ||
165 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
166 | local_irq_restore(flags); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
171 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 147 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
172 | { | 148 | { |
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | |||
175 | static inline void clear_page_mlock(struct page *page) { } | 151 | static inline void clear_page_mlock(struct page *page) { } |
176 | static inline void mlock_vma_page(struct page *page) { } | 152 | static inline void mlock_vma_page(struct page *page) { } |
177 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 153 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
178 | static inline void free_page_mlock(struct page *page) { } | ||
179 | 154 | ||
180 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 155 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
181 | 156 | ||
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
284 | unsigned long start, int len, int flags, | 259 | unsigned long start, int len, int flags, |
285 | struct page **pages, struct vm_area_struct **vmas); | 260 | struct page **pages, struct vm_area_struct **vmas); |
286 | 261 | ||
262 | #define ZONE_RECLAIM_NOSCAN -2 | ||
263 | #define ZONE_RECLAIM_FULL -1 | ||
264 | #define ZONE_RECLAIM_SOME 0 | ||
265 | #define ZONE_RECLAIM_SUCCESS 1 | ||
287 | #endif | 266 | #endif |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 000000000000..fd814fd61319 --- /dev/null +++ b/mm/kmemcheck.c | |||
@@ -0,0 +1,122 @@ | |||
1 | #include <linux/gfp.h> | ||
2 | #include <linux/mm_types.h> | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/slab.h> | ||
5 | #include <linux/kmemcheck.h> | ||
6 | |||
7 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | ||
8 | { | ||
9 | struct page *shadow; | ||
10 | int pages; | ||
11 | int i; | ||
12 | |||
13 | pages = 1 << order; | ||
14 | |||
15 | /* | ||
16 | * With kmemcheck enabled, we need to allocate a memory area for the | ||
17 | * shadow bits as well. | ||
18 | */ | ||
19 | shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); | ||
20 | if (!shadow) { | ||
21 | if (printk_ratelimit()) | ||
22 | printk(KERN_ERR "kmemcheck: failed to allocate " | ||
23 | "shadow bitmap\n"); | ||
24 | return; | ||
25 | } | ||
26 | |||
27 | for(i = 0; i < pages; ++i) | ||
28 | page[i].shadow = page_address(&shadow[i]); | ||
29 | |||
30 | /* | ||
31 | * Mark it as non-present for the MMU so that our accesses to | ||
32 | * this memory will trigger a page fault and let us analyze | ||
33 | * the memory accesses. | ||
34 | */ | ||
35 | kmemcheck_hide_pages(page, pages); | ||
36 | } | ||
37 | |||
38 | void kmemcheck_free_shadow(struct page *page, int order) | ||
39 | { | ||
40 | struct page *shadow; | ||
41 | int pages; | ||
42 | int i; | ||
43 | |||
44 | if (!kmemcheck_page_is_tracked(page)) | ||
45 | return; | ||
46 | |||
47 | pages = 1 << order; | ||
48 | |||
49 | kmemcheck_show_pages(page, pages); | ||
50 | |||
51 | shadow = virt_to_page(page[0].shadow); | ||
52 | |||
53 | for(i = 0; i < pages; ++i) | ||
54 | page[i].shadow = NULL; | ||
55 | |||
56 | __free_pages(shadow, order); | ||
57 | } | ||
58 | |||
59 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | ||
60 | size_t size) | ||
61 | { | ||
62 | /* | ||
63 | * Has already been memset(), which initializes the shadow for us | ||
64 | * as well. | ||
65 | */ | ||
66 | if (gfpflags & __GFP_ZERO) | ||
67 | return; | ||
68 | |||
69 | /* No need to initialize the shadow of a non-tracked slab. */ | ||
70 | if (s->flags & SLAB_NOTRACK) | ||
71 | return; | ||
72 | |||
73 | if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { | ||
74 | /* | ||
75 | * Allow notracked objects to be allocated from | ||
76 | * tracked caches. Note however that these objects | ||
77 | * will still get page faults on access, they just | ||
78 | * won't ever be flagged as uninitialized. If page | ||
79 | * faults are not acceptable, the slab cache itself | ||
80 | * should be marked NOTRACK. | ||
81 | */ | ||
82 | kmemcheck_mark_initialized(object, size); | ||
83 | } else if (!s->ctor) { | ||
84 | /* | ||
85 | * New objects should be marked uninitialized before | ||
86 | * they're returned to the called. | ||
87 | */ | ||
88 | kmemcheck_mark_uninitialized(object, size); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | ||
93 | { | ||
94 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | ||
95 | if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) | ||
96 | kmemcheck_mark_freed(object, size); | ||
97 | } | ||
98 | |||
99 | void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, | ||
100 | gfp_t gfpflags) | ||
101 | { | ||
102 | int pages; | ||
103 | |||
104 | if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) | ||
105 | return; | ||
106 | |||
107 | pages = 1 << order; | ||
108 | |||
109 | /* | ||
110 | * NOTE: We choose to track GFP_ZERO pages too; in fact, they | ||
111 | * can become uninitialized by copying uninitialized memory | ||
112 | * into them. | ||
113 | */ | ||
114 | |||
115 | /* XXX: Can use zone->node for node? */ | ||
116 | kmemcheck_alloc_shadow(page, order, gfpflags, -1); | ||
117 | |||
118 | if (gfpflags & __GFP_ZERO) | ||
119 | kmemcheck_mark_initialized_pages(page, pages); | ||
120 | else | ||
121 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
122 | } | ||
diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574827c8..76eb4193acdd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
123 | end = vma->vm_end; | 123 | end = vma->vm_end; |
124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
125 | 125 | ||
126 | force_page_cache_readahead(file->f_mapping, | 126 | force_page_cache_readahead(file->f_mapping, file, start, end - start); |
127 | file, start, max_sane_readahead(end - start)); | ||
128 | return 0; | 127 | return 0; |
129 | } | 128 | } |
130 | 129 | ||
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
239 | break; | 238 | break; |
240 | 239 | ||
241 | default: | 240 | default: |
242 | error = -EINVAL; | 241 | BUG(); |
243 | break; | 242 | break; |
244 | } | 243 | } |
245 | return error; | 244 | return error; |
246 | } | 245 | } |
247 | 246 | ||
247 | static int | ||
248 | madvise_behavior_valid(int behavior) | ||
249 | { | ||
250 | switch (behavior) { | ||
251 | case MADV_DOFORK: | ||
252 | case MADV_DONTFORK: | ||
253 | case MADV_NORMAL: | ||
254 | case MADV_SEQUENTIAL: | ||
255 | case MADV_RANDOM: | ||
256 | case MADV_REMOVE: | ||
257 | case MADV_WILLNEED: | ||
258 | case MADV_DONTNEED: | ||
259 | return 1; | ||
260 | |||
261 | default: | ||
262 | return 0; | ||
263 | } | ||
264 | } | ||
248 | /* | 265 | /* |
249 | * The madvise(2) system call. | 266 | * The madvise(2) system call. |
250 | * | 267 | * |
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
290 | int write; | 307 | int write; |
291 | size_t len; | 308 | size_t len; |
292 | 309 | ||
310 | if (!madvise_behavior_valid(behavior)) | ||
311 | return error; | ||
312 | |||
293 | write = madvise_need_mmap_write(behavior); | 313 | write = madvise_need_mmap_write(behavior); |
294 | if (write) | 314 | if (write) |
295 | down_write(¤t->mm->mmap_sem); | 315 | down_write(¤t->mm->mmap_sem); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78eb8552818b..70db6e0a5eec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | |||
570 | return 0; | 570 | return 0; |
571 | } | 571 | } |
572 | 572 | ||
573 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | ||
574 | { | ||
575 | unsigned long active; | ||
576 | unsigned long inactive; | ||
577 | |||
578 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | ||
579 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | ||
580 | |||
581 | return (active > inactive); | ||
582 | } | ||
583 | |||
573 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | 584 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, |
574 | struct zone *zone, | 585 | struct zone *zone, |
575 | enum lru_list lru) | 586 | enum lru_list lru) |
diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778c..d5d1653d60a6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1360 | return i; | 1360 | return i; |
1361 | } | 1361 | } |
1362 | 1362 | ||
1363 | /** | ||
1364 | * get_user_pages() - pin user pages in memory | ||
1365 | * @tsk: task_struct of target task | ||
1366 | * @mm: mm_struct of target mm | ||
1367 | * @start: starting user address | ||
1368 | * @len: number of pages from start to pin | ||
1369 | * @write: whether pages will be written to by the caller | ||
1370 | * @force: whether to force write access even if user mapping is | ||
1371 | * readonly. This will result in the page being COWed even | ||
1372 | * in MAP_SHARED mappings. You do not want this. | ||
1373 | * @pages: array that receives pointers to the pages pinned. | ||
1374 | * Should be at least nr_pages long. Or NULL, if caller | ||
1375 | * only intends to ensure the pages are faulted in. | ||
1376 | * @vmas: array of pointers to vmas corresponding to each page. | ||
1377 | * Or NULL if the caller does not require them. | ||
1378 | * | ||
1379 | * Returns number of pages pinned. This may be fewer than the number | ||
1380 | * requested. If len is 0 or negative, returns 0. If no pages | ||
1381 | * were pinned, returns -errno. Each page returned must be released | ||
1382 | * with a put_page() call when it is finished with. vmas will only | ||
1383 | * remain valid while mmap_sem is held. | ||
1384 | * | ||
1385 | * Must be called with mmap_sem held for read or write. | ||
1386 | * | ||
1387 | * get_user_pages walks a process's page tables and takes a reference to | ||
1388 | * each struct page that each user address corresponds to at a given | ||
1389 | * instant. That is, it takes the page that would be accessed if a user | ||
1390 | * thread accesses the given user virtual address at that instant. | ||
1391 | * | ||
1392 | * This does not guarantee that the page exists in the user mappings when | ||
1393 | * get_user_pages returns, and there may even be a completely different | ||
1394 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
1395 | * and subsequently re faulted). However it does guarantee that the page | ||
1396 | * won't be freed completely. And mostly callers simply care that the page | ||
1397 | * contains data that was valid *at some point in time*. Typically, an IO | ||
1398 | * or similar operation cannot guarantee anything stronger anyway because | ||
1399 | * locks can't be held over the syscall boundary. | ||
1400 | * | ||
1401 | * If write=0, the page must not be written to. If the page is written to, | ||
1402 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
1403 | * after the page is finished with, and before put_page is called. | ||
1404 | * | ||
1405 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
1406 | * handle on the memory by some means other than accesses via the user virtual | ||
1407 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
1408 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
1409 | * use the correct cache flushing APIs. | ||
1410 | * | ||
1411 | * See also get_user_pages_fast, for performance critical applications. | ||
1412 | */ | ||
1363 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1413 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1364 | unsigned long start, int len, int write, int force, | 1414 | unsigned long start, int len, int write, int force, |
1365 | struct page **pages, struct vm_area_struct **vmas) | 1415 | struct page **pages, struct vm_area_struct **vmas) |
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr) | |||
3053 | 3103 | ||
3054 | #endif /* __HAVE_ARCH_GATE_AREA */ | 3104 | #endif /* __HAVE_ARCH_GATE_AREA */ |
3055 | 3105 | ||
3056 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3106 | static int follow_pte(struct mm_struct *mm, unsigned long address, |
3057 | int follow_phys(struct vm_area_struct *vma, | 3107 | pte_t **ptepp, spinlock_t **ptlp) |
3058 | unsigned long address, unsigned int flags, | ||
3059 | unsigned long *prot, resource_size_t *phys) | ||
3060 | { | 3108 | { |
3061 | pgd_t *pgd; | 3109 | pgd_t *pgd; |
3062 | pud_t *pud; | 3110 | pud_t *pud; |
3063 | pmd_t *pmd; | 3111 | pmd_t *pmd; |
3064 | pte_t *ptep, pte; | 3112 | pte_t *ptep; |
3065 | spinlock_t *ptl; | ||
3066 | resource_size_t phys_addr = 0; | ||
3067 | struct mm_struct *mm = vma->vm_mm; | ||
3068 | int ret = -EINVAL; | ||
3069 | |||
3070 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3071 | goto out; | ||
3072 | 3113 | ||
3073 | pgd = pgd_offset(mm, address); | 3114 | pgd = pgd_offset(mm, address); |
3074 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 3115 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma, | |||
3086 | if (pmd_huge(*pmd)) | 3127 | if (pmd_huge(*pmd)) |
3087 | goto out; | 3128 | goto out; |
3088 | 3129 | ||
3089 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 3130 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
3090 | if (!ptep) | 3131 | if (!ptep) |
3091 | goto out; | 3132 | goto out; |
3133 | if (!pte_present(*ptep)) | ||
3134 | goto unlock; | ||
3135 | *ptepp = ptep; | ||
3136 | return 0; | ||
3137 | unlock: | ||
3138 | pte_unmap_unlock(ptep, *ptlp); | ||
3139 | out: | ||
3140 | return -EINVAL; | ||
3141 | } | ||
3092 | 3142 | ||
3143 | /** | ||
3144 | * follow_pfn - look up PFN at a user virtual address | ||
3145 | * @vma: memory mapping | ||
3146 | * @address: user virtual address | ||
3147 | * @pfn: location to store found PFN | ||
3148 | * | ||
3149 | * Only IO mappings and raw PFN mappings are allowed. | ||
3150 | * | ||
3151 | * Returns zero and the pfn at @pfn on success, -ve otherwise. | ||
3152 | */ | ||
3153 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | ||
3154 | unsigned long *pfn) | ||
3155 | { | ||
3156 | int ret = -EINVAL; | ||
3157 | spinlock_t *ptl; | ||
3158 | pte_t *ptep; | ||
3159 | |||
3160 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3161 | return ret; | ||
3162 | |||
3163 | ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); | ||
3164 | if (ret) | ||
3165 | return ret; | ||
3166 | *pfn = pte_pfn(*ptep); | ||
3167 | pte_unmap_unlock(ptep, ptl); | ||
3168 | return 0; | ||
3169 | } | ||
3170 | EXPORT_SYMBOL(follow_pfn); | ||
3171 | |||
3172 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
3173 | int follow_phys(struct vm_area_struct *vma, | ||
3174 | unsigned long address, unsigned int flags, | ||
3175 | unsigned long *prot, resource_size_t *phys) | ||
3176 | { | ||
3177 | int ret = -EINVAL; | ||
3178 | pte_t *ptep, pte; | ||
3179 | spinlock_t *ptl; | ||
3180 | |||
3181 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
3182 | goto out; | ||
3183 | |||
3184 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | ||
3185 | goto out; | ||
3093 | pte = *ptep; | 3186 | pte = *ptep; |
3094 | if (!pte_present(pte)) | 3187 | |
3095 | goto unlock; | ||
3096 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 3188 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
3097 | goto unlock; | 3189 | goto unlock; |
3098 | phys_addr = pte_pfn(pte); | ||
3099 | phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ | ||
3100 | 3190 | ||
3101 | *prot = pgprot_val(pte_pgprot(pte)); | 3191 | *prot = pgprot_val(pte_pgprot(pte)); |
3102 | *phys = phys_addr; | 3192 | *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; |
3103 | ret = 0; | ||
3104 | 3193 | ||
3194 | ret = 0; | ||
3105 | unlock: | 3195 | unlock: |
3106 | pte_unmap_unlock(ptep, ptl); | 3196 | pte_unmap_unlock(ptep, ptl); |
3107 | out: | 3197 | out: |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6df..e4412a676c88 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
422 | zone->present_pages += onlined_pages; | 422 | zone->present_pages += onlined_pages; |
423 | zone->zone_pgdat->node_present_pages += onlined_pages; | 423 | zone->zone_pgdat->node_present_pages += onlined_pages; |
424 | 424 | ||
425 | setup_per_zone_pages_min(); | 425 | setup_per_zone_wmarks(); |
426 | calculate_zone_inactive_ratio(zone); | ||
426 | if (onlined_pages) { | 427 | if (onlined_pages) { |
427 | kswapd_run(zone_to_nid(zone)); | 428 | kswapd_run(zone_to_nid(zone)); |
428 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 429 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
@@ -832,6 +833,9 @@ repeat: | |||
832 | totalram_pages -= offlined_pages; | 833 | totalram_pages -= offlined_pages; |
833 | num_physpages -= offlined_pages; | 834 | num_physpages -= offlined_pages; |
834 | 835 | ||
836 | setup_per_zone_wmarks(); | ||
837 | calculate_zone_inactive_ratio(zone); | ||
838 | |||
835 | vm_total_pages = nr_free_pagecache_pages(); | 839 | vm_total_pages = nr_free_pagecache_pages(); |
836 | writeback_set_ratelimit(); | 840 | writeback_set_ratelimit(); |
837 | 841 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc043..e08e2c4da63a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | |||
182 | return 0; | 182 | return 0; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* Create a new policy */ | 185 | /* |
186 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if | ||
187 | * any, for the new policy. mpol_new() has already validated the nodes | ||
188 | * parameter with respect to the policy mode and flags. But, we need to | ||
189 | * handle an empty nodemask with MPOL_PREFERRED here. | ||
190 | * | ||
191 | * Must be called holding task's alloc_lock to protect task's mems_allowed | ||
192 | * and mempolicy. May also be called holding the mmap_semaphore for write. | ||
193 | */ | ||
194 | static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) | ||
195 | { | ||
196 | nodemask_t cpuset_context_nmask; | ||
197 | int ret; | ||
198 | |||
199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | ||
200 | if (pol == NULL) | ||
201 | return 0; | ||
202 | |||
203 | VM_BUG_ON(!nodes); | ||
204 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | ||
205 | nodes = NULL; /* explicit local allocation */ | ||
206 | else { | ||
207 | if (pol->flags & MPOL_F_RELATIVE_NODES) | ||
208 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
209 | &cpuset_current_mems_allowed); | ||
210 | else | ||
211 | nodes_and(cpuset_context_nmask, *nodes, | ||
212 | cpuset_current_mems_allowed); | ||
213 | if (mpol_store_user_nodemask(pol)) | ||
214 | pol->w.user_nodemask = *nodes; | ||
215 | else | ||
216 | pol->w.cpuset_mems_allowed = | ||
217 | cpuset_current_mems_allowed; | ||
218 | } | ||
219 | |||
220 | ret = mpol_ops[pol->mode].create(pol, | ||
221 | nodes ? &cpuset_context_nmask : NULL); | ||
222 | return ret; | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * This function just creates a new policy, does some check and simple | ||
227 | * initialization. You must invoke mpol_set_nodemask() to set nodes. | ||
228 | */ | ||
186 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | 229 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
187 | nodemask_t *nodes) | 230 | nodemask_t *nodes) |
188 | { | 231 | { |
189 | struct mempolicy *policy; | 232 | struct mempolicy *policy; |
190 | nodemask_t cpuset_context_nmask; | ||
191 | int ret; | ||
192 | 233 | ||
193 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 234 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
194 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 235 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); |
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
210 | if (((flags & MPOL_F_STATIC_NODES) || | 251 | if (((flags & MPOL_F_STATIC_NODES) || |
211 | (flags & MPOL_F_RELATIVE_NODES))) | 252 | (flags & MPOL_F_RELATIVE_NODES))) |
212 | return ERR_PTR(-EINVAL); | 253 | return ERR_PTR(-EINVAL); |
213 | nodes = NULL; /* flag local alloc */ | ||
214 | } | 254 | } |
215 | } else if (nodes_empty(*nodes)) | 255 | } else if (nodes_empty(*nodes)) |
216 | return ERR_PTR(-EINVAL); | 256 | return ERR_PTR(-EINVAL); |
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
221 | policy->mode = mode; | 261 | policy->mode = mode; |
222 | policy->flags = flags; | 262 | policy->flags = flags; |
223 | 263 | ||
224 | if (nodes) { | ||
225 | /* | ||
226 | * cpuset related setup doesn't apply to local allocation | ||
227 | */ | ||
228 | cpuset_update_task_memory_state(); | ||
229 | if (flags & MPOL_F_RELATIVE_NODES) | ||
230 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
231 | &cpuset_current_mems_allowed); | ||
232 | else | ||
233 | nodes_and(cpuset_context_nmask, *nodes, | ||
234 | cpuset_current_mems_allowed); | ||
235 | if (mpol_store_user_nodemask(policy)) | ||
236 | policy->w.user_nodemask = *nodes; | ||
237 | else | ||
238 | policy->w.cpuset_mems_allowed = | ||
239 | cpuset_mems_allowed(current); | ||
240 | } | ||
241 | |||
242 | ret = mpol_ops[mode].create(policy, | ||
243 | nodes ? &cpuset_context_nmask : NULL); | ||
244 | if (ret < 0) { | ||
245 | kmem_cache_free(policy_cache, policy); | ||
246 | return ERR_PTR(ret); | ||
247 | } | ||
248 | return policy; | 264 | return policy; |
249 | } | 265 | } |
250 | 266 | ||
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, | |||
324 | /* | 340 | /* |
325 | * Wrapper for mpol_rebind_policy() that just requires task | 341 | * Wrapper for mpol_rebind_policy() that just requires task |
326 | * pointer, and updates task mempolicy. | 342 | * pointer, and updates task mempolicy. |
343 | * | ||
344 | * Called with task's alloc_lock held. | ||
327 | */ | 345 | */ |
328 | 346 | ||
329 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 347 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) |
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) | |||
600 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 618 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
601 | nodemask_t *nodes) | 619 | nodemask_t *nodes) |
602 | { | 620 | { |
603 | struct mempolicy *new; | 621 | struct mempolicy *new, *old; |
604 | struct mm_struct *mm = current->mm; | 622 | struct mm_struct *mm = current->mm; |
623 | int ret; | ||
605 | 624 | ||
606 | new = mpol_new(mode, flags, nodes); | 625 | new = mpol_new(mode, flags, nodes); |
607 | if (IS_ERR(new)) | 626 | if (IS_ERR(new)) |
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
615 | */ | 634 | */ |
616 | if (mm) | 635 | if (mm) |
617 | down_write(&mm->mmap_sem); | 636 | down_write(&mm->mmap_sem); |
618 | mpol_put(current->mempolicy); | 637 | task_lock(current); |
638 | ret = mpol_set_nodemask(new, nodes); | ||
639 | if (ret) { | ||
640 | task_unlock(current); | ||
641 | if (mm) | ||
642 | up_write(&mm->mmap_sem); | ||
643 | mpol_put(new); | ||
644 | return ret; | ||
645 | } | ||
646 | old = current->mempolicy; | ||
619 | current->mempolicy = new; | 647 | current->mempolicy = new; |
620 | mpol_set_task_struct_flag(); | 648 | mpol_set_task_struct_flag(); |
621 | if (new && new->mode == MPOL_INTERLEAVE && | 649 | if (new && new->mode == MPOL_INTERLEAVE && |
622 | nodes_weight(new->v.nodes)) | 650 | nodes_weight(new->v.nodes)) |
623 | current->il_next = first_node(new->v.nodes); | 651 | current->il_next = first_node(new->v.nodes); |
652 | task_unlock(current); | ||
624 | if (mm) | 653 | if (mm) |
625 | up_write(&mm->mmap_sem); | 654 | up_write(&mm->mmap_sem); |
626 | 655 | ||
656 | mpol_put(old); | ||
627 | return 0; | 657 | return 0; |
628 | } | 658 | } |
629 | 659 | ||
630 | /* | 660 | /* |
631 | * Return nodemask for policy for get_mempolicy() query | 661 | * Return nodemask for policy for get_mempolicy() query |
662 | * | ||
663 | * Called with task's alloc_lock held | ||
632 | */ | 664 | */ |
633 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | 665 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) |
634 | { | 666 | { |
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
674 | struct vm_area_struct *vma = NULL; | 706 | struct vm_area_struct *vma = NULL; |
675 | struct mempolicy *pol = current->mempolicy; | 707 | struct mempolicy *pol = current->mempolicy; |
676 | 708 | ||
677 | cpuset_update_task_memory_state(); | ||
678 | if (flags & | 709 | if (flags & |
679 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 710 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
680 | return -EINVAL; | 711 | return -EINVAL; |
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
683 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | 714 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) |
684 | return -EINVAL; | 715 | return -EINVAL; |
685 | *policy = 0; /* just so it's initialized */ | 716 | *policy = 0; /* just so it's initialized */ |
717 | task_lock(current); | ||
686 | *nmask = cpuset_current_mems_allowed; | 718 | *nmask = cpuset_current_mems_allowed; |
719 | task_unlock(current); | ||
687 | return 0; | 720 | return 0; |
688 | } | 721 | } |
689 | 722 | ||
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
738 | } | 771 | } |
739 | 772 | ||
740 | err = 0; | 773 | err = 0; |
741 | if (nmask) | 774 | if (nmask) { |
775 | task_lock(current); | ||
742 | get_policy_nodemask(pol, nmask); | 776 | get_policy_nodemask(pol, nmask); |
777 | task_unlock(current); | ||
778 | } | ||
743 | 779 | ||
744 | out: | 780 | out: |
745 | mpol_cond_put(pol); | 781 | mpol_cond_put(pol); |
@@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
767 | 803 | ||
768 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 804 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
769 | { | 805 | { |
770 | return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); | 806 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); |
771 | } | 807 | } |
772 | 808 | ||
773 | /* | 809 | /* |
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
979 | return err; | 1015 | return err; |
980 | } | 1016 | } |
981 | down_write(&mm->mmap_sem); | 1017 | down_write(&mm->mmap_sem); |
1018 | task_lock(current); | ||
1019 | err = mpol_set_nodemask(new, nmask); | ||
1020 | task_unlock(current); | ||
1021 | if (err) { | ||
1022 | up_write(&mm->mmap_sem); | ||
1023 | mpol_put(new); | ||
1024 | return err; | ||
1025 | } | ||
982 | vma = check_range(mm, start, end, nmask, | 1026 | vma = check_range(mm, start, end, nmask, |
983 | flags | MPOL_MF_INVERT, &pagelist); | 1027 | flags | MPOL_MF_INVERT, &pagelist); |
984 | 1028 | ||
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1545 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1589 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
1546 | struct zonelist *zl; | 1590 | struct zonelist *zl; |
1547 | 1591 | ||
1548 | cpuset_update_task_memory_state(); | ||
1549 | |||
1550 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1592 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1551 | unsigned nid; | 1593 | unsigned nid; |
1552 | 1594 | ||
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1593 | { | 1635 | { |
1594 | struct mempolicy *pol = current->mempolicy; | 1636 | struct mempolicy *pol = current->mempolicy; |
1595 | 1637 | ||
1596 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | ||
1597 | cpuset_update_task_memory_state(); | ||
1598 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1638 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1599 | pol = &default_policy; | 1639 | pol = &default_policy; |
1600 | 1640 | ||
@@ -1854,6 +1894,8 @@ restart: | |||
1854 | */ | 1894 | */ |
1855 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 1895 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
1856 | { | 1896 | { |
1897 | int ret; | ||
1898 | |||
1857 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 1899 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
1858 | spin_lock_init(&sp->lock); | 1900 | spin_lock_init(&sp->lock); |
1859 | 1901 | ||
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
1863 | 1905 | ||
1864 | /* contextualize the tmpfs mount point mempolicy */ | 1906 | /* contextualize the tmpfs mount point mempolicy */ |
1865 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 1907 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
1866 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1908 | if (IS_ERR(new)) { |
1867 | if (IS_ERR(new)) | 1909 | mpol_put(mpol); /* drop our ref on sb mpol */ |
1868 | return; /* no valid nodemask intersection */ | 1910 | return; /* no valid nodemask intersection */ |
1911 | } | ||
1912 | |||
1913 | task_lock(current); | ||
1914 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); | ||
1915 | task_unlock(current); | ||
1916 | mpol_put(mpol); /* drop our ref on sb mpol */ | ||
1917 | if (ret) { | ||
1918 | mpol_put(new); | ||
1919 | return; | ||
1920 | } | ||
1869 | 1921 | ||
1870 | /* Create pseudo-vma that contains just the policy */ | 1922 | /* Create pseudo-vma that contains just the policy */ |
1871 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1923 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2086 | new = mpol_new(mode, mode_flags, &nodes); | 2138 | new = mpol_new(mode, mode_flags, &nodes); |
2087 | if (IS_ERR(new)) | 2139 | if (IS_ERR(new)) |
2088 | err = 1; | 2140 | err = 1; |
2089 | else if (no_context) | 2141 | else { |
2090 | new->w.user_nodemask = nodes; /* save for contextualization */ | 2142 | int ret; |
2143 | |||
2144 | task_lock(current); | ||
2145 | ret = mpol_set_nodemask(new, &nodes); | ||
2146 | task_unlock(current); | ||
2147 | if (ret) | ||
2148 | err = 1; | ||
2149 | else if (no_context) { | ||
2150 | /* save for contextualization */ | ||
2151 | new->w.user_nodemask = nodes; | ||
2152 | } | ||
2153 | } | ||
2091 | 2154 | ||
2092 | out: | 2155 | out: |
2093 | /* Restore string for error message */ | 2156 | /* Restore string for error message */ |
diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f883..939888f9ddab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
802 | 802 | ||
803 | *result = &pm->status; | 803 | *result = &pm->status; |
804 | 804 | ||
805 | return alloc_pages_node(pm->node, | 805 | return alloc_pages_exact_node(pm->node, |
806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); |
807 | } | 807 | } |
808 | 808 | ||
@@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
820 | struct page_to_node *pp; | 820 | struct page_to_node *pp; |
821 | LIST_HEAD(pagelist); | 821 | LIST_HEAD(pagelist); |
822 | 822 | ||
823 | migrate_prep(); | ||
824 | down_read(&mm->mmap_sem); | 823 | down_read(&mm->mmap_sem); |
825 | 824 | ||
826 | /* | 825 | /* |
@@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 906 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
908 | if (!pm) | 907 | if (!pm) |
909 | goto out; | 908 | goto out; |
909 | |||
910 | migrate_prep(); | ||
911 | |||
910 | /* | 912 | /* |
911 | * Store a chunk of page_to_node array in a page, | 913 | * Store a chunk of page_to_node array in a page, |
912 | * but keep the last one as a marker | 914 | * but keep the last one as a marker |
diff --git a/mm/mlock.c b/mm/mlock.c index ac130433c7d3..45eb650b9654 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -31,7 +31,6 @@ int can_do_mlock(void) | |||
31 | } | 31 | } |
32 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
33 | 33 | ||
34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
35 | /* | 34 | /* |
36 | * Mlocked pages are marked with PageMlocked() flag for efficient testing | 35 | * Mlocked pages are marked with PageMlocked() flag for efficient testing |
37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | 36 | * in vmscan and, possibly, the fault path; and to support semi-accurate |
@@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) | |||
261 | return retval; | 260 | return retval; |
262 | } | 261 | } |
263 | 262 | ||
264 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
265 | |||
266 | /* | ||
267 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
268 | */ | ||
269 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
270 | unsigned long start, unsigned long end, | ||
271 | int mlock) | ||
272 | { | ||
273 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
274 | return make_pages_present(start, end); | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | static inline int __mlock_posix_error_return(long retval) | ||
279 | { | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
284 | |||
285 | /** | 263 | /** |
286 | * mlock_vma_pages_range() - mlock pages in specified vma range. | 264 | * mlock_vma_pages_range() - mlock pages in specified vma range. |
287 | * @vma - the vma containing the specfied address range | 265 | * @vma - the vma containing the specfied address range |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922b..175a67a78a99 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
60 | struct task_struct *child; | 60 | struct task_struct *child; |
61 | int oom_adj; | ||
61 | 62 | ||
62 | task_lock(p); | 63 | task_lock(p); |
63 | mm = p->mm; | 64 | mm = p->mm; |
@@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
65 | task_unlock(p); | 66 | task_unlock(p); |
66 | return 0; | 67 | return 0; |
67 | } | 68 | } |
69 | oom_adj = mm->oom_adj; | ||
70 | if (oom_adj == OOM_DISABLE) { | ||
71 | task_unlock(p); | ||
72 | return 0; | ||
73 | } | ||
68 | 74 | ||
69 | /* | 75 | /* |
70 | * The memory size of the process is the basis for the badness. | 76 | * The memory size of the process is the basis for the badness. |
@@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
148 | points /= 8; | 154 | points /= 8; |
149 | 155 | ||
150 | /* | 156 | /* |
151 | * Adjust the score by oomkilladj. | 157 | * Adjust the score by oom_adj. |
152 | */ | 158 | */ |
153 | if (p->oomkilladj) { | 159 | if (oom_adj) { |
154 | if (p->oomkilladj > 0) { | 160 | if (oom_adj > 0) { |
155 | if (!points) | 161 | if (!points) |
156 | points = 1; | 162 | points = 1; |
157 | points <<= p->oomkilladj; | 163 | points <<= oom_adj; |
158 | } else | 164 | } else |
159 | points >>= -(p->oomkilladj); | 165 | points >>= -(oom_adj); |
160 | } | 166 | } |
161 | 167 | ||
162 | #ifdef DEBUG | 168 | #ifdef DEBUG |
@@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
251 | *ppoints = ULONG_MAX; | 257 | *ppoints = ULONG_MAX; |
252 | } | 258 | } |
253 | 259 | ||
254 | if (p->oomkilladj == OOM_DISABLE) | ||
255 | continue; | ||
256 | |||
257 | points = badness(p, uptime.tv_sec); | 260 | points = badness(p, uptime.tv_sec); |
258 | if (points > *ppoints || !chosen) { | 261 | if (points > *ppoints) { |
259 | chosen = p; | 262 | chosen = p; |
260 | *ppoints = points; | 263 | *ppoints = points; |
261 | } | 264 | } |
@@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
304 | } | 307 | } |
305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 308 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 309 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, | 310 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); |
308 | p->comm); | ||
309 | task_unlock(p); | 311 | task_unlock(p); |
310 | } while_each_thread(g, p); | 312 | } while_each_thread(g, p); |
311 | } | 313 | } |
@@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
323 | return; | 325 | return; |
324 | } | 326 | } |
325 | 327 | ||
326 | if (!p->mm) { | 328 | if (!p->mm) |
327 | WARN_ON(1); | ||
328 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
329 | return; | 329 | return; |
330 | } | ||
331 | 330 | ||
332 | if (verbose) | 331 | if (verbose) |
333 | printk(KERN_ERR "Killed process %d (%s)\n", | 332 | printk(KERN_ERR "Killed process %d (%s)\n", |
@@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p) | |||
349 | struct mm_struct *mm; | 348 | struct mm_struct *mm; |
350 | struct task_struct *g, *q; | 349 | struct task_struct *g, *q; |
351 | 350 | ||
351 | task_lock(p); | ||
352 | mm = p->mm; | 352 | mm = p->mm; |
353 | 353 | if (!mm || mm->oom_adj == OOM_DISABLE) { | |
354 | /* WARNING: mm may not be dereferenced since we did not obtain its | 354 | task_unlock(p); |
355 | * value from get_task_mm(p). This is OK since all we need to do is | ||
356 | * compare mm to q->mm below. | ||
357 | * | ||
358 | * Furthermore, even if mm contains a non-NULL value, p->mm may | ||
359 | * change to NULL at any time since we do not hold task_lock(p). | ||
360 | * However, this is of no concern to us. | ||
361 | */ | ||
362 | |||
363 | if (mm == NULL) | ||
364 | return 1; | 355 | return 1; |
365 | 356 | } | |
366 | /* | 357 | task_unlock(p); |
367 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
368 | */ | ||
369 | do_each_thread(g, q) { | ||
370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
371 | return 1; | ||
372 | } while_each_thread(g, q); | ||
373 | |||
374 | __oom_kill_task(p, 1); | 358 | __oom_kill_task(p, 1); |
375 | 359 | ||
376 | /* | 360 | /* |
@@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
393 | struct task_struct *c; | 377 | struct task_struct *c; |
394 | 378 | ||
395 | if (printk_ratelimit()) { | 379 | if (printk_ratelimit()) { |
396 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
398 | current->comm, gfp_mask, order, current->oomkilladj); | ||
399 | task_lock(current); | 380 | task_lock(current); |
381 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
382 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
383 | current->comm, gfp_mask, order, | ||
384 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | ||
400 | cpuset_print_task_mems_allowed(current); | 385 | cpuset_print_task_mems_allowed(current); |
401 | task_unlock(current); | 386 | task_unlock(current); |
402 | dump_stack(); | 387 | dump_stack(); |
@@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
409 | /* | 394 | /* |
410 | * If the task is already exiting, don't alarm the sysadmin or kill | 395 | * If the task is already exiting, don't alarm the sysadmin or kill |
411 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 396 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
397 | * if its mm is still attached. | ||
412 | */ | 398 | */ |
413 | if (p->flags & PF_EXITING) { | 399 | if (p->mm && (p->flags & PF_EXITING)) { |
414 | __oom_kill_task(p, 0); | 400 | __oom_kill_task(p, 0); |
415 | return 0; | 401 | return 0; |
416 | } | 402 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bb553c3e955d..7b0dcea4935b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
265 | * This avoids exceeding the total dirty_limit when the floating averages | 265 | * This avoids exceeding the total dirty_limit when the floating averages |
266 | * fluctuate too quickly. | 266 | * fluctuate too quickly. |
267 | */ | 267 | */ |
268 | static void | 268 | static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, |
269 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | 269 | unsigned long dirty, unsigned long *pbdi_dirty) |
270 | { | 270 | { |
271 | long avail_dirty; | 271 | unsigned long avail_dirty; |
272 | 272 | ||
273 | avail_dirty = dirty - | 273 | avail_dirty = global_page_state(NR_FILE_DIRTY) + |
274 | (global_page_state(NR_FILE_DIRTY) + | ||
275 | global_page_state(NR_WRITEBACK) + | 274 | global_page_state(NR_WRITEBACK) + |
276 | global_page_state(NR_UNSTABLE_NFS) + | 275 | global_page_state(NR_UNSTABLE_NFS) + |
277 | global_page_state(NR_WRITEBACK_TEMP)); | 276 | global_page_state(NR_WRITEBACK_TEMP); |
278 | 277 | ||
279 | if (avail_dirty < 0) | 278 | if (avail_dirty < dirty) |
279 | avail_dirty = dirty - avail_dirty; | ||
280 | else | ||
280 | avail_dirty = 0; | 281 | avail_dirty = 0; |
281 | 282 | ||
282 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | 283 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + |
@@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
299 | * | 300 | * |
300 | * dirty -= (dirty/8) * p_{t} | 301 | * dirty -= (dirty/8) * p_{t} |
301 | */ | 302 | */ |
302 | static void task_dirty_limit(struct task_struct *tsk, long *pdirty) | 303 | static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) |
303 | { | 304 | { |
304 | long numerator, denominator; | 305 | long numerator, denominator; |
305 | long dirty = *pdirty; | 306 | unsigned long dirty = *pdirty; |
306 | u64 inv = dirty >> 3; | 307 | u64 inv = dirty >> 3; |
307 | 308 | ||
308 | task_dirties_fraction(tsk, &numerator, &denominator); | 309 | task_dirties_fraction(tsk, &numerator, &denominator); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 17d5f539a9aa..a5f3c278c573 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/kmemcheck.h> | ||
26 | #include <linux/module.h> | 27 | #include <linux/module.h> |
27 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
28 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
@@ -161,17 +162,25 @@ static unsigned long __meminitdata dma_reserve; | |||
161 | 162 | ||
162 | #if MAX_NUMNODES > 1 | 163 | #if MAX_NUMNODES > 1 |
163 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 164 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
165 | int nr_online_nodes __read_mostly = 1; | ||
164 | EXPORT_SYMBOL(nr_node_ids); | 166 | EXPORT_SYMBOL(nr_node_ids); |
167 | EXPORT_SYMBOL(nr_online_nodes); | ||
165 | #endif | 168 | #endif |
166 | 169 | ||
167 | int page_group_by_mobility_disabled __read_mostly; | 170 | int page_group_by_mobility_disabled __read_mostly; |
168 | 171 | ||
169 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 172 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
170 | { | 173 | { |
174 | |||
175 | if (unlikely(page_group_by_mobility_disabled)) | ||
176 | migratetype = MIGRATE_UNMOVABLE; | ||
177 | |||
171 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 178 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
172 | PB_migrate, PB_migrate_end); | 179 | PB_migrate, PB_migrate_end); |
173 | } | 180 | } |
174 | 181 | ||
182 | bool oom_killer_disabled __read_mostly; | ||
183 | |||
175 | #ifdef CONFIG_DEBUG_VM | 184 | #ifdef CONFIG_DEBUG_VM |
176 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 185 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
177 | { | 186 | { |
@@ -294,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
294 | } | 303 | } |
295 | } | 304 | } |
296 | 305 | ||
297 | #ifdef CONFIG_HUGETLBFS | ||
298 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
299 | { | ||
300 | int i; | ||
301 | int nr_pages = 1 << order; | ||
302 | struct page *p = page + 1; | ||
303 | |||
304 | set_compound_page_dtor(page, free_compound_page); | ||
305 | set_compound_order(page, order); | ||
306 | __SetPageHead(page); | ||
307 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
308 | __SetPageTail(p); | ||
309 | p->first_page = page; | ||
310 | } | ||
311 | } | ||
312 | #endif | ||
313 | |||
314 | static int destroy_compound_page(struct page *page, unsigned long order) | 306 | static int destroy_compound_page(struct page *page, unsigned long order) |
315 | { | 307 | { |
316 | int i; | 308 | int i; |
@@ -417,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
417 | return 0; | 409 | return 0; |
418 | 410 | ||
419 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 411 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
420 | BUG_ON(page_count(buddy) != 0); | 412 | VM_BUG_ON(page_count(buddy) != 0); |
421 | return 1; | 413 | return 1; |
422 | } | 414 | } |
423 | return 0; | 415 | return 0; |
@@ -448,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
448 | */ | 440 | */ |
449 | 441 | ||
450 | static inline void __free_one_page(struct page *page, | 442 | static inline void __free_one_page(struct page *page, |
451 | struct zone *zone, unsigned int order) | 443 | struct zone *zone, unsigned int order, |
444 | int migratetype) | ||
452 | { | 445 | { |
453 | unsigned long page_idx; | 446 | unsigned long page_idx; |
454 | int order_size = 1 << order; | ||
455 | int migratetype = get_pageblock_migratetype(page); | ||
456 | 447 | ||
457 | if (unlikely(PageCompound(page))) | 448 | if (unlikely(PageCompound(page))) |
458 | if (unlikely(destroy_compound_page(page, order))) | 449 | if (unlikely(destroy_compound_page(page, order))) |
459 | return; | 450 | return; |
460 | 451 | ||
452 | VM_BUG_ON(migratetype == -1); | ||
453 | |||
461 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 454 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
462 | 455 | ||
463 | VM_BUG_ON(page_idx & (order_size - 1)); | 456 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
464 | VM_BUG_ON(bad_range(zone, page)); | 457 | VM_BUG_ON(bad_range(zone, page)); |
465 | 458 | ||
466 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | ||
467 | while (order < MAX_ORDER-1) { | 459 | while (order < MAX_ORDER-1) { |
468 | unsigned long combined_idx; | 460 | unsigned long combined_idx; |
469 | struct page *buddy; | 461 | struct page *buddy; |
@@ -487,12 +479,27 @@ static inline void __free_one_page(struct page *page, | |||
487 | zone->free_area[order].nr_free++; | 479 | zone->free_area[order].nr_free++; |
488 | } | 480 | } |
489 | 481 | ||
482 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
483 | /* | ||
484 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
485 | * Page should not be on lru, so no need to fix that up. | ||
486 | * free_pages_check() will verify... | ||
487 | */ | ||
488 | static inline void free_page_mlock(struct page *page) | ||
489 | { | ||
490 | __ClearPageMlocked(page); | ||
491 | __dec_zone_page_state(page, NR_MLOCK); | ||
492 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
493 | } | ||
494 | #else | ||
495 | static void free_page_mlock(struct page *page) { } | ||
496 | #endif | ||
497 | |||
490 | static inline int free_pages_check(struct page *page) | 498 | static inline int free_pages_check(struct page *page) |
491 | { | 499 | { |
492 | free_page_mlock(page); | ||
493 | if (unlikely(page_mapcount(page) | | 500 | if (unlikely(page_mapcount(page) | |
494 | (page->mapping != NULL) | | 501 | (page->mapping != NULL) | |
495 | (page_count(page) != 0) | | 502 | (atomic_read(&page->_count) != 0) | |
496 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 503 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
497 | bad_page(page); | 504 | bad_page(page); |
498 | return 1; | 505 | return 1; |
@@ -519,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
519 | spin_lock(&zone->lock); | 526 | spin_lock(&zone->lock); |
520 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 527 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
521 | zone->pages_scanned = 0; | 528 | zone->pages_scanned = 0; |
529 | |||
530 | __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); | ||
522 | while (count--) { | 531 | while (count--) { |
523 | struct page *page; | 532 | struct page *page; |
524 | 533 | ||
@@ -526,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
526 | page = list_entry(list->prev, struct page, lru); | 535 | page = list_entry(list->prev, struct page, lru); |
527 | /* have to delete it as __free_one_page list manipulates */ | 536 | /* have to delete it as __free_one_page list manipulates */ |
528 | list_del(&page->lru); | 537 | list_del(&page->lru); |
529 | __free_one_page(page, zone, order); | 538 | __free_one_page(page, zone, order, page_private(page)); |
530 | } | 539 | } |
531 | spin_unlock(&zone->lock); | 540 | spin_unlock(&zone->lock); |
532 | } | 541 | } |
533 | 542 | ||
534 | static void free_one_page(struct zone *zone, struct page *page, int order) | 543 | static void free_one_page(struct zone *zone, struct page *page, int order, |
544 | int migratetype) | ||
535 | { | 545 | { |
536 | spin_lock(&zone->lock); | 546 | spin_lock(&zone->lock); |
537 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 547 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
538 | zone->pages_scanned = 0; | 548 | zone->pages_scanned = 0; |
539 | __free_one_page(page, zone, order); | 549 | |
550 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
551 | __free_one_page(page, zone, order, migratetype); | ||
540 | spin_unlock(&zone->lock); | 552 | spin_unlock(&zone->lock); |
541 | } | 553 | } |
542 | 554 | ||
@@ -545,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
545 | unsigned long flags; | 557 | unsigned long flags; |
546 | int i; | 558 | int i; |
547 | int bad = 0; | 559 | int bad = 0; |
560 | int clearMlocked = PageMlocked(page); | ||
561 | |||
562 | kmemcheck_free_shadow(page, order); | ||
548 | 563 | ||
549 | for (i = 0 ; i < (1 << order) ; ++i) | 564 | for (i = 0 ; i < (1 << order) ; ++i) |
550 | bad += free_pages_check(page + i); | 565 | bad += free_pages_check(page + i); |
@@ -560,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
560 | kernel_map_pages(page, 1 << order, 0); | 575 | kernel_map_pages(page, 1 << order, 0); |
561 | 576 | ||
562 | local_irq_save(flags); | 577 | local_irq_save(flags); |
578 | if (unlikely(clearMlocked)) | ||
579 | free_page_mlock(page); | ||
563 | __count_vm_events(PGFREE, 1 << order); | 580 | __count_vm_events(PGFREE, 1 << order); |
564 | free_one_page(page_zone(page), page, order); | 581 | free_one_page(page_zone(page), page, order, |
582 | get_pageblock_migratetype(page)); | ||
565 | local_irq_restore(flags); | 583 | local_irq_restore(flags); |
566 | } | 584 | } |
567 | 585 | ||
@@ -632,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
632 | { | 650 | { |
633 | if (unlikely(page_mapcount(page) | | 651 | if (unlikely(page_mapcount(page) | |
634 | (page->mapping != NULL) | | 652 | (page->mapping != NULL) | |
635 | (page_count(page) != 0) | | 653 | (atomic_read(&page->_count) != 0) | |
636 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 654 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
637 | bad_page(page); | 655 | bad_page(page); |
638 | return 1; | 656 | return 1; |
@@ -657,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
657 | * Go through the free lists for the given migratetype and remove | 675 | * Go through the free lists for the given migratetype and remove |
658 | * the smallest available page from the freelists | 676 | * the smallest available page from the freelists |
659 | */ | 677 | */ |
660 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 678 | static inline |
679 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | ||
661 | int migratetype) | 680 | int migratetype) |
662 | { | 681 | { |
663 | unsigned int current_order; | 682 | unsigned int current_order; |
@@ -675,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
675 | list_del(&page->lru); | 694 | list_del(&page->lru); |
676 | rmv_page_order(page); | 695 | rmv_page_order(page); |
677 | area->nr_free--; | 696 | area->nr_free--; |
678 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | ||
679 | expand(zone, page, order, current_order, area, migratetype); | 697 | expand(zone, page, order, current_order, area, migratetype); |
680 | return page; | 698 | return page; |
681 | } | 699 | } |
@@ -766,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, | |||
766 | } | 784 | } |
767 | 785 | ||
768 | /* Remove an element from the buddy allocator from the fallback list */ | 786 | /* Remove an element from the buddy allocator from the fallback list */ |
769 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | 787 | static inline struct page * |
770 | int start_migratetype) | 788 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
771 | { | 789 | { |
772 | struct free_area * area; | 790 | struct free_area * area; |
773 | int current_order; | 791 | int current_order; |
@@ -815,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
815 | /* Remove the page from the freelists */ | 833 | /* Remove the page from the freelists */ |
816 | list_del(&page->lru); | 834 | list_del(&page->lru); |
817 | rmv_page_order(page); | 835 | rmv_page_order(page); |
818 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
819 | -(1UL << order)); | ||
820 | 836 | ||
821 | if (current_order == pageblock_order) | 837 | if (current_order == pageblock_order) |
822 | set_pageblock_migratetype(page, | 838 | set_pageblock_migratetype(page, |
@@ -827,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
827 | } | 843 | } |
828 | } | 844 | } |
829 | 845 | ||
830 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | 846 | return NULL; |
831 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
832 | } | 847 | } |
833 | 848 | ||
834 | /* | 849 | /* |
@@ -840,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
840 | { | 855 | { |
841 | struct page *page; | 856 | struct page *page; |
842 | 857 | ||
858 | retry_reserve: | ||
843 | page = __rmqueue_smallest(zone, order, migratetype); | 859 | page = __rmqueue_smallest(zone, order, migratetype); |
844 | 860 | ||
845 | if (unlikely(!page)) | 861 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
846 | page = __rmqueue_fallback(zone, order, migratetype); | 862 | page = __rmqueue_fallback(zone, order, migratetype); |
847 | 863 | ||
864 | /* | ||
865 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | ||
866 | * is used because __rmqueue_smallest is an inline function | ||
867 | * and we want just one call site | ||
868 | */ | ||
869 | if (!page) { | ||
870 | migratetype = MIGRATE_RESERVE; | ||
871 | goto retry_reserve; | ||
872 | } | ||
873 | } | ||
874 | |||
848 | return page; | 875 | return page; |
849 | } | 876 | } |
850 | 877 | ||
@@ -878,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
878 | set_page_private(page, migratetype); | 905 | set_page_private(page, migratetype); |
879 | list = &page->lru; | 906 | list = &page->lru; |
880 | } | 907 | } |
908 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | ||
881 | spin_unlock(&zone->lock); | 909 | spin_unlock(&zone->lock); |
882 | return i; | 910 | return i; |
883 | } | 911 | } |
@@ -993,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
993 | struct zone *zone = page_zone(page); | 1021 | struct zone *zone = page_zone(page); |
994 | struct per_cpu_pages *pcp; | 1022 | struct per_cpu_pages *pcp; |
995 | unsigned long flags; | 1023 | unsigned long flags; |
1024 | int clearMlocked = PageMlocked(page); | ||
1025 | |||
1026 | kmemcheck_free_shadow(page, 0); | ||
996 | 1027 | ||
997 | if (PageAnon(page)) | 1028 | if (PageAnon(page)) |
998 | page->mapping = NULL; | 1029 | page->mapping = NULL; |
@@ -1007,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1007 | kernel_map_pages(page, 1, 0); | 1038 | kernel_map_pages(page, 1, 0); |
1008 | 1039 | ||
1009 | pcp = &zone_pcp(zone, get_cpu())->pcp; | 1040 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
1041 | set_page_private(page, get_pageblock_migratetype(page)); | ||
1010 | local_irq_save(flags); | 1042 | local_irq_save(flags); |
1043 | if (unlikely(clearMlocked)) | ||
1044 | free_page_mlock(page); | ||
1011 | __count_vm_event(PGFREE); | 1045 | __count_vm_event(PGFREE); |
1046 | |||
1012 | if (cold) | 1047 | if (cold) |
1013 | list_add_tail(&page->lru, &pcp->list); | 1048 | list_add_tail(&page->lru, &pcp->list); |
1014 | else | 1049 | else |
1015 | list_add(&page->lru, &pcp->list); | 1050 | list_add(&page->lru, &pcp->list); |
1016 | set_page_private(page, get_pageblock_migratetype(page)); | ||
1017 | pcp->count++; | 1051 | pcp->count++; |
1018 | if (pcp->count >= pcp->high) { | 1052 | if (pcp->count >= pcp->high) { |
1019 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1053 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
@@ -1047,6 +1081,16 @@ void split_page(struct page *page, unsigned int order) | |||
1047 | 1081 | ||
1048 | VM_BUG_ON(PageCompound(page)); | 1082 | VM_BUG_ON(PageCompound(page)); |
1049 | VM_BUG_ON(!page_count(page)); | 1083 | VM_BUG_ON(!page_count(page)); |
1084 | |||
1085 | #ifdef CONFIG_KMEMCHECK | ||
1086 | /* | ||
1087 | * Split shadow pages too, because free(page[0]) would | ||
1088 | * otherwise free the whole shadow. | ||
1089 | */ | ||
1090 | if (kmemcheck_page_is_tracked(page)) | ||
1091 | split_page(virt_to_page(page[0].shadow), order); | ||
1092 | #endif | ||
1093 | |||
1050 | for (i = 1; i < (1 << order); i++) | 1094 | for (i = 1; i < (1 << order); i++) |
1051 | set_page_refcounted(page + i); | 1095 | set_page_refcounted(page + i); |
1052 | } | 1096 | } |
@@ -1056,14 +1100,15 @@ void split_page(struct page *page, unsigned int order) | |||
1056 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1100 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1057 | * or two. | 1101 | * or two. |
1058 | */ | 1102 | */ |
1059 | static struct page *buffered_rmqueue(struct zone *preferred_zone, | 1103 | static inline |
1060 | struct zone *zone, int order, gfp_t gfp_flags) | 1104 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1105 | struct zone *zone, int order, gfp_t gfp_flags, | ||
1106 | int migratetype) | ||
1061 | { | 1107 | { |
1062 | unsigned long flags; | 1108 | unsigned long flags; |
1063 | struct page *page; | 1109 | struct page *page; |
1064 | int cold = !!(gfp_flags & __GFP_COLD); | 1110 | int cold = !!(gfp_flags & __GFP_COLD); |
1065 | int cpu; | 1111 | int cpu; |
1066 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
1067 | 1112 | ||
1068 | again: | 1113 | again: |
1069 | cpu = get_cpu(); | 1114 | cpu = get_cpu(); |
@@ -1100,8 +1145,22 @@ again: | |||
1100 | list_del(&page->lru); | 1145 | list_del(&page->lru); |
1101 | pcp->count--; | 1146 | pcp->count--; |
1102 | } else { | 1147 | } else { |
1148 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | ||
1149 | /* | ||
1150 | * __GFP_NOFAIL is not to be used in new code. | ||
1151 | * | ||
1152 | * All __GFP_NOFAIL callers should be fixed so that they | ||
1153 | * properly detect and handle allocation failures. | ||
1154 | * | ||
1155 | * We most definitely don't want callers attempting to | ||
1156 | * allocate greater than single-page units with | ||
1157 | * __GFP_NOFAIL. | ||
1158 | */ | ||
1159 | WARN_ON_ONCE(order > 0); | ||
1160 | } | ||
1103 | spin_lock_irqsave(&zone->lock, flags); | 1161 | spin_lock_irqsave(&zone->lock, flags); |
1104 | page = __rmqueue(zone, order, migratetype); | 1162 | page = __rmqueue(zone, order, migratetype); |
1163 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
1105 | spin_unlock(&zone->lock); | 1164 | spin_unlock(&zone->lock); |
1106 | if (!page) | 1165 | if (!page) |
1107 | goto failed; | 1166 | goto failed; |
@@ -1123,10 +1182,15 @@ failed: | |||
1123 | return NULL; | 1182 | return NULL; |
1124 | } | 1183 | } |
1125 | 1184 | ||
1126 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 1185 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
1127 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 1186 | #define ALLOC_WMARK_MIN WMARK_MIN |
1128 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 1187 | #define ALLOC_WMARK_LOW WMARK_LOW |
1129 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 1188 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
1189 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1190 | |||
1191 | /* Mask to get the watermark bits */ | ||
1192 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1193 | |||
1130 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1194 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
1131 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1195 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
1132 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1196 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
@@ -1384,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1384 | */ | 1448 | */ |
1385 | static struct page * | 1449 | static struct page * |
1386 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1450 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1387 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) | 1451 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1452 | struct zone *preferred_zone, int migratetype) | ||
1388 | { | 1453 | { |
1389 | struct zoneref *z; | 1454 | struct zoneref *z; |
1390 | struct page *page = NULL; | 1455 | struct page *page = NULL; |
1391 | int classzone_idx; | 1456 | int classzone_idx; |
1392 | struct zone *zone, *preferred_zone; | 1457 | struct zone *zone; |
1393 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1458 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1394 | int zlc_active = 0; /* set if using zonelist_cache */ | 1459 | int zlc_active = 0; /* set if using zonelist_cache */ |
1395 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1460 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1396 | 1461 | ||
1397 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | ||
1398 | &preferred_zone); | ||
1399 | if (!preferred_zone) | ||
1400 | return NULL; | ||
1401 | |||
1402 | classzone_idx = zone_idx(preferred_zone); | 1462 | classzone_idx = zone_idx(preferred_zone); |
1403 | |||
1404 | zonelist_scan: | 1463 | zonelist_scan: |
1405 | /* | 1464 | /* |
1406 | * Scan zonelist, looking for a zone with enough free. | 1465 | * Scan zonelist, looking for a zone with enough free. |
@@ -1415,31 +1474,49 @@ zonelist_scan: | |||
1415 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1474 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1416 | goto try_next_zone; | 1475 | goto try_next_zone; |
1417 | 1476 | ||
1477 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1418 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1478 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1419 | unsigned long mark; | 1479 | unsigned long mark; |
1420 | if (alloc_flags & ALLOC_WMARK_MIN) | 1480 | int ret; |
1421 | mark = zone->pages_min; | 1481 | |
1422 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1482 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1423 | mark = zone->pages_low; | 1483 | if (zone_watermark_ok(zone, order, mark, |
1424 | else | 1484 | classzone_idx, alloc_flags)) |
1425 | mark = zone->pages_high; | 1485 | goto try_this_zone; |
1426 | if (!zone_watermark_ok(zone, order, mark, | 1486 | |
1427 | classzone_idx, alloc_flags)) { | 1487 | if (zone_reclaim_mode == 0) |
1428 | if (!zone_reclaim_mode || | 1488 | goto this_zone_full; |
1429 | !zone_reclaim(zone, gfp_mask, order)) | 1489 | |
1490 | ret = zone_reclaim(zone, gfp_mask, order); | ||
1491 | switch (ret) { | ||
1492 | case ZONE_RECLAIM_NOSCAN: | ||
1493 | /* did not scan */ | ||
1494 | goto try_next_zone; | ||
1495 | case ZONE_RECLAIM_FULL: | ||
1496 | /* scanned but unreclaimable */ | ||
1497 | goto this_zone_full; | ||
1498 | default: | ||
1499 | /* did we reclaim enough */ | ||
1500 | if (!zone_watermark_ok(zone, order, mark, | ||
1501 | classzone_idx, alloc_flags)) | ||
1430 | goto this_zone_full; | 1502 | goto this_zone_full; |
1431 | } | 1503 | } |
1432 | } | 1504 | } |
1433 | 1505 | ||
1434 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); | 1506 | try_this_zone: |
1507 | page = buffered_rmqueue(preferred_zone, zone, order, | ||
1508 | gfp_mask, migratetype); | ||
1435 | if (page) | 1509 | if (page) |
1436 | break; | 1510 | break; |
1437 | this_zone_full: | 1511 | this_zone_full: |
1438 | if (NUMA_BUILD) | 1512 | if (NUMA_BUILD) |
1439 | zlc_mark_zone_full(zonelist, z); | 1513 | zlc_mark_zone_full(zonelist, z); |
1440 | try_next_zone: | 1514 | try_next_zone: |
1441 | if (NUMA_BUILD && !did_zlc_setup) { | 1515 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
1442 | /* we do zlc_setup after the first zone is tried */ | 1516 | /* |
1517 | * we do zlc_setup after the first zone is tried but only | ||
1518 | * if there are multiple nodes make it worthwhile | ||
1519 | */ | ||
1443 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1520 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1444 | zlc_active = 1; | 1521 | zlc_active = 1; |
1445 | did_zlc_setup = 1; | 1522 | did_zlc_setup = 1; |
@@ -1454,47 +1531,217 @@ try_next_zone: | |||
1454 | return page; | 1531 | return page; |
1455 | } | 1532 | } |
1456 | 1533 | ||
1534 | static inline int | ||
1535 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | ||
1536 | unsigned long pages_reclaimed) | ||
1537 | { | ||
1538 | /* Do not loop if specifically requested */ | ||
1539 | if (gfp_mask & __GFP_NORETRY) | ||
1540 | return 0; | ||
1541 | |||
1542 | /* | ||
1543 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1544 | * means __GFP_NOFAIL, but that may not be true in other | ||
1545 | * implementations. | ||
1546 | */ | ||
1547 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
1548 | return 1; | ||
1549 | |||
1550 | /* | ||
1551 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1552 | * specified, then we retry until we no longer reclaim any pages | ||
1553 | * (above), or we've reclaimed an order of pages at least as | ||
1554 | * large as the allocation's order. In both cases, if the | ||
1555 | * allocation still fails, we stop retrying. | ||
1556 | */ | ||
1557 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
1558 | return 1; | ||
1559 | |||
1560 | /* | ||
1561 | * Don't let big-order allocations loop unless the caller | ||
1562 | * explicitly requests that. | ||
1563 | */ | ||
1564 | if (gfp_mask & __GFP_NOFAIL) | ||
1565 | return 1; | ||
1566 | |||
1567 | return 0; | ||
1568 | } | ||
1569 | |||
1570 | static inline struct page * | ||
1571 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | ||
1572 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1573 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1574 | int migratetype) | ||
1575 | { | ||
1576 | struct page *page; | ||
1577 | |||
1578 | /* Acquire the OOM killer lock for the zones in zonelist */ | ||
1579 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1580 | schedule_timeout_uninterruptible(1); | ||
1581 | return NULL; | ||
1582 | } | ||
1583 | |||
1584 | /* | ||
1585 | * Go through the zonelist yet one more time, keep very high watermark | ||
1586 | * here, this is only to catch a parallel oom killing, we must fail if | ||
1587 | * we're still under heavy pressure. | ||
1588 | */ | ||
1589 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1590 | order, zonelist, high_zoneidx, | ||
1591 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
1592 | preferred_zone, migratetype); | ||
1593 | if (page) | ||
1594 | goto out; | ||
1595 | |||
1596 | /* The OOM killer will not help higher order allocs */ | ||
1597 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | ||
1598 | goto out; | ||
1599 | |||
1600 | /* Exhausted what can be done so it's blamo time */ | ||
1601 | out_of_memory(zonelist, gfp_mask, order); | ||
1602 | |||
1603 | out: | ||
1604 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1605 | return page; | ||
1606 | } | ||
1607 | |||
1608 | /* The really slow allocator path where we enter direct reclaim */ | ||
1609 | static inline struct page * | ||
1610 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
1611 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1612 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
1613 | int migratetype, unsigned long *did_some_progress) | ||
1614 | { | ||
1615 | struct page *page = NULL; | ||
1616 | struct reclaim_state reclaim_state; | ||
1617 | struct task_struct *p = current; | ||
1618 | |||
1619 | cond_resched(); | ||
1620 | |||
1621 | /* We now go into synchronous reclaim */ | ||
1622 | cpuset_memory_pressure_bump(); | ||
1623 | |||
1624 | /* | ||
1625 | * The task's cpuset might have expanded its set of allowable nodes | ||
1626 | */ | ||
1627 | p->flags |= PF_MEMALLOC; | ||
1628 | lockdep_set_current_reclaim_state(gfp_mask); | ||
1629 | reclaim_state.reclaimed_slab = 0; | ||
1630 | p->reclaim_state = &reclaim_state; | ||
1631 | |||
1632 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | ||
1633 | |||
1634 | p->reclaim_state = NULL; | ||
1635 | lockdep_clear_current_reclaim_state(); | ||
1636 | p->flags &= ~PF_MEMALLOC; | ||
1637 | |||
1638 | cond_resched(); | ||
1639 | |||
1640 | if (order != 0) | ||
1641 | drain_all_pages(); | ||
1642 | |||
1643 | if (likely(*did_some_progress)) | ||
1644 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1645 | zonelist, high_zoneidx, | ||
1646 | alloc_flags, preferred_zone, | ||
1647 | migratetype); | ||
1648 | return page; | ||
1649 | } | ||
1650 | |||
1457 | /* | 1651 | /* |
1458 | * This is the 'heart' of the zoned buddy allocator. | 1652 | * This is called in the allocator slow-path if the allocation request is of |
1653 | * sufficient urgency to ignore watermarks and take other desperate measures | ||
1459 | */ | 1654 | */ |
1460 | struct page * | 1655 | static inline struct page * |
1461 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1656 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
1462 | struct zonelist *zonelist, nodemask_t *nodemask) | 1657 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1658 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1659 | int migratetype) | ||
1660 | { | ||
1661 | struct page *page; | ||
1662 | |||
1663 | do { | ||
1664 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1665 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | ||
1666 | preferred_zone, migratetype); | ||
1667 | |||
1668 | if (!page && gfp_mask & __GFP_NOFAIL) | ||
1669 | congestion_wait(WRITE, HZ/50); | ||
1670 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | ||
1671 | |||
1672 | return page; | ||
1673 | } | ||
1674 | |||
1675 | static inline | ||
1676 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | ||
1677 | enum zone_type high_zoneidx) | ||
1463 | { | 1678 | { |
1464 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1465 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1466 | struct zoneref *z; | 1679 | struct zoneref *z; |
1467 | struct zone *zone; | 1680 | struct zone *zone; |
1468 | struct page *page; | ||
1469 | struct reclaim_state reclaim_state; | ||
1470 | struct task_struct *p = current; | ||
1471 | int do_retry; | ||
1472 | int alloc_flags; | ||
1473 | unsigned long did_some_progress; | ||
1474 | unsigned long pages_reclaimed = 0; | ||
1475 | 1681 | ||
1476 | lockdep_trace_alloc(gfp_mask); | 1682 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1683 | wakeup_kswapd(zone, order); | ||
1684 | } | ||
1477 | 1685 | ||
1478 | might_sleep_if(wait); | 1686 | static inline int |
1687 | gfp_to_alloc_flags(gfp_t gfp_mask) | ||
1688 | { | ||
1689 | struct task_struct *p = current; | ||
1690 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | ||
1691 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1479 | 1692 | ||
1480 | if (should_fail_alloc_page(gfp_mask, order)) | 1693 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
1481 | return NULL; | 1694 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); |
1482 | 1695 | ||
1483 | restart: | 1696 | /* |
1484 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ | 1697 | * The caller may dip into page reserves a bit more if the caller |
1698 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
1699 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
1700 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
1701 | */ | ||
1702 | alloc_flags |= (gfp_mask & __GFP_HIGH); | ||
1485 | 1703 | ||
1486 | if (unlikely(!z->zone)) { | 1704 | if (!wait) { |
1705 | alloc_flags |= ALLOC_HARDER; | ||
1487 | /* | 1706 | /* |
1488 | * Happens if we have an empty zonelist as a result of | 1707 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1489 | * GFP_THISNODE being used on a memoryless node | 1708 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1490 | */ | 1709 | */ |
1491 | return NULL; | 1710 | alloc_flags &= ~ALLOC_CPUSET; |
1711 | } else if (unlikely(rt_task(p))) | ||
1712 | alloc_flags |= ALLOC_HARDER; | ||
1713 | |||
1714 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | ||
1715 | if (!in_interrupt() && | ||
1716 | ((p->flags & PF_MEMALLOC) || | ||
1717 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
1718 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
1492 | } | 1719 | } |
1493 | 1720 | ||
1494 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1721 | return alloc_flags; |
1495 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1722 | } |
1496 | if (page) | 1723 | |
1497 | goto got_pg; | 1724 | static inline struct page * |
1725 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | ||
1726 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1727 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
1728 | int migratetype) | ||
1729 | { | ||
1730 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1731 | struct page *page = NULL; | ||
1732 | int alloc_flags; | ||
1733 | unsigned long pages_reclaimed = 0; | ||
1734 | unsigned long did_some_progress; | ||
1735 | struct task_struct *p = current; | ||
1736 | |||
1737 | /* | ||
1738 | * In the slowpath, we sanity check order to avoid ever trying to | ||
1739 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | ||
1740 | * be using allocators in order of preference for an area that is | ||
1741 | * too large. | ||
1742 | */ | ||
1743 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | ||
1744 | return NULL; | ||
1498 | 1745 | ||
1499 | /* | 1746 | /* |
1500 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1747 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
@@ -1507,154 +1754,83 @@ restart: | |||
1507 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1754 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1508 | goto nopage; | 1755 | goto nopage; |
1509 | 1756 | ||
1510 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1757 | wake_all_kswapd(order, zonelist, high_zoneidx); |
1511 | wakeup_kswapd(zone, order); | ||
1512 | 1758 | ||
1513 | /* | 1759 | /* |
1514 | * OK, we're below the kswapd watermark and have kicked background | 1760 | * OK, we're below the kswapd watermark and have kicked background |
1515 | * reclaim. Now things get more complex, so set up alloc_flags according | 1761 | * reclaim. Now things get more complex, so set up alloc_flags according |
1516 | * to how we want to proceed. | 1762 | * to how we want to proceed. |
1517 | * | ||
1518 | * The caller may dip into page reserves a bit more if the caller | ||
1519 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
1520 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
1521 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
1522 | */ | 1763 | */ |
1523 | alloc_flags = ALLOC_WMARK_MIN; | 1764 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
1524 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | ||
1525 | alloc_flags |= ALLOC_HARDER; | ||
1526 | if (gfp_mask & __GFP_HIGH) | ||
1527 | alloc_flags |= ALLOC_HIGH; | ||
1528 | if (wait) | ||
1529 | alloc_flags |= ALLOC_CPUSET; | ||
1530 | 1765 | ||
1531 | /* | 1766 | restart: |
1532 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1767 | /* This is the last chance, in general, before the goto nopage. */ |
1533 | * coming from realtime tasks go deeper into reserves. | ||
1534 | * | ||
1535 | * This is the last chance, in general, before the goto nopage. | ||
1536 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | ||
1537 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | ||
1538 | */ | ||
1539 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 1768 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
1540 | high_zoneidx, alloc_flags); | 1769 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
1770 | preferred_zone, migratetype); | ||
1541 | if (page) | 1771 | if (page) |
1542 | goto got_pg; | 1772 | goto got_pg; |
1543 | 1773 | ||
1544 | /* This allocation should allow future memory freeing. */ | ||
1545 | |||
1546 | rebalance: | 1774 | rebalance: |
1547 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1775 | /* Allocate without watermarks if the context allows */ |
1548 | && !in_interrupt()) { | 1776 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
1549 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1777 | page = __alloc_pages_high_priority(gfp_mask, order, |
1550 | nofail_alloc: | 1778 | zonelist, high_zoneidx, nodemask, |
1551 | /* go through the zonelist yet again, ignoring mins */ | 1779 | preferred_zone, migratetype); |
1552 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1780 | if (page) |
1553 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | 1781 | goto got_pg; |
1554 | if (page) | ||
1555 | goto got_pg; | ||
1556 | if (gfp_mask & __GFP_NOFAIL) { | ||
1557 | congestion_wait(WRITE, HZ/50); | ||
1558 | goto nofail_alloc; | ||
1559 | } | ||
1560 | } | ||
1561 | goto nopage; | ||
1562 | } | 1782 | } |
1563 | 1783 | ||
1564 | /* Atomic allocations - we can't balance anything */ | 1784 | /* Atomic allocations - we can't balance anything */ |
1565 | if (!wait) | 1785 | if (!wait) |
1566 | goto nopage; | 1786 | goto nopage; |
1567 | 1787 | ||
1568 | cond_resched(); | 1788 | /* Avoid recursion of direct reclaim */ |
1789 | if (p->flags & PF_MEMALLOC) | ||
1790 | goto nopage; | ||
1791 | |||
1792 | /* Try direct reclaim and then allocating */ | ||
1793 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | ||
1794 | zonelist, high_zoneidx, | ||
1795 | nodemask, | ||
1796 | alloc_flags, preferred_zone, | ||
1797 | migratetype, &did_some_progress); | ||
1798 | if (page) | ||
1799 | goto got_pg; | ||
1569 | 1800 | ||
1570 | /* We now go into synchronous reclaim */ | ||
1571 | cpuset_memory_pressure_bump(); | ||
1572 | /* | 1801 | /* |
1573 | * The task's cpuset might have expanded its set of allowable nodes | 1802 | * If we failed to make any progress reclaiming, then we are |
1803 | * running out of options and have to consider going OOM | ||
1574 | */ | 1804 | */ |
1575 | cpuset_update_task_memory_state(); | 1805 | if (!did_some_progress) { |
1576 | p->flags |= PF_MEMALLOC; | 1806 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1577 | 1807 | if (oom_killer_disabled) | |
1578 | lockdep_set_current_reclaim_state(gfp_mask); | 1808 | goto nopage; |
1579 | reclaim_state.reclaimed_slab = 0; | 1809 | page = __alloc_pages_may_oom(gfp_mask, order, |
1580 | p->reclaim_state = &reclaim_state; | 1810 | zonelist, high_zoneidx, |
1581 | 1811 | nodemask, preferred_zone, | |
1582 | did_some_progress = try_to_free_pages(zonelist, order, | 1812 | migratetype); |
1583 | gfp_mask, nodemask); | 1813 | if (page) |
1584 | 1814 | goto got_pg; | |
1585 | p->reclaim_state = NULL; | ||
1586 | lockdep_clear_current_reclaim_state(); | ||
1587 | p->flags &= ~PF_MEMALLOC; | ||
1588 | |||
1589 | cond_resched(); | ||
1590 | 1815 | ||
1591 | if (order != 0) | 1816 | /* |
1592 | drain_all_pages(); | 1817 | * The OOM killer does not trigger for high-order |
1818 | * ~__GFP_NOFAIL allocations so if no progress is being | ||
1819 | * made, there are no other options and retrying is | ||
1820 | * unlikely to help. | ||
1821 | */ | ||
1822 | if (order > PAGE_ALLOC_COSTLY_ORDER && | ||
1823 | !(gfp_mask & __GFP_NOFAIL)) | ||
1824 | goto nopage; | ||
1593 | 1825 | ||
1594 | if (likely(did_some_progress)) { | ||
1595 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1596 | zonelist, high_zoneidx, alloc_flags); | ||
1597 | if (page) | ||
1598 | goto got_pg; | ||
1599 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
1600 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1601 | schedule_timeout_uninterruptible(1); | ||
1602 | goto restart; | 1826 | goto restart; |
1603 | } | 1827 | } |
1604 | |||
1605 | /* | ||
1606 | * Go through the zonelist yet one more time, keep | ||
1607 | * very high watermark here, this is only to catch | ||
1608 | * a parallel oom killing, we must fail if we're still | ||
1609 | * under heavy pressure. | ||
1610 | */ | ||
1611 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1612 | order, zonelist, high_zoneidx, | ||
1613 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
1614 | if (page) { | ||
1615 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1616 | goto got_pg; | ||
1617 | } | ||
1618 | |||
1619 | /* The OOM killer will not help higher order allocs so fail */ | ||
1620 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | ||
1621 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1622 | goto nopage; | ||
1623 | } | ||
1624 | |||
1625 | out_of_memory(zonelist, gfp_mask, order); | ||
1626 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1627 | goto restart; | ||
1628 | } | 1828 | } |
1629 | 1829 | ||
1630 | /* | 1830 | /* Check if we should retry the allocation */ |
1631 | * Don't let big-order allocations loop unless the caller explicitly | ||
1632 | * requests that. Wait for some write requests to complete then retry. | ||
1633 | * | ||
1634 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1635 | * means __GFP_NOFAIL, but that may not be true in other | ||
1636 | * implementations. | ||
1637 | * | ||
1638 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1639 | * specified, then we retry until we no longer reclaim any pages | ||
1640 | * (above), or we've reclaimed an order of pages at least as | ||
1641 | * large as the allocation's order. In both cases, if the | ||
1642 | * allocation still fails, we stop retrying. | ||
1643 | */ | ||
1644 | pages_reclaimed += did_some_progress; | 1831 | pages_reclaimed += did_some_progress; |
1645 | do_retry = 0; | 1832 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
1646 | if (!(gfp_mask & __GFP_NORETRY)) { | 1833 | /* Wait for some write requests to complete then retry */ |
1647 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | ||
1648 | do_retry = 1; | ||
1649 | } else { | ||
1650 | if (gfp_mask & __GFP_REPEAT && | ||
1651 | pages_reclaimed < (1 << order)) | ||
1652 | do_retry = 1; | ||
1653 | } | ||
1654 | if (gfp_mask & __GFP_NOFAIL) | ||
1655 | do_retry = 1; | ||
1656 | } | ||
1657 | if (do_retry) { | ||
1658 | congestion_wait(WRITE, HZ/50); | 1834 | congestion_wait(WRITE, HZ/50); |
1659 | goto rebalance; | 1835 | goto rebalance; |
1660 | } | 1836 | } |
@@ -1667,10 +1843,58 @@ nopage: | |||
1667 | dump_stack(); | 1843 | dump_stack(); |
1668 | show_mem(); | 1844 | show_mem(); |
1669 | } | 1845 | } |
1846 | return page; | ||
1670 | got_pg: | 1847 | got_pg: |
1848 | if (kmemcheck_enabled) | ||
1849 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
1850 | return page; | ||
1851 | |||
1852 | } | ||
1853 | |||
1854 | /* | ||
1855 | * This is the 'heart' of the zoned buddy allocator. | ||
1856 | */ | ||
1857 | struct page * | ||
1858 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1859 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1860 | { | ||
1861 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1862 | struct zone *preferred_zone; | ||
1863 | struct page *page; | ||
1864 | int migratetype = allocflags_to_migratetype(gfp_mask); | ||
1865 | |||
1866 | lockdep_trace_alloc(gfp_mask); | ||
1867 | |||
1868 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
1869 | |||
1870 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1871 | return NULL; | ||
1872 | |||
1873 | /* | ||
1874 | * Check the zones suitable for the gfp_mask contain at least one | ||
1875 | * valid zone. It's possible to have an empty zonelist as a result | ||
1876 | * of GFP_THISNODE and a memoryless node | ||
1877 | */ | ||
1878 | if (unlikely(!zonelist->_zonerefs->zone)) | ||
1879 | return NULL; | ||
1880 | |||
1881 | /* The preferred zone is used for statistics later */ | ||
1882 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | ||
1883 | if (!preferred_zone) | ||
1884 | return NULL; | ||
1885 | |||
1886 | /* First allocation attempt */ | ||
1887 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | ||
1888 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | ||
1889 | preferred_zone, migratetype); | ||
1890 | if (unlikely(!page)) | ||
1891 | page = __alloc_pages_slowpath(gfp_mask, order, | ||
1892 | zonelist, high_zoneidx, nodemask, | ||
1893 | preferred_zone, migratetype); | ||
1894 | |||
1671 | return page; | 1895 | return page; |
1672 | } | 1896 | } |
1673 | EXPORT_SYMBOL(__alloc_pages_internal); | 1897 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
1674 | 1898 | ||
1675 | /* | 1899 | /* |
1676 | * Common helper functions. | 1900 | * Common helper functions. |
@@ -1799,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset) | |||
1799 | 2023 | ||
1800 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2024 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
1801 | unsigned long size = zone->present_pages; | 2025 | unsigned long size = zone->present_pages; |
1802 | unsigned long high = zone->pages_high; | 2026 | unsigned long high = high_wmark_pages(zone); |
1803 | if (size > high) | 2027 | if (size > high) |
1804 | sum += size - high; | 2028 | sum += size - high; |
1805 | } | 2029 | } |
@@ -1891,19 +2115,14 @@ void show_free_areas(void) | |||
1891 | 2115 | ||
1892 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" | 2116 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
1893 | " inactive_file:%lu" | 2117 | " inactive_file:%lu" |
1894 | //TODO: check/adjust line lengths | ||
1895 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1896 | " unevictable:%lu" | 2118 | " unevictable:%lu" |
1897 | #endif | ||
1898 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2119 | " dirty:%lu writeback:%lu unstable:%lu\n" |
1899 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 2120 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1900 | global_page_state(NR_ACTIVE_ANON), | 2121 | global_page_state(NR_ACTIVE_ANON), |
1901 | global_page_state(NR_ACTIVE_FILE), | 2122 | global_page_state(NR_ACTIVE_FILE), |
1902 | global_page_state(NR_INACTIVE_ANON), | 2123 | global_page_state(NR_INACTIVE_ANON), |
1903 | global_page_state(NR_INACTIVE_FILE), | 2124 | global_page_state(NR_INACTIVE_FILE), |
1904 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1905 | global_page_state(NR_UNEVICTABLE), | 2125 | global_page_state(NR_UNEVICTABLE), |
1906 | #endif | ||
1907 | global_page_state(NR_FILE_DIRTY), | 2126 | global_page_state(NR_FILE_DIRTY), |
1908 | global_page_state(NR_WRITEBACK), | 2127 | global_page_state(NR_WRITEBACK), |
1909 | global_page_state(NR_UNSTABLE_NFS), | 2128 | global_page_state(NR_UNSTABLE_NFS), |
@@ -1927,25 +2146,21 @@ void show_free_areas(void) | |||
1927 | " inactive_anon:%lukB" | 2146 | " inactive_anon:%lukB" |
1928 | " active_file:%lukB" | 2147 | " active_file:%lukB" |
1929 | " inactive_file:%lukB" | 2148 | " inactive_file:%lukB" |
1930 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1931 | " unevictable:%lukB" | 2149 | " unevictable:%lukB" |
1932 | #endif | ||
1933 | " present:%lukB" | 2150 | " present:%lukB" |
1934 | " pages_scanned:%lu" | 2151 | " pages_scanned:%lu" |
1935 | " all_unreclaimable? %s" | 2152 | " all_unreclaimable? %s" |
1936 | "\n", | 2153 | "\n", |
1937 | zone->name, | 2154 | zone->name, |
1938 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2155 | K(zone_page_state(zone, NR_FREE_PAGES)), |
1939 | K(zone->pages_min), | 2156 | K(min_wmark_pages(zone)), |
1940 | K(zone->pages_low), | 2157 | K(low_wmark_pages(zone)), |
1941 | K(zone->pages_high), | 2158 | K(high_wmark_pages(zone)), |
1942 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2159 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
1943 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2160 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
1944 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2161 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
1945 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2162 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
1946 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1947 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2163 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
1948 | #endif | ||
1949 | K(zone->present_pages), | 2164 | K(zone->present_pages), |
1950 | zone->pages_scanned, | 2165 | zone->pages_scanned, |
1951 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2166 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
@@ -2103,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2103 | } | 2318 | } |
2104 | 2319 | ||
2105 | 2320 | ||
2106 | #define MAX_NODE_LOAD (num_online_nodes()) | 2321 | #define MAX_NODE_LOAD (nr_online_nodes) |
2107 | static int node_load[MAX_NUMNODES]; | 2322 | static int node_load[MAX_NUMNODES]; |
2108 | 2323 | ||
2109 | /** | 2324 | /** |
@@ -2312,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2312 | 2527 | ||
2313 | /* NUMA-aware ordering of nodes */ | 2528 | /* NUMA-aware ordering of nodes */ |
2314 | local_node = pgdat->node_id; | 2529 | local_node = pgdat->node_id; |
2315 | load = num_online_nodes(); | 2530 | load = nr_online_nodes; |
2316 | prev_node = local_node; | 2531 | prev_node = local_node; |
2317 | nodes_clear(used_mask); | 2532 | nodes_clear(used_mask); |
2318 | 2533 | ||
@@ -2463,7 +2678,7 @@ void build_all_zonelists(void) | |||
2463 | 2678 | ||
2464 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 2679 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
2465 | "Total pages: %ld\n", | 2680 | "Total pages: %ld\n", |
2466 | num_online_nodes(), | 2681 | nr_online_nodes, |
2467 | zonelist_order_name[current_zonelist_order], | 2682 | zonelist_order_name[current_zonelist_order], |
2468 | page_group_by_mobility_disabled ? "off" : "on", | 2683 | page_group_by_mobility_disabled ? "off" : "on", |
2469 | vm_total_pages); | 2684 | vm_total_pages); |
@@ -2542,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
2542 | 2757 | ||
2543 | /* | 2758 | /* |
2544 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 2759 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
2545 | * of blocks reserved is based on zone->pages_min. The memory within the | 2760 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
2546 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | 2761 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
2547 | * higher will lead to a bigger reserve which will get freed as contiguous | 2762 | * higher will lead to a bigger reserve which will get freed as contiguous |
2548 | * blocks as reclaim kicks in | 2763 | * blocks as reclaim kicks in |
2549 | */ | 2764 | */ |
@@ -2556,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2556 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 2771 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
2557 | start_pfn = zone->zone_start_pfn; | 2772 | start_pfn = zone->zone_start_pfn; |
2558 | end_pfn = start_pfn + zone->spanned_pages; | 2773 | end_pfn = start_pfn + zone->spanned_pages; |
2559 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | 2774 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
2560 | pageblock_order; | 2775 | pageblock_order; |
2561 | 2776 | ||
2562 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 2777 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
@@ -3488,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3488 | zone_pcp_init(zone); | 3703 | zone_pcp_init(zone); |
3489 | for_each_lru(l) { | 3704 | for_each_lru(l) { |
3490 | INIT_LIST_HEAD(&zone->lru[l].list); | 3705 | INIT_LIST_HEAD(&zone->lru[l].list); |
3491 | zone->lru[l].nr_scan = 0; | 3706 | zone->lru[l].nr_saved_scan = 0; |
3492 | } | 3707 | } |
3493 | zone->reclaim_stat.recent_rotated[0] = 0; | 3708 | zone->reclaim_stat.recent_rotated[0] = 0; |
3494 | zone->reclaim_stat.recent_rotated[1] = 0; | 3709 | zone->reclaim_stat.recent_rotated[1] = 0; |
@@ -4025,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4025 | early_node_map[i].start_pfn, | 4240 | early_node_map[i].start_pfn, |
4026 | early_node_map[i].end_pfn); | 4241 | early_node_map[i].end_pfn); |
4027 | 4242 | ||
4243 | /* | ||
4244 | * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init | ||
4245 | * that node_mask, clear it at first | ||
4246 | */ | ||
4247 | nodes_clear(node_states[N_HIGH_MEMORY]); | ||
4028 | /* Initialise every node */ | 4248 | /* Initialise every node */ |
4029 | mminit_verify_pageflags_layout(); | 4249 | mminit_verify_pageflags_layout(); |
4030 | setup_nr_node_ids(); | 4250 | setup_nr_node_ids(); |
@@ -4159,8 +4379,8 @@ static void calculate_totalreserve_pages(void) | |||
4159 | max = zone->lowmem_reserve[j]; | 4379 | max = zone->lowmem_reserve[j]; |
4160 | } | 4380 | } |
4161 | 4381 | ||
4162 | /* we treat pages_high as reserved pages. */ | 4382 | /* we treat the high watermark as reserved pages. */ |
4163 | max += zone->pages_high; | 4383 | max += high_wmark_pages(zone); |
4164 | 4384 | ||
4165 | if (max > zone->present_pages) | 4385 | if (max > zone->present_pages) |
4166 | max = zone->present_pages; | 4386 | max = zone->present_pages; |
@@ -4210,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void) | |||
4210 | } | 4430 | } |
4211 | 4431 | ||
4212 | /** | 4432 | /** |
4213 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 4433 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
4434 | * or when memory is hot-{added|removed} | ||
4214 | * | 4435 | * |
4215 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 4436 | * Ensures that the watermark[min,low,high] values for each zone are set |
4216 | * with respect to min_free_kbytes. | 4437 | * correctly with respect to min_free_kbytes. |
4217 | */ | 4438 | */ |
4218 | void setup_per_zone_pages_min(void) | 4439 | void setup_per_zone_wmarks(void) |
4219 | { | 4440 | { |
4220 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 4441 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4221 | unsigned long lowmem_pages = 0; | 4442 | unsigned long lowmem_pages = 0; |
@@ -4240,7 +4461,7 @@ void setup_per_zone_pages_min(void) | |||
4240 | * need highmem pages, so cap pages_min to a small | 4461 | * need highmem pages, so cap pages_min to a small |
4241 | * value here. | 4462 | * value here. |
4242 | * | 4463 | * |
4243 | * The (pages_high-pages_low) and (pages_low-pages_min) | 4464 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
4244 | * deltas controls asynch page reclaim, and so should | 4465 | * deltas controls asynch page reclaim, and so should |
4245 | * not be capped for highmem. | 4466 | * not be capped for highmem. |
4246 | */ | 4467 | */ |
@@ -4251,17 +4472,17 @@ void setup_per_zone_pages_min(void) | |||
4251 | min_pages = SWAP_CLUSTER_MAX; | 4472 | min_pages = SWAP_CLUSTER_MAX; |
4252 | if (min_pages > 128) | 4473 | if (min_pages > 128) |
4253 | min_pages = 128; | 4474 | min_pages = 128; |
4254 | zone->pages_min = min_pages; | 4475 | zone->watermark[WMARK_MIN] = min_pages; |
4255 | } else { | 4476 | } else { |
4256 | /* | 4477 | /* |
4257 | * If it's a lowmem zone, reserve a number of pages | 4478 | * If it's a lowmem zone, reserve a number of pages |
4258 | * proportionate to the zone's size. | 4479 | * proportionate to the zone's size. |
4259 | */ | 4480 | */ |
4260 | zone->pages_min = tmp; | 4481 | zone->watermark[WMARK_MIN] = tmp; |
4261 | } | 4482 | } |
4262 | 4483 | ||
4263 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4484 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
4264 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4485 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
4265 | setup_zone_migrate_reserve(zone); | 4486 | setup_zone_migrate_reserve(zone); |
4266 | spin_unlock_irqrestore(&zone->lock, flags); | 4487 | spin_unlock_irqrestore(&zone->lock, flags); |
4267 | } | 4488 | } |
@@ -4271,8 +4492,6 @@ void setup_per_zone_pages_min(void) | |||
4271 | } | 4492 | } |
4272 | 4493 | ||
4273 | /** | 4494 | /** |
4274 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
4275 | * | ||
4276 | * The inactive anon list should be small enough that the VM never has to | 4495 | * The inactive anon list should be small enough that the VM never has to |
4277 | * do too much work, but large enough that each inactive page has a chance | 4496 | * do too much work, but large enough that each inactive page has a chance |
4278 | * to be referenced again before it is swapped out. | 4497 | * to be referenced again before it is swapped out. |
@@ -4293,21 +4512,26 @@ void setup_per_zone_pages_min(void) | |||
4293 | * 1TB 101 10GB | 4512 | * 1TB 101 10GB |
4294 | * 10TB 320 32GB | 4513 | * 10TB 320 32GB |
4295 | */ | 4514 | */ |
4296 | static void setup_per_zone_inactive_ratio(void) | 4515 | void calculate_zone_inactive_ratio(struct zone *zone) |
4297 | { | 4516 | { |
4298 | struct zone *zone; | 4517 | unsigned int gb, ratio; |
4299 | |||
4300 | for_each_zone(zone) { | ||
4301 | unsigned int gb, ratio; | ||
4302 | 4518 | ||
4303 | /* Zone size in gigabytes */ | 4519 | /* Zone size in gigabytes */ |
4304 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 4520 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
4521 | if (gb) | ||
4305 | ratio = int_sqrt(10 * gb); | 4522 | ratio = int_sqrt(10 * gb); |
4306 | if (!ratio) | 4523 | else |
4307 | ratio = 1; | 4524 | ratio = 1; |
4308 | 4525 | ||
4309 | zone->inactive_ratio = ratio; | 4526 | zone->inactive_ratio = ratio; |
4310 | } | 4527 | } |
4528 | |||
4529 | static void __init setup_per_zone_inactive_ratio(void) | ||
4530 | { | ||
4531 | struct zone *zone; | ||
4532 | |||
4533 | for_each_zone(zone) | ||
4534 | calculate_zone_inactive_ratio(zone); | ||
4311 | } | 4535 | } |
4312 | 4536 | ||
4313 | /* | 4537 | /* |
@@ -4334,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void) | |||
4334 | * 8192MB: 11584k | 4558 | * 8192MB: 11584k |
4335 | * 16384MB: 16384k | 4559 | * 16384MB: 16384k |
4336 | */ | 4560 | */ |
4337 | static int __init init_per_zone_pages_min(void) | 4561 | static int __init init_per_zone_wmark_min(void) |
4338 | { | 4562 | { |
4339 | unsigned long lowmem_kbytes; | 4563 | unsigned long lowmem_kbytes; |
4340 | 4564 | ||
@@ -4345,12 +4569,12 @@ static int __init init_per_zone_pages_min(void) | |||
4345 | min_free_kbytes = 128; | 4569 | min_free_kbytes = 128; |
4346 | if (min_free_kbytes > 65536) | 4570 | if (min_free_kbytes > 65536) |
4347 | min_free_kbytes = 65536; | 4571 | min_free_kbytes = 65536; |
4348 | setup_per_zone_pages_min(); | 4572 | setup_per_zone_wmarks(); |
4349 | setup_per_zone_lowmem_reserve(); | 4573 | setup_per_zone_lowmem_reserve(); |
4350 | setup_per_zone_inactive_ratio(); | 4574 | setup_per_zone_inactive_ratio(); |
4351 | return 0; | 4575 | return 0; |
4352 | } | 4576 | } |
4353 | module_init(init_per_zone_pages_min) | 4577 | module_init(init_per_zone_wmark_min) |
4354 | 4578 | ||
4355 | /* | 4579 | /* |
4356 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 4580 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
@@ -4362,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
4362 | { | 4586 | { |
4363 | proc_dointvec(table, write, file, buffer, length, ppos); | 4587 | proc_dointvec(table, write, file, buffer, length, ppos); |
4364 | if (write) | 4588 | if (write) |
4365 | setup_per_zone_pages_min(); | 4589 | setup_per_zone_wmarks(); |
4366 | return 0; | 4590 | return 0; |
4367 | } | 4591 | } |
4368 | 4592 | ||
@@ -4406,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
4406 | * whenever sysctl_lowmem_reserve_ratio changes. | 4630 | * whenever sysctl_lowmem_reserve_ratio changes. |
4407 | * | 4631 | * |
4408 | * The reserve ratio obviously has absolutely no relation with the | 4632 | * The reserve ratio obviously has absolutely no relation with the |
4409 | * pages_min watermarks. The lowmem reserve ratio can only make sense | 4633 | * minimum watermarks. The lowmem reserve ratio can only make sense |
4410 | * if in function of the boot time zone sizes. | 4634 | * if in function of the boot time zone sizes. |
4411 | */ | 4635 | */ |
4412 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4636 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
@@ -4513,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4513 | else if (hashdist) | 4737 | else if (hashdist) |
4514 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4738 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4515 | else { | 4739 | else { |
4516 | unsigned long order = get_order(size); | ||
4517 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | ||
4518 | /* | 4740 | /* |
4519 | * If bucketsize is not a power-of-two, we may free | 4741 | * If bucketsize is not a power-of-two, we may free |
4520 | * some pages at the end of hash table. | 4742 | * some pages at the end of hash table which |
4743 | * alloc_pages_exact() automatically does | ||
4521 | */ | 4744 | */ |
4522 | if (table) { | 4745 | if (get_order(size) < MAX_ORDER) |
4523 | unsigned long alloc_end = (unsigned long)table + | 4746 | table = alloc_pages_exact(size, GFP_ATOMIC); |
4524 | (PAGE_SIZE << order); | ||
4525 | unsigned long used = (unsigned long)table + | ||
4526 | PAGE_ALIGN(size); | ||
4527 | split_page(virt_to_page(table), order); | ||
4528 | while (used < alloc_end) { | ||
4529 | free_page(used); | ||
4530 | used += PAGE_SIZE; | ||
4531 | } | ||
4532 | } | ||
4533 | } | 4747 | } |
4534 | } while (!table && size > PAGE_SIZE && --log2qty); | 4748 | } while (!table && size > PAGE_SIZE && --log2qty); |
4535 | 4749 | ||
diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e041..c6f3e5071de3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -120,7 +120,7 @@ out: | |||
120 | return ret; | 120 | return ret; |
121 | } | 121 | } |
122 | 122 | ||
123 | int swap_readpage(struct file *file, struct page *page) | 123 | int swap_readpage(struct page *page) |
124 | { | 124 | { |
125 | struct bio *bio; | 125 | struct bio *bio; |
126 | int ret = 0; | 126 | int ret = 0; |
diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d525513..aa1aa2345235 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -133,15 +133,12 @@ out: | |||
133 | } | 133 | } |
134 | 134 | ||
135 | /* | 135 | /* |
136 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all | 136 | * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all |
137 | * the pages first, then submits them all for I/O. This avoids the very bad | 137 | * the pages first, then submits them all for I/O. This avoids the very bad |
138 | * behaviour which would occur if page allocations are causing VM writeback. | 138 | * behaviour which would occur if page allocations are causing VM writeback. |
139 | * We really don't want to intermingle reads and writes like that. | 139 | * We really don't want to intermingle reads and writes like that. |
140 | * | 140 | * |
141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
142 | * | ||
143 | * do_page_cache_readahead() returns -1 if it encountered request queue | ||
144 | * congestion. | ||
145 | */ | 142 | */ |
146 | static int | 143 | static int |
147 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | 144 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
@@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
210 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 207 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
211 | return -EINVAL; | 208 | return -EINVAL; |
212 | 209 | ||
210 | nr_to_read = max_sane_readahead(nr_to_read); | ||
213 | while (nr_to_read) { | 211 | while (nr_to_read) { |
214 | int err; | 212 | int err; |
215 | 213 | ||
@@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
231 | } | 229 | } |
232 | 230 | ||
233 | /* | 231 | /* |
234 | * This version skips the IO if the queue is read-congested, and will tell the | ||
235 | * block layer to abandon the readahead if request allocation would block. | ||
236 | * | ||
237 | * force_page_cache_readahead() will ignore queue congestion and will block on | ||
238 | * request queues. | ||
239 | */ | ||
240 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
241 | pgoff_t offset, unsigned long nr_to_read) | ||
242 | { | ||
243 | if (bdi_read_congested(mapping->backing_dev_info)) | ||
244 | return -1; | ||
245 | |||
246 | return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | 232 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
251 | * sensible upper limit. | 233 | * sensible upper limit. |
252 | */ | 234 | */ |
@@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
259 | /* | 241 | /* |
260 | * Submit IO for the read-ahead request in file_ra_state. | 242 | * Submit IO for the read-ahead request in file_ra_state. |
261 | */ | 243 | */ |
262 | static unsigned long ra_submit(struct file_ra_state *ra, | 244 | unsigned long ra_submit(struct file_ra_state *ra, |
263 | struct address_space *mapping, struct file *filp) | 245 | struct address_space *mapping, struct file *filp) |
264 | { | 246 | { |
265 | int actual; | 247 | int actual; |
@@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
348 | */ | 330 | */ |
349 | 331 | ||
350 | /* | 332 | /* |
333 | * Count contiguously cached pages from @offset-1 to @offset-@max, | ||
334 | * this count is a conservative estimation of | ||
335 | * - length of the sequential read sequence, or | ||
336 | * - thrashing threshold in memory tight systems | ||
337 | */ | ||
338 | static pgoff_t count_history_pages(struct address_space *mapping, | ||
339 | struct file_ra_state *ra, | ||
340 | pgoff_t offset, unsigned long max) | ||
341 | { | ||
342 | pgoff_t head; | ||
343 | |||
344 | rcu_read_lock(); | ||
345 | head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); | ||
346 | rcu_read_unlock(); | ||
347 | |||
348 | return offset - 1 - head; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * page cache context based read-ahead | ||
353 | */ | ||
354 | static int try_context_readahead(struct address_space *mapping, | ||
355 | struct file_ra_state *ra, | ||
356 | pgoff_t offset, | ||
357 | unsigned long req_size, | ||
358 | unsigned long max) | ||
359 | { | ||
360 | pgoff_t size; | ||
361 | |||
362 | size = count_history_pages(mapping, ra, offset, max); | ||
363 | |||
364 | /* | ||
365 | * no history pages: | ||
366 | * it could be a random read | ||
367 | */ | ||
368 | if (!size) | ||
369 | return 0; | ||
370 | |||
371 | /* | ||
372 | * starts from beginning of file: | ||
373 | * it is a strong indication of long-run stream (or whole-file-read) | ||
374 | */ | ||
375 | if (size >= offset) | ||
376 | size *= 2; | ||
377 | |||
378 | ra->start = offset; | ||
379 | ra->size = get_init_ra_size(size + req_size, max); | ||
380 | ra->async_size = ra->size; | ||
381 | |||
382 | return 1; | ||
383 | } | ||
384 | |||
385 | /* | ||
351 | * A minimal readahead algorithm for trivial sequential/random reads. | 386 | * A minimal readahead algorithm for trivial sequential/random reads. |
352 | */ | 387 | */ |
353 | static unsigned long | 388 | static unsigned long |
@@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, | |||
356 | bool hit_readahead_marker, pgoff_t offset, | 391 | bool hit_readahead_marker, pgoff_t offset, |
357 | unsigned long req_size) | 392 | unsigned long req_size) |
358 | { | 393 | { |
359 | int max = ra->ra_pages; /* max readahead pages */ | 394 | unsigned long max = max_sane_readahead(ra->ra_pages); |
360 | pgoff_t prev_offset; | 395 | |
361 | int sequential; | 396 | /* |
397 | * start of file | ||
398 | */ | ||
399 | if (!offset) | ||
400 | goto initial_readahead; | ||
362 | 401 | ||
363 | /* | 402 | /* |
364 | * It's the expected callback offset, assume sequential access. | 403 | * It's the expected callback offset, assume sequential access. |
365 | * Ramp up sizes, and push forward the readahead window. | 404 | * Ramp up sizes, and push forward the readahead window. |
366 | */ | 405 | */ |
367 | if (offset && (offset == (ra->start + ra->size - ra->async_size) || | 406 | if ((offset == (ra->start + ra->size - ra->async_size) || |
368 | offset == (ra->start + ra->size))) { | 407 | offset == (ra->start + ra->size))) { |
369 | ra->start += ra->size; | 408 | ra->start += ra->size; |
370 | ra->size = get_next_ra_size(ra, max); | 409 | ra->size = get_next_ra_size(ra, max); |
371 | ra->async_size = ra->size; | 410 | ra->async_size = ra->size; |
372 | goto readit; | 411 | goto readit; |
373 | } | 412 | } |
374 | 413 | ||
375 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
376 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
377 | |||
378 | /* | ||
379 | * Standalone, small read. | ||
380 | * Read as is, and do not pollute the readahead state. | ||
381 | */ | ||
382 | if (!hit_readahead_marker && !sequential) { | ||
383 | return __do_page_cache_readahead(mapping, filp, | ||
384 | offset, req_size, 0); | ||
385 | } | ||
386 | |||
387 | /* | 414 | /* |
388 | * Hit a marked page without valid readahead state. | 415 | * Hit a marked page without valid readahead state. |
389 | * E.g. interleaved reads. | 416 | * E.g. interleaved reads. |
@@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, | |||
394 | pgoff_t start; | 421 | pgoff_t start; |
395 | 422 | ||
396 | rcu_read_lock(); | 423 | rcu_read_lock(); |
397 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); | 424 | start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); |
398 | rcu_read_unlock(); | 425 | rcu_read_unlock(); |
399 | 426 | ||
400 | if (!start || start - offset > max) | 427 | if (!start || start - offset > max) |
@@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, | |||
402 | 429 | ||
403 | ra->start = start; | 430 | ra->start = start; |
404 | ra->size = start - offset; /* old async_size */ | 431 | ra->size = start - offset; /* old async_size */ |
432 | ra->size += req_size; | ||
405 | ra->size = get_next_ra_size(ra, max); | 433 | ra->size = get_next_ra_size(ra, max); |
406 | ra->async_size = ra->size; | 434 | ra->async_size = ra->size; |
407 | goto readit; | 435 | goto readit; |
408 | } | 436 | } |
409 | 437 | ||
410 | /* | 438 | /* |
411 | * It may be one of | 439 | * oversize read |
412 | * - first read on start of file | 440 | */ |
413 | * - sequential cache miss | 441 | if (req_size > max) |
414 | * - oversize random read | 442 | goto initial_readahead; |
415 | * Start readahead for it. | 443 | |
444 | /* | ||
445 | * sequential cache miss | ||
446 | */ | ||
447 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | ||
448 | goto initial_readahead; | ||
449 | |||
450 | /* | ||
451 | * Query the page cache and look for the traces(cached history pages) | ||
452 | * that a sequential stream would leave behind. | ||
453 | */ | ||
454 | if (try_context_readahead(mapping, ra, offset, req_size, max)) | ||
455 | goto readit; | ||
456 | |||
457 | /* | ||
458 | * standalone, small random read | ||
459 | * Read as is, and do not pollute the readahead state. | ||
416 | */ | 460 | */ |
461 | return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); | ||
462 | |||
463 | initial_readahead: | ||
417 | ra->start = offset; | 464 | ra->start = offset; |
418 | ra->size = get_init_ra_size(req_size, max); | 465 | ra->size = get_init_ra_size(req_size, max); |
419 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 466 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
420 | 467 | ||
421 | readit: | 468 | readit: |
469 | /* | ||
470 | * Will this read hit the readahead marker made by itself? | ||
471 | * If so, trigger the readahead marker hit now, and merge | ||
472 | * the resulted next readahead window into the current one. | ||
473 | */ | ||
474 | if (offset == ra->start && ra->size == ra->async_size) { | ||
475 | ra->async_size = get_next_ra_size(ra, max); | ||
476 | ra->size += ra->async_size; | ||
477 | } | ||
478 | |||
422 | return ra_submit(ra, mapping, filp); | 479 | return ra_submit(ra, mapping, filp); |
423 | } | 480 | } |
424 | 481 | ||
@@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
333 | * repeatedly from either page_referenced_anon or page_referenced_file. | 333 | * repeatedly from either page_referenced_anon or page_referenced_file. |
334 | */ | 334 | */ |
335 | static int page_referenced_one(struct page *page, | 335 | static int page_referenced_one(struct page *page, |
336 | struct vm_area_struct *vma, unsigned int *mapcount) | 336 | struct vm_area_struct *vma, |
337 | unsigned int *mapcount, | ||
338 | unsigned long *vm_flags) | ||
337 | { | 339 | { |
338 | struct mm_struct *mm = vma->vm_mm; | 340 | struct mm_struct *mm = vma->vm_mm; |
339 | unsigned long address; | 341 | unsigned long address; |
@@ -381,11 +383,14 @@ out_unmap: | |||
381 | (*mapcount)--; | 383 | (*mapcount)--; |
382 | pte_unmap_unlock(pte, ptl); | 384 | pte_unmap_unlock(pte, ptl); |
383 | out: | 385 | out: |
386 | if (referenced) | ||
387 | *vm_flags |= vma->vm_flags; | ||
384 | return referenced; | 388 | return referenced; |
385 | } | 389 | } |
386 | 390 | ||
387 | static int page_referenced_anon(struct page *page, | 391 | static int page_referenced_anon(struct page *page, |
388 | struct mem_cgroup *mem_cont) | 392 | struct mem_cgroup *mem_cont, |
393 | unsigned long *vm_flags) | ||
389 | { | 394 | { |
390 | unsigned int mapcount; | 395 | unsigned int mapcount; |
391 | struct anon_vma *anon_vma; | 396 | struct anon_vma *anon_vma; |
@@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, | |||
405 | */ | 410 | */ |
406 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 411 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
407 | continue; | 412 | continue; |
408 | referenced += page_referenced_one(page, vma, &mapcount); | 413 | referenced += page_referenced_one(page, vma, |
414 | &mapcount, vm_flags); | ||
409 | if (!mapcount) | 415 | if (!mapcount) |
410 | break; | 416 | break; |
411 | } | 417 | } |
@@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, | |||
418 | * page_referenced_file - referenced check for object-based rmap | 424 | * page_referenced_file - referenced check for object-based rmap |
419 | * @page: the page we're checking references on. | 425 | * @page: the page we're checking references on. |
420 | * @mem_cont: target memory controller | 426 | * @mem_cont: target memory controller |
427 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
421 | * | 428 | * |
422 | * For an object-based mapped page, find all the places it is mapped and | 429 | * For an object-based mapped page, find all the places it is mapped and |
423 | * check/clear the referenced flag. This is done by following the page->mapping | 430 | * check/clear the referenced flag. This is done by following the page->mapping |
@@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, | |||
427 | * This function is only called from page_referenced for object-based pages. | 434 | * This function is only called from page_referenced for object-based pages. |
428 | */ | 435 | */ |
429 | static int page_referenced_file(struct page *page, | 436 | static int page_referenced_file(struct page *page, |
430 | struct mem_cgroup *mem_cont) | 437 | struct mem_cgroup *mem_cont, |
438 | unsigned long *vm_flags) | ||
431 | { | 439 | { |
432 | unsigned int mapcount; | 440 | unsigned int mapcount; |
433 | struct address_space *mapping = page->mapping; | 441 | struct address_space *mapping = page->mapping; |
@@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, | |||
467 | */ | 475 | */ |
468 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 476 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
469 | continue; | 477 | continue; |
470 | referenced += page_referenced_one(page, vma, &mapcount); | 478 | referenced += page_referenced_one(page, vma, |
479 | &mapcount, vm_flags); | ||
471 | if (!mapcount) | 480 | if (!mapcount) |
472 | break; | 481 | break; |
473 | } | 482 | } |
@@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, | |||
481 | * @page: the page to test | 490 | * @page: the page to test |
482 | * @is_locked: caller holds lock on the page | 491 | * @is_locked: caller holds lock on the page |
483 | * @mem_cont: target memory controller | 492 | * @mem_cont: target memory controller |
493 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
484 | * | 494 | * |
485 | * Quick test_and_clear_referenced for all mappings to a page, | 495 | * Quick test_and_clear_referenced for all mappings to a page, |
486 | * returns the number of ptes which referenced the page. | 496 | * returns the number of ptes which referenced the page. |
487 | */ | 497 | */ |
488 | int page_referenced(struct page *page, int is_locked, | 498 | int page_referenced(struct page *page, |
489 | struct mem_cgroup *mem_cont) | 499 | int is_locked, |
500 | struct mem_cgroup *mem_cont, | ||
501 | unsigned long *vm_flags) | ||
490 | { | 502 | { |
491 | int referenced = 0; | 503 | int referenced = 0; |
492 | 504 | ||
493 | if (TestClearPageReferenced(page)) | 505 | if (TestClearPageReferenced(page)) |
494 | referenced++; | 506 | referenced++; |
495 | 507 | ||
508 | *vm_flags = 0; | ||
496 | if (page_mapped(page) && page->mapping) { | 509 | if (page_mapped(page) && page->mapping) { |
497 | if (PageAnon(page)) | 510 | if (PageAnon(page)) |
498 | referenced += page_referenced_anon(page, mem_cont); | 511 | referenced += page_referenced_anon(page, mem_cont, |
512 | vm_flags); | ||
499 | else if (is_locked) | 513 | else if (is_locked) |
500 | referenced += page_referenced_file(page, mem_cont); | 514 | referenced += page_referenced_file(page, mem_cont, |
515 | vm_flags); | ||
501 | else if (!trylock_page(page)) | 516 | else if (!trylock_page(page)) |
502 | referenced++; | 517 | referenced++; |
503 | else { | 518 | else { |
504 | if (page->mapping) | 519 | if (page->mapping) |
505 | referenced += | 520 | referenced += page_referenced_file(page, |
506 | page_referenced_file(page, mem_cont); | 521 | mem_cont, vm_flags); |
507 | unlock_page(page); | 522 | unlock_page(page); |
508 | } | 523 | } |
509 | } | 524 | } |
@@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration) | |||
1202 | return ret; | 1217 | return ret; |
1203 | } | 1218 | } |
1204 | 1219 | ||
1205 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
1206 | /** | 1220 | /** |
1207 | * try_to_munlock - try to munlock a page | 1221 | * try_to_munlock - try to munlock a page |
1208 | * @page: the page to be munlocked | 1222 | * @page: the page to be munlocked |
@@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page) | |||
1226 | else | 1240 | else |
1227 | return try_to_unmap_file(page, 1, 0); | 1241 | return try_to_unmap_file(page, 1, 0); |
1228 | } | 1242 | } |
1229 | #endif | 1243 | |
diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd45a23..e89d7ec18eda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1097 | shmem_swp_unmap(entry); | 1097 | shmem_swp_unmap(entry); |
1098 | unlock: | 1098 | unlock: |
1099 | spin_unlock(&info->lock); | 1099 | spin_unlock(&info->lock); |
1100 | swap_free(swap); | 1100 | swapcache_free(swap, NULL); |
1101 | redirty: | 1101 | redirty: |
1102 | set_page_dirty(page); | 1102 | set_page_dirty(page); |
1103 | if (wbc->for_reclaim) | 1103 | if (wbc->for_reclaim) |
@@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
2612 | * @size: size to be set for the file | 2612 | * @size: size to be set for the file |
2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size | 2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size |
2614 | */ | 2614 | */ |
2615 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | 2615 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) |
2616 | { | 2616 | { |
2617 | int error; | 2617 | int error; |
2618 | struct file *file; | 2618 | struct file *file; |
@@ -114,6 +114,7 @@ | |||
114 | #include <linux/rtmutex.h> | 114 | #include <linux/rtmutex.h> |
115 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
116 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
117 | #include <linux/kmemcheck.h> | ||
117 | 118 | ||
118 | #include <asm/cacheflush.h> | 119 | #include <asm/cacheflush.h> |
119 | #include <asm/tlbflush.h> | 120 | #include <asm/tlbflush.h> |
@@ -179,13 +180,13 @@ | |||
179 | SLAB_STORE_USER | \ | 180 | SLAB_STORE_USER | \ |
180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
181 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
182 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) | 183 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
183 | #else | 184 | #else |
184 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | 185 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
185 | SLAB_CACHE_DMA | \ | 186 | SLAB_CACHE_DMA | \ |
186 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 187 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
187 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 188 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
188 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) | 189 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
189 | #endif | 190 | #endif |
190 | 191 | ||
191 | /* | 192 | /* |
@@ -380,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
380 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 381 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
381 | } while (0) | 382 | } while (0) |
382 | 383 | ||
383 | /* | ||
384 | * struct kmem_cache | ||
385 | * | ||
386 | * manages a cache. | ||
387 | */ | ||
388 | |||
389 | struct kmem_cache { | ||
390 | /* 1) per-cpu data, touched during every alloc/free */ | ||
391 | struct array_cache *array[NR_CPUS]; | ||
392 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
393 | unsigned int batchcount; | ||
394 | unsigned int limit; | ||
395 | unsigned int shared; | ||
396 | |||
397 | unsigned int buffer_size; | ||
398 | u32 reciprocal_buffer_size; | ||
399 | /* 3) touched by every alloc & free from the backend */ | ||
400 | |||
401 | unsigned int flags; /* constant flags */ | ||
402 | unsigned int num; /* # of objs per slab */ | ||
403 | |||
404 | /* 4) cache_grow/shrink */ | ||
405 | /* order of pgs per slab (2^n) */ | ||
406 | unsigned int gfporder; | ||
407 | |||
408 | /* force GFP flags, e.g. GFP_DMA */ | ||
409 | gfp_t gfpflags; | ||
410 | |||
411 | size_t colour; /* cache colouring range */ | ||
412 | unsigned int colour_off; /* colour offset */ | ||
413 | struct kmem_cache *slabp_cache; | ||
414 | unsigned int slab_size; | ||
415 | unsigned int dflags; /* dynamic flags */ | ||
416 | |||
417 | /* constructor func */ | ||
418 | void (*ctor)(void *obj); | ||
419 | |||
420 | /* 5) cache creation/removal */ | ||
421 | const char *name; | ||
422 | struct list_head next; | ||
423 | |||
424 | /* 6) statistics */ | ||
425 | #if STATS | ||
426 | unsigned long num_active; | ||
427 | unsigned long num_allocations; | ||
428 | unsigned long high_mark; | ||
429 | unsigned long grown; | ||
430 | unsigned long reaped; | ||
431 | unsigned long errors; | ||
432 | unsigned long max_freeable; | ||
433 | unsigned long node_allocs; | ||
434 | unsigned long node_frees; | ||
435 | unsigned long node_overflow; | ||
436 | atomic_t allochit; | ||
437 | atomic_t allocmiss; | ||
438 | atomic_t freehit; | ||
439 | atomic_t freemiss; | ||
440 | #endif | ||
441 | #if DEBUG | ||
442 | /* | ||
443 | * If debugging is enabled, then the allocator can add additional | ||
444 | * fields and/or padding to every object. buffer_size contains the total | ||
445 | * object size including these internal fields, the following two | ||
446 | * variables contain the offset to the user object and its size. | ||
447 | */ | ||
448 | int obj_offset; | ||
449 | int obj_size; | ||
450 | #endif | ||
451 | /* | ||
452 | * We put nodelists[] at the end of kmem_cache, because we want to size | ||
453 | * this array to nr_node_ids slots instead of MAX_NUMNODES | ||
454 | * (see kmem_cache_init()) | ||
455 | * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache | ||
456 | * is statically defined, so we reserve the max number of nodes. | ||
457 | */ | ||
458 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | ||
459 | /* | ||
460 | * Do not add fields after nodelists[] | ||
461 | */ | ||
462 | }; | ||
463 | |||
464 | #define CFLGS_OFF_SLAB (0x80000000UL) | 384 | #define CFLGS_OFF_SLAB (0x80000000UL) |
465 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 385 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
466 | 386 | ||
@@ -898,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
898 | */ | 818 | */ |
899 | 819 | ||
900 | static int use_alien_caches __read_mostly = 1; | 820 | static int use_alien_caches __read_mostly = 1; |
901 | static int numa_platform __read_mostly = 1; | ||
902 | static int __init noaliencache_setup(char *s) | 821 | static int __init noaliencache_setup(char *s) |
903 | { | 822 | { |
904 | use_alien_caches = 0; | 823 | use_alien_caches = 0; |
@@ -1457,10 +1376,8 @@ void __init kmem_cache_init(void) | |||
1457 | int order; | 1376 | int order; |
1458 | int node; | 1377 | int node; |
1459 | 1378 | ||
1460 | if (num_possible_nodes() == 1) { | 1379 | if (num_possible_nodes() == 1) |
1461 | use_alien_caches = 0; | 1380 | use_alien_caches = 0; |
1462 | numa_platform = 0; | ||
1463 | } | ||
1464 | 1381 | ||
1465 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1382 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
1466 | kmem_list3_init(&initkmem_list3[i]); | 1383 | kmem_list3_init(&initkmem_list3[i]); |
@@ -1707,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1707 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1624 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1708 | flags |= __GFP_RECLAIMABLE; | 1625 | flags |= __GFP_RECLAIMABLE; |
1709 | 1626 | ||
1710 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1627 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
1711 | if (!page) | 1628 | if (!page) |
1712 | return NULL; | 1629 | return NULL; |
1713 | 1630 | ||
@@ -1720,6 +1637,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1720 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1637 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1721 | for (i = 0; i < nr_pages; i++) | 1638 | for (i = 0; i < nr_pages; i++) |
1722 | __SetPageSlab(page + i); | 1639 | __SetPageSlab(page + i); |
1640 | |||
1641 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | ||
1642 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | ||
1643 | |||
1644 | if (cachep->ctor) | ||
1645 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | ||
1646 | else | ||
1647 | kmemcheck_mark_unallocated_pages(page, nr_pages); | ||
1648 | } | ||
1649 | |||
1723 | return page_address(page); | 1650 | return page_address(page); |
1724 | } | 1651 | } |
1725 | 1652 | ||
@@ -1732,6 +1659,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1732 | struct page *page = virt_to_page(addr); | 1659 | struct page *page = virt_to_page(addr); |
1733 | const unsigned long nr_freed = i; | 1660 | const unsigned long nr_freed = i; |
1734 | 1661 | ||
1662 | kmemcheck_free_shadow(page, cachep->gfporder); | ||
1663 | |||
1735 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1664 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1736 | sub_zone_page_state(page_zone(page), | 1665 | sub_zone_page_state(page_zone(page), |
1737 | NR_SLAB_RECLAIMABLE, nr_freed); | 1666 | NR_SLAB_RECLAIMABLE, nr_freed); |
@@ -3261,7 +3190,7 @@ retry: | |||
3261 | if (local_flags & __GFP_WAIT) | 3190 | if (local_flags & __GFP_WAIT) |
3262 | local_irq_enable(); | 3191 | local_irq_enable(); |
3263 | kmem_flagcheck(cache, flags); | 3192 | kmem_flagcheck(cache, flags); |
3264 | obj = kmem_getpages(cache, local_flags, -1); | 3193 | obj = kmem_getpages(cache, local_flags, numa_node_id()); |
3265 | if (local_flags & __GFP_WAIT) | 3194 | if (local_flags & __GFP_WAIT) |
3266 | local_irq_disable(); | 3195 | local_irq_disable(); |
3267 | if (obj) { | 3196 | if (obj) { |
@@ -3407,6 +3336,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3407 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | 3336 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, |
3408 | flags); | 3337 | flags); |
3409 | 3338 | ||
3339 | if (likely(ptr)) | ||
3340 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | ||
3341 | |||
3410 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3342 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
3411 | memset(ptr, 0, obj_size(cachep)); | 3343 | memset(ptr, 0, obj_size(cachep)); |
3412 | 3344 | ||
@@ -3467,6 +3399,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3467 | flags); | 3399 | flags); |
3468 | prefetchw(objp); | 3400 | prefetchw(objp); |
3469 | 3401 | ||
3402 | if (likely(objp)) | ||
3403 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | ||
3404 | |||
3470 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3405 | if (unlikely((flags & __GFP_ZERO) && objp)) |
3471 | memset(objp, 0, obj_size(cachep)); | 3406 | memset(objp, 0, obj_size(cachep)); |
3472 | 3407 | ||
@@ -3583,6 +3518,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3583 | kmemleak_free_recursive(objp, cachep->flags); | 3518 | kmemleak_free_recursive(objp, cachep->flags); |
3584 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3519 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3585 | 3520 | ||
3521 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | ||
3522 | |||
3586 | /* | 3523 | /* |
3587 | * Skip calling cache_free_alien() when the platform is not numa. | 3524 | * Skip calling cache_free_alien() when the platform is not numa. |
3588 | * This will avoid cache misses that happen while accessing slabp (which | 3525 | * This will avoid cache misses that happen while accessing slabp (which |
@@ -3590,7 +3527,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3590 | * variable to skip the call, which is mostly likely to be present in | 3527 | * variable to skip the call, which is mostly likely to be present in |
3591 | * the cache. | 3528 | * the cache. |
3592 | */ | 3529 | */ |
3593 | if (numa_platform && cache_free_alien(cachep, objp)) | 3530 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
3594 | return; | 3531 | return; |
3595 | 3532 | ||
3596 | if (likely(ac->avail < ac->limit)) { | 3533 | if (likely(ac->avail < ac->limit)) { |
@@ -46,7 +46,7 @@ | |||
46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real |
47 | * logic down to the page allocator, and simply doing the node accounting | 47 | * logic down to the page allocator, and simply doing the node accounting |
48 | * on the upper levels. In the event that a node id is explicitly | 48 | * on the upper levels. In the event that a node id is explicitly |
49 | * provided, alloc_pages_node() with the specified node id is used | 49 | * provided, alloc_pages_exact_node() with the specified node id is used |
50 | * instead. The common case (or when the node id isn't explicitly provided) | 50 | * instead. The common case (or when the node id isn't explicitly provided) |
51 | * will default to the current node, as per numa_node_id(). | 51 | * will default to the current node, as per numa_node_id(). |
52 | * | 52 | * |
@@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
244 | 244 | ||
245 | #ifdef CONFIG_NUMA | 245 | #ifdef CONFIG_NUMA |
246 | if (node != -1) | 246 | if (node != -1) |
247 | page = alloc_pages_node(node, gfp, order); | 247 | page = alloc_pages_exact_node(node, gfp, order); |
248 | else | 248 | else |
249 | #endif | 249 | #endif |
250 | page = alloc_pages(gfp, order); | 250 | page = alloc_pages(gfp, order); |
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/kmemtrace.h> | 20 | #include <linux/kmemtrace.h> |
21 | #include <linux/kmemcheck.h> | ||
21 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
22 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
23 | #include <linux/kmemleak.h> | 24 | #include <linux/kmemleak.h> |
@@ -147,7 +148,7 @@ | |||
147 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 148 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) |
148 | 149 | ||
149 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 150 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
150 | SLAB_CACHE_DMA) | 151 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
151 | 152 | ||
152 | #ifndef ARCH_KMALLOC_MINALIGN | 153 | #ifndef ARCH_KMALLOC_MINALIGN |
153 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 154 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
@@ -1071,6 +1072,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, | |||
1071 | { | 1072 | { |
1072 | int order = oo_order(oo); | 1073 | int order = oo_order(oo); |
1073 | 1074 | ||
1075 | flags |= __GFP_NOTRACK; | ||
1076 | |||
1074 | if (node == -1) | 1077 | if (node == -1) |
1075 | return alloc_pages(flags, order); | 1078 | return alloc_pages(flags, order); |
1076 | else | 1079 | else |
@@ -1098,6 +1101,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1098 | 1101 | ||
1099 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1102 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); |
1100 | } | 1103 | } |
1104 | |||
1105 | if (kmemcheck_enabled | ||
1106 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) | ||
1107 | { | ||
1108 | int pages = 1 << oo_order(oo); | ||
1109 | |||
1110 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); | ||
1111 | |||
1112 | /* | ||
1113 | * Objects from caches that have a constructor don't get | ||
1114 | * cleared when they're allocated, so we need to do it here. | ||
1115 | */ | ||
1116 | if (s->ctor) | ||
1117 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
1118 | else | ||
1119 | kmemcheck_mark_unallocated_pages(page, pages); | ||
1120 | } | ||
1121 | |||
1101 | page->objects = oo_objects(oo); | 1122 | page->objects = oo_objects(oo); |
1102 | mod_zone_page_state(page_zone(page), | 1123 | mod_zone_page_state(page_zone(page), |
1103 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1124 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
@@ -1171,6 +1192,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1171 | __ClearPageSlubDebug(page); | 1192 | __ClearPageSlubDebug(page); |
1172 | } | 1193 | } |
1173 | 1194 | ||
1195 | kmemcheck_free_shadow(page, compound_order(page)); | ||
1196 | |||
1174 | mod_zone_page_state(page_zone(page), | 1197 | mod_zone_page_state(page_zone(page), |
1175 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1198 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
1176 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1199 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
@@ -1626,7 +1649,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1626 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1649 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
1627 | memset(object, 0, objsize); | 1650 | memset(object, 0, objsize); |
1628 | 1651 | ||
1652 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | ||
1629 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1653 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); |
1654 | |||
1630 | return object; | 1655 | return object; |
1631 | } | 1656 | } |
1632 | 1657 | ||
@@ -1759,6 +1784,7 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1759 | kmemleak_free_recursive(x, s->flags); | 1784 | kmemleak_free_recursive(x, s->flags); |
1760 | local_irq_save(flags); | 1785 | local_irq_save(flags); |
1761 | c = get_cpu_slab(s, smp_processor_id()); | 1786 | c = get_cpu_slab(s, smp_processor_id()); |
1787 | kmemcheck_slab_free(s, object, c->objsize); | ||
1762 | debug_check_no_locks_freed(object, c->objsize); | 1788 | debug_check_no_locks_freed(object, c->objsize); |
1763 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1789 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1764 | debug_check_no_obj_freed(object, c->objsize); | 1790 | debug_check_no_obj_freed(object, c->objsize); |
@@ -2633,7 +2659,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2633 | 2659 | ||
2634 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2660 | if (!s || !text || !kmem_cache_open(s, flags, text, |
2635 | realsize, ARCH_KMALLOC_MINALIGN, | 2661 | realsize, ARCH_KMALLOC_MINALIGN, |
2636 | SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { | 2662 | SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, |
2663 | NULL)) { | ||
2637 | kfree(s); | 2664 | kfree(s); |
2638 | kfree(text); | 2665 | kfree(text); |
2639 | goto unlock_out; | 2666 | goto unlock_out; |
@@ -2727,9 +2754,10 @@ EXPORT_SYMBOL(__kmalloc); | |||
2727 | 2754 | ||
2728 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | 2755 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) |
2729 | { | 2756 | { |
2730 | struct page *page = alloc_pages_node(node, flags | __GFP_COMP, | 2757 | struct page *page; |
2731 | get_order(size)); | ||
2732 | 2758 | ||
2759 | flags |= __GFP_COMP | __GFP_NOTRACK; | ||
2760 | page = alloc_pages_node(node, flags, get_order(size)); | ||
2733 | if (page) | 2761 | if (page) |
2734 | return page_address(page); | 2762 | return page_address(page); |
2735 | else | 2763 | else |
@@ -3737,7 +3765,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3737 | to_cpumask(l->cpus)); | 3765 | to_cpumask(l->cpus)); |
3738 | } | 3766 | } |
3739 | 3767 | ||
3740 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3768 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
3741 | len < PAGE_SIZE - 60) { | 3769 | len < PAGE_SIZE - 60) { |
3742 | len += sprintf(buf + len, " nodes="); | 3770 | len += sprintf(buf + len, " nodes="); |
3743 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, | 3771 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
@@ -4412,6 +4440,8 @@ static char *create_unique_id(struct kmem_cache *s) | |||
4412 | *p++ = 'a'; | 4440 | *p++ = 'a'; |
4413 | if (s->flags & SLAB_DEBUG_FREE) | 4441 | if (s->flags & SLAB_DEBUG_FREE) |
4414 | *p++ = 'F'; | 4442 | *p++ = 'F'; |
4443 | if (!(s->flags & SLAB_NOTRACK)) | ||
4444 | *p++ = 't'; | ||
4415 | if (p != name + 1) | 4445 | if (p != name + 1) |
4416 | *p++ = '-'; | 4446 | *p++ = '-'; |
4417 | p += sprintf(p, "%07d", s->size); | 4447 | p += sprintf(p, "%07d", s->size); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02d..42cd38eba79f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page) | |||
124 | /** | 124 | /** |
125 | * add_to_swap - allocate swap space for a page | 125 | * add_to_swap - allocate swap space for a page |
126 | * @page: page we want to move to swap | 126 | * @page: page we want to move to swap |
127 | * @gfp_mask: memory allocation flags | ||
128 | * | 127 | * |
129 | * Allocate swap space for the page and add the page to the | 128 | * Allocate swap space for the page and add the page to the |
130 | * swap cache. Caller needs to hold the page lock. | 129 | * swap cache. Caller needs to hold the page lock. |
@@ -162,11 +161,11 @@ int add_to_swap(struct page *page) | |||
162 | return 1; | 161 | return 1; |
163 | case -EEXIST: | 162 | case -EEXIST: |
164 | /* Raced with "speculative" read_swap_cache_async */ | 163 | /* Raced with "speculative" read_swap_cache_async */ |
165 | swap_free(entry); | 164 | swapcache_free(entry, NULL); |
166 | continue; | 165 | continue; |
167 | default: | 166 | default: |
168 | /* -ENOMEM radix-tree allocation failure */ | 167 | /* -ENOMEM radix-tree allocation failure */ |
169 | swap_free(entry); | 168 | swapcache_free(entry, NULL); |
170 | return 0; | 169 | return 0; |
171 | } | 170 | } |
172 | } | 171 | } |
@@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page) | |||
188 | __delete_from_swap_cache(page); | 187 | __delete_from_swap_cache(page); |
189 | spin_unlock_irq(&swapper_space.tree_lock); | 188 | spin_unlock_irq(&swapper_space.tree_lock); |
190 | 189 | ||
191 | mem_cgroup_uncharge_swapcache(page, entry); | 190 | swapcache_free(entry, page); |
192 | swap_free(entry); | ||
193 | page_cache_release(page); | 191 | page_cache_release(page); |
194 | } | 192 | } |
195 | 193 | ||
@@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
293 | /* | 291 | /* |
294 | * Swap entry may have been freed since our caller observed it. | 292 | * Swap entry may have been freed since our caller observed it. |
295 | */ | 293 | */ |
296 | if (!swap_duplicate(entry)) | 294 | err = swapcache_prepare(entry); |
295 | if (err == -EEXIST) /* seems racy */ | ||
296 | continue; | ||
297 | if (err) /* swp entry is obsolete ? */ | ||
297 | break; | 298 | break; |
298 | 299 | ||
299 | /* | 300 | /* |
@@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
312 | * Initiate read into locked page and return. | 313 | * Initiate read into locked page and return. |
313 | */ | 314 | */ |
314 | lru_cache_add_anon(new_page); | 315 | lru_cache_add_anon(new_page); |
315 | swap_readpage(NULL, new_page); | 316 | swap_readpage(new_page); |
316 | return new_page; | 317 | return new_page; |
317 | } | 318 | } |
318 | ClearPageSwapBacked(new_page); | 319 | ClearPageSwapBacked(new_page); |
319 | __clear_page_locked(new_page); | 320 | __clear_page_locked(new_page); |
320 | swap_free(entry); | 321 | swapcache_free(entry, NULL); |
321 | } while (err != -ENOMEM); | 322 | } while (err != -ENOMEM); |
322 | 323 | ||
323 | if (new_page) | 324 | if (new_page) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..28faa01cf578 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; | |||
53 | 53 | ||
54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
55 | 55 | ||
56 | /* For reference count accounting in swap_map */ | ||
57 | /* enum for swap_map[] handling. internal use only */ | ||
58 | enum { | ||
59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
61 | }; | ||
62 | |||
63 | static inline int swap_count(unsigned short ent) | ||
64 | { | ||
65 | return ent & SWAP_COUNT_MASK; | ||
66 | } | ||
67 | |||
68 | static inline bool swap_has_cache(unsigned short ent) | ||
69 | { | ||
70 | return !!(ent & SWAP_HAS_CACHE); | ||
71 | } | ||
72 | |||
73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
74 | { | ||
75 | unsigned short ret = count; | ||
76 | |||
77 | if (has_cache) | ||
78 | return SWAP_HAS_CACHE | ret; | ||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | /* returnes 1 if swap entry is freed */ | ||
83 | static int | ||
84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | ||
85 | { | ||
86 | int type = si - swap_info; | ||
87 | swp_entry_t entry = swp_entry(type, offset); | ||
88 | struct page *page; | ||
89 | int ret = 0; | ||
90 | |||
91 | page = find_get_page(&swapper_space, entry.val); | ||
92 | if (!page) | ||
93 | return 0; | ||
94 | /* | ||
95 | * This function is called from scan_swap_map() and it's called | ||
96 | * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here. | ||
97 | * We have to use trylock for avoiding deadlock. This is a special | ||
98 | * case and you should use try_to_free_swap() with explicit lock_page() | ||
99 | * in usual operations. | ||
100 | */ | ||
101 | if (trylock_page(page)) { | ||
102 | ret = try_to_free_swap(page); | ||
103 | unlock_page(page); | ||
104 | } | ||
105 | page_cache_release(page); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
56 | /* | 109 | /* |
57 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
58 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
@@ -167,7 +220,8 @@ static int wait_for_discard(void *word) | |||
167 | #define SWAPFILE_CLUSTER 256 | 220 | #define SWAPFILE_CLUSTER 256 |
168 | #define LATENCY_LIMIT 256 | 221 | #define LATENCY_LIMIT 256 |
169 | 222 | ||
170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
224 | int cache) | ||
171 | { | 225 | { |
172 | unsigned long offset; | 226 | unsigned long offset; |
173 | unsigned long scan_base; | 227 | unsigned long scan_base; |
@@ -273,6 +327,19 @@ checks: | |||
273 | goto no_page; | 327 | goto no_page; |
274 | if (offset > si->highest_bit) | 328 | if (offset > si->highest_bit) |
275 | scan_base = offset = si->lowest_bit; | 329 | scan_base = offset = si->lowest_bit; |
330 | |||
331 | /* reuse swap entry of cache-only swap if not busy. */ | ||
332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
333 | int swap_was_freed; | ||
334 | spin_unlock(&swap_lock); | ||
335 | swap_was_freed = __try_to_reclaim_swap(si, offset); | ||
336 | spin_lock(&swap_lock); | ||
337 | /* entry was freed successfully, try to use this again */ | ||
338 | if (swap_was_freed) | ||
339 | goto checks; | ||
340 | goto scan; /* check next one */ | ||
341 | } | ||
342 | |||
276 | if (si->swap_map[offset]) | 343 | if (si->swap_map[offset]) |
277 | goto scan; | 344 | goto scan; |
278 | 345 | ||
@@ -285,7 +352,10 @@ checks: | |||
285 | si->lowest_bit = si->max; | 352 | si->lowest_bit = si->max; |
286 | si->highest_bit = 0; | 353 | si->highest_bit = 0; |
287 | } | 354 | } |
288 | si->swap_map[offset] = 1; | 355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
356 | si->swap_map[offset] = encode_swapmap(0, true); | ||
357 | else /* at suspend */ | ||
358 | si->swap_map[offset] = encode_swapmap(1, false); | ||
289 | si->cluster_next = offset + 1; | 359 | si->cluster_next = offset + 1; |
290 | si->flags -= SWP_SCANNING; | 360 | si->flags -= SWP_SCANNING; |
291 | 361 | ||
@@ -351,6 +421,10 @@ scan: | |||
351 | spin_lock(&swap_lock); | 421 | spin_lock(&swap_lock); |
352 | goto checks; | 422 | goto checks; |
353 | } | 423 | } |
424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
425 | spin_lock(&swap_lock); | ||
426 | goto checks; | ||
427 | } | ||
354 | if (unlikely(--latency_ration < 0)) { | 428 | if (unlikely(--latency_ration < 0)) { |
355 | cond_resched(); | 429 | cond_resched(); |
356 | latency_ration = LATENCY_LIMIT; | 430 | latency_ration = LATENCY_LIMIT; |
@@ -362,6 +436,10 @@ scan: | |||
362 | spin_lock(&swap_lock); | 436 | spin_lock(&swap_lock); |
363 | goto checks; | 437 | goto checks; |
364 | } | 438 | } |
439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
440 | spin_lock(&swap_lock); | ||
441 | goto checks; | ||
442 | } | ||
365 | if (unlikely(--latency_ration < 0)) { | 443 | if (unlikely(--latency_ration < 0)) { |
366 | cond_resched(); | 444 | cond_resched(); |
367 | latency_ration = LATENCY_LIMIT; | 445 | latency_ration = LATENCY_LIMIT; |
@@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void) | |||
401 | continue; | 479 | continue; |
402 | 480 | ||
403 | swap_list.next = next; | 481 | swap_list.next = next; |
404 | offset = scan_swap_map(si); | 482 | /* This is called for allocating swap entry for cache */ |
483 | offset = scan_swap_map(si, SWAP_CACHE); | ||
405 | if (offset) { | 484 | if (offset) { |
406 | spin_unlock(&swap_lock); | 485 | spin_unlock(&swap_lock); |
407 | return swp_entry(type, offset); | 486 | return swp_entry(type, offset); |
@@ -415,6 +494,7 @@ noswap: | |||
415 | return (swp_entry_t) {0}; | 494 | return (swp_entry_t) {0}; |
416 | } | 495 | } |
417 | 496 | ||
497 | /* The only caller of this function is now susupend routine */ | ||
418 | swp_entry_t get_swap_page_of_type(int type) | 498 | swp_entry_t get_swap_page_of_type(int type) |
419 | { | 499 | { |
420 | struct swap_info_struct *si; | 500 | struct swap_info_struct *si; |
@@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type) | |||
424 | si = swap_info + type; | 504 | si = swap_info + type; |
425 | if (si->flags & SWP_WRITEOK) { | 505 | if (si->flags & SWP_WRITEOK) { |
426 | nr_swap_pages--; | 506 | nr_swap_pages--; |
427 | offset = scan_swap_map(si); | 507 | /* This is called for allocating swap entry, not cache */ |
508 | offset = scan_swap_map(si, SWAP_MAP); | ||
428 | if (offset) { | 509 | if (offset) { |
429 | spin_unlock(&swap_lock); | 510 | spin_unlock(&swap_lock); |
430 | return swp_entry(type, offset); | 511 | return swp_entry(type, offset); |
@@ -471,25 +552,38 @@ out: | |||
471 | return NULL; | 552 | return NULL; |
472 | } | 553 | } |
473 | 554 | ||
474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) | 555 | static int swap_entry_free(struct swap_info_struct *p, |
556 | swp_entry_t ent, int cache) | ||
475 | { | 557 | { |
476 | unsigned long offset = swp_offset(ent); | 558 | unsigned long offset = swp_offset(ent); |
477 | int count = p->swap_map[offset]; | 559 | int count = swap_count(p->swap_map[offset]); |
478 | 560 | bool has_cache; | |
479 | if (count < SWAP_MAP_MAX) { | 561 | |
480 | count--; | 562 | has_cache = swap_has_cache(p->swap_map[offset]); |
481 | p->swap_map[offset] = count; | 563 | |
482 | if (!count) { | 564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
483 | if (offset < p->lowest_bit) | 565 | if (count < SWAP_MAP_MAX) { |
484 | p->lowest_bit = offset; | 566 | count--; |
485 | if (offset > p->highest_bit) | 567 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
486 | p->highest_bit = offset; | ||
487 | if (p->prio > swap_info[swap_list.next].prio) | ||
488 | swap_list.next = p - swap_info; | ||
489 | nr_swap_pages++; | ||
490 | p->inuse_pages--; | ||
491 | mem_cgroup_uncharge_swap(ent); | ||
492 | } | 568 | } |
569 | } else { /* dropping swap cache flag */ | ||
570 | VM_BUG_ON(!has_cache); | ||
571 | p->swap_map[offset] = encode_swapmap(count, false); | ||
572 | |||
573 | } | ||
574 | /* return code. */ | ||
575 | count = p->swap_map[offset]; | ||
576 | /* free if no reference */ | ||
577 | if (!count) { | ||
578 | if (offset < p->lowest_bit) | ||
579 | p->lowest_bit = offset; | ||
580 | if (offset > p->highest_bit) | ||
581 | p->highest_bit = offset; | ||
582 | if (p->prio > swap_info[swap_list.next].prio) | ||
583 | swap_list.next = p - swap_info; | ||
584 | nr_swap_pages++; | ||
585 | p->inuse_pages--; | ||
586 | mem_cgroup_uncharge_swap(ent); | ||
493 | } | 587 | } |
494 | return count; | 588 | return count; |
495 | } | 589 | } |
@@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry) | |||
504 | 598 | ||
505 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
506 | if (p) { | 600 | if (p) { |
507 | swap_entry_free(p, entry); | 601 | swap_entry_free(p, entry, SWAP_MAP); |
602 | spin_unlock(&swap_lock); | ||
603 | } | ||
604 | } | ||
605 | |||
606 | /* | ||
607 | * Called after dropping swapcache to decrease refcnt to swap entries. | ||
608 | */ | ||
609 | void swapcache_free(swp_entry_t entry, struct page *page) | ||
610 | { | ||
611 | struct swap_info_struct *p; | ||
612 | |||
613 | if (page) | ||
614 | mem_cgroup_uncharge_swapcache(page, entry); | ||
615 | p = swap_info_get(entry); | ||
616 | if (p) { | ||
617 | swap_entry_free(p, entry, SWAP_CACHE); | ||
508 | spin_unlock(&swap_lock); | 618 | spin_unlock(&swap_lock); |
509 | } | 619 | } |
620 | return; | ||
510 | } | 621 | } |
511 | 622 | ||
512 | /* | 623 | /* |
@@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page) | |||
521 | entry.val = page_private(page); | 632 | entry.val = page_private(page); |
522 | p = swap_info_get(entry); | 633 | p = swap_info_get(entry); |
523 | if (p) { | 634 | if (p) { |
524 | /* Subtract the 1 for the swap cache itself */ | 635 | count = swap_count(p->swap_map[swp_offset(entry)]); |
525 | count = p->swap_map[swp_offset(entry)] - 1; | ||
526 | spin_unlock(&swap_lock); | 636 | spin_unlock(&swap_lock); |
527 | } | 637 | } |
528 | return count; | 638 | return count; |
@@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
584 | 694 | ||
585 | p = swap_info_get(entry); | 695 | p = swap_info_get(entry); |
586 | if (p) { | 696 | if (p) { |
587 | if (swap_entry_free(p, entry) == 1) { | 697 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
588 | page = find_get_page(&swapper_space, entry.val); | 698 | page = find_get_page(&swapper_space, entry.val); |
589 | if (page && !trylock_page(page)) { | 699 | if (page && !trylock_page(page)) { |
590 | page_cache_release(page); | 700 | page_cache_release(page); |
@@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
891 | i = 1; | 1001 | i = 1; |
892 | } | 1002 | } |
893 | count = si->swap_map[i]; | 1003 | count = si->swap_map[i]; |
894 | if (count && count != SWAP_MAP_BAD) | 1004 | if (count && swap_count(count) != SWAP_MAP_BAD) |
895 | break; | 1005 | break; |
896 | } | 1006 | } |
897 | return i; | 1007 | return i; |
@@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type) | |||
995 | */ | 1105 | */ |
996 | shmem = 0; | 1106 | shmem = 0; |
997 | swcount = *swap_map; | 1107 | swcount = *swap_map; |
998 | if (swcount > 1) { | 1108 | if (swap_count(swcount)) { |
999 | if (start_mm == &init_mm) | 1109 | if (start_mm == &init_mm) |
1000 | shmem = shmem_unuse(entry, page); | 1110 | shmem = shmem_unuse(entry, page); |
1001 | else | 1111 | else |
1002 | retval = unuse_mm(start_mm, entry, page); | 1112 | retval = unuse_mm(start_mm, entry, page); |
1003 | } | 1113 | } |
1004 | if (*swap_map > 1) { | 1114 | if (swap_count(*swap_map)) { |
1005 | int set_start_mm = (*swap_map >= swcount); | 1115 | int set_start_mm = (*swap_map >= swcount); |
1006 | struct list_head *p = &start_mm->mmlist; | 1116 | struct list_head *p = &start_mm->mmlist; |
1007 | struct mm_struct *new_start_mm = start_mm; | 1117 | struct mm_struct *new_start_mm = start_mm; |
@@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type) | |||
1011 | atomic_inc(&new_start_mm->mm_users); | 1121 | atomic_inc(&new_start_mm->mm_users); |
1012 | atomic_inc(&prev_mm->mm_users); | 1122 | atomic_inc(&prev_mm->mm_users); |
1013 | spin_lock(&mmlist_lock); | 1123 | spin_lock(&mmlist_lock); |
1014 | while (*swap_map > 1 && !retval && !shmem && | 1124 | while (swap_count(*swap_map) && !retval && !shmem && |
1015 | (p = p->next) != &start_mm->mmlist) { | 1125 | (p = p->next) != &start_mm->mmlist) { |
1016 | mm = list_entry(p, struct mm_struct, mmlist); | 1126 | mm = list_entry(p, struct mm_struct, mmlist); |
1017 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1127 | if (!atomic_inc_not_zero(&mm->mm_users)) |
@@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type) | |||
1023 | cond_resched(); | 1133 | cond_resched(); |
1024 | 1134 | ||
1025 | swcount = *swap_map; | 1135 | swcount = *swap_map; |
1026 | if (swcount <= 1) | 1136 | if (!swap_count(swcount)) /* any usage ? */ |
1027 | ; | 1137 | ; |
1028 | else if (mm == &init_mm) { | 1138 | else if (mm == &init_mm) { |
1029 | set_start_mm = 1; | 1139 | set_start_mm = 1; |
1030 | shmem = shmem_unuse(entry, page); | 1140 | shmem = shmem_unuse(entry, page); |
1031 | } else | 1141 | } else |
1032 | retval = unuse_mm(mm, entry, page); | 1142 | retval = unuse_mm(mm, entry, page); |
1033 | if (set_start_mm && *swap_map < swcount) { | 1143 | |
1144 | if (set_start_mm && | ||
1145 | swap_count(*swap_map) < swcount) { | ||
1034 | mmput(new_start_mm); | 1146 | mmput(new_start_mm); |
1035 | atomic_inc(&mm->mm_users); | 1147 | atomic_inc(&mm->mm_users); |
1036 | new_start_mm = mm; | 1148 | new_start_mm = mm; |
@@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type) | |||
1057 | } | 1169 | } |
1058 | 1170 | ||
1059 | /* | 1171 | /* |
1060 | * How could swap count reach 0x7fff when the maximum | 1172 | * How could swap count reach 0x7ffe ? |
1061 | * pid is 0x7fff, and there's no way to repeat a swap | 1173 | * There's no way to repeat a swap page within an mm |
1062 | * page within an mm (except in shmem, where it's the | 1174 | * (except in shmem, where it's the shared object which takes |
1063 | * shared object which takes the reference count)? | 1175 | * the reference count)? |
1064 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | 1176 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned |
1065 | * | 1177 | * short is too small....) |
1066 | * If that's wrong, then we should worry more about | 1178 | * If that's wrong, then we should worry more about |
1067 | * exit_mmap() and do_munmap() cases described above: | 1179 | * exit_mmap() and do_munmap() cases described above: |
1068 | * we might be resetting SWAP_MAP_MAX too early here. | 1180 | * we might be resetting SWAP_MAP_MAX too early here. |
1069 | * We know "Undead"s can happen, they're okay, so don't | 1181 | * We know "Undead"s can happen, they're okay, so don't |
1070 | * report them; but do report if we reset SWAP_MAP_MAX. | 1182 | * report them; but do report if we reset SWAP_MAP_MAX. |
1071 | */ | 1183 | */ |
1072 | if (*swap_map == SWAP_MAP_MAX) { | 1184 | /* We might release the lock_page() in unuse_mm(). */ |
1185 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
1186 | goto retry; | ||
1187 | |||
1188 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
1073 | spin_lock(&swap_lock); | 1189 | spin_lock(&swap_lock); |
1074 | *swap_map = 1; | 1190 | *swap_map = encode_swapmap(0, true); |
1075 | spin_unlock(&swap_lock); | 1191 | spin_unlock(&swap_lock); |
1076 | reset_overflow = 1; | 1192 | reset_overflow = 1; |
1077 | } | 1193 | } |
@@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type) | |||
1089 | * pages would be incorrect if swap supported "shared | 1205 | * pages would be incorrect if swap supported "shared |
1090 | * private" pages, but they are handled by tmpfs files. | 1206 | * private" pages, but they are handled by tmpfs files. |
1091 | */ | 1207 | */ |
1092 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 1208 | if (swap_count(*swap_map) && |
1209 | PageDirty(page) && PageSwapCache(page)) { | ||
1093 | struct writeback_control wbc = { | 1210 | struct writeback_control wbc = { |
1094 | .sync_mode = WB_SYNC_NONE, | 1211 | .sync_mode = WB_SYNC_NONE, |
1095 | }; | 1212 | }; |
@@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type) | |||
1116 | * mark page dirty so shrink_page_list will preserve it. | 1233 | * mark page dirty so shrink_page_list will preserve it. |
1117 | */ | 1234 | */ |
1118 | SetPageDirty(page); | 1235 | SetPageDirty(page); |
1236 | retry: | ||
1119 | unlock_page(page); | 1237 | unlock_page(page); |
1120 | page_cache_release(page); | 1238 | page_cache_release(page); |
1121 | 1239 | ||
@@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val) | |||
1942 | * | 2060 | * |
1943 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2061 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
1944 | * "permanent", but will be reclaimed by the next swapoff. | 2062 | * "permanent", but will be reclaimed by the next swapoff. |
2063 | * Returns error code in following case. | ||
2064 | * - success -> 0 | ||
2065 | * - swp_entry is invalid -> EINVAL | ||
2066 | * - swp_entry is migration entry -> EINVAL | ||
2067 | * - swap-cache reference is requested but there is already one. -> EEXIST | ||
2068 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | ||
1945 | */ | 2069 | */ |
1946 | int swap_duplicate(swp_entry_t entry) | 2070 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
1947 | { | 2071 | { |
1948 | struct swap_info_struct * p; | 2072 | struct swap_info_struct * p; |
1949 | unsigned long offset, type; | 2073 | unsigned long offset, type; |
1950 | int result = 0; | 2074 | int result = -EINVAL; |
2075 | int count; | ||
2076 | bool has_cache; | ||
1951 | 2077 | ||
1952 | if (is_migration_entry(entry)) | 2078 | if (is_migration_entry(entry)) |
1953 | return 1; | 2079 | return -EINVAL; |
1954 | 2080 | ||
1955 | type = swp_type(entry); | 2081 | type = swp_type(entry); |
1956 | if (type >= nr_swapfiles) | 2082 | if (type >= nr_swapfiles) |
@@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry) | |||
1959 | offset = swp_offset(entry); | 2085 | offset = swp_offset(entry); |
1960 | 2086 | ||
1961 | spin_lock(&swap_lock); | 2087 | spin_lock(&swap_lock); |
1962 | if (offset < p->max && p->swap_map[offset]) { | 2088 | |
1963 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | 2089 | if (unlikely(offset >= p->max)) |
1964 | p->swap_map[offset]++; | 2090 | goto unlock_out; |
1965 | result = 1; | 2091 | |
1966 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | 2092 | count = swap_count(p->swap_map[offset]); |
2093 | has_cache = swap_has_cache(p->swap_map[offset]); | ||
2094 | |||
2095 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | ||
2096 | |||
2097 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | ||
2098 | if (!has_cache && count) { | ||
2099 | p->swap_map[offset] = encode_swapmap(count, true); | ||
2100 | result = 0; | ||
2101 | } else if (has_cache) /* someone added cache */ | ||
2102 | result = -EEXIST; | ||
2103 | else if (!count) /* no users */ | ||
2104 | result = -ENOENT; | ||
2105 | |||
2106 | } else if (count || has_cache) { | ||
2107 | if (count < SWAP_MAP_MAX - 1) { | ||
2108 | p->swap_map[offset] = encode_swapmap(count + 1, | ||
2109 | has_cache); | ||
2110 | result = 0; | ||
2111 | } else if (count <= SWAP_MAP_MAX) { | ||
1967 | if (swap_overflow++ < 5) | 2112 | if (swap_overflow++ < 5) |
1968 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | 2113 | printk(KERN_WARNING |
1969 | p->swap_map[offset] = SWAP_MAP_MAX; | 2114 | "swap_dup: swap entry overflow\n"); |
1970 | result = 1; | 2115 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
2116 | has_cache); | ||
2117 | result = 0; | ||
1971 | } | 2118 | } |
1972 | } | 2119 | } else |
2120 | result = -ENOENT; /* unused swap entry */ | ||
2121 | unlock_out: | ||
1973 | spin_unlock(&swap_lock); | 2122 | spin_unlock(&swap_lock); |
1974 | out: | 2123 | out: |
1975 | return result; | 2124 | return result; |
@@ -1978,6 +2127,27 @@ bad_file: | |||
1978 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2127 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
1979 | goto out; | 2128 | goto out; |
1980 | } | 2129 | } |
2130 | /* | ||
2131 | * increase reference count of swap entry by 1. | ||
2132 | */ | ||
2133 | void swap_duplicate(swp_entry_t entry) | ||
2134 | { | ||
2135 | __swap_duplicate(entry, SWAP_MAP); | ||
2136 | } | ||
2137 | |||
2138 | /* | ||
2139 | * @entry: swap entry for which we allocate swap cache. | ||
2140 | * | ||
2141 | * Called when allocating swap cache for exising swap entry, | ||
2142 | * This can return error codes. Returns 0 at success. | ||
2143 | * -EBUSY means there is a swap cache. | ||
2144 | * Note: return code is different from swap_duplicate(). | ||
2145 | */ | ||
2146 | int swapcache_prepare(swp_entry_t entry) | ||
2147 | { | ||
2148 | return __swap_duplicate(entry, SWAP_CACHE); | ||
2149 | } | ||
2150 | |||
1981 | 2151 | ||
1982 | struct swap_info_struct * | 2152 | struct swap_info_struct * |
1983 | get_swap_info_struct(unsigned type) | 2153 | get_swap_info_struct(unsigned type) |
@@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2016 | /* Don't read in free or bad pages */ | 2186 | /* Don't read in free or bad pages */ |
2017 | if (!si->swap_map[toff]) | 2187 | if (!si->swap_map[toff]) |
2018 | break; | 2188 | break; |
2019 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2189 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2020 | break; | 2190 | break; |
2021 | } | 2191 | } |
2022 | /* Count contiguous allocated slots below our target */ | 2192 | /* Count contiguous allocated slots below our target */ |
@@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
2024 | /* Don't read in free or bad pages */ | 2194 | /* Don't read in free or bad pages */ |
2025 | if (!si->swap_map[toff]) | 2195 | if (!si->swap_map[toff]) |
2026 | break; | 2196 | break; |
2027 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
2028 | break; | 2198 | break; |
2029 | } | 2199 | } |
2030 | spin_unlock(&swap_lock); | 2200 | spin_unlock(&swap_lock); |
diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f9165..ccc3ecf7cb98 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
267 | } | 267 | } |
268 | EXPORT_SYMBOL(truncate_inode_pages); | 268 | EXPORT_SYMBOL(truncate_inode_pages); |
269 | 269 | ||
270 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, | 270 | /** |
271 | pgoff_t start, pgoff_t end, bool be_atomic) | 271 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode |
272 | * @mapping: the address_space which holds the pages to invalidate | ||
273 | * @start: the offset 'from' which to invalidate | ||
274 | * @end: the offset 'to' which to invalidate (inclusive) | ||
275 | * | ||
276 | * This function only removes the unlocked pages, if you want to | ||
277 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
278 | * | ||
279 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
280 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
281 | * pagetables. | ||
282 | */ | ||
283 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
284 | pgoff_t start, pgoff_t end) | ||
272 | { | 285 | { |
273 | struct pagevec pvec; | 286 | struct pagevec pvec; |
274 | pgoff_t next = start; | 287 | pgoff_t next = start; |
@@ -309,30 +322,10 @@ unlock: | |||
309 | break; | 322 | break; |
310 | } | 323 | } |
311 | pagevec_release(&pvec); | 324 | pagevec_release(&pvec); |
312 | if (likely(!be_atomic)) | 325 | cond_resched(); |
313 | cond_resched(); | ||
314 | } | 326 | } |
315 | return ret; | 327 | return ret; |
316 | } | 328 | } |
317 | |||
318 | /** | ||
319 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
320 | * @mapping: the address_space which holds the pages to invalidate | ||
321 | * @start: the offset 'from' which to invalidate | ||
322 | * @end: the offset 'to' which to invalidate (inclusive) | ||
323 | * | ||
324 | * This function only removes the unlocked pages, if you want to | ||
325 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
326 | * | ||
327 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
328 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
329 | * pagetables. | ||
330 | */ | ||
331 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
332 | pgoff_t start, pgoff_t end) | ||
333 | { | ||
334 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
335 | } | ||
336 | EXPORT_SYMBOL(invalidate_mapping_pages); | 329 | EXPORT_SYMBOL(invalidate_mapping_pages); |
337 | 330 | ||
338 | /* | 331 | /* |
@@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
233 | * @pages: array that receives pointers to the pages pinned. | 233 | * @pages: array that receives pointers to the pages pinned. |
234 | * Should be at least nr_pages long. | 234 | * Should be at least nr_pages long. |
235 | * | 235 | * |
236 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
237 | * If not successful, it will fall back to taking the lock and | ||
238 | * calling get_user_pages(). | ||
239 | * | ||
240 | * Returns number of pages pinned. This may be fewer than the number | 236 | * Returns number of pages pinned. This may be fewer than the number |
241 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | 237 | * requested. If nr_pages is 0 or negative, returns 0. If no pages |
242 | * were pinned, returns -errno. | 238 | * were pinned, returns -errno. |
239 | * | ||
240 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | ||
241 | * operating on current and current->mm, with force=0 and vma=NULL. However | ||
242 | * unlike get_user_pages, it must be called without mmap_sem held. | ||
243 | * | ||
244 | * get_user_pages_fast may take mmap_sem and page table locks, so no | ||
245 | * assumptions can be made about lack of locking. get_user_pages_fast is to be | ||
246 | * implemented in a way that is advantageous (vs get_user_pages()) when the | ||
247 | * user memory area is already faulted in and present in ptes. However if the | ||
248 | * pages have to be faulted in, it may turn out to be slightly slower so | ||
249 | * callers need to carefully consider what to use. On many architectures, | ||
250 | * get_user_pages_fast simply falls back to get_user_pages. | ||
243 | */ | 251 | */ |
244 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 252 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, |
245 | int nr_pages, int write, struct page **pages) | 253 | int nr_pages, int write, struct page **pages) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 95c08a8cc2ba..4139aa52b941 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
470 | swp_entry_t swap = { .val = page_private(page) }; | 470 | swp_entry_t swap = { .val = page_private(page) }; |
471 | __delete_from_swap_cache(page); | 471 | __delete_from_swap_cache(page); |
472 | spin_unlock_irq(&mapping->tree_lock); | 472 | spin_unlock_irq(&mapping->tree_lock); |
473 | mem_cgroup_uncharge_swapcache(page, swap); | 473 | swapcache_free(swap, page); |
474 | swap_free(swap); | ||
475 | } else { | 474 | } else { |
476 | __remove_from_page_cache(page); | 475 | __remove_from_page_cache(page); |
477 | spin_unlock_irq(&mapping->tree_lock); | 476 | spin_unlock_irq(&mapping->tree_lock); |
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
514 | * | 513 | * |
515 | * lru_lock must not be held, interrupts must be enabled. | 514 | * lru_lock must not be held, interrupts must be enabled. |
516 | */ | 515 | */ |
517 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
518 | void putback_lru_page(struct page *page) | 516 | void putback_lru_page(struct page *page) |
519 | { | 517 | { |
520 | int lru; | 518 | int lru; |
@@ -568,20 +566,6 @@ redo: | |||
568 | put_page(page); /* drop ref from isolate */ | 566 | put_page(page); /* drop ref from isolate */ |
569 | } | 567 | } |
570 | 568 | ||
571 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
572 | |||
573 | void putback_lru_page(struct page *page) | ||
574 | { | ||
575 | int lru; | ||
576 | VM_BUG_ON(PageLRU(page)); | ||
577 | |||
578 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
579 | lru_cache_add_lru(page, lru); | ||
580 | put_page(page); | ||
581 | } | ||
582 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
583 | |||
584 | |||
585 | /* | 569 | /* |
586 | * shrink_page_list() returns the number of reclaimed pages | 570 | * shrink_page_list() returns the number of reclaimed pages |
587 | */ | 571 | */ |
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
593 | struct pagevec freed_pvec; | 577 | struct pagevec freed_pvec; |
594 | int pgactivate = 0; | 578 | int pgactivate = 0; |
595 | unsigned long nr_reclaimed = 0; | 579 | unsigned long nr_reclaimed = 0; |
580 | unsigned long vm_flags; | ||
596 | 581 | ||
597 | cond_resched(); | 582 | cond_resched(); |
598 | 583 | ||
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
643 | goto keep_locked; | 628 | goto keep_locked; |
644 | } | 629 | } |
645 | 630 | ||
646 | referenced = page_referenced(page, 1, sc->mem_cgroup); | 631 | referenced = page_referenced(page, 1, |
632 | sc->mem_cgroup, &vm_flags); | ||
647 | /* In active use or really unfreeable? Activate it. */ | 633 | /* In active use or really unfreeable? Activate it. */ |
648 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 634 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
649 | referenced && page_mapping_inuse(page)) | 635 | referenced && page_mapping_inuse(page)) |
@@ -943,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
943 | /* Check that we have not crossed a zone boundary. */ | 929 | /* Check that we have not crossed a zone boundary. */ |
944 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 930 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
945 | continue; | 931 | continue; |
946 | switch (__isolate_lru_page(cursor_page, mode, file)) { | 932 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
947 | case 0: | ||
948 | list_move(&cursor_page->lru, dst); | 933 | list_move(&cursor_page->lru, dst); |
949 | nr_taken++; | 934 | nr_taken++; |
950 | scan++; | 935 | scan++; |
951 | break; | ||
952 | |||
953 | case -EBUSY: | ||
954 | /* else it is being freed elsewhere */ | ||
955 | list_move(&cursor_page->lru, src); | ||
956 | default: | ||
957 | break; /* ! on LRU or wrong list */ | ||
958 | } | 936 | } |
959 | } | 937 | } |
960 | } | 938 | } |
@@ -1061,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1061 | unsigned long nr_scanned = 0; | 1039 | unsigned long nr_scanned = 0; |
1062 | unsigned long nr_reclaimed = 0; | 1040 | unsigned long nr_reclaimed = 0; |
1063 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1041 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1042 | int lumpy_reclaim = 0; | ||
1043 | |||
1044 | /* | ||
1045 | * If we need a large contiguous chunk of memory, or have | ||
1046 | * trouble getting a small set of contiguous pages, we | ||
1047 | * will reclaim both active and inactive pages. | ||
1048 | * | ||
1049 | * We use the same threshold as pageout congestion_wait below. | ||
1050 | */ | ||
1051 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1052 | lumpy_reclaim = 1; | ||
1053 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1054 | lumpy_reclaim = 1; | ||
1064 | 1055 | ||
1065 | pagevec_init(&pvec, 1); | 1056 | pagevec_init(&pvec, 1); |
1066 | 1057 | ||
@@ -1073,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1073 | unsigned long nr_freed; | 1064 | unsigned long nr_freed; |
1074 | unsigned long nr_active; | 1065 | unsigned long nr_active; |
1075 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1066 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1076 | int mode = ISOLATE_INACTIVE; | 1067 | int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; |
1077 | |||
1078 | /* | ||
1079 | * If we need a large contiguous chunk of memory, or have | ||
1080 | * trouble getting a small set of contiguous pages, we | ||
1081 | * will reclaim both active and inactive pages. | ||
1082 | * | ||
1083 | * We use the same threshold as pageout congestion_wait below. | ||
1084 | */ | ||
1085 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1086 | mode = ISOLATE_BOTH; | ||
1087 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
1088 | mode = ISOLATE_BOTH; | ||
1089 | 1068 | ||
1090 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1069 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
1091 | &page_list, &nr_scan, sc->order, mode, | 1070 | &page_list, &nr_scan, sc->order, mode, |
@@ -1122,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
1122 | * but that should be acceptable to the caller | 1101 | * but that should be acceptable to the caller |
1123 | */ | 1102 | */ |
1124 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1103 | if (nr_freed < nr_taken && !current_is_kswapd() && |
1125 | sc->order > PAGE_ALLOC_COSTLY_ORDER) { | 1104 | lumpy_reclaim) { |
1126 | congestion_wait(WRITE, HZ/10); | 1105 | congestion_wait(WRITE, HZ/10); |
1127 | 1106 | ||
1128 | /* | 1107 | /* |
@@ -1217,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
1217 | * But we had to alter page->flags anyway. | 1196 | * But we had to alter page->flags anyway. |
1218 | */ | 1197 | */ |
1219 | 1198 | ||
1199 | static void move_active_pages_to_lru(struct zone *zone, | ||
1200 | struct list_head *list, | ||
1201 | enum lru_list lru) | ||
1202 | { | ||
1203 | unsigned long pgmoved = 0; | ||
1204 | struct pagevec pvec; | ||
1205 | struct page *page; | ||
1206 | |||
1207 | pagevec_init(&pvec, 1); | ||
1208 | |||
1209 | while (!list_empty(list)) { | ||
1210 | page = lru_to_page(list); | ||
1211 | prefetchw_prev_lru_page(page, list, flags); | ||
1212 | |||
1213 | VM_BUG_ON(PageLRU(page)); | ||
1214 | SetPageLRU(page); | ||
1215 | |||
1216 | VM_BUG_ON(!PageActive(page)); | ||
1217 | if (!is_active_lru(lru)) | ||
1218 | ClearPageActive(page); /* we are de-activating */ | ||
1219 | |||
1220 | list_move(&page->lru, &zone->lru[lru].list); | ||
1221 | mem_cgroup_add_lru_list(page, lru); | ||
1222 | pgmoved++; | ||
1223 | |||
1224 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | ||
1225 | spin_unlock_irq(&zone->lru_lock); | ||
1226 | if (buffer_heads_over_limit) | ||
1227 | pagevec_strip(&pvec); | ||
1228 | __pagevec_release(&pvec); | ||
1229 | spin_lock_irq(&zone->lru_lock); | ||
1230 | } | ||
1231 | } | ||
1232 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1233 | if (!is_active_lru(lru)) | ||
1234 | __count_vm_events(PGDEACTIVATE, pgmoved); | ||
1235 | } | ||
1220 | 1236 | ||
1221 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1237 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
1222 | struct scan_control *sc, int priority, int file) | 1238 | struct scan_control *sc, int priority, int file) |
1223 | { | 1239 | { |
1224 | unsigned long pgmoved; | 1240 | unsigned long pgmoved; |
1225 | int pgdeactivate = 0; | ||
1226 | unsigned long pgscanned; | 1241 | unsigned long pgscanned; |
1242 | unsigned long vm_flags; | ||
1227 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1243 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1244 | LIST_HEAD(l_active); | ||
1228 | LIST_HEAD(l_inactive); | 1245 | LIST_HEAD(l_inactive); |
1229 | struct page *page; | 1246 | struct page *page; |
1230 | struct pagevec pvec; | ||
1231 | enum lru_list lru; | ||
1232 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1247 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1233 | 1248 | ||
1234 | lru_add_drain(); | 1249 | lru_add_drain(); |
@@ -1245,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1245 | } | 1260 | } |
1246 | reclaim_stat->recent_scanned[!!file] += pgmoved; | 1261 | reclaim_stat->recent_scanned[!!file] += pgmoved; |
1247 | 1262 | ||
1263 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
1248 | if (file) | 1264 | if (file) |
1249 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1265 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); |
1250 | else | 1266 | else |
1251 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | 1267 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); |
1252 | spin_unlock_irq(&zone->lru_lock); | 1268 | spin_unlock_irq(&zone->lru_lock); |
1253 | 1269 | ||
1254 | pgmoved = 0; | 1270 | pgmoved = 0; /* count referenced (mapping) mapped pages */ |
1255 | while (!list_empty(&l_hold)) { | 1271 | while (!list_empty(&l_hold)) { |
1256 | cond_resched(); | 1272 | cond_resched(); |
1257 | page = lru_to_page(&l_hold); | 1273 | page = lru_to_page(&l_hold); |
@@ -1264,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1264 | 1280 | ||
1265 | /* page_referenced clears PageReferenced */ | 1281 | /* page_referenced clears PageReferenced */ |
1266 | if (page_mapping_inuse(page) && | 1282 | if (page_mapping_inuse(page) && |
1267 | page_referenced(page, 0, sc->mem_cgroup)) | 1283 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1268 | pgmoved++; | 1284 | pgmoved++; |
1285 | /* | ||
1286 | * Identify referenced, file-backed active pages and | ||
1287 | * give them one more trip around the active list. So | ||
1288 | * that executable code get better chances to stay in | ||
1289 | * memory under moderate memory pressure. Anon pages | ||
1290 | * are not likely to be evicted by use-once streaming | ||
1291 | * IO, plus JVM can create lots of anon VM_EXEC pages, | ||
1292 | * so we ignore them here. | ||
1293 | */ | ||
1294 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | ||
1295 | list_add(&page->lru, &l_active); | ||
1296 | continue; | ||
1297 | } | ||
1298 | } | ||
1269 | 1299 | ||
1270 | list_add(&page->lru, &l_inactive); | 1300 | list_add(&page->lru, &l_inactive); |
1271 | } | 1301 | } |
1272 | 1302 | ||
1273 | /* | 1303 | /* |
1274 | * Move the pages to the [file or anon] inactive list. | 1304 | * Move pages back to the lru list. |
1275 | */ | 1305 | */ |
1276 | pagevec_init(&pvec, 1); | ||
1277 | lru = LRU_BASE + file * LRU_FILE; | ||
1278 | |||
1279 | spin_lock_irq(&zone->lru_lock); | 1306 | spin_lock_irq(&zone->lru_lock); |
1280 | /* | 1307 | /* |
1281 | * Count referenced pages from currently used mappings as | 1308 | * Count referenced pages from currently used mappings as rotated, |
1282 | * rotated, even though they are moved to the inactive list. | 1309 | * even though only some of them are actually re-activated. This |
1283 | * This helps balance scan pressure between file and anonymous | 1310 | * helps balance scan pressure between file and anonymous pages in |
1284 | * pages in get_scan_ratio. | 1311 | * get_scan_ratio. |
1285 | */ | 1312 | */ |
1286 | reclaim_stat->recent_rotated[!!file] += pgmoved; | 1313 | reclaim_stat->recent_rotated[!!file] += pgmoved; |
1287 | 1314 | ||
1288 | pgmoved = 0; | 1315 | move_active_pages_to_lru(zone, &l_active, |
1289 | while (!list_empty(&l_inactive)) { | 1316 | LRU_ACTIVE + file * LRU_FILE); |
1290 | page = lru_to_page(&l_inactive); | 1317 | move_active_pages_to_lru(zone, &l_inactive, |
1291 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1318 | LRU_BASE + file * LRU_FILE); |
1292 | VM_BUG_ON(PageLRU(page)); | ||
1293 | SetPageLRU(page); | ||
1294 | VM_BUG_ON(!PageActive(page)); | ||
1295 | ClearPageActive(page); | ||
1296 | 1319 | ||
1297 | list_move(&page->lru, &zone->lru[lru].list); | ||
1298 | mem_cgroup_add_lru_list(page, lru); | ||
1299 | pgmoved++; | ||
1300 | if (!pagevec_add(&pvec, page)) { | ||
1301 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1302 | spin_unlock_irq(&zone->lru_lock); | ||
1303 | pgdeactivate += pgmoved; | ||
1304 | pgmoved = 0; | ||
1305 | if (buffer_heads_over_limit) | ||
1306 | pagevec_strip(&pvec); | ||
1307 | __pagevec_release(&pvec); | ||
1308 | spin_lock_irq(&zone->lru_lock); | ||
1309 | } | ||
1310 | } | ||
1311 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
1312 | pgdeactivate += pgmoved; | ||
1313 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
1314 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | ||
1315 | spin_unlock_irq(&zone->lru_lock); | 1320 | spin_unlock_irq(&zone->lru_lock); |
1316 | if (buffer_heads_over_limit) | ||
1317 | pagevec_strip(&pvec); | ||
1318 | pagevec_release(&pvec); | ||
1319 | } | 1321 | } |
1320 | 1322 | ||
1321 | static int inactive_anon_is_low_global(struct zone *zone) | 1323 | static int inactive_anon_is_low_global(struct zone *zone) |
@@ -1350,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1350 | return low; | 1352 | return low; |
1351 | } | 1353 | } |
1352 | 1354 | ||
1355 | static int inactive_file_is_low_global(struct zone *zone) | ||
1356 | { | ||
1357 | unsigned long active, inactive; | ||
1358 | |||
1359 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
1360 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
1361 | |||
1362 | return (active > inactive); | ||
1363 | } | ||
1364 | |||
1365 | /** | ||
1366 | * inactive_file_is_low - check if file pages need to be deactivated | ||
1367 | * @zone: zone to check | ||
1368 | * @sc: scan control of this context | ||
1369 | * | ||
1370 | * When the system is doing streaming IO, memory pressure here | ||
1371 | * ensures that active file pages get deactivated, until more | ||
1372 | * than half of the file pages are on the inactive list. | ||
1373 | * | ||
1374 | * Once we get to that situation, protect the system's working | ||
1375 | * set from being evicted by disabling active file page aging. | ||
1376 | * | ||
1377 | * This uses a different ratio than the anonymous pages, because | ||
1378 | * the page cache uses a use-once replacement algorithm. | ||
1379 | */ | ||
1380 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | ||
1381 | { | ||
1382 | int low; | ||
1383 | |||
1384 | if (scanning_global_lru(sc)) | ||
1385 | low = inactive_file_is_low_global(zone); | ||
1386 | else | ||
1387 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | ||
1388 | return low; | ||
1389 | } | ||
1390 | |||
1353 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1391 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
1354 | struct zone *zone, struct scan_control *sc, int priority) | 1392 | struct zone *zone, struct scan_control *sc, int priority) |
1355 | { | 1393 | { |
1356 | int file = is_file_lru(lru); | 1394 | int file = is_file_lru(lru); |
1357 | 1395 | ||
1358 | if (lru == LRU_ACTIVE_FILE) { | 1396 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { |
1359 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1397 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
1360 | return 0; | 1398 | return 0; |
1361 | } | 1399 | } |
@@ -1384,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1384 | unsigned long ap, fp; | 1422 | unsigned long ap, fp; |
1385 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1423 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1386 | 1424 | ||
1387 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
1388 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1389 | percent[0] = 0; | ||
1390 | percent[1] = 100; | ||
1391 | return; | ||
1392 | } | ||
1393 | |||
1394 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + | 1425 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + |
1395 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); | 1426 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); |
1396 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + | 1427 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + |
@@ -1400,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1400 | free = zone_page_state(zone, NR_FREE_PAGES); | 1431 | free = zone_page_state(zone, NR_FREE_PAGES); |
1401 | /* If we have very few page cache pages, | 1432 | /* If we have very few page cache pages, |
1402 | force-scan anon pages. */ | 1433 | force-scan anon pages. */ |
1403 | if (unlikely(file + free <= zone->pages_high)) { | 1434 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1404 | percent[0] = 100; | 1435 | percent[0] = 100; |
1405 | percent[1] = 0; | 1436 | percent[1] = 0; |
1406 | return; | 1437 | return; |
@@ -1455,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1455 | percent[1] = 100 - percent[0]; | 1486 | percent[1] = 100 - percent[0]; |
1456 | } | 1487 | } |
1457 | 1488 | ||
1489 | /* | ||
1490 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
1491 | * until we collected @swap_cluster_max pages to scan. | ||
1492 | */ | ||
1493 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
1494 | unsigned long *nr_saved_scan, | ||
1495 | unsigned long swap_cluster_max) | ||
1496 | { | ||
1497 | unsigned long nr; | ||
1498 | |||
1499 | *nr_saved_scan += nr_to_scan; | ||
1500 | nr = *nr_saved_scan; | ||
1501 | |||
1502 | if (nr >= swap_cluster_max) | ||
1503 | *nr_saved_scan = 0; | ||
1504 | else | ||
1505 | nr = 0; | ||
1506 | |||
1507 | return nr; | ||
1508 | } | ||
1458 | 1509 | ||
1459 | /* | 1510 | /* |
1460 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1511 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
@@ -1468,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1468 | enum lru_list l; | 1519 | enum lru_list l; |
1469 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1520 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1470 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1521 | unsigned long swap_cluster_max = sc->swap_cluster_max; |
1522 | int noswap = 0; | ||
1471 | 1523 | ||
1472 | get_scan_ratio(zone, sc, percent); | 1524 | /* If we have no swap space, do not bother scanning anon pages. */ |
1525 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1526 | noswap = 1; | ||
1527 | percent[0] = 0; | ||
1528 | percent[1] = 100; | ||
1529 | } else | ||
1530 | get_scan_ratio(zone, sc, percent); | ||
1473 | 1531 | ||
1474 | for_each_evictable_lru(l) { | 1532 | for_each_evictable_lru(l) { |
1475 | int file = is_file_lru(l); | 1533 | int file = is_file_lru(l); |
1476 | unsigned long scan; | 1534 | unsigned long scan; |
1477 | 1535 | ||
1478 | scan = zone_nr_pages(zone, sc, l); | 1536 | scan = zone_nr_pages(zone, sc, l); |
1479 | if (priority) { | 1537 | if (priority || noswap) { |
1480 | scan >>= priority; | 1538 | scan >>= priority; |
1481 | scan = (scan * percent[file]) / 100; | 1539 | scan = (scan * percent[file]) / 100; |
1482 | } | 1540 | } |
1483 | if (scanning_global_lru(sc)) { | 1541 | if (scanning_global_lru(sc)) |
1484 | zone->lru[l].nr_scan += scan; | 1542 | nr[l] = nr_scan_try_batch(scan, |
1485 | nr[l] = zone->lru[l].nr_scan; | 1543 | &zone->lru[l].nr_saved_scan, |
1486 | if (nr[l] >= swap_cluster_max) | 1544 | swap_cluster_max); |
1487 | zone->lru[l].nr_scan = 0; | 1545 | else |
1488 | else | ||
1489 | nr[l] = 0; | ||
1490 | } else | ||
1491 | nr[l] = scan; | 1546 | nr[l] = scan; |
1492 | } | 1547 | } |
1493 | 1548 | ||
@@ -1521,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1521 | * Even if we did not try to evict anon pages at all, we want to | 1576 | * Even if we did not try to evict anon pages at all, we want to |
1522 | * rebalance the anon lru active/inactive ratio. | 1577 | * rebalance the anon lru active/inactive ratio. |
1523 | */ | 1578 | */ |
1524 | if (inactive_anon_is_low(zone, sc)) | 1579 | if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) |
1525 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1580 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1526 | 1581 | ||
1527 | throttle_vm_writeout(sc->gfp_mask); | 1582 | throttle_vm_writeout(sc->gfp_mask); |
@@ -1532,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1532 | * try to reclaim pages from zones which will satisfy the caller's allocation | 1587 | * try to reclaim pages from zones which will satisfy the caller's allocation |
1533 | * request. | 1588 | * request. |
1534 | * | 1589 | * |
1535 | * We reclaim from a zone even if that zone is over pages_high. Because: | 1590 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). |
1591 | * Because: | ||
1536 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 1592 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
1537 | * allocation or | 1593 | * allocation or |
1538 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1594 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones |
1539 | * satisfy the `incremental min' zone defense algorithm. | 1595 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' |
1596 | * zone defense algorithm. | ||
1540 | * | 1597 | * |
1541 | * If a zone is deemed to be full of pinned pages then just give it a light | 1598 | * If a zone is deemed to be full of pinned pages then just give it a light |
1542 | * scan then give up on it. | 1599 | * scan then give up on it. |
@@ -1742,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1742 | 1799 | ||
1743 | /* | 1800 | /* |
1744 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1801 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1745 | * they are all at pages_high. | 1802 | * they are all at high_wmark_pages(zone). |
1746 | * | 1803 | * |
1747 | * Returns the number of pages which were actually freed. | 1804 | * Returns the number of pages which were actually freed. |
1748 | * | 1805 | * |
@@ -1755,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
1755 | * the zone for when the problem goes away. | 1812 | * the zone for when the problem goes away. |
1756 | * | 1813 | * |
1757 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 1814 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
1758 | * zones which have free_pages > pages_high, but once a zone is found to have | 1815 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
1759 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | 1816 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the |
1760 | * of the number of free pages in the lower zones. This interoperates with | 1817 | * lower zones regardless of the number of free pages in the lower zones. This |
1761 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1818 | * interoperates with the page allocator fallback scheme to ensure that aging |
1762 | * across the zones. | 1819 | * of pages is balanced across the zones. |
1763 | */ | 1820 | */ |
1764 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 1821 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1765 | { | 1822 | { |
@@ -1780,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
1780 | }; | 1837 | }; |
1781 | /* | 1838 | /* |
1782 | * temp_priority is used to remember the scanning priority at which | 1839 | * temp_priority is used to remember the scanning priority at which |
1783 | * this zone was successfully refilled to free_pages == pages_high. | 1840 | * this zone was successfully refilled to |
1841 | * free_pages == high_wmark_pages(zone). | ||
1784 | */ | 1842 | */ |
1785 | int temp_priority[MAX_NR_ZONES]; | 1843 | int temp_priority[MAX_NR_ZONES]; |
1786 | 1844 | ||
@@ -1825,8 +1883,8 @@ loop_again: | |||
1825 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1883 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
1826 | &sc, priority, 0); | 1884 | &sc, priority, 0); |
1827 | 1885 | ||
1828 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1886 | if (!zone_watermark_ok(zone, order, |
1829 | 0, 0)) { | 1887 | high_wmark_pages(zone), 0, 0)) { |
1830 | end_zone = i; | 1888 | end_zone = i; |
1831 | break; | 1889 | break; |
1832 | } | 1890 | } |
@@ -1860,8 +1918,8 @@ loop_again: | |||
1860 | priority != DEF_PRIORITY) | 1918 | priority != DEF_PRIORITY) |
1861 | continue; | 1919 | continue; |
1862 | 1920 | ||
1863 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1921 | if (!zone_watermark_ok(zone, order, |
1864 | end_zone, 0)) | 1922 | high_wmark_pages(zone), end_zone, 0)) |
1865 | all_zones_ok = 0; | 1923 | all_zones_ok = 0; |
1866 | temp_priority[i] = priority; | 1924 | temp_priority[i] = priority; |
1867 | sc.nr_scanned = 0; | 1925 | sc.nr_scanned = 0; |
@@ -1870,8 +1928,8 @@ loop_again: | |||
1870 | * We put equal pressure on every zone, unless one | 1928 | * We put equal pressure on every zone, unless one |
1871 | * zone has way too many pages free already. | 1929 | * zone has way too many pages free already. |
1872 | */ | 1930 | */ |
1873 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1931 | if (!zone_watermark_ok(zone, order, |
1874 | end_zone, 0)) | 1932 | 8*high_wmark_pages(zone), end_zone, 0)) |
1875 | shrink_zone(priority, zone, &sc); | 1933 | shrink_zone(priority, zone, &sc); |
1876 | reclaim_state->reclaimed_slab = 0; | 1934 | reclaim_state->reclaimed_slab = 0; |
1877 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1935 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
@@ -2037,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
2037 | return; | 2095 | return; |
2038 | 2096 | ||
2039 | pgdat = zone->zone_pgdat; | 2097 | pgdat = zone->zone_pgdat; |
2040 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) | 2098 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
2041 | return; | 2099 | return; |
2042 | if (pgdat->kswapd_max_order < order) | 2100 | if (pgdat->kswapd_max_order < order) |
2043 | pgdat->kswapd_max_order = order; | 2101 | pgdat->kswapd_max_order = order; |
@@ -2084,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, | |||
2084 | l == LRU_ACTIVE_FILE)) | 2142 | l == LRU_ACTIVE_FILE)) |
2085 | continue; | 2143 | continue; |
2086 | 2144 | ||
2087 | zone->lru[l].nr_scan += (lru_pages >> prio) + 1; | 2145 | zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; |
2088 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | 2146 | if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { |
2089 | unsigned long nr_to_scan; | 2147 | unsigned long nr_to_scan; |
2090 | 2148 | ||
2091 | zone->lru[l].nr_scan = 0; | 2149 | zone->lru[l].nr_saved_scan = 0; |
2092 | nr_to_scan = min(nr_pages, lru_pages); | 2150 | nr_to_scan = min(nr_pages, lru_pages); |
2093 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | 2151 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, |
2094 | sc, prio); | 2152 | sc, prio); |
@@ -2290,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1; | |||
2290 | */ | 2348 | */ |
2291 | int sysctl_min_slab_ratio = 5; | 2349 | int sysctl_min_slab_ratio = 5; |
2292 | 2350 | ||
2351 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | ||
2352 | { | ||
2353 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | ||
2354 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | ||
2355 | zone_page_state(zone, NR_ACTIVE_FILE); | ||
2356 | |||
2357 | /* | ||
2358 | * It's possible for there to be more file mapped pages than | ||
2359 | * accounted for by the pages on the file LRU lists because | ||
2360 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED | ||
2361 | */ | ||
2362 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; | ||
2363 | } | ||
2364 | |||
2365 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | ||
2366 | static long zone_pagecache_reclaimable(struct zone *zone) | ||
2367 | { | ||
2368 | long nr_pagecache_reclaimable; | ||
2369 | long delta = 0; | ||
2370 | |||
2371 | /* | ||
2372 | * If RECLAIM_SWAP is set, then all file pages are considered | ||
2373 | * potentially reclaimable. Otherwise, we have to worry about | ||
2374 | * pages like swapcache and zone_unmapped_file_pages() provides | ||
2375 | * a better estimate | ||
2376 | */ | ||
2377 | if (zone_reclaim_mode & RECLAIM_SWAP) | ||
2378 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | ||
2379 | else | ||
2380 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | ||
2381 | |||
2382 | /* If we can't clean pages, remove dirty pages from consideration */ | ||
2383 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | ||
2384 | delta += zone_page_state(zone, NR_FILE_DIRTY); | ||
2385 | |||
2386 | /* Watch for any possible underflows due to delta */ | ||
2387 | if (unlikely(delta > nr_pagecache_reclaimable)) | ||
2388 | delta = nr_pagecache_reclaimable; | ||
2389 | |||
2390 | return nr_pagecache_reclaimable - delta; | ||
2391 | } | ||
2392 | |||
2293 | /* | 2393 | /* |
2294 | * Try to free up some pages from this zone through reclaim. | 2394 | * Try to free up some pages from this zone through reclaim. |
2295 | */ | 2395 | */ |
@@ -2324,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2324 | reclaim_state.reclaimed_slab = 0; | 2424 | reclaim_state.reclaimed_slab = 0; |
2325 | p->reclaim_state = &reclaim_state; | 2425 | p->reclaim_state = &reclaim_state; |
2326 | 2426 | ||
2327 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2427 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { |
2328 | zone_page_state(zone, NR_FILE_MAPPED) > | ||
2329 | zone->min_unmapped_pages) { | ||
2330 | /* | 2428 | /* |
2331 | * Free memory by calling shrink zone with increasing | 2429 | * Free memory by calling shrink zone with increasing |
2332 | * priorities until we have enough memory freed. | 2430 | * priorities until we have enough memory freed. |
@@ -2384,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2384 | * if less than a specified percentage of the zone is used by | 2482 | * if less than a specified percentage of the zone is used by |
2385 | * unmapped file backed pages. | 2483 | * unmapped file backed pages. |
2386 | */ | 2484 | */ |
2387 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2485 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && |
2388 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages | 2486 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
2389 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | 2487 | return ZONE_RECLAIM_FULL; |
2390 | <= zone->min_slab_pages) | ||
2391 | return 0; | ||
2392 | 2488 | ||
2393 | if (zone_is_all_unreclaimable(zone)) | 2489 | if (zone_is_all_unreclaimable(zone)) |
2394 | return 0; | 2490 | return ZONE_RECLAIM_FULL; |
2395 | 2491 | ||
2396 | /* | 2492 | /* |
2397 | * Do not scan if the allocation should not be delayed. | 2493 | * Do not scan if the allocation should not be delayed. |
2398 | */ | 2494 | */ |
2399 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) | 2495 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
2400 | return 0; | 2496 | return ZONE_RECLAIM_NOSCAN; |
2401 | 2497 | ||
2402 | /* | 2498 | /* |
2403 | * Only run zone reclaim on the local zone or on zones that do not | 2499 | * Only run zone reclaim on the local zone or on zones that do not |
@@ -2407,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2407 | */ | 2503 | */ |
2408 | node_id = zone_to_nid(zone); | 2504 | node_id = zone_to_nid(zone); |
2409 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 2505 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
2410 | return 0; | 2506 | return ZONE_RECLAIM_NOSCAN; |
2411 | 2507 | ||
2412 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 2508 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) |
2413 | return 0; | 2509 | return ZONE_RECLAIM_NOSCAN; |
2510 | |||
2414 | ret = __zone_reclaim(zone, gfp_mask, order); | 2511 | ret = __zone_reclaim(zone, gfp_mask, order); |
2415 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 2512 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); |
2416 | 2513 | ||
2514 | if (!ret) | ||
2515 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | ||
2516 | |||
2417 | return ret; | 2517 | return ret; |
2418 | } | 2518 | } |
2419 | #endif | 2519 | #endif |
2420 | 2520 | ||
2421 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
2422 | /* | 2521 | /* |
2423 | * page_evictable - test whether a page is evictable | 2522 | * page_evictable - test whether a page is evictable |
2424 | * @page: the page to test | 2523 | * @page: the page to test |
@@ -2665,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node) | |||
2665 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 2764 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
2666 | } | 2765 | } |
2667 | 2766 | ||
2668 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cbe..138bed53706e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { | |||
629 | "nr_active_anon", | 629 | "nr_active_anon", |
630 | "nr_inactive_file", | 630 | "nr_inactive_file", |
631 | "nr_active_file", | 631 | "nr_active_file", |
632 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
633 | "nr_unevictable", | 632 | "nr_unevictable", |
634 | "nr_mlock", | 633 | "nr_mlock", |
635 | #endif | ||
636 | "nr_anon_pages", | 634 | "nr_anon_pages", |
637 | "nr_mapped", | 635 | "nr_mapped", |
638 | "nr_file_pages", | 636 | "nr_file_pages", |
@@ -675,6 +673,9 @@ static const char * const vmstat_text[] = { | |||
675 | TEXTS_FOR_ZONES("pgscan_kswapd") | 673 | TEXTS_FOR_ZONES("pgscan_kswapd") |
676 | TEXTS_FOR_ZONES("pgscan_direct") | 674 | TEXTS_FOR_ZONES("pgscan_direct") |
677 | 675 | ||
676 | #ifdef CONFIG_NUMA | ||
677 | "zone_reclaim_failed", | ||
678 | #endif | ||
678 | "pginodesteal", | 679 | "pginodesteal", |
679 | "slabs_scanned", | 680 | "slabs_scanned", |
680 | "kswapd_steal", | 681 | "kswapd_steal", |
@@ -687,7 +688,6 @@ static const char * const vmstat_text[] = { | |||
687 | "htlb_buddy_alloc_success", | 688 | "htlb_buddy_alloc_success", |
688 | "htlb_buddy_alloc_fail", | 689 | "htlb_buddy_alloc_fail", |
689 | #endif | 690 | #endif |
690 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
691 | "unevictable_pgs_culled", | 691 | "unevictable_pgs_culled", |
692 | "unevictable_pgs_scanned", | 692 | "unevictable_pgs_scanned", |
693 | "unevictable_pgs_rescued", | 693 | "unevictable_pgs_rescued", |
@@ -697,7 +697,6 @@ static const char * const vmstat_text[] = { | |||
697 | "unevictable_pgs_stranded", | 697 | "unevictable_pgs_stranded", |
698 | "unevictable_pgs_mlockfreed", | 698 | "unevictable_pgs_mlockfreed", |
699 | #endif | 699 | #endif |
700 | #endif | ||
701 | }; | 700 | }; |
702 | 701 | ||
703 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 702 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
@@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
710 | "\n min %lu" | 709 | "\n min %lu" |
711 | "\n low %lu" | 710 | "\n low %lu" |
712 | "\n high %lu" | 711 | "\n high %lu" |
713 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" | 712 | "\n scanned %lu" |
714 | "\n spanned %lu" | 713 | "\n spanned %lu" |
715 | "\n present %lu", | 714 | "\n present %lu", |
716 | zone_page_state(zone, NR_FREE_PAGES), | 715 | zone_page_state(zone, NR_FREE_PAGES), |
717 | zone->pages_min, | 716 | min_wmark_pages(zone), |
718 | zone->pages_low, | 717 | low_wmark_pages(zone), |
719 | zone->pages_high, | 718 | high_wmark_pages(zone), |
720 | zone->pages_scanned, | 719 | zone->pages_scanned, |
721 | zone->lru[LRU_ACTIVE_ANON].nr_scan, | ||
722 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
723 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
724 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
725 | zone->spanned_pages, | 720 | zone->spanned_pages, |
726 | zone->present_pages); | 721 | zone->present_pages); |
727 | 722 | ||