Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  18
-rw-r--r--  mm/Kconfig.debug     |   1
-rw-r--r--  mm/Makefile          |   2
-rw-r--r--  mm/bounce.c          |   1
-rw-r--r--  mm/fadvise.c         |   2
-rw-r--r--  mm/filemap.c         | 169
-rw-r--r--  mm/highmem.c         |   1
-rw-r--r--  mm/hugetlb.c         | 106
-rw-r--r--  mm/init-mm.c         |  20
-rw-r--r--  mm/internal.h        |  33
-rw-r--r--  mm/kmemcheck.c       | 122
-rw-r--r--  mm/madvise.c         |  26
-rw-r--r--  mm/memcontrol.c      |  11
-rw-r--r--  mm/memory.c          | 128
-rw-r--r--  mm/memory_hotplug.c  |   6
-rw-r--r--  mm/mempolicy.c       | 145
-rw-r--r--  mm/migrate.c         |   6
-rw-r--r--  mm/mlock.c           |  22
-rw-r--r--  mm/oom_kill.c        |  64
-rw-r--r--  mm/page-writeback.c  |  19
-rw-r--r--  mm/page_alloc.c      | 772
-rw-r--r--  mm/page_io.c         |   2
-rw-r--r--  mm/readahead.c       | 145
-rw-r--r--  mm/rmap.c            |  40
-rw-r--r--  mm/shmem.c           |   4
-rw-r--r--  mm/slab.c            | 117
-rw-r--r--  mm/slob.c            |   4
-rw-r--r--  mm/slub.c            |  40
-rw-r--r--  mm/swap_state.c      |  17
-rw-r--r--  mm/swapfile.c        | 276
-rw-r--r--  mm/truncate.c        |  39
-rw-r--r--  mm/util.c            |  16
-rw-r--r--  mm/vmscan.c          | 372
-rw-r--r--  mm/vmstat.c          |  19
34 files changed, 1799 insertions, 966 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 71830ba7b986..c948d4ca8bde 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,11 +128,11 @@ config SPARSEMEM_VMEMMAP
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
-	depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC64 || SUPERH || S390)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
-	depends on SPARSEMEM && HOTPLUG && HIBERNATION
+	depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
 
 config MEMORY_HOTPLUG_SPARSE
 	def_bool y
@@ -203,25 +203,13 @@ config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
-config UNEVICTABLE_LRU
-	bool "Add LRU list to track non-evictable pages"
-	default y
-	help
-	  Keeps unevictable pages off of the active and inactive pageout
-	  lists, so kswapd will not waste CPU time or have its balancing
-	  algorithms thrown off by scanning these pages.  Selecting this
-	  will use one page flag and increase the code size a little,
-	  say Y unless you know what you are doing.
-
-	  See Documentation/vm/unevictable-lru.txt for more information.
-
 config HAVE_MLOCK
 	bool
 	default y if MMU=y
 
 config HAVE_MLOCKED_PAGE_BIT
 	bool
-	default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
+	default y if HAVE_MLOCK=y
 
 config MMU_NOTIFIER
 	bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index bb01e298f260..aa99fd1f7109 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -2,6 +2,7 @@ config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	depends on !HIBERNATION || !PPC && !SPARC
+	depends on !KMEMCHECK
 	---help---
 	  Unmap pages from the kernel linear mapping after free_pages().
 	  This results in a large slowdown, but helps to find certain types
diff --git a/mm/Makefile b/mm/Makefile
index e89acb090b4d..5e0bd6426693 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
+obj-y += init-mm.o
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 4ebe3ea83795..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,7 +13,6 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 
 		ret = force_page_cache_readahead(mapping, file,
 				start_index,
-				max_sane_readahead(nrpages));
+				nrpages);
 		if (ret > 0)
 			ret = 0;
 		break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1b60f30cebfa..22396713feb9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
 {
 	if (cpuset_do_page_mem_spread()) {
 		int n = cpuset_mem_spread_node();
-		return alloc_pages_node(n, gfp, 0);
+		return alloc_pages_exact_node(n, gfp, 0);
 	}
 	return alloc_pages(gfp, 0);
 }
@@ -1004,9 +1004,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
 static void shrink_readahead_size_eio(struct file *filp,
 					struct file_ra_state *ra)
 {
-	if (!ra->ra_pages)
-		return;
-
 	ra->ra_pages /= 4;
 }
 
@@ -1390,8 +1387,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
 		return -EINVAL;
 
-	force_page_cache_readahead(mapping, filp, index,
-			max_sane_readahead(nr));
+	force_page_cache_readahead(mapping, filp, index, nr);
 	return 0;
 }
 
@@ -1457,6 +1453,73 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 
 #define MMAP_LOTSAMISS  (100)
 
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+				   struct file_ra_state *ra,
+				   struct file *file,
+				   pgoff_t offset)
+{
+	unsigned long ra_pages;
+	struct address_space *mapping = file->f_mapping;
+
+	/* If we don't want any read-ahead, don't bother */
+	if (VM_RandomReadHint(vma))
+		return;
+
+	if (VM_SequentialReadHint(vma) ||
+			offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
+		page_cache_sync_readahead(mapping, ra, file, offset,
+					  ra->ra_pages);
+		return;
+	}
+
+	if (ra->mmap_miss < INT_MAX)
+		ra->mmap_miss++;
+
+	/*
+	 * Do we miss much more than hit in this file? If so,
+	 * stop bothering with read-ahead. It will only hurt.
+	 */
+	if (ra->mmap_miss > MMAP_LOTSAMISS)
+		return;
+
+	/*
+	 * mmap read-around
+	 */
+	ra_pages = max_sane_readahead(ra->ra_pages);
+	if (ra_pages) {
+		ra->start = max_t(long, 0, offset - ra_pages/2);
+		ra->size = ra_pages;
+		ra->async_size = 0;
+		ra_submit(ra, mapping, file);
+	}
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further..
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+				    struct file_ra_state *ra,
+				    struct file *file,
+				    struct page *page,
+				    pgoff_t offset)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	/* If we don't want any read-ahead, don't bother */
+	if (VM_RandomReadHint(vma))
+		return;
+	if (ra->mmap_miss > 0)
+		ra->mmap_miss--;
+	if (PageReadahead(page))
+		page_cache_async_readahead(mapping, ra, file,
+					   page, offset, ra->ra_pages);
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1476,78 +1539,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct address_space *mapping = file->f_mapping;
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
+	pgoff_t offset = vmf->pgoff;
 	struct page *page;
 	pgoff_t size;
-	int did_readaround = 0;
 	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (vmf->pgoff >= size)
+	if (offset >= size)
 		return VM_FAULT_SIGBUS;
 
-	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
-		goto no_cached_page;
-
 	/*
 	 * Do we have something in the page cache already?
 	 */
-retry_find:
-	page = find_lock_page(mapping, vmf->pgoff);
-	/*
-	 * For sequential accesses, we use the generic readahead logic.
-	 */
-	if (VM_SequentialReadHint(vma)) {
-		if (!page) {
-			page_cache_sync_readahead(mapping, ra, file,
-							   vmf->pgoff, 1);
-			page = find_lock_page(mapping, vmf->pgoff);
-			if (!page)
-				goto no_cached_page;
-		}
-		if (PageReadahead(page)) {
-			page_cache_async_readahead(mapping, ra, file, page,
-							   vmf->pgoff, 1);
-		}
-	}
-
-	if (!page) {
-		unsigned long ra_pages;
-
-		ra->mmap_miss++;
-
+	page = find_get_page(mapping, offset);
+	if (likely(page)) {
 		/*
-		 * Do we miss much more than hit in this file? If so,
-		 * stop bothering with read-ahead. It will only hurt.
+		 * We found the page, so try async readahead before
+		 * waiting for the lock.
 		 */
-		if (ra->mmap_miss > MMAP_LOTSAMISS)
-			goto no_cached_page;
+		do_async_mmap_readahead(vma, ra, file, page, offset);
+		lock_page(page);
 
-		/*
-		 * To keep the pgmajfault counter straight, we need to
-		 * check did_readaround, as this is an inner loop.
-		 */
-		if (!did_readaround) {
-			ret = VM_FAULT_MAJOR;
-			count_vm_event(PGMAJFAULT);
-		}
-		did_readaround = 1;
-		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
-		if (ra_pages) {
-			pgoff_t start = 0;
-
-			if (vmf->pgoff > ra_pages / 2)
-				start = vmf->pgoff - ra_pages / 2;
-			do_page_cache_readahead(mapping, file, start, ra_pages);
+		/* Did it get truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			put_page(page);
+			goto no_cached_page;
 		}
-		page = find_lock_page(mapping, vmf->pgoff);
+	} else {
+		/* No page in the page cache at all */
+		do_sync_mmap_readahead(vma, ra, file, offset);
+		count_vm_event(PGMAJFAULT);
+		ret = VM_FAULT_MAJOR;
+retry_find:
+		page = find_lock_page(mapping, offset);
 		if (!page)
 			goto no_cached_page;
 	}
 
-	if (!did_readaround)
-		ra->mmap_miss--;
-
 	/*
 	 * We have a locked page in the page cache, now we need to check
 	 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1555,18 +1584,18 @@ retry_find:
 	if (unlikely(!PageUptodate(page)))
 		goto page_not_uptodate;
 
-	/* Must recheck i_size under page lock */
+	/*
+	 * Found the page and have a reference on it.
+	 * We must recheck i_size under page lock.
+	 */
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(vmf->pgoff >= size)) {
+	if (unlikely(offset >= size)) {
 		unlock_page(page);
 		page_cache_release(page);
 		return VM_FAULT_SIGBUS;
 	}
 
-	/*
-	 * Found the page and have a reference on it.
-	 */
-	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
 
@@ -1575,7 +1604,7 @@ no_cached_page:
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, vmf->pgoff);
+	error = page_cache_read(file, offset);
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1595,12 +1624,6 @@ no_cached_page:
 	return VM_FAULT_SIGBUS;
 
 page_not_uptodate:
-	/* IO error path */
-	if (!did_readaround) {
-		ret = VM_FAULT_MAJOR;
-		count_vm_event(PGMAJFAULT);
-	}
-
 	/*
 	 * Umm, take care of errors if the page isn't up-to-date.
 	 * Try to re-read it _once_. We do this synchronously,
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..25878cc49daa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e83ad2c9228c..a56e6f3ce979 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -578,41 +578,6 @@ static void free_huge_page(struct page *page)
 	hugetlb_put_quota(mapping, 1);
 }
 
-/*
- * Increment or decrement surplus_huge_pages.  Keep node-specific counters
- * balanced by operating on them in a round-robin fashion.
- * Returns 1 if an adjustment was made.
- */
-static int adjust_pool_surplus(struct hstate *h, int delta)
-{
-	static int prev_nid;
-	int nid = prev_nid;
-	int ret = 0;
-
-	VM_BUG_ON(delta != -1 && delta != 1);
-	do {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
-
-		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !h->surplus_huge_pages_node[nid])
-			continue;
-		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
-						h->nr_huge_pages_node[nid])
-			continue;
-
-		h->surplus_huge_pages += delta;
-		h->surplus_huge_pages_node[nid] += delta;
-		ret = 1;
-		break;
-	} while (nid != prev_nid);
-
-	prev_nid = nid;
-	return ret;
-}
-
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	set_compound_page_dtor(page, free_huge_page);
@@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	put_page(page); /* free it into the hugepage allocator */
 }
 
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+	struct page *p = page + 1;
+
+	/* we rely on prep_new_huge_page to set the destructor */
+	set_compound_order(page, order);
+	__SetPageHead(page);
+	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+		__SetPageTail(p);
+		p->first_page = page;
+	}
+}
+
+int PageHuge(struct page *page)
+{
+	compound_page_dtor *dtor;
+
+	if (!PageCompound(page))
+		return 0;
+
+	page = compound_head(page);
+	dtor = get_compound_page_dtor(page);
+
+	return dtor == free_huge_page;
+}
+
 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
@@ -630,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 	if (h->order >= MAX_ORDER)
 		return NULL;
 
-	page = alloc_pages_node(nid,
+	page = alloc_pages_exact_node(nid,
 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 						__GFP_REPEAT|__GFP_NOWARN,
 		huge_page_order(h));
@@ -649,7 +642,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 	 * Use a helper variable to find the next node and then
 	 * copy it back to hugetlb_next_nid afterwards:
 	 * otherwise there's a window in which a racer might
-	 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+	 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
 	 * But we don't need to use a spin_lock here: it really
 	 * doesn't matter if occasionally a racer chooses the
 	 * same nid as we do.  Move nid forward in the mask even
@@ -875,7 +868,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 	 * can no longer free unreserved surplus pages. This occurs when
 	 * the nodes with surplus pages have no free pages.
 	 */
-	unsigned long remaining_iterations = num_online_nodes();
+	unsigned long remaining_iterations = nr_online_nodes;
 
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
@@ -904,7 +897,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 			h->surplus_huge_pages--;
 			h->surplus_huge_pages_node[nid]--;
 			nr_pages--;
-			remaining_iterations = num_online_nodes();
+			remaining_iterations = nr_online_nodes;
 		}
 	}
 }
@@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
 }
 #endif
 
+/*
+ * Increment or decrement surplus_huge_pages.  Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(struct hstate *h, int delta)
+{
+	static int prev_nid;
+	int nid = prev_nid;
+	int ret = 0;
+
+	VM_BUG_ON(delta != -1 && delta != 1);
+	do {
+		nid = next_node(nid, node_online_map);
+		if (nid == MAX_NUMNODES)
+			nid = first_node(node_online_map);
+
+		/* To shrink on this node, there must be a surplus page */
+		if (delta < 0 && !h->surplus_huge_pages_node[nid])
+			continue;
+		/* Surplus cannot exceed the total number of pages */
+		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+						h->nr_huge_pages_node[nid])
+			continue;
+
+		h->surplus_huge_pages += delta;
+		h->surplus_huge_pages_node[nid] += delta;
+		ret = 1;
+		break;
+	} while (nid != prev_nid);
+
+	prev_nid = nid;
+	return ret;
+}
+
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 {
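[Editor's note, not part of the patch: the new PageHuge() test above identifies a hugetlb page by comparing its compound destructor, so callers outside mm/hugetlb.c can distinguish huge pages from ordinary compound pages. A minimal, hypothetical caller might look like the sketch below; the helper name is invented for illustration.]

```c
#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Hypothetical helper: how many base pages does this page span? */
static unsigned long pages_spanned(struct page *page)
{
	if (PageHuge(page))
		/* a hugetlb page covers 2^compound_order base pages */
		return 1UL << compound_order(compound_head(page));
	return 1;
}
```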
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 000000000000..57aba0da9668
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,20 @@
+#include <linux/mm_types.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpumask.h>
+
+#include <asm/atomic.h>
+#include <asm/pgtable.h>
+
+struct mm_struct init_mm = {
+	.mm_rb		= RB_ROOT,
+	.pgd		= swapper_pg_dir,
+	.mm_users	= ATOMIC_INIT(2),
+	.mm_count	= ATOMIC_INIT(1),
+	.mmap_sem	= __RWSEM_INITIALIZER(init_mm.mmap_sem),
+	.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
+	.cpu_vm_mask	= CPU_MASK_ALL,
+};
diff --git a/mm/internal.h b/mm/internal.h
index 987bb03fbdd8..f290c4db528b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -16,9 +16,6 @@
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
-extern void prep_compound_page(struct page *page, unsigned long order);
-extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
-
 static inline void set_page_count(struct page *page, int v)
 {
 	atomic_set(&page->_count, v);
@@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page);
  */
 extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
+extern void prep_compound_page(struct page *page, unsigned long order);
+
 
 /*
  * function for dealing with page's order in buddy system.
@@ -74,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 }
 #endif
 
-#ifdef CONFIG_UNEVICTABLE_LRU
 /*
  * unevictable_migrate_page() called only from migrate_page_copy() to
  * migrate unevictable flag to new page.
@@ -86,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
 	if (TestClearPageUnevictable(old))
 		SetPageUnevictable(new);
 }
-#else
-static inline void unevictable_migrate_page(struct page *new, struct page *old)
-{
-}
-#endif
 
 #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
@@ -150,23 +143,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 	}
 }
 
-/*
- * free_page_mlock() -- clean up attempts to free and mlocked() page.
- * Page should not be on lru, so no need to fix that up.
- * free_pages_check() will verify...
- */
-static inline void free_page_mlock(struct page *page)
-{
-	if (unlikely(TestClearPageMlocked(page))) {
-		unsigned long flags;
-
-		local_irq_save(flags);
-		__dec_zone_page_state(page, NR_MLOCK);
-		__count_vm_event(UNEVICTABLE_MLOCKFREED);
-		local_irq_restore(flags);
-	}
-}
-
 #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
 static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 {
@@ -175,7 +151,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-static inline void free_page_mlock(struct page *page) { }
 
 #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
 
@@ -284,4 +259,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
 		     struct page **pages, struct vm_area_struct **vmas);
 
+#define ZONE_RECLAIM_NOSCAN	-2
+#define ZONE_RECLAIM_FULL	-1
+#define ZONE_RECLAIM_SOME	0
+#define ZONE_RECLAIM_SUCCESS	1
 #endif
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 000000000000..fd814fd61319
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
+#include <linux/gfp.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kmemcheck.h>
+
+void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
+{
+	struct page *shadow;
+	int pages;
+	int i;
+
+	pages = 1 << order;
+
+	/*
+	 * With kmemcheck enabled, we need to allocate a memory area for the
+	 * shadow bits as well.
+	 */
+	shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+	if (!shadow) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "kmemcheck: failed to allocate "
+				"shadow bitmap\n");
+		return;
+	}
+
+	for(i = 0; i < pages; ++i)
+		page[i].shadow = page_address(&shadow[i]);
+
+	/*
+	 * Mark it as non-present for the MMU so that our accesses to
+	 * this memory will trigger a page fault and let us analyze
+	 * the memory accesses.
+	 */
+	kmemcheck_hide_pages(page, pages);
+}
+
+void kmemcheck_free_shadow(struct page *page, int order)
+{
+	struct page *shadow;
+	int pages;
+	int i;
+
+	if (!kmemcheck_page_is_tracked(page))
+		return;
+
+	pages = 1 << order;
+
+	kmemcheck_show_pages(page, pages);
+
+	shadow = virt_to_page(page[0].shadow);
+
+	for(i = 0; i < pages; ++i)
+		page[i].shadow = NULL;
+
+	__free_pages(shadow, order);
+}
+
+void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
+			  size_t size)
+{
+	/*
+	 * Has already been memset(), which initializes the shadow for us
+	 * as well.
+	 */
+	if (gfpflags & __GFP_ZERO)
+		return;
+
+	/* No need to initialize the shadow of a non-tracked slab. */
+	if (s->flags & SLAB_NOTRACK)
+		return;
+
+	if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
+		/*
+		 * Allow notracked objects to be allocated from
+		 * tracked caches. Note however that these objects
+		 * will still get page faults on access, they just
+		 * won't ever be flagged as uninitialized. If page
+		 * faults are not acceptable, the slab cache itself
+		 * should be marked NOTRACK.
+		 */
+		kmemcheck_mark_initialized(object, size);
+	} else if (!s->ctor) {
+		/*
+		 * New objects should be marked uninitialized before
+		 * they're returned to the called.
+		 */
+		kmemcheck_mark_uninitialized(object, size);
+	}
+}
+
+void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
+{
+	/* TODO: RCU freeing is unsupported for now; hide false positives. */
+	if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
+		kmemcheck_mark_freed(object, size);
+}
+
+void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
+			       gfp_t gfpflags)
+{
+	int pages;
+
+	if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
+		return;
+
+	pages = 1 << order;
+
+	/*
+	 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
+	 * can become uninitialized by copying uninitialized memory
+	 * into them.
+	 */
+
+	/* XXX: Can use zone->node for node? */
+	kmemcheck_alloc_shadow(page, order, gfpflags, -1);
+
+	if (gfpflags & __GFP_ZERO)
+		kmemcheck_mark_initialized_pages(page, pages);
+	else
+		kmemcheck_mark_uninitialized_pages(page, pages);
+}
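[Editor's note, not part of the patch: the hooks above honour SLAB_NOTRACK and __GFP_NOTRACK, so code that cannot tolerate kmemcheck-induced page faults can opt out of tracking. A hypothetical opt-out, with invented names, might look like this:]

```c
#include <linux/slab.h>
#include <linux/gfp.h>

/* Hypothetical cache whose objects should never be tracked by kmemcheck. */
static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	foo_cache = kmem_cache_create("foo_cache", 128, 0, SLAB_NOTRACK, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

/* A single allocation can also opt out with __GFP_NOTRACK. */
static void *foo_alloc_untracked(void)
{
	return kmalloc(64, GFP_KERNEL | __GFP_NOTRACK);
}
```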
diff --git a/mm/madvise.c b/mm/madvise.c
index b9ce574827c8..76eb4193acdd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -123,8 +123,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
 		end = vma->vm_end;
 	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	force_page_cache_readahead(file->f_mapping,
-			file, start, max_sane_readahead(end - start));
+	force_page_cache_readahead(file->f_mapping, file, start, end - start);
 	return 0;
 }
 
@@ -239,12 +238,30 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		break;
 
 	default:
-		error = -EINVAL;
+		BUG();
 		break;
 	}
 	return error;
 }
 
+static int
+madvise_behavior_valid(int behavior)
+{
+	switch (behavior) {
+	case MADV_DOFORK:
+	case MADV_DONTFORK:
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+	case MADV_REMOVE:
+	case MADV_WILLNEED:
+	case MADV_DONTNEED:
+		return 1;
+
+	default:
+		return 0;
+	}
+}
 /*
  * The madvise(2) system call.
  *
@@ -290,6 +307,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	int write;
 	size_t len;
 
+	if (!madvise_behavior_valid(behavior))
+		return error;
+
 	write = madvise_need_mmap_write(behavior);
 	if (write)
 		down_write(&current->mm->mmap_sem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78eb8552818b..70db6e0a5eec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -570,6 +570,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
 	return 0;
 }
 
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+{
+	unsigned long active;
+	unsigned long inactive;
+
+	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
+	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
+
+	return (active > inactive);
+}
+
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru)
diff --git a/mm/memory.c b/mm/memory.c
index 4126dd16778c..d5d1653d60a6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	return i;
 }
 
+/**
+ * get_user_pages() - pin user pages in memory
+ * @tsk:	task_struct of target task
+ * @mm:		mm_struct of target mm
+ * @start:	starting user address
+ * @len:	number of pages from start to pin
+ * @write:	whether pages will be written to by the caller
+ * @force:	whether to force write access even if user mapping is
+ *		readonly. This will result in the page being COWed even
+ *		in MAP_SHARED mappings. You do not want this.
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long. Or NULL, if caller
+ *		only intends to ensure the pages are faulted in.
+ * @vmas:	array of pointers to vmas corresponding to each page.
+ *		Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If len is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -3053,22 +3103,13 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
-#ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags,
-		unsigned long *prot, resource_size_t *phys)
+static int follow_pte(struct mm_struct *mm, unsigned long address,
+		pte_t **ptepp, spinlock_t **ptlp)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
-	resource_size_t phys_addr = 0;
-	struct mm_struct *mm = vma->vm_mm;
-	int ret = -EINVAL;
-
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		goto out;
+	pte_t *ptep;
 
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -3086,22 +3127,71 @@ int follow_phys(struct vm_area_struct *vma,
 	if (pmd_huge(*pmd))
 		goto out;
 
-	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 	if (!ptep)
 		goto out;
+	if (!pte_present(*ptep))
+		goto unlock;
+	*ptepp = ptep;
+	return 0;
+unlock:
+	pte_unmap_unlock(ptep, *ptlp);
+out:
+	return -EINVAL;
+}
 
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+	unsigned long *pfn)
+{
+	int ret = -EINVAL;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return ret;
+
+	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+	if (ret)
+		return ret;
+	*pfn = pte_pfn(*ptep);
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+		unsigned long address, unsigned int flags,
+		unsigned long *prot, resource_size_t *phys)
+{
+	int ret = -EINVAL;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;
+
+	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+		goto out;
 	pte = *ptep;
-	if (!pte_present(pte))
-		goto unlock;
+
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	phys_addr = pte_pfn(pte);
-	phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
 
 	*prot = pgprot_val(pte_pgprot(pte));
-	*phys = phys_addr;
-	ret = 0;
+	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
 
+	ret = 0;
 unlock:
 	pte_unmap_unlock(ptep, ptl);
 out:
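[Editor's note, not part of the patch: follow_pfn() is exported above for code that needs the PFN behind a VM_IO/VM_PFNMAP mapping. A hypothetical caller, holding mmap_sem for read as the kernel-doc requires, might look like this (the wrapper name is invented):]

```c
#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical: translate a user virtual address in a PFN-mapped VMA. */
static int lookup_user_pfn(struct mm_struct *mm, unsigned long addr,
			   unsigned long *pfn)
{
	struct vm_area_struct *vma;
	int ret = -EINVAL;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && addr >= vma->vm_start)
		ret = follow_pfn(vma, addr, pfn);
	up_read(&mm->mmap_sem);
	return ret;
}
```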
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c083cf5fd6df..e4412a676c88 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -422,7 +422,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	zone->present_pages += onlined_pages;
 	zone->zone_pgdat->node_present_pages += onlined_pages;
 
-	setup_per_zone_pages_min();
+	setup_per_zone_wmarks();
+	calculate_zone_inactive_ratio(zone);
 	if (onlined_pages) {
 		kswapd_run(zone_to_nid(zone));
 		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -832,6 +833,9 @@ repeat:
 	totalram_pages -= offlined_pages;
 	num_physpages -= offlined_pages;
 
+	setup_per_zone_wmarks();
+	calculate_zone_inactive_ratio(zone);
+
 	vm_total_pages = nr_free_pagecache_pages();
 	writeback_set_ratelimit();
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc043..e08e2c4da63a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | |||
| 182 | return 0; | 182 | return 0; |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | /* Create a new policy */ | 185 | /* |
| 186 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if | ||
| 187 | * any, for the new policy. mpol_new() has already validated the nodes | ||
| 188 | * parameter with respect to the policy mode and flags. But, we need to | ||
| 189 | * handle an empty nodemask with MPOL_PREFERRED here. | ||
| 190 | * | ||
| 191 | * Must be called holding task's alloc_lock to protect task's mems_allowed | ||
| 192 | * and mempolicy. May also be called holding the mmap_semaphore for write. | ||
| 193 | */ | ||
| 194 | static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) | ||
| 195 | { | ||
| 196 | nodemask_t cpuset_context_nmask; | ||
| 197 | int ret; | ||
| 198 | |||
| 199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | ||
| 200 | if (pol == NULL) | ||
| 201 | return 0; | ||
| 202 | |||
| 203 | VM_BUG_ON(!nodes); | ||
| 204 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | ||
| 205 | nodes = NULL; /* explicit local allocation */ | ||
| 206 | else { | ||
| 207 | if (pol->flags & MPOL_F_RELATIVE_NODES) | ||
| 208 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
| 209 | &cpuset_current_mems_allowed); | ||
| 210 | else | ||
| 211 | nodes_and(cpuset_context_nmask, *nodes, | ||
| 212 | cpuset_current_mems_allowed); | ||
| 213 | if (mpol_store_user_nodemask(pol)) | ||
| 214 | pol->w.user_nodemask = *nodes; | ||
| 215 | else | ||
| 216 | pol->w.cpuset_mems_allowed = | ||
| 217 | cpuset_current_mems_allowed; | ||
| 218 | } | ||
| 219 | |||
| 220 | ret = mpol_ops[pol->mode].create(pol, | ||
| 221 | nodes ? &cpuset_context_nmask : NULL); | ||
| 222 | return ret; | ||
| 223 | } | ||
| 224 | |||
| 225 | /* | ||
| 226 | * This function just creates a new policy, does some check and simple | ||
| 227 | * initialization. You must invoke mpol_set_nodemask() to set nodes. | ||
| 228 | */ | ||
| 186 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | 229 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
| 187 | nodemask_t *nodes) | 230 | nodemask_t *nodes) |
| 188 | { | 231 | { |
| 189 | struct mempolicy *policy; | 232 | struct mempolicy *policy; |
| 190 | nodemask_t cpuset_context_nmask; | ||
| 191 | int ret; | ||
| 192 | 233 | ||
| 193 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 234 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
| 194 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 235 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); |
| @@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
| 210 | if (((flags & MPOL_F_STATIC_NODES) || | 251 | if (((flags & MPOL_F_STATIC_NODES) || |
| 211 | (flags & MPOL_F_RELATIVE_NODES))) | 252 | (flags & MPOL_F_RELATIVE_NODES))) |
| 212 | return ERR_PTR(-EINVAL); | 253 | return ERR_PTR(-EINVAL); |
| 213 | nodes = NULL; /* flag local alloc */ | ||
| 214 | } | 254 | } |
| 215 | } else if (nodes_empty(*nodes)) | 255 | } else if (nodes_empty(*nodes)) |
| 216 | return ERR_PTR(-EINVAL); | 256 | return ERR_PTR(-EINVAL); |
| @@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
| 221 | policy->mode = mode; | 261 | policy->mode = mode; |
| 222 | policy->flags = flags; | 262 | policy->flags = flags; |
| 223 | 263 | ||
| 224 | if (nodes) { | ||
| 225 | /* | ||
| 226 | * cpuset related setup doesn't apply to local allocation | ||
| 227 | */ | ||
| 228 | cpuset_update_task_memory_state(); | ||
| 229 | if (flags & MPOL_F_RELATIVE_NODES) | ||
| 230 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
| 231 | &cpuset_current_mems_allowed); | ||
| 232 | else | ||
| 233 | nodes_and(cpuset_context_nmask, *nodes, | ||
| 234 | cpuset_current_mems_allowed); | ||
| 235 | if (mpol_store_user_nodemask(policy)) | ||
| 236 | policy->w.user_nodemask = *nodes; | ||
| 237 | else | ||
| 238 | policy->w.cpuset_mems_allowed = | ||
| 239 | cpuset_mems_allowed(current); | ||
| 240 | } | ||
| 241 | |||
| 242 | ret = mpol_ops[mode].create(policy, | ||
| 243 | nodes ? &cpuset_context_nmask : NULL); | ||
| 244 | if (ret < 0) { | ||
| 245 | kmem_cache_free(policy_cache, policy); | ||
| 246 | return ERR_PTR(ret); | ||
| 247 | } | ||
| 248 | return policy; | 264 | return policy; |
| 249 | } | 265 | } |
| 250 | 266 | ||
| @@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, | |||
| 324 | /* | 340 | /* |
| 325 | * Wrapper for mpol_rebind_policy() that just requires task | 341 | * Wrapper for mpol_rebind_policy() that just requires task |
| 326 | * pointer, and updates task mempolicy. | 342 | * pointer, and updates task mempolicy. |
| 343 | * | ||
| 344 | * Called with task's alloc_lock held. | ||
| 327 | */ | 345 | */ |
| 328 | 346 | ||
| 329 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 347 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) |
| @@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void) | |||
| 600 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 618 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
| 601 | nodemask_t *nodes) | 619 | nodemask_t *nodes) |
| 602 | { | 620 | { |
| 603 | struct mempolicy *new; | 621 | struct mempolicy *new, *old; |
| 604 | struct mm_struct *mm = current->mm; | 622 | struct mm_struct *mm = current->mm; |
| 623 | int ret; | ||
| 605 | 624 | ||
| 606 | new = mpol_new(mode, flags, nodes); | 625 | new = mpol_new(mode, flags, nodes); |
| 607 | if (IS_ERR(new)) | 626 | if (IS_ERR(new)) |
| @@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
| 615 | */ | 634 | */ |
| 616 | if (mm) | 635 | if (mm) |
| 617 | down_write(&mm->mmap_sem); | 636 | down_write(&mm->mmap_sem); |
| 618 | mpol_put(current->mempolicy); | 637 | task_lock(current); |
| 638 | ret = mpol_set_nodemask(new, nodes); | ||
| 639 | if (ret) { | ||
| 640 | task_unlock(current); | ||
| 641 | if (mm) | ||
| 642 | up_write(&mm->mmap_sem); | ||
| 643 | mpol_put(new); | ||
| 644 | return ret; | ||
| 645 | } | ||
| 646 | old = current->mempolicy; | ||
| 619 | current->mempolicy = new; | 647 | current->mempolicy = new; |
| 620 | mpol_set_task_struct_flag(); | 648 | mpol_set_task_struct_flag(); |
| 621 | if (new && new->mode == MPOL_INTERLEAVE && | 649 | if (new && new->mode == MPOL_INTERLEAVE && |
| 622 | nodes_weight(new->v.nodes)) | 650 | nodes_weight(new->v.nodes)) |
| 623 | current->il_next = first_node(new->v.nodes); | 651 | current->il_next = first_node(new->v.nodes); |
| 652 | task_unlock(current); | ||
| 624 | if (mm) | 653 | if (mm) |
| 625 | up_write(&mm->mmap_sem); | 654 | up_write(&mm->mmap_sem); |
| 626 | 655 | ||
| 656 | mpol_put(old); | ||
| 627 | return 0; | 657 | return 0; |
| 628 | } | 658 | } |
| 629 | 659 | ||
| 630 | /* | 660 | /* |
| 631 | * Return nodemask for policy for get_mempolicy() query | 661 | * Return nodemask for policy for get_mempolicy() query |
| 662 | * | ||
| 663 | * Called with task's alloc_lock held | ||
| 632 | */ | 664 | */ |
| 633 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | 665 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) |
| 634 | { | 666 | { |
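
The do_set_mempolicy() rework above is the kernel side of set_mempolicy(2): the user nodemask is now contextualized by mpol_set_nodemask() under task_lock() instead of inside mpol_new(), and the old policy is dropped only after the switch. For reference, a minimal userspace sketch of the call that reaches this path, using the <numaif.h> wrapper from libnuma; the node numbers are assumptions for illustration, not part of this patch.

    /* build with -lnuma; assumes nodes 0 and 1 exist and are in the cpuset */
    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long nodemask = (1UL << 0) | (1UL << 1);

            /* interleave this task's future allocations over nodes 0 and 1;
             * this is the path that now seeds current->il_next under task_lock() */
            if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
                    perror("set_mempolicy");
                    return 1;
            }
            return 0;
    }
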
| @@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 674 | struct vm_area_struct *vma = NULL; | 706 | struct vm_area_struct *vma = NULL; |
| 675 | struct mempolicy *pol = current->mempolicy; | 707 | struct mempolicy *pol = current->mempolicy; |
| 676 | 708 | ||
| 677 | cpuset_update_task_memory_state(); | ||
| 678 | if (flags & | 709 | if (flags & |
| 679 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 710 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
| 680 | return -EINVAL; | 711 | return -EINVAL; |
| @@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 683 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | 714 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) |
| 684 | return -EINVAL; | 715 | return -EINVAL; |
| 685 | *policy = 0; /* just so it's initialized */ | 716 | *policy = 0; /* just so it's initialized */ |
| 717 | task_lock(current); | ||
| 686 | *nmask = cpuset_current_mems_allowed; | 718 | *nmask = cpuset_current_mems_allowed; |
| 719 | task_unlock(current); | ||
| 687 | return 0; | 720 | return 0; |
| 688 | } | 721 | } |
| 689 | 722 | ||
| @@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 738 | } | 771 | } |
| 739 | 772 | ||
| 740 | err = 0; | 773 | err = 0; |
| 741 | if (nmask) | 774 | if (nmask) { |
| 775 | task_lock(current); | ||
| 742 | get_policy_nodemask(pol, nmask); | 776 | get_policy_nodemask(pol, nmask); |
| 777 | task_unlock(current); | ||
| 778 | } | ||
| 743 | 779 | ||
| 744 | out: | 780 | out: |
| 745 | mpol_cond_put(pol); | 781 | mpol_cond_put(pol); |
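
The task_lock()/task_unlock() pairs added here only change how do_get_mempolicy() snapshots the mask; the get_mempolicy(2) contract is unchanged. A hedged sketch of the MPOL_F_MEMS_ALLOWED query handled in the hunk above, again via <numaif.h>, with a single-word nodemask assumed:

    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
            int mode = 0;
            unsigned long mask = 0;

            /* ask which nodes this task may allocate from; with
             * MPOL_F_MEMS_ALLOWED the kernel copies cpuset_current_mems_allowed
             * under task_lock(), as added above */
            if (get_mempolicy(&mode, &mask, 8 * sizeof(mask), NULL,
                              MPOL_F_MEMS_ALLOWED)) {
                    perror("get_mempolicy");
                    return 1;
            }
            printf("mems_allowed: 0x%lx\n", mask);
            return 0;
    }
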
| @@ -767,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
| 767 | 803 | ||
| 768 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 804 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
| 769 | { | 805 | { |
| 770 | return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); | 806 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); |
| 771 | } | 807 | } |
| 772 | 808 | ||
| 773 | /* | 809 | /* |
| @@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 979 | return err; | 1015 | return err; |
| 980 | } | 1016 | } |
| 981 | down_write(&mm->mmap_sem); | 1017 | down_write(&mm->mmap_sem); |
| 1018 | task_lock(current); | ||
| 1019 | err = mpol_set_nodemask(new, nmask); | ||
| 1020 | task_unlock(current); | ||
| 1021 | if (err) { | ||
| 1022 | up_write(&mm->mmap_sem); | ||
| 1023 | mpol_put(new); | ||
| 1024 | return err; | ||
| 1025 | } | ||
| 982 | vma = check_range(mm, start, end, nmask, | 1026 | vma = check_range(mm, start, end, nmask, |
| 983 | flags | MPOL_MF_INVERT, &pagelist); | 1027 | flags | MPOL_MF_INVERT, &pagelist); |
| 984 | 1028 | ||
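
do_mbind() now contextualizes the nodemask with mpol_set_nodemask() under task_lock() before check_range() walks the VMAs, and unwinds the mmap_sem and the new policy if that fails. The userspace entry point is mbind(2); a short sketch, binding an anonymous mapping to an assumed node 0:

    #include <numaif.h>
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
            size_t len = 4 * 1024 * 1024;
            unsigned long nodemask = 1UL << 0;      /* assumed node 0 */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            /* MPOL_MF_MOVE asks the kernel to migrate already-faulted pages,
             * which is the migrate_pages() path driven from this file */
            if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
                      MPOL_MF_MOVE)) {
                    perror("mbind");
                    return 1;
            }
            return 0;
    }
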
| @@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1545 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1589 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
| 1546 | struct zonelist *zl; | 1590 | struct zonelist *zl; |
| 1547 | 1591 | ||
| 1548 | cpuset_update_task_memory_state(); | ||
| 1549 | |||
| 1550 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1592 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
| 1551 | unsigned nid; | 1593 | unsigned nid; |
| 1552 | 1594 | ||
| @@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
| 1593 | { | 1635 | { |
| 1594 | struct mempolicy *pol = current->mempolicy; | 1636 | struct mempolicy *pol = current->mempolicy; |
| 1595 | 1637 | ||
| 1596 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | ||
| 1597 | cpuset_update_task_memory_state(); | ||
| 1598 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1638 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
| 1599 | pol = &default_policy; | 1639 | pol = &default_policy; |
| 1600 | 1640 | ||
| @@ -1854,6 +1894,8 @@ restart: | |||
| 1854 | */ | 1894 | */ |
| 1855 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 1895 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
| 1856 | { | 1896 | { |
| 1897 | int ret; | ||
| 1898 | |||
| 1857 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 1899 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
| 1858 | spin_lock_init(&sp->lock); | 1900 | spin_lock_init(&sp->lock); |
| 1859 | 1901 | ||
| @@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
| 1863 | 1905 | ||
| 1864 | /* contextualize the tmpfs mount point mempolicy */ | 1906 | /* contextualize the tmpfs mount point mempolicy */ |
| 1865 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 1907 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
| 1866 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1908 | if (IS_ERR(new)) { |
| 1867 | if (IS_ERR(new)) | 1909 | mpol_put(mpol); /* drop our ref on sb mpol */ |
| 1868 | return; /* no valid nodemask intersection */ | 1910 | return; /* no valid nodemask intersection */ |
| 1911 | } | ||
| 1912 | |||
| 1913 | task_lock(current); | ||
| 1914 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); | ||
| 1915 | task_unlock(current); | ||
| 1916 | mpol_put(mpol); /* drop our ref on sb mpol */ | ||
| 1917 | if (ret) { | ||
| 1918 | mpol_put(new); | ||
| 1919 | return; | ||
| 1920 | } | ||
| 1869 | 1921 | ||
| 1870 | /* Create pseudo-vma that contains just the policy */ | 1922 | /* Create pseudo-vma that contains just the policy */ |
| 1871 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1923 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
| @@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2086 | new = mpol_new(mode, mode_flags, &nodes); | 2138 | new = mpol_new(mode, mode_flags, &nodes); |
| 2087 | if (IS_ERR(new)) | 2139 | if (IS_ERR(new)) |
| 2088 | err = 1; | 2140 | err = 1; |
| 2089 | else if (no_context) | 2141 | else { |
| 2090 | new->w.user_nodemask = nodes; /* save for contextualization */ | 2142 | int ret; |
| 2143 | |||
| 2144 | task_lock(current); | ||
| 2145 | ret = mpol_set_nodemask(new, &nodes); | ||
| 2146 | task_unlock(current); | ||
| 2147 | if (ret) | ||
| 2148 | err = 1; | ||
| 2149 | else if (no_context) { | ||
| 2150 | /* save for contextualization */ | ||
| 2151 | new->w.user_nodemask = nodes; | ||
| 2152 | } | ||
| 2153 | } | ||
| 2091 | 2154 | ||
| 2092 | out: | 2155 | out: |
| 2093 | /* Restore string for error message */ | 2156 | /* Restore string for error message */ |
diff --git a/mm/migrate.c b/mm/migrate.c
index 068655d8f883..939888f9ddab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
| @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
| 802 | 802 | ||
| 803 | *result = &pm->status; | 803 | *result = &pm->status; |
| 804 | 804 | ||
| 805 | return alloc_pages_node(pm->node, | 805 | return alloc_pages_exact_node(pm->node, |
| 806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); |
| 807 | } | 807 | } |
| 808 | 808 | ||
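
The alloc_pages_node() to alloc_pages_exact_node() switch here (and in new_node_page() in mempolicy.c above) exists because the migration target node is always explicit, so the nid < 0 fallback is dead weight. Roughly, the two helpers differ only in that check; the sketch below is a paraphrase of the gfp.h pair as introduced by this series, not a quote of it:

    /* paraphrased sketch of the two allocator helpers (see include/linux/gfp.h) */
    static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
    {
            if (nid < 0)                    /* "no node" falls back to local */
                    nid = numa_node_id();
            return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
    }

    static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
                                                      unsigned int order)
    {
            VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);  /* caller vouches for nid */
            return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
    }
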
| @@ -820,7 +820,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 820 | struct page_to_node *pp; | 820 | struct page_to_node *pp; |
| 821 | LIST_HEAD(pagelist); | 821 | LIST_HEAD(pagelist); |
| 822 | 822 | ||
| 823 | migrate_prep(); | ||
| 824 | down_read(&mm->mmap_sem); | 823 | down_read(&mm->mmap_sem); |
| 825 | 824 | ||
| 826 | /* | 825 | /* |
| @@ -907,6 +906,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
| 907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 906 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
| 908 | if (!pm) | 907 | if (!pm) |
| 909 | goto out; | 908 | goto out; |
| 909 | |||
| 910 | migrate_prep(); | ||
| 911 | |||
| 910 | /* | 912 | /* |
| 911 | * Store a chunk of page_to_node array in a page, | 913 | * Store a chunk of page_to_node array in a page, |
| 912 | * but keep the last one as a marker | 914 | * but keep the last one as a marker |
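
Deferring migrate_prep() until the page_to_node buffer has been allocated avoids draining the LRU pagevecs on the early-error path. The syscall driving do_pages_move() is move_pages(2); a minimal sketch of a caller migrating one of its own pages to an assumed node 0:

    #include <numaif.h>
    #include <stdlib.h>
    #include <stdio.h>

    int main(void)
    {
            void *pages[1];
            int nodes[1] = { 0 };           /* assumed target node 0 */
            int status[1] = { -1 };

            pages[0] = malloc(4096);
            if (!pages[0])
                    return 1;
            *(volatile char *)pages[0] = 1; /* fault the page in first */

            /* pid 0 means the calling process */
            if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE)) {
                    perror("move_pages");
                    return 1;
            }
            printf("page now on node %d (negative values are -errno)\n", status[0]);
            return 0;
    }
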
diff --git a/mm/mlock.c b/mm/mlock.c
index ac130433c7d3..45eb650b9654 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
| @@ -31,7 +31,6 @@ int can_do_mlock(void) | |||
| 31 | } | 31 | } |
| 32 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
| 33 | 33 | ||
| 34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 35 | /* | 34 | /* |
| 36 | * Mlocked pages are marked with PageMlocked() flag for efficient testing | 35 | * Mlocked pages are marked with PageMlocked() flag for efficient testing |
| 37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | 36 | * in vmscan and, possibly, the fault path; and to support semi-accurate |
| @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) | |||
| 261 | return retval; | 260 | return retval; |
| 262 | } | 261 | } |
| 263 | 262 | ||
| 264 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
| 268 | */ | ||
| 269 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 270 | unsigned long start, unsigned long end, | ||
| 271 | int mlock) | ||
| 272 | { | ||
| 273 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
| 274 | return make_pages_present(start, end); | ||
| 275 | return 0; | ||
| 276 | } | ||
| 277 | |||
| 278 | static inline int __mlock_posix_error_return(long retval) | ||
| 279 | { | ||
| 280 | return 0; | ||
| 281 | } | ||
| 282 | |||
| 283 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
| 284 | |||
| 285 | /** | 263 | /** |
| 286 | * mlock_vma_pages_range() - mlock pages in specified vma range. | 264 | * mlock_vma_pages_range() - mlock pages in specified vma range. |
| 287 | * @vma - the vma containing the specified address range | 265 | * @vma - the vma containing the specified address range |
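
With CONFIG_UNEVICTABLE_LRU gone, the PageMlocked-aware __mlock_vma_pages_range() is the only implementation left; the make_pages_present() fallback above is simply removed. The mlock(2) interface itself is untouched; for completeness, the call that ends up in mlock_vma_pages_range():

    #include <sys/mman.h>
    #include <stdlib.h>
    #include <stdio.h>

    int main(void)
    {
            size_t len = 64 * 1024;
            void *buf = malloc(len);

            if (!buf)
                    return 1;
            /* pages become PageMlocked/unevictable; needs RLIMIT_MEMLOCK headroom */
            if (mlock(buf, len)) {
                    perror("mlock");
                    return 1;
            }
            /* ... use buf without risk of it being paged out ... */
            munlock(buf, len);
            free(buf);
            return 0;
    }
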
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..175a67a78a99 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
| @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
| 59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
| 60 | struct task_struct *child; | 60 | struct task_struct *child; |
| 61 | int oom_adj; | ||
| 61 | 62 | ||
| 62 | task_lock(p); | 63 | task_lock(p); |
| 63 | mm = p->mm; | 64 | mm = p->mm; |
| @@ -65,6 +66,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 65 | task_unlock(p); | 66 | task_unlock(p); |
| 66 | return 0; | 67 | return 0; |
| 67 | } | 68 | } |
| 69 | oom_adj = mm->oom_adj; | ||
| 70 | if (oom_adj == OOM_DISABLE) { | ||
| 71 | task_unlock(p); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 68 | 74 | ||
| 69 | /* | 75 | /* |
| 70 | * The memory size of the process is the basis for the badness. | 76 | * The memory size of the process is the basis for the badness. |
| @@ -148,15 +154,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 148 | points /= 8; | 154 | points /= 8; |
| 149 | 155 | ||
| 150 | /* | 156 | /* |
| 151 | * Adjust the score by oomkilladj. | 157 | * Adjust the score by oom_adj. |
| 152 | */ | 158 | */ |
| 153 | if (p->oomkilladj) { | 159 | if (oom_adj) { |
| 154 | if (p->oomkilladj > 0) { | 160 | if (oom_adj > 0) { |
| 155 | if (!points) | 161 | if (!points) |
| 156 | points = 1; | 162 | points = 1; |
| 157 | points <<= p->oomkilladj; | 163 | points <<= oom_adj; |
| 158 | } else | 164 | } else |
| 159 | points >>= -(p->oomkilladj); | 165 | points >>= -(oom_adj); |
| 160 | } | 166 | } |
| 161 | 167 | ||
| 162 | #ifdef DEBUG | 168 | #ifdef DEBUG |
| @@ -251,11 +257,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
| 251 | *ppoints = ULONG_MAX; | 257 | *ppoints = ULONG_MAX; |
| 252 | } | 258 | } |
| 253 | 259 | ||
| 254 | if (p->oomkilladj == OOM_DISABLE) | ||
| 255 | continue; | ||
| 256 | |||
| 257 | points = badness(p, uptime.tv_sec); | 260 | points = badness(p, uptime.tv_sec); |
| 258 | if (points > *ppoints || !chosen) { | 261 | if (points > *ppoints) { |
| 259 | chosen = p; | 262 | chosen = p; |
| 260 | *ppoints = points; | 263 | *ppoints = points; |
| 261 | } | 264 | } |
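
Since badness() now returns 0 for an mm marked OOM_DISABLE, select_bad_process() can drop its own oomkilladj check. The userspace knob is still /proc/<pid>/oom_adj; the value is just read from mm->oom_adj instead of the task, so every thread sharing the mm sees the same adjustment. A short sketch of a process opting itself out of the OOM killer (OOM_DISABLE is -17; lowering the value needs CAP_SYS_RESOURCE):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/self/oom_adj", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            /* -17 == OOM_DISABLE: with this patch the exemption is per-mm,
             * so it covers all threads sharing the address space */
            fprintf(f, "%d\n", -17);
            fclose(f);
            return 0;
    }
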
| @@ -304,8 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
| 304 | } | 307 | } |
| 305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 308 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
| 306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 309 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
| 307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, | 310 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); |
| 308 | p->comm); | ||
| 309 | task_unlock(p); | 311 | task_unlock(p); |
| 310 | } while_each_thread(g, p); | 312 | } while_each_thread(g, p); |
| 311 | } | 313 | } |
| @@ -323,11 +325,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
| 323 | return; | 325 | return; |
| 324 | } | 326 | } |
| 325 | 327 | ||
| 326 | if (!p->mm) { | 328 | if (!p->mm) |
| 327 | WARN_ON(1); | ||
| 328 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
| 329 | return; | 329 | return; |
| 330 | } | ||
| 331 | 330 | ||
| 332 | if (verbose) | 331 | if (verbose) |
| 333 | printk(KERN_ERR "Killed process %d (%s)\n", | 332 | printk(KERN_ERR "Killed process %d (%s)\n", |
| @@ -349,28 +348,13 @@ static int oom_kill_task(struct task_struct *p) | |||
| 349 | struct mm_struct *mm; | 348 | struct mm_struct *mm; |
| 350 | struct task_struct *g, *q; | 349 | struct task_struct *g, *q; |
| 351 | 350 | ||
| 351 | task_lock(p); | ||
| 352 | mm = p->mm; | 352 | mm = p->mm; |
| 353 | 353 | if (!mm || mm->oom_adj == OOM_DISABLE) { | |
| 354 | /* WARNING: mm may not be dereferenced since we did not obtain its | 354 | task_unlock(p); |
| 355 | * value from get_task_mm(p). This is OK since all we need to do is | ||
| 356 | * compare mm to q->mm below. | ||
| 357 | * | ||
| 358 | * Furthermore, even if mm contains a non-NULL value, p->mm may | ||
| 359 | * change to NULL at any time since we do not hold task_lock(p). | ||
| 360 | * However, this is of no concern to us. | ||
| 361 | */ | ||
| 362 | |||
| 363 | if (mm == NULL) | ||
| 364 | return 1; | 355 | return 1; |
| 365 | 356 | } | |
| 366 | /* | 357 | task_unlock(p); |
| 367 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
| 368 | */ | ||
| 369 | do_each_thread(g, q) { | ||
| 370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
| 371 | return 1; | ||
| 372 | } while_each_thread(g, q); | ||
| 373 | |||
| 374 | __oom_kill_task(p, 1); | 358 | __oom_kill_task(p, 1); |
| 375 | 359 | ||
| 376 | /* | 360 | /* |
| @@ -393,10 +377,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 393 | struct task_struct *c; | 377 | struct task_struct *c; |
| 394 | 378 | ||
| 395 | if (printk_ratelimit()) { | 379 | if (printk_ratelimit()) { |
| 396 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
| 397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
| 398 | current->comm, gfp_mask, order, current->oomkilladj); | ||
| 399 | task_lock(current); | 380 | task_lock(current); |
| 381 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
| 382 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
| 383 | current->comm, gfp_mask, order, | ||
| 384 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | ||
| 400 | cpuset_print_task_mems_allowed(current); | 385 | cpuset_print_task_mems_allowed(current); |
| 401 | task_unlock(current); | 386 | task_unlock(current); |
| 402 | dump_stack(); | 387 | dump_stack(); |
| @@ -409,8 +394,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 409 | /* | 394 | /* |
| 410 | * If the task is already exiting, don't alarm the sysadmin or kill | 395 | * If the task is already exiting, don't alarm the sysadmin or kill |
| 411 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 396 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
| 397 | * if its mm is still attached. | ||
| 412 | */ | 398 | */ |
| 413 | if (p->flags & PF_EXITING) { | 399 | if (p->mm && (p->flags & PF_EXITING)) { |
| 414 | __oom_kill_task(p, 0); | 400 | __oom_kill_task(p, 0); |
| 415 | return 0; | 401 | return 0; |
| 416 | } | 402 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bb553c3e955d..7b0dcea4935b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
| @@ -265,18 +265,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
| 265 | * This avoids exceeding the total dirty_limit when the floating averages | 265 | * This avoids exceeding the total dirty_limit when the floating averages |
| 266 | * fluctuate too quickly. | 266 | * fluctuate too quickly. |
| 267 | */ | 267 | */ |
| 268 | static void | 268 | static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, |
| 269 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | 269 | unsigned long dirty, unsigned long *pbdi_dirty) |
| 270 | { | 270 | { |
| 271 | long avail_dirty; | 271 | unsigned long avail_dirty; |
| 272 | 272 | ||
| 273 | avail_dirty = dirty - | 273 | avail_dirty = global_page_state(NR_FILE_DIRTY) + |
| 274 | (global_page_state(NR_FILE_DIRTY) + | ||
| 275 | global_page_state(NR_WRITEBACK) + | 274 | global_page_state(NR_WRITEBACK) + |
| 276 | global_page_state(NR_UNSTABLE_NFS) + | 275 | global_page_state(NR_UNSTABLE_NFS) + |
| 277 | global_page_state(NR_WRITEBACK_TEMP)); | 276 | global_page_state(NR_WRITEBACK_TEMP); |
| 278 | 277 | ||
| 279 | if (avail_dirty < 0) | 278 | if (avail_dirty < dirty) |
| 279 | avail_dirty = dirty - avail_dirty; | ||
| 280 | else | ||
| 280 | avail_dirty = 0; | 281 | avail_dirty = 0; |
| 281 | 282 | ||
| 282 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | 283 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + |
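
The reshuffle in clip_bdi_dirty_limit() is not cosmetic: the counters are now unsigned long, so the old "subtract, then test for < 0" pattern could never fire once the subtraction wrapped. Comparing before subtracting gives the intended clamp. A standalone illustration of the pattern (plain C, not kernel code):

    #include <stdio.h>

    /* clamp-to-zero subtraction that stays correct with unsigned arithmetic */
    static unsigned long avail(unsigned long dirty, unsigned long in_flight)
    {
            return in_flight < dirty ? dirty - in_flight : 0;
    }

    int main(void)
    {
            /* with signed math, 100 - 150 = -50 and the old "< 0" check fired;
             * with unsigned math it would wrap to a huge value instead */
            printf("%lu\n", avail(100, 150));       /* prints 0 */
            printf("%lu\n", avail(200, 150));       /* prints 50 */
            return 0;
    }
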
| @@ -299,10 +300,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
| 299 | * | 300 | * |
| 300 | * dirty -= (dirty/8) * p_{t} | 301 | * dirty -= (dirty/8) * p_{t} |
| 301 | */ | 302 | */ |
| 302 | static void task_dirty_limit(struct task_struct *tsk, long *pdirty) | 303 | static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) |
| 303 | { | 304 | { |
| 304 | long numerator, denominator; | 305 | long numerator, denominator; |
| 305 | long dirty = *pdirty; | 306 | unsigned long dirty = *pdirty; |
| 306 | u64 inv = dirty >> 3; | 307 | u64 inv = dirty >> 3; |
| 307 | 308 | ||
| 308 | task_dirties_fraction(tsk, &numerator, &denominator); | 309 | task_dirties_fraction(tsk, &numerator, &denominator); |
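
task_dirty_limit() keeps the same formula, dirty -= (dirty/8) * p_t, only with unsigned types. A small worked model: a task responsible for half of the recent dirtying against an 800-page limit is clipped to 800 - (800/8) * 1/2 = 750 pages, which the sketch below checks.

    #include <stdio.h>

    /* reduced model of task_dirty_limit(): penalize a task in proportion
     * to its recent share of dirtying (numerator/denominator) */
    static unsigned long task_dirty_limit_model(unsigned long dirty,
                                                long numerator, long denominator)
    {
            unsigned long long inv = dirty >> 3;    /* dirty/8 */

            inv *= numerator;
            inv /= denominator;
            return dirty - inv;
    }

    int main(void)
    {
            printf("%lu\n", task_dirty_limit_model(800, 1, 2));  /* prints 750 */
            return 0;
    }
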
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..a5f3c278c573 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
| 24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
| 25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
| 26 | #include <linux/kmemcheck.h> | ||
| 26 | #include <linux/module.h> | 27 | #include <linux/module.h> |
| 27 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
| 28 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
| @@ -161,17 +162,25 @@ static unsigned long __meminitdata dma_reserve; | |||
| 161 | 162 | ||
| 162 | #if MAX_NUMNODES > 1 | 163 | #if MAX_NUMNODES > 1 |
| 163 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 164 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
| 165 | int nr_online_nodes __read_mostly = 1; | ||
| 164 | EXPORT_SYMBOL(nr_node_ids); | 166 | EXPORT_SYMBOL(nr_node_ids); |
| 167 | EXPORT_SYMBOL(nr_online_nodes); | ||
| 165 | #endif | 168 | #endif |
| 166 | 169 | ||
| 167 | int page_group_by_mobility_disabled __read_mostly; | 170 | int page_group_by_mobility_disabled __read_mostly; |
| 168 | 171 | ||
| 169 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 172 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
| 170 | { | 173 | { |
| 174 | |||
| 175 | if (unlikely(page_group_by_mobility_disabled)) | ||
| 176 | migratetype = MIGRATE_UNMOVABLE; | ||
| 177 | |||
| 171 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 178 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
| 172 | PB_migrate, PB_migrate_end); | 179 | PB_migrate, PB_migrate_end); |
| 173 | } | 180 | } |
| 174 | 181 | ||
| 182 | bool oom_killer_disabled __read_mostly; | ||
| 183 | |||
| 175 | #ifdef CONFIG_DEBUG_VM | 184 | #ifdef CONFIG_DEBUG_VM |
| 176 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 185 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 177 | { | 186 | { |
| @@ -294,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 294 | } | 303 | } |
| 295 | } | 304 | } |
| 296 | 305 | ||
| 297 | #ifdef CONFIG_HUGETLBFS | ||
| 298 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
| 299 | { | ||
| 300 | int i; | ||
| 301 | int nr_pages = 1 << order; | ||
| 302 | struct page *p = page + 1; | ||
| 303 | |||
| 304 | set_compound_page_dtor(page, free_compound_page); | ||
| 305 | set_compound_order(page, order); | ||
| 306 | __SetPageHead(page); | ||
| 307 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
| 308 | __SetPageTail(p); | ||
| 309 | p->first_page = page; | ||
| 310 | } | ||
| 311 | } | ||
| 312 | #endif | ||
| 313 | |||
| 314 | static int destroy_compound_page(struct page *page, unsigned long order) | 306 | static int destroy_compound_page(struct page *page, unsigned long order) |
| 315 | { | 307 | { |
| 316 | int i; | 308 | int i; |
| @@ -417,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 417 | return 0; | 409 | return 0; |
| 418 | 410 | ||
| 419 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 411 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
| 420 | BUG_ON(page_count(buddy) != 0); | 412 | VM_BUG_ON(page_count(buddy) != 0); |
| 421 | return 1; | 413 | return 1; |
| 422 | } | 414 | } |
| 423 | return 0; | 415 | return 0; |
| @@ -448,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 448 | */ | 440 | */ |
| 449 | 441 | ||
| 450 | static inline void __free_one_page(struct page *page, | 442 | static inline void __free_one_page(struct page *page, |
| 451 | struct zone *zone, unsigned int order) | 443 | struct zone *zone, unsigned int order, |
| 444 | int migratetype) | ||
| 452 | { | 445 | { |
| 453 | unsigned long page_idx; | 446 | unsigned long page_idx; |
| 454 | int order_size = 1 << order; | ||
| 455 | int migratetype = get_pageblock_migratetype(page); | ||
| 456 | 447 | ||
| 457 | if (unlikely(PageCompound(page))) | 448 | if (unlikely(PageCompound(page))) |
| 458 | if (unlikely(destroy_compound_page(page, order))) | 449 | if (unlikely(destroy_compound_page(page, order))) |
| 459 | return; | 450 | return; |
| 460 | 451 | ||
| 452 | VM_BUG_ON(migratetype == -1); | ||
| 453 | |||
| 461 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 454 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
| 462 | 455 | ||
| 463 | VM_BUG_ON(page_idx & (order_size - 1)); | 456 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
| 464 | VM_BUG_ON(bad_range(zone, page)); | 457 | VM_BUG_ON(bad_range(zone, page)); |
| 465 | 458 | ||
| 466 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | ||
| 467 | while (order < MAX_ORDER-1) { | 459 | while (order < MAX_ORDER-1) { |
| 468 | unsigned long combined_idx; | 460 | unsigned long combined_idx; |
| 469 | struct page *buddy; | 461 | struct page *buddy; |
| @@ -487,12 +479,27 @@ static inline void __free_one_page(struct page *page, | |||
| 487 | zone->free_area[order].nr_free++; | 479 | zone->free_area[order].nr_free++; |
| 488 | } | 480 | } |
| 489 | 481 | ||
| 482 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 483 | /* | ||
| 484 | * free_page_mlock() -- clean up attempts to free a mlocked() page. | ||
| 485 | * Page should not be on lru, so no need to fix that up. | ||
| 486 | * free_pages_check() will verify... | ||
| 487 | */ | ||
| 488 | static inline void free_page_mlock(struct page *page) | ||
| 489 | { | ||
| 490 | __ClearPageMlocked(page); | ||
| 491 | __dec_zone_page_state(page, NR_MLOCK); | ||
| 492 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
| 493 | } | ||
| 494 | #else | ||
| 495 | static void free_page_mlock(struct page *page) { } | ||
| 496 | #endif | ||
| 497 | |||
| 490 | static inline int free_pages_check(struct page *page) | 498 | static inline int free_pages_check(struct page *page) |
| 491 | { | 499 | { |
| 492 | free_page_mlock(page); | ||
| 493 | if (unlikely(page_mapcount(page) | | 500 | if (unlikely(page_mapcount(page) | |
| 494 | (page->mapping != NULL) | | 501 | (page->mapping != NULL) | |
| 495 | (page_count(page) != 0) | | 502 | (atomic_read(&page->_count) != 0) | |
| 496 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 503 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
| 497 | bad_page(page); | 504 | bad_page(page); |
| 498 | return 1; | 505 | return 1; |
| @@ -519,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
| 519 | spin_lock(&zone->lock); | 526 | spin_lock(&zone->lock); |
| 520 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 527 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
| 521 | zone->pages_scanned = 0; | 528 | zone->pages_scanned = 0; |
| 529 | |||
| 530 | __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); | ||
| 522 | while (count--) { | 531 | while (count--) { |
| 523 | struct page *page; | 532 | struct page *page; |
| 524 | 533 | ||
| @@ -526,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
| 526 | page = list_entry(list->prev, struct page, lru); | 535 | page = list_entry(list->prev, struct page, lru); |
| 527 | /* have to delete it as __free_one_page list manipulates */ | 536 | /* have to delete it as __free_one_page list manipulates */ |
| 528 | list_del(&page->lru); | 537 | list_del(&page->lru); |
| 529 | __free_one_page(page, zone, order); | 538 | __free_one_page(page, zone, order, page_private(page)); |
| 530 | } | 539 | } |
| 531 | spin_unlock(&zone->lock); | 540 | spin_unlock(&zone->lock); |
| 532 | } | 541 | } |
| 533 | 542 | ||
| 534 | static void free_one_page(struct zone *zone, struct page *page, int order) | 543 | static void free_one_page(struct zone *zone, struct page *page, int order, |
| 544 | int migratetype) | ||
| 535 | { | 545 | { |
| 536 | spin_lock(&zone->lock); | 546 | spin_lock(&zone->lock); |
| 537 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 547 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
| 538 | zone->pages_scanned = 0; | 548 | zone->pages_scanned = 0; |
| 539 | __free_one_page(page, zone, order); | 549 | |
| 550 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
| 551 | __free_one_page(page, zone, order, migratetype); | ||
| 540 | spin_unlock(&zone->lock); | 552 | spin_unlock(&zone->lock); |
| 541 | } | 553 | } |
| 542 | 554 | ||
| @@ -545,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 545 | unsigned long flags; | 557 | unsigned long flags; |
| 546 | int i; | 558 | int i; |
| 547 | int bad = 0; | 559 | int bad = 0; |
| 560 | int clearMlocked = PageMlocked(page); | ||
| 561 | |||
| 562 | kmemcheck_free_shadow(page, order); | ||
| 548 | 563 | ||
| 549 | for (i = 0 ; i < (1 << order) ; ++i) | 564 | for (i = 0 ; i < (1 << order) ; ++i) |
| 550 | bad += free_pages_check(page + i); | 565 | bad += free_pages_check(page + i); |
| @@ -560,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 560 | kernel_map_pages(page, 1 << order, 0); | 575 | kernel_map_pages(page, 1 << order, 0); |
| 561 | 576 | ||
| 562 | local_irq_save(flags); | 577 | local_irq_save(flags); |
| 578 | if (unlikely(clearMlocked)) | ||
| 579 | free_page_mlock(page); | ||
| 563 | __count_vm_events(PGFREE, 1 << order); | 580 | __count_vm_events(PGFREE, 1 << order); |
| 564 | free_one_page(page_zone(page), page, order); | 581 | free_one_page(page_zone(page), page, order, |
| 582 | get_pageblock_migratetype(page)); | ||
| 565 | local_irq_restore(flags); | 583 | local_irq_restore(flags); |
| 566 | } | 584 | } |
| 567 | 585 | ||
| @@ -632,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 632 | { | 650 | { |
| 633 | if (unlikely(page_mapcount(page) | | 651 | if (unlikely(page_mapcount(page) | |
| 634 | (page->mapping != NULL) | | 652 | (page->mapping != NULL) | |
| 635 | (page_count(page) != 0) | | 653 | (atomic_read(&page->_count) != 0) | |
| 636 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 654 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
| 637 | bad_page(page); | 655 | bad_page(page); |
| 638 | return 1; | 656 | return 1; |
| @@ -657,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 657 | * Go through the free lists for the given migratetype and remove | 675 | * Go through the free lists for the given migratetype and remove |
| 658 | * the smallest available page from the freelists | 676 | * the smallest available page from the freelists |
| 659 | */ | 677 | */ |
| 660 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 678 | static inline |
| 679 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | ||
| 661 | int migratetype) | 680 | int migratetype) |
| 662 | { | 681 | { |
| 663 | unsigned int current_order; | 682 | unsigned int current_order; |
| @@ -675,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
| 675 | list_del(&page->lru); | 694 | list_del(&page->lru); |
| 676 | rmv_page_order(page); | 695 | rmv_page_order(page); |
| 677 | area->nr_free--; | 696 | area->nr_free--; |
| 678 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | ||
| 679 | expand(zone, page, order, current_order, area, migratetype); | 697 | expand(zone, page, order, current_order, area, migratetype); |
| 680 | return page; | 698 | return page; |
| 681 | } | 699 | } |
| @@ -766,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page, | |||
| 766 | } | 784 | } |
| 767 | 785 | ||
| 768 | /* Remove an element from the buddy allocator from the fallback list */ | 786 | /* Remove an element from the buddy allocator from the fallback list */ |
| 769 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | 787 | static inline struct page * |
| 770 | int start_migratetype) | 788 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
| 771 | { | 789 | { |
| 772 | struct free_area * area; | 790 | struct free_area * area; |
| 773 | int current_order; | 791 | int current_order; |
| @@ -815,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
| 815 | /* Remove the page from the freelists */ | 833 | /* Remove the page from the freelists */ |
| 816 | list_del(&page->lru); | 834 | list_del(&page->lru); |
| 817 | rmv_page_order(page); | 835 | rmv_page_order(page); |
| 818 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
| 819 | -(1UL << order)); | ||
| 820 | 836 | ||
| 821 | if (current_order == pageblock_order) | 837 | if (current_order == pageblock_order) |
| 822 | set_pageblock_migratetype(page, | 838 | set_pageblock_migratetype(page, |
| @@ -827,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
| 827 | } | 843 | } |
| 828 | } | 844 | } |
| 829 | 845 | ||
| 830 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | 846 | return NULL; |
| 831 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
| 832 | } | 847 | } |
| 833 | 848 | ||
| 834 | /* | 849 | /* |
| @@ -840,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
| 840 | { | 855 | { |
| 841 | struct page *page; | 856 | struct page *page; |
| 842 | 857 | ||
| 858 | retry_reserve: | ||
| 843 | page = __rmqueue_smallest(zone, order, migratetype); | 859 | page = __rmqueue_smallest(zone, order, migratetype); |
| 844 | 860 | ||
| 845 | if (unlikely(!page)) | 861 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
| 846 | page = __rmqueue_fallback(zone, order, migratetype); | 862 | page = __rmqueue_fallback(zone, order, migratetype); |
| 847 | 863 | ||
| 864 | /* | ||
| 865 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | ||
| 866 | * is used because __rmqueue_smallest is an inline function | ||
| 867 | * and we want just one call site | ||
| 868 | */ | ||
| 869 | if (!page) { | ||
| 870 | migratetype = MIGRATE_RESERVE; | ||
| 871 | goto retry_reserve; | ||
| 872 | } | ||
| 873 | } | ||
| 874 | |||
| 848 | return page; | 875 | return page; |
| 849 | } | 876 | } |
| 850 | 877 | ||
| @@ -878,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 878 | set_page_private(page, migratetype); | 905 | set_page_private(page, migratetype); |
| 879 | list = &page->lru; | 906 | list = &page->lru; |
| 880 | } | 907 | } |
| 908 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | ||
| 881 | spin_unlock(&zone->lock); | 909 | spin_unlock(&zone->lock); |
| 882 | return i; | 910 | return i; |
| 883 | } | 911 | } |
| @@ -993,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 993 | struct zone *zone = page_zone(page); | 1021 | struct zone *zone = page_zone(page); |
| 994 | struct per_cpu_pages *pcp; | 1022 | struct per_cpu_pages *pcp; |
| 995 | unsigned long flags; | 1023 | unsigned long flags; |
| 1024 | int clearMlocked = PageMlocked(page); | ||
| 1025 | |||
| 1026 | kmemcheck_free_shadow(page, 0); | ||
| 996 | 1027 | ||
| 997 | if (PageAnon(page)) | 1028 | if (PageAnon(page)) |
| 998 | page->mapping = NULL; | 1029 | page->mapping = NULL; |
| @@ -1007,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1007 | kernel_map_pages(page, 1, 0); | 1038 | kernel_map_pages(page, 1, 0); |
| 1008 | 1039 | ||
| 1009 | pcp = &zone_pcp(zone, get_cpu())->pcp; | 1040 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
| 1041 | set_page_private(page, get_pageblock_migratetype(page)); | ||
| 1010 | local_irq_save(flags); | 1042 | local_irq_save(flags); |
| 1043 | if (unlikely(clearMlocked)) | ||
| 1044 | free_page_mlock(page); | ||
| 1011 | __count_vm_event(PGFREE); | 1045 | __count_vm_event(PGFREE); |
| 1046 | |||
| 1012 | if (cold) | 1047 | if (cold) |
| 1013 | list_add_tail(&page->lru, &pcp->list); | 1048 | list_add_tail(&page->lru, &pcp->list); |
| 1014 | else | 1049 | else |
| 1015 | list_add(&page->lru, &pcp->list); | 1050 | list_add(&page->lru, &pcp->list); |
| 1016 | set_page_private(page, get_pageblock_migratetype(page)); | ||
| 1017 | pcp->count++; | 1051 | pcp->count++; |
| 1018 | if (pcp->count >= pcp->high) { | 1052 | if (pcp->count >= pcp->high) { |
| 1019 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1053 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
| @@ -1047,6 +1081,16 @@ void split_page(struct page *page, unsigned int order) | |||
| 1047 | 1081 | ||
| 1048 | VM_BUG_ON(PageCompound(page)); | 1082 | VM_BUG_ON(PageCompound(page)); |
| 1049 | VM_BUG_ON(!page_count(page)); | 1083 | VM_BUG_ON(!page_count(page)); |
| 1084 | |||
| 1085 | #ifdef CONFIG_KMEMCHECK | ||
| 1086 | /* | ||
| 1087 | * Split shadow pages too, because free(page[0]) would | ||
| 1088 | * otherwise free the whole shadow. | ||
| 1089 | */ | ||
| 1090 | if (kmemcheck_page_is_tracked(page)) | ||
| 1091 | split_page(virt_to_page(page[0].shadow), order); | ||
| 1092 | #endif | ||
| 1093 | |||
| 1050 | for (i = 1; i < (1 << order); i++) | 1094 | for (i = 1; i < (1 << order); i++) |
| 1051 | set_page_refcounted(page + i); | 1095 | set_page_refcounted(page + i); |
| 1052 | } | 1096 | } |
| @@ -1056,14 +1100,15 @@ void split_page(struct page *page, unsigned int order) | |||
| 1056 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1100 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
| 1057 | * or two. | 1101 | * or two. |
| 1058 | */ | 1102 | */ |
| 1059 | static struct page *buffered_rmqueue(struct zone *preferred_zone, | 1103 | static inline |
| 1060 | struct zone *zone, int order, gfp_t gfp_flags) | 1104 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
| 1105 | struct zone *zone, int order, gfp_t gfp_flags, | ||
| 1106 | int migratetype) | ||
| 1061 | { | 1107 | { |
| 1062 | unsigned long flags; | 1108 | unsigned long flags; |
| 1063 | struct page *page; | 1109 | struct page *page; |
| 1064 | int cold = !!(gfp_flags & __GFP_COLD); | 1110 | int cold = !!(gfp_flags & __GFP_COLD); |
| 1065 | int cpu; | 1111 | int cpu; |
| 1066 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
| 1067 | 1112 | ||
| 1068 | again: | 1113 | again: |
| 1069 | cpu = get_cpu(); | 1114 | cpu = get_cpu(); |
| @@ -1100,8 +1145,22 @@ again: | |||
| 1100 | list_del(&page->lru); | 1145 | list_del(&page->lru); |
| 1101 | pcp->count--; | 1146 | pcp->count--; |
| 1102 | } else { | 1147 | } else { |
| 1148 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | ||
| 1149 | /* | ||
| 1150 | * __GFP_NOFAIL is not to be used in new code. | ||
| 1151 | * | ||
| 1152 | * All __GFP_NOFAIL callers should be fixed so that they | ||
| 1153 | * properly detect and handle allocation failures. | ||
| 1154 | * | ||
| 1155 | * We most definitely don't want callers attempting to | ||
| 1156 | * allocate greater than single-page units with | ||
| 1157 | * __GFP_NOFAIL. | ||
| 1158 | */ | ||
| 1159 | WARN_ON_ONCE(order > 0); | ||
| 1160 | } | ||
| 1103 | spin_lock_irqsave(&zone->lock, flags); | 1161 | spin_lock_irqsave(&zone->lock, flags); |
| 1104 | page = __rmqueue(zone, order, migratetype); | 1162 | page = __rmqueue(zone, order, migratetype); |
| 1163 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
| 1105 | spin_unlock(&zone->lock); | 1164 | spin_unlock(&zone->lock); |
| 1106 | if (!page) | 1165 | if (!page) |
| 1107 | goto failed; | 1166 | goto failed; |
| @@ -1123,10 +1182,15 @@ failed: | |||
| 1123 | return NULL; | 1182 | return NULL; |
| 1124 | } | 1183 | } |
| 1125 | 1184 | ||
| 1126 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 1185 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
| 1127 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 1186 | #define ALLOC_WMARK_MIN WMARK_MIN |
| 1128 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 1187 | #define ALLOC_WMARK_LOW WMARK_LOW |
| 1129 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 1188 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
| 1189 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
| 1190 | |||
| 1191 | /* Mask to get the watermark bits */ | ||
| 1192 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
| 1193 | |||
| 1130 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1194 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
| 1131 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1195 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
| 1132 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1196 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
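
The ALLOC_WMARK_* change above turns three independent flag bits into an index: the low bits of alloc_flags now select zone->watermark[WMARK_MIN..WMARK_HIGH] directly, and ALLOC_NO_WATERMARKS sits just above the mask (enforced by the BUILD_BUG_ON added below in get_page_from_freelist()). A reduced userspace model of the lookup, with made-up watermark values:

    #include <stdio.h>

    enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

    #define ALLOC_WMARK_MIN         WMARK_MIN       /* 0 */
    #define ALLOC_WMARK_LOW         WMARK_LOW       /* 1 */
    #define ALLOC_WMARK_HIGH        WMARK_HIGH      /* 2 */
    #define ALLOC_NO_WATERMARKS     0x04
    #define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS - 1)   /* 0x03 */

    int main(void)
    {
            unsigned long watermark[NR_WMARK] = { 128, 160, 192 }; /* made-up pages */
            int alloc_flags = ALLOC_WMARK_LOW | 0x40;       /* 0x40 == ALLOC_CPUSET */

            /* same expression as get_page_from_freelist(): mask off the
             * non-watermark bits and index the per-zone array directly */
            unsigned long mark = watermark[alloc_flags & ALLOC_WMARK_MASK];

            printf("selected watermark: %lu pages\n", mark);  /* prints 160 */
            return 0;
    }
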
| @@ -1384,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
| 1384 | */ | 1448 | */ |
| 1385 | static struct page * | 1449 | static struct page * |
| 1386 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1450 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
| 1387 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) | 1451 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
| 1452 | struct zone *preferred_zone, int migratetype) | ||
| 1388 | { | 1453 | { |
| 1389 | struct zoneref *z; | 1454 | struct zoneref *z; |
| 1390 | struct page *page = NULL; | 1455 | struct page *page = NULL; |
| 1391 | int classzone_idx; | 1456 | int classzone_idx; |
| 1392 | struct zone *zone, *preferred_zone; | 1457 | struct zone *zone; |
| 1393 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1458 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
| 1394 | int zlc_active = 0; /* set if using zonelist_cache */ | 1459 | int zlc_active = 0; /* set if using zonelist_cache */ |
| 1395 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1460 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
| 1396 | 1461 | ||
| 1397 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | ||
| 1398 | &preferred_zone); | ||
| 1399 | if (!preferred_zone) | ||
| 1400 | return NULL; | ||
| 1401 | |||
| 1402 | classzone_idx = zone_idx(preferred_zone); | 1462 | classzone_idx = zone_idx(preferred_zone); |
| 1403 | |||
| 1404 | zonelist_scan: | 1463 | zonelist_scan: |
| 1405 | /* | 1464 | /* |
| 1406 | * Scan zonelist, looking for a zone with enough free. | 1465 | * Scan zonelist, looking for a zone with enough free. |
| @@ -1415,31 +1474,49 @@ zonelist_scan: | |||
| 1415 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1474 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
| 1416 | goto try_next_zone; | 1475 | goto try_next_zone; |
| 1417 | 1476 | ||
| 1477 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
| 1418 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1478 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
| 1419 | unsigned long mark; | 1479 | unsigned long mark; |
| 1420 | if (alloc_flags & ALLOC_WMARK_MIN) | 1480 | int ret; |
| 1421 | mark = zone->pages_min; | 1481 | |
| 1422 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1482 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
| 1423 | mark = zone->pages_low; | 1483 | if (zone_watermark_ok(zone, order, mark, |
| 1424 | else | 1484 | classzone_idx, alloc_flags)) |
| 1425 | mark = zone->pages_high; | 1485 | goto try_this_zone; |
| 1426 | if (!zone_watermark_ok(zone, order, mark, | 1486 | |
| 1427 | classzone_idx, alloc_flags)) { | 1487 | if (zone_reclaim_mode == 0) |
| 1428 | if (!zone_reclaim_mode || | 1488 | goto this_zone_full; |
| 1429 | !zone_reclaim(zone, gfp_mask, order)) | 1489 | |
| 1490 | ret = zone_reclaim(zone, gfp_mask, order); | ||
| 1491 | switch (ret) { | ||
| 1492 | case ZONE_RECLAIM_NOSCAN: | ||
| 1493 | /* did not scan */ | ||
| 1494 | goto try_next_zone; | ||
| 1495 | case ZONE_RECLAIM_FULL: | ||
| 1496 | /* scanned but unreclaimable */ | ||
| 1497 | goto this_zone_full; | ||
| 1498 | default: | ||
| 1499 | /* did we reclaim enough */ | ||
| 1500 | if (!zone_watermark_ok(zone, order, mark, | ||
| 1501 | classzone_idx, alloc_flags)) | ||
| 1430 | goto this_zone_full; | 1502 | goto this_zone_full; |
| 1431 | } | 1503 | } |
| 1432 | } | 1504 | } |
| 1433 | 1505 | ||
| 1434 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); | 1506 | try_this_zone: |
| 1507 | page = buffered_rmqueue(preferred_zone, zone, order, | ||
| 1508 | gfp_mask, migratetype); | ||
| 1435 | if (page) | 1509 | if (page) |
| 1436 | break; | 1510 | break; |
| 1437 | this_zone_full: | 1511 | this_zone_full: |
| 1438 | if (NUMA_BUILD) | 1512 | if (NUMA_BUILD) |
| 1439 | zlc_mark_zone_full(zonelist, z); | 1513 | zlc_mark_zone_full(zonelist, z); |
| 1440 | try_next_zone: | 1514 | try_next_zone: |
| 1441 | if (NUMA_BUILD && !did_zlc_setup) { | 1515 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
| 1442 | /* we do zlc_setup after the first zone is tried */ | 1516 | /* |
| 1517 | * we do zlc_setup after the first zone is tried but only | ||
| 1518 | * if there are multiple nodes to make it worthwhile | ||
| 1519 | */ | ||
| 1443 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1520 | allowednodes = zlc_setup(zonelist, alloc_flags); |
| 1444 | zlc_active = 1; | 1521 | zlc_active = 1; |
| 1445 | did_zlc_setup = 1; | 1522 | did_zlc_setup = 1; |
| @@ -1454,47 +1531,217 @@ try_next_zone: | |||
| 1454 | return page; | 1531 | return page; |
| 1455 | } | 1532 | } |
| 1456 | 1533 | ||
| 1534 | static inline int | ||
| 1535 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | ||
| 1536 | unsigned long pages_reclaimed) | ||
| 1537 | { | ||
| 1538 | /* Do not loop if specifically requested */ | ||
| 1539 | if (gfp_mask & __GFP_NORETRY) | ||
| 1540 | return 0; | ||
| 1541 | |||
| 1542 | /* | ||
| 1543 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
| 1544 | * means __GFP_NOFAIL, but that may not be true in other | ||
| 1545 | * implementations. | ||
| 1546 | */ | ||
| 1547 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
| 1548 | return 1; | ||
| 1549 | |||
| 1550 | /* | ||
| 1551 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
| 1552 | * specified, then we retry until we no longer reclaim any pages | ||
| 1553 | * (above), or we've reclaimed an order of pages at least as | ||
| 1554 | * large as the allocation's order. In both cases, if the | ||
| 1555 | * allocation still fails, we stop retrying. | ||
| 1556 | */ | ||
| 1557 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
| 1558 | return 1; | ||
| 1559 | |||
| 1560 | /* | ||
| 1561 | * Don't let big-order allocations loop unless the caller | ||
| 1562 | * explicitly requests that. | ||
| 1563 | */ | ||
| 1564 | if (gfp_mask & __GFP_NOFAIL) | ||
| 1565 | return 1; | ||
| 1566 | |||
| 1567 | return 0; | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | static inline struct page * | ||
| 1571 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | ||
| 1572 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1573 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 1574 | int migratetype) | ||
| 1575 | { | ||
| 1576 | struct page *page; | ||
| 1577 | |||
| 1578 | /* Acquire the OOM killer lock for the zones in zonelist */ | ||
| 1579 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
| 1580 | schedule_timeout_uninterruptible(1); | ||
| 1581 | return NULL; | ||
| 1582 | } | ||
| 1583 | |||
| 1584 | /* | ||
| 1585 | * Go through the zonelist yet one more time, keep very high watermark | ||
| 1586 | * here, this is only to catch a parallel oom killing, we must fail if | ||
| 1587 | * we're still under heavy pressure. | ||
| 1588 | */ | ||
| 1589 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
| 1590 | order, zonelist, high_zoneidx, | ||
| 1591 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
| 1592 | preferred_zone, migratetype); | ||
| 1593 | if (page) | ||
| 1594 | goto out; | ||
| 1595 | |||
| 1596 | /* The OOM killer will not help higher order allocs */ | ||
| 1597 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | ||
| 1598 | goto out; | ||
| 1599 | |||
| 1600 | /* Exhausted what can be done so it's blamo time */ | ||
| 1601 | out_of_memory(zonelist, gfp_mask, order); | ||
| 1602 | |||
| 1603 | out: | ||
| 1604 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1605 | return page; | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | /* The really slow allocator path where we enter direct reclaim */ | ||
| 1609 | static inline struct page * | ||
| 1610 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
| 1611 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1612 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
| 1613 | int migratetype, unsigned long *did_some_progress) | ||
| 1614 | { | ||
| 1615 | struct page *page = NULL; | ||
| 1616 | struct reclaim_state reclaim_state; | ||
| 1617 | struct task_struct *p = current; | ||
| 1618 | |||
| 1619 | cond_resched(); | ||
| 1620 | |||
| 1621 | /* We now go into synchronous reclaim */ | ||
| 1622 | cpuset_memory_pressure_bump(); | ||
| 1623 | |||
| 1624 | /* | ||
| 1625 | * The task's cpuset might have expanded its set of allowable nodes | ||
| 1626 | */ | ||
| 1627 | p->flags |= PF_MEMALLOC; | ||
| 1628 | lockdep_set_current_reclaim_state(gfp_mask); | ||
| 1629 | reclaim_state.reclaimed_slab = 0; | ||
| 1630 | p->reclaim_state = &reclaim_state; | ||
| 1631 | |||
| 1632 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | ||
| 1633 | |||
| 1634 | p->reclaim_state = NULL; | ||
| 1635 | lockdep_clear_current_reclaim_state(); | ||
| 1636 | p->flags &= ~PF_MEMALLOC; | ||
| 1637 | |||
| 1638 | cond_resched(); | ||
| 1639 | |||
| 1640 | if (order != 0) | ||
| 1641 | drain_all_pages(); | ||
| 1642 | |||
| 1643 | if (likely(*did_some_progress)) | ||
| 1644 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
| 1645 | zonelist, high_zoneidx, | ||
| 1646 | alloc_flags, preferred_zone, | ||
| 1647 | migratetype); | ||
| 1648 | return page; | ||
| 1649 | } | ||
| 1650 | |||
| 1457 | /* | 1651 | /* |
| 1458 | * This is the 'heart' of the zoned buddy allocator. | 1652 | * This is called in the allocator slow-path if the allocation request is of |
| 1653 | * sufficient urgency to ignore watermarks and take other desperate measures | ||
| 1459 | */ | 1654 | */ |
| 1460 | struct page * | 1655 | static inline struct page * |
| 1461 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1656 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
| 1462 | struct zonelist *zonelist, nodemask_t *nodemask) | 1657 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
| 1658 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 1659 | int migratetype) | ||
| 1660 | { | ||
| 1661 | struct page *page; | ||
| 1662 | |||
| 1663 | do { | ||
| 1664 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
| 1665 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | ||
| 1666 | preferred_zone, migratetype); | ||
| 1667 | |||
| 1668 | if (!page && gfp_mask & __GFP_NOFAIL) | ||
| 1669 | congestion_wait(WRITE, HZ/50); | ||
| 1670 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | ||
| 1671 | |||
| 1672 | return page; | ||
| 1673 | } | ||
| 1674 | |||
| 1675 | static inline | ||
| 1676 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | ||
| 1677 | enum zone_type high_zoneidx) | ||
| 1463 | { | 1678 | { |
| 1464 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
| 1465 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 1466 | struct zoneref *z; | 1679 | struct zoneref *z; |
| 1467 | struct zone *zone; | 1680 | struct zone *zone; |
| 1468 | struct page *page; | ||
| 1469 | struct reclaim_state reclaim_state; | ||
| 1470 | struct task_struct *p = current; | ||
| 1471 | int do_retry; | ||
| 1472 | int alloc_flags; | ||
| 1473 | unsigned long did_some_progress; | ||
| 1474 | unsigned long pages_reclaimed = 0; | ||
| 1475 | 1681 | ||
| 1476 | lockdep_trace_alloc(gfp_mask); | 1682 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
| 1683 | wakeup_kswapd(zone, order); | ||
| 1684 | } | ||
| 1477 | 1685 | ||
| 1478 | might_sleep_if(wait); | 1686 | static inline int |
| 1687 | gfp_to_alloc_flags(gfp_t gfp_mask) | ||
| 1688 | { | ||
| 1689 | struct task_struct *p = current; | ||
| 1690 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | ||
| 1691 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
| 1479 | 1692 | ||
| 1480 | if (should_fail_alloc_page(gfp_mask, order)) | 1693 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
| 1481 | return NULL; | 1694 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); |
| 1482 | 1695 | ||
| 1483 | restart: | 1696 | /* |
| 1484 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ | 1697 | * The caller may dip into page reserves a bit more if the caller |
| 1698 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
| 1699 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
| 1700 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
| 1701 | */ | ||
| 1702 | alloc_flags |= (gfp_mask & __GFP_HIGH); | ||
| 1485 | 1703 | ||
| 1486 | if (unlikely(!z->zone)) { | 1704 | if (!wait) { |
| 1705 | alloc_flags |= ALLOC_HARDER; | ||
| 1487 | /* | 1706 | /* |
| 1488 | * Happens if we have an empty zonelist as a result of | 1707 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
| 1489 | * GFP_THISNODE being used on a memoryless node | 1708 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
| 1490 | */ | 1709 | */ |
| 1491 | return NULL; | 1710 | alloc_flags &= ~ALLOC_CPUSET; |
| 1711 | } else if (unlikely(rt_task(p))) | ||
| 1712 | alloc_flags |= ALLOC_HARDER; | ||
| 1713 | |||
| 1714 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | ||
| 1715 | if (!in_interrupt() && | ||
| 1716 | ((p->flags & PF_MEMALLOC) || | ||
| 1717 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
| 1718 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
| 1492 | } | 1719 | } |
| 1493 | 1720 | ||
| 1494 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1721 | return alloc_flags; |
| 1495 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1722 | } |
| 1496 | if (page) | 1723 | |
| 1497 | goto got_pg; | 1724 | static inline struct page * |
| 1725 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | ||
| 1726 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1727 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 1728 | int migratetype) | ||
| 1729 | { | ||
| 1730 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
| 1731 | struct page *page = NULL; | ||
| 1732 | int alloc_flags; | ||
| 1733 | unsigned long pages_reclaimed = 0; | ||
| 1734 | unsigned long did_some_progress; | ||
| 1735 | struct task_struct *p = current; | ||
| 1736 | |||
| 1737 | /* | ||
| 1738 | * In the slowpath, we sanity check order to avoid ever trying to | ||
| 1739 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | ||
| 1740 | * be using allocators in order of preference for an area that is | ||
| 1741 | * too large. | ||
| 1742 | */ | ||
| 1743 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | ||
| 1744 | return NULL; | ||
| 1498 | 1745 | ||
| 1499 | /* | 1746 | /* |
| 1500 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1747 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
| @@ -1507,154 +1754,83 @@ restart: | |||
| 1507 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1754 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
| 1508 | goto nopage; | 1755 | goto nopage; |
| 1509 | 1756 | ||
| 1510 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1757 | wake_all_kswapd(order, zonelist, high_zoneidx); |
| 1511 | wakeup_kswapd(zone, order); | ||
| 1512 | 1758 | ||
| 1513 | /* | 1759 | /* |
| 1514 | * OK, we're below the kswapd watermark and have kicked background | 1760 | * OK, we're below the kswapd watermark and have kicked background |
| 1515 | * reclaim. Now things get more complex, so set up alloc_flags according | 1761 | * reclaim. Now things get more complex, so set up alloc_flags according |
| 1516 | * to how we want to proceed. | 1762 | * to how we want to proceed. |
| 1517 | * | ||
| 1518 | * The caller may dip into page reserves a bit more if the caller | ||
| 1519 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
| 1520 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
| 1521 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
| 1522 | */ | 1763 | */ |
| 1523 | alloc_flags = ALLOC_WMARK_MIN; | 1764 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
| 1524 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | ||
| 1525 | alloc_flags |= ALLOC_HARDER; | ||
| 1526 | if (gfp_mask & __GFP_HIGH) | ||
| 1527 | alloc_flags |= ALLOC_HIGH; | ||
| 1528 | if (wait) | ||
| 1529 | alloc_flags |= ALLOC_CPUSET; | ||
| 1530 | 1765 | ||
| 1531 | /* | 1766 | restart: |
| 1532 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1767 | /* This is the last chance, in general, before the goto nopage. */ |
| 1533 | * coming from realtime tasks go deeper into reserves. | ||
| 1534 | * | ||
| 1535 | * This is the last chance, in general, before the goto nopage. | ||
| 1536 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | ||
| 1537 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | ||
| 1538 | */ | ||
| 1539 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 1768 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
| 1540 | high_zoneidx, alloc_flags); | 1769 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
| 1770 | preferred_zone, migratetype); | ||
| 1541 | if (page) | 1771 | if (page) |
| 1542 | goto got_pg; | 1772 | goto got_pg; |
| 1543 | 1773 | ||
| 1544 | /* This allocation should allow future memory freeing. */ | ||
| 1545 | |||
| 1546 | rebalance: | 1774 | rebalance: |
| 1547 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1775 | /* Allocate without watermarks if the context allows */ |
| 1548 | && !in_interrupt()) { | 1776 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
| 1549 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1777 | page = __alloc_pages_high_priority(gfp_mask, order, |
| 1550 | nofail_alloc: | 1778 | zonelist, high_zoneidx, nodemask, |
| 1551 | /* go through the zonelist yet again, ignoring mins */ | 1779 | preferred_zone, migratetype); |
| 1552 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1780 | if (page) |
| 1553 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | 1781 | goto got_pg; |
| 1554 | if (page) | ||
| 1555 | goto got_pg; | ||
| 1556 | if (gfp_mask & __GFP_NOFAIL) { | ||
| 1557 | congestion_wait(WRITE, HZ/50); | ||
| 1558 | goto nofail_alloc; | ||
| 1559 | } | ||
| 1560 | } | ||
| 1561 | goto nopage; | ||
| 1562 | } | 1782 | } |
| 1563 | 1783 | ||
| 1564 | /* Atomic allocations - we can't balance anything */ | 1784 | /* Atomic allocations - we can't balance anything */ |
| 1565 | if (!wait) | 1785 | if (!wait) |
| 1566 | goto nopage; | 1786 | goto nopage; |
| 1567 | 1787 | ||
| 1568 | cond_resched(); | 1788 | /* Avoid recursion of direct reclaim */ |
| 1789 | if (p->flags & PF_MEMALLOC) | ||
| 1790 | goto nopage; | ||
| 1791 | |||
| 1792 | /* Try direct reclaim and then allocating */ | ||
| 1793 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | ||
| 1794 | zonelist, high_zoneidx, | ||
| 1795 | nodemask, | ||
| 1796 | alloc_flags, preferred_zone, | ||
| 1797 | migratetype, &did_some_progress); | ||
| 1798 | if (page) | ||
| 1799 | goto got_pg; | ||
| 1569 | 1800 | ||
| 1570 | /* We now go into synchronous reclaim */ | ||
| 1571 | cpuset_memory_pressure_bump(); | ||
| 1572 | /* | 1801 | /* |
| 1573 | * The task's cpuset might have expanded its set of allowable nodes | 1802 | * If we failed to make any progress reclaiming, then we are |
| 1803 | * running out of options and have to consider going OOM | ||
| 1574 | */ | 1804 | */ |
| 1575 | cpuset_update_task_memory_state(); | 1805 | if (!did_some_progress) { |
| 1576 | p->flags |= PF_MEMALLOC; | 1806 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
| 1577 | 1807 | if (oom_killer_disabled) | |
| 1578 | lockdep_set_current_reclaim_state(gfp_mask); | 1808 | goto nopage; |
| 1579 | reclaim_state.reclaimed_slab = 0; | 1809 | page = __alloc_pages_may_oom(gfp_mask, order, |
| 1580 | p->reclaim_state = &reclaim_state; | 1810 | zonelist, high_zoneidx, |
| 1581 | 1811 | nodemask, preferred_zone, | |
| 1582 | did_some_progress = try_to_free_pages(zonelist, order, | 1812 | migratetype); |
| 1583 | gfp_mask, nodemask); | 1813 | if (page) |
| 1584 | 1814 | goto got_pg; | |
| 1585 | p->reclaim_state = NULL; | ||
| 1586 | lockdep_clear_current_reclaim_state(); | ||
| 1587 | p->flags &= ~PF_MEMALLOC; | ||
| 1588 | |||
| 1589 | cond_resched(); | ||
| 1590 | 1815 | ||
| 1591 | if (order != 0) | 1816 | /* |
| 1592 | drain_all_pages(); | 1817 | * The OOM killer does not trigger for high-order |
| 1818 | * ~__GFP_NOFAIL allocations so if no progress is being | ||
| 1819 | * made, there are no other options and retrying is | ||
| 1820 | * unlikely to help. | ||
| 1821 | */ | ||
| 1822 | if (order > PAGE_ALLOC_COSTLY_ORDER && | ||
| 1823 | !(gfp_mask & __GFP_NOFAIL)) | ||
| 1824 | goto nopage; | ||
| 1593 | 1825 | ||
| 1594 | if (likely(did_some_progress)) { | ||
| 1595 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
| 1596 | zonelist, high_zoneidx, alloc_flags); | ||
| 1597 | if (page) | ||
| 1598 | goto got_pg; | ||
| 1599 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
| 1600 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
| 1601 | schedule_timeout_uninterruptible(1); | ||
| 1602 | goto restart; | 1826 | goto restart; |
| 1603 | } | 1827 | } |
| 1604 | |||
| 1605 | /* | ||
| 1606 | * Go through the zonelist yet one more time, keep | ||
| 1607 | * very high watermark here, this is only to catch | ||
| 1608 | * a parallel oom killing, we must fail if we're still | ||
| 1609 | * under heavy pressure. | ||
| 1610 | */ | ||
| 1611 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
| 1612 | order, zonelist, high_zoneidx, | ||
| 1613 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
| 1614 | if (page) { | ||
| 1615 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1616 | goto got_pg; | ||
| 1617 | } | ||
| 1618 | |||
| 1619 | /* The OOM killer will not help higher order allocs so fail */ | ||
| 1620 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | ||
| 1621 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1622 | goto nopage; | ||
| 1623 | } | ||
| 1624 | |||
| 1625 | out_of_memory(zonelist, gfp_mask, order); | ||
| 1626 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1627 | goto restart; | ||
| 1628 | } | 1828 | } |
| 1629 | 1829 | ||
| 1630 | /* | 1830 | /* Check if we should retry the allocation */ |
| 1631 | * Don't let big-order allocations loop unless the caller explicitly | ||
| 1632 | * requests that. Wait for some write requests to complete then retry. | ||
| 1633 | * | ||
| 1634 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
| 1635 | * means __GFP_NOFAIL, but that may not be true in other | ||
| 1636 | * implementations. | ||
| 1637 | * | ||
| 1638 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
| 1639 | * specified, then we retry until we no longer reclaim any pages | ||
| 1640 | * (above), or we've reclaimed an order of pages at least as | ||
| 1641 | * large as the allocation's order. In both cases, if the | ||
| 1642 | * allocation still fails, we stop retrying. | ||
| 1643 | */ | ||
| 1644 | pages_reclaimed += did_some_progress; | 1831 | pages_reclaimed += did_some_progress; |
| 1645 | do_retry = 0; | 1832 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
| 1646 | if (!(gfp_mask & __GFP_NORETRY)) { | 1833 | /* Wait for some write requests to complete then retry */ |
| 1647 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | ||
| 1648 | do_retry = 1; | ||
| 1649 | } else { | ||
| 1650 | if (gfp_mask & __GFP_REPEAT && | ||
| 1651 | pages_reclaimed < (1 << order)) | ||
| 1652 | do_retry = 1; | ||
| 1653 | } | ||
| 1654 | if (gfp_mask & __GFP_NOFAIL) | ||
| 1655 | do_retry = 1; | ||
| 1656 | } | ||
| 1657 | if (do_retry) { | ||
| 1658 | congestion_wait(WRITE, HZ/50); | 1834 | congestion_wait(WRITE, HZ/50); |
| 1659 | goto rebalance; | 1835 | goto rebalance; |
| 1660 | } | 1836 | } |
| @@ -1667,10 +1843,58 @@ nopage: | |||
| 1667 | dump_stack(); | 1843 | dump_stack(); |
| 1668 | show_mem(); | 1844 | show_mem(); |
| 1669 | } | 1845 | } |
| 1846 | return page; | ||
| 1670 | got_pg: | 1847 | got_pg: |
| 1848 | if (kmemcheck_enabled) | ||
| 1849 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 1850 | return page; | ||
| 1851 | |||
| 1852 | } | ||
| 1853 | |||
| 1854 | /* | ||
| 1855 | * This is the 'heart' of the zoned buddy allocator. | ||
| 1856 | */ | ||
| 1857 | struct page * | ||
| 1858 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
| 1859 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
| 1860 | { | ||
| 1861 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 1862 | struct zone *preferred_zone; | ||
| 1863 | struct page *page; | ||
| 1864 | int migratetype = allocflags_to_migratetype(gfp_mask); | ||
| 1865 | |||
| 1866 | lockdep_trace_alloc(gfp_mask); | ||
| 1867 | |||
| 1868 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
| 1869 | |||
| 1870 | if (should_fail_alloc_page(gfp_mask, order)) | ||
| 1871 | return NULL; | ||
| 1872 | |||
| 1873 | /* | ||
| 1874 | * Check the zones suitable for the gfp_mask contain at least one | ||
| 1875 | * valid zone. It's possible to have an empty zonelist as a result | ||
| 1876 | * of GFP_THISNODE and a memoryless node | ||
| 1877 | */ | ||
| 1878 | if (unlikely(!zonelist->_zonerefs->zone)) | ||
| 1879 | return NULL; | ||
| 1880 | |||
| 1881 | /* The preferred zone is used for statistics later */ | ||
| 1882 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | ||
| 1883 | if (!preferred_zone) | ||
| 1884 | return NULL; | ||
| 1885 | |||
| 1886 | /* First allocation attempt */ | ||
| 1887 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | ||
| 1888 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | ||
| 1889 | preferred_zone, migratetype); | ||
| 1890 | if (unlikely(!page)) | ||
| 1891 | page = __alloc_pages_slowpath(gfp_mask, order, | ||
| 1892 | zonelist, high_zoneidx, nodemask, | ||
| 1893 | preferred_zone, migratetype); | ||
| 1894 | |||
| 1671 | return page; | 1895 | return page; |
| 1672 | } | 1896 | } |
| 1673 | EXPORT_SYMBOL(__alloc_pages_internal); | 1897 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
| 1674 | 1898 | ||
| 1675 | /* | 1899 | /* |
| 1676 | * Common helper functions. | 1900 | * Common helper functions. |
| @@ -1799,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset) | |||
| 1799 | 2023 | ||
| 1800 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2024 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
| 1801 | unsigned long size = zone->present_pages; | 2025 | unsigned long size = zone->present_pages; |
| 1802 | unsigned long high = zone->pages_high; | 2026 | unsigned long high = high_wmark_pages(zone); |
| 1803 | if (size > high) | 2027 | if (size > high) |
| 1804 | sum += size - high; | 2028 | sum += size - high; |
| 1805 | } | 2029 | } |
| @@ -1891,19 +2115,14 @@ void show_free_areas(void) | |||
| 1891 | 2115 | ||
| 1892 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" | 2116 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" |
| 1893 | " inactive_file:%lu" | 2117 | " inactive_file:%lu" |
| 1894 | //TODO: check/adjust line lengths | ||
| 1895 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1896 | " unevictable:%lu" | 2118 | " unevictable:%lu" |
| 1897 | #endif | ||
| 1898 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2119 | " dirty:%lu writeback:%lu unstable:%lu\n" |
| 1899 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 2120 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
| 1900 | global_page_state(NR_ACTIVE_ANON), | 2121 | global_page_state(NR_ACTIVE_ANON), |
| 1901 | global_page_state(NR_ACTIVE_FILE), | 2122 | global_page_state(NR_ACTIVE_FILE), |
| 1902 | global_page_state(NR_INACTIVE_ANON), | 2123 | global_page_state(NR_INACTIVE_ANON), |
| 1903 | global_page_state(NR_INACTIVE_FILE), | 2124 | global_page_state(NR_INACTIVE_FILE), |
| 1904 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1905 | global_page_state(NR_UNEVICTABLE), | 2125 | global_page_state(NR_UNEVICTABLE), |
| 1906 | #endif | ||
| 1907 | global_page_state(NR_FILE_DIRTY), | 2126 | global_page_state(NR_FILE_DIRTY), |
| 1908 | global_page_state(NR_WRITEBACK), | 2127 | global_page_state(NR_WRITEBACK), |
| 1909 | global_page_state(NR_UNSTABLE_NFS), | 2128 | global_page_state(NR_UNSTABLE_NFS), |
| @@ -1927,25 +2146,21 @@ void show_free_areas(void) | |||
| 1927 | " inactive_anon:%lukB" | 2146 | " inactive_anon:%lukB" |
| 1928 | " active_file:%lukB" | 2147 | " active_file:%lukB" |
| 1929 | " inactive_file:%lukB" | 2148 | " inactive_file:%lukB" |
| 1930 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1931 | " unevictable:%lukB" | 2149 | " unevictable:%lukB" |
| 1932 | #endif | ||
| 1933 | " present:%lukB" | 2150 | " present:%lukB" |
| 1934 | " pages_scanned:%lu" | 2151 | " pages_scanned:%lu" |
| 1935 | " all_unreclaimable? %s" | 2152 | " all_unreclaimable? %s" |
| 1936 | "\n", | 2153 | "\n", |
| 1937 | zone->name, | 2154 | zone->name, |
| 1938 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2155 | K(zone_page_state(zone, NR_FREE_PAGES)), |
| 1939 | K(zone->pages_min), | 2156 | K(min_wmark_pages(zone)), |
| 1940 | K(zone->pages_low), | 2157 | K(low_wmark_pages(zone)), |
| 1941 | K(zone->pages_high), | 2158 | K(high_wmark_pages(zone)), |
| 1942 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2159 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
| 1943 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2160 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
| 1944 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2161 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
| 1945 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2162 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
| 1946 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1947 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2163 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
| 1948 | #endif | ||
| 1949 | K(zone->present_pages), | 2164 | K(zone->present_pages), |
| 1950 | zone->pages_scanned, | 2165 | zone->pages_scanned, |
| 1951 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2166 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
| @@ -2103,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2103 | } | 2318 | } |
| 2104 | 2319 | ||
| 2105 | 2320 | ||
| 2106 | #define MAX_NODE_LOAD (num_online_nodes()) | 2321 | #define MAX_NODE_LOAD (nr_online_nodes) |
| 2107 | static int node_load[MAX_NUMNODES]; | 2322 | static int node_load[MAX_NUMNODES]; |
| 2108 | 2323 | ||
| 2109 | /** | 2324 | /** |
| @@ -2312,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
| 2312 | 2527 | ||
| 2313 | /* NUMA-aware ordering of nodes */ | 2528 | /* NUMA-aware ordering of nodes */ |
| 2314 | local_node = pgdat->node_id; | 2529 | local_node = pgdat->node_id; |
| 2315 | load = num_online_nodes(); | 2530 | load = nr_online_nodes; |
| 2316 | prev_node = local_node; | 2531 | prev_node = local_node; |
| 2317 | nodes_clear(used_mask); | 2532 | nodes_clear(used_mask); |
| 2318 | 2533 | ||
| @@ -2463,7 +2678,7 @@ void build_all_zonelists(void) | |||
| 2463 | 2678 | ||
| 2464 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 2679 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
| 2465 | "Total pages: %ld\n", | 2680 | "Total pages: %ld\n", |
| 2466 | num_online_nodes(), | 2681 | nr_online_nodes, |
| 2467 | zonelist_order_name[current_zonelist_order], | 2682 | zonelist_order_name[current_zonelist_order], |
| 2468 | page_group_by_mobility_disabled ? "off" : "on", | 2683 | page_group_by_mobility_disabled ? "off" : "on", |
| 2469 | vm_total_pages); | 2684 | vm_total_pages); |
| @@ -2542,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
| 2542 | 2757 | ||
| 2543 | /* | 2758 | /* |
| 2544 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 2759 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
| 2545 | * of blocks reserved is based on zone->pages_min. The memory within the | 2760 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
| 2546 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | 2761 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
| 2547 | * higher will lead to a bigger reserve which will get freed as contiguous | 2762 | * higher will lead to a bigger reserve which will get freed as contiguous |
| 2548 | * blocks as reclaim kicks in | 2763 | * blocks as reclaim kicks in |
| 2549 | */ | 2764 | */ |
| @@ -2556,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 2556 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 2771 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
| 2557 | start_pfn = zone->zone_start_pfn; | 2772 | start_pfn = zone->zone_start_pfn; |
| 2558 | end_pfn = start_pfn + zone->spanned_pages; | 2773 | end_pfn = start_pfn + zone->spanned_pages; |
| 2559 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | 2774 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
| 2560 | pageblock_order; | 2775 | pageblock_order; |
| 2561 | 2776 | ||
| 2562 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 2777 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
| @@ -3488,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3488 | zone_pcp_init(zone); | 3703 | zone_pcp_init(zone); |
| 3489 | for_each_lru(l) { | 3704 | for_each_lru(l) { |
| 3490 | INIT_LIST_HEAD(&zone->lru[l].list); | 3705 | INIT_LIST_HEAD(&zone->lru[l].list); |
| 3491 | zone->lru[l].nr_scan = 0; | 3706 | zone->lru[l].nr_saved_scan = 0; |
| 3492 | } | 3707 | } |
| 3493 | zone->reclaim_stat.recent_rotated[0] = 0; | 3708 | zone->reclaim_stat.recent_rotated[0] = 0; |
| 3494 | zone->reclaim_stat.recent_rotated[1] = 0; | 3709 | zone->reclaim_stat.recent_rotated[1] = 0; |
| @@ -4025,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 4025 | early_node_map[i].start_pfn, | 4240 | early_node_map[i].start_pfn, |
| 4026 | early_node_map[i].end_pfn); | 4241 | early_node_map[i].end_pfn); |
| 4027 | 4242 | ||
| 4243 | /* | ||
| 4244 | * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init | ||
| 4245 | * that node_mask, clear it at first | ||
| 4246 | */ | ||
| 4247 | nodes_clear(node_states[N_HIGH_MEMORY]); | ||
| 4028 | /* Initialise every node */ | 4248 | /* Initialise every node */ |
| 4029 | mminit_verify_pageflags_layout(); | 4249 | mminit_verify_pageflags_layout(); |
| 4030 | setup_nr_node_ids(); | 4250 | setup_nr_node_ids(); |
| @@ -4159,8 +4379,8 @@ static void calculate_totalreserve_pages(void) | |||
| 4159 | max = zone->lowmem_reserve[j]; | 4379 | max = zone->lowmem_reserve[j]; |
| 4160 | } | 4380 | } |
| 4161 | 4381 | ||
| 4162 | /* we treat pages_high as reserved pages. */ | 4382 | /* we treat the high watermark as reserved pages. */ |
| 4163 | max += zone->pages_high; | 4383 | max += high_wmark_pages(zone); |
| 4164 | 4384 | ||
| 4165 | if (max > zone->present_pages) | 4385 | if (max > zone->present_pages) |
| 4166 | max = zone->present_pages; | 4386 | max = zone->present_pages; |
| @@ -4210,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void) | |||
| 4210 | } | 4430 | } |
| 4211 | 4431 | ||
| 4212 | /** | 4432 | /** |
| 4213 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 4433 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
| 4434 | * or when memory is hot-{added|removed} | ||
| 4214 | * | 4435 | * |
| 4215 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 4436 | * Ensures that the watermark[min,low,high] values for each zone are set |
| 4216 | * with respect to min_free_kbytes. | 4437 | * correctly with respect to min_free_kbytes. |
| 4217 | */ | 4438 | */ |
| 4218 | void setup_per_zone_pages_min(void) | 4439 | void setup_per_zone_wmarks(void) |
| 4219 | { | 4440 | { |
| 4220 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 4441 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
| 4221 | unsigned long lowmem_pages = 0; | 4442 | unsigned long lowmem_pages = 0; |
| @@ -4240,7 +4461,7 @@ void setup_per_zone_pages_min(void) | |||
| 4240 | * need highmem pages, so cap pages_min to a small | 4461 | * need highmem pages, so cap pages_min to a small |
| 4241 | * value here. | 4462 | * value here. |
| 4242 | * | 4463 | * |
| 4243 | * The (pages_high-pages_low) and (pages_low-pages_min) | 4464 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
| 4244 | * deltas controls asynch page reclaim, and so should | 4465 | * deltas controls asynch page reclaim, and so should |
| 4245 | * not be capped for highmem. | 4466 | * not be capped for highmem. |
| 4246 | */ | 4467 | */ |
| @@ -4251,17 +4472,17 @@ void setup_per_zone_pages_min(void) | |||
| 4251 | min_pages = SWAP_CLUSTER_MAX; | 4472 | min_pages = SWAP_CLUSTER_MAX; |
| 4252 | if (min_pages > 128) | 4473 | if (min_pages > 128) |
| 4253 | min_pages = 128; | 4474 | min_pages = 128; |
| 4254 | zone->pages_min = min_pages; | 4475 | zone->watermark[WMARK_MIN] = min_pages; |
| 4255 | } else { | 4476 | } else { |
| 4256 | /* | 4477 | /* |
| 4257 | * If it's a lowmem zone, reserve a number of pages | 4478 | * If it's a lowmem zone, reserve a number of pages |
| 4258 | * proportionate to the zone's size. | 4479 | * proportionate to the zone's size. |
| 4259 | */ | 4480 | */ |
| 4260 | zone->pages_min = tmp; | 4481 | zone->watermark[WMARK_MIN] = tmp; |
| 4261 | } | 4482 | } |
| 4262 | 4483 | ||
| 4263 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4484 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
| 4264 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4485 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
| 4265 | setup_zone_migrate_reserve(zone); | 4486 | setup_zone_migrate_reserve(zone); |
| 4266 | spin_unlock_irqrestore(&zone->lock, flags); | 4487 | spin_unlock_irqrestore(&zone->lock, flags); |
| 4267 | } | 4488 | } |
| @@ -4271,8 +4492,6 @@ void setup_per_zone_pages_min(void) | |||
| 4271 | } | 4492 | } |
| 4272 | 4493 | ||
| 4273 | /** | 4494 | /** |
| 4274 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
| 4275 | * | ||
| 4276 | * The inactive anon list should be small enough that the VM never has to | 4495 | * The inactive anon list should be small enough that the VM never has to |
| 4277 | * do too much work, but large enough that each inactive page has a chance | 4496 | * do too much work, but large enough that each inactive page has a chance |
| 4278 | * to be referenced again before it is swapped out. | 4497 | * to be referenced again before it is swapped out. |
| @@ -4293,21 +4512,26 @@ void setup_per_zone_pages_min(void) | |||
| 4293 | * 1TB 101 10GB | 4512 | * 1TB 101 10GB |
| 4294 | * 10TB 320 32GB | 4513 | * 10TB 320 32GB |
| 4295 | */ | 4514 | */ |
| 4296 | static void setup_per_zone_inactive_ratio(void) | 4515 | void calculate_zone_inactive_ratio(struct zone *zone) |
| 4297 | { | 4516 | { |
| 4298 | struct zone *zone; | 4517 | unsigned int gb, ratio; |
| 4299 | |||
| 4300 | for_each_zone(zone) { | ||
| 4301 | unsigned int gb, ratio; | ||
| 4302 | 4518 | ||
| 4303 | /* Zone size in gigabytes */ | 4519 | /* Zone size in gigabytes */ |
| 4304 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 4520 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
| 4521 | if (gb) | ||
| 4305 | ratio = int_sqrt(10 * gb); | 4522 | ratio = int_sqrt(10 * gb); |
| 4306 | if (!ratio) | 4523 | else |
| 4307 | ratio = 1; | 4524 | ratio = 1; |
| 4308 | 4525 | ||
| 4309 | zone->inactive_ratio = ratio; | 4526 | zone->inactive_ratio = ratio; |
| 4310 | } | 4527 | } |
| 4528 | |||
| 4529 | static void __init setup_per_zone_inactive_ratio(void) | ||
| 4530 | { | ||
| 4531 | struct zone *zone; | ||
| 4532 | |||
| 4533 | for_each_zone(zone) | ||
| 4534 | calculate_zone_inactive_ratio(zone); | ||
| 4311 | } | 4535 | } |
| 4312 | 4536 | ||
| 4313 | /* | 4537 | /* |
| @@ -4334,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void) | |||
| 4334 | * 8192MB: 11584k | 4558 | * 8192MB: 11584k |
| 4335 | * 16384MB: 16384k | 4559 | * 16384MB: 16384k |
| 4336 | */ | 4560 | */ |
| 4337 | static int __init init_per_zone_pages_min(void) | 4561 | static int __init init_per_zone_wmark_min(void) |
| 4338 | { | 4562 | { |
| 4339 | unsigned long lowmem_kbytes; | 4563 | unsigned long lowmem_kbytes; |
| 4340 | 4564 | ||
| @@ -4345,12 +4569,12 @@ static int __init init_per_zone_pages_min(void) | |||
| 4345 | min_free_kbytes = 128; | 4569 | min_free_kbytes = 128; |
| 4346 | if (min_free_kbytes > 65536) | 4570 | if (min_free_kbytes > 65536) |
| 4347 | min_free_kbytes = 65536; | 4571 | min_free_kbytes = 65536; |
| 4348 | setup_per_zone_pages_min(); | 4572 | setup_per_zone_wmarks(); |
| 4349 | setup_per_zone_lowmem_reserve(); | 4573 | setup_per_zone_lowmem_reserve(); |
| 4350 | setup_per_zone_inactive_ratio(); | 4574 | setup_per_zone_inactive_ratio(); |
| 4351 | return 0; | 4575 | return 0; |
| 4352 | } | 4576 | } |
| 4353 | module_init(init_per_zone_pages_min) | 4577 | module_init(init_per_zone_wmark_min) |
| 4354 | 4578 | ||
| 4355 | /* | 4579 | /* |
| 4356 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 4580 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
| @@ -4362,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
| 4362 | { | 4586 | { |
| 4363 | proc_dointvec(table, write, file, buffer, length, ppos); | 4587 | proc_dointvec(table, write, file, buffer, length, ppos); |
| 4364 | if (write) | 4588 | if (write) |
| 4365 | setup_per_zone_pages_min(); | 4589 | setup_per_zone_wmarks(); |
| 4366 | return 0; | 4590 | return 0; |
| 4367 | } | 4591 | } |
| 4368 | 4592 | ||
| @@ -4406,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 4406 | * whenever sysctl_lowmem_reserve_ratio changes. | 4630 | * whenever sysctl_lowmem_reserve_ratio changes. |
| 4407 | * | 4631 | * |
| 4408 | * The reserve ratio obviously has absolutely no relation with the | 4632 | * The reserve ratio obviously has absolutely no relation with the |
| 4409 | * pages_min watermarks. The lowmem reserve ratio can only make sense | 4633 | * minimum watermarks. The lowmem reserve ratio can only make sense |
| 4410 | * if in function of the boot time zone sizes. | 4634 | * if in function of the boot time zone sizes. |
| 4411 | */ | 4635 | */ |
| 4412 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4636 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
| @@ -4513,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 4513 | else if (hashdist) | 4737 | else if (hashdist) |
| 4514 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4738 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
| 4515 | else { | 4739 | else { |
| 4516 | unsigned long order = get_order(size); | ||
| 4517 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | ||
| 4518 | /* | 4740 | /* |
| 4519 | * If bucketsize is not a power-of-two, we may free | 4741 | * If bucketsize is not a power-of-two, we may free |
| 4520 | * some pages at the end of hash table. | 4742 | * some pages at the end of hash table which |
| 4743 | * alloc_pages_exact() automatically does | ||
| 4521 | */ | 4744 | */ |
| 4522 | if (table) { | 4745 | if (get_order(size) < MAX_ORDER) |
| 4523 | unsigned long alloc_end = (unsigned long)table + | 4746 | table = alloc_pages_exact(size, GFP_ATOMIC); |
| 4524 | (PAGE_SIZE << order); | ||
| 4525 | unsigned long used = (unsigned long)table + | ||
| 4526 | PAGE_ALIGN(size); | ||
| 4527 | split_page(virt_to_page(table), order); | ||
| 4528 | while (used < alloc_end) { | ||
| 4529 | free_page(used); | ||
| 4530 | used += PAGE_SIZE; | ||
| 4531 | } | ||
| 4532 | } | ||
| 4533 | } | 4747 | } |
| 4534 | } while (!table && size > PAGE_SIZE && --log2qty); | 4748 | } while (!table && size > PAGE_SIZE && --log2qty); |
| 4535 | 4749 | ||
diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e041..c6f3e5071de3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -120,7 +120,7 @@ out: | |||
| 120 | return ret; | 120 | return ret; |
| 121 | } | 121 | } |
| 122 | 122 | ||
| 123 | int swap_readpage(struct file *file, struct page *page) | 123 | int swap_readpage(struct page *page) |
| 124 | { | 124 | { |
| 125 | struct bio *bio; | 125 | struct bio *bio; |
| 126 | int ret = 0; | 126 | int ret = 0; |
diff --git a/mm/readahead.c b/mm/readahead.c index 133b6d525513..aa1aa2345235 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -133,15 +133,12 @@ out: | |||
| 133 | } | 133 | } |
| 134 | 134 | ||
| 135 | /* | 135 | /* |
| 136 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all | 136 | * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all |
| 137 | * the pages first, then submits them all for I/O. This avoids the very bad | 137 | * the pages first, then submits them all for I/O. This avoids the very bad |
| 138 | * behaviour which would occur if page allocations are causing VM writeback. | 138 | * behaviour which would occur if page allocations are causing VM writeback. |
| 139 | * We really don't want to intermingle reads and writes like that. | 139 | * We really don't want to intermingle reads and writes like that. |
| 140 | * | 140 | * |
| 141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
| 142 | * | ||
| 143 | * do_page_cache_readahead() returns -1 if it encountered request queue | ||
| 144 | * congestion. | ||
| 145 | */ | 142 | */ |
| 146 | static int | 143 | static int |
| 147 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | 144 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
| @@ -210,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 210 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 207 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
| 211 | return -EINVAL; | 208 | return -EINVAL; |
| 212 | 209 | ||
| 210 | nr_to_read = max_sane_readahead(nr_to_read); | ||
| 213 | while (nr_to_read) { | 211 | while (nr_to_read) { |
| 214 | int err; | 212 | int err; |
| 215 | 213 | ||
| @@ -231,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 231 | } | 229 | } |
| 232 | 230 | ||
| 233 | /* | 231 | /* |
| 234 | * This version skips the IO if the queue is read-congested, and will tell the | ||
| 235 | * block layer to abandon the readahead if request allocation would block. | ||
| 236 | * | ||
| 237 | * force_page_cache_readahead() will ignore queue congestion and will block on | ||
| 238 | * request queues. | ||
| 239 | */ | ||
| 240 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
| 241 | pgoff_t offset, unsigned long nr_to_read) | ||
| 242 | { | ||
| 243 | if (bdi_read_congested(mapping->backing_dev_info)) | ||
| 244 | return -1; | ||
| 245 | |||
| 246 | return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | ||
| 247 | } | ||
| 248 | |||
| 249 | /* | ||
| 250 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | 232 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
| 251 | * sensible upper limit. | 233 | * sensible upper limit. |
| 252 | */ | 234 | */ |
| @@ -259,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
| 259 | /* | 241 | /* |
| 260 | * Submit IO for the read-ahead request in file_ra_state. | 242 | * Submit IO for the read-ahead request in file_ra_state. |
| 261 | */ | 243 | */ |
| 262 | static unsigned long ra_submit(struct file_ra_state *ra, | 244 | unsigned long ra_submit(struct file_ra_state *ra, |
| 263 | struct address_space *mapping, struct file *filp) | 245 | struct address_space *mapping, struct file *filp) |
| 264 | { | 246 | { |
| 265 | int actual; | 247 | int actual; |
| @@ -348,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
| 348 | */ | 330 | */ |
| 349 | 331 | ||
| 350 | /* | 332 | /* |
| 333 | * Count contiguously cached pages from @offset-1 to @offset-@max, | ||
| 334 | * this count is a conservative estimation of | ||
| 335 | * - length of the sequential read sequence, or | ||
| 336 | * - thrashing threshold in memory tight systems | ||
| 337 | */ | ||
| 338 | static pgoff_t count_history_pages(struct address_space *mapping, | ||
| 339 | struct file_ra_state *ra, | ||
| 340 | pgoff_t offset, unsigned long max) | ||
| 341 | { | ||
| 342 | pgoff_t head; | ||
| 343 | |||
| 344 | rcu_read_lock(); | ||
| 345 | head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); | ||
| 346 | rcu_read_unlock(); | ||
| 347 | |||
| 348 | return offset - 1 - head; | ||
| 349 | } | ||
| 350 | |||
| 351 | /* | ||
| 352 | * page cache context based read-ahead | ||
| 353 | */ | ||
| 354 | static int try_context_readahead(struct address_space *mapping, | ||
| 355 | struct file_ra_state *ra, | ||
| 356 | pgoff_t offset, | ||
| 357 | unsigned long req_size, | ||
| 358 | unsigned long max) | ||
| 359 | { | ||
| 360 | pgoff_t size; | ||
| 361 | |||
| 362 | size = count_history_pages(mapping, ra, offset, max); | ||
| 363 | |||
| 364 | /* | ||
| 365 | * no history pages: | ||
| 366 | * it could be a random read | ||
| 367 | */ | ||
| 368 | if (!size) | ||
| 369 | return 0; | ||
| 370 | |||
| 371 | /* | ||
| 372 | * starts from beginning of file: | ||
| 373 | * it is a strong indication of long-run stream (or whole-file-read) | ||
| 374 | */ | ||
| 375 | if (size >= offset) | ||
| 376 | size *= 2; | ||
| 377 | |||
| 378 | ra->start = offset; | ||
| 379 | ra->size = get_init_ra_size(size + req_size, max); | ||
| 380 | ra->async_size = ra->size; | ||
| 381 | |||
| 382 | return 1; | ||
| 383 | } | ||
| 384 | |||
| 385 | /* | ||
| 351 | * A minimal readahead algorithm for trivial sequential/random reads. | 386 | * A minimal readahead algorithm for trivial sequential/random reads. |
| 352 | */ | 387 | */ |
| 353 | static unsigned long | 388 | static unsigned long |
| @@ -356,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, | |||
| 356 | bool hit_readahead_marker, pgoff_t offset, | 391 | bool hit_readahead_marker, pgoff_t offset, |
| 357 | unsigned long req_size) | 392 | unsigned long req_size) |
| 358 | { | 393 | { |
| 359 | int max = ra->ra_pages; /* max readahead pages */ | 394 | unsigned long max = max_sane_readahead(ra->ra_pages); |
| 360 | pgoff_t prev_offset; | 395 | |
| 361 | int sequential; | 396 | /* |
| 397 | * start of file | ||
| 398 | */ | ||
| 399 | if (!offset) | ||
| 400 | goto initial_readahead; | ||
| 362 | 401 | ||
| 363 | /* | 402 | /* |
| 364 | * It's the expected callback offset, assume sequential access. | 403 | * It's the expected callback offset, assume sequential access. |
| 365 | * Ramp up sizes, and push forward the readahead window. | 404 | * Ramp up sizes, and push forward the readahead window. |
| 366 | */ | 405 | */ |
| 367 | if (offset && (offset == (ra->start + ra->size - ra->async_size) || | 406 | if ((offset == (ra->start + ra->size - ra->async_size) || |
| 368 | offset == (ra->start + ra->size))) { | 407 | offset == (ra->start + ra->size))) { |
| 369 | ra->start += ra->size; | 408 | ra->start += ra->size; |
| 370 | ra->size = get_next_ra_size(ra, max); | 409 | ra->size = get_next_ra_size(ra, max); |
| 371 | ra->async_size = ra->size; | 410 | ra->async_size = ra->size; |
| 372 | goto readit; | 411 | goto readit; |
| 373 | } | 412 | } |
| 374 | 413 | ||
| 375 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
| 376 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
| 377 | |||
| 378 | /* | ||
| 379 | * Standalone, small read. | ||
| 380 | * Read as is, and do not pollute the readahead state. | ||
| 381 | */ | ||
| 382 | if (!hit_readahead_marker && !sequential) { | ||
| 383 | return __do_page_cache_readahead(mapping, filp, | ||
| 384 | offset, req_size, 0); | ||
| 385 | } | ||
| 386 | |||
| 387 | /* | 414 | /* |
| 388 | * Hit a marked page without valid readahead state. | 415 | * Hit a marked page without valid readahead state. |
| 389 | * E.g. interleaved reads. | 416 | * E.g. interleaved reads. |
| @@ -394,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, | |||
| 394 | pgoff_t start; | 421 | pgoff_t start; |
| 395 | 422 | ||
| 396 | rcu_read_lock(); | 423 | rcu_read_lock(); |
| 397 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); | 424 | start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); |
| 398 | rcu_read_unlock(); | 425 | rcu_read_unlock(); |
| 399 | 426 | ||
| 400 | if (!start || start - offset > max) | 427 | if (!start || start - offset > max) |
| @@ -402,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, | |||
| 402 | 429 | ||
| 403 | ra->start = start; | 430 | ra->start = start; |
| 404 | ra->size = start - offset; /* old async_size */ | 431 | ra->size = start - offset; /* old async_size */ |
| 432 | ra->size += req_size; | ||
| 405 | ra->size = get_next_ra_size(ra, max); | 433 | ra->size = get_next_ra_size(ra, max); |
| 406 | ra->async_size = ra->size; | 434 | ra->async_size = ra->size; |
| 407 | goto readit; | 435 | goto readit; |
| 408 | } | 436 | } |
| 409 | 437 | ||
| 410 | /* | 438 | /* |
| 411 | * It may be one of | 439 | * oversize read |
| 412 | * - first read on start of file | 440 | */ |
| 413 | * - sequential cache miss | 441 | if (req_size > max) |
| 414 | * - oversize random read | 442 | goto initial_readahead; |
| 415 | * Start readahead for it. | 443 | |
| 444 | /* | ||
| 445 | * sequential cache miss | ||
| 446 | */ | ||
| 447 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | ||
| 448 | goto initial_readahead; | ||
| 449 | |||
| 450 | /* | ||
| 451 | * Query the page cache and look for the traces(cached history pages) | ||
| 452 | * that a sequential stream would leave behind. | ||
| 453 | */ | ||
| 454 | if (try_context_readahead(mapping, ra, offset, req_size, max)) | ||
| 455 | goto readit; | ||
| 456 | |||
| 457 | /* | ||
| 458 | * standalone, small random read | ||
| 459 | * Read as is, and do not pollute the readahead state. | ||
| 416 | */ | 460 | */ |
| 461 | return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); | ||
| 462 | |||
| 463 | initial_readahead: | ||
| 417 | ra->start = offset; | 464 | ra->start = offset; |
| 418 | ra->size = get_init_ra_size(req_size, max); | 465 | ra->size = get_init_ra_size(req_size, max); |
| 419 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 466 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
| 420 | 467 | ||
| 421 | readit: | 468 | readit: |
| 469 | /* | ||
| 470 | * Will this read hit the readahead marker made by itself? | ||
| 471 | * If so, trigger the readahead marker hit now, and merge | ||
| 472 | * the resulted next readahead window into the current one. | ||
| 473 | */ | ||
| 474 | if (offset == ra->start && ra->size == ra->async_size) { | ||
| 475 | ra->async_size = get_next_ra_size(ra, max); | ||
| 476 | ra->size += ra->async_size; | ||
| 477 | } | ||
| 478 | |||
| 422 | return ra_submit(ra, mapping, filp); | 479 | return ra_submit(ra, mapping, filp); |
| 423 | } | 480 | } |
| 424 | 481 | ||
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -333,7 +333,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 333 | * repeatedly from either page_referenced_anon or page_referenced_file. | 333 | * repeatedly from either page_referenced_anon or page_referenced_file. |
| 334 | */ | 334 | */ |
| 335 | static int page_referenced_one(struct page *page, | 335 | static int page_referenced_one(struct page *page, |
| 336 | struct vm_area_struct *vma, unsigned int *mapcount) | 336 | struct vm_area_struct *vma, |
| 337 | unsigned int *mapcount, | ||
| 338 | unsigned long *vm_flags) | ||
| 337 | { | 339 | { |
| 338 | struct mm_struct *mm = vma->vm_mm; | 340 | struct mm_struct *mm = vma->vm_mm; |
| 339 | unsigned long address; | 341 | unsigned long address; |
| @@ -381,11 +383,14 @@ out_unmap: | |||
| 381 | (*mapcount)--; | 383 | (*mapcount)--; |
| 382 | pte_unmap_unlock(pte, ptl); | 384 | pte_unmap_unlock(pte, ptl); |
| 383 | out: | 385 | out: |
| 386 | if (referenced) | ||
| 387 | *vm_flags |= vma->vm_flags; | ||
| 384 | return referenced; | 388 | return referenced; |
| 385 | } | 389 | } |
| 386 | 390 | ||
| 387 | static int page_referenced_anon(struct page *page, | 391 | static int page_referenced_anon(struct page *page, |
| 388 | struct mem_cgroup *mem_cont) | 392 | struct mem_cgroup *mem_cont, |
| 393 | unsigned long *vm_flags) | ||
| 389 | { | 394 | { |
| 390 | unsigned int mapcount; | 395 | unsigned int mapcount; |
| 391 | struct anon_vma *anon_vma; | 396 | struct anon_vma *anon_vma; |
| @@ -405,7 +410,8 @@ static int page_referenced_anon(struct page *page, | |||
| 405 | */ | 410 | */ |
| 406 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 411 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 407 | continue; | 412 | continue; |
| 408 | referenced += page_referenced_one(page, vma, &mapcount); | 413 | referenced += page_referenced_one(page, vma, |
| 414 | &mapcount, vm_flags); | ||
| 409 | if (!mapcount) | 415 | if (!mapcount) |
| 410 | break; | 416 | break; |
| 411 | } | 417 | } |
| @@ -418,6 +424,7 @@ static int page_referenced_anon(struct page *page, | |||
| 418 | * page_referenced_file - referenced check for object-based rmap | 424 | * page_referenced_file - referenced check for object-based rmap |
| 419 | * @page: the page we're checking references on. | 425 | * @page: the page we're checking references on. |
| 420 | * @mem_cont: target memory controller | 426 | * @mem_cont: target memory controller |
| 427 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
| 421 | * | 428 | * |
| 422 | * For an object-based mapped page, find all the places it is mapped and | 429 | * For an object-based mapped page, find all the places it is mapped and |
| 423 | * check/clear the referenced flag. This is done by following the page->mapping | 430 | * check/clear the referenced flag. This is done by following the page->mapping |
| @@ -427,7 +434,8 @@ static int page_referenced_anon(struct page *page, | |||
| 427 | * This function is only called from page_referenced for object-based pages. | 434 | * This function is only called from page_referenced for object-based pages. |
| 428 | */ | 435 | */ |
| 429 | static int page_referenced_file(struct page *page, | 436 | static int page_referenced_file(struct page *page, |
| 430 | struct mem_cgroup *mem_cont) | 437 | struct mem_cgroup *mem_cont, |
| 438 | unsigned long *vm_flags) | ||
| 431 | { | 439 | { |
| 432 | unsigned int mapcount; | 440 | unsigned int mapcount; |
| 433 | struct address_space *mapping = page->mapping; | 441 | struct address_space *mapping = page->mapping; |
| @@ -467,7 +475,8 @@ static int page_referenced_file(struct page *page, | |||
| 467 | */ | 475 | */ |
| 468 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 476 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 469 | continue; | 477 | continue; |
| 470 | referenced += page_referenced_one(page, vma, &mapcount); | 478 | referenced += page_referenced_one(page, vma, |
| 479 | &mapcount, vm_flags); | ||
| 471 | if (!mapcount) | 480 | if (!mapcount) |
| 472 | break; | 481 | break; |
| 473 | } | 482 | } |
| @@ -481,29 +490,35 @@ static int page_referenced_file(struct page *page, | |||
| 481 | * @page: the page to test | 490 | * @page: the page to test |
| 482 | * @is_locked: caller holds lock on the page | 491 | * @is_locked: caller holds lock on the page |
| 483 | * @mem_cont: target memory controller | 492 | * @mem_cont: target memory controller |
| 493 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
| 484 | * | 494 | * |
| 485 | * Quick test_and_clear_referenced for all mappings to a page, | 495 | * Quick test_and_clear_referenced for all mappings to a page, |
| 486 | * returns the number of ptes which referenced the page. | 496 | * returns the number of ptes which referenced the page. |
| 487 | */ | 497 | */ |
| 488 | int page_referenced(struct page *page, int is_locked, | 498 | int page_referenced(struct page *page, |
| 489 | struct mem_cgroup *mem_cont) | 499 | int is_locked, |
| 500 | struct mem_cgroup *mem_cont, | ||
| 501 | unsigned long *vm_flags) | ||
| 490 | { | 502 | { |
| 491 | int referenced = 0; | 503 | int referenced = 0; |
| 492 | 504 | ||
| 493 | if (TestClearPageReferenced(page)) | 505 | if (TestClearPageReferenced(page)) |
| 494 | referenced++; | 506 | referenced++; |
| 495 | 507 | ||
| 508 | *vm_flags = 0; | ||
| 496 | if (page_mapped(page) && page->mapping) { | 509 | if (page_mapped(page) && page->mapping) { |
| 497 | if (PageAnon(page)) | 510 | if (PageAnon(page)) |
| 498 | referenced += page_referenced_anon(page, mem_cont); | 511 | referenced += page_referenced_anon(page, mem_cont, |
| 512 | vm_flags); | ||
| 499 | else if (is_locked) | 513 | else if (is_locked) |
| 500 | referenced += page_referenced_file(page, mem_cont); | 514 | referenced += page_referenced_file(page, mem_cont, |
| 515 | vm_flags); | ||
| 501 | else if (!trylock_page(page)) | 516 | else if (!trylock_page(page)) |
| 502 | referenced++; | 517 | referenced++; |
| 503 | else { | 518 | else { |
| 504 | if (page->mapping) | 519 | if (page->mapping) |
| 505 | referenced += | 520 | referenced += page_referenced_file(page, |
| 506 | page_referenced_file(page, mem_cont); | 521 | mem_cont, vm_flags); |
| 507 | unlock_page(page); | 522 | unlock_page(page); |
| 508 | } | 523 | } |
| 509 | } | 524 | } |
| @@ -1202,7 +1217,6 @@ int try_to_unmap(struct page *page, int migration) | |||
| 1202 | return ret; | 1217 | return ret; |
| 1203 | } | 1218 | } |
| 1204 | 1219 | ||
| 1205 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1206 | /** | 1220 | /** |
| 1207 | * try_to_munlock - try to munlock a page | 1221 | * try_to_munlock - try to munlock a page |
| 1208 | * @page: the page to be munlocked | 1222 | * @page: the page to be munlocked |
| @@ -1226,4 +1240,4 @@ int try_to_munlock(struct page *page) | |||
| 1226 | else | 1240 | else |
| 1227 | return try_to_unmap_file(page, 1, 0); | 1241 | return try_to_unmap_file(page, 1, 0); |
| 1228 | } | 1242 | } |
| 1229 | #endif | 1243 | |
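The rmap hunks above thread a vm_flags out-parameter through page_referenced() and its helpers, so a caller learns not only how many PTEs referenced the page but also the union of vm_flags from the mappings that actually did. The userspace model below illustrates that contract only; the flag values mirror the kernel's VM_READ/VM_EXEC, and the policy in main() (favouring pages referenced through executable mappings) is an illustrative use, not a claim about the exact reclaim-side change elsewhere in this series.

#include <stdio.h>

#define VM_READ 0x0001UL
#define VM_EXEC 0x0004UL	/* values mirror the kernel's, for flavour */

struct vma_model {
	unsigned long vm_flags;
	int referenced;		/* would come from the young/accessed PTE bit */
};

/* Model of the new contract: return the reference count and OR together
 * the vm_flags of every mapping that actually referenced the page. */
static int page_referenced_model(struct vma_model *vmas, int n,
				 unsigned long *vm_flags)
{
	int i, referenced = 0;

	*vm_flags = 0;
	for (i = 0; i < n; i++) {
		if (!vmas[i].referenced)
			continue;
		referenced++;
		*vm_flags |= vmas[i].vm_flags;
	}
	return referenced;
}

int main(void)
{
	struct vma_model vmas[] = {
		{ VM_READ,           0 },
		{ VM_READ | VM_EXEC, 1 },	/* e.g. a shared library mapping */
	};
	unsigned long vm_flags;
	int referenced = page_referenced_model(vmas, 2, &vm_flags);

	if (referenced && (vm_flags & VM_EXEC))
		printf("referenced via an executable mapping (%d refs)\n",
		       referenced);
	return 0;
}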
diff --git a/mm/shmem.c b/mm/shmem.c index 0132fbd45a23..e89d7ec18eda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -1097,7 +1097,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
| 1097 | shmem_swp_unmap(entry); | 1097 | shmem_swp_unmap(entry); |
| 1098 | unlock: | 1098 | unlock: |
| 1099 | spin_unlock(&info->lock); | 1099 | spin_unlock(&info->lock); |
| 1100 | swap_free(swap); | 1100 | swapcache_free(swap, NULL); |
| 1101 | redirty: | 1101 | redirty: |
| 1102 | set_page_dirty(page); | 1102 | set_page_dirty(page); |
| 1103 | if (wbc->for_reclaim) | 1103 | if (wbc->for_reclaim) |
| @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
| 2612 | * @size: size to be set for the file | 2612 | * @size: size to be set for the file |
| 2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size | 2613 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size |
| 2614 | */ | 2614 | */ |
| 2615 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | 2615 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) |
| 2616 | { | 2616 | { |
| 2617 | int error; | 2617 | int error; |
| 2618 | struct file *file; | 2618 | struct file *file; |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -114,6 +114,7 @@ | |||
| 114 | #include <linux/rtmutex.h> | 114 | #include <linux/rtmutex.h> |
| 115 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
| 116 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
| 117 | #include <linux/kmemcheck.h> | ||
| 117 | 118 | ||
| 118 | #include <asm/cacheflush.h> | 119 | #include <asm/cacheflush.h> |
| 119 | #include <asm/tlbflush.h> | 120 | #include <asm/tlbflush.h> |
| @@ -179,13 +180,13 @@ | |||
| 179 | SLAB_STORE_USER | \ | 180 | SLAB_STORE_USER | \ |
| 180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
| 181 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
| 182 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) | 183 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
| 183 | #else | 184 | #else |
| 184 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | 185 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
| 185 | SLAB_CACHE_DMA | \ | 186 | SLAB_CACHE_DMA | \ |
| 186 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 187 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
| 187 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 188 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
| 188 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE) | 189 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
| 189 | #endif | 190 | #endif |
| 190 | 191 | ||
| 191 | /* | 192 | /* |
| @@ -380,87 +381,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
| 380 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 381 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
| 381 | } while (0) | 382 | } while (0) |
| 382 | 383 | ||
| 383 | /* | ||
| 384 | * struct kmem_cache | ||
| 385 | * | ||
| 386 | * manages a cache. | ||
| 387 | */ | ||
| 388 | |||
| 389 | struct kmem_cache { | ||
| 390 | /* 1) per-cpu data, touched during every alloc/free */ | ||
| 391 | struct array_cache *array[NR_CPUS]; | ||
| 392 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
| 393 | unsigned int batchcount; | ||
| 394 | unsigned int limit; | ||
| 395 | unsigned int shared; | ||
| 396 | |||
| 397 | unsigned int buffer_size; | ||
| 398 | u32 reciprocal_buffer_size; | ||
| 399 | /* 3) touched by every alloc & free from the backend */ | ||
| 400 | |||
| 401 | unsigned int flags; /* constant flags */ | ||
| 402 | unsigned int num; /* # of objs per slab */ | ||
| 403 | |||
| 404 | /* 4) cache_grow/shrink */ | ||
| 405 | /* order of pgs per slab (2^n) */ | ||
| 406 | unsigned int gfporder; | ||
| 407 | |||
| 408 | /* force GFP flags, e.g. GFP_DMA */ | ||
| 409 | gfp_t gfpflags; | ||
| 410 | |||
| 411 | size_t colour; /* cache colouring range */ | ||
| 412 | unsigned int colour_off; /* colour offset */ | ||
| 413 | struct kmem_cache *slabp_cache; | ||
| 414 | unsigned int slab_size; | ||
| 415 | unsigned int dflags; /* dynamic flags */ | ||
| 416 | |||
| 417 | /* constructor func */ | ||
| 418 | void (*ctor)(void *obj); | ||
| 419 | |||
| 420 | /* 5) cache creation/removal */ | ||
| 421 | const char *name; | ||
| 422 | struct list_head next; | ||
| 423 | |||
| 424 | /* 6) statistics */ | ||
| 425 | #if STATS | ||
| 426 | unsigned long num_active; | ||
| 427 | unsigned long num_allocations; | ||
| 428 | unsigned long high_mark; | ||
| 429 | unsigned long grown; | ||
| 430 | unsigned long reaped; | ||
| 431 | unsigned long errors; | ||
| 432 | unsigned long max_freeable; | ||
| 433 | unsigned long node_allocs; | ||
| 434 | unsigned long node_frees; | ||
| 435 | unsigned long node_overflow; | ||
| 436 | atomic_t allochit; | ||
| 437 | atomic_t allocmiss; | ||
| 438 | atomic_t freehit; | ||
| 439 | atomic_t freemiss; | ||
| 440 | #endif | ||
| 441 | #if DEBUG | ||
| 442 | /* | ||
| 443 | * If debugging is enabled, then the allocator can add additional | ||
| 444 | * fields and/or padding to every object. buffer_size contains the total | ||
| 445 | * object size including these internal fields, the following two | ||
| 446 | * variables contain the offset to the user object and its size. | ||
| 447 | */ | ||
| 448 | int obj_offset; | ||
| 449 | int obj_size; | ||
| 450 | #endif | ||
| 451 | /* | ||
| 452 | * We put nodelists[] at the end of kmem_cache, because we want to size | ||
| 453 | * this array to nr_node_ids slots instead of MAX_NUMNODES | ||
| 454 | * (see kmem_cache_init()) | ||
| 455 | * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache | ||
| 456 | * is statically defined, so we reserve the max number of nodes. | ||
| 457 | */ | ||
| 458 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | ||
| 459 | /* | ||
| 460 | * Do not add fields after nodelists[] | ||
| 461 | */ | ||
| 462 | }; | ||
| 463 | |||
| 464 | #define CFLGS_OFF_SLAB (0x80000000UL) | 384 | #define CFLGS_OFF_SLAB (0x80000000UL) |
| 465 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 385 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
| 466 | 386 | ||
| @@ -898,7 +818,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
| 898 | */ | 818 | */ |
| 899 | 819 | ||
| 900 | static int use_alien_caches __read_mostly = 1; | 820 | static int use_alien_caches __read_mostly = 1; |
| 901 | static int numa_platform __read_mostly = 1; | ||
| 902 | static int __init noaliencache_setup(char *s) | 821 | static int __init noaliencache_setup(char *s) |
| 903 | { | 822 | { |
| 904 | use_alien_caches = 0; | 823 | use_alien_caches = 0; |
| @@ -1457,10 +1376,8 @@ void __init kmem_cache_init(void) | |||
| 1457 | int order; | 1376 | int order; |
| 1458 | int node; | 1377 | int node; |
| 1459 | 1378 | ||
| 1460 | if (num_possible_nodes() == 1) { | 1379 | if (num_possible_nodes() == 1) |
| 1461 | use_alien_caches = 0; | 1380 | use_alien_caches = 0; |
| 1462 | numa_platform = 0; | ||
| 1463 | } | ||
| 1464 | 1381 | ||
| 1465 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1382 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
| 1466 | kmem_list3_init(&initkmem_list3[i]); | 1383 | kmem_list3_init(&initkmem_list3[i]); |
| @@ -1707,7 +1624,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1707 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1624 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
| 1708 | flags |= __GFP_RECLAIMABLE; | 1625 | flags |= __GFP_RECLAIMABLE; |
| 1709 | 1626 | ||
| 1710 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1627 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
| 1711 | if (!page) | 1628 | if (!page) |
| 1712 | return NULL; | 1629 | return NULL; |
| 1713 | 1630 | ||
| @@ -1720,6 +1637,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1720 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1637 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
| 1721 | for (i = 0; i < nr_pages; i++) | 1638 | for (i = 0; i < nr_pages; i++) |
| 1722 | __SetPageSlab(page + i); | 1639 | __SetPageSlab(page + i); |
| 1640 | |||
| 1641 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | ||
| 1642 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | ||
| 1643 | |||
| 1644 | if (cachep->ctor) | ||
| 1645 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | ||
| 1646 | else | ||
| 1647 | kmemcheck_mark_unallocated_pages(page, nr_pages); | ||
| 1648 | } | ||
| 1649 | |||
| 1723 | return page_address(page); | 1650 | return page_address(page); |
| 1724 | } | 1651 | } |
| 1725 | 1652 | ||
| @@ -1732,6 +1659,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
| 1732 | struct page *page = virt_to_page(addr); | 1659 | struct page *page = virt_to_page(addr); |
| 1733 | const unsigned long nr_freed = i; | 1660 | const unsigned long nr_freed = i; |
| 1734 | 1661 | ||
| 1662 | kmemcheck_free_shadow(page, cachep->gfporder); | ||
| 1663 | |||
| 1735 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1664 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
| 1736 | sub_zone_page_state(page_zone(page), | 1665 | sub_zone_page_state(page_zone(page), |
| 1737 | NR_SLAB_RECLAIMABLE, nr_freed); | 1666 | NR_SLAB_RECLAIMABLE, nr_freed); |
| @@ -3261,7 +3190,7 @@ retry: | |||
| 3261 | if (local_flags & __GFP_WAIT) | 3190 | if (local_flags & __GFP_WAIT) |
| 3262 | local_irq_enable(); | 3191 | local_irq_enable(); |
| 3263 | kmem_flagcheck(cache, flags); | 3192 | kmem_flagcheck(cache, flags); |
| 3264 | obj = kmem_getpages(cache, local_flags, -1); | 3193 | obj = kmem_getpages(cache, local_flags, numa_node_id()); |
| 3265 | if (local_flags & __GFP_WAIT) | 3194 | if (local_flags & __GFP_WAIT) |
| 3266 | local_irq_disable(); | 3195 | local_irq_disable(); |
| 3267 | if (obj) { | 3196 | if (obj) { |
| @@ -3407,6 +3336,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
| 3407 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | 3336 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, |
| 3408 | flags); | 3337 | flags); |
| 3409 | 3338 | ||
| 3339 | if (likely(ptr)) | ||
| 3340 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | ||
| 3341 | |||
| 3410 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3342 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
| 3411 | memset(ptr, 0, obj_size(cachep)); | 3343 | memset(ptr, 0, obj_size(cachep)); |
| 3412 | 3344 | ||
| @@ -3467,6 +3399,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
| 3467 | flags); | 3399 | flags); |
| 3468 | prefetchw(objp); | 3400 | prefetchw(objp); |
| 3469 | 3401 | ||
| 3402 | if (likely(objp)) | ||
| 3403 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | ||
| 3404 | |||
| 3470 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3405 | if (unlikely((flags & __GFP_ZERO) && objp)) |
| 3471 | memset(objp, 0, obj_size(cachep)); | 3406 | memset(objp, 0, obj_size(cachep)); |
| 3472 | 3407 | ||
| @@ -3583,6 +3518,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3583 | kmemleak_free_recursive(objp, cachep->flags); | 3518 | kmemleak_free_recursive(objp, cachep->flags); |
| 3584 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3519 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
| 3585 | 3520 | ||
| 3521 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | ||
| 3522 | |||
| 3586 | /* | 3523 | /* |
| 3587 | * Skip calling cache_free_alien() when the platform is not numa. | 3524 | * Skip calling cache_free_alien() when the platform is not numa. |
| 3588 | * This will avoid cache misses that happen while accessing slabp (which | 3525 | * This will avoid cache misses that happen while accessing slabp (which |
| @@ -3590,7 +3527,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3590 | * variable to skip the call, which is mostly likely to be present in | 3527 | * variable to skip the call, which is mostly likely to be present in |
| 3591 | * the cache. | 3528 | * the cache. |
| 3592 | */ | 3529 | */ |
| 3593 | if (numa_platform && cache_free_alien(cachep, objp)) | 3530 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
| 3594 | return; | 3531 | return; |
| 3595 | 3532 | ||
| 3596 | if (likely(ac->avail < ac->limit)) { | 3533 | if (likely(ac->avail < ac->limit)) { |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -46,7 +46,7 @@ | |||
| 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real |
| 47 | * logic down to the page allocator, and simply doing the node accounting | 47 | * logic down to the page allocator, and simply doing the node accounting |
| 48 | * on the upper levels. In the event that a node id is explicitly | 48 | * on the upper levels. In the event that a node id is explicitly |
| 49 | * provided, alloc_pages_node() with the specified node id is used | 49 | * provided, alloc_pages_exact_node() with the specified node id is used |
| 50 | * instead. The common case (or when the node id isn't explicitly provided) | 50 | * instead. The common case (or when the node id isn't explicitly provided) |
| 51 | * will default to the current node, as per numa_node_id(). | 51 | * will default to the current node, as per numa_node_id(). |
| 52 | * | 52 | * |
| @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
| 244 | 244 | ||
| 245 | #ifdef CONFIG_NUMA | 245 | #ifdef CONFIG_NUMA |
| 246 | if (node != -1) | 246 | if (node != -1) |
| 247 | page = alloc_pages_node(node, gfp, order); | 247 | page = alloc_pages_exact_node(node, gfp, order); |
| 248 | else | 248 | else |
| 249 | #endif | 249 | #endif |
| 250 | page = alloc_pages(gfp, order); | 250 | page = alloc_pages(gfp, order); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
| 19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
| 20 | #include <linux/kmemtrace.h> | 20 | #include <linux/kmemtrace.h> |
| 21 | #include <linux/kmemcheck.h> | ||
| 21 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
| 22 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
| 23 | #include <linux/kmemleak.h> | 24 | #include <linux/kmemleak.h> |
| @@ -147,7 +148,7 @@ | |||
| 147 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 148 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) |
| 148 | 149 | ||
| 149 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 150 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
| 150 | SLAB_CACHE_DMA) | 151 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
| 151 | 152 | ||
| 152 | #ifndef ARCH_KMALLOC_MINALIGN | 153 | #ifndef ARCH_KMALLOC_MINALIGN |
| 153 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 154 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
| @@ -1071,6 +1072,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, | |||
| 1071 | { | 1072 | { |
| 1072 | int order = oo_order(oo); | 1073 | int order = oo_order(oo); |
| 1073 | 1074 | ||
| 1075 | flags |= __GFP_NOTRACK; | ||
| 1076 | |||
| 1074 | if (node == -1) | 1077 | if (node == -1) |
| 1075 | return alloc_pages(flags, order); | 1078 | return alloc_pages(flags, order); |
| 1076 | else | 1079 | else |
| @@ -1098,6 +1101,24 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1098 | 1101 | ||
| 1099 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1102 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); |
| 1100 | } | 1103 | } |
| 1104 | |||
| 1105 | if (kmemcheck_enabled | ||
| 1106 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) | ||
| 1107 | { | ||
| 1108 | int pages = 1 << oo_order(oo); | ||
| 1109 | |||
| 1110 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); | ||
| 1111 | |||
| 1112 | /* | ||
| 1113 | * Objects from caches that have a constructor don't get | ||
| 1114 | * cleared when they're allocated, so we need to do it here. | ||
| 1115 | */ | ||
| 1116 | if (s->ctor) | ||
| 1117 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
| 1118 | else | ||
| 1119 | kmemcheck_mark_unallocated_pages(page, pages); | ||
| 1120 | } | ||
| 1121 | |||
| 1101 | page->objects = oo_objects(oo); | 1122 | page->objects = oo_objects(oo); |
| 1102 | mod_zone_page_state(page_zone(page), | 1123 | mod_zone_page_state(page_zone(page), |
| 1103 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1124 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
| @@ -1171,6 +1192,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1171 | __ClearPageSlubDebug(page); | 1192 | __ClearPageSlubDebug(page); |
| 1172 | } | 1193 | } |
| 1173 | 1194 | ||
| 1195 | kmemcheck_free_shadow(page, compound_order(page)); | ||
| 1196 | |||
| 1174 | mod_zone_page_state(page_zone(page), | 1197 | mod_zone_page_state(page_zone(page), |
| 1175 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1198 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
| 1176 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1199 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
| @@ -1626,7 +1649,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
| 1626 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1649 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
| 1627 | memset(object, 0, objsize); | 1650 | memset(object, 0, objsize); |
| 1628 | 1651 | ||
| 1652 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | ||
| 1629 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1653 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); |
| 1654 | |||
| 1630 | return object; | 1655 | return object; |
| 1631 | } | 1656 | } |
| 1632 | 1657 | ||
| @@ -1759,6 +1784,7 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
| 1759 | kmemleak_free_recursive(x, s->flags); | 1784 | kmemleak_free_recursive(x, s->flags); |
| 1760 | local_irq_save(flags); | 1785 | local_irq_save(flags); |
| 1761 | c = get_cpu_slab(s, smp_processor_id()); | 1786 | c = get_cpu_slab(s, smp_processor_id()); |
| 1787 | kmemcheck_slab_free(s, object, c->objsize); | ||
| 1762 | debug_check_no_locks_freed(object, c->objsize); | 1788 | debug_check_no_locks_freed(object, c->objsize); |
| 1763 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1789 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
| 1764 | debug_check_no_obj_freed(object, c->objsize); | 1790 | debug_check_no_obj_freed(object, c->objsize); |
| @@ -2633,7 +2659,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2633 | 2659 | ||
| 2634 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2660 | if (!s || !text || !kmem_cache_open(s, flags, text, |
| 2635 | realsize, ARCH_KMALLOC_MINALIGN, | 2661 | realsize, ARCH_KMALLOC_MINALIGN, |
| 2636 | SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { | 2662 | SLAB_CACHE_DMA|SLAB_NOTRACK|__SYSFS_ADD_DEFERRED, |
| 2663 | NULL)) { | ||
| 2637 | kfree(s); | 2664 | kfree(s); |
| 2638 | kfree(text); | 2665 | kfree(text); |
| 2639 | goto unlock_out; | 2666 | goto unlock_out; |
| @@ -2727,9 +2754,10 @@ EXPORT_SYMBOL(__kmalloc); | |||
| 2727 | 2754 | ||
| 2728 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | 2755 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) |
| 2729 | { | 2756 | { |
| 2730 | struct page *page = alloc_pages_node(node, flags | __GFP_COMP, | 2757 | struct page *page; |
| 2731 | get_order(size)); | ||
| 2732 | 2758 | ||
| 2759 | flags |= __GFP_COMP | __GFP_NOTRACK; | ||
| 2760 | page = alloc_pages_node(node, flags, get_order(size)); | ||
| 2733 | if (page) | 2761 | if (page) |
| 2734 | return page_address(page); | 2762 | return page_address(page); |
| 2735 | else | 2763 | else |
| @@ -3737,7 +3765,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 3737 | to_cpumask(l->cpus)); | 3765 | to_cpumask(l->cpus)); |
| 3738 | } | 3766 | } |
| 3739 | 3767 | ||
| 3740 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3768 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
| 3741 | len < PAGE_SIZE - 60) { | 3769 | len < PAGE_SIZE - 60) { |
| 3742 | len += sprintf(buf + len, " nodes="); | 3770 | len += sprintf(buf + len, " nodes="); |
| 3743 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, | 3771 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
| @@ -4412,6 +4440,8 @@ static char *create_unique_id(struct kmem_cache *s) | |||
| 4412 | *p++ = 'a'; | 4440 | *p++ = 'a'; |
| 4413 | if (s->flags & SLAB_DEBUG_FREE) | 4441 | if (s->flags & SLAB_DEBUG_FREE) |
| 4414 | *p++ = 'F'; | 4442 | *p++ = 'F'; |
| 4443 | if (!(s->flags & SLAB_NOTRACK)) | ||
| 4444 | *p++ = 't'; | ||
| 4415 | if (p != name + 1) | 4445 | if (p != name + 1) |
| 4416 | *p++ = '-'; | 4446 | *p++ = '-'; |
| 4417 | p += sprintf(p, "%07d", s->size); | 4447 | p += sprintf(p, "%07d", s->size); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 1416e7e9e02d..42cd38eba79f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -124,7 +124,6 @@ void __delete_from_swap_cache(struct page *page) | |||
| 124 | /** | 124 | /** |
| 125 | * add_to_swap - allocate swap space for a page | 125 | * add_to_swap - allocate swap space for a page |
| 126 | * @page: page we want to move to swap | 126 | * @page: page we want to move to swap |
| 127 | * @gfp_mask: memory allocation flags | ||
| 128 | * | 127 | * |
| 129 | * Allocate swap space for the page and add the page to the | 128 | * Allocate swap space for the page and add the page to the |
| 130 | * swap cache. Caller needs to hold the page lock. | 129 | * swap cache. Caller needs to hold the page lock. |
| @@ -162,11 +161,11 @@ int add_to_swap(struct page *page) | |||
| 162 | return 1; | 161 | return 1; |
| 163 | case -EEXIST: | 162 | case -EEXIST: |
| 164 | /* Raced with "speculative" read_swap_cache_async */ | 163 | /* Raced with "speculative" read_swap_cache_async */ |
| 165 | swap_free(entry); | 164 | swapcache_free(entry, NULL); |
| 166 | continue; | 165 | continue; |
| 167 | default: | 166 | default: |
| 168 | /* -ENOMEM radix-tree allocation failure */ | 167 | /* -ENOMEM radix-tree allocation failure */ |
| 169 | swap_free(entry); | 168 | swapcache_free(entry, NULL); |
| 170 | return 0; | 169 | return 0; |
| 171 | } | 170 | } |
| 172 | } | 171 | } |
| @@ -188,8 +187,7 @@ void delete_from_swap_cache(struct page *page) | |||
| 188 | __delete_from_swap_cache(page); | 187 | __delete_from_swap_cache(page); |
| 189 | spin_unlock_irq(&swapper_space.tree_lock); | 188 | spin_unlock_irq(&swapper_space.tree_lock); |
| 190 | 189 | ||
| 191 | mem_cgroup_uncharge_swapcache(page, entry); | 190 | swapcache_free(entry, page); |
| 192 | swap_free(entry); | ||
| 193 | page_cache_release(page); | 191 | page_cache_release(page); |
| 194 | } | 192 | } |
| 195 | 193 | ||
| @@ -293,7 +291,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 293 | /* | 291 | /* |
| 294 | * Swap entry may have been freed since our caller observed it. | 292 | * Swap entry may have been freed since our caller observed it. |
| 295 | */ | 293 | */ |
| 296 | if (!swap_duplicate(entry)) | 294 | err = swapcache_prepare(entry); |
| 295 | if (err == -EEXIST) /* seems racy */ | ||
| 296 | continue; | ||
| 297 | if (err) /* swp entry is obsolete ? */ | ||
| 297 | break; | 298 | break; |
| 298 | 299 | ||
| 299 | /* | 300 | /* |
| @@ -312,12 +313,12 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 312 | * Initiate read into locked page and return. | 313 | * Initiate read into locked page and return. |
| 313 | */ | 314 | */ |
| 314 | lru_cache_add_anon(new_page); | 315 | lru_cache_add_anon(new_page); |
| 315 | swap_readpage(NULL, new_page); | 316 | swap_readpage(new_page); |
| 316 | return new_page; | 317 | return new_page; |
| 317 | } | 318 | } |
| 318 | ClearPageSwapBacked(new_page); | 319 | ClearPageSwapBacked(new_page); |
| 319 | __clear_page_locked(new_page); | 320 | __clear_page_locked(new_page); |
| 320 | swap_free(entry); | 321 | swapcache_free(entry, NULL); |
| 321 | } while (err != -ENOMEM); | 322 | } while (err != -ENOMEM); |
| 322 | 323 | ||
| 323 | if (new_page) | 324 | if (new_page) |
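swapcache_prepare() replaces swap_duplicate() here because claiming the swap-cache slot can now fail in two distinguishable ways (see the swapfile.c changes below). A hedged sketch of the caller pattern, simplified from the loop above — the real code re-checks the swap cache before retrying, and claim_swapcache_slot() is a hypothetical name:

#include <linux/swap.h>

/* Hypothetical helper: claim the SWAP_HAS_CACHE reference for an entry. */
static int claim_swapcache_slot(swp_entry_t entry)
{
        int err;

        do {
                err = swapcache_prepare(entry);
                /* -EEXIST: another task already holds the cache reference;
                 * the real loop goes back and looks the page up again. */
        } while (err == -EEXIST);

        return err;     /* 0 on success, -ENOENT/-EINVAL if the entry is gone */
}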
diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..28faa01cf578 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; | |||
| 53 | 53 | ||
| 54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
| 55 | 55 | ||
| 56 | /* For reference count accounting in swap_map */ | ||
| 57 | /* enum for swap_map[] handling. internal use only */ | ||
| 58 | enum { | ||
| 59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
| 60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
| 61 | }; | ||
| 62 | |||
| 63 | static inline int swap_count(unsigned short ent) | ||
| 64 | { | ||
| 65 | return ent & SWAP_COUNT_MASK; | ||
| 66 | } | ||
| 67 | |||
| 68 | static inline bool swap_has_cache(unsigned short ent) | ||
| 69 | { | ||
| 70 | return !!(ent & SWAP_HAS_CACHE); | ||
| 71 | } | ||
| 72 | |||
| 73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
| 74 | { | ||
| 75 | unsigned short ret = count; | ||
| 76 | |||
| 77 | if (has_cache) | ||
| 78 | return SWAP_HAS_CACHE | ret; | ||
| 79 | return ret; | ||
| 80 | } | ||
| 81 | |||
| 82 | /* returns 1 if swap entry is freed */ | ||
| 83 | static int | ||
| 84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | ||
| 85 | { | ||
| 86 | int type = si - swap_info; | ||
| 87 | swp_entry_t entry = swp_entry(type, offset); | ||
| 88 | struct page *page; | ||
| 89 | int ret = 0; | ||
| 90 | |||
| 91 | page = find_get_page(&swapper_space, entry.val); | ||
| 92 | if (!page) | ||
| 93 | return 0; | ||
| 94 | /* | ||
| 95 | * This function is called from scan_swap_map(), which in turn is | ||
| 96 | * called from vmscan.c while reclaiming pages, so a page lock may | ||
| 97 | * already be held here. We have to use trylock to avoid deadlock. | ||
| 98 | * This is a special case; in usual operations use try_to_free_swap() | ||
| 99 | * with an explicit lock_page(). | ||
| 100 | */ | ||
| 101 | if (trylock_page(page)) { | ||
| 102 | ret = try_to_free_swap(page); | ||
| 103 | unlock_page(page); | ||
| 104 | } | ||
| 105 | page_cache_release(page); | ||
| 106 | return ret; | ||
| 107 | } | ||
| 108 | |||
| 56 | /* | 109 | /* |
| 57 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
| 58 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
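The helpers above pack two facts into each unsigned short swap_map slot: a reference count from page tables in the low bits and a SWAP_HAS_CACHE flag for the swap cache. A standalone round-trip example; the two constants are defined locally with illustrative values only — the real definitions live in include/linux/swap.h:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative values only; see include/linux/swap.h for the real ones. */
#define SWAP_HAS_CACHE  0x8000   /* entry is backed by a swap-cache page */
#define SWAP_COUNT_MASK 0x7fff   /* references from page tables          */

static unsigned short encode_swapmap(int count, bool has_cache)
{
        return (has_cache ? SWAP_HAS_CACHE : 0) | (unsigned short)count;
}

static int swap_count(unsigned short ent)      { return ent & SWAP_COUNT_MASK; }
static bool swap_has_cache(unsigned short ent) { return ent & SWAP_HAS_CACHE; }

int main(void)
{
        /* Two ptes reference the entry and it sits in the swap cache. */
        unsigned short ent = encode_swapmap(2, true);
        assert(swap_count(ent) == 2 && swap_has_cache(ent));

        /* "Cache only": no pte users left, just the cache reference.
         * scan_swap_map() may reclaim such slots when swap is nearly full. */
        ent = encode_swapmap(0, true);
        printf("cache-only: %d\n", swap_count(ent) == 0 && swap_has_cache(ent));
        return 0;
}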
| @@ -167,7 +220,8 @@ static int wait_for_discard(void *word) | |||
| 167 | #define SWAPFILE_CLUSTER 256 | 220 | #define SWAPFILE_CLUSTER 256 |
| 168 | #define LATENCY_LIMIT 256 | 221 | #define LATENCY_LIMIT 256 |
| 169 | 222 | ||
| 170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 223 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
| 224 | int cache) | ||
| 171 | { | 225 | { |
| 172 | unsigned long offset; | 226 | unsigned long offset; |
| 173 | unsigned long scan_base; | 227 | unsigned long scan_base; |
| @@ -273,6 +327,19 @@ checks: | |||
| 273 | goto no_page; | 327 | goto no_page; |
| 274 | if (offset > si->highest_bit) | 328 | if (offset > si->highest_bit) |
| 275 | scan_base = offset = si->lowest_bit; | 329 | scan_base = offset = si->lowest_bit; |
| 330 | |||
| 331 | /* reuse swap entry of cache-only swap if not busy. */ | ||
| 332 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
| 333 | int swap_was_freed; | ||
| 334 | spin_unlock(&swap_lock); | ||
| 335 | swap_was_freed = __try_to_reclaim_swap(si, offset); | ||
| 336 | spin_lock(&swap_lock); | ||
| 337 | /* entry was freed successfully, try to use this again */ | ||
| 338 | if (swap_was_freed) | ||
| 339 | goto checks; | ||
| 340 | goto scan; /* check next one */ | ||
| 341 | } | ||
| 342 | |||
| 276 | if (si->swap_map[offset]) | 343 | if (si->swap_map[offset]) |
| 277 | goto scan; | 344 | goto scan; |
| 278 | 345 | ||
| @@ -285,7 +352,10 @@ checks: | |||
| 285 | si->lowest_bit = si->max; | 352 | si->lowest_bit = si->max; |
| 286 | si->highest_bit = 0; | 353 | si->highest_bit = 0; |
| 287 | } | 354 | } |
| 288 | si->swap_map[offset] = 1; | 355 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
| 356 | si->swap_map[offset] = encode_swapmap(0, true); | ||
| 357 | else /* at suspend */ | ||
| 358 | si->swap_map[offset] = encode_swapmap(1, false); | ||
| 289 | si->cluster_next = offset + 1; | 359 | si->cluster_next = offset + 1; |
| 290 | si->flags -= SWP_SCANNING; | 360 | si->flags -= SWP_SCANNING; |
| 291 | 361 | ||
| @@ -351,6 +421,10 @@ scan: | |||
| 351 | spin_lock(&swap_lock); | 421 | spin_lock(&swap_lock); |
| 352 | goto checks; | 422 | goto checks; |
| 353 | } | 423 | } |
| 424 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
| 425 | spin_lock(&swap_lock); | ||
| 426 | goto checks; | ||
| 427 | } | ||
| 354 | if (unlikely(--latency_ration < 0)) { | 428 | if (unlikely(--latency_ration < 0)) { |
| 355 | cond_resched(); | 429 | cond_resched(); |
| 356 | latency_ration = LATENCY_LIMIT; | 430 | latency_ration = LATENCY_LIMIT; |
| @@ -362,6 +436,10 @@ scan: | |||
| 362 | spin_lock(&swap_lock); | 436 | spin_lock(&swap_lock); |
| 363 | goto checks; | 437 | goto checks; |
| 364 | } | 438 | } |
| 439 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
| 440 | spin_lock(&swap_lock); | ||
| 441 | goto checks; | ||
| 442 | } | ||
| 365 | if (unlikely(--latency_ration < 0)) { | 443 | if (unlikely(--latency_ration < 0)) { |
| 366 | cond_resched(); | 444 | cond_resched(); |
| 367 | latency_ration = LATENCY_LIMIT; | 445 | latency_ration = LATENCY_LIMIT; |
| @@ -401,7 +479,8 @@ swp_entry_t get_swap_page(void) | |||
| 401 | continue; | 479 | continue; |
| 402 | 480 | ||
| 403 | swap_list.next = next; | 481 | swap_list.next = next; |
| 404 | offset = scan_swap_map(si); | 482 | /* This is called for allocating swap entry for cache */ |
| 483 | offset = scan_swap_map(si, SWAP_CACHE); | ||
| 405 | if (offset) { | 484 | if (offset) { |
| 406 | spin_unlock(&swap_lock); | 485 | spin_unlock(&swap_lock); |
| 407 | return swp_entry(type, offset); | 486 | return swp_entry(type, offset); |
| @@ -415,6 +494,7 @@ noswap: | |||
| 415 | return (swp_entry_t) {0}; | 494 | return (swp_entry_t) {0}; |
| 416 | } | 495 | } |
| 417 | 496 | ||
| 497 | /* The only caller of this function is now the suspend routine */ | ||
| 418 | swp_entry_t get_swap_page_of_type(int type) | 498 | swp_entry_t get_swap_page_of_type(int type) |
| 419 | { | 499 | { |
| 420 | struct swap_info_struct *si; | 500 | struct swap_info_struct *si; |
| @@ -424,7 +504,8 @@ swp_entry_t get_swap_page_of_type(int type) | |||
| 424 | si = swap_info + type; | 504 | si = swap_info + type; |
| 425 | if (si->flags & SWP_WRITEOK) { | 505 | if (si->flags & SWP_WRITEOK) { |
| 426 | nr_swap_pages--; | 506 | nr_swap_pages--; |
| 427 | offset = scan_swap_map(si); | 507 | /* This is called for allocating swap entry, not cache */ |
| 508 | offset = scan_swap_map(si, SWAP_MAP); | ||
| 428 | if (offset) { | 509 | if (offset) { |
| 429 | spin_unlock(&swap_lock); | 510 | spin_unlock(&swap_lock); |
| 430 | return swp_entry(type, offset); | 511 | return swp_entry(type, offset); |
| @@ -471,25 +552,38 @@ out: | |||
| 471 | return NULL; | 552 | return NULL; |
| 472 | } | 553 | } |
| 473 | 554 | ||
| 474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) | 555 | static int swap_entry_free(struct swap_info_struct *p, |
| 556 | swp_entry_t ent, int cache) | ||
| 475 | { | 557 | { |
| 476 | unsigned long offset = swp_offset(ent); | 558 | unsigned long offset = swp_offset(ent); |
| 477 | int count = p->swap_map[offset]; | 559 | int count = swap_count(p->swap_map[offset]); |
| 478 | 560 | bool has_cache; | |
| 479 | if (count < SWAP_MAP_MAX) { | 561 | |
| 480 | count--; | 562 | has_cache = swap_has_cache(p->swap_map[offset]); |
| 481 | p->swap_map[offset] = count; | 563 | |
| 482 | if (!count) { | 564 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
| 483 | if (offset < p->lowest_bit) | 565 | if (count < SWAP_MAP_MAX) { |
| 484 | p->lowest_bit = offset; | 566 | count--; |
| 485 | if (offset > p->highest_bit) | 567 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
| 486 | p->highest_bit = offset; | ||
| 487 | if (p->prio > swap_info[swap_list.next].prio) | ||
| 488 | swap_list.next = p - swap_info; | ||
| 489 | nr_swap_pages++; | ||
| 490 | p->inuse_pages--; | ||
| 491 | mem_cgroup_uncharge_swap(ent); | ||
| 492 | } | 568 | } |
| 569 | } else { /* dropping swap cache flag */ | ||
| 570 | VM_BUG_ON(!has_cache); | ||
| 571 | p->swap_map[offset] = encode_swapmap(count, false); | ||
| 572 | |||
| 573 | } | ||
| 574 | /* return code. */ | ||
| 575 | count = p->swap_map[offset]; | ||
| 576 | /* free if no reference */ | ||
| 577 | if (!count) { | ||
| 578 | if (offset < p->lowest_bit) | ||
| 579 | p->lowest_bit = offset; | ||
| 580 | if (offset > p->highest_bit) | ||
| 581 | p->highest_bit = offset; | ||
| 582 | if (p->prio > swap_info[swap_list.next].prio) | ||
| 583 | swap_list.next = p - swap_info; | ||
| 584 | nr_swap_pages++; | ||
| 585 | p->inuse_pages--; | ||
| 586 | mem_cgroup_uncharge_swap(ent); | ||
| 493 | } | 587 | } |
| 494 | return count; | 588 | return count; |
| 495 | } | 589 | } |
| @@ -504,9 +598,26 @@ void swap_free(swp_entry_t entry) | |||
| 504 | 598 | ||
| 505 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
| 506 | if (p) { | 600 | if (p) { |
| 507 | swap_entry_free(p, entry); | 601 | swap_entry_free(p, entry, SWAP_MAP); |
| 602 | spin_unlock(&swap_lock); | ||
| 603 | } | ||
| 604 | } | ||
| 605 | |||
| 606 | /* | ||
| 607 | * Called after dropping swapcache to decrease refcnt to swap entries. | ||
| 608 | */ | ||
| 609 | void swapcache_free(swp_entry_t entry, struct page *page) | ||
| 610 | { | ||
| 611 | struct swap_info_struct *p; | ||
| 612 | |||
| 613 | if (page) | ||
| 614 | mem_cgroup_uncharge_swapcache(page, entry); | ||
| 615 | p = swap_info_get(entry); | ||
| 616 | if (p) { | ||
| 617 | swap_entry_free(p, entry, SWAP_CACHE); | ||
| 508 | spin_unlock(&swap_lock); | 618 | spin_unlock(&swap_lock); |
| 509 | } | 619 | } |
| 620 | return; | ||
| 510 | } | 621 | } |
| 511 | 622 | ||
| 512 | /* | 623 | /* |
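With swap_entry_free() now taking a SWAP_MAP/SWAP_CACHE argument, the two kinds of reference are dropped through different entry points. A hedged sketch of the two call sites as they look after this patch; drop_pte_reference() and drop_cache_reference() are hypothetical names (real callers are e.g. do_swap_page() for swap_free(), and delete_from_swap_cache()/__remove_mapping() for swapcache_free()):

#include <linux/swap.h>

/* Hypothetical: a pte that referenced the entry has been resolved or zapped. */
static void drop_pte_reference(swp_entry_t entry)
{
        swap_free(entry);               /* decrements the SWAP_MAP count */
}

/* Hypothetical: the page is being removed from the swap cache. */
static void drop_cache_reference(struct page *page, swp_entry_t entry)
{
        /*
         * Clears SWAP_HAS_CACHE; passing the page also lets the memory
         * controller uncharge the swap-cache charge in the same call.
         */
        swapcache_free(entry, page);
}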
| @@ -521,8 +632,7 @@ static inline int page_swapcount(struct page *page) | |||
| 521 | entry.val = page_private(page); | 632 | entry.val = page_private(page); |
| 522 | p = swap_info_get(entry); | 633 | p = swap_info_get(entry); |
| 523 | if (p) { | 634 | if (p) { |
| 524 | /* Subtract the 1 for the swap cache itself */ | 635 | count = swap_count(p->swap_map[swp_offset(entry)]); |
| 525 | count = p->swap_map[swp_offset(entry)] - 1; | ||
| 526 | spin_unlock(&swap_lock); | 636 | spin_unlock(&swap_lock); |
| 527 | } | 637 | } |
| 528 | return count; | 638 | return count; |
| @@ -584,7 +694,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
| 584 | 694 | ||
| 585 | p = swap_info_get(entry); | 695 | p = swap_info_get(entry); |
| 586 | if (p) { | 696 | if (p) { |
| 587 | if (swap_entry_free(p, entry) == 1) { | 697 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
| 588 | page = find_get_page(&swapper_space, entry.val); | 698 | page = find_get_page(&swapper_space, entry.val); |
| 589 | if (page && !trylock_page(page)) { | 699 | if (page && !trylock_page(page)) { |
| 590 | page_cache_release(page); | 700 | page_cache_release(page); |
| @@ -891,7 +1001,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 891 | i = 1; | 1001 | i = 1; |
| 892 | } | 1002 | } |
| 893 | count = si->swap_map[i]; | 1003 | count = si->swap_map[i]; |
| 894 | if (count && count != SWAP_MAP_BAD) | 1004 | if (count && swap_count(count) != SWAP_MAP_BAD) |
| 895 | break; | 1005 | break; |
| 896 | } | 1006 | } |
| 897 | return i; | 1007 | return i; |
| @@ -995,13 +1105,13 @@ static int try_to_unuse(unsigned int type) | |||
| 995 | */ | 1105 | */ |
| 996 | shmem = 0; | 1106 | shmem = 0; |
| 997 | swcount = *swap_map; | 1107 | swcount = *swap_map; |
| 998 | if (swcount > 1) { | 1108 | if (swap_count(swcount)) { |
| 999 | if (start_mm == &init_mm) | 1109 | if (start_mm == &init_mm) |
| 1000 | shmem = shmem_unuse(entry, page); | 1110 | shmem = shmem_unuse(entry, page); |
| 1001 | else | 1111 | else |
| 1002 | retval = unuse_mm(start_mm, entry, page); | 1112 | retval = unuse_mm(start_mm, entry, page); |
| 1003 | } | 1113 | } |
| 1004 | if (*swap_map > 1) { | 1114 | if (swap_count(*swap_map)) { |
| 1005 | int set_start_mm = (*swap_map >= swcount); | 1115 | int set_start_mm = (*swap_map >= swcount); |
| 1006 | struct list_head *p = &start_mm->mmlist; | 1116 | struct list_head *p = &start_mm->mmlist; |
| 1007 | struct mm_struct *new_start_mm = start_mm; | 1117 | struct mm_struct *new_start_mm = start_mm; |
| @@ -1011,7 +1121,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1011 | atomic_inc(&new_start_mm->mm_users); | 1121 | atomic_inc(&new_start_mm->mm_users); |
| 1012 | atomic_inc(&prev_mm->mm_users); | 1122 | atomic_inc(&prev_mm->mm_users); |
| 1013 | spin_lock(&mmlist_lock); | 1123 | spin_lock(&mmlist_lock); |
| 1014 | while (*swap_map > 1 && !retval && !shmem && | 1124 | while (swap_count(*swap_map) && !retval && !shmem && |
| 1015 | (p = p->next) != &start_mm->mmlist) { | 1125 | (p = p->next) != &start_mm->mmlist) { |
| 1016 | mm = list_entry(p, struct mm_struct, mmlist); | 1126 | mm = list_entry(p, struct mm_struct, mmlist); |
| 1017 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1127 | if (!atomic_inc_not_zero(&mm->mm_users)) |
| @@ -1023,14 +1133,16 @@ static int try_to_unuse(unsigned int type) | |||
| 1023 | cond_resched(); | 1133 | cond_resched(); |
| 1024 | 1134 | ||
| 1025 | swcount = *swap_map; | 1135 | swcount = *swap_map; |
| 1026 | if (swcount <= 1) | 1136 | if (!swap_count(swcount)) /* any usage ? */ |
| 1027 | ; | 1137 | ; |
| 1028 | else if (mm == &init_mm) { | 1138 | else if (mm == &init_mm) { |
| 1029 | set_start_mm = 1; | 1139 | set_start_mm = 1; |
| 1030 | shmem = shmem_unuse(entry, page); | 1140 | shmem = shmem_unuse(entry, page); |
| 1031 | } else | 1141 | } else |
| 1032 | retval = unuse_mm(mm, entry, page); | 1142 | retval = unuse_mm(mm, entry, page); |
| 1033 | if (set_start_mm && *swap_map < swcount) { | 1143 | |
| 1144 | if (set_start_mm && | ||
| 1145 | swap_count(*swap_map) < swcount) { | ||
| 1034 | mmput(new_start_mm); | 1146 | mmput(new_start_mm); |
| 1035 | atomic_inc(&mm->mm_users); | 1147 | atomic_inc(&mm->mm_users); |
| 1036 | new_start_mm = mm; | 1148 | new_start_mm = mm; |
| @@ -1057,21 +1169,25 @@ static int try_to_unuse(unsigned int type) | |||
| 1057 | } | 1169 | } |
| 1058 | 1170 | ||
| 1059 | /* | 1171 | /* |
| 1060 | * How could swap count reach 0x7fff when the maximum | 1172 | * How could swap count reach 0x7ffe ? |
| 1061 | * pid is 0x7fff, and there's no way to repeat a swap | 1173 | * There's no way to repeat a swap page within an mm |
| 1062 | * page within an mm (except in shmem, where it's the | 1174 | * (except in shmem, where it's the shared object which takes |
| 1063 | * shared object which takes the reference count)? | 1175 | * the reference count)? |
| 1064 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | 1176 | * We believe SWAP_MAP_MAX cannot occur (if it did, unsigned |
| 1065 | * | 1177 | * short would be too small). |
| 1066 | * If that's wrong, then we should worry more about | 1178 | * If that's wrong, then we should worry more about |
| 1067 | * exit_mmap() and do_munmap() cases described above: | 1179 | * exit_mmap() and do_munmap() cases described above: |
| 1068 | * we might be resetting SWAP_MAP_MAX too early here. | 1180 | * we might be resetting SWAP_MAP_MAX too early here. |
| 1069 | * We know "Undead"s can happen, they're okay, so don't | 1181 | * We know "Undead"s can happen, they're okay, so don't |
| 1070 | * report them; but do report if we reset SWAP_MAP_MAX. | 1182 | * report them; but do report if we reset SWAP_MAP_MAX. |
| 1071 | */ | 1183 | */ |
| 1072 | if (*swap_map == SWAP_MAP_MAX) { | 1184 | /* We might release the lock_page() in unuse_mm(). */ |
| 1185 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
| 1186 | goto retry; | ||
| 1187 | |||
| 1188 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
| 1073 | spin_lock(&swap_lock); | 1189 | spin_lock(&swap_lock); |
| 1074 | *swap_map = 1; | 1190 | *swap_map = encode_swapmap(0, true); |
| 1075 | spin_unlock(&swap_lock); | 1191 | spin_unlock(&swap_lock); |
| 1076 | reset_overflow = 1; | 1192 | reset_overflow = 1; |
| 1077 | } | 1193 | } |
| @@ -1089,7 +1205,8 @@ static int try_to_unuse(unsigned int type) | |||
| 1089 | * pages would be incorrect if swap supported "shared | 1205 | * pages would be incorrect if swap supported "shared |
| 1090 | * private" pages, but they are handled by tmpfs files. | 1206 | * private" pages, but they are handled by tmpfs files. |
| 1091 | */ | 1207 | */ |
| 1092 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 1208 | if (swap_count(*swap_map) && |
| 1209 | PageDirty(page) && PageSwapCache(page)) { | ||
| 1093 | struct writeback_control wbc = { | 1210 | struct writeback_control wbc = { |
| 1094 | .sync_mode = WB_SYNC_NONE, | 1211 | .sync_mode = WB_SYNC_NONE, |
| 1095 | }; | 1212 | }; |
| @@ -1116,6 +1233,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1116 | * mark page dirty so shrink_page_list will preserve it. | 1233 | * mark page dirty so shrink_page_list will preserve it. |
| 1117 | */ | 1234 | */ |
| 1118 | SetPageDirty(page); | 1235 | SetPageDirty(page); |
| 1236 | retry: | ||
| 1119 | unlock_page(page); | 1237 | unlock_page(page); |
| 1120 | page_cache_release(page); | 1238 | page_cache_release(page); |
| 1121 | 1239 | ||
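The try_to_unuse() changes above all follow from the same shift: the swap cache no longer contributes one unit to the map count, so "still referenced by some mm" is tested with swap_count() instead of comparing the raw value against 1. A tiny predicate making that explicit, assuming the file-local swap_count() helper introduced earlier in this patch is in scope:

/*
 * Before this patch: in use  <=>  *swap_map > 1   (the cache held one unit).
 * After this patch:  in use  <=>  swap_count(*swap_map) != 0.
 */
static inline bool entry_still_mapped(unsigned short ent)
{
        return swap_count(ent) != 0;
}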
| @@ -1942,15 +2060,23 @@ void si_swapinfo(struct sysinfo *val) | |||
| 1942 | * | 2060 | * |
| 1943 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2061 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
| 1944 | * "permanent", but will be reclaimed by the next swapoff. | 2062 | * "permanent", but will be reclaimed by the next swapoff. |
| 2063 | * Returns error code in following case. | ||
| 2064 | * - success -> 0 | ||
| 2065 | * - swp_entry is invalid -> EINVAL | ||
| 2066 | * - swp_entry is migration entry -> EINVAL | ||
| 2067 | * - swap-cache reference is requested but there is already one. -> EEXIST | ||
| 2068 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | ||
| 1945 | */ | 2069 | */ |
| 1946 | int swap_duplicate(swp_entry_t entry) | 2070 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
| 1947 | { | 2071 | { |
| 1948 | struct swap_info_struct * p; | 2072 | struct swap_info_struct * p; |
| 1949 | unsigned long offset, type; | 2073 | unsigned long offset, type; |
| 1950 | int result = 0; | 2074 | int result = -EINVAL; |
| 2075 | int count; | ||
| 2076 | bool has_cache; | ||
| 1951 | 2077 | ||
| 1952 | if (is_migration_entry(entry)) | 2078 | if (is_migration_entry(entry)) |
| 1953 | return 1; | 2079 | return -EINVAL; |
| 1954 | 2080 | ||
| 1955 | type = swp_type(entry); | 2081 | type = swp_type(entry); |
| 1956 | if (type >= nr_swapfiles) | 2082 | if (type >= nr_swapfiles) |
| @@ -1959,17 +2085,40 @@ int swap_duplicate(swp_entry_t entry) | |||
| 1959 | offset = swp_offset(entry); | 2085 | offset = swp_offset(entry); |
| 1960 | 2086 | ||
| 1961 | spin_lock(&swap_lock); | 2087 | spin_lock(&swap_lock); |
| 1962 | if (offset < p->max && p->swap_map[offset]) { | 2088 | |
| 1963 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | 2089 | if (unlikely(offset >= p->max)) |
| 1964 | p->swap_map[offset]++; | 2090 | goto unlock_out; |
| 1965 | result = 1; | 2091 | |
| 1966 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | 2092 | count = swap_count(p->swap_map[offset]); |
| 2093 | has_cache = swap_has_cache(p->swap_map[offset]); | ||
| 2094 | |||
| 2095 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | ||
| 2096 | |||
| 2097 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | ||
| 2098 | if (!has_cache && count) { | ||
| 2099 | p->swap_map[offset] = encode_swapmap(count, true); | ||
| 2100 | result = 0; | ||
| 2101 | } else if (has_cache) /* someone added cache */ | ||
| 2102 | result = -EEXIST; | ||
| 2103 | else if (!count) /* no users */ | ||
| 2104 | result = -ENOENT; | ||
| 2105 | |||
| 2106 | } else if (count || has_cache) { | ||
| 2107 | if (count < SWAP_MAP_MAX - 1) { | ||
| 2108 | p->swap_map[offset] = encode_swapmap(count + 1, | ||
| 2109 | has_cache); | ||
| 2110 | result = 0; | ||
| 2111 | } else if (count <= SWAP_MAP_MAX) { | ||
| 1967 | if (swap_overflow++ < 5) | 2112 | if (swap_overflow++ < 5) |
| 1968 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | 2113 | printk(KERN_WARNING |
| 1969 | p->swap_map[offset] = SWAP_MAP_MAX; | 2114 | "swap_dup: swap entry overflow\n"); |
| 1970 | result = 1; | 2115 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
| 2116 | has_cache); | ||
| 2117 | result = 0; | ||
| 1971 | } | 2118 | } |
| 1972 | } | 2119 | } else |
| 2120 | result = -ENOENT; /* unused swap entry */ | ||
| 2121 | unlock_out: | ||
| 1973 | spin_unlock(&swap_lock); | 2122 | spin_unlock(&swap_lock); |
| 1974 | out: | 2123 | out: |
| 1975 | return result; | 2124 | return result; |
| @@ -1978,6 +2127,27 @@ bad_file: | |||
| 1978 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2127 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
| 1979 | goto out; | 2128 | goto out; |
| 1980 | } | 2129 | } |
| 2130 | /* | ||
| 2131 | * increase reference count of swap entry by 1. | ||
| 2132 | */ | ||
| 2133 | void swap_duplicate(swp_entry_t entry) | ||
| 2134 | { | ||
| 2135 | __swap_duplicate(entry, SWAP_MAP); | ||
| 2136 | } | ||
| 2137 | |||
| 2138 | /* | ||
| 2139 | * @entry: swap entry for which we allocate swap cache. | ||
| 2140 | * | ||
| 2141 | * Called when allocating swap cache for an existing swap entry. | ||
| 2142 | * This can return error codes; returns 0 on success. | ||
| 2143 | * -EEXIST means there is already a swap cache entry for it. | ||
| 2144 | * Note: return code is different from swap_duplicate(). | ||
| 2145 | */ | ||
| 2146 | int swapcache_prepare(swp_entry_t entry) | ||
| 2147 | { | ||
| 2148 | return __swap_duplicate(entry, SWAP_CACHE); | ||
| 2149 | } | ||
| 2150 | |||
| 1981 | 2151 | ||
| 1982 | struct swap_info_struct * | 2152 | struct swap_info_struct * |
| 1983 | get_swap_info_struct(unsigned type) | 2153 | get_swap_info_struct(unsigned type) |
| @@ -2016,7 +2186,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
| 2016 | /* Don't read in free or bad pages */ | 2186 | /* Don't read in free or bad pages */ |
| 2017 | if (!si->swap_map[toff]) | 2187 | if (!si->swap_map[toff]) |
| 2018 | break; | 2188 | break; |
| 2019 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2189 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
| 2020 | break; | 2190 | break; |
| 2021 | } | 2191 | } |
| 2022 | /* Count contiguous allocated slots below our target */ | 2192 | /* Count contiguous allocated slots below our target */ |
| @@ -2024,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
| 2024 | /* Don't read in free or bad pages */ | 2194 | /* Don't read in free or bad pages */ |
| 2025 | if (!si->swap_map[toff]) | 2195 | if (!si->swap_map[toff]) |
| 2026 | break; | 2196 | break; |
| 2027 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2197 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
| 2028 | break; | 2198 | break; |
| 2029 | } | 2199 | } |
| 2030 | spin_unlock(&swap_lock); | 2200 | spin_unlock(&swap_lock); |
diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f9165..ccc3ecf7cb98 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
| 267 | } | 267 | } |
| 268 | EXPORT_SYMBOL(truncate_inode_pages); | 268 | EXPORT_SYMBOL(truncate_inode_pages); |
| 269 | 269 | ||
| 270 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, | 270 | /** |
| 271 | pgoff_t start, pgoff_t end, bool be_atomic) | 271 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode |
| 272 | * @mapping: the address_space which holds the pages to invalidate | ||
| 273 | * @start: the offset 'from' which to invalidate | ||
| 274 | * @end: the offset 'to' which to invalidate (inclusive) | ||
| 275 | * | ||
| 276 | * This function only removes the unlocked pages, if you want to | ||
| 277 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
| 278 | * | ||
| 279 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
| 280 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
| 281 | * pagetables. | ||
| 282 | */ | ||
| 283 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
| 284 | pgoff_t start, pgoff_t end) | ||
| 272 | { | 285 | { |
| 273 | struct pagevec pvec; | 286 | struct pagevec pvec; |
| 274 | pgoff_t next = start; | 287 | pgoff_t next = start; |
| @@ -309,30 +322,10 @@ unlock: | |||
| 309 | break; | 322 | break; |
| 310 | } | 323 | } |
| 311 | pagevec_release(&pvec); | 324 | pagevec_release(&pvec); |
| 312 | if (likely(!be_atomic)) | 325 | cond_resched(); |
| 313 | cond_resched(); | ||
| 314 | } | 326 | } |
| 315 | return ret; | 327 | return ret; |
| 316 | } | 328 | } |
| 317 | |||
| 318 | /** | ||
| 319 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
| 320 | * @mapping: the address_space which holds the pages to invalidate | ||
| 321 | * @start: the offset 'from' which to invalidate | ||
| 322 | * @end: the offset 'to' which to invalidate (inclusive) | ||
| 323 | * | ||
| 324 | * This function only removes the unlocked pages, if you want to | ||
| 325 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
| 326 | * | ||
| 327 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
| 328 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
| 329 | * pagetables. | ||
| 330 | */ | ||
| 331 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
| 332 | pgoff_t start, pgoff_t end) | ||
| 333 | { | ||
| 334 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
| 335 | } | ||
| 336 | EXPORT_SYMBOL(invalidate_mapping_pages); | 329 | EXPORT_SYMBOL(invalidate_mapping_pages); |
| 337 | 330 | ||
| 338 | /* | 331 | /* |
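With the __invalidate_mapping_pages() wrapper gone, invalidate_mapping_pages() is the single exported entry point documented above. A hedged usage sketch — drop_clean_pagecache() is a hypothetical helper showing the common "whole mapping" call that, as documented, skips dirty, locked, writeback and mapped pages:

#include <linux/fs.h>

/* Hypothetical: best-effort drop of an inode's clean, unused page cache. */
static unsigned long drop_clean_pagecache(struct inode *inode)
{
        /* Offsets 0..-1 cover every possible page index of the mapping. */
        return invalidate_mapping_pages(inode->i_mapping, 0, -1);
}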
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -233,13 +233,21 @@ | |||
| 233 | * @pages: array that receives pointers to the pages pinned. | 233 | * @pages: array that receives pointers to the pages pinned. |
| 234 | * Should be at least nr_pages long. | 234 | * Should be at least nr_pages long. |
| 235 | * | 235 | * |
| 236 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
| 237 | * If not successful, it will fall back to taking the lock and | ||
| 238 | * calling get_user_pages(). | ||
| 239 | * | ||
| 240 | * Returns number of pages pinned. This may be fewer than the number | 236 | * Returns number of pages pinned. This may be fewer than the number |
| 241 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | 237 | * requested. If nr_pages is 0 or negative, returns 0. If no pages |
| 242 | * were pinned, returns -errno. | 238 | * were pinned, returns -errno. |
| 239 | * | ||
| 240 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | ||
| 241 | * operating on current and current->mm, with force=0 and vma=NULL. However | ||
| 242 | * unlike get_user_pages, it must be called without mmap_sem held. | ||
| 243 | * | ||
| 244 | * get_user_pages_fast may take mmap_sem and page table locks, so no | ||
| 245 | * assumptions can be made about lack of locking. get_user_pages_fast is to be | ||
| 246 | * implemented in a way that is advantageous (vs get_user_pages()) when the | ||
| 247 | * user memory area is already faulted in and present in ptes. However if the | ||
| 248 | * pages have to be faulted in, it may turn out to be slightly slower so | ||
| 249 | * callers need to carefully consider what to use. On many architectures, | ||
| 250 | * get_user_pages_fast simply falls back to get_user_pages. | ||
| 243 | */ | 251 | */ |
| 244 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 252 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, |
| 245 | int nr_pages, int write, struct page **pages) | 253 | int nr_pages, int write, struct page **pages) |
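The rewritten comment spells out the contract; the sketch below shows a typical caller under those rules. pin_user_buffer() is a hypothetical helper, the page array is assumed to be large enough, and every page pinned must later be released with put_page():

#include <linux/mm.h>
#include <linux/kernel.h>

/* Hypothetical: pin the pages backing a user buffer, use them, release. */
static int pin_user_buffer(unsigned long uaddr, int nr_pages,
                           struct page **pages)
{
        int pinned, i;

        /* Must not be called with mmap_sem held; it may take it internally. */
        pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
        if (pinned <= 0)
                return pinned ? pinned : -EFAULT;

        /* ... access pages[0..pinned-1]; it may be fewer than requested ... */

        for (i = 0; i < pinned; i++)
                put_page(pages[i]);
        return pinned;
}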
diff --git a/mm/vmscan.c b/mm/vmscan.c index 95c08a8cc2ba..4139aa52b941 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
| 470 | swp_entry_t swap = { .val = page_private(page) }; | 470 | swp_entry_t swap = { .val = page_private(page) }; |
| 471 | __delete_from_swap_cache(page); | 471 | __delete_from_swap_cache(page); |
| 472 | spin_unlock_irq(&mapping->tree_lock); | 472 | spin_unlock_irq(&mapping->tree_lock); |
| 473 | mem_cgroup_uncharge_swapcache(page, swap); | 473 | swapcache_free(swap, page); |
| 474 | swap_free(swap); | ||
| 475 | } else { | 474 | } else { |
| 476 | __remove_from_page_cache(page); | 475 | __remove_from_page_cache(page); |
| 477 | spin_unlock_irq(&mapping->tree_lock); | 476 | spin_unlock_irq(&mapping->tree_lock); |
| @@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
| 514 | * | 513 | * |
| 515 | * lru_lock must not be held, interrupts must be enabled. | 514 | * lru_lock must not be held, interrupts must be enabled. |
| 516 | */ | 515 | */ |
| 517 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 518 | void putback_lru_page(struct page *page) | 516 | void putback_lru_page(struct page *page) |
| 519 | { | 517 | { |
| 520 | int lru; | 518 | int lru; |
| @@ -568,20 +566,6 @@ redo: | |||
| 568 | put_page(page); /* drop ref from isolate */ | 566 | put_page(page); /* drop ref from isolate */ |
| 569 | } | 567 | } |
| 570 | 568 | ||
| 571 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
| 572 | |||
| 573 | void putback_lru_page(struct page *page) | ||
| 574 | { | ||
| 575 | int lru; | ||
| 576 | VM_BUG_ON(PageLRU(page)); | ||
| 577 | |||
| 578 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
| 579 | lru_cache_add_lru(page, lru); | ||
| 580 | put_page(page); | ||
| 581 | } | ||
| 582 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
| 583 | |||
| 584 | |||
| 585 | /* | 569 | /* |
| 586 | * shrink_page_list() returns the number of reclaimed pages | 570 | * shrink_page_list() returns the number of reclaimed pages |
| 587 | */ | 571 | */ |
| @@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 593 | struct pagevec freed_pvec; | 577 | struct pagevec freed_pvec; |
| 594 | int pgactivate = 0; | 578 | int pgactivate = 0; |
| 595 | unsigned long nr_reclaimed = 0; | 579 | unsigned long nr_reclaimed = 0; |
| 580 | unsigned long vm_flags; | ||
| 596 | 581 | ||
| 597 | cond_resched(); | 582 | cond_resched(); |
| 598 | 583 | ||
| @@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 643 | goto keep_locked; | 628 | goto keep_locked; |
| 644 | } | 629 | } |
| 645 | 630 | ||
| 646 | referenced = page_referenced(page, 1, sc->mem_cgroup); | 631 | referenced = page_referenced(page, 1, |
| 632 | sc->mem_cgroup, &vm_flags); | ||
| 647 | /* In active use or really unfreeable? Activate it. */ | 633 | /* In active use or really unfreeable? Activate it. */ |
| 648 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 634 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
| 649 | referenced && page_mapping_inuse(page)) | 635 | referenced && page_mapping_inuse(page)) |
| @@ -943,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 943 | /* Check that we have not crossed a zone boundary. */ | 929 | /* Check that we have not crossed a zone boundary. */ |
| 944 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 930 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
| 945 | continue; | 931 | continue; |
| 946 | switch (__isolate_lru_page(cursor_page, mode, file)) { | 932 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
| 947 | case 0: | ||
| 948 | list_move(&cursor_page->lru, dst); | 933 | list_move(&cursor_page->lru, dst); |
| 949 | nr_taken++; | 934 | nr_taken++; |
| 950 | scan++; | 935 | scan++; |
| 951 | break; | ||
| 952 | |||
| 953 | case -EBUSY: | ||
| 954 | /* else it is being freed elsewhere */ | ||
| 955 | list_move(&cursor_page->lru, src); | ||
| 956 | default: | ||
| 957 | break; /* ! on LRU or wrong list */ | ||
| 958 | } | 936 | } |
| 959 | } | 937 | } |
| 960 | } | 938 | } |
| @@ -1061,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1061 | unsigned long nr_scanned = 0; | 1039 | unsigned long nr_scanned = 0; |
| 1062 | unsigned long nr_reclaimed = 0; | 1040 | unsigned long nr_reclaimed = 0; |
| 1063 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1041 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1042 | int lumpy_reclaim = 0; | ||
| 1043 | |||
| 1044 | /* | ||
| 1045 | * If we need a large contiguous chunk of memory, or have | ||
| 1046 | * trouble getting a small set of contiguous pages, we | ||
| 1047 | * will reclaim both active and inactive pages. | ||
| 1048 | * | ||
| 1049 | * We use the same threshold as pageout congestion_wait below. | ||
| 1050 | */ | ||
| 1051 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 1052 | lumpy_reclaim = 1; | ||
| 1053 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1054 | lumpy_reclaim = 1; | ||
| 1064 | 1055 | ||
| 1065 | pagevec_init(&pvec, 1); | 1056 | pagevec_init(&pvec, 1); |
| 1066 | 1057 | ||
| @@ -1073,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1073 | unsigned long nr_freed; | 1064 | unsigned long nr_freed; |
| 1074 | unsigned long nr_active; | 1065 | unsigned long nr_active; |
| 1075 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1066 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
| 1076 | int mode = ISOLATE_INACTIVE; | 1067 | int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; |
| 1077 | |||
| 1078 | /* | ||
| 1079 | * If we need a large contiguous chunk of memory, or have | ||
| 1080 | * trouble getting a small set of contiguous pages, we | ||
| 1081 | * will reclaim both active and inactive pages. | ||
| 1082 | * | ||
| 1083 | * We use the same threshold as pageout congestion_wait below. | ||
| 1084 | */ | ||
| 1085 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 1086 | mode = ISOLATE_BOTH; | ||
| 1087 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1088 | mode = ISOLATE_BOTH; | ||
| 1089 | 1068 | ||
| 1090 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1069 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
| 1091 | &page_list, &nr_scan, sc->order, mode, | 1070 | &page_list, &nr_scan, sc->order, mode, |
| @@ -1122,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1122 | * but that should be acceptable to the caller | 1101 | * but that should be acceptable to the caller |
| 1123 | */ | 1102 | */ |
| 1124 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1103 | if (nr_freed < nr_taken && !current_is_kswapd() && |
| 1125 | sc->order > PAGE_ALLOC_COSTLY_ORDER) { | 1104 | lumpy_reclaim) { |
| 1126 | congestion_wait(WRITE, HZ/10); | 1105 | congestion_wait(WRITE, HZ/10); |
| 1127 | 1106 | ||
| 1128 | /* | 1107 | /* |
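Hoisting the mode selection above means the "lumpy" decision is made once per shrink_inactive_list() call and reused both for the isolation mode and for the congestion_wait() retry below. Restated as a hypothetical predicate (the constants keep their usual kernel meanings):

#include <linux/mmzone.h>       /* DEF_PRIORITY, PAGE_ALLOC_COSTLY_ORDER */

/* Hypothetical: should this pass reclaim both active and inactive pages? */
static int want_lumpy_reclaim(int order, int priority)
{
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                return 1;       /* a costly contiguous allocation is waiting */
        if (order && priority < DEF_PRIORITY - 2)
                return 1;       /* smaller order, but reclaim keeps struggling */
        return 0;
}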
| @@ -1217,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
| 1217 | * But we had to alter page->flags anyway. | 1196 | * But we had to alter page->flags anyway. |
| 1218 | */ | 1197 | */ |
| 1219 | 1198 | ||
| 1199 | static void move_active_pages_to_lru(struct zone *zone, | ||
| 1200 | struct list_head *list, | ||
| 1201 | enum lru_list lru) | ||
| 1202 | { | ||
| 1203 | unsigned long pgmoved = 0; | ||
| 1204 | struct pagevec pvec; | ||
| 1205 | struct page *page; | ||
| 1206 | |||
| 1207 | pagevec_init(&pvec, 1); | ||
| 1208 | |||
| 1209 | while (!list_empty(list)) { | ||
| 1210 | page = lru_to_page(list); | ||
| 1211 | prefetchw_prev_lru_page(page, list, flags); | ||
| 1212 | |||
| 1213 | VM_BUG_ON(PageLRU(page)); | ||
| 1214 | SetPageLRU(page); | ||
| 1215 | |||
| 1216 | VM_BUG_ON(!PageActive(page)); | ||
| 1217 | if (!is_active_lru(lru)) | ||
| 1218 | ClearPageActive(page); /* we are de-activating */ | ||
| 1219 | |||
| 1220 | list_move(&page->lru, &zone->lru[lru].list); | ||
| 1221 | mem_cgroup_add_lru_list(page, lru); | ||
| 1222 | pgmoved++; | ||
| 1223 | |||
| 1224 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | ||
| 1225 | spin_unlock_irq(&zone->lru_lock); | ||
| 1226 | if (buffer_heads_over_limit) | ||
| 1227 | pagevec_strip(&pvec); | ||
| 1228 | __pagevec_release(&pvec); | ||
| 1229 | spin_lock_irq(&zone->lru_lock); | ||
| 1230 | } | ||
| 1231 | } | ||
| 1232 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
| 1233 | if (!is_active_lru(lru)) | ||
| 1234 | __count_vm_events(PGDEACTIVATE, pgmoved); | ||
| 1235 | } | ||
| 1220 | 1236 | ||
| 1221 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1237 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
| 1222 | struct scan_control *sc, int priority, int file) | 1238 | struct scan_control *sc, int priority, int file) |
| 1223 | { | 1239 | { |
| 1224 | unsigned long pgmoved; | 1240 | unsigned long pgmoved; |
| 1225 | int pgdeactivate = 0; | ||
| 1226 | unsigned long pgscanned; | 1241 | unsigned long pgscanned; |
| 1242 | unsigned long vm_flags; | ||
| 1227 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1243 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
| 1244 | LIST_HEAD(l_active); | ||
| 1228 | LIST_HEAD(l_inactive); | 1245 | LIST_HEAD(l_inactive); |
| 1229 | struct page *page; | 1246 | struct page *page; |
| 1230 | struct pagevec pvec; | ||
| 1231 | enum lru_list lru; | ||
| 1232 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1247 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1233 | 1248 | ||
| 1234 | lru_add_drain(); | 1249 | lru_add_drain(); |
| @@ -1245,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1245 | } | 1260 | } |
| 1246 | reclaim_stat->recent_scanned[!!file] += pgmoved; | 1261 | reclaim_stat->recent_scanned[!!file] += pgmoved; |
| 1247 | 1262 | ||
| 1263 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
| 1248 | if (file) | 1264 | if (file) |
| 1249 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1265 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); |
| 1250 | else | 1266 | else |
| 1251 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | 1267 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); |
| 1252 | spin_unlock_irq(&zone->lru_lock); | 1268 | spin_unlock_irq(&zone->lru_lock); |
| 1253 | 1269 | ||
| 1254 | pgmoved = 0; | 1270 | pgmoved = 0; /* count referenced pages with an in-use mapping */ |
| 1255 | while (!list_empty(&l_hold)) { | 1271 | while (!list_empty(&l_hold)) { |
| 1256 | cond_resched(); | 1272 | cond_resched(); |
| 1257 | page = lru_to_page(&l_hold); | 1273 | page = lru_to_page(&l_hold); |
| @@ -1264,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1264 | 1280 | ||
| 1265 | /* page_referenced clears PageReferenced */ | 1281 | /* page_referenced clears PageReferenced */ |
| 1266 | if (page_mapping_inuse(page) && | 1282 | if (page_mapping_inuse(page) && |
| 1267 | page_referenced(page, 0, sc->mem_cgroup)) | 1283 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
| 1268 | pgmoved++; | 1284 | pgmoved++; |
| 1285 | /* | ||
| 1286 | * Identify referenced, file-backed active pages and | ||
| 1287 | * give them one more trip around the active list. So | ||
| 1288 | * that executable code get better chances to stay in | ||
| 1289 | * memory under moderate memory pressure. Anon pages | ||
| 1290 | * are not likely to be evicted by use-once streaming | ||
| 1291 | * IO, plus JVM can create lots of anon VM_EXEC pages, | ||
| 1292 | * so we ignore them here. | ||
| 1293 | */ | ||
| 1294 | if ((vm_flags & VM_EXEC) && !PageAnon(page)) { | ||
| 1295 | list_add(&page->lru, &l_active); | ||
| 1296 | continue; | ||
| 1297 | } | ||
| 1298 | } | ||
| 1269 | 1299 | ||
| 1270 | list_add(&page->lru, &l_inactive); | 1300 | list_add(&page->lru, &l_inactive); |
| 1271 | } | 1301 | } |
| 1272 | 1302 | ||
| 1273 | /* | 1303 | /* |
| 1274 | * Move the pages to the [file or anon] inactive list. | 1304 | * Move pages back to the lru list. |
| 1275 | */ | 1305 | */ |
| 1276 | pagevec_init(&pvec, 1); | ||
| 1277 | lru = LRU_BASE + file * LRU_FILE; | ||
| 1278 | |||
| 1279 | spin_lock_irq(&zone->lru_lock); | 1306 | spin_lock_irq(&zone->lru_lock); |
| 1280 | /* | 1307 | /* |
| 1281 | * Count referenced pages from currently used mappings as | 1308 | * Count referenced pages from currently used mappings as rotated, |
| 1282 | * rotated, even though they are moved to the inactive list. | 1309 | * even though only some of them are actually re-activated. This |
| 1283 | * This helps balance scan pressure between file and anonymous | 1310 | * helps balance scan pressure between file and anonymous pages in |
| 1284 | * pages in get_scan_ratio. | 1311 | * get_scan_ratio. |
| 1285 | */ | 1312 | */ |
| 1286 | reclaim_stat->recent_rotated[!!file] += pgmoved; | 1313 | reclaim_stat->recent_rotated[!!file] += pgmoved; |
| 1287 | 1314 | ||
| 1288 | pgmoved = 0; | 1315 | move_active_pages_to_lru(zone, &l_active, |
| 1289 | while (!list_empty(&l_inactive)) { | 1316 | LRU_ACTIVE + file * LRU_FILE); |
| 1290 | page = lru_to_page(&l_inactive); | 1317 | move_active_pages_to_lru(zone, &l_inactive, |
| 1291 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1318 | LRU_BASE + file * LRU_FILE); |
| 1292 | VM_BUG_ON(PageLRU(page)); | ||
| 1293 | SetPageLRU(page); | ||
| 1294 | VM_BUG_ON(!PageActive(page)); | ||
| 1295 | ClearPageActive(page); | ||
| 1296 | 1319 | ||
| 1297 | list_move(&page->lru, &zone->lru[lru].list); | ||
| 1298 | mem_cgroup_add_lru_list(page, lru); | ||
| 1299 | pgmoved++; | ||
| 1300 | if (!pagevec_add(&pvec, page)) { | ||
| 1301 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
| 1302 | spin_unlock_irq(&zone->lru_lock); | ||
| 1303 | pgdeactivate += pgmoved; | ||
| 1304 | pgmoved = 0; | ||
| 1305 | if (buffer_heads_over_limit) | ||
| 1306 | pagevec_strip(&pvec); | ||
| 1307 | __pagevec_release(&pvec); | ||
| 1308 | spin_lock_irq(&zone->lru_lock); | ||
| 1309 | } | ||
| 1310 | } | ||
| 1311 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
| 1312 | pgdeactivate += pgmoved; | ||
| 1313 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
| 1314 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | ||
| 1315 | spin_unlock_irq(&zone->lru_lock); | 1320 | spin_unlock_irq(&zone->lru_lock); |
| 1316 | if (buffer_heads_over_limit) | ||
| 1317 | pagevec_strip(&pvec); | ||
| 1318 | pagevec_release(&pvec); | ||
| 1319 | } | 1321 | } |
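
The hunk above changes shrink_active_list() so that referenced, executable, file-backed pages are re-queued on the active list while everything else is deactivated, and the two resulting lists are then drained by move_active_pages_to_lru() instead of the open-coded pagevec loop removed in the left column. The userspace sketch below condenses that per-page disposition into a standalone predicate; the simplified inputs and names are illustrative only, and the real code also updates counters, handles memcg LRU lists and holds zone->lru_lock.

	#include <stdio.h>

	enum disposition { KEEP_ACTIVE, DEACTIVATE };

	/* Simplified stand-in for the per-page decision made in the loop above. */
	static enum disposition classify(int mapping_inuse, int referenced,
					 int vm_exec, int anon)
	{
		/* Referenced, executable, file-backed pages get another round. */
		if (mapping_inuse && referenced && vm_exec && !anon)
			return KEEP_ACTIVE;
		/* Everything else is moved to the inactive list. */
		return DEACTIVATE;
	}

	int main(void)
	{
		printf("%d\n", classify(1, 1, 1, 0));	/* 0: stays active */
		printf("%d\n", classify(1, 1, 0, 0));	/* 1: deactivated  */
		return 0;
	}
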
| 1320 | 1322 | ||
| 1321 | static int inactive_anon_is_low_global(struct zone *zone) | 1323 | static int inactive_anon_is_low_global(struct zone *zone) |
| @@ -1350,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
| 1350 | return low; | 1352 | return low; |
| 1351 | } | 1353 | } |
| 1352 | 1354 | ||
| 1355 | static int inactive_file_is_low_global(struct zone *zone) | ||
| 1356 | { | ||
| 1357 | unsigned long active, inactive; | ||
| 1358 | |||
| 1359 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
| 1360 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 1361 | |||
| 1362 | return (active > inactive); | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | /** | ||
| 1366 | * inactive_file_is_low - check if file pages need to be deactivated | ||
| 1367 | * @zone: zone to check | ||
| 1368 | * @sc: scan control of this context | ||
| 1369 | * | ||
| 1370 | * When the system is doing streaming IO, memory pressure here | ||
| 1371 | * ensures that active file pages get deactivated, until more | ||
| 1372 | * than half of the file pages are on the inactive list. | ||
| 1373 | * | ||
| 1374 | * Once we get to that situation, protect the system's working | ||
| 1375 | * set from being evicted by disabling active file page aging. | ||
| 1376 | * | ||
| 1377 | * This uses a different ratio than the anonymous pages, because | ||
| 1378 | * the page cache uses a use-once replacement algorithm. | ||
| 1379 | */ | ||
| 1380 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | ||
| 1381 | { | ||
| 1382 | int low; | ||
| 1383 | |||
| 1384 | if (scanning_global_lru(sc)) | ||
| 1385 | low = inactive_file_is_low_global(zone); | ||
| 1386 | else | ||
| 1387 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | ||
| 1388 | return low; | ||
| 1389 | } | ||
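
To make the effect of the new inactive_file_is_low() check concrete, here is a minimal userspace simulation with assumed numbers and plain counters in place of the zone statistics: active file pages keep being deactivated only while they outnumber the inactive ones, after which the working set is left alone.

	#include <stdio.h>

	struct fake_zone {
		unsigned long nr_active_file;
		unsigned long nr_inactive_file;
	};

	static int inactive_file_is_low(const struct fake_zone *z)
	{
		return z->nr_active_file > z->nr_inactive_file;
	}

	int main(void)
	{
		struct fake_zone z = { .nr_active_file = 600, .nr_inactive_file = 200 };

		/* Streaming IO keeps deactivating until the lists are balanced. */
		while (inactive_file_is_low(&z)) {
			z.nr_active_file -= 100;	/* one shrink_active_list pass */
			z.nr_inactive_file += 100;
		}
		printf("active=%lu inactive=%lu\n",
		       z.nr_active_file, z.nr_inactive_file);	/* active=400 inactive=400 */
		return 0;
	}
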
| 1390 | |||
| 1353 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1391 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
| 1354 | struct zone *zone, struct scan_control *sc, int priority) | 1392 | struct zone *zone, struct scan_control *sc, int priority) |
| 1355 | { | 1393 | { |
| 1356 | int file = is_file_lru(lru); | 1394 | int file = is_file_lru(lru); |
| 1357 | 1395 | ||
| 1358 | if (lru == LRU_ACTIVE_FILE) { | 1396 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { |
| 1359 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1397 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
| 1360 | return 0; | 1398 | return 0; |
| 1361 | } | 1399 | } |
| @@ -1384,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1384 | unsigned long ap, fp; | 1422 | unsigned long ap, fp; |
| 1385 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1423 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1386 | 1424 | ||
| 1387 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
| 1388 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1389 | percent[0] = 0; | ||
| 1390 | percent[1] = 100; | ||
| 1391 | return; | ||
| 1392 | } | ||
| 1393 | |||
| 1394 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + | 1425 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + |
| 1395 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); | 1426 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); |
| 1396 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + | 1427 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + |
| @@ -1400,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1400 | free = zone_page_state(zone, NR_FREE_PAGES); | 1431 | free = zone_page_state(zone, NR_FREE_PAGES); |
| 1401 | /* If we have very few page cache pages, | 1432 | /* If we have very few page cache pages, |
| 1402 | force-scan anon pages. */ | 1433 | force-scan anon pages. */ |
| 1403 | if (unlikely(file + free <= zone->pages_high)) { | 1434 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
| 1404 | percent[0] = 100; | 1435 | percent[0] = 100; |
| 1405 | percent[1] = 0; | 1436 | percent[1] = 0; |
| 1406 | return; | 1437 | return; |
| @@ -1455,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1455 | percent[1] = 100 - percent[0]; | 1486 | percent[1] = 100 - percent[0]; |
| 1456 | } | 1487 | } |
| 1457 | 1488 | ||
| 1489 | /* | ||
| 1490 | * Smallish @nr_to_scan values are deposited in @nr_saved_scan | ||
| 1491 | * until we have collected @swap_cluster_max pages to scan. | ||
| 1492 | */ | ||
| 1493 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
| 1494 | unsigned long *nr_saved_scan, | ||
| 1495 | unsigned long swap_cluster_max) | ||
| 1496 | { | ||
| 1497 | unsigned long nr; | ||
| 1498 | |||
| 1499 | *nr_saved_scan += nr_to_scan; | ||
| 1500 | nr = *nr_saved_scan; | ||
| 1501 | |||
| 1502 | if (nr >= swap_cluster_max) | ||
| 1503 | *nr_saved_scan = 0; | ||
| 1504 | else | ||
| 1505 | nr = 0; | ||
| 1506 | |||
| 1507 | return nr; | ||
| 1508 | } | ||
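
Because nr_scan_try_batch() is self-contained, its batching behaviour can be demonstrated directly in userspace. The driver below uses hypothetical request sizes and shows that nothing is handed to the scanner until the saved total reaches swap_cluster_max.

	#include <stdio.h>

	static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
					       unsigned long *nr_saved_scan,
					       unsigned long swap_cluster_max)
	{
		unsigned long nr;

		*nr_saved_scan += nr_to_scan;
		nr = *nr_saved_scan;

		if (nr >= swap_cluster_max)
			*nr_saved_scan = 0;
		else
			nr = 0;

		return nr;
	}

	int main(void)
	{
		unsigned long saved = 0;
		unsigned long requests[] = { 7, 9, 20, 3 };	/* per-call nr_to_scan */

		for (int i = 0; i < 4; i++) {
			unsigned long nr = nr_scan_try_batch(requests[i], &saved, 32);

			printf("request %lu -> scan %lu (saved %lu)\n",
			       requests[i], nr, saved);
		}
		/* Scans 0, 0, 36, 0: pages are only scanned once 32 have accumulated. */
		return 0;
	}
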
| 1458 | 1509 | ||
| 1459 | /* | 1510 | /* |
| 1460 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1511 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| @@ -1468,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1468 | enum lru_list l; | 1519 | enum lru_list l; |
| 1469 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1520 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
| 1470 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1521 | unsigned long swap_cluster_max = sc->swap_cluster_max; |
| 1522 | int noswap = 0; | ||
| 1471 | 1523 | ||
| 1472 | get_scan_ratio(zone, sc, percent); | 1524 | /* If we have no swap space, do not bother scanning anon pages. */ |
| 1525 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1526 | noswap = 1; | ||
| 1527 | percent[0] = 0; | ||
| 1528 | percent[1] = 100; | ||
| 1529 | } else | ||
| 1530 | get_scan_ratio(zone, sc, percent); | ||
| 1473 | 1531 | ||
| 1474 | for_each_evictable_lru(l) { | 1532 | for_each_evictable_lru(l) { |
| 1475 | int file = is_file_lru(l); | 1533 | int file = is_file_lru(l); |
| 1476 | unsigned long scan; | 1534 | unsigned long scan; |
| 1477 | 1535 | ||
| 1478 | scan = zone_nr_pages(zone, sc, l); | 1536 | scan = zone_nr_pages(zone, sc, l); |
| 1479 | if (priority) { | 1537 | if (priority || noswap) { |
| 1480 | scan >>= priority; | 1538 | scan >>= priority; |
| 1481 | scan = (scan * percent[file]) / 100; | 1539 | scan = (scan * percent[file]) / 100; |
| 1482 | } | 1540 | } |
| 1483 | if (scanning_global_lru(sc)) { | 1541 | if (scanning_global_lru(sc)) |
| 1484 | zone->lru[l].nr_scan += scan; | 1542 | nr[l] = nr_scan_try_batch(scan, |
| 1485 | nr[l] = zone->lru[l].nr_scan; | 1543 | &zone->lru[l].nr_saved_scan, |
| 1486 | if (nr[l] >= swap_cluster_max) | 1544 | swap_cluster_max); |
| 1487 | zone->lru[l].nr_scan = 0; | 1545 | else |
| 1488 | else | ||
| 1489 | nr[l] = 0; | ||
| 1490 | } else | ||
| 1491 | nr[l] = scan; | 1546 | nr[l] = scan; |
| 1492 | } | 1547 | } |
| 1493 | 1548 | ||
| @@ -1521,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1521 | * Even if we did not try to evict anon pages at all, we want to | 1576 | * Even if we did not try to evict anon pages at all, we want to |
| 1522 | * rebalance the anon lru active/inactive ratio. | 1577 | * rebalance the anon lru active/inactive ratio. |
| 1523 | */ | 1578 | */ |
| 1524 | if (inactive_anon_is_low(zone, sc)) | 1579 | if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) |
| 1525 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1580 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
| 1526 | 1581 | ||
| 1527 | throttle_vm_writeout(sc->gfp_mask); | 1582 | throttle_vm_writeout(sc->gfp_mask); |
| @@ -1532,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1532 | * try to reclaim pages from zones which will satisfy the caller's allocation | 1587 | * try to reclaim pages from zones which will satisfy the caller's allocation |
| 1533 | * request. | 1588 | * request. |
| 1534 | * | 1589 | * |
| 1535 | * We reclaim from a zone even if that zone is over pages_high. Because: | 1590 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). |
| 1591 | * Because: | ||
| 1536 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 1592 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
| 1537 | * allocation or | 1593 | * allocation or |
| 1538 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1594 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones |
| 1539 | * satisfy the `incremental min' zone defense algorithm. | 1595 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' |
| 1596 | * zone defense algorithm. | ||
| 1540 | * | 1597 | * |
| 1541 | * If a zone is deemed to be full of pinned pages then just give it a light | 1598 | * If a zone is deemed to be full of pinned pages then just give it a light |
| 1542 | * scan then give up on it. | 1599 | * scan then give up on it. |
| @@ -1742,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1742 | 1799 | ||
| 1743 | /* | 1800 | /* |
| 1744 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1801 | * For kswapd, balance_pgdat() will work across all this node's zones until |
| 1745 | * they are all at pages_high. | 1802 | * they are all at high_wmark_pages(zone). |
| 1746 | * | 1803 | * |
| 1747 | * Returns the number of pages which were actually freed. | 1804 | * Returns the number of pages which were actually freed. |
| 1748 | * | 1805 | * |
| @@ -1755,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1755 | * the zone for when the problem goes away. | 1812 | * the zone for when the problem goes away. |
| 1756 | * | 1813 | * |
| 1757 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 1814 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
| 1758 | * zones which have free_pages > pages_high, but once a zone is found to have | 1815 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
| 1759 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | 1816 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the |
| 1760 | * of the number of free pages in the lower zones. This interoperates with | 1817 | * lower zones regardless of the number of free pages in the lower zones. This |
| 1761 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1818 | * interoperates with the page allocator fallback scheme to ensure that aging |
| 1762 | * across the zones. | 1819 | * of pages is balanced across the zones. |
| 1763 | */ | 1820 | */ |
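
These hunks also replace direct reads of zone->pages_min/pages_low/pages_high with the min/low/high_wmark_pages() accessors used elsewhere in this series. A sketch of the kind of definitions they rely on; this is an assumption, since the authoritative versions live in the mm headers, which are not part of this section.

	/* Sketch only - assumed to mirror the watermark accessors used above. */
	enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

	struct zone_stub {
		unsigned long watermark[NR_WMARK];	/* replaces pages_min/low/high */
	};

	#define min_wmark_pages(z)	((z)->watermark[WMARK_MIN])
	#define low_wmark_pages(z)	((z)->watermark[WMARK_LOW])
	#define high_wmark_pages(z)	((z)->watermark[WMARK_HIGH])
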
| 1764 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 1821 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
| 1765 | { | 1822 | { |
| @@ -1780,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
| 1780 | }; | 1837 | }; |
| 1781 | /* | 1838 | /* |
| 1782 | * temp_priority is used to remember the scanning priority at which | 1839 | * temp_priority is used to remember the scanning priority at which |
| 1783 | * this zone was successfully refilled to free_pages == pages_high. | 1840 | * this zone was successfully refilled to |
| 1841 | * free_pages == high_wmark_pages(zone). | ||
| 1784 | */ | 1842 | */ |
| 1785 | int temp_priority[MAX_NR_ZONES]; | 1843 | int temp_priority[MAX_NR_ZONES]; |
| 1786 | 1844 | ||
| @@ -1825,8 +1883,8 @@ loop_again: | |||
| 1825 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1883 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
| 1826 | &sc, priority, 0); | 1884 | &sc, priority, 0); |
| 1827 | 1885 | ||
| 1828 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1886 | if (!zone_watermark_ok(zone, order, |
| 1829 | 0, 0)) { | 1887 | high_wmark_pages(zone), 0, 0)) { |
| 1830 | end_zone = i; | 1888 | end_zone = i; |
| 1831 | break; | 1889 | break; |
| 1832 | } | 1890 | } |
| @@ -1860,8 +1918,8 @@ loop_again: | |||
| 1860 | priority != DEF_PRIORITY) | 1918 | priority != DEF_PRIORITY) |
| 1861 | continue; | 1919 | continue; |
| 1862 | 1920 | ||
| 1863 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1921 | if (!zone_watermark_ok(zone, order, |
| 1864 | end_zone, 0)) | 1922 | high_wmark_pages(zone), end_zone, 0)) |
| 1865 | all_zones_ok = 0; | 1923 | all_zones_ok = 0; |
| 1866 | temp_priority[i] = priority; | 1924 | temp_priority[i] = priority; |
| 1867 | sc.nr_scanned = 0; | 1925 | sc.nr_scanned = 0; |
| @@ -1870,8 +1928,8 @@ loop_again: | |||
| 1870 | * We put equal pressure on every zone, unless one | 1928 | * We put equal pressure on every zone, unless one |
| 1871 | * zone has way too many pages free already. | 1929 | * zone has way too many pages free already. |
| 1872 | */ | 1930 | */ |
| 1873 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 1931 | if (!zone_watermark_ok(zone, order, |
| 1874 | end_zone, 0)) | 1932 | 8*high_wmark_pages(zone), end_zone, 0)) |
| 1875 | shrink_zone(priority, zone, &sc); | 1933 | shrink_zone(priority, zone, &sc); |
| 1876 | reclaim_state->reclaimed_slab = 0; | 1934 | reclaim_state->reclaimed_slab = 0; |
| 1877 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1935 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
| @@ -2037,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
| 2037 | return; | 2095 | return; |
| 2038 | 2096 | ||
| 2039 | pgdat = zone->zone_pgdat; | 2097 | pgdat = zone->zone_pgdat; |
| 2040 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) | 2098 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
| 2041 | return; | 2099 | return; |
| 2042 | if (pgdat->kswapd_max_order < order) | 2100 | if (pgdat->kswapd_max_order < order) |
| 2043 | pgdat->kswapd_max_order = order; | 2101 | pgdat->kswapd_max_order = order; |
| @@ -2084,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, | |||
| 2084 | l == LRU_ACTIVE_FILE)) | 2142 | l == LRU_ACTIVE_FILE)) |
| 2085 | continue; | 2143 | continue; |
| 2086 | 2144 | ||
| 2087 | zone->lru[l].nr_scan += (lru_pages >> prio) + 1; | 2145 | zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; |
| 2088 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | 2146 | if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { |
| 2089 | unsigned long nr_to_scan; | 2147 | unsigned long nr_to_scan; |
| 2090 | 2148 | ||
| 2091 | zone->lru[l].nr_scan = 0; | 2149 | zone->lru[l].nr_saved_scan = 0; |
| 2092 | nr_to_scan = min(nr_pages, lru_pages); | 2150 | nr_to_scan = min(nr_pages, lru_pages); |
| 2093 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | 2151 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, |
| 2094 | sc, prio); | 2152 | sc, prio); |
| @@ -2290,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1; | |||
| 2290 | */ | 2348 | */ |
| 2291 | int sysctl_min_slab_ratio = 5; | 2349 | int sysctl_min_slab_ratio = 5; |
| 2292 | 2350 | ||
| 2351 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | ||
| 2352 | { | ||
| 2353 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | ||
| 2354 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | ||
| 2355 | zone_page_state(zone, NR_ACTIVE_FILE); | ||
| 2356 | |||
| 2357 | /* | ||
| 2358 | * It's possible for there to be more file mapped pages than | ||
| 2359 | * accounted for by the pages on the file LRU lists because | ||
| 2360 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED | ||
| 2361 | */ | ||
| 2362 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | ||
| 2366 | static long zone_pagecache_reclaimable(struct zone *zone) | ||
| 2367 | { | ||
| 2368 | long nr_pagecache_reclaimable; | ||
| 2369 | long delta = 0; | ||
| 2370 | |||
| 2371 | /* | ||
| 2372 | * If RECLAIM_SWAP is set, then all file pages are considered | ||
| 2373 | * potentially reclaimable. Otherwise, we have to worry about | ||
| 2374 | * pages like swapcache, and zone_unmapped_file_pages() provides | ||
| 2375 | * a better estimate. | ||
| 2376 | */ | ||
| 2377 | if (zone_reclaim_mode & RECLAIM_SWAP) | ||
| 2378 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | ||
| 2379 | else | ||
| 2380 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | ||
| 2381 | |||
| 2382 | /* If we can't clean pages, remove dirty pages from consideration */ | ||
| 2383 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | ||
| 2384 | delta += zone_page_state(zone, NR_FILE_DIRTY); | ||
| 2385 | |||
| 2386 | /* Watch for any possible underflows due to delta */ | ||
| 2387 | if (unlikely(delta > nr_pagecache_reclaimable)) | ||
| 2388 | delta = nr_pagecache_reclaimable; | ||
| 2389 | |||
| 2390 | return nr_pagecache_reclaimable - delta; | ||
| 2391 | } | ||
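
The estimate above can be exercised in isolation. The sketch below substitutes plain parameters for the zone counters and uses assumed values for the RECLAIM_* mode bits; only the arithmetic, including the underflow guard, is carried over.

	#include <stdio.h>

	#define RECLAIM_WRITE	(1 << 1)	/* assumed bit values for zone_reclaim_mode */
	#define RECLAIM_SWAP	(1 << 2)

	static long pagecache_reclaimable(int mode, long file_pages,
					  long unmapped_file, long dirty)
	{
		/* With RECLAIM_SWAP, every file page is potentially reclaimable. */
		long reclaimable = (mode & RECLAIM_SWAP) ? file_pages : unmapped_file;
		long delta = 0;

		/* Without RECLAIM_WRITE we cannot clean pages, so discount dirty ones. */
		if (!(mode & RECLAIM_WRITE))
			delta += dirty;

		/* Guard against underflow, as the kernel version does. */
		if (delta > reclaimable)
			delta = reclaimable;

		return reclaimable - delta;
	}

	int main(void)
	{
		/* zone_reclaim_mode == 0: only clean, unmapped file pages count. */
		printf("%ld\n", pagecache_reclaimable(0, 5000, 1200, 300));	/* 900 */
		return 0;
	}
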
| 2392 | |||
| 2293 | /* | 2393 | /* |
| 2294 | * Try to free up some pages from this zone through reclaim. | 2394 | * Try to free up some pages from this zone through reclaim. |
| 2295 | */ | 2395 | */ |
| @@ -2324,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2324 | reclaim_state.reclaimed_slab = 0; | 2424 | reclaim_state.reclaimed_slab = 0; |
| 2325 | p->reclaim_state = &reclaim_state; | 2425 | p->reclaim_state = &reclaim_state; |
| 2326 | 2426 | ||
| 2327 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2427 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { |
| 2328 | zone_page_state(zone, NR_FILE_MAPPED) > | ||
| 2329 | zone->min_unmapped_pages) { | ||
| 2330 | /* | 2428 | /* |
| 2331 | * Free memory by calling shrink zone with increasing | 2429 | * Free memory by calling shrink zone with increasing |
| 2332 | * priorities until we have enough memory freed. | 2430 | * priorities until we have enough memory freed. |
| @@ -2384,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2384 | * if less than a specified percentage of the zone is used by | 2482 | * if less than a specified percentage of the zone is used by |
| 2385 | * unmapped file backed pages. | 2483 | * unmapped file backed pages. |
| 2386 | */ | 2484 | */ |
| 2387 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2485 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && |
| 2388 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages | 2486 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
| 2389 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | 2487 | return ZONE_RECLAIM_FULL; |
| 2390 | <= zone->min_slab_pages) | ||
| 2391 | return 0; | ||
| 2392 | 2488 | ||
| 2393 | if (zone_is_all_unreclaimable(zone)) | 2489 | if (zone_is_all_unreclaimable(zone)) |
| 2394 | return 0; | 2490 | return ZONE_RECLAIM_FULL; |
| 2395 | 2491 | ||
| 2396 | /* | 2492 | /* |
| 2397 | * Do not scan if the allocation should not be delayed. | 2493 | * Do not scan if the allocation should not be delayed. |
| 2398 | */ | 2494 | */ |
| 2399 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) | 2495 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
| 2400 | return 0; | 2496 | return ZONE_RECLAIM_NOSCAN; |
| 2401 | 2497 | ||
| 2402 | /* | 2498 | /* |
| 2403 | * Only run zone reclaim on the local zone or on zones that do not | 2499 | * Only run zone reclaim on the local zone or on zones that do not |
| @@ -2407,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2407 | */ | 2503 | */ |
| 2408 | node_id = zone_to_nid(zone); | 2504 | node_id = zone_to_nid(zone); |
| 2409 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 2505 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
| 2410 | return 0; | 2506 | return ZONE_RECLAIM_NOSCAN; |
| 2411 | 2507 | ||
| 2412 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 2508 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) |
| 2413 | return 0; | 2509 | return ZONE_RECLAIM_NOSCAN; |
| 2510 | |||
| 2414 | ret = __zone_reclaim(zone, gfp_mask, order); | 2511 | ret = __zone_reclaim(zone, gfp_mask, order); |
| 2415 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 2512 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); |
| 2416 | 2513 | ||
| 2514 | if (!ret) | ||
| 2515 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | ||
| 2516 | |||
| 2417 | return ret; | 2517 | return ret; |
| 2418 | } | 2518 | } |
| 2419 | #endif | 2519 | #endif |
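
zone_reclaim() now reports why nothing was reclaimed instead of returning a bare 0, which lets its caller in mm/page_alloc.c (changed by this series but not shown in this section) decide whether to mark the zone full or simply move on. Below is a hypothetical, self-contained sketch of that distinction; the numeric values of the ZONE_RECLAIM_* codes are assumptions, as their real definitions live elsewhere in the tree.

	#include <stdio.h>

	#define ZONE_RECLAIM_NOSCAN	(-2)	/* assumed values for illustration only */
	#define ZONE_RECLAIM_FULL	(-1)
	#define ZONE_RECLAIM_SOME	0
	#define ZONE_RECLAIM_SUCCESS	1

	/* Stand-in for zone_reclaim(zone, gfp_mask, order). */
	static int zone_reclaim_stub(int outcome)
	{
		return outcome;
	}

	int main(void)
	{
		int outcomes[] = { ZONE_RECLAIM_NOSCAN, ZONE_RECLAIM_FULL,
				   ZONE_RECLAIM_SOME, ZONE_RECLAIM_SUCCESS };

		for (int i = 0; i < 4; i++) {
			switch (zone_reclaim_stub(outcomes[i])) {
			case ZONE_RECLAIM_NOSCAN:
				printf("reclaim skipped: try the next zone\n");
				break;
			case ZONE_RECLAIM_FULL:
				printf("scanned, nothing reclaimable: treat the zone as full\n");
				break;
			default:
				printf("pages may have been freed: recheck the watermark\n");
				break;
			}
		}
		return 0;
	}
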
| 2420 | 2520 | ||
| 2421 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 2422 | /* | 2521 | /* |
| 2423 | * page_evictable - test whether a page is evictable | 2522 | * page_evictable - test whether a page is evictable |
| 2424 | * @page: the page to test | 2523 | * @page: the page to test |
| @@ -2665,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node) | |||
| 2665 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 2764 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
| 2666 | } | 2765 | } |
| 2667 | 2766 | ||
| 2668 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 74d66dba0cbe..138bed53706e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { | |||
| 629 | "nr_active_anon", | 629 | "nr_active_anon", |
| 630 | "nr_inactive_file", | 630 | "nr_inactive_file", |
| 631 | "nr_active_file", | 631 | "nr_active_file", |
| 632 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 633 | "nr_unevictable", | 632 | "nr_unevictable", |
| 634 | "nr_mlock", | 633 | "nr_mlock", |
| 635 | #endif | ||
| 636 | "nr_anon_pages", | 634 | "nr_anon_pages", |
| 637 | "nr_mapped", | 635 | "nr_mapped", |
| 638 | "nr_file_pages", | 636 | "nr_file_pages", |
| @@ -675,6 +673,9 @@ static const char * const vmstat_text[] = { | |||
| 675 | TEXTS_FOR_ZONES("pgscan_kswapd") | 673 | TEXTS_FOR_ZONES("pgscan_kswapd") |
| 676 | TEXTS_FOR_ZONES("pgscan_direct") | 674 | TEXTS_FOR_ZONES("pgscan_direct") |
| 677 | 675 | ||
| 676 | #ifdef CONFIG_NUMA | ||
| 677 | "zone_reclaim_failed", | ||
| 678 | #endif | ||
| 678 | "pginodesteal", | 679 | "pginodesteal", |
| 679 | "slabs_scanned", | 680 | "slabs_scanned", |
| 680 | "kswapd_steal", | 681 | "kswapd_steal", |
| @@ -687,7 +688,6 @@ static const char * const vmstat_text[] = { | |||
| 687 | "htlb_buddy_alloc_success", | 688 | "htlb_buddy_alloc_success", |
| 688 | "htlb_buddy_alloc_fail", | 689 | "htlb_buddy_alloc_fail", |
| 689 | #endif | 690 | #endif |
| 690 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 691 | "unevictable_pgs_culled", | 691 | "unevictable_pgs_culled", |
| 692 | "unevictable_pgs_scanned", | 692 | "unevictable_pgs_scanned", |
| 693 | "unevictable_pgs_rescued", | 693 | "unevictable_pgs_rescued", |
| @@ -697,7 +697,6 @@ static const char * const vmstat_text[] = { | |||
| 697 | "unevictable_pgs_stranded", | 697 | "unevictable_pgs_stranded", |
| 698 | "unevictable_pgs_mlockfreed", | 698 | "unevictable_pgs_mlockfreed", |
| 699 | #endif | 699 | #endif |
| 700 | #endif | ||
| 701 | }; | 700 | }; |
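
On a NUMA kernel carrying this change, the new counter surfaces as a zone_reclaim_failed line in /proc/vmstat. A quick way to read it from userspace:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		/* Print only the counter added by this patch. */
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "zone_reclaim_failed", 19))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}
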
| 702 | 701 | ||
| 703 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 702 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
| @@ -710,18 +709,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 710 | "\n min %lu" | 709 | "\n min %lu" |
| 711 | "\n low %lu" | 710 | "\n low %lu" |
| 712 | "\n high %lu" | 711 | "\n high %lu" |
| 713 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" | 712 | "\n scanned %lu" |
| 714 | "\n spanned %lu" | 713 | "\n spanned %lu" |
| 715 | "\n present %lu", | 714 | "\n present %lu", |
| 716 | zone_page_state(zone, NR_FREE_PAGES), | 715 | zone_page_state(zone, NR_FREE_PAGES), |
| 717 | zone->pages_min, | 716 | min_wmark_pages(zone), |
| 718 | zone->pages_low, | 717 | low_wmark_pages(zone), |
| 719 | zone->pages_high, | 718 | high_wmark_pages(zone), |
| 720 | zone->pages_scanned, | 719 | zone->pages_scanned, |
| 721 | zone->lru[LRU_ACTIVE_ANON].nr_scan, | ||
| 722 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
| 723 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
| 724 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
| 725 | zone->spanned_pages, | 720 | zone->spanned_pages, |
| 726 | zone->present_pages); | 721 | zone->present_pages); |
| 727 | 722 | ||
