Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig           |  18
-rw-r--r--   mm/Makefile          |   3
-rw-r--r--   mm/bounce.c          |   6
-rw-r--r--   mm/filemap.c         | 766
-rw-r--r--   mm/filemap.h         | 103
-rw-r--r--   mm/filemap_xip.c     |  17
-rw-r--r--   mm/hugetlb.c         | 398
-rw-r--r--   mm/internal.h        |  10
-rw-r--r--   mm/memory.c          | 161
-rw-r--r--   mm/memory_hotplug.c  | 312
-rw-r--r--   mm/mempolicy.c       |  60
-rw-r--r--   mm/migrate.c         |   4
-rw-r--r--   mm/mprotect.c        |   1
-rw-r--r--   mm/oom_kill.c        |   9
-rw-r--r--   mm/page-writeback.c  |  10
-rw-r--r--   mm/page_alloc.c      | 731
-rw-r--r--   mm/page_isolation.c  | 138
-rw-r--r--   mm/readahead.c       |  88
-rw-r--r--   mm/rmap.c            |   1
-rw-r--r--   mm/shmem.c           |  62
-rw-r--r--   mm/slab.c            |  21
-rw-r--r--   mm/slob.c            |   7
-rw-r--r--   mm/slub.c            | 490
-rw-r--r--   mm/sparse-vmemmap.c  | 148
-rw-r--r--   mm/sparse.c          | 105
-rw-r--r--   mm/swap.c            | 106
-rw-r--r--   mm/swap_state.c      |   5
-rw-r--r--   mm/util.c            |   6
-rw-r--r--   mm/vmalloc.c         |   5
-rw-r--r--   mm/vmscan.c          |  59
-rw-r--r--   mm/vmstat.c          | 305
31 files changed, 3077 insertions(+), 1078 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..1cc6cada2bbf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME | |||
| 112 | def_bool y | 112 | def_bool y |
| 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
| 114 | 114 | ||
| 115 | # | ||
| 116 | # SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page | ||
| 117 | # and page_to_pfn. The most efficient option where kernel virtual space is | ||
| 118 | # not under pressure. | ||
| 119 | # | ||
| 120 | config SPARSEMEM_VMEMMAP_ENABLE | ||
| 121 | def_bool n | ||
| 122 | |||
| 123 | config SPARSEMEM_VMEMMAP | ||
| 124 | bool | ||
| 125 | depends on SPARSEMEM | ||
| 126 | default y if (SPARSEMEM_VMEMMAP_ENABLE) | ||
| 127 | |||
| 115 | # eventually, we can have this option just 'select SPARSEMEM' | 128 | # eventually, we can have this option just 'select SPARSEMEM' |
| 116 | config MEMORY_HOTPLUG | 129 | config MEMORY_HOTPLUG |
| 117 | bool "Allow for memory hot-add" | 130 | bool "Allow for memory hot-add" |
| @@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE | |||
| 126 | def_bool y | 139 | def_bool y |
| 127 | depends on SPARSEMEM && MEMORY_HOTPLUG | 140 | depends on SPARSEMEM && MEMORY_HOTPLUG |
| 128 | 141 | ||
| 142 | config MEMORY_HOTREMOVE | ||
| 143 | bool "Allow for memory hot remove" | ||
| 144 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | ||
| 145 | depends on MIGRATION | ||
| 146 | |||
| 129 | # Heavily threaded applications may benefit from splitting the mm-wide | 147 | # Heavily threaded applications may benefit from splitting the mm-wide |
| 130 | # page_table_lock, so that faults on different parts of the user address | 148 | # page_table_lock, so that faults on different parts of the user address |
| 131 | # space can be handled with less contention: split it at this NR_CPUS. | 149 | # space can be handled with less contention: split it at this NR_CPUS. |
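For readers coming to the new option cold: with SPARSEMEM_VMEMMAP the whole mem_map lives in one contiguous virtual mapping, so pfn_to_page()/page_to_pfn() reduce to pointer arithmetic instead of a sparse-section lookup. A minimal sketch of the idea, assuming an arch-chosen virtual base; the macro names follow the usual per-arch convention and the base address is purely illustrative, not part of this diff:

/*
 * Illustrative only -- the real definitions live in the per-arch
 * headers and in include/asm-generic/memory_model.h.
 */
#define VMEMMAP_START	0xffffe20000000000UL	/* arch-chosen virtual base */
#define vmemmap		((struct page *)VMEMMAP_START)

/* With a virtually mapped mem_map, the translation is plain array math: */
#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	((unsigned long)((page) - vmemmap))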
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
| 11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
| 12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
| 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
| 14 | $(mmu-y) | 14 | page_isolation.o $(mmu-y) |
| 15 | 15 | ||
| 16 | obj-$(CONFIG_BOUNCE) += bounce.o | 16 | obj-$(CONFIG_BOUNCE) += bounce.o |
| 17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
| 18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
| 19 | obj-$(CONFIG_NUMA) += mempolicy.o | 19 | obj-$(CONFIG_NUMA) += mempolicy.o |
| 20 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 20 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
| 21 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | ||
| 21 | obj-$(CONFIG_SHMEM) += shmem.o | 22 | obj-$(CONFIG_SHMEM) += shmem.o |
| 22 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 23 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
| 23 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 24 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
diff --git a/mm/bounce.c b/mm/bounce.c
index 3b549bf31f7d..b6d2d0f1019b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
| @@ -265,6 +265,12 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
| 265 | mempool_t *pool; | 265 | mempool_t *pool; |
| 266 | 266 | ||
| 267 | /* | 267 | /* |
| 268 | * Data-less bio, nothing to bounce | ||
| 269 | */ | ||
| 270 | if (bio_empty_barrier(*bio_orig)) | ||
| 271 | return; | ||
| 272 | |||
| 273 | /* | ||
| 268 | * for non-isa bounce case, just check if the bounce pfn is equal | 274 | * for non-isa bounce case, just check if the bounce pfn is equal |
| 269 | * to or bigger than the highest pfn in the system -- in that case, | 275 | * to or bigger than the highest pfn in the system -- in that case, |
| 270 | * don't waste time iterating over bio segments | 276 | * don't waste time iterating over bio segments |
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..c6049e947cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -30,7 +30,7 @@ | |||
| 30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
| 31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
| 32 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
| 33 | #include "filemap.h" | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
| 34 | #include "internal.h" | 34 | #include "internal.h" |
| 35 | 35 | ||
| 36 | /* | 36 | /* |
| @@ -593,7 +593,7 @@ void fastcall __lock_page_nosync(struct page *page) | |||
| 593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
| 594 | * If yes, increment its refcount and return it; if no, return NULL. | 594 | * If yes, increment its refcount and return it; if no, return NULL. |
| 595 | */ | 595 | */ |
| 596 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 596 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) |
| 597 | { | 597 | { |
| 598 | struct page *page; | 598 | struct page *page; |
| 599 | 599 | ||
| @@ -617,30 +617,31 @@ EXPORT_SYMBOL(find_get_page); | |||
| 617 | * Returns zero if the page was not present. find_lock_page() may sleep. | 617 | * Returns zero if the page was not present. find_lock_page() may sleep. |
| 618 | */ | 618 | */ |
| 619 | struct page *find_lock_page(struct address_space *mapping, | 619 | struct page *find_lock_page(struct address_space *mapping, |
| 620 | unsigned long offset) | 620 | pgoff_t offset) |
| 621 | { | 621 | { |
| 622 | struct page *page; | 622 | struct page *page; |
| 623 | 623 | ||
| 624 | read_lock_irq(&mapping->tree_lock); | ||
| 625 | repeat: | 624 | repeat: |
| 625 | read_lock_irq(&mapping->tree_lock); | ||
| 626 | page = radix_tree_lookup(&mapping->page_tree, offset); | 626 | page = radix_tree_lookup(&mapping->page_tree, offset); |
| 627 | if (page) { | 627 | if (page) { |
| 628 | page_cache_get(page); | 628 | page_cache_get(page); |
| 629 | if (TestSetPageLocked(page)) { | 629 | if (TestSetPageLocked(page)) { |
| 630 | read_unlock_irq(&mapping->tree_lock); | 630 | read_unlock_irq(&mapping->tree_lock); |
| 631 | __lock_page(page); | 631 | __lock_page(page); |
| 632 | read_lock_irq(&mapping->tree_lock); | ||
| 633 | 632 | ||
| 634 | /* Has the page been truncated while we slept? */ | 633 | /* Has the page been truncated while we slept? */ |
| 635 | if (unlikely(page->mapping != mapping || | 634 | if (unlikely(page->mapping != mapping)) { |
| 636 | page->index != offset)) { | ||
| 637 | unlock_page(page); | 635 | unlock_page(page); |
| 638 | page_cache_release(page); | 636 | page_cache_release(page); |
| 639 | goto repeat; | 637 | goto repeat; |
| 640 | } | 638 | } |
| 639 | VM_BUG_ON(page->index != offset); | ||
| 640 | goto out; | ||
| 641 | } | 641 | } |
| 642 | } | 642 | } |
| 643 | read_unlock_irq(&mapping->tree_lock); | 643 | read_unlock_irq(&mapping->tree_lock); |
| 644 | out: | ||
| 644 | return page; | 645 | return page; |
| 645 | } | 646 | } |
| 646 | EXPORT_SYMBOL(find_lock_page); | 647 | EXPORT_SYMBOL(find_lock_page); |
| @@ -663,29 +664,24 @@ EXPORT_SYMBOL(find_lock_page); | |||
| 663 | * memory exhaustion. | 664 | * memory exhaustion. |
| 664 | */ | 665 | */ |
| 665 | struct page *find_or_create_page(struct address_space *mapping, | 666 | struct page *find_or_create_page(struct address_space *mapping, |
| 666 | unsigned long index, gfp_t gfp_mask) | 667 | pgoff_t index, gfp_t gfp_mask) |
| 667 | { | 668 | { |
| 668 | struct page *page, *cached_page = NULL; | 669 | struct page *page; |
| 669 | int err; | 670 | int err; |
| 670 | repeat: | 671 | repeat: |
| 671 | page = find_lock_page(mapping, index); | 672 | page = find_lock_page(mapping, index); |
| 672 | if (!page) { | 673 | if (!page) { |
| 673 | if (!cached_page) { | 674 | page = __page_cache_alloc(gfp_mask); |
| 674 | cached_page = | 675 | if (!page) |
| 675 | __page_cache_alloc(gfp_mask); | 676 | return NULL; |
| 676 | if (!cached_page) | 677 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); |
| 677 | return NULL; | 678 | if (unlikely(err)) { |
| 679 | page_cache_release(page); | ||
| 680 | page = NULL; | ||
| 681 | if (err == -EEXIST) | ||
| 682 | goto repeat; | ||
| 678 | } | 683 | } |
| 679 | err = add_to_page_cache_lru(cached_page, mapping, | ||
| 680 | index, gfp_mask); | ||
| 681 | if (!err) { | ||
| 682 | page = cached_page; | ||
| 683 | cached_page = NULL; | ||
| 684 | } else if (err == -EEXIST) | ||
| 685 | goto repeat; | ||
| 686 | } | 684 | } |
| 687 | if (cached_page) | ||
| 688 | page_cache_release(cached_page); | ||
| 689 | return page; | 685 | return page; |
| 690 | } | 686 | } |
| 691 | EXPORT_SYMBOL(find_or_create_page); | 687 | EXPORT_SYMBOL(find_or_create_page); |
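As a usage note, the rewritten helper keeps its external contract: it returns a locked, referenced page, or NULL on allocation failure, and retries internally if another task instantiates the same index first (the -EEXIST case). A hedged sketch of a typical caller; fill_one_page is a hypothetical name, not a function in this diff:

/* Assumes <linux/mm.h> and <linux/pagemap.h>. */
static int fill_one_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;

	/* page is locked and holds an extra reference here; fill it in */

	unlock_page(page);
	page_cache_release(page);
	return 0;
}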
| @@ -797,7 +793,7 @@ EXPORT_SYMBOL(find_get_pages_tag); | |||
| 797 | * and deadlock against the caller's locked page. | 793 | * and deadlock against the caller's locked page. |
| 798 | */ | 794 | */ |
| 799 | struct page * | 795 | struct page * |
| 800 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | 796 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) |
| 801 | { | 797 | { |
| 802 | struct page *page = find_get_page(mapping, index); | 798 | struct page *page = find_get_page(mapping, index); |
| 803 | 799 | ||
| @@ -859,34 +855,29 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
| 859 | * It may be NULL. | 855 | * It may be NULL. |
| 860 | */ | 856 | */ |
| 861 | void do_generic_mapping_read(struct address_space *mapping, | 857 | void do_generic_mapping_read(struct address_space *mapping, |
| 862 | struct file_ra_state *_ra, | 858 | struct file_ra_state *ra, |
| 863 | struct file *filp, | 859 | struct file *filp, |
| 864 | loff_t *ppos, | 860 | loff_t *ppos, |
| 865 | read_descriptor_t *desc, | 861 | read_descriptor_t *desc, |
| 866 | read_actor_t actor) | 862 | read_actor_t actor) |
| 867 | { | 863 | { |
| 868 | struct inode *inode = mapping->host; | 864 | struct inode *inode = mapping->host; |
| 869 | unsigned long index; | 865 | pgoff_t index; |
| 870 | unsigned long offset; | 866 | pgoff_t last_index; |
| 871 | unsigned long last_index; | 867 | pgoff_t prev_index; |
| 872 | unsigned long next_index; | 868 | unsigned long offset; /* offset into pagecache page */ |
| 873 | unsigned long prev_index; | ||
| 874 | unsigned int prev_offset; | 869 | unsigned int prev_offset; |
| 875 | struct page *cached_page; | ||
| 876 | int error; | 870 | int error; |
| 877 | struct file_ra_state ra = *_ra; | ||
| 878 | 871 | ||
| 879 | cached_page = NULL; | ||
| 880 | index = *ppos >> PAGE_CACHE_SHIFT; | 872 | index = *ppos >> PAGE_CACHE_SHIFT; |
| 881 | next_index = index; | 873 | prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; |
| 882 | prev_index = ra.prev_index; | 874 | prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); |
| 883 | prev_offset = ra.prev_offset; | ||
| 884 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 875 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
| 885 | offset = *ppos & ~PAGE_CACHE_MASK; | 876 | offset = *ppos & ~PAGE_CACHE_MASK; |
| 886 | 877 | ||
| 887 | for (;;) { | 878 | for (;;) { |
| 888 | struct page *page; | 879 | struct page *page; |
| 889 | unsigned long end_index; | 880 | pgoff_t end_index; |
| 890 | loff_t isize; | 881 | loff_t isize; |
| 891 | unsigned long nr, ret; | 882 | unsigned long nr, ret; |
| 892 | 883 | ||
| @@ -895,7 +886,7 @@ find_page: | |||
| 895 | page = find_get_page(mapping, index); | 886 | page = find_get_page(mapping, index); |
| 896 | if (!page) { | 887 | if (!page) { |
| 897 | page_cache_sync_readahead(mapping, | 888 | page_cache_sync_readahead(mapping, |
| 898 | &ra, filp, | 889 | ra, filp, |
| 899 | index, last_index - index); | 890 | index, last_index - index); |
| 900 | page = find_get_page(mapping, index); | 891 | page = find_get_page(mapping, index); |
| 901 | if (unlikely(page == NULL)) | 892 | if (unlikely(page == NULL)) |
| @@ -903,7 +894,7 @@ find_page: | |||
| 903 | } | 894 | } |
| 904 | if (PageReadahead(page)) { | 895 | if (PageReadahead(page)) { |
| 905 | page_cache_async_readahead(mapping, | 896 | page_cache_async_readahead(mapping, |
| 906 | &ra, filp, page, | 897 | ra, filp, page, |
| 907 | index, last_index - index); | 898 | index, last_index - index); |
| 908 | } | 899 | } |
| 909 | if (!PageUptodate(page)) | 900 | if (!PageUptodate(page)) |
| @@ -966,7 +957,6 @@ page_ok: | |||
| 966 | index += offset >> PAGE_CACHE_SHIFT; | 957 | index += offset >> PAGE_CACHE_SHIFT; |
| 967 | offset &= ~PAGE_CACHE_MASK; | 958 | offset &= ~PAGE_CACHE_MASK; |
| 968 | prev_offset = offset; | 959 | prev_offset = offset; |
| 969 | ra.prev_offset = offset; | ||
| 970 | 960 | ||
| 971 | page_cache_release(page); | 961 | page_cache_release(page); |
| 972 | if (ret == nr && desc->count) | 962 | if (ret == nr && desc->count) |
| @@ -1015,7 +1005,7 @@ readpage: | |||
| 1015 | } | 1005 | } |
| 1016 | unlock_page(page); | 1006 | unlock_page(page); |
| 1017 | error = -EIO; | 1007 | error = -EIO; |
| 1018 | shrink_readahead_size_eio(filp, &ra); | 1008 | shrink_readahead_size_eio(filp, ra); |
| 1019 | goto readpage_error; | 1009 | goto readpage_error; |
| 1020 | } | 1010 | } |
| 1021 | unlock_page(page); | 1011 | unlock_page(page); |
| @@ -1034,33 +1024,29 @@ no_cached_page: | |||
| 1034 | * Ok, it wasn't cached, so we need to create a new | 1024 | * Ok, it wasn't cached, so we need to create a new |
| 1035 | * page.. | 1025 | * page.. |
| 1036 | */ | 1026 | */ |
| 1037 | if (!cached_page) { | 1027 | page = page_cache_alloc_cold(mapping); |
| 1038 | cached_page = page_cache_alloc_cold(mapping); | 1028 | if (!page) { |
| 1039 | if (!cached_page) { | 1029 | desc->error = -ENOMEM; |
| 1040 | desc->error = -ENOMEM; | 1030 | goto out; |
| 1041 | goto out; | ||
| 1042 | } | ||
| 1043 | } | 1031 | } |
| 1044 | error = add_to_page_cache_lru(cached_page, mapping, | 1032 | error = add_to_page_cache_lru(page, mapping, |
| 1045 | index, GFP_KERNEL); | 1033 | index, GFP_KERNEL); |
| 1046 | if (error) { | 1034 | if (error) { |
| 1035 | page_cache_release(page); | ||
| 1047 | if (error == -EEXIST) | 1036 | if (error == -EEXIST) |
| 1048 | goto find_page; | 1037 | goto find_page; |
| 1049 | desc->error = error; | 1038 | desc->error = error; |
| 1050 | goto out; | 1039 | goto out; |
| 1051 | } | 1040 | } |
| 1052 | page = cached_page; | ||
| 1053 | cached_page = NULL; | ||
| 1054 | goto readpage; | 1041 | goto readpage; |
| 1055 | } | 1042 | } |
| 1056 | 1043 | ||
| 1057 | out: | 1044 | out: |
| 1058 | *_ra = ra; | 1045 | ra->prev_pos = prev_index; |
| 1059 | _ra->prev_index = prev_index; | 1046 | ra->prev_pos <<= PAGE_CACHE_SHIFT; |
| 1047 | ra->prev_pos |= prev_offset; | ||
| 1060 | 1048 | ||
| 1061 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; | 1049 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; |
| 1062 | if (cached_page) | ||
| 1063 | page_cache_release(cached_page); | ||
| 1064 | if (filp) | 1050 | if (filp) |
| 1065 | file_accessed(filp); | 1051 | file_accessed(filp); |
| 1066 | } | 1052 | } |
| @@ -1220,7 +1206,7 @@ EXPORT_SYMBOL(generic_file_aio_read); | |||
| 1220 | 1206 | ||
| 1221 | static ssize_t | 1207 | static ssize_t |
| 1222 | do_readahead(struct address_space *mapping, struct file *filp, | 1208 | do_readahead(struct address_space *mapping, struct file *filp, |
| 1223 | unsigned long index, unsigned long nr) | 1209 | pgoff_t index, unsigned long nr) |
| 1224 | { | 1210 | { |
| 1225 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1211 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
| 1226 | return -EINVAL; | 1212 | return -EINVAL; |
| @@ -1240,8 +1226,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
| 1240 | if (file) { | 1226 | if (file) { |
| 1241 | if (file->f_mode & FMODE_READ) { | 1227 | if (file->f_mode & FMODE_READ) { |
| 1242 | struct address_space *mapping = file->f_mapping; | 1228 | struct address_space *mapping = file->f_mapping; |
| 1243 | unsigned long start = offset >> PAGE_CACHE_SHIFT; | 1229 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
| 1244 | unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 1230 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
| 1245 | unsigned long len = end - start + 1; | 1231 | unsigned long len = end - start + 1; |
| 1246 | ret = do_readahead(mapping, file, start, len); | 1232 | ret = do_readahead(mapping, file, start, len); |
| 1247 | } | 1233 | } |
| @@ -1251,7 +1237,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
| 1251 | } | 1237 | } |
| 1252 | 1238 | ||
| 1253 | #ifdef CONFIG_MMU | 1239 | #ifdef CONFIG_MMU |
| 1254 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
| 1255 | /** | 1240 | /** |
| 1256 | * page_cache_read - adds requested page to the page cache if not already there | 1241 | * page_cache_read - adds requested page to the page cache if not already there |
| 1257 | * @file: file to read | 1242 | * @file: file to read |
| @@ -1260,7 +1245,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | |||
| 1260 | * This adds the requested page to the page cache if it isn't already there, | 1245 | * This adds the requested page to the page cache if it isn't already there, |
| 1261 | * and schedules an I/O to read in its contents from disk. | 1246 | * and schedules an I/O to read in its contents from disk. |
| 1262 | */ | 1247 | */ |
| 1263 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1248 | static int fastcall page_cache_read(struct file * file, pgoff_t offset) |
| 1264 | { | 1249 | { |
| 1265 | struct address_space *mapping = file->f_mapping; | 1250 | struct address_space *mapping = file->f_mapping; |
| 1266 | struct page *page; | 1251 | struct page *page; |
| @@ -1349,7 +1334,7 @@ retry_find: | |||
| 1349 | * Do we miss much more than hit in this file? If so, | 1334 | * Do we miss much more than hit in this file? If so, |
| 1350 | * stop bothering with read-ahead. It will only hurt. | 1335 | * stop bothering with read-ahead. It will only hurt. |
| 1351 | */ | 1336 | */ |
| 1352 | if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) | 1337 | if (ra->mmap_miss > MMAP_LOTSAMISS) |
| 1353 | goto no_cached_page; | 1338 | goto no_cached_page; |
| 1354 | 1339 | ||
| 1355 | /* | 1340 | /* |
| @@ -1375,7 +1360,7 @@ retry_find: | |||
| 1375 | } | 1360 | } |
| 1376 | 1361 | ||
| 1377 | if (!did_readaround) | 1362 | if (!did_readaround) |
| 1378 | ra->mmap_hit++; | 1363 | ra->mmap_miss--; |
| 1379 | 1364 | ||
| 1380 | /* | 1365 | /* |
| 1381 | * We have a locked page in the page cache, now we need to check | 1366 | * We have a locked page in the page cache, now we need to check |
| @@ -1396,7 +1381,7 @@ retry_find: | |||
| 1396 | * Found the page and have a reference on it. | 1381 | * Found the page and have a reference on it. |
| 1397 | */ | 1382 | */ |
| 1398 | mark_page_accessed(page); | 1383 | mark_page_accessed(page); |
| 1399 | ra->prev_index = page->index; | 1384 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
| 1400 | vmf->page = page; | 1385 | vmf->page = page; |
| 1401 | return ret | VM_FAULT_LOCKED; | 1386 | return ret | VM_FAULT_LOCKED; |
| 1402 | 1387 | ||
| @@ -1501,39 +1486,32 @@ EXPORT_SYMBOL(generic_file_mmap); | |||
| 1501 | EXPORT_SYMBOL(generic_file_readonly_mmap); | 1486 | EXPORT_SYMBOL(generic_file_readonly_mmap); |
| 1502 | 1487 | ||
| 1503 | static struct page *__read_cache_page(struct address_space *mapping, | 1488 | static struct page *__read_cache_page(struct address_space *mapping, |
| 1504 | unsigned long index, | 1489 | pgoff_t index, |
| 1505 | int (*filler)(void *,struct page*), | 1490 | int (*filler)(void *,struct page*), |
| 1506 | void *data) | 1491 | void *data) |
| 1507 | { | 1492 | { |
| 1508 | struct page *page, *cached_page = NULL; | 1493 | struct page *page; |
| 1509 | int err; | 1494 | int err; |
| 1510 | repeat: | 1495 | repeat: |
| 1511 | page = find_get_page(mapping, index); | 1496 | page = find_get_page(mapping, index); |
| 1512 | if (!page) { | 1497 | if (!page) { |
| 1513 | if (!cached_page) { | 1498 | page = page_cache_alloc_cold(mapping); |
| 1514 | cached_page = page_cache_alloc_cold(mapping); | 1499 | if (!page) |
| 1515 | if (!cached_page) | 1500 | return ERR_PTR(-ENOMEM); |
| 1516 | return ERR_PTR(-ENOMEM); | 1501 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
| 1517 | } | 1502 | if (unlikely(err)) { |
| 1518 | err = add_to_page_cache_lru(cached_page, mapping, | 1503 | page_cache_release(page); |
| 1519 | index, GFP_KERNEL); | 1504 | if (err == -EEXIST) |
| 1520 | if (err == -EEXIST) | 1505 | goto repeat; |
| 1521 | goto repeat; | ||
| 1522 | if (err < 0) { | ||
| 1523 | /* Presumably ENOMEM for radix tree node */ | 1506 | /* Presumably ENOMEM for radix tree node */ |
| 1524 | page_cache_release(cached_page); | ||
| 1525 | return ERR_PTR(err); | 1507 | return ERR_PTR(err); |
| 1526 | } | 1508 | } |
| 1527 | page = cached_page; | ||
| 1528 | cached_page = NULL; | ||
| 1529 | err = filler(data, page); | 1509 | err = filler(data, page); |
| 1530 | if (err < 0) { | 1510 | if (err < 0) { |
| 1531 | page_cache_release(page); | 1511 | page_cache_release(page); |
| 1532 | page = ERR_PTR(err); | 1512 | page = ERR_PTR(err); |
| 1533 | } | 1513 | } |
| 1534 | } | 1514 | } |
| 1535 | if (cached_page) | ||
| 1536 | page_cache_release(cached_page); | ||
| 1537 | return page; | 1515 | return page; |
| 1538 | } | 1516 | } |
| 1539 | 1517 | ||
| @@ -1542,7 +1520,7 @@ repeat: | |||
| 1542 | * after submitting it to the filler. | 1520 | * after submitting it to the filler. |
| 1543 | */ | 1521 | */ |
| 1544 | struct page *read_cache_page_async(struct address_space *mapping, | 1522 | struct page *read_cache_page_async(struct address_space *mapping, |
| 1545 | unsigned long index, | 1523 | pgoff_t index, |
| 1546 | int (*filler)(void *,struct page*), | 1524 | int (*filler)(void *,struct page*), |
| 1547 | void *data) | 1525 | void *data) |
| 1548 | { | 1526 | { |
| @@ -1590,7 +1568,7 @@ EXPORT_SYMBOL(read_cache_page_async); | |||
| 1590 | * If the page does not get brought uptodate, return -EIO. | 1568 | * If the page does not get brought uptodate, return -EIO. |
| 1591 | */ | 1569 | */ |
| 1592 | struct page *read_cache_page(struct address_space *mapping, | 1570 | struct page *read_cache_page(struct address_space *mapping, |
| 1593 | unsigned long index, | 1571 | pgoff_t index, |
| 1594 | int (*filler)(void *,struct page*), | 1572 | int (*filler)(void *,struct page*), |
| 1595 | void *data) | 1573 | void *data) |
| 1596 | { | 1574 | { |
| @@ -1610,40 +1588,6 @@ struct page *read_cache_page(struct address_space *mapping, | |||
| 1610 | EXPORT_SYMBOL(read_cache_page); | 1588 | EXPORT_SYMBOL(read_cache_page); |
| 1611 | 1589 | ||
| 1612 | /* | 1590 | /* |
| 1613 | * If the page was newly created, increment its refcount and add it to the | ||
| 1614 | * caller's lru-buffering pagevec. This function is specifically for | ||
| 1615 | * generic_file_write(). | ||
| 1616 | */ | ||
| 1617 | static inline struct page * | ||
| 1618 | __grab_cache_page(struct address_space *mapping, unsigned long index, | ||
| 1619 | struct page **cached_page, struct pagevec *lru_pvec) | ||
| 1620 | { | ||
| 1621 | int err; | ||
| 1622 | struct page *page; | ||
| 1623 | repeat: | ||
| 1624 | page = find_lock_page(mapping, index); | ||
| 1625 | if (!page) { | ||
| 1626 | if (!*cached_page) { | ||
| 1627 | *cached_page = page_cache_alloc(mapping); | ||
| 1628 | if (!*cached_page) | ||
| 1629 | return NULL; | ||
| 1630 | } | ||
| 1631 | err = add_to_page_cache(*cached_page, mapping, | ||
| 1632 | index, GFP_KERNEL); | ||
| 1633 | if (err == -EEXIST) | ||
| 1634 | goto repeat; | ||
| 1635 | if (err == 0) { | ||
| 1636 | page = *cached_page; | ||
| 1637 | page_cache_get(page); | ||
| 1638 | if (!pagevec_add(lru_pvec, page)) | ||
| 1639 | __pagevec_lru_add(lru_pvec); | ||
| 1640 | *cached_page = NULL; | ||
| 1641 | } | ||
| 1642 | } | ||
| 1643 | return page; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | /* | ||
| 1647 | * The logic we want is | 1591 | * The logic we want is |
| 1648 | * | 1592 | * |
| 1649 | * if suid or (sgid and xgrp) | 1593 | * if suid or (sgid and xgrp) |
| @@ -1691,8 +1635,7 @@ int remove_suid(struct dentry *dentry) | |||
| 1691 | } | 1635 | } |
| 1692 | EXPORT_SYMBOL(remove_suid); | 1636 | EXPORT_SYMBOL(remove_suid); |
| 1693 | 1637 | ||
| 1694 | size_t | 1638 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
| 1695 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
| 1696 | const struct iovec *iov, size_t base, size_t bytes) | 1639 | const struct iovec *iov, size_t base, size_t bytes) |
| 1697 | { | 1640 | { |
| 1698 | size_t copied = 0, left = 0; | 1641 | size_t copied = 0, left = 0; |
| @@ -1715,6 +1658,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, | |||
| 1715 | } | 1658 | } |
| 1716 | 1659 | ||
| 1717 | /* | 1660 | /* |
| 1661 | * Copy as much as we can into the page and return the number of bytes which | ||
| 1662 | * were sucessfully copied. If a fault is encountered then return the number of | ||
| 1663 | * bytes which were copied. | ||
| 1664 | */ | ||
| 1665 | size_t iov_iter_copy_from_user_atomic(struct page *page, | ||
| 1666 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
| 1667 | { | ||
| 1668 | char *kaddr; | ||
| 1669 | size_t copied; | ||
| 1670 | |||
| 1671 | BUG_ON(!in_atomic()); | ||
| 1672 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1673 | if (likely(i->nr_segs == 1)) { | ||
| 1674 | int left; | ||
| 1675 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 1676 | left = __copy_from_user_inatomic_nocache(kaddr + offset, | ||
| 1677 | buf, bytes); | ||
| 1678 | copied = bytes - left; | ||
| 1679 | } else { | ||
| 1680 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
| 1681 | i->iov, i->iov_offset, bytes); | ||
| 1682 | } | ||
| 1683 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1684 | |||
| 1685 | return copied; | ||
| 1686 | } | ||
| 1687 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); | ||
| 1688 | |||
| 1689 | /* | ||
| 1690 | * This has the same sideeffects and return value as | ||
| 1691 | * iov_iter_copy_from_user_atomic(). | ||
| 1692 | * The difference is that it attempts to resolve faults. | ||
| 1693 | * Page must not be locked. | ||
| 1694 | */ | ||
| 1695 | size_t iov_iter_copy_from_user(struct page *page, | ||
| 1696 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
| 1697 | { | ||
| 1698 | char *kaddr; | ||
| 1699 | size_t copied; | ||
| 1700 | |||
| 1701 | kaddr = kmap(page); | ||
| 1702 | if (likely(i->nr_segs == 1)) { | ||
| 1703 | int left; | ||
| 1704 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 1705 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
| 1706 | copied = bytes - left; | ||
| 1707 | } else { | ||
| 1708 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
| 1709 | i->iov, i->iov_offset, bytes); | ||
| 1710 | } | ||
| 1711 | kunmap(page); | ||
| 1712 | return copied; | ||
| 1713 | } | ||
| 1714 | EXPORT_SYMBOL(iov_iter_copy_from_user); | ||
| 1715 | |||
| 1716 | static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) | ||
| 1717 | { | ||
| 1718 | if (likely(i->nr_segs == 1)) { | ||
| 1719 | i->iov_offset += bytes; | ||
| 1720 | } else { | ||
| 1721 | const struct iovec *iov = i->iov; | ||
| 1722 | size_t base = i->iov_offset; | ||
| 1723 | |||
| 1724 | while (bytes) { | ||
| 1725 | int copy = min(bytes, iov->iov_len - base); | ||
| 1726 | |||
| 1727 | bytes -= copy; | ||
| 1728 | base += copy; | ||
| 1729 | if (iov->iov_len == base) { | ||
| 1730 | iov++; | ||
| 1731 | base = 0; | ||
| 1732 | } | ||
| 1733 | } | ||
| 1734 | i->iov = iov; | ||
| 1735 | i->iov_offset = base; | ||
| 1736 | } | ||
| 1737 | } | ||
| 1738 | |||
| 1739 | void iov_iter_advance(struct iov_iter *i, size_t bytes) | ||
| 1740 | { | ||
| 1741 | BUG_ON(i->count < bytes); | ||
| 1742 | |||
| 1743 | __iov_iter_advance_iov(i, bytes); | ||
| 1744 | i->count -= bytes; | ||
| 1745 | } | ||
| 1746 | EXPORT_SYMBOL(iov_iter_advance); | ||
| 1747 | |||
| 1748 | /* | ||
| 1749 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
| 1750 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
| 1751 | * accessed (ie. because it is an invalid address). | ||
| 1752 | * | ||
| 1753 | * writev-intensive code may want this to prefault several iovecs -- that | ||
| 1754 | * would be possible (callers must not rely on the fact that _only_ the | ||
| 1755 | * first iovec will be faulted with the current implementation). | ||
| 1756 | */ | ||
| 1757 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
| 1758 | { | ||
| 1759 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 1760 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); | ||
| 1761 | return fault_in_pages_readable(buf, bytes); | ||
| 1762 | } | ||
| 1763 | EXPORT_SYMBOL(iov_iter_fault_in_readable); | ||
| 1764 | |||
| 1765 | /* | ||
| 1766 | * Return the count of just the current iov_iter segment. | ||
| 1767 | */ | ||
| 1768 | size_t iov_iter_single_seg_count(struct iov_iter *i) | ||
| 1769 | { | ||
| 1770 | const struct iovec *iov = i->iov; | ||
| 1771 | if (i->nr_segs == 1) | ||
| 1772 | return i->count; | ||
| 1773 | else | ||
| 1774 | return min(i->count, iov->iov_len - i->iov_offset); | ||
| 1775 | } | ||
| 1776 | EXPORT_SYMBOL(iov_iter_single_seg_count); | ||
| 1777 | |||
| 1778 | /* | ||
| 1718 | * Performs necessary checks before doing a write | 1779 | * Performs necessary checks before doing a write |
| 1719 | * | 1780 | * |
| 1720 | * Can adjust writing position or amount of bytes to write. | 1781 | * Can adjust writing position or amount of bytes to write. |
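The iteration semantics of the helpers added above are easiest to see on a concrete iterator. A small, hedged illustration of the segment walking; in real use iov_base points at userspace, so the stack buffers and casts below exist only to make the arithmetic visible:

static void iov_iter_example(void)
{
	char a[8], b[8];
	struct iovec iov[2] = {
		{ .iov_base = (void __user *)a, .iov_len = sizeof(a) },
		{ .iov_base = (void __user *)b, .iov_len = sizeof(b) },
	};
	struct iov_iter i;

	/* 16 bytes total, nothing consumed yet */
	iov_iter_init(&i, iov, 2, sizeof(a) + sizeof(b), 0);

	/* consumes all of a[] plus the first 4 bytes of b[] */
	iov_iter_advance(&i, 12);

	/*
	 * Now iov_iter_count(&i) == 4 and
	 * iov_iter_single_seg_count(&i) == 4.
	 */
}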
| @@ -1796,6 +1857,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
| 1796 | } | 1857 | } |
| 1797 | EXPORT_SYMBOL(generic_write_checks); | 1858 | EXPORT_SYMBOL(generic_write_checks); |
| 1798 | 1859 | ||
| 1860 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | ||
| 1861 | loff_t pos, unsigned len, unsigned flags, | ||
| 1862 | struct page **pagep, void **fsdata) | ||
| 1863 | { | ||
| 1864 | const struct address_space_operations *aops = mapping->a_ops; | ||
| 1865 | |||
| 1866 | if (aops->write_begin) { | ||
| 1867 | return aops->write_begin(file, mapping, pos, len, flags, | ||
| 1868 | pagep, fsdata); | ||
| 1869 | } else { | ||
| 1870 | int ret; | ||
| 1871 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
| 1872 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1873 | struct inode *inode = mapping->host; | ||
| 1874 | struct page *page; | ||
| 1875 | again: | ||
| 1876 | page = __grab_cache_page(mapping, index); | ||
| 1877 | *pagep = page; | ||
| 1878 | if (!page) | ||
| 1879 | return -ENOMEM; | ||
| 1880 | |||
| 1881 | if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { | ||
| 1882 | /* | ||
| 1883 | * There is no way to resolve a short write situation | ||
| 1884 | * for a !Uptodate page (except by double copying in | ||
| 1885 | * the caller done by generic_perform_write_2copy). | ||
| 1886 | * | ||
| 1887 | * Instead, we have to bring it uptodate here. | ||
| 1888 | */ | ||
| 1889 | ret = aops->readpage(file, page); | ||
| 1890 | page_cache_release(page); | ||
| 1891 | if (ret) { | ||
| 1892 | if (ret == AOP_TRUNCATED_PAGE) | ||
| 1893 | goto again; | ||
| 1894 | return ret; | ||
| 1895 | } | ||
| 1896 | goto again; | ||
| 1897 | } | ||
| 1898 | |||
| 1899 | ret = aops->prepare_write(file, page, offset, offset+len); | ||
| 1900 | if (ret) { | ||
| 1901 | unlock_page(page); | ||
| 1902 | page_cache_release(page); | ||
| 1903 | if (pos + len > inode->i_size) | ||
| 1904 | vmtruncate(inode, inode->i_size); | ||
| 1905 | } | ||
| 1906 | return ret; | ||
| 1907 | } | ||
| 1908 | } | ||
| 1909 | EXPORT_SYMBOL(pagecache_write_begin); | ||
| 1910 | |||
| 1911 | int pagecache_write_end(struct file *file, struct address_space *mapping, | ||
| 1912 | loff_t pos, unsigned len, unsigned copied, | ||
| 1913 | struct page *page, void *fsdata) | ||
| 1914 | { | ||
| 1915 | const struct address_space_operations *aops = mapping->a_ops; | ||
| 1916 | int ret; | ||
| 1917 | |||
| 1918 | if (aops->write_end) { | ||
| 1919 | mark_page_accessed(page); | ||
| 1920 | ret = aops->write_end(file, mapping, pos, len, copied, | ||
| 1921 | page, fsdata); | ||
| 1922 | } else { | ||
| 1923 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1924 | struct inode *inode = mapping->host; | ||
| 1925 | |||
| 1926 | flush_dcache_page(page); | ||
| 1927 | ret = aops->commit_write(file, page, offset, offset+len); | ||
| 1928 | unlock_page(page); | ||
| 1929 | mark_page_accessed(page); | ||
| 1930 | page_cache_release(page); | ||
| 1931 | |||
| 1932 | if (ret < 0) { | ||
| 1933 | if (pos + len > inode->i_size) | ||
| 1934 | vmtruncate(inode, inode->i_size); | ||
| 1935 | } else if (ret > 0) | ||
| 1936 | ret = min_t(size_t, copied, ret); | ||
| 1937 | else | ||
| 1938 | ret = copied; | ||
| 1939 | } | ||
| 1940 | |||
| 1941 | return ret; | ||
| 1942 | } | ||
| 1943 | EXPORT_SYMBOL(pagecache_write_end); | ||
| 1944 | |||
| 1799 | ssize_t | 1945 | ssize_t |
| 1800 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 1946 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
| 1801 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 1947 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
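The two wrappers above let callers outside the filesystem drive the begin/copy/end cycle without caring whether the filesystem implements write_begin/write_end or the older prepare_write/commit_write pair. A hedged sketch of such a caller, copying an in-kernel buffer that fits within a single pagecache page; write_kernel_buf is a hypothetical name, and real callers (the splice-to-file path, for instance) add their own locking and retry handling:

/* Assumes <linux/fs.h>, <linux/pagemap.h>, <linux/highmem.h>. */
static int write_kernel_buf(struct file *file, loff_t pos,
				const char *buf, unsigned len)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	void *fsdata;
	char *kaddr;
	int ret;

	ret = pagecache_write_begin(file, mapping, pos, len,
					AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (ret)
		return ret;

	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), buf, len);
	kunmap_atomic(kaddr, KM_USER0);
	flush_dcache_page(page);

	ret = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);
	return ret < 0 ? ret : 0;
}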
| @@ -1835,151 +1981,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1835 | } | 1981 | } |
| 1836 | EXPORT_SYMBOL(generic_file_direct_write); | 1982 | EXPORT_SYMBOL(generic_file_direct_write); |
| 1837 | 1983 | ||
| 1838 | ssize_t | 1984 | /* |
| 1839 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 1985 | * Find or create a page at the given pagecache position. Return the locked |
| 1840 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | 1986 | * page. This function is specifically for buffered writes. |
| 1841 | size_t count, ssize_t written) | 1987 | */ |
| 1988 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) | ||
| 1842 | { | 1989 | { |
| 1843 | struct file *file = iocb->ki_filp; | 1990 | int status; |
| 1844 | struct address_space * mapping = file->f_mapping; | 1991 | struct page *page; |
| 1845 | const struct address_space_operations *a_ops = mapping->a_ops; | 1992 | repeat: |
| 1846 | struct inode *inode = mapping->host; | 1993 | page = find_lock_page(mapping, index); |
| 1847 | long status = 0; | 1994 | if (likely(page)) |
| 1848 | struct page *page; | 1995 | return page; |
| 1849 | struct page *cached_page = NULL; | ||
| 1850 | size_t bytes; | ||
| 1851 | struct pagevec lru_pvec; | ||
| 1852 | const struct iovec *cur_iov = iov; /* current iovec */ | ||
| 1853 | size_t iov_base = 0; /* offset in the current iovec */ | ||
| 1854 | char __user *buf; | ||
| 1855 | |||
| 1856 | pagevec_init(&lru_pvec, 0); | ||
| 1857 | 1996 | ||
| 1858 | /* | 1997 | page = page_cache_alloc(mapping); |
| 1859 | * handle partial DIO write. Adjust cur_iov if needed. | 1998 | if (!page) |
| 1860 | */ | 1999 | return NULL; |
| 1861 | if (likely(nr_segs == 1)) | 2000 | status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
| 1862 | buf = iov->iov_base + written; | 2001 | if (unlikely(status)) { |
| 1863 | else { | 2002 | page_cache_release(page); |
| 1864 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | 2003 | if (status == -EEXIST) |
| 1865 | buf = cur_iov->iov_base + iov_base; | 2004 | goto repeat; |
| 2005 | return NULL; | ||
| 1866 | } | 2006 | } |
| 2007 | return page; | ||
| 2008 | } | ||
| 2009 | EXPORT_SYMBOL(__grab_cache_page); | ||
| 2010 | |||
| 2011 | static ssize_t generic_perform_write_2copy(struct file *file, | ||
| 2012 | struct iov_iter *i, loff_t pos) | ||
| 2013 | { | ||
| 2014 | struct address_space *mapping = file->f_mapping; | ||
| 2015 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2016 | struct inode *inode = mapping->host; | ||
| 2017 | long status = 0; | ||
| 2018 | ssize_t written = 0; | ||
| 1867 | 2019 | ||
| 1868 | do { | 2020 | do { |
| 1869 | unsigned long index; | 2021 | struct page *src_page; |
| 1870 | unsigned long offset; | 2022 | struct page *page; |
| 1871 | size_t copied; | 2023 | pgoff_t index; /* Pagecache index for current page */ |
| 2024 | unsigned long offset; /* Offset into pagecache page */ | ||
| 2025 | unsigned long bytes; /* Bytes to write to page */ | ||
| 2026 | size_t copied; /* Bytes copied from user */ | ||
| 1872 | 2027 | ||
| 1873 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2028 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
| 1874 | index = pos >> PAGE_CACHE_SHIFT; | 2029 | index = pos >> PAGE_CACHE_SHIFT; |
| 1875 | bytes = PAGE_CACHE_SIZE - offset; | 2030 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
| 1876 | 2031 | iov_iter_count(i)); | |
| 1877 | /* Limit the size of the copy to the caller's write size */ | ||
| 1878 | bytes = min(bytes, count); | ||
| 1879 | 2032 | ||
| 1880 | /* We only need to worry about prefaulting when writes are from | 2033 | /* |
| 1881 | * user-space. NFSd uses vfs_writev with several non-aligned | 2034 | * a non-NULL src_page indicates that we're doing the |
| 1882 | * segments in the vector, and limiting to one segment a time is | 2035 | * copy via get_user_pages and kmap. |
| 1883 | * a noticeable performance for re-write | ||
| 1884 | */ | 2036 | */ |
| 1885 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 2037 | src_page = NULL; |
| 1886 | /* | ||
| 1887 | * Limit the size of the copy to that of the current | ||
| 1888 | * segment, because fault_in_pages_readable() doesn't | ||
| 1889 | * know how to walk segments. | ||
| 1890 | */ | ||
| 1891 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
| 1892 | 2038 | ||
| 1893 | /* | 2039 | /* |
| 1894 | * Bring in the user page that we will copy from | 2040 | * Bring in the user page that we will copy from _first_. |
| 1895 | * _first_. Otherwise there's a nasty deadlock on | 2041 | * Otherwise there's a nasty deadlock on copying from the |
| 1896 | * copying from the same page as we're writing to, | 2042 | * same page as we're writing to, without it being marked |
| 1897 | * without it being marked up-to-date. | 2043 | * up-to-date. |
| 1898 | */ | 2044 | * |
| 1899 | fault_in_pages_readable(buf, bytes); | 2045 | * Not only is this an optimisation, but it is also required |
| 2046 | * to check that the address is actually valid, when atomic | ||
| 2047 | * usercopies are used, below. | ||
| 2048 | */ | ||
| 2049 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
| 2050 | status = -EFAULT; | ||
| 2051 | break; | ||
| 1900 | } | 2052 | } |
| 1901 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2053 | |
| 2054 | page = __grab_cache_page(mapping, index); | ||
| 1902 | if (!page) { | 2055 | if (!page) { |
| 1903 | status = -ENOMEM; | 2056 | status = -ENOMEM; |
| 1904 | break; | 2057 | break; |
| 1905 | } | 2058 | } |
| 1906 | 2059 | ||
| 1907 | if (unlikely(bytes == 0)) { | 2060 | /* |
| 1908 | status = 0; | 2061 | * non-uptodate pages cannot cope with short copies, and we |
| 1909 | copied = 0; | 2062 | * cannot take a pagefault with the destination page locked. |
| 1910 | goto zero_length_segment; | 2063 | * So pin the source page to copy it. |
| 1911 | } | 2064 | */ |
| 2065 | if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { | ||
| 2066 | unlock_page(page); | ||
| 1912 | 2067 | ||
| 1913 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2068 | src_page = alloc_page(GFP_KERNEL); |
| 1914 | if (unlikely(status)) { | 2069 | if (!src_page) { |
| 1915 | loff_t isize = i_size_read(inode); | 2070 | page_cache_release(page); |
| 2071 | status = -ENOMEM; | ||
| 2072 | break; | ||
| 2073 | } | ||
| 2074 | |||
| 2075 | /* | ||
| 2076 | * Cannot get_user_pages with a page locked for the | ||
| 2077 | * same reason as we can't take a page fault with a | ||
| 2078 | * page locked (as explained below). | ||
| 2079 | */ | ||
| 2080 | copied = iov_iter_copy_from_user(src_page, i, | ||
| 2081 | offset, bytes); | ||
| 2082 | if (unlikely(copied == 0)) { | ||
| 2083 | status = -EFAULT; | ||
| 2084 | page_cache_release(page); | ||
| 2085 | page_cache_release(src_page); | ||
| 2086 | break; | ||
| 2087 | } | ||
| 2088 | bytes = copied; | ||
| 1916 | 2089 | ||
| 1917 | if (status != AOP_TRUNCATED_PAGE) | 2090 | lock_page(page); |
| 2091 | /* | ||
| 2092 | * Can't handle the page going uptodate here, because | ||
| 2093 | * that means we would use non-atomic usercopies, which | ||
| 2094 | * zero out the tail of the page, which can cause | ||
| 2095 | * zeroes to become transiently visible. We could just | ||
| 2096 | * use a non-zeroing copy, but the APIs aren't too | ||
| 2097 | * consistent. | ||
| 2098 | */ | ||
| 2099 | if (unlikely(!page->mapping || PageUptodate(page))) { | ||
| 1918 | unlock_page(page); | 2100 | unlock_page(page); |
| 1919 | page_cache_release(page); | 2101 | page_cache_release(page); |
| 1920 | if (status == AOP_TRUNCATED_PAGE) | 2102 | page_cache_release(src_page); |
| 1921 | continue; | 2103 | continue; |
| 2104 | } | ||
| 2105 | } | ||
| 2106 | |||
| 2107 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | ||
| 2108 | if (unlikely(status)) | ||
| 2109 | goto fs_write_aop_error; | ||
| 2110 | |||
| 2111 | if (!src_page) { | ||
| 1922 | /* | 2112 | /* |
| 1923 | * prepare_write() may have instantiated a few blocks | 2113 | * Must not enter the pagefault handler here, because |
| 1924 | * outside i_size. Trim these off again. | 2114 | * we hold the page lock, so we might recursively |
| 2115 | * deadlock on the same lock, or get an ABBA deadlock | ||
| 2116 | * against a different lock, or against the mmap_sem | ||
| 2117 | * (which nests outside the page lock). So increment | ||
| 2118 | * preempt count, and use _atomic usercopies. | ||
| 2119 | * | ||
| 2120 | * The page is uptodate so we are OK to encounter a | ||
| 2121 | * short copy: if unmodified parts of the page are | ||
| 2122 | * marked dirty and written out to disk, it doesn't | ||
| 2123 | * really matter. | ||
| 1925 | */ | 2124 | */ |
| 1926 | if (pos + bytes > isize) | 2125 | pagefault_disable(); |
| 1927 | vmtruncate(inode, isize); | 2126 | copied = iov_iter_copy_from_user_atomic(page, i, |
| 1928 | break; | 2127 | offset, bytes); |
| 2128 | pagefault_enable(); | ||
| 2129 | } else { | ||
| 2130 | void *src, *dst; | ||
| 2131 | src = kmap_atomic(src_page, KM_USER0); | ||
| 2132 | dst = kmap_atomic(page, KM_USER1); | ||
| 2133 | memcpy(dst + offset, src + offset, bytes); | ||
| 2134 | kunmap_atomic(dst, KM_USER1); | ||
| 2135 | kunmap_atomic(src, KM_USER0); | ||
| 2136 | copied = bytes; | ||
| 1929 | } | 2137 | } |
| 1930 | if (likely(nr_segs == 1)) | ||
| 1931 | copied = filemap_copy_from_user(page, offset, | ||
| 1932 | buf, bytes); | ||
| 1933 | else | ||
| 1934 | copied = filemap_copy_from_user_iovec(page, offset, | ||
| 1935 | cur_iov, iov_base, bytes); | ||
| 1936 | flush_dcache_page(page); | 2138 | flush_dcache_page(page); |
| 2139 | |||
| 1937 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 2140 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
| 1938 | if (status == AOP_TRUNCATED_PAGE) { | 2141 | if (unlikely(status < 0)) |
| 1939 | page_cache_release(page); | 2142 | goto fs_write_aop_error; |
| 1940 | continue; | 2143 | if (unlikely(status > 0)) /* filesystem did partial write */ |
| 1941 | } | 2144 | copied = min_t(size_t, copied, status); |
| 1942 | zero_length_segment: | 2145 | |
| 1943 | if (likely(copied >= 0)) { | ||
| 1944 | if (!status) | ||
| 1945 | status = copied; | ||
| 1946 | |||
| 1947 | if (status >= 0) { | ||
| 1948 | written += status; | ||
| 1949 | count -= status; | ||
| 1950 | pos += status; | ||
| 1951 | buf += status; | ||
| 1952 | if (unlikely(nr_segs > 1)) { | ||
| 1953 | filemap_set_next_iovec(&cur_iov, | ||
| 1954 | &iov_base, status); | ||
| 1955 | if (count) | ||
| 1956 | buf = cur_iov->iov_base + | ||
| 1957 | iov_base; | ||
| 1958 | } else { | ||
| 1959 | iov_base += status; | ||
| 1960 | } | ||
| 1961 | } | ||
| 1962 | } | ||
| 1963 | if (unlikely(copied != bytes)) | ||
| 1964 | if (status >= 0) | ||
| 1965 | status = -EFAULT; | ||
| 1966 | unlock_page(page); | 2146 | unlock_page(page); |
| 1967 | mark_page_accessed(page); | 2147 | mark_page_accessed(page); |
| 1968 | page_cache_release(page); | 2148 | page_cache_release(page); |
| 1969 | if (status < 0) | 2149 | if (src_page) |
| 1970 | break; | 2150 | page_cache_release(src_page); |
| 2151 | |||
| 2152 | iov_iter_advance(i, copied); | ||
| 2153 | pos += copied; | ||
| 2154 | written += copied; | ||
| 2155 | |||
| 1971 | balance_dirty_pages_ratelimited(mapping); | 2156 | balance_dirty_pages_ratelimited(mapping); |
| 1972 | cond_resched(); | 2157 | cond_resched(); |
| 1973 | } while (count); | 2158 | continue; |
| 1974 | *ppos = pos; | ||
| 1975 | 2159 | ||
| 1976 | if (cached_page) | 2160 | fs_write_aop_error: |
| 1977 | page_cache_release(cached_page); | 2161 | unlock_page(page); |
| 2162 | page_cache_release(page); | ||
| 2163 | if (src_page) | ||
| 2164 | page_cache_release(src_page); | ||
| 2165 | |||
| 2166 | /* | ||
| 2167 | * prepare_write() may have instantiated a few blocks | ||
| 2168 | * outside i_size. Trim these off again. Don't need | ||
| 2169 | * i_size_read because we hold i_mutex. | ||
| 2170 | */ | ||
| 2171 | if (pos + bytes > inode->i_size) | ||
| 2172 | vmtruncate(inode, inode->i_size); | ||
| 2173 | break; | ||
| 2174 | } while (iov_iter_count(i)); | ||
| 2175 | |||
| 2176 | return written ? written : status; | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | static ssize_t generic_perform_write(struct file *file, | ||
| 2180 | struct iov_iter *i, loff_t pos) | ||
| 2181 | { | ||
| 2182 | struct address_space *mapping = file->f_mapping; | ||
| 2183 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2184 | long status = 0; | ||
| 2185 | ssize_t written = 0; | ||
| 2186 | unsigned int flags = 0; | ||
| 1978 | 2187 | ||
| 1979 | /* | 2188 | /* |
| 1980 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | 2189 | * Copies from kernel address space cannot fail (NFSD is a big user). |
| 1981 | */ | 2190 | */ |
| 2191 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
| 2192 | flags |= AOP_FLAG_UNINTERRUPTIBLE; | ||
| 2193 | |||
| 2194 | do { | ||
| 2195 | struct page *page; | ||
| 2196 | pgoff_t index; /* Pagecache index for current page */ | ||
| 2197 | unsigned long offset; /* Offset into pagecache page */ | ||
| 2198 | unsigned long bytes; /* Bytes to write to page */ | ||
| 2199 | size_t copied; /* Bytes copied from user */ | ||
| 2200 | void *fsdata; | ||
| 2201 | |||
| 2202 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | ||
| 2203 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 2204 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
| 2205 | iov_iter_count(i)); | ||
| 2206 | |||
| 2207 | again: | ||
| 2208 | |||
| 2209 | /* | ||
| 2210 | * Bring in the user page that we will copy from _first_. | ||
| 2211 | * Otherwise there's a nasty deadlock on copying from the | ||
| 2212 | * same page as we're writing to, without it being marked | ||
| 2213 | * up-to-date. | ||
| 2214 | * | ||
| 2215 | * Not only is this an optimisation, but it is also required | ||
| 2216 | * to check that the address is actually valid, when atomic | ||
| 2217 | * usercopies are used, below. | ||
| 2218 | */ | ||
| 2219 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
| 2220 | status = -EFAULT; | ||
| 2221 | break; | ||
| 2222 | } | ||
| 2223 | |||
| 2224 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | ||
| 2225 | &page, &fsdata); | ||
| 2226 | if (unlikely(status)) | ||
| 2227 | break; | ||
| 2228 | |||
| 2229 | pagefault_disable(); | ||
| 2230 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | ||
| 2231 | pagefault_enable(); | ||
| 2232 | flush_dcache_page(page); | ||
| 2233 | |||
| 2234 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | ||
| 2235 | page, fsdata); | ||
| 2236 | if (unlikely(status < 0)) | ||
| 2237 | break; | ||
| 2238 | copied = status; | ||
| 2239 | |||
| 2240 | cond_resched(); | ||
| 2241 | |||
| 2242 | if (unlikely(copied == 0)) { | ||
| 2243 | /* | ||
| 2244 | * If we were unable to copy any data at all, we must | ||
| 2245 | * fall back to a single segment length write. | ||
| 2246 | * | ||
| 2247 | * If we didn't fallback here, we could livelock | ||
| 2248 | * because not all segments in the iov can be copied at | ||
| 2249 | * once without a pagefault. | ||
| 2250 | */ | ||
| 2251 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
| 2252 | iov_iter_single_seg_count(i)); | ||
| 2253 | goto again; | ||
| 2254 | } | ||
| 2255 | iov_iter_advance(i, copied); | ||
| 2256 | pos += copied; | ||
| 2257 | written += copied; | ||
| 2258 | |||
| 2259 | balance_dirty_pages_ratelimited(mapping); | ||
| 2260 | |||
| 2261 | } while (iov_iter_count(i)); | ||
| 2262 | |||
| 2263 | return written ? written : status; | ||
| 2264 | } | ||
| 2265 | |||
| 2266 | ssize_t | ||
| 2267 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 2268 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
| 2269 | size_t count, ssize_t written) | ||
| 2270 | { | ||
| 2271 | struct file *file = iocb->ki_filp; | ||
| 2272 | struct address_space *mapping = file->f_mapping; | ||
| 2273 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2274 | struct inode *inode = mapping->host; | ||
| 2275 | ssize_t status; | ||
| 2276 | struct iov_iter i; | ||
| 2277 | |||
| 2278 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
| 2279 | if (a_ops->write_begin) | ||
| 2280 | status = generic_perform_write(file, &i, pos); | ||
| 2281 | else | ||
| 2282 | status = generic_perform_write_2copy(file, &i, pos); | ||
| 2283 | |||
| 1982 | if (likely(status >= 0)) { | 2284 | if (likely(status >= 0)) { |
| 2285 | written += status; | ||
| 2286 | *ppos = pos + status; | ||
| 2287 | |||
| 2288 | /* | ||
| 2289 | * For now, when the user asks for O_SYNC, we'll actually give | ||
| 2290 | * O_DSYNC | ||
| 2291 | */ | ||
| 1983 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2292 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 1984 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | 2293 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) |
| 1985 | status = generic_osync_inode(inode, mapping, | 2294 | status = generic_osync_inode(inode, mapping, |
| @@ -1995,7 +2304,6 @@ zero_length_segment: | |||
| 1995 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2304 | if (unlikely(file->f_flags & O_DIRECT) && written) |
| 1996 | status = filemap_write_and_wait(mapping); | 2305 | status = filemap_write_and_wait(mapping); |
| 1997 | 2306 | ||
| 1998 | pagevec_lru_add(&lru_pvec); | ||
| 1999 | return written ? written : status; | 2307 | return written ? written : status; |
| 2000 | } | 2308 | } |
| 2001 | EXPORT_SYMBOL(generic_file_buffered_write); | 2309 | EXPORT_SYMBOL(generic_file_buffered_write); |
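Tying the filemap.c changes together: generic_perform_write() is used only when the filesystem provides ->write_begin/->write_end; otherwise generic_perform_write_2copy() keeps driving the legacy prepare_write/commit_write pair. As a rough sketch of what the new side of that interface can look like for a hypothetical filesystem whose pages need no block mapping, with invented simple_fs_ names and with partial-write and flag handling deliberately omitted:

/* Assumes <linux/fs.h>, <linux/pagemap.h>, <linux/highmem.h>. */
static int simple_fs_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	page = __grab_cache_page(mapping, index);
	if (!page)
		return -ENOMEM;
	if (!PageUptodate(page)) {
		/* no backing store to read from, so just zero-fill */
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}
	*pagep = page;
	return 0;
}

static int simple_fs_write_end(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	set_page_dirty(page);
	/* i_mutex is held by the caller, so a bare i_size update is safe */
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	unlock_page(page);
	page_cache_release(page);
	return copied;
}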
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
| @@ -1,103 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/filemap.h | ||
| 3 | * | ||
| 4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef __FILEMAP_H | ||
| 8 | #define __FILEMAP_H | ||
| 9 | |||
| 10 | #include <linux/types.h> | ||
| 11 | #include <linux/fs.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/highmem.h> | ||
| 14 | #include <linux/uio.h> | ||
| 15 | #include <linux/uaccess.h> | ||
| 16 | |||
| 17 | size_t | ||
| 18 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
| 19 | const struct iovec *iov, | ||
| 20 | size_t base, | ||
| 21 | size_t bytes); | ||
| 22 | |||
| 23 | /* | ||
| 24 | * Copy as much as we can into the page and return the number of bytes which | ||
| 25 | * were sucessfully copied. If a fault is encountered then clear the page | ||
| 26 | * out to (offset+bytes) and return the number of bytes which were copied. | ||
| 27 | * | ||
| 28 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
| 29 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
| 30 | * and if the following non-atomic copy succeeds, then there is a small window | ||
| 31 | * where the target page contains neither the data before the write, nor the | ||
| 32 | * data after the write (it contains zero). A read at this time will see | ||
| 33 | * data that is inconsistent with any ordering of the read and the write. | ||
| 34 | * (This has been detected in practice). | ||
| 35 | */ | ||
| 36 | static inline size_t | ||
| 37 | filemap_copy_from_user(struct page *page, unsigned long offset, | ||
| 38 | const char __user *buf, unsigned bytes) | ||
| 39 | { | ||
| 40 | char *kaddr; | ||
| 41 | int left; | ||
| 42 | |||
| 43 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 44 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); | ||
| 45 | kunmap_atomic(kaddr, KM_USER0); | ||
| 46 | |||
| 47 | if (left != 0) { | ||
| 48 | /* Do it the slow way */ | ||
| 49 | kaddr = kmap(page); | ||
| 50 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
| 51 | kunmap(page); | ||
| 52 | } | ||
| 53 | return bytes - left; | ||
| 54 | } | ||
| 55 | |||
| 56 | /* | ||
| 57 | * This has the same sideeffects and return value as filemap_copy_from_user(). | ||
| 58 | * The difference is that on a fault we need to memset the remainder of the | ||
| 59 | * page (out to offset+bytes), to emulate filemap_copy_from_user()'s | ||
| 60 | * single-segment behaviour. | ||
| 61 | */ | ||
| 62 | static inline size_t | ||
| 63 | filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | ||
| 64 | const struct iovec *iov, size_t base, size_t bytes) | ||
| 65 | { | ||
| 66 | char *kaddr; | ||
| 67 | size_t copied; | ||
| 68 | |||
| 69 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 70 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
| 71 | base, bytes); | ||
| 72 | kunmap_atomic(kaddr, KM_USER0); | ||
| 73 | if (copied != bytes) { | ||
| 74 | kaddr = kmap(page); | ||
| 75 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
| 76 | base, bytes); | ||
| 77 | if (bytes - copied) | ||
| 78 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
| 79 | kunmap(page); | ||
| 80 | } | ||
| 81 | return copied; | ||
| 82 | } | ||
| 83 | |||
| 84 | static inline void | ||
| 85 | filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
| 86 | { | ||
| 87 | const struct iovec *iov = *iovp; | ||
| 88 | size_t base = *basep; | ||
| 89 | |||
| 90 | do { | ||
| 91 | int copy = min(bytes, iov->iov_len - base); | ||
| 92 | |||
| 93 | bytes -= copy; | ||
| 94 | base += copy; | ||
| 95 | if (iov->iov_len == base) { | ||
| 96 | iov++; | ||
| 97 | base = 0; | ||
| 98 | } | ||
| 99 | } while (bytes); | ||
| 100 | *iovp = iov; | ||
| 101 | *basep = base; | ||
| 102 | } | ||
| 103 | #endif | ||
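The helpers deleted above follow two small patterns that recur in the replacement code in filemap.c: an atomic copy with a sleeping fallback, and an (iov, base) cursor advanced across an iovec array. The sketch below is a minimal userspace model of the second pattern only; it is not kernel code, and advance_iovec() is a made-up name, but the loop mirrors filemap_set_next_iovec().

/* Standalone model of the (iov, base) cursor advance; not kernel code. */
#include <stdio.h>
#include <stddef.h>
#include <sys/uio.h>

/* Advance the cursor by 'bytes' across the iovec array: consume the
 * remainder of the current segment first, then step to the next one.
 * Callers must not advance past the total length of the array.
 */
static void advance_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	while (bytes) {
		size_t copy = iov->iov_len - base;

		if (copy > bytes)
			copy = bytes;
		bytes -= copy;
		base += copy;
		if (base == iov->iov_len) {
			iov++;
			base = 0;
		}
	}
	*iovp = iov;
	*basep = base;
}

int main(void)
{
	char a[10], b[20];
	struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	const struct iovec *cur = vec;
	size_t base = 0;

	advance_iovec(&cur, &base, 15);	/* eats all of a[] and 5 bytes of b[] */
	printf("segment %ld, offset %zu\n", (long)(cur - vec), base);
	return 0;
}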
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 53ee6a299635..32132f3cd641 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
| 18 | #include "filemap.h" | ||
| 19 | 18 | ||
| 20 | /* | 19 | /* |
| 21 | * We do use our own empty page to avoid interference with other users | 20 | * We do use our own empty page to avoid interference with other users |
| @@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 288 | unsigned long index; | 287 | unsigned long index; |
| 289 | unsigned long offset; | 288 | unsigned long offset; |
| 290 | size_t copied; | 289 | size_t copied; |
| 290 | char *kaddr; | ||
| 291 | 291 | ||
| 292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
| 293 | index = pos >> PAGE_CACHE_SHIFT; | 293 | index = pos >> PAGE_CACHE_SHIFT; |
| @@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 295 | if (bytes > count) | 295 | if (bytes > count) |
| 296 | bytes = count; | 296 | bytes = count; |
| 297 | 297 | ||
| 298 | /* | ||
| 299 | * Bring in the user page that we will copy from _first_. | ||
| 300 | * Otherwise there's a nasty deadlock on copying from the | ||
| 301 | * same page as we're writing to, without it being marked | ||
| 302 | * up-to-date. | ||
| 303 | */ | ||
| 304 | fault_in_pages_readable(buf, bytes); | ||
| 305 | |||
| 306 | page = a_ops->get_xip_page(mapping, | 298 | page = a_ops->get_xip_page(mapping, |
| 307 | index*(PAGE_SIZE/512), 0); | 299 | index*(PAGE_SIZE/512), 0); |
| 308 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { | 300 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { |
| @@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 319 | break; | 311 | break; |
| 320 | } | 312 | } |
| 321 | 313 | ||
| 322 | copied = filemap_copy_from_user(page, offset, buf, bytes); | 314 | fault_in_pages_readable(buf, bytes); |
| 315 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 316 | copied = bytes - | ||
| 317 | __copy_from_user_inatomic_nocache(kaddr, buf, bytes); | ||
| 318 | kunmap_atomic(kaddr, KM_USER0); | ||
| 323 | flush_dcache_page(page); | 319 | flush_dcache_page(page); |
| 320 | |||
| 324 | if (likely(copied > 0)) { | 321 | if (likely(copied > 0)) { |
| 325 | status = copied; | 322 | status = copied; |
| 326 | 323 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eab8c428cc93..ae2959bb59cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -23,12 +23,16 @@ | |||
| 23 | 23 | ||
| 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
| 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
| 26 | static unsigned long surplus_huge_pages; | ||
| 26 | unsigned long max_huge_pages; | 27 | unsigned long max_huge_pages; |
| 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 28 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
| 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
| 29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 30 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
| 31 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
| 30 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 32 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
| 31 | unsigned long hugepages_treat_as_movable; | 33 | unsigned long hugepages_treat_as_movable; |
| 34 | int hugetlb_dynamic_pool; | ||
| 35 | static int hugetlb_next_nid; | ||
| 32 | 36 | ||
| 33 | /* | 37 | /* |
| 34 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 38 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
| @@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
| 85 | list_del(&page->lru); | 89 | list_del(&page->lru); |
| 86 | free_huge_pages--; | 90 | free_huge_pages--; |
| 87 | free_huge_pages_node[nid]--; | 91 | free_huge_pages_node[nid]--; |
| 92 | if (vma && vma->vm_flags & VM_MAYSHARE) | ||
| 93 | resv_huge_pages--; | ||
| 88 | break; | 94 | break; |
| 89 | } | 95 | } |
| 90 | } | 96 | } |
| @@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
| 92 | return page; | 98 | return page; |
| 93 | } | 99 | } |
| 94 | 100 | ||
| 101 | static void update_and_free_page(struct page *page) | ||
| 102 | { | ||
| 103 | int i; | ||
| 104 | nr_huge_pages--; | ||
| 105 | nr_huge_pages_node[page_to_nid(page)]--; | ||
| 106 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
| 107 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
| 108 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
| 109 | 1 << PG_private | 1<< PG_writeback); | ||
| 110 | } | ||
| 111 | set_compound_page_dtor(page, NULL); | ||
| 112 | set_page_refcounted(page); | ||
| 113 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
| 114 | } | ||
| 115 | |||
| 95 | static void free_huge_page(struct page *page) | 116 | static void free_huge_page(struct page *page) |
| 96 | { | 117 | { |
| 97 | BUG_ON(page_count(page)); | 118 | int nid = page_to_nid(page); |
| 98 | 119 | ||
| 120 | BUG_ON(page_count(page)); | ||
| 99 | INIT_LIST_HEAD(&page->lru); | 121 | INIT_LIST_HEAD(&page->lru); |
| 100 | 122 | ||
| 101 | spin_lock(&hugetlb_lock); | 123 | spin_lock(&hugetlb_lock); |
| 102 | enqueue_huge_page(page); | 124 | if (surplus_huge_pages_node[nid]) { |
| 125 | update_and_free_page(page); | ||
| 126 | surplus_huge_pages--; | ||
| 127 | surplus_huge_pages_node[nid]--; | ||
| 128 | } else { | ||
| 129 | enqueue_huge_page(page); | ||
| 130 | } | ||
| 103 | spin_unlock(&hugetlb_lock); | 131 | spin_unlock(&hugetlb_lock); |
| 104 | } | 132 | } |
| 105 | 133 | ||
| 106 | static int alloc_fresh_huge_page(void) | 134 | /* |
| 135 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
| 136 | * balanced by operating on them in a round-robin fashion. | ||
| 137 | * Returns 1 if an adjustment was made. | ||
| 138 | */ | ||
| 139 | static int adjust_pool_surplus(int delta) | ||
| 107 | { | 140 | { |
| 108 | static int prev_nid; | 141 | static int prev_nid; |
| 109 | struct page *page; | 142 | int nid = prev_nid; |
| 110 | int nid; | 143 | int ret = 0; |
| 144 | |||
| 145 | VM_BUG_ON(delta != -1 && delta != 1); | ||
| 146 | do { | ||
| 147 | nid = next_node(nid, node_online_map); | ||
| 148 | if (nid == MAX_NUMNODES) | ||
| 149 | nid = first_node(node_online_map); | ||
| 150 | |||
| 151 | /* To shrink on this node, there must be a surplus page */ | ||
| 152 | if (delta < 0 && !surplus_huge_pages_node[nid]) | ||
| 153 | continue; | ||
| 154 | /* Surplus cannot exceed the total number of pages */ | ||
| 155 | if (delta > 0 && surplus_huge_pages_node[nid] >= | ||
| 156 | nr_huge_pages_node[nid]) | ||
| 157 | continue; | ||
| 158 | |||
| 159 | surplus_huge_pages += delta; | ||
| 160 | surplus_huge_pages_node[nid] += delta; | ||
| 161 | ret = 1; | ||
| 162 | break; | ||
| 163 | } while (nid != prev_nid); | ||
| 111 | 164 | ||
| 112 | /* | ||
| 113 | * Copy static prev_nid to local nid, work on that, then copy it | ||
| 114 | * back to prev_nid afterwards: otherwise there's a window in which | ||
| 115 | * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
| 116 | * But we don't need to use a spin_lock here: it really doesn't | ||
| 117 | * matter if occasionally a racer chooses the same nid as we do. | ||
| 118 | */ | ||
| 119 | nid = next_node(prev_nid, node_online_map); | ||
| 120 | if (nid == MAX_NUMNODES) | ||
| 121 | nid = first_node(node_online_map); | ||
| 122 | prev_nid = nid; | 165 | prev_nid = nid; |
| 166 | return ret; | ||
| 167 | } | ||
| 168 | |||
| 169 | static struct page *alloc_fresh_huge_page_node(int nid) | ||
| 170 | { | ||
| 171 | struct page *page; | ||
| 123 | 172 | ||
| 124 | page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | 173 | page = alloc_pages_node(nid, |
| 174 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, | ||
| 175 | HUGETLB_PAGE_ORDER); | ||
| 176 | if (page) { | ||
| 177 | set_compound_page_dtor(page, free_huge_page); | ||
| 178 | spin_lock(&hugetlb_lock); | ||
| 179 | nr_huge_pages++; | ||
| 180 | nr_huge_pages_node[nid]++; | ||
| 181 | spin_unlock(&hugetlb_lock); | ||
| 182 | put_page(page); /* free it into the hugepage allocator */ | ||
| 183 | } | ||
| 184 | |||
| 185 | return page; | ||
| 186 | } | ||
| 187 | |||
| 188 | static int alloc_fresh_huge_page(void) | ||
| 189 | { | ||
| 190 | struct page *page; | ||
| 191 | int start_nid; | ||
| 192 | int next_nid; | ||
| 193 | int ret = 0; | ||
| 194 | |||
| 195 | start_nid = hugetlb_next_nid; | ||
| 196 | |||
| 197 | do { | ||
| 198 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | ||
| 199 | if (page) | ||
| 200 | ret = 1; | ||
| 201 | /* | ||
| 202 | * Use a helper variable to find the next node and then | ||
| 203 | * copy it back to hugetlb_next_nid afterwards: | ||
| 204 | * otherwise there's a window in which a racer might | ||
| 205 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
| 206 | * But we don't need to use a spin_lock here: it really | ||
| 207 | * doesn't matter if occasionally a racer chooses the | ||
| 208 | * same nid as we do. Move nid forward in the mask even | ||
| 209 | * if we just successfully allocated a hugepage so that | ||
| 210 | * the next caller gets hugepages on the next node. | ||
| 211 | */ | ||
| 212 | next_nid = next_node(hugetlb_next_nid, node_online_map); | ||
| 213 | if (next_nid == MAX_NUMNODES) | ||
| 214 | next_nid = first_node(node_online_map); | ||
| 215 | hugetlb_next_nid = next_nid; | ||
| 216 | } while (!page && hugetlb_next_nid != start_nid); | ||
| 217 | |||
| 218 | return ret; | ||
| 219 | } | ||
| 220 | |||
| 221 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | ||
| 222 | unsigned long address) | ||
| 223 | { | ||
| 224 | struct page *page; | ||
| 225 | |||
| 226 | /* Check if the dynamic pool is enabled */ | ||
| 227 | if (!hugetlb_dynamic_pool) | ||
| 228 | return NULL; | ||
| 229 | |||
| 230 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | ||
| 125 | HUGETLB_PAGE_ORDER); | 231 | HUGETLB_PAGE_ORDER); |
| 126 | if (page) { | 232 | if (page) { |
| 127 | set_compound_page_dtor(page, free_huge_page); | 233 | set_compound_page_dtor(page, free_huge_page); |
| 128 | spin_lock(&hugetlb_lock); | 234 | spin_lock(&hugetlb_lock); |
| 129 | nr_huge_pages++; | 235 | nr_huge_pages++; |
| 130 | nr_huge_pages_node[page_to_nid(page)]++; | 236 | nr_huge_pages_node[page_to_nid(page)]++; |
| 237 | surplus_huge_pages++; | ||
| 238 | surplus_huge_pages_node[page_to_nid(page)]++; | ||
| 131 | spin_unlock(&hugetlb_lock); | 239 | spin_unlock(&hugetlb_lock); |
| 132 | put_page(page); /* free it into the hugepage allocator */ | ||
| 133 | return 1; | ||
| 134 | } | 240 | } |
| 135 | return 0; | 241 | |
| 242 | return page; | ||
| 243 | } | ||
| 244 | |||
| 245 | /* | ||
| 246 | * Increase the hugetlb pool such that it can accommodate a reservation | ||
| 247 | * of size 'delta'. | ||
| 248 | */ | ||
| 249 | static int gather_surplus_pages(int delta) | ||
| 250 | { | ||
| 251 | struct list_head surplus_list; | ||
| 252 | struct page *page, *tmp; | ||
| 253 | int ret, i; | ||
| 254 | int needed, allocated; | ||
| 255 | |||
| 256 | needed = (resv_huge_pages + delta) - free_huge_pages; | ||
| 257 | if (needed <= 0) | ||
| 258 | return 0; | ||
| 259 | |||
| 260 | allocated = 0; | ||
| 261 | INIT_LIST_HEAD(&surplus_list); | ||
| 262 | |||
| 263 | ret = -ENOMEM; | ||
| 264 | retry: | ||
| 265 | spin_unlock(&hugetlb_lock); | ||
| 266 | for (i = 0; i < needed; i++) { | ||
| 267 | page = alloc_buddy_huge_page(NULL, 0); | ||
| 268 | if (!page) { | ||
| 269 | /* | ||
| 270 | * We were not able to allocate enough pages to | ||
| 271 | * satisfy the entire reservation so we free what | ||
| 272 | * we've allocated so far. | ||
| 273 | */ | ||
| 274 | spin_lock(&hugetlb_lock); | ||
| 275 | needed = 0; | ||
| 276 | goto free; | ||
| 277 | } | ||
| 278 | |||
| 279 | list_add(&page->lru, &surplus_list); | ||
| 280 | } | ||
| 281 | allocated += needed; | ||
| 282 | |||
| 283 | /* | ||
| 284 | * After retaking hugetlb_lock, we need to recalculate 'needed' | ||
| 285 | * because either resv_huge_pages or free_huge_pages may have changed. | ||
| 286 | */ | ||
| 287 | spin_lock(&hugetlb_lock); | ||
| 288 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | ||
| 289 | if (needed > 0) | ||
| 290 | goto retry; | ||
| 291 | |||
| 292 | /* | ||
| 293 | * The surplus_list now contains _at_least_ the number of extra pages | ||
| 294 | * needed to accommodate the reservation. Add the appropriate number | ||
| 295 | * of pages to the hugetlb pool and free the extras back to the buddy | ||
| 296 | * allocator. | ||
| 297 | */ | ||
| 298 | needed += allocated; | ||
| 299 | ret = 0; | ||
| 300 | free: | ||
| 301 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | ||
| 302 | list_del(&page->lru); | ||
| 303 | if ((--needed) >= 0) | ||
| 304 | enqueue_huge_page(page); | ||
| 305 | else { | ||
| 306 | /* | ||
| 307 | * Decrement the refcount and free the page using its | ||
| 308 | * destructor. This must be done with hugetlb_lock | ||
| 309 | * unlocked which is safe because free_huge_page takes | ||
| 310 | * hugetlb_lock before deciding how to free the page. | ||
| 311 | */ | ||
| 312 | spin_unlock(&hugetlb_lock); | ||
| 313 | put_page(page); | ||
| 314 | spin_lock(&hugetlb_lock); | ||
| 315 | } | ||
| 316 | } | ||
| 317 | |||
| 318 | return ret; | ||
| 319 | } | ||
| 320 | |||
| 321 | /* | ||
| 322 | * When releasing a hugetlb pool reservation, any surplus pages that were | ||
| 323 | * allocated to satisfy the reservation must be explicitly freed if they were | ||
| 324 | * never used. | ||
| 325 | */ | ||
| 326 | void return_unused_surplus_pages(unsigned long unused_resv_pages) | ||
| 327 | { | ||
| 328 | static int nid = -1; | ||
| 329 | struct page *page; | ||
| 330 | unsigned long nr_pages; | ||
| 331 | |||
| 332 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | ||
| 333 | |||
| 334 | while (nr_pages) { | ||
| 335 | nid = next_node(nid, node_online_map); | ||
| 336 | if (nid == MAX_NUMNODES) | ||
| 337 | nid = first_node(node_online_map); | ||
| 338 | |||
| 339 | if (!surplus_huge_pages_node[nid]) | ||
| 340 | continue; | ||
| 341 | |||
| 342 | if (!list_empty(&hugepage_freelists[nid])) { | ||
| 343 | page = list_entry(hugepage_freelists[nid].next, | ||
| 344 | struct page, lru); | ||
| 345 | list_del(&page->lru); | ||
| 346 | update_and_free_page(page); | ||
| 347 | free_huge_pages--; | ||
| 348 | free_huge_pages_node[nid]--; | ||
| 349 | surplus_huge_pages--; | ||
| 350 | surplus_huge_pages_node[nid]--; | ||
| 351 | nr_pages--; | ||
| 352 | } | ||
| 353 | } | ||
| 136 | } | 354 | } |
| 137 | 355 | ||
| 138 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 356 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
| 139 | unsigned long addr) | 357 | unsigned long addr) |
| 140 | { | 358 | { |
| 141 | struct page *page; | 359 | struct page *page = NULL; |
| 360 | int use_reserved_page = vma->vm_flags & VM_MAYSHARE; | ||
| 142 | 361 | ||
| 143 | spin_lock(&hugetlb_lock); | 362 | spin_lock(&hugetlb_lock); |
| 144 | if (vma->vm_flags & VM_MAYSHARE) | 363 | if (!use_reserved_page && (free_huge_pages <= resv_huge_pages)) |
| 145 | resv_huge_pages--; | ||
| 146 | else if (free_huge_pages <= resv_huge_pages) | ||
| 147 | goto fail; | 364 | goto fail; |
| 148 | 365 | ||
| 149 | page = dequeue_huge_page(vma, addr); | 366 | page = dequeue_huge_page(vma, addr); |
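Both adjust_pool_surplus() and alloc_fresh_huge_page() in the hunk above walk the online nodes round-robin, remembering where the previous walk stopped so successive allocations are spread across nodes. The following is a userspace model of that wrap-around walk; a fixed array stands in for node_online_map, and all names here are illustrative rather than kernel APIs.

/* Round-robin walk over "online nodes", modelled with a plain array. */
#include <stdio.h>

#define NR_NODES 4
static int node_online[NR_NODES] = { 1, 0, 1, 1 };	/* node 1 is offline */
static int next_nid;					/* like hugetlb_next_nid */

/* Return the next online node after 'nid', wrapping at the end. */
static int next_online_node(int nid)
{
	do {
		nid = (nid + 1) % NR_NODES;
	} while (!node_online[nid]);
	return nid;
}

/* Pick a node for this allocation, then advance the cursor so the
 * following caller is steered to a different node (as in
 * alloc_fresh_huge_page()).
 */
static int pick_node(void)
{
	int nid = next_nid;

	next_nid = next_online_node(nid);
	return nid;
}

int main(void)
{
	int i;

	next_nid = 0;	/* equivalent of first_node(node_online_map) */
	for (i = 0; i < 6; i++)
		printf("allocation %d goes to node %d\n", i, pick_node());
	return 0;
}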
| @@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 155 | return page; | 372 | return page; |
| 156 | 373 | ||
| 157 | fail: | 374 | fail: |
| 158 | if (vma->vm_flags & VM_MAYSHARE) | ||
| 159 | resv_huge_pages++; | ||
| 160 | spin_unlock(&hugetlb_lock); | 375 | spin_unlock(&hugetlb_lock); |
| 161 | return NULL; | 376 | |
| 377 | /* | ||
| 378 | * Private mappings do not use reserved huge pages so the allocation | ||
| 379 | * may have failed due to an undersized hugetlb pool. Try to grab a | ||
| 380 | * surplus huge page from the buddy allocator. | ||
| 381 | */ | ||
| 382 | if (!use_reserved_page) | ||
| 383 | page = alloc_buddy_huge_page(vma, addr); | ||
| 384 | |||
| 385 | return page; | ||
| 162 | } | 386 | } |
| 163 | 387 | ||
| 164 | static int __init hugetlb_init(void) | 388 | static int __init hugetlb_init(void) |
| @@ -171,6 +395,8 @@ static int __init hugetlb_init(void) | |||
| 171 | for (i = 0; i < MAX_NUMNODES; ++i) | 395 | for (i = 0; i < MAX_NUMNODES; ++i) |
| 172 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 396 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
| 173 | 397 | ||
| 398 | hugetlb_next_nid = first_node(node_online_map); | ||
| 399 | |||
| 174 | for (i = 0; i < max_huge_pages; ++i) { | 400 | for (i = 0; i < max_huge_pages; ++i) { |
| 175 | if (!alloc_fresh_huge_page()) | 401 | if (!alloc_fresh_huge_page()) |
| 176 | break; | 402 | break; |
| @@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
| 201 | } | 427 | } |
| 202 | 428 | ||
| 203 | #ifdef CONFIG_SYSCTL | 429 | #ifdef CONFIG_SYSCTL |
| 204 | static void update_and_free_page(struct page *page) | ||
| 205 | { | ||
| 206 | int i; | ||
| 207 | nr_huge_pages--; | ||
| 208 | nr_huge_pages_node[page_to_nid(page)]--; | ||
| 209 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
| 210 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
| 211 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
| 212 | 1 << PG_private | 1<< PG_writeback); | ||
| 213 | } | ||
| 214 | set_compound_page_dtor(page, NULL); | ||
| 215 | set_page_refcounted(page); | ||
| 216 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
| 217 | } | ||
| 218 | |||
| 219 | #ifdef CONFIG_HIGHMEM | 430 | #ifdef CONFIG_HIGHMEM |
| 220 | static void try_to_free_low(unsigned long count) | 431 | static void try_to_free_low(unsigned long count) |
| 221 | { | 432 | { |
| @@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count) | |||
| 224 | for (i = 0; i < MAX_NUMNODES; ++i) { | 435 | for (i = 0; i < MAX_NUMNODES; ++i) { |
| 225 | struct page *page, *next; | 436 | struct page *page, *next; |
| 226 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 437 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { |
| 438 | if (count >= nr_huge_pages) | ||
| 439 | return; | ||
| 227 | if (PageHighMem(page)) | 440 | if (PageHighMem(page)) |
| 228 | continue; | 441 | continue; |
| 229 | list_del(&page->lru); | 442 | list_del(&page->lru); |
| 230 | update_and_free_page(page); | 443 | update_and_free_page(page); |
| 231 | free_huge_pages--; | 444 | free_huge_pages--; |
| 232 | free_huge_pages_node[page_to_nid(page)]--; | 445 | free_huge_pages_node[page_to_nid(page)]--; |
| 233 | if (count >= nr_huge_pages) | ||
| 234 | return; | ||
| 235 | } | 446 | } |
| 236 | } | 447 | } |
| 237 | } | 448 | } |
| @@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count) | |||
| 241 | } | 452 | } |
| 242 | #endif | 453 | #endif |
| 243 | 454 | ||
| 455 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | ||
| 244 | static unsigned long set_max_huge_pages(unsigned long count) | 456 | static unsigned long set_max_huge_pages(unsigned long count) |
| 245 | { | 457 | { |
| 246 | while (count > nr_huge_pages) { | 458 | unsigned long min_count, ret; |
| 247 | if (!alloc_fresh_huge_page()) | ||
| 248 | return nr_huge_pages; | ||
| 249 | } | ||
| 250 | if (count >= nr_huge_pages) | ||
| 251 | return nr_huge_pages; | ||
| 252 | 459 | ||
| 460 | /* | ||
| 461 | * Increase the pool size | ||
| 462 | * First take pages out of surplus state. Then make up the | ||
| 463 | * remaining difference by allocating fresh huge pages. | ||
| 464 | */ | ||
| 253 | spin_lock(&hugetlb_lock); | 465 | spin_lock(&hugetlb_lock); |
| 254 | count = max(count, resv_huge_pages); | 466 | while (surplus_huge_pages && count > persistent_huge_pages) { |
| 255 | try_to_free_low(count); | 467 | if (!adjust_pool_surplus(-1)) |
| 256 | while (count < nr_huge_pages) { | 468 | break; |
| 469 | } | ||
| 470 | |||
| 471 | while (count > persistent_huge_pages) { | ||
| 472 | int ret; | ||
| 473 | /* | ||
| 474 | * If this allocation races such that we no longer need the | ||
| 475 | * page, free_huge_page will handle it by freeing the page | ||
| 476 | * and reducing the surplus. | ||
| 477 | */ | ||
| 478 | spin_unlock(&hugetlb_lock); | ||
| 479 | ret = alloc_fresh_huge_page(); | ||
| 480 | spin_lock(&hugetlb_lock); | ||
| 481 | if (!ret) | ||
| 482 | goto out; | ||
| 483 | |||
| 484 | } | ||
| 485 | |||
| 486 | /* | ||
| 487 | * Decrease the pool size | ||
| 488 | * First return free pages to the buddy allocator (being careful | ||
| 489 | * to keep enough around to satisfy reservations). Then place | ||
| 490 | * pages into surplus state as needed so the pool will shrink | ||
| 491 | * to the desired size as pages become free. | ||
| 492 | */ | ||
| 493 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | ||
| 494 | min_count = max(count, min_count); | ||
| 495 | try_to_free_low(min_count); | ||
| 496 | while (min_count < persistent_huge_pages) { | ||
| 257 | struct page *page = dequeue_huge_page(NULL, 0); | 497 | struct page *page = dequeue_huge_page(NULL, 0); |
| 258 | if (!page) | 498 | if (!page) |
| 259 | break; | 499 | break; |
| 260 | update_and_free_page(page); | 500 | update_and_free_page(page); |
| 261 | } | 501 | } |
| 502 | while (count < persistent_huge_pages) { | ||
| 503 | if (!adjust_pool_surplus(1)) | ||
| 504 | break; | ||
| 505 | } | ||
| 506 | out: | ||
| 507 | ret = persistent_huge_pages; | ||
| 262 | spin_unlock(&hugetlb_lock); | 508 | spin_unlock(&hugetlb_lock); |
| 263 | return nr_huge_pages; | 509 | return ret; |
| 264 | } | 510 | } |
| 265 | 511 | ||
| 266 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 512 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
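The rewritten set_max_huge_pages() above grows the pool by first converting surplus pages into persistent ones and only then allocating fresh huge pages, and shrinks it by freeing unused pages down to what reservations require before marking the remainder surplus. The toy counter model below reproduces just that ordering; the variable names mirror the kernel counters but nothing here is kernel code, and locking and allocation failures are ignored.

/* Toy counter model of the resize ordering in set_max_huge_pages(). */
#include <stdio.h>

static long nr_huge, surplus, free_huge, resv;

#define persistent() (nr_huge - surplus)

static void set_max(long count)
{
	long min_count;

	/* Grow: absorb surplus pages first, then "allocate" fresh ones. */
	while (surplus && count > persistent())
		surplus--;			/* adjust_pool_surplus(-1) */
	while (count > persistent()) {
		nr_huge++;			/* alloc_fresh_huge_page() */
		free_huge++;
	}

	/* Shrink: free unused pages, but never below what reservations
	 * and in-use pages require; mark the rest surplus so they go
	 * away as they become free.
	 */
	min_count = resv + nr_huge - free_huge;
	if (min_count < count)
		min_count = count;
	while (min_count < persistent() && free_huge) {
		nr_huge--;			/* update_and_free_page() */
		free_huge--;
	}
	while (count < persistent())
		surplus++;			/* adjust_pool_surplus(1) */
}

int main(void)
{
	nr_huge = 10; free_huge = 4; resv = 2; surplus = 1;
	set_max(6);
	printf("nr=%ld free=%ld surplus=%ld persistent=%ld\n",
	       nr_huge, free_huge, surplus, persistent());
	return 0;
}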
| @@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf) | |||
| 292 | "HugePages_Total: %5lu\n" | 538 | "HugePages_Total: %5lu\n" |
| 293 | "HugePages_Free: %5lu\n" | 539 | "HugePages_Free: %5lu\n" |
| 294 | "HugePages_Rsvd: %5lu\n" | 540 | "HugePages_Rsvd: %5lu\n" |
| 541 | "HugePages_Surp: %5lu\n" | ||
| 295 | "Hugepagesize: %5lu kB\n", | 542 | "Hugepagesize: %5lu kB\n", |
| 296 | nr_huge_pages, | 543 | nr_huge_pages, |
| 297 | free_huge_pages, | 544 | free_huge_pages, |
| 298 | resv_huge_pages, | 545 | resv_huge_pages, |
| 546 | surplus_huge_pages, | ||
| 299 | HPAGE_SIZE/1024); | 547 | HPAGE_SIZE/1024); |
| 300 | } | 548 | } |
| 301 | 549 | ||
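The hunk above adds a HugePages_Surp line to /proc/meminfo. A small userspace reader like the one below will show it on kernels carrying this patch; older kernels simply will not print that line, so the program does not depend on it being present.

/* Print the hugetlb counters from /proc/meminfo, including the new
 * HugePages_Surp field when the kernel provides it.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages_", 10) ||
		    !strncmp(line, "Hugepagesize", 12))
			fputs(line, stdout);
	fclose(f);
	return 0;
}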
| @@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
| 355 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | 603 | entry = pte_mkwrite(pte_mkdirty(*ptep)); |
| 356 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 604 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
| 357 | update_mmu_cache(vma, address, entry); | 605 | update_mmu_cache(vma, address, entry); |
| 358 | lazy_mmu_prot_update(entry); | ||
| 359 | } | 606 | } |
| 360 | } | 607 | } |
| 361 | 608 | ||
| @@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 708 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 955 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
| 709 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 956 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
| 710 | set_huge_pte_at(mm, address, ptep, pte); | 957 | set_huge_pte_at(mm, address, ptep, pte); |
| 711 | lazy_mmu_prot_update(pte); | ||
| 712 | } | 958 | } |
| 713 | } | 959 | } |
| 714 | spin_unlock(&mm->page_table_lock); | 960 | spin_unlock(&mm->page_table_lock); |
| @@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta) | |||
| 843 | int ret = -ENOMEM; | 1089 | int ret = -ENOMEM; |
| 844 | 1090 | ||
| 845 | spin_lock(&hugetlb_lock); | 1091 | spin_lock(&hugetlb_lock); |
| 846 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
| 847 | resv_huge_pages += delta; | ||
| 848 | ret = 0; | ||
| 849 | } | ||
| 850 | spin_unlock(&hugetlb_lock); | ||
| 851 | return ret; | ||
| 852 | } | ||
| 853 | |||
| 854 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
| 855 | { | ||
| 856 | long ret, chg; | ||
| 857 | |||
| 858 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
| 859 | if (chg < 0) | ||
| 860 | return chg; | ||
| 861 | /* | 1092 | /* |
| 862 | * When cpuset is configured, it breaks the strict hugetlb page | 1093 | * When cpuset is configured, it breaks the strict hugetlb page |
| 863 | * reservation as the accounting is done on a global variable. Such | 1094 | * reservation as the accounting is done on a global variable. Such |
| @@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) | |||
| 875 | * a best attempt and hopefully to minimize the impact of changing | 1106 | * a best attempt and hopefully to minimize the impact of changing |
| 876 | * semantics that cpuset has. | 1107 | * semantics that cpuset has. |
| 877 | */ | 1108 | */ |
| 878 | if (chg > cpuset_mems_nr(free_huge_pages_node)) | 1109 | if (delta > 0) { |
| 879 | return -ENOMEM; | 1110 | if (gather_surplus_pages(delta) < 0) |
| 1111 | goto out; | ||
| 1112 | |||
| 1113 | if (delta > cpuset_mems_nr(free_huge_pages_node)) | ||
| 1114 | goto out; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | ret = 0; | ||
| 1118 | resv_huge_pages += delta; | ||
| 1119 | if (delta < 0) | ||
| 1120 | return_unused_surplus_pages((unsigned long) -delta); | ||
| 1121 | |||
| 1122 | out: | ||
| 1123 | spin_unlock(&hugetlb_lock); | ||
| 1124 | return ret; | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
| 1128 | { | ||
| 1129 | long ret, chg; | ||
| 1130 | |||
| 1131 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
| 1132 | if (chg < 0) | ||
| 1133 | return chg; | ||
| 880 | 1134 | ||
| 881 | ret = hugetlb_acct_memory(chg); | 1135 | ret = hugetlb_acct_memory(chg); |
| 882 | if (ret < 0) | 1136 | if (ret < 0) |
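The reservation path reworked above (hugetlb_reserve_pages -> hugetlb_acct_memory, now backed by gather_surplus_pages) runs at mmap() time for shared mappings, not at first fault. The example below exercises it from userspace by mapping a file on hugetlbfs; the /mnt/huge mount point and the 2 MB page size are assumptions for illustration and will differ per system.

/* Reserve huge pages by mmap()ing a file on hugetlbfs. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define LENGTH (2UL * 1024 * 1024)	/* one x86 huge page; adjust per arch */

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* e.g. reservation failed: pool too small */
		return 1;
	}
	p[0] = 1;			/* touch: instantiates the huge page */
	munmap(p, LENGTH);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}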
diff --git a/mm/internal.h b/mm/internal.h index a3110c02aea7..953f941ea867 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -37,4 +37,14 @@ static inline void __put_page(struct page *page) | |||
| 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
| 38 | unsigned int order); | 38 | unsigned int order); |
| 39 | 39 | ||
| 40 | /* | ||
| 41 | * function for dealing with page's order in buddy system. | ||
| 42 | * zone->lock is already acquired when we use these. | ||
| 43 | * So, we don't need atomic page->flags operations here. | ||
| 44 | */ | ||
| 45 | static inline unsigned long page_order(struct page *page) | ||
| 46 | { | ||
| 47 | VM_BUG_ON(!PageBuddy(page)); | ||
| 48 | return page_private(page); | ||
| 49 | } | ||
| 40 | #endif | 50 | #endif |
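page_order() reads the buddy order that the page allocator stores in page_private for free pages; under zone->lock, callers such as the isolation code added elsewhere in this series use it to jump over a whole free block at once. The sketch below is a rough standalone model of that scan using plain arrays in place of struct page; the names are illustrative only.

/* Model of scanning a pfn range where free buddy blocks record their
 * order: a free block of order k lets the scan jump 2^k pages ahead.
 */
#include <stdio.h>

#define NPAGES 32

struct fake_page {
	int is_buddy;	/* stands in for PageBuddy() */
	int order;	/* stands in for page_private() / page_order() */
};

static struct fake_page pages[NPAGES];

static unsigned long count_in_use(unsigned long start, unsigned long end)
{
	unsigned long pfn = start, in_use = 0;

	while (pfn < end) {
		if (pages[pfn].is_buddy) {
			pfn += 1UL << pages[pfn].order;	/* skip free block */
		} else {
			in_use++;
			pfn++;
		}
	}
	return in_use;
}

int main(void)
{
	/* Mark pfns 8..15 as one free order-3 block. */
	pages[8].is_buddy = 1;
	pages[8].order = 3;
	printf("pages in use: %lu\n", count_in_use(0, NPAGES));
	return 0;
}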
diff --git a/mm/memory.c b/mm/memory.c index f82b359b2745..bd16dcaeefb8 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -966,7 +966,7 @@ no_page_table: | |||
| 966 | * has touched so far, we don't want to allocate page tables. | 966 | * has touched so far, we don't want to allocate page tables. |
| 967 | */ | 967 | */ |
| 968 | if (flags & FOLL_ANON) { | 968 | if (flags & FOLL_ANON) { |
| 969 | page = ZERO_PAGE(address); | 969 | page = ZERO_PAGE(0); |
| 970 | if (flags & FOLL_GET) | 970 | if (flags & FOLL_GET) |
| 971 | get_page(page); | 971 | get_page(page); |
| 972 | BUG_ON(flags & FOLL_WRITE); | 972 | BUG_ON(flags & FOLL_WRITE); |
| @@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1111 | } | 1111 | } |
| 1112 | EXPORT_SYMBOL(get_user_pages); | 1112 | EXPORT_SYMBOL(get_user_pages); |
| 1113 | 1113 | ||
| 1114 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
| 1115 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 1116 | { | ||
| 1117 | pte_t *pte; | ||
| 1118 | spinlock_t *ptl; | ||
| 1119 | int err = 0; | ||
| 1120 | |||
| 1121 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | ||
| 1122 | if (!pte) | ||
| 1123 | return -EAGAIN; | ||
| 1124 | arch_enter_lazy_mmu_mode(); | ||
| 1125 | do { | ||
| 1126 | struct page *page = ZERO_PAGE(addr); | ||
| 1127 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | ||
| 1128 | |||
| 1129 | if (unlikely(!pte_none(*pte))) { | ||
| 1130 | err = -EEXIST; | ||
| 1131 | pte++; | ||
| 1132 | break; | ||
| 1133 | } | ||
| 1134 | page_cache_get(page); | ||
| 1135 | page_add_file_rmap(page); | ||
| 1136 | inc_mm_counter(mm, file_rss); | ||
| 1137 | set_pte_at(mm, addr, pte, zero_pte); | ||
| 1138 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 1139 | arch_leave_lazy_mmu_mode(); | ||
| 1140 | pte_unmap_unlock(pte - 1, ptl); | ||
| 1141 | return err; | ||
| 1142 | } | ||
| 1143 | |||
| 1144 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
| 1145 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 1146 | { | ||
| 1147 | pmd_t *pmd; | ||
| 1148 | unsigned long next; | ||
| 1149 | int err; | ||
| 1150 | |||
| 1151 | pmd = pmd_alloc(mm, pud, addr); | ||
| 1152 | if (!pmd) | ||
| 1153 | return -EAGAIN; | ||
| 1154 | do { | ||
| 1155 | next = pmd_addr_end(addr, end); | ||
| 1156 | err = zeromap_pte_range(mm, pmd, addr, next, prot); | ||
| 1157 | if (err) | ||
| 1158 | break; | ||
| 1159 | } while (pmd++, addr = next, addr != end); | ||
| 1160 | return err; | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
| 1164 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 1165 | { | ||
| 1166 | pud_t *pud; | ||
| 1167 | unsigned long next; | ||
| 1168 | int err; | ||
| 1169 | |||
| 1170 | pud = pud_alloc(mm, pgd, addr); | ||
| 1171 | if (!pud) | ||
| 1172 | return -EAGAIN; | ||
| 1173 | do { | ||
| 1174 | next = pud_addr_end(addr, end); | ||
| 1175 | err = zeromap_pmd_range(mm, pud, addr, next, prot); | ||
| 1176 | if (err) | ||
| 1177 | break; | ||
| 1178 | } while (pud++, addr = next, addr != end); | ||
| 1179 | return err; | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | int zeromap_page_range(struct vm_area_struct *vma, | ||
| 1183 | unsigned long addr, unsigned long size, pgprot_t prot) | ||
| 1184 | { | ||
| 1185 | pgd_t *pgd; | ||
| 1186 | unsigned long next; | ||
| 1187 | unsigned long end = addr + size; | ||
| 1188 | struct mm_struct *mm = vma->vm_mm; | ||
| 1189 | int err; | ||
| 1190 | |||
| 1191 | BUG_ON(addr >= end); | ||
| 1192 | pgd = pgd_offset(mm, addr); | ||
| 1193 | flush_cache_range(vma, addr, end); | ||
| 1194 | do { | ||
| 1195 | next = pgd_addr_end(addr, end); | ||
| 1196 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | ||
| 1197 | if (err) | ||
| 1198 | break; | ||
| 1199 | } while (pgd++, addr = next, addr != end); | ||
| 1200 | return err; | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | 1114 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) |
| 1204 | { | 1115 | { |
| 1205 | pgd_t * pgd = pgd_offset(mm, addr); | 1116 | pgd_t * pgd = pgd_offset(mm, addr); |
| @@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1700 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1611 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 1701 | entry = pte_mkyoung(orig_pte); | 1612 | entry = pte_mkyoung(orig_pte); |
| 1702 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1613 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1703 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) { | 1614 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
| 1704 | update_mmu_cache(vma, address, entry); | 1615 | update_mmu_cache(vma, address, entry); |
| 1705 | lazy_mmu_prot_update(entry); | ||
| 1706 | } | ||
| 1707 | ret |= VM_FAULT_WRITE; | 1616 | ret |= VM_FAULT_WRITE; |
| 1708 | goto unlock; | 1617 | goto unlock; |
| 1709 | } | 1618 | } |
| @@ -1717,16 +1626,11 @@ gotten: | |||
| 1717 | 1626 | ||
| 1718 | if (unlikely(anon_vma_prepare(vma))) | 1627 | if (unlikely(anon_vma_prepare(vma))) |
| 1719 | goto oom; | 1628 | goto oom; |
| 1720 | if (old_page == ZERO_PAGE(address)) { | 1629 | VM_BUG_ON(old_page == ZERO_PAGE(0)); |
| 1721 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 1630 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
| 1722 | if (!new_page) | 1631 | if (!new_page) |
| 1723 | goto oom; | 1632 | goto oom; |
| 1724 | } else { | 1633 | cow_user_page(new_page, old_page, address, vma); |
| 1725 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
| 1726 | if (!new_page) | ||
| 1727 | goto oom; | ||
| 1728 | cow_user_page(new_page, old_page, address, vma); | ||
| 1729 | } | ||
| 1730 | 1634 | ||
| 1731 | /* | 1635 | /* |
| 1732 | * Re-check the pte - we dropped the lock | 1636 | * Re-check the pte - we dropped the lock |
| @@ -1744,7 +1648,6 @@ gotten: | |||
| 1744 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1648 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 1745 | entry = mk_pte(new_page, vma->vm_page_prot); | 1649 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 1746 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1650 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1747 | lazy_mmu_prot_update(entry); | ||
| 1748 | /* | 1651 | /* |
| 1749 | * Clear the pte entry and flush it first, before updating the | 1652 | * Clear the pte entry and flush it first, before updating the |
| 1750 | * pte with the new entry. This will avoid a race condition | 1653 | * pte with the new entry. This will avoid a race condition |
| @@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2252 | spinlock_t *ptl; | 2155 | spinlock_t *ptl; |
| 2253 | pte_t entry; | 2156 | pte_t entry; |
| 2254 | 2157 | ||
| 2255 | if (write_access) { | 2158 | /* Allocate our own private page. */ |
| 2256 | /* Allocate our own private page. */ | 2159 | pte_unmap(page_table); |
| 2257 | pte_unmap(page_table); | ||
| 2258 | |||
| 2259 | if (unlikely(anon_vma_prepare(vma))) | ||
| 2260 | goto oom; | ||
| 2261 | page = alloc_zeroed_user_highpage_movable(vma, address); | ||
| 2262 | if (!page) | ||
| 2263 | goto oom; | ||
| 2264 | |||
| 2265 | entry = mk_pte(page, vma->vm_page_prot); | ||
| 2266 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2267 | 2160 | ||
| 2268 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2161 | if (unlikely(anon_vma_prepare(vma))) |
| 2269 | if (!pte_none(*page_table)) | 2162 | goto oom; |
| 2270 | goto release; | 2163 | page = alloc_zeroed_user_highpage_movable(vma, address); |
| 2271 | inc_mm_counter(mm, anon_rss); | 2164 | if (!page) |
| 2272 | lru_cache_add_active(page); | 2165 | goto oom; |
| 2273 | page_add_new_anon_rmap(page, vma, address); | ||
| 2274 | } else { | ||
| 2275 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | ||
| 2276 | page = ZERO_PAGE(address); | ||
| 2277 | page_cache_get(page); | ||
| 2278 | entry = mk_pte(page, vma->vm_page_prot); | ||
| 2279 | 2166 | ||
| 2280 | ptl = pte_lockptr(mm, pmd); | 2167 | entry = mk_pte(page, vma->vm_page_prot); |
| 2281 | spin_lock(ptl); | 2168 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2282 | if (!pte_none(*page_table)) | ||
| 2283 | goto release; | ||
| 2284 | inc_mm_counter(mm, file_rss); | ||
| 2285 | page_add_file_rmap(page); | ||
| 2286 | } | ||
| 2287 | 2169 | ||
| 2170 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2171 | if (!pte_none(*page_table)) | ||
| 2172 | goto release; | ||
| 2173 | inc_mm_counter(mm, anon_rss); | ||
| 2174 | lru_cache_add_active(page); | ||
| 2175 | page_add_new_anon_rmap(page, vma, address); | ||
| 2288 | set_pte_at(mm, address, page_table, entry); | 2176 | set_pte_at(mm, address, page_table, entry); |
| 2289 | 2177 | ||
| 2290 | /* No need to invalidate - it was non-present before */ | 2178 | /* No need to invalidate - it was non-present before */ |
| 2291 | update_mmu_cache(vma, address, entry); | 2179 | update_mmu_cache(vma, address, entry); |
| 2292 | lazy_mmu_prot_update(entry); | ||
| 2293 | unlock: | 2180 | unlock: |
| 2294 | pte_unmap_unlock(page_table, ptl); | 2181 | pte_unmap_unlock(page_table, ptl); |
| 2295 | return 0; | 2182 | return 0; |
| @@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2442 | 2329 | ||
| 2443 | /* no need to invalidate: a not-present page won't be cached */ | 2330 | /* no need to invalidate: a not-present page won't be cached */ |
| 2444 | update_mmu_cache(vma, address, entry); | 2331 | update_mmu_cache(vma, address, entry); |
| 2445 | lazy_mmu_prot_update(entry); | ||
| 2446 | } else { | 2332 | } else { |
| 2447 | if (anon) | 2333 | if (anon) |
| 2448 | page_cache_release(page); | 2334 | page_cache_release(page); |
| @@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2470 | int write_access, pte_t orig_pte) | 2356 | int write_access, pte_t orig_pte) |
| 2471 | { | 2357 | { |
| 2472 | pgoff_t pgoff = (((address & PAGE_MASK) | 2358 | pgoff_t pgoff = (((address & PAGE_MASK) |
| 2473 | - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; | 2359 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
| 2474 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | 2360 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); |
| 2475 | 2361 | ||
| 2476 | pte_unmap(page_table); | 2362 | pte_unmap(page_table); |
| @@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2614 | entry = pte_mkyoung(entry); | 2500 | entry = pte_mkyoung(entry); |
| 2615 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2501 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { |
| 2616 | update_mmu_cache(vma, address, entry); | 2502 | update_mmu_cache(vma, address, entry); |
| 2617 | lazy_mmu_prot_update(entry); | ||
| 2618 | } else { | 2503 | } else { |
| 2619 | /* | 2504 | /* |
| 2620 | * This is needed only for protection faults but the arch code | 2505 | * This is needed only for protection faults but the arch code |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index df9d554bea30..091b9c6c2529 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -23,6 +23,9 @@ | |||
| 23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
| 24 | #include <linux/ioport.h> | 24 | #include <linux/ioport.h> |
| 25 | #include <linux/cpuset.h> | 25 | #include <linux/cpuset.h> |
| 26 | #include <linux/delay.h> | ||
| 27 | #include <linux/migrate.h> | ||
| 28 | #include <linux/page-isolation.h> | ||
| 26 | 29 | ||
| 27 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
| 28 | 31 | ||
| @@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat, | |||
| 161 | pgdat->node_start_pfn; | 164 | pgdat->node_start_pfn; |
| 162 | } | 165 | } |
| 163 | 166 | ||
| 164 | int online_pages(unsigned long pfn, unsigned long nr_pages) | 167 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
| 168 | void *arg) | ||
| 165 | { | 169 | { |
| 166 | unsigned long i; | 170 | unsigned long i; |
| 171 | unsigned long onlined_pages = *(unsigned long *)arg; | ||
| 172 | struct page *page; | ||
| 173 | if (PageReserved(pfn_to_page(start_pfn))) | ||
| 174 | for (i = 0; i < nr_pages; i++) { | ||
| 175 | page = pfn_to_page(start_pfn + i); | ||
| 176 | online_page(page); | ||
| 177 | onlined_pages++; | ||
| 178 | } | ||
| 179 | *(unsigned long *)arg = onlined_pages; | ||
| 180 | return 0; | ||
| 181 | } | ||
| 182 | |||
| 183 | |||
| 184 | int online_pages(unsigned long pfn, unsigned long nr_pages) | ||
| 185 | { | ||
| 167 | unsigned long flags; | 186 | unsigned long flags; |
| 168 | unsigned long onlined_pages = 0; | 187 | unsigned long onlined_pages = 0; |
| 169 | struct resource res; | ||
| 170 | u64 section_end; | ||
| 171 | unsigned long start_pfn; | ||
| 172 | struct zone *zone; | 188 | struct zone *zone; |
| 173 | int need_zonelists_rebuild = 0; | 189 | int need_zonelists_rebuild = 0; |
| 174 | 190 | ||
| @@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 191 | if (!populated_zone(zone)) | 207 | if (!populated_zone(zone)) |
| 192 | need_zonelists_rebuild = 1; | 208 | need_zonelists_rebuild = 1; |
| 193 | 209 | ||
| 194 | res.start = (u64)pfn << PAGE_SHIFT; | 210 | walk_memory_resource(pfn, nr_pages, &onlined_pages, |
| 195 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | 211 | online_pages_range); |
| 196 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | ||
| 197 | section_end = res.end; | ||
| 198 | |||
| 199 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { | ||
| 200 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
| 201 | nr_pages = (unsigned long) | ||
| 202 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
| 203 | |||
| 204 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
| 205 | /* this region's page is not onlined now */ | ||
| 206 | for (i = 0; i < nr_pages; i++) { | ||
| 207 | struct page *page = pfn_to_page(start_pfn + i); | ||
| 208 | online_page(page); | ||
| 209 | onlined_pages++; | ||
| 210 | } | ||
| 211 | } | ||
| 212 | |||
| 213 | res.start = res.end + 1; | ||
| 214 | res.end = section_end; | ||
| 215 | } | ||
| 216 | zone->present_pages += onlined_pages; | 212 | zone->present_pages += onlined_pages; |
| 217 | zone->zone_pgdat->node_present_pages += onlined_pages; | 213 | zone->zone_pgdat->node_present_pages += onlined_pages; |
| 218 | 214 | ||
| 219 | setup_per_zone_pages_min(); | 215 | setup_per_zone_pages_min(); |
| 216 | if (onlined_pages) { | ||
| 217 | kswapd_run(zone_to_nid(zone)); | ||
| 218 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
| 219 | } | ||
| 220 | 220 | ||
| 221 | if (need_zonelists_rebuild) | 221 | if (need_zonelists_rebuild) |
| 222 | build_all_zonelists(); | 222 | build_all_zonelists(); |
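online_pages() now hands a callback and an opaque counter to walk_memory_resource() instead of open-coding the resource walk. The pattern — iterate disjoint ranges and let the callback accumulate into a value passed through a void pointer — is easy to model in plain C; the range table and names below are made up for illustration.

/* Model of the "walk ranges, accumulate through a void *arg" pattern
 * used by online_pages()/walk_memory_resource().
 */
#include <stdio.h>

struct range { unsigned long start, nr; };

typedef int (*range_cb)(unsigned long start, unsigned long nr, void *arg);

static int walk_ranges(const struct range *r, int n, void *arg, range_cb fn)
{
	int i, ret;

	for (i = 0; i < n; i++) {
		ret = fn(r[i].start, r[i].nr, arg);
		if (ret)
			return ret;	/* stop on first error */
	}
	return 0;
}

/* Counterpart of online_pages_range(): bump the counter per page. */
static int count_pages(unsigned long start, unsigned long nr, void *arg)
{
	*(unsigned long *)arg += nr;
	return 0;
}

int main(void)
{
	struct range ram[] = { { 0x1000, 256 }, { 0x2000, 512 } };
	unsigned long onlined = 0;

	walk_ranges(ram, 2, &onlined, count_pages);
	printf("onlined %lu pages\n", onlined);
	return 0;
}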
| @@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size) | |||
| 271 | if (!pgdat) | 271 | if (!pgdat) |
| 272 | return -ENOMEM; | 272 | return -ENOMEM; |
| 273 | new_pgdat = 1; | 273 | new_pgdat = 1; |
| 274 | ret = kswapd_run(nid); | ||
| 275 | if (ret) | ||
| 276 | goto error; | ||
| 277 | } | 274 | } |
| 278 | 275 | ||
| 279 | /* call arch's memory hotadd */ | 276 | /* call arch's memory hotadd */ |
| @@ -308,3 +305,260 @@ error: | |||
| 308 | return ret; | 305 | return ret; |
| 309 | } | 306 | } |
| 310 | EXPORT_SYMBOL_GPL(add_memory); | 307 | EXPORT_SYMBOL_GPL(add_memory); |
| 308 | |||
| 309 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 310 | /* | ||
| 311 | * Confirm that all pages in the range [start, end) belong to the same zone. | ||
| 312 | */ | ||
| 313 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | ||
| 314 | { | ||
| 315 | unsigned long pfn; | ||
| 316 | struct zone *zone = NULL; | ||
| 317 | struct page *page; | ||
| 318 | int i; | ||
| 319 | for (pfn = start_pfn; | ||
| 320 | pfn < end_pfn; | ||
| 321 | pfn += MAX_ORDER_NR_PAGES) { | ||
| 322 | i = 0; | ||
| 323 | /* This is just a CONFIG_HOLES_IN_ZONE check.*/ | ||
| 324 | while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) | ||
| 325 | i++; | ||
| 326 | if (i == MAX_ORDER_NR_PAGES) | ||
| 327 | continue; | ||
| 328 | page = pfn_to_page(pfn + i); | ||
| 329 | if (zone && page_zone(page) != zone) | ||
| 330 | return 0; | ||
| 331 | zone = page_zone(page); | ||
| 332 | } | ||
| 333 | return 1; | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * Scanning pfn is much easier than scanning lru list. | ||
| 338 | * Scan pfns from start to end and find the first LRU page. | ||
| 339 | */ | ||
| 340 | int scan_lru_pages(unsigned long start, unsigned long end) | ||
| 341 | { | ||
| 342 | unsigned long pfn; | ||
| 343 | struct page *page; | ||
| 344 | for (pfn = start; pfn < end; pfn++) { | ||
| 345 | if (pfn_valid(pfn)) { | ||
| 346 | page = pfn_to_page(pfn); | ||
| 347 | if (PageLRU(page)) | ||
| 348 | return pfn; | ||
| 349 | } | ||
| 350 | } | ||
| 351 | return 0; | ||
| 352 | } | ||
| 353 | |||
| 354 | static struct page * | ||
| 355 | hotremove_migrate_alloc(struct page *page, | ||
| 356 | unsigned long private, | ||
| 357 | int **x) | ||
| 358 | { | ||
| 359 | /* This should be improoooooved!! */ | ||
| 360 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | ||
| 361 | } | ||
| 362 | |||
| 363 | |||
| 364 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | ||
| 365 | static int | ||
| 366 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | ||
| 367 | { | ||
| 368 | unsigned long pfn; | ||
| 369 | struct page *page; | ||
| 370 | int move_pages = NR_OFFLINE_AT_ONCE_PAGES; | ||
| 371 | int not_managed = 0; | ||
| 372 | int ret = 0; | ||
| 373 | LIST_HEAD(source); | ||
| 374 | |||
| 375 | for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { | ||
| 376 | if (!pfn_valid(pfn)) | ||
| 377 | continue; | ||
| 378 | page = pfn_to_page(pfn); | ||
| 379 | if (!page_count(page)) | ||
| 380 | continue; | ||
| 381 | /* | ||
| 382 | * We can skip free pages. And we can only deal with pages on | ||
| 383 | * LRU. | ||
| 384 | */ | ||
| 385 | ret = isolate_lru_page(page, &source); | ||
| 386 | if (!ret) { /* Success */ | ||
| 387 | move_pages--; | ||
| 388 | } else { | ||
| 389 | /* Because we don't hold the big zone->lock, we should | ||
| 390 | check this again here. */ | ||
| 391 | if (page_count(page)) | ||
| 392 | not_managed++; | ||
| 393 | #ifdef CONFIG_DEBUG_VM | ||
| 394 | printk(KERN_INFO "removing from LRU failed" | ||
| 395 | " %lx/%d/%lx\n", | ||
| 396 | pfn, page_count(page), page->flags); | ||
| 397 | #endif | ||
| 398 | } | ||
| 399 | } | ||
| 400 | ret = -EBUSY; | ||
| 401 | if (not_managed) { | ||
| 402 | if (!list_empty(&source)) | ||
| 403 | putback_lru_pages(&source); | ||
| 404 | goto out; | ||
| 405 | } | ||
| 406 | ret = 0; | ||
| 407 | if (list_empty(&source)) | ||
| 408 | goto out; | ||
| 409 | /* this function returns # of failed pages */ | ||
| 410 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | ||
| 411 | |||
| 412 | out: | ||
| 413 | return ret; | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | ||
| 417 | * remove from free_area[] and mark all as Reserved. | ||
| 418 | */ | ||
| 419 | static int | ||
| 420 | offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, | ||
| 421 | void *data) | ||
| 422 | { | ||
| 423 | __offline_isolated_pages(start, start + nr_pages); | ||
| 424 | return 0; | ||
| 425 | } | ||
| 426 | |||
| 427 | static void | ||
| 428 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
| 429 | { | ||
| 430 | walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, | ||
| 431 | offline_isolated_pages_cb); | ||
| 432 | } | ||
| 433 | |||
| 434 | /* | ||
| 435 | * Check that all pages in the range, recorded as a memory resource, are isolated. | ||
| 436 | */ | ||
| 437 | static int | ||
| 438 | check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | ||
| 439 | void *data) | ||
| 440 | { | ||
| 441 | int ret; | ||
| 442 | long offlined = *(long *)data; | ||
| 443 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | ||
| 444 | offlined = nr_pages; | ||
| 445 | if (!ret) | ||
| 446 | *(long *)data += offlined; | ||
| 447 | return ret; | ||
| 448 | } | ||
| 449 | |||
| 450 | static long | ||
| 451 | check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
| 452 | { | ||
| 453 | long offlined = 0; | ||
| 454 | int ret; | ||
| 455 | |||
| 456 | ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, | ||
| 457 | check_pages_isolated_cb); | ||
| 458 | if (ret < 0) | ||
| 459 | offlined = (long)ret; | ||
| 460 | return offlined; | ||
| 461 | } | ||
| 462 | |||
| 463 | extern void drain_all_local_pages(void); | ||
| 464 | |||
| 465 | int offline_pages(unsigned long start_pfn, | ||
| 466 | unsigned long end_pfn, unsigned long timeout) | ||
| 467 | { | ||
| 468 | unsigned long pfn, nr_pages, expire; | ||
| 469 | long offlined_pages; | ||
| 470 | int ret, drain, retry_max; | ||
| 471 | struct zone *zone; | ||
| 472 | |||
| 473 | BUG_ON(start_pfn >= end_pfn); | ||
| 474 | /* at least, alignment against pageblock is necessary */ | ||
| 475 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) | ||
| 476 | return -EINVAL; | ||
| 477 | if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) | ||
| 478 | return -EINVAL; | ||
| 479 | /* This makes hotplug much easier... and readable. | ||
| 480 | We assume this for now. */ | ||
| 481 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | ||
| 482 | return -EINVAL; | ||
| 483 | /* set above range as isolated */ | ||
| 484 | ret = start_isolate_page_range(start_pfn, end_pfn); | ||
| 485 | if (ret) | ||
| 486 | return ret; | ||
| 487 | nr_pages = end_pfn - start_pfn; | ||
| 488 | pfn = start_pfn; | ||
| 489 | expire = jiffies + timeout; | ||
| 490 | drain = 0; | ||
| 491 | retry_max = 5; | ||
| 492 | repeat: | ||
| 493 | /* start memory hot removal */ | ||
| 494 | ret = -EAGAIN; | ||
| 495 | if (time_after(jiffies, expire)) | ||
| 496 | goto failed_removal; | ||
| 497 | ret = -EINTR; | ||
| 498 | if (signal_pending(current)) | ||
| 499 | goto failed_removal; | ||
| 500 | ret = 0; | ||
| 501 | if (drain) { | ||
| 502 | lru_add_drain_all(); | ||
| 503 | flush_scheduled_work(); | ||
| 504 | cond_resched(); | ||
| 505 | drain_all_local_pages(); | ||
| 506 | } | ||
| 507 | |||
| 508 | pfn = scan_lru_pages(start_pfn, end_pfn); | ||
| 509 | if (pfn) { /* We have page on LRU */ | ||
| 510 | ret = do_migrate_range(pfn, end_pfn); | ||
| 511 | if (!ret) { | ||
| 512 | drain = 1; | ||
| 513 | goto repeat; | ||
| 514 | } else { | ||
| 515 | if (ret < 0) | ||
| 516 | if (--retry_max == 0) | ||
| 517 | goto failed_removal; | ||
| 518 | yield(); | ||
| 519 | drain = 1; | ||
| 520 | goto repeat; | ||
| 521 | } | ||
| 522 | } | ||
| 523 | /* drain each zone's lru pagevecs; this is asynchronous... */ | ||
| 524 | lru_add_drain_all(); | ||
| 525 | flush_scheduled_work(); | ||
| 526 | yield(); | ||
| 527 | /* drain pcp pages; this is synchronous. */ | ||
| 528 | drain_all_local_pages(); | ||
| 529 | /* check again */ | ||
| 530 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | ||
| 531 | if (offlined_pages < 0) { | ||
| 532 | ret = -EBUSY; | ||
| 533 | goto failed_removal; | ||
| 534 | } | ||
| 535 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); | ||
| 536 | /* Ok, all of our target range is isolated. | ||
| 537 | We cannot roll back at this point. */ | ||
| 538 | offline_isolated_pages(start_pfn, end_pfn); | ||
| 539 | /* reset pagetype flags */ | ||
| 540 | start_isolate_page_range(start_pfn, end_pfn); | ||
| 541 | /* removal success */ | ||
| 542 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 543 | zone->present_pages -= offlined_pages; | ||
| 544 | zone->zone_pgdat->node_present_pages -= offlined_pages; | ||
| 545 | totalram_pages -= offlined_pages; | ||
| 546 | num_physpages -= offlined_pages; | ||
| 547 | vm_total_pages = nr_free_pagecache_pages(); | ||
| 548 | writeback_set_ratelimit(); | ||
| 549 | return 0; | ||
| 550 | |||
| 551 | failed_removal: | ||
| 552 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | ||
| 553 | start_pfn, end_pfn); | ||
| 554 | /* pushback to free area */ | ||
| 555 | undo_isolate_page_range(start_pfn, end_pfn); | ||
| 556 | return ret; | ||
| 557 | } | ||
| 558 | #else | ||
| 559 | int remove_memory(u64 start, u64 size) | ||
| 560 | { | ||
| 561 | return -EINVAL; | ||
| 562 | } | ||
| 563 | EXPORT_SYMBOL_GPL(remove_memory); | ||
| 564 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
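offline_pages() above isolates the range and then loops: migrate whatever is still on the LRU, drain per-cpu and pagevec caches, and re-check, bailing out on timeout, pending signal, or repeated hard failures. The control-flow model below captures only that retry structure; migrate_some() and the counters are stand-ins, not real APIs, and the jiffies-based timeout is approximated with wall-clock time.

/* Control-flow model of the offline_pages() retry loop. */
#include <stdio.h>
#include <time.h>

static int busy_pages = 1000;

/* Pretend to migrate a batch; a negative return would mean failure. */
static int migrate_some(void)
{
	int batch = busy_pages < 256 ? busy_pages : 256;

	busy_pages -= batch;
	return 0;
}

static int offline(int timeout_secs)
{
	time_t expire = time(NULL) + timeout_secs;
	int retry_max = 5;

	while (busy_pages) {
		if (time(NULL) > expire)
			return -1;		/* -EAGAIN in the kernel */
		if (migrate_some() < 0 && --retry_max == 0)
			return -2;		/* too many hard failures */
		/* kernel: lru_add_drain_all(), drain_all_local_pages() */
	}
	return 0;				/* range is empty: offline it */
}

int main(void)
{
	printf("offline %s\n", offline(10) == 0 ? "succeeded" : "failed");
	return 0;
}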
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d6ac9505d07..568152ae6caf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -72,7 +72,6 @@ | |||
| 72 | #include <linux/hugetlb.h> | 72 | #include <linux/hugetlb.h> |
| 73 | #include <linux/kernel.h> | 73 | #include <linux/kernel.h> |
| 74 | #include <linux/sched.h> | 74 | #include <linux/sched.h> |
| 75 | #include <linux/mm.h> | ||
| 76 | #include <linux/nodemask.h> | 75 | #include <linux/nodemask.h> |
| 77 | #include <linux/cpuset.h> | 76 | #include <linux/cpuset.h> |
| 78 | #include <linux/gfp.h> | 77 | #include <linux/gfp.h> |
| @@ -82,13 +81,13 @@ | |||
| 82 | #include <linux/interrupt.h> | 81 | #include <linux/interrupt.h> |
| 83 | #include <linux/init.h> | 82 | #include <linux/init.h> |
| 84 | #include <linux/compat.h> | 83 | #include <linux/compat.h> |
| 85 | #include <linux/mempolicy.h> | ||
| 86 | #include <linux/swap.h> | 84 | #include <linux/swap.h> |
| 87 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
| 88 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
| 89 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
| 90 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
| 91 | #include <linux/security.h> | 89 | #include <linux/security.h> |
| 90 | #include <linux/syscalls.h> | ||
| 92 | 91 | ||
| 93 | #include <asm/tlbflush.h> | 92 | #include <asm/tlbflush.h> |
| 94 | #include <asm/uaccess.h> | 93 | #include <asm/uaccess.h> |
| @@ -110,6 +109,9 @@ struct mempolicy default_policy = { | |||
| 110 | .policy = MPOL_DEFAULT, | 109 | .policy = MPOL_DEFAULT, |
| 111 | }; | 110 | }; |
| 112 | 111 | ||
| 112 | static void mpol_rebind_policy(struct mempolicy *pol, | ||
| 113 | const nodemask_t *newmask); | ||
| 114 | |||
| 113 | /* Do sanity checking on a policy */ | 115 | /* Do sanity checking on a policy */ |
| 114 | static int mpol_check_policy(int mode, nodemask_t *nodes) | 116 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
| 115 | { | 117 | { |
| @@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
| 128 | return -EINVAL; | 130 | return -EINVAL; |
| 129 | break; | 131 | break; |
| 130 | } | 132 | } |
| 131 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; | 133 | return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL; |
| 132 | } | 134 | } |
| 133 | 135 | ||
| 134 | /* Generate a custom zonelist for the BIND policy. */ | 136 | /* Generate a custom zonelist for the BIND policy. */ |
| @@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
| 185 | switch (mode) { | 187 | switch (mode) { |
| 186 | case MPOL_INTERLEAVE: | 188 | case MPOL_INTERLEAVE: |
| 187 | policy->v.nodes = *nodes; | 189 | policy->v.nodes = *nodes; |
| 188 | if (nodes_weight(*nodes) == 0) { | 190 | nodes_and(policy->v.nodes, policy->v.nodes, |
| 191 | node_states[N_HIGH_MEMORY]); | ||
| 192 | if (nodes_weight(policy->v.nodes) == 0) { | ||
| 189 | kmem_cache_free(policy_cache, policy); | 193 | kmem_cache_free(policy_cache, policy); |
| 190 | return ERR_PTR(-EINVAL); | 194 | return ERR_PTR(-EINVAL); |
| 191 | } | 195 | } |
| @@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void) | |||
| 459 | } | 463 | } |
| 460 | 464 | ||
| 461 | /* Set the process memory policy */ | 465 | /* Set the process memory policy */ |
| 462 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 466 | static long do_set_mempolicy(int mode, nodemask_t *nodes) |
| 463 | { | 467 | { |
| 464 | struct mempolicy *new; | 468 | struct mempolicy *new; |
| 465 | 469 | ||
| @@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | |||
| 494 | *nodes = p->v.nodes; | 498 | *nodes = p->v.nodes; |
| 495 | break; | 499 | break; |
| 496 | case MPOL_PREFERRED: | 500 | case MPOL_PREFERRED: |
| 497 | /* or use current node instead of online map? */ | 501 | /* or use current node instead of memory_map? */ |
| 498 | if (p->v.preferred_node < 0) | 502 | if (p->v.preferred_node < 0) |
| 499 | *nodes = node_online_map; | 503 | *nodes = node_states[N_HIGH_MEMORY]; |
| 500 | else | 504 | else |
| 501 | node_set(p->v.preferred_node, *nodes); | 505 | node_set(p->v.preferred_node, *nodes); |
| 502 | break; | 506 | break; |
| @@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
| 519 | } | 523 | } |
| 520 | 524 | ||
| 521 | /* Retrieve NUMA policy */ | 525 | /* Retrieve NUMA policy */ |
| 522 | long do_get_mempolicy(int *policy, nodemask_t *nmask, | 526 | static long do_get_mempolicy(int *policy, nodemask_t *nmask, |
| 523 | unsigned long addr, unsigned long flags) | 527 | unsigned long addr, unsigned long flags) |
| 524 | { | 528 | { |
| 525 | int err; | 529 | int err; |
| 526 | struct mm_struct *mm = current->mm; | 530 | struct mm_struct *mm = current->mm; |
| @@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 528 | struct mempolicy *pol = current->mempolicy; | 532 | struct mempolicy *pol = current->mempolicy; |
| 529 | 533 | ||
| 530 | cpuset_update_task_memory_state(); | 534 | cpuset_update_task_memory_state(); |
| 531 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 535 | if (flags & |
| 536 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | ||
| 532 | return -EINVAL; | 537 | return -EINVAL; |
| 538 | |||
| 539 | if (flags & MPOL_F_MEMS_ALLOWED) { | ||
| 540 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | ||
| 541 | return -EINVAL; | ||
| 542 | *policy = 0; /* just so it's initialized */ | ||
| 543 | *nmask = cpuset_current_mems_allowed; | ||
| 544 | return 0; | ||
| 545 | } | ||
| 546 | |||
| 533 | if (flags & MPOL_F_ADDR) { | 547 | if (flags & MPOL_F_ADDR) { |
| 534 | down_read(&mm->mmap_sem); | 548 | down_read(&mm->mmap_sem); |
| 535 | vma = find_vma_intersection(mm, addr, addr+1); | 549 | vma = find_vma_intersection(mm, addr, addr+1); |
| @@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
| 601 | * Migrate pages from one node to a target node. | 615 | * Migrate pages from one node to a target node. |
| 602 | * Returns error or the number of pages not migrated. | 616 | * Returns error or the number of pages not migrated. |
| 603 | */ | 617 | */ |
| 604 | int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | 618 | static int migrate_to_node(struct mm_struct *mm, int source, int dest, |
| 619 | int flags) | ||
| 605 | { | 620 | { |
| 606 | nodemask_t nmask; | 621 | nodemask_t nmask; |
| 607 | LIST_HEAD(pagelist); | 622 | LIST_HEAD(pagelist); |
| @@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
| 732 | } | 747 | } |
| 733 | #endif | 748 | #endif |
| 734 | 749 | ||
| 735 | long do_mbind(unsigned long start, unsigned long len, | 750 | static long do_mbind(unsigned long start, unsigned long len, |
| 736 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 751 | unsigned long mode, nodemask_t *nmask, |
| 752 | unsigned long flags) | ||
| 737 | { | 753 | { |
| 738 | struct vm_area_struct *vma; | 754 | struct vm_area_struct *vma; |
| 739 | struct mm_struct *mm = current->mm; | 755 | struct mm_struct *mm = current->mm; |
| @@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
| 955 | goto out; | 971 | goto out; |
| 956 | } | 972 | } |
| 957 | 973 | ||
| 958 | if (!nodes_subset(new, node_online_map)) { | 974 | if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { |
| 959 | err = -EINVAL; | 975 | err = -EINVAL; |
| 960 | goto out; | 976 | goto out; |
| 961 | } | 977 | } |
| @@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
| 978 | unsigned long maxnode, | 994 | unsigned long maxnode, |
| 979 | unsigned long addr, unsigned long flags) | 995 | unsigned long addr, unsigned long flags) |
| 980 | { | 996 | { |
| 981 | int err, pval; | 997 | int err; |
| 998 | int uninitialized_var(pval); | ||
| 982 | nodemask_t nodes; | 999 | nodemask_t nodes; |
| 983 | 1000 | ||
| 984 | if (nmask != NULL && maxnode < MAX_NUMNODES) | 1001 | if (nmask != NULL && maxnode < MAX_NUMNODES) |
| @@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) | |||
| 1527 | kmem_cache_free(sn_cache, n); | 1544 | kmem_cache_free(sn_cache, n); |
| 1528 | } | 1545 | } |
| 1529 | 1546 | ||
| 1530 | struct sp_node * | 1547 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
| 1531 | sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) | 1548 | struct mempolicy *pol) |
| 1532 | { | 1549 | { |
| 1533 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 1550 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); |
| 1534 | 1551 | ||
| @@ -1677,7 +1694,7 @@ void __init numa_policy_init(void) | |||
| 1677 | * fall back to the largest node if they're all smaller. | 1694 | * fall back to the largest node if they're all smaller. |
| 1678 | */ | 1695 | */ |
| 1679 | nodes_clear(interleave_nodes); | 1696 | nodes_clear(interleave_nodes); |
| 1680 | for_each_online_node(nid) { | 1697 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 1681 | unsigned long total_pages = node_present_pages(nid); | 1698 | unsigned long total_pages = node_present_pages(nid); |
| 1682 | 1699 | ||
| 1683 | /* Preserve the largest node */ | 1700 | /* Preserve the largest node */ |
| @@ -1706,7 +1723,8 @@ void numa_default_policy(void) | |||
| 1706 | } | 1723 | } |
| 1707 | 1724 | ||
| 1708 | /* Migrate a policy to a different set of nodes */ | 1725 | /* Migrate a policy to a different set of nodes */ |
| 1709 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | 1726 | static void mpol_rebind_policy(struct mempolicy *pol, |
| 1727 | const nodemask_t *newmask) | ||
| 1710 | { | 1728 | { |
| 1711 | nodemask_t *mpolmask; | 1729 | nodemask_t *mpolmask; |
| 1712 | nodemask_t tmp; | 1730 | nodemask_t tmp; |
| @@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
| 1963 | seq_printf(m, " huge"); | 1981 | seq_printf(m, " huge"); |
| 1964 | } else { | 1982 | } else { |
| 1965 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | 1983 | check_pgd_range(vma, vma->vm_start, vma->vm_end, |
| 1966 | &node_online_map, MPOL_MF_STATS, md); | 1984 | &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); |
| 1967 | } | 1985 | } |
| 1968 | 1986 | ||
| 1969 | if (!md->pages) | 1987 | if (!md->pages) |
| @@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
| 1990 | if (md->writeback) | 2008 | if (md->writeback) |
| 1991 | seq_printf(m," writeback=%lu", md->writeback); | 2009 | seq_printf(m," writeback=%lu", md->writeback); |
| 1992 | 2010 | ||
| 1993 | for_each_online_node(n) | 2011 | for_each_node_state(n, N_HIGH_MEMORY) |
| 1994 | if (md->node[n]) | 2012 | if (md->node[n]) |
| 1995 | seq_printf(m, " N%d=%lu", n, md->node[n]); | 2013 | seq_printf(m, " N%d=%lu", n, md->node[n]); |
| 1996 | out: | 2014 | out: |
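
Note on the mempolicy.c hunks above: every check that previously consulted node_online_map now consults node_states[N_HIGH_MEMORY], and mpol_new() additionally clips an MPOL_INTERLEAVE mask to that state, so memoryless nodes can no longer end up in a policy's node set. A standalone sketch (plain bitmasks rather than the kernel's nodemask_t, written only to illustrate the clipping step added to mpol_new()):

    #include <stdio.h>

    int main(void)
    {
            unsigned long requested  = 0x0f; /* nodes 0-3 requested        */
            unsigned long has_memory = 0x05; /* nodes 0 and 2 have memory  */
            unsigned long effective  = requested & has_memory;

            if (!effective)
                    puts("rejected with -EINVAL: no node in the mask has memory");
            else
                    printf("interleave over mask 0x%lx\n", effective);
            return 0;
    }
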
diff --git a/mm/migrate.c b/mm/migrate.c index 07f22d4a431f..06d0877a66ef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
| 171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
| 172 | if (is_write_migration_entry(entry)) | 172 | if (is_write_migration_entry(entry)) |
| 173 | pte = pte_mkwrite(pte); | 173 | pte = pte_mkwrite(pte); |
| 174 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
| 174 | set_pte_at(mm, addr, ptep, pte); | 175 | set_pte_at(mm, addr, ptep, pte); |
| 175 | 176 | ||
| 176 | if (PageAnon(new)) | 177 | if (PageAnon(new)) |
| @@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
| 180 | 181 | ||
| 181 | /* No need to invalidate - it was non-present before */ | 182 | /* No need to invalidate - it was non-present before */ |
| 182 | update_mmu_cache(vma, addr, pte); | 183 | update_mmu_cache(vma, addr, pte); |
| 183 | lazy_mmu_prot_update(pte); | ||
| 184 | 184 | ||
| 185 | out: | 185 | out: |
| 186 | pte_unmap_unlock(ptep, ptl); | 186 | pte_unmap_unlock(ptep, ptl); |
| @@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
| 986 | goto out; | 986 | goto out; |
| 987 | 987 | ||
| 988 | err = -ENODEV; | 988 | err = -ENODEV; |
| 989 | if (!node_online(node)) | 989 | if (!node_state(node, N_HIGH_MEMORY)) |
| 990 | goto out; | 990 | goto out; |
| 991 | 991 | ||
| 992 | err = -EACCES; | 992 | err = -EACCES; |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8346c30abec..1d4d69790e59 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 53 | if (dirty_accountable && pte_dirty(ptent)) | 53 | if (dirty_accountable && pte_dirty(ptent)) |
| 54 | ptent = pte_mkwrite(ptent); | 54 | ptent = pte_mkwrite(ptent); |
| 55 | set_pte_at(mm, addr, pte, ptent); | 55 | set_pte_at(mm, addr, pte, ptent); |
| 56 | lazy_mmu_prot_update(ptent); | ||
| 57 | #ifdef CONFIG_MIGRATION | 56 | #ifdef CONFIG_MIGRATION |
| 58 | } else if (!pte_file(oldpte)) { | 57 | } else if (!pte_file(oldpte)) { |
| 59 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 58 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9b82ad5047f..41b4e362221d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -177,14 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 177 | { | 177 | { |
| 178 | #ifdef CONFIG_NUMA | 178 | #ifdef CONFIG_NUMA |
| 179 | struct zone **z; | 179 | struct zone **z; |
| 180 | nodemask_t nodes; | 180 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; |
| 181 | int node; | ||
| 182 | |||
| 183 | nodes_clear(nodes); | ||
| 184 | /* node has memory ? */ | ||
| 185 | for_each_online_node(node) | ||
| 186 | if (NODE_DATA(node)->node_present_pages) | ||
| 187 | node_set(node, nodes); | ||
| 188 | 181 | ||
| 189 | for (z = zonelist->zones; *z; z++) | 182 | for (z = zonelist->zones; *z; z++) |
| 190 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) | 183 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 44720363374c..d821321326e3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -126,7 +126,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
| 126 | int node; | 126 | int node; |
| 127 | unsigned long x = 0; | 127 | unsigned long x = 0; |
| 128 | 128 | ||
| 129 | for_each_online_node(node) { | 129 | for_each_node_state(node, N_HIGH_MEMORY) { |
| 130 | struct zone *z = | 130 | struct zone *z = |
| 131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
| 132 | 132 | ||
| @@ -1022,17 +1022,15 @@ int test_set_page_writeback(struct page *page) | |||
| 1022 | EXPORT_SYMBOL(test_set_page_writeback); | 1022 | EXPORT_SYMBOL(test_set_page_writeback); |
| 1023 | 1023 | ||
| 1024 | /* | 1024 | /* |
| 1025 | * Return true if any of the pages in the mapping are marged with the | 1025 | * Return true if any of the pages in the mapping are marked with the |
| 1026 | * passed tag. | 1026 | * passed tag. |
| 1027 | */ | 1027 | */ |
| 1028 | int mapping_tagged(struct address_space *mapping, int tag) | 1028 | int mapping_tagged(struct address_space *mapping, int tag) |
| 1029 | { | 1029 | { |
| 1030 | unsigned long flags; | ||
| 1031 | int ret; | 1030 | int ret; |
| 1032 | 1031 | rcu_read_lock(); | |
| 1033 | read_lock_irqsave(&mapping->tree_lock, flags); | ||
| 1034 | ret = radix_tree_tagged(&mapping->page_tree, tag); | 1032 | ret = radix_tree_tagged(&mapping->page_tree, tag); |
| 1035 | read_unlock_irqrestore(&mapping->tree_lock, flags); | 1033 | rcu_read_unlock(); |
| 1036 | return ret; | 1034 | return ret; |
| 1037 | } | 1035 | } |
| 1038 | EXPORT_SYMBOL(mapping_tagged); | 1036 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a8c59571cb7..d315e1127dc9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -41,24 +41,37 @@ | |||
| 41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
| 42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
| 43 | #include <linux/fault-inject.h> | 43 | #include <linux/fault-inject.h> |
| 44 | #include <linux/page-isolation.h> | ||
| 44 | 45 | ||
| 45 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
| 46 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
| 47 | #include "internal.h" | 48 | #include "internal.h" |
| 48 | 49 | ||
| 49 | /* | 50 | /* |
| 50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 51 | * Array of node states. |
| 51 | * initializer cleaner | ||
| 52 | */ | 52 | */ |
| 53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 53 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
| 54 | EXPORT_SYMBOL(node_online_map); | 54 | [N_POSSIBLE] = NODE_MASK_ALL, |
| 55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 55 | [N_ONLINE] = { { [0] = 1UL } }, |
| 56 | EXPORT_SYMBOL(node_possible_map); | 56 | #ifndef CONFIG_NUMA |
| 57 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | ||
| 58 | #ifdef CONFIG_HIGHMEM | ||
| 59 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | ||
| 60 | #endif | ||
| 61 | [N_CPU] = { { [0] = 1UL } }, | ||
| 62 | #endif /* NUMA */ | ||
| 63 | }; | ||
| 64 | EXPORT_SYMBOL(node_states); | ||
| 65 | |||
| 57 | unsigned long totalram_pages __read_mostly; | 66 | unsigned long totalram_pages __read_mostly; |
| 58 | unsigned long totalreserve_pages __read_mostly; | 67 | unsigned long totalreserve_pages __read_mostly; |
| 59 | long nr_swap_pages; | 68 | long nr_swap_pages; |
| 60 | int percpu_pagelist_fraction; | 69 | int percpu_pagelist_fraction; |
| 61 | 70 | ||
| 71 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
| 72 | int pageblock_order __read_mostly; | ||
| 73 | #endif | ||
| 74 | |||
| 62 | static void __free_pages_ok(struct page *page, unsigned int order); | 75 | static void __free_pages_ok(struct page *page, unsigned int order); |
| 63 | 76 | ||
| 64 | /* | 77 | /* |
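
The node_states[] array above replaces the old pair of exported nodemasks with one table indexed by node state (possible, online, has normal memory, has high memory, has a CPU); on non-NUMA builds every state statically contains node 0. The accessors are not part of this hunk, but from the way the rest of the diff uses them they are presumably thin wrappers of the following shape (an assumption, sketched here for orientation only):

    /* Hedged sketch: node_state()/node_set_state()/for_each_node_state()
     * assumed to be simple nodemask operations over the node_states[] table. */
    static inline int node_state(int node, enum node_states state)
    {
            return node_isset(node, node_states[state]);
    }

    static inline void node_set_state(int node, enum node_states state)
    {
            node_set(node, node_states[state]);
    }

    #define for_each_node_state(node, __state) \
            for_each_node_mask((node), node_states[__state])
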
| @@ -137,7 +150,7 @@ static unsigned long __meminitdata dma_reserve; | |||
| 137 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 150 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
| 138 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 151 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
| 139 | unsigned long __initdata required_kernelcore; | 152 | unsigned long __initdata required_kernelcore; |
| 140 | unsigned long __initdata required_movablecore; | 153 | static unsigned long __initdata required_movablecore; |
| 141 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 154 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
| 142 | 155 | ||
| 143 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 156 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
| @@ -150,6 +163,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES; | |||
| 150 | EXPORT_SYMBOL(nr_node_ids); | 163 | EXPORT_SYMBOL(nr_node_ids); |
| 151 | #endif | 164 | #endif |
| 152 | 165 | ||
| 166 | int page_group_by_mobility_disabled __read_mostly; | ||
| 167 | |||
| 168 | static void set_pageblock_migratetype(struct page *page, int migratetype) | ||
| 169 | { | ||
| 170 | set_pageblock_flags_group(page, (unsigned long)migratetype, | ||
| 171 | PB_migrate, PB_migrate_end); | ||
| 172 | } | ||
| 173 | |||
| 153 | #ifdef CONFIG_DEBUG_VM | 174 | #ifdef CONFIG_DEBUG_VM |
| 154 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 175 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 155 | { | 176 | { |
| @@ -293,16 +314,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 293 | clear_highpage(page + i); | 314 | clear_highpage(page + i); |
| 294 | } | 315 | } |
| 295 | 316 | ||
| 296 | /* | ||
| 297 | * function for dealing with page's order in buddy system. | ||
| 298 | * zone->lock is already acquired when we use these. | ||
| 299 | * So, we don't need atomic page->flags operations here. | ||
| 300 | */ | ||
| 301 | static inline unsigned long page_order(struct page *page) | ||
| 302 | { | ||
| 303 | return page_private(page); | ||
| 304 | } | ||
| 305 | |||
| 306 | static inline void set_page_order(struct page *page, int order) | 317 | static inline void set_page_order(struct page *page, int order) |
| 307 | { | 318 | { |
| 308 | set_page_private(page, order); | 319 | set_page_private(page, order); |
| @@ -404,6 +415,7 @@ static inline void __free_one_page(struct page *page, | |||
| 404 | { | 415 | { |
| 405 | unsigned long page_idx; | 416 | unsigned long page_idx; |
| 406 | int order_size = 1 << order; | 417 | int order_size = 1 << order; |
| 418 | int migratetype = get_pageblock_migratetype(page); | ||
| 407 | 419 | ||
| 408 | if (unlikely(PageCompound(page))) | 420 | if (unlikely(PageCompound(page))) |
| 409 | destroy_compound_page(page, order); | 421 | destroy_compound_page(page, order); |
| @@ -416,7 +428,6 @@ static inline void __free_one_page(struct page *page, | |||
| 416 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 428 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
| 417 | while (order < MAX_ORDER-1) { | 429 | while (order < MAX_ORDER-1) { |
| 418 | unsigned long combined_idx; | 430 | unsigned long combined_idx; |
| 419 | struct free_area *area; | ||
| 420 | struct page *buddy; | 431 | struct page *buddy; |
| 421 | 432 | ||
| 422 | buddy = __page_find_buddy(page, page_idx, order); | 433 | buddy = __page_find_buddy(page, page_idx, order); |
| @@ -424,8 +435,7 @@ static inline void __free_one_page(struct page *page, | |||
| 424 | break; /* Move the buddy up one level. */ | 435 | break; /* Move the buddy up one level. */ |
| 425 | 436 | ||
| 426 | list_del(&buddy->lru); | 437 | list_del(&buddy->lru); |
| 427 | area = zone->free_area + order; | 438 | zone->free_area[order].nr_free--; |
| 428 | area->nr_free--; | ||
| 429 | rmv_page_order(buddy); | 439 | rmv_page_order(buddy); |
| 430 | combined_idx = __find_combined_index(page_idx, order); | 440 | combined_idx = __find_combined_index(page_idx, order); |
| 431 | page = page + (combined_idx - page_idx); | 441 | page = page + (combined_idx - page_idx); |
| @@ -433,7 +443,8 @@ static inline void __free_one_page(struct page *page, | |||
| 433 | order++; | 443 | order++; |
| 434 | } | 444 | } |
| 435 | set_page_order(page, order); | 445 | set_page_order(page, order); |
| 436 | list_add(&page->lru, &zone->free_area[order].free_list); | 446 | list_add(&page->lru, |
| 447 | &zone->free_area[order].free_list[migratetype]); | ||
| 437 | zone->free_area[order].nr_free++; | 448 | zone->free_area[order].nr_free++; |
| 438 | } | 449 | } |
| 439 | 450 | ||
| @@ -567,7 +578,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
| 567 | * -- wli | 578 | * -- wli |
| 568 | */ | 579 | */ |
| 569 | static inline void expand(struct zone *zone, struct page *page, | 580 | static inline void expand(struct zone *zone, struct page *page, |
| 570 | int low, int high, struct free_area *area) | 581 | int low, int high, struct free_area *area, |
| 582 | int migratetype) | ||
| 571 | { | 583 | { |
| 572 | unsigned long size = 1 << high; | 584 | unsigned long size = 1 << high; |
| 573 | 585 | ||
| @@ -576,7 +588,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 576 | high--; | 588 | high--; |
| 577 | size >>= 1; | 589 | size >>= 1; |
| 578 | VM_BUG_ON(bad_range(zone, &page[size])); | 590 | VM_BUG_ON(bad_range(zone, &page[size])); |
| 579 | list_add(&page[size].lru, &area->free_list); | 591 | list_add(&page[size].lru, &area->free_list[migratetype]); |
| 580 | area->nr_free++; | 592 | area->nr_free++; |
| 581 | set_page_order(&page[size], high); | 593 | set_page_order(&page[size], high); |
| 582 | } | 594 | } |
| @@ -628,49 +640,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 628 | return 0; | 640 | return 0; |
| 629 | } | 641 | } |
| 630 | 642 | ||
| 631 | /* | 643 | /* |
| 632 | * Do the hard work of removing an element from the buddy allocator. | 644 | * Go through the free lists for the given migratetype and remove |
| 633 | * Call me with the zone->lock already held. | 645 | * the smallest available page from the freelists |
| 634 | */ | 646 | */ |
| 635 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 647 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| 648 | int migratetype) | ||
| 636 | { | 649 | { |
| 637 | struct free_area * area; | ||
| 638 | unsigned int current_order; | 650 | unsigned int current_order; |
| 651 | struct free_area * area; | ||
| 639 | struct page *page; | 652 | struct page *page; |
| 640 | 653 | ||
| 654 | /* Find a page of the appropriate size in the preferred list */ | ||
| 641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 655 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
| 642 | area = zone->free_area + current_order; | 656 | area = &(zone->free_area[current_order]); |
| 643 | if (list_empty(&area->free_list)) | 657 | if (list_empty(&area->free_list[migratetype])) |
| 644 | continue; | 658 | continue; |
| 645 | 659 | ||
| 646 | page = list_entry(area->free_list.next, struct page, lru); | 660 | page = list_entry(area->free_list[migratetype].next, |
| 661 | struct page, lru); | ||
| 647 | list_del(&page->lru); | 662 | list_del(&page->lru); |
| 648 | rmv_page_order(page); | 663 | rmv_page_order(page); |
| 649 | area->nr_free--; | 664 | area->nr_free--; |
| 650 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 665 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
| 651 | expand(zone, page, order, current_order, area); | 666 | expand(zone, page, order, current_order, area, migratetype); |
| 652 | return page; | 667 | return page; |
| 653 | } | 668 | } |
| 654 | 669 | ||
| 655 | return NULL; | 670 | return NULL; |
| 656 | } | 671 | } |
| 657 | 672 | ||
| 673 | |||
| 674 | /* | ||
| 675 | * This array describes the order lists are fallen back to when | ||
| 676 | * the free lists for the desirable migrate type are depleted | ||
| 677 | */ | ||
| 678 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | ||
| 679 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
| 680 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
| 681 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 682 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | ||
| 683 | }; | ||
| 684 | |||
| 685 | /* | ||
| 686 | * Move the free pages in a range to the free lists of the requested type. | ||
| 687 | * Note that start_page and end_page are not aligned on a pageblock | ||
| 688 | * boundary. If alignment is required, use move_freepages_block() | ||
| 689 | */ | ||
| 690 | int move_freepages(struct zone *zone, | ||
| 691 | struct page *start_page, struct page *end_page, | ||
| 692 | int migratetype) | ||
| 693 | { | ||
| 694 | struct page *page; | ||
| 695 | unsigned long order; | ||
| 696 | int pages_moved = 0; | ||
| 697 | |||
| 698 | #ifndef CONFIG_HOLES_IN_ZONE | ||
| 699 | /* | ||
| 700 | * page_zone is not safe to call in this context when | ||
| 701 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | ||
| 702 | * anyway as we check zone boundaries in move_freepages_block(). | ||
| 703 | * Remove at a later date when no bug reports exist related to | ||
| 704 | * grouping pages by mobility | ||
| 705 | */ | ||
| 706 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | ||
| 707 | #endif | ||
| 708 | |||
| 709 | for (page = start_page; page <= end_page;) { | ||
| 710 | if (!pfn_valid_within(page_to_pfn(page))) { | ||
| 711 | page++; | ||
| 712 | continue; | ||
| 713 | } | ||
| 714 | |||
| 715 | if (!PageBuddy(page)) { | ||
| 716 | page++; | ||
| 717 | continue; | ||
| 718 | } | ||
| 719 | |||
| 720 | order = page_order(page); | ||
| 721 | list_del(&page->lru); | ||
| 722 | list_add(&page->lru, | ||
| 723 | &zone->free_area[order].free_list[migratetype]); | ||
| 724 | page += 1 << order; | ||
| 725 | pages_moved += 1 << order; | ||
| 726 | } | ||
| 727 | |||
| 728 | return pages_moved; | ||
| 729 | } | ||
| 730 | |||
| 731 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | ||
| 732 | { | ||
| 733 | unsigned long start_pfn, end_pfn; | ||
| 734 | struct page *start_page, *end_page; | ||
| 735 | |||
| 736 | start_pfn = page_to_pfn(page); | ||
| 737 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | ||
| 738 | start_page = pfn_to_page(start_pfn); | ||
| 739 | end_page = start_page + pageblock_nr_pages - 1; | ||
| 740 | end_pfn = start_pfn + pageblock_nr_pages - 1; | ||
| 741 | |||
| 742 | /* Do not cross zone boundaries */ | ||
| 743 | if (start_pfn < zone->zone_start_pfn) | ||
| 744 | start_page = page; | ||
| 745 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
| 746 | return 0; | ||
| 747 | |||
| 748 | return move_freepages(zone, start_page, end_page, migratetype); | ||
| 749 | } | ||
| 750 | |||
| 751 | /* Return the page with the lowest PFN in the list */ | ||
| 752 | static struct page *min_page(struct list_head *list) | ||
| 753 | { | ||
| 754 | unsigned long min_pfn = -1UL; | ||
| 755 | struct page *min_page = NULL, *page; | ||
| 756 | |||
| 757 | list_for_each_entry(page, list, lru) { | ||
| 758 | unsigned long pfn = page_to_pfn(page); | ||
| 759 | if (pfn < min_pfn) { | ||
| 760 | min_pfn = pfn; | ||
| 761 | min_page = page; | ||
| 762 | } | ||
| 763 | } | ||
| 764 | |||
| 765 | return min_page; | ||
| 766 | } | ||
| 767 | |||
| 768 | /* Remove an element from the buddy allocator from the fallback list */ | ||
| 769 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | ||
| 770 | int start_migratetype) | ||
| 771 | { | ||
| 772 | struct free_area * area; | ||
| 773 | int current_order; | ||
| 774 | struct page *page; | ||
| 775 | int migratetype, i; | ||
| 776 | |||
| 777 | /* Find the largest possible block of pages in the other list */ | ||
| 778 | for (current_order = MAX_ORDER-1; current_order >= order; | ||
| 779 | --current_order) { | ||
| 780 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | ||
| 781 | migratetype = fallbacks[start_migratetype][i]; | ||
| 782 | |||
| 783 | /* MIGRATE_RESERVE handled later if necessary */ | ||
| 784 | if (migratetype == MIGRATE_RESERVE) | ||
| 785 | continue; | ||
| 786 | |||
| 787 | area = &(zone->free_area[current_order]); | ||
| 788 | if (list_empty(&area->free_list[migratetype])) | ||
| 789 | continue; | ||
| 790 | |||
| 791 | /* Bias kernel allocations towards low pfns */ | ||
| 792 | page = list_entry(area->free_list[migratetype].next, | ||
| 793 | struct page, lru); | ||
| 794 | if (unlikely(start_migratetype != MIGRATE_MOVABLE)) | ||
| 795 | page = min_page(&area->free_list[migratetype]); | ||
| 796 | area->nr_free--; | ||
| 797 | |||
| 798 | /* | ||
| 799 | * If breaking a large block of pages, move all free | ||
| 800 | * pages to the preferred allocation list. If falling | ||
| 801 | * back for a reclaimable kernel allocation, be more | ||
| 802 | * aggressive about taking ownership of free pages | ||
| 803 | */ | ||
| 804 | if (unlikely(current_order >= (pageblock_order >> 1)) || | ||
| 805 | start_migratetype == MIGRATE_RECLAIMABLE) { | ||
| 806 | unsigned long pages; | ||
| 807 | pages = move_freepages_block(zone, page, | ||
| 808 | start_migratetype); | ||
| 809 | |||
| 810 | /* Claim the whole block if over half of it is free */ | ||
| 811 | if (pages >= (1 << (pageblock_order-1))) | ||
| 812 | set_pageblock_migratetype(page, | ||
| 813 | start_migratetype); | ||
| 814 | |||
| 815 | migratetype = start_migratetype; | ||
| 816 | } | ||
| 817 | |||
| 818 | /* Remove the page from the freelists */ | ||
| 819 | list_del(&page->lru); | ||
| 820 | rmv_page_order(page); | ||
| 821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
| 822 | -(1UL << order)); | ||
| 823 | |||
| 824 | if (current_order == pageblock_order) | ||
| 825 | set_pageblock_migratetype(page, | ||
| 826 | start_migratetype); | ||
| 827 | |||
| 828 | expand(zone, page, order, current_order, area, migratetype); | ||
| 829 | return page; | ||
| 830 | } | ||
| 831 | } | ||
| 832 | |||
| 833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | ||
| 834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
| 835 | } | ||
| 836 | |||
| 837 | /* | ||
| 838 | * Do the hard work of removing an element from the buddy allocator. | ||
| 839 | * Call me with the zone->lock already held. | ||
| 840 | */ | ||
| 841 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | ||
| 842 | int migratetype) | ||
| 843 | { | ||
| 844 | struct page *page; | ||
| 845 | |||
| 846 | page = __rmqueue_smallest(zone, order, migratetype); | ||
| 847 | |||
| 848 | if (unlikely(!page)) | ||
| 849 | page = __rmqueue_fallback(zone, order, migratetype); | ||
| 850 | |||
| 851 | return page; | ||
| 852 | } | ||
| 853 | |||
| 658 | /* | 854 | /* |
| 659 | * Obtain a specified number of elements from the buddy allocator, all under | 855 | * Obtain a specified number of elements from the buddy allocator, all under |
| 660 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 856 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
| 661 | * Returns the number of new pages which were placed at *list. | 857 | * Returns the number of new pages which were placed at *list. |
| 662 | */ | 858 | */ |
| 663 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 859 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| 664 | unsigned long count, struct list_head *list) | 860 | unsigned long count, struct list_head *list, |
| 861 | int migratetype) | ||
| 665 | { | 862 | { |
| 666 | int i; | 863 | int i; |
| 667 | 864 | ||
| 668 | spin_lock(&zone->lock); | 865 | spin_lock(&zone->lock); |
| 669 | for (i = 0; i < count; ++i) { | 866 | for (i = 0; i < count; ++i) { |
| 670 | struct page *page = __rmqueue(zone, order); | 867 | struct page *page = __rmqueue(zone, order, migratetype); |
| 671 | if (unlikely(page == NULL)) | 868 | if (unlikely(page == NULL)) |
| 672 | break; | 869 | break; |
| 673 | list_add_tail(&page->lru, list); | 870 | list_add(&page->lru, list); |
| 871 | set_page_private(page, migratetype); | ||
| 674 | } | 872 | } |
| 675 | spin_unlock(&zone->lock); | 873 | spin_unlock(&zone->lock); |
| 676 | return i; | 874 | return i; |
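
The core of the hunk above is __rmqueue_fallback(): when the free lists of the requested migratetype are empty, it scans from the largest order downwards and, per order, walks the fallbacks[] row for that type, only resorting to MIGRATE_RESERVE via the final __rmqueue_smallest() call once every fallback fails. A standalone program (not kernel code) that simply prints the order in which types would be tried for one starting type:

    #include <stdio.h>

    enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
           MIGRATE_RESERVE, MIGRATE_TYPES };

    static const int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES - 1] = {
            [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
            [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
            [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
            [MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE },
    };

    int main(void)
    {
            int start = MIGRATE_UNMOVABLE;

            /* MIGRATE_RESERVE entries are skipped by __rmqueue_fallback() and
             * only reached through the trailing __rmqueue_smallest() call. */
            for (int i = 0; i < MIGRATE_TYPES - 1; i++)
                    printf("fallback %d: migratetype %d\n", i, fallbacks[start][i]);
            return 0;
    }
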
| @@ -732,7 +930,7 @@ void mark_free_pages(struct zone *zone) | |||
| 732 | { | 930 | { |
| 733 | unsigned long pfn, max_zone_pfn; | 931 | unsigned long pfn, max_zone_pfn; |
| 734 | unsigned long flags; | 932 | unsigned long flags; |
| 735 | int order; | 933 | int order, t; |
| 736 | struct list_head *curr; | 934 | struct list_head *curr; |
| 737 | 935 | ||
| 738 | if (!zone->spanned_pages) | 936 | if (!zone->spanned_pages) |
| @@ -749,17 +947,18 @@ void mark_free_pages(struct zone *zone) | |||
| 749 | swsusp_unset_page_free(page); | 947 | swsusp_unset_page_free(page); |
| 750 | } | 948 | } |
| 751 | 949 | ||
| 752 | for (order = MAX_ORDER - 1; order >= 0; --order) | 950 | for_each_migratetype_order(order, t) { |
| 753 | list_for_each(curr, &zone->free_area[order].free_list) { | 951 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
| 754 | unsigned long i; | 952 | unsigned long i; |
| 755 | 953 | ||
| 756 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 954 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
| 757 | for (i = 0; i < (1UL << order); i++) | 955 | for (i = 0; i < (1UL << order); i++) |
| 758 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 956 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
| 759 | } | 957 | } |
| 760 | 958 | } | |
| 761 | spin_unlock_irqrestore(&zone->lock, flags); | 959 | spin_unlock_irqrestore(&zone->lock, flags); |
| 762 | } | 960 | } |
| 961 | #endif /* CONFIG_PM */ | ||
| 763 | 962 | ||
| 764 | /* | 963 | /* |
| 765 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 964 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
| @@ -772,7 +971,25 @@ void drain_local_pages(void) | |||
| 772 | __drain_pages(smp_processor_id()); | 971 | __drain_pages(smp_processor_id()); |
| 773 | local_irq_restore(flags); | 972 | local_irq_restore(flags); |
| 774 | } | 973 | } |
| 775 | #endif /* CONFIG_HIBERNATION */ | 974 | |
| 975 | void smp_drain_local_pages(void *arg) | ||
| 976 | { | ||
| 977 | drain_local_pages(); | ||
| 978 | } | ||
| 979 | |||
| 980 | /* | ||
| 981 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
| 982 | */ | ||
| 983 | void drain_all_local_pages(void) | ||
| 984 | { | ||
| 985 | unsigned long flags; | ||
| 986 | |||
| 987 | local_irq_save(flags); | ||
| 988 | __drain_pages(smp_processor_id()); | ||
| 989 | local_irq_restore(flags); | ||
| 990 | |||
| 991 | smp_call_function(smp_drain_local_pages, NULL, 0, 1); | ||
| 992 | } | ||
| 776 | 993 | ||
| 777 | /* | 994 | /* |
| 778 | * Free a 0-order page | 995 | * Free a 0-order page |
| @@ -797,6 +1014,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
| 797 | local_irq_save(flags); | 1014 | local_irq_save(flags); |
| 798 | __count_vm_event(PGFREE); | 1015 | __count_vm_event(PGFREE); |
| 799 | list_add(&page->lru, &pcp->list); | 1016 | list_add(&page->lru, &pcp->list); |
| 1017 | set_page_private(page, get_pageblock_migratetype(page)); | ||
| 800 | pcp->count++; | 1018 | pcp->count++; |
| 801 | if (pcp->count >= pcp->high) { | 1019 | if (pcp->count >= pcp->high) { |
| 802 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1020 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
| @@ -846,6 +1064,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, | |||
| 846 | struct page *page; | 1064 | struct page *page; |
| 847 | int cold = !!(gfp_flags & __GFP_COLD); | 1065 | int cold = !!(gfp_flags & __GFP_COLD); |
| 848 | int cpu; | 1066 | int cpu; |
| 1067 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
| 849 | 1068 | ||
| 850 | again: | 1069 | again: |
| 851 | cpu = get_cpu(); | 1070 | cpu = get_cpu(); |
| @@ -856,16 +1075,28 @@ again: | |||
| 856 | local_irq_save(flags); | 1075 | local_irq_save(flags); |
| 857 | if (!pcp->count) { | 1076 | if (!pcp->count) { |
| 858 | pcp->count = rmqueue_bulk(zone, 0, | 1077 | pcp->count = rmqueue_bulk(zone, 0, |
| 859 | pcp->batch, &pcp->list); | 1078 | pcp->batch, &pcp->list, migratetype); |
| 860 | if (unlikely(!pcp->count)) | 1079 | if (unlikely(!pcp->count)) |
| 861 | goto failed; | 1080 | goto failed; |
| 862 | } | 1081 | } |
| 863 | page = list_entry(pcp->list.next, struct page, lru); | 1082 | |
| 1083 | /* Find a page of the appropriate migrate type */ | ||
| 1084 | list_for_each_entry(page, &pcp->list, lru) | ||
| 1085 | if (page_private(page) == migratetype) | ||
| 1086 | break; | ||
| 1087 | |||
| 1088 | /* Allocate more to the pcp list if necessary */ | ||
| 1089 | if (unlikely(&page->lru == &pcp->list)) { | ||
| 1090 | pcp->count += rmqueue_bulk(zone, 0, | ||
| 1091 | pcp->batch, &pcp->list, migratetype); | ||
| 1092 | page = list_entry(pcp->list.next, struct page, lru); | ||
| 1093 | } | ||
| 1094 | |||
| 864 | list_del(&page->lru); | 1095 | list_del(&page->lru); |
| 865 | pcp->count--; | 1096 | pcp->count--; |
| 866 | } else { | 1097 | } else { |
| 867 | spin_lock_irqsave(&zone->lock, flags); | 1098 | spin_lock_irqsave(&zone->lock, flags); |
| 868 | page = __rmqueue(zone, order); | 1099 | page = __rmqueue(zone, order, migratetype); |
| 869 | spin_unlock(&zone->lock); | 1100 | spin_unlock(&zone->lock); |
| 870 | if (!page) | 1101 | if (!page) |
| 871 | goto failed; | 1102 | goto failed; |
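
In the buffered_rmqueue() hunk above, a 0-order request no longer takes the head of the per-cpu list blindly: each page carries its migratetype in page_private (stored when the list was filled), the list is scanned for a matching type, and rmqueue_bulk() refills it if nothing matches. A toy model (ordinary arrays, not kernel lists) of that search-then-refill decision:

    #include <stdio.h>

    #define NPCP 4

    int main(void)
    {
            /* migratetype cached per entry, as page_private would be */
            int pcp_type[NPCP] = { 2, 2, 0, 2 };   /* mostly MOVABLE(2), one UNMOVABLE(0) */
            int want = 0, found = -1;

            for (int i = 0; i < NPCP; i++)          /* scan for a matching type */
                    if (pcp_type[i] == want) {
                            found = i;
                            break;
                    }

            if (found < 0)
                    puts("no match: refill the pcp list (rmqueue_bulk) and retry");
            else
                    printf("take pcp entry %d of type %d\n", found, want);
            return 0;
    }
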
| @@ -1032,7 +1263,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
| 1032 | * | 1263 | * |
| 1033 | * If the zonelist cache is present in the passed in zonelist, then | 1264 | * If the zonelist cache is present in the passed in zonelist, then |
| 1034 | * returns a pointer to the allowed node mask (either the current | 1265 | * returns a pointer to the allowed node mask (either the current |
| 1035 | * tasks mems_allowed, or node_online_map.) | 1266 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) |
| 1036 | * | 1267 | * |
| 1037 | * If the zonelist cache is not available for this zonelist, does | 1268 | * If the zonelist cache is not available for this zonelist, does |
| 1038 | * nothing and returns NULL. | 1269 | * nothing and returns NULL. |
| @@ -1061,7 +1292,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
| 1061 | 1292 | ||
| 1062 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1293 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
| 1063 | &cpuset_current_mems_allowed : | 1294 | &cpuset_current_mems_allowed : |
| 1064 | &node_online_map; | 1295 | &node_states[N_HIGH_MEMORY]; |
| 1065 | return allowednodes; | 1296 | return allowednodes; |
| 1066 | } | 1297 | } |
| 1067 | 1298 | ||
| @@ -1183,9 +1414,6 @@ zonelist_scan: | |||
| 1183 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1414 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
| 1184 | continue; | 1415 | continue; |
| 1185 | zone = *z; | 1416 | zone = *z; |
| 1186 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
| 1187 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
| 1188 | break; | ||
| 1189 | if ((alloc_flags & ALLOC_CPUSET) && | 1417 | if ((alloc_flags & ALLOC_CPUSET) && |
| 1190 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
| 1191 | goto try_next_zone; | 1419 | goto try_next_zone; |
| @@ -1254,7 +1482,10 @@ restart: | |||
| 1254 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1482 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
| 1255 | 1483 | ||
| 1256 | if (unlikely(*z == NULL)) { | 1484 | if (unlikely(*z == NULL)) { |
| 1257 | /* Should this ever happen?? */ | 1485 | /* |
| 1486 | * Happens if we have an empty zonelist as a result of | ||
| 1487 | * GFP_THISNODE being used on a memoryless node | ||
| 1488 | */ | ||
| 1258 | return NULL; | 1489 | return NULL; |
| 1259 | } | 1490 | } |
| 1260 | 1491 | ||
| @@ -1346,6 +1577,9 @@ nofail_alloc: | |||
| 1346 | 1577 | ||
| 1347 | cond_resched(); | 1578 | cond_resched(); |
| 1348 | 1579 | ||
| 1580 | if (order != 0) | ||
| 1581 | drain_all_local_pages(); | ||
| 1582 | |||
| 1349 | if (likely(did_some_progress)) { | 1583 | if (likely(did_some_progress)) { |
| 1350 | page = get_page_from_freelist(gfp_mask, order, | 1584 | page = get_page_from_freelist(gfp_mask, order, |
| 1351 | zonelist, alloc_flags); | 1585 | zonelist, alloc_flags); |
| @@ -1794,7 +2028,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 1794 | return node; | 2028 | return node; |
| 1795 | } | 2029 | } |
| 1796 | 2030 | ||
| 1797 | for_each_online_node(n) { | 2031 | for_each_node_state(n, N_HIGH_MEMORY) { |
| 1798 | cpumask_t tmp; | 2032 | cpumask_t tmp; |
| 1799 | 2033 | ||
| 1800 | /* Don't want a node to appear more than once */ | 2034 | /* Don't want a node to appear more than once */ |
| @@ -1850,6 +2084,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
| 1850 | } | 2084 | } |
| 1851 | 2085 | ||
| 1852 | /* | 2086 | /* |
| 2087 | * Build gfp_thisnode zonelists | ||
| 2088 | */ | ||
| 2089 | static void build_thisnode_zonelists(pg_data_t *pgdat) | ||
| 2090 | { | ||
| 2091 | enum zone_type i; | ||
| 2092 | int j; | ||
| 2093 | struct zonelist *zonelist; | ||
| 2094 | |||
| 2095 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 2096 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | ||
| 2097 | j = build_zonelists_node(pgdat, zonelist, 0, i); | ||
| 2098 | zonelist->zones[j] = NULL; | ||
| 2099 | } | ||
| 2100 | } | ||
| 2101 | |||
| 2102 | /* | ||
| 1853 | * Build zonelists ordered by zone and nodes within zones. | 2103 | * Build zonelists ordered by zone and nodes within zones. |
| 1854 | * This results in conserving DMA zone[s] until all Normal memory is | 2104 | * This results in conserving DMA zone[s] until all Normal memory is |
| 1855 | * exhausted, but results in overflowing to remote node while memory | 2105 | * exhausted, but results in overflowing to remote node while memory |
| @@ -1915,7 +2165,8 @@ static int default_zonelist_order(void) | |||
| 1915 | * If there is a node whose DMA/DMA32 memory is very big area on | 2165 | * If there is a node whose DMA/DMA32 memory is very big area on |
| 1916 | * local memory, NODE_ORDER may be suitable. | 2166 | * local memory, NODE_ORDER may be suitable. |
| 1917 | */ | 2167 | */ |
| 1918 | average_size = total_size / (num_online_nodes() + 1); | 2168 | average_size = total_size / |
| 2169 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | ||
| 1919 | for_each_online_node(nid) { | 2170 | for_each_online_node(nid) { |
| 1920 | low_kmem_size = 0; | 2171 | low_kmem_size = 0; |
| 1921 | total_size = 0; | 2172 | total_size = 0; |
| @@ -1953,7 +2204,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
| 1953 | int order = current_zonelist_order; | 2204 | int order = current_zonelist_order; |
| 1954 | 2205 | ||
| 1955 | /* initialize zonelists */ | 2206 | /* initialize zonelists */ |
| 1956 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2207 | for (i = 0; i < MAX_ZONELISTS; i++) { |
| 1957 | zonelist = pgdat->node_zonelists + i; | 2208 | zonelist = pgdat->node_zonelists + i; |
| 1958 | zonelist->zones[0] = NULL; | 2209 | zonelist->zones[0] = NULL; |
| 1959 | } | 2210 | } |
| @@ -1998,6 +2249,8 @@ static void build_zonelists(pg_data_t *pgdat) | |||
| 1998 | /* calculate node order -- i.e., DMA last! */ | 2249 | /* calculate node order -- i.e., DMA last! */ |
| 1999 | build_zonelists_in_zone_order(pgdat, j); | 2250 | build_zonelists_in_zone_order(pgdat, j); |
| 2000 | } | 2251 | } |
| 2252 | |||
| 2253 | build_thisnode_zonelists(pgdat); | ||
| 2001 | } | 2254 | } |
| 2002 | 2255 | ||
| 2003 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2256 | /* Construct the zonelist performance cache - see further mmzone.h */ |
| @@ -2078,8 +2331,10 @@ static int __build_all_zonelists(void *dummy) | |||
| 2078 | int nid; | 2331 | int nid; |
| 2079 | 2332 | ||
| 2080 | for_each_online_node(nid) { | 2333 | for_each_online_node(nid) { |
| 2081 | build_zonelists(NODE_DATA(nid)); | 2334 | pg_data_t *pgdat = NODE_DATA(nid); |
| 2082 | build_zonelist_cache(NODE_DATA(nid)); | 2335 | |
| 2336 | build_zonelists(pgdat); | ||
| 2337 | build_zonelist_cache(pgdat); | ||
| 2083 | } | 2338 | } |
| 2084 | return 0; | 2339 | return 0; |
| 2085 | } | 2340 | } |
| @@ -2098,9 +2353,23 @@ void build_all_zonelists(void) | |||
| 2098 | /* cpuset refresh routine should be here */ | 2353 | /* cpuset refresh routine should be here */ |
| 2099 | } | 2354 | } |
| 2100 | vm_total_pages = nr_free_pagecache_pages(); | 2355 | vm_total_pages = nr_free_pagecache_pages(); |
| 2101 | printk("Built %i zonelists in %s order. Total pages: %ld\n", | 2356 | /* |
| 2357 | * Disable grouping by mobility if the number of pages in the | ||
| 2358 | * system is too low to allow the mechanism to work. It would be | ||
| 2359 | * more accurate, but expensive to check per-zone. This check is | ||
| 2360 | * made on memory-hotadd so a system can start with mobility | ||
| 2361 | * disabled and enable it later | ||
| 2362 | */ | ||
| 2363 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | ||
| 2364 | page_group_by_mobility_disabled = 1; | ||
| 2365 | else | ||
| 2366 | page_group_by_mobility_disabled = 0; | ||
| 2367 | |||
| 2368 | printk("Built %i zonelists in %s order, mobility grouping %s. " | ||
| 2369 | "Total pages: %ld\n", | ||
| 2102 | num_online_nodes(), | 2370 | num_online_nodes(), |
| 2103 | zonelist_order_name[current_zonelist_order], | 2371 | zonelist_order_name[current_zonelist_order], |
| 2372 | page_group_by_mobility_disabled ? "off" : "on", | ||
| 2104 | vm_total_pages); | 2373 | vm_total_pages); |
| 2105 | #ifdef CONFIG_NUMA | 2374 | #ifdef CONFIG_NUMA |
| 2106 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 2375 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
| @@ -2176,6 +2445,61 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
| 2176 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 2445 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
| 2177 | 2446 | ||
| 2178 | /* | 2447 | /* |
| 2448 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | ||
| 2449 | * of blocks reserved is based on zone->pages_min. The memory within the | ||
| 2450 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | ||
| 2451 | * higher will lead to a bigger reserve which will get freed as contiguous | ||
| 2452 | * blocks as reclaim kicks in | ||
| 2453 | */ | ||
| 2454 | static void setup_zone_migrate_reserve(struct zone *zone) | ||
| 2455 | { | ||
| 2456 | unsigned long start_pfn, pfn, end_pfn; | ||
| 2457 | struct page *page; | ||
| 2458 | unsigned long reserve, block_migratetype; | ||
| 2459 | |||
| 2460 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | ||
| 2461 | start_pfn = zone->zone_start_pfn; | ||
| 2462 | end_pfn = start_pfn + zone->spanned_pages; | ||
| 2463 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | ||
| 2464 | pageblock_order; | ||
| 2465 | |||
| 2466 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
| 2467 | if (!pfn_valid(pfn)) | ||
| 2468 | continue; | ||
| 2469 | page = pfn_to_page(pfn); | ||
| 2470 | |||
| 2471 | /* Blocks with reserved pages will never free, skip them. */ | ||
| 2472 | if (PageReserved(page)) | ||
| 2473 | continue; | ||
| 2474 | |||
| 2475 | block_migratetype = get_pageblock_migratetype(page); | ||
| 2476 | |||
| 2477 | /* If this block is reserved, account for it */ | ||
| 2478 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | ||
| 2479 | reserve--; | ||
| 2480 | continue; | ||
| 2481 | } | ||
| 2482 | |||
| 2483 | /* Suitable for reserving if this block is movable */ | ||
| 2484 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | ||
| 2485 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | ||
| 2486 | move_freepages_block(zone, page, MIGRATE_RESERVE); | ||
| 2487 | reserve--; | ||
| 2488 | continue; | ||
| 2489 | } | ||
| 2490 | |||
| 2491 | /* | ||
| 2492 | * If the reserve is met and this is a previous reserved block, | ||
| 2493 | * take it back | ||
| 2494 | */ | ||
| 2495 | if (block_migratetype == MIGRATE_RESERVE) { | ||
| 2496 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 2497 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
| 2498 | } | ||
| 2499 | } | ||
| 2500 | } | ||
| 2501 | |||
| 2502 | /* | ||
| 2179 | * Initially all pages are reserved - free ones are freed | 2503 | * Initially all pages are reserved - free ones are freed |
| 2180 | * up by free_all_bootmem() once the early boot process is | 2504 | * up by free_all_bootmem() once the early boot process is |
| 2181 | * done. Non-atomic initialization, single-pass. | 2505 | * done. Non-atomic initialization, single-pass. |
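
setup_zone_migrate_reserve() above sizes the reserve from zone->pages_min rounded up to whole pageblocks, then walks the zone converting movable blocks to MIGRATE_RESERVE until that many are reserved, and converting any surplus reserve blocks back to movable. A worked example of the sizing step, with assumed values not taken from the patch:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed: 4K pages, pageblock_order = 9, so 512 pages per block,
             * and a zone whose pages_min watermark is 1024 pages. */
            unsigned long pageblock_order = 9;
            unsigned long pageblock_nr_pages = 1UL << pageblock_order;
            unsigned long pages_min = 1024;

            unsigned long reserve = ((pages_min + pageblock_nr_pages - 1) /
                                     pageblock_nr_pages * pageblock_nr_pages)
                                    >> pageblock_order;

            printf("MIGRATE_RESERVE pageblocks: %lu\n", reserve);   /* prints 2 */
            return 0;
    }
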
| @@ -2204,6 +2528,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 2204 | init_page_count(page); | 2528 | init_page_count(page); |
| 2205 | reset_page_mapcount(page); | 2529 | reset_page_mapcount(page); |
| 2206 | SetPageReserved(page); | 2530 | SetPageReserved(page); |
| 2531 | |||
| 2532 | /* | ||
| 2533 | * Mark the block movable so that blocks are reserved for | ||
| 2534 | * movable at startup. This will force kernel allocations | ||
| 2535 | * to reserve their blocks rather than leaking throughout | ||
| 2536 | * the address space during boot when many long-lived | ||
| 2537 | * kernel allocations are made. Later some blocks near | ||
| 2538 | * the start are marked MIGRATE_RESERVE by | ||
| 2539 | * setup_zone_migrate_reserve() | ||
| 2540 | */ | ||
| 2541 | if ((pfn & (pageblock_nr_pages-1))) | ||
| 2542 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 2543 | |||
| 2207 | INIT_LIST_HEAD(&page->lru); | 2544 | INIT_LIST_HEAD(&page->lru); |
| 2208 | #ifdef WANT_PAGE_VIRTUAL | 2545 | #ifdef WANT_PAGE_VIRTUAL |
| 2209 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 2546 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
| @@ -2216,9 +2553,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 2216 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, | 2553 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
| 2217 | struct zone *zone, unsigned long size) | 2554 | struct zone *zone, unsigned long size) |
| 2218 | { | 2555 | { |
| 2219 | int order; | 2556 | int order, t; |
| 2220 | for (order = 0; order < MAX_ORDER ; order++) { | 2557 | for_each_migratetype_order(order, t) { |
| 2221 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 2558 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
| 2222 | zone->free_area[order].nr_free = 0; | 2559 | zone->free_area[order].nr_free = 0; |
| 2223 | } | 2560 | } |
| 2224 | } | 2561 | } |
| @@ -2324,6 +2661,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS]; | |||
| 2324 | static int __cpuinit process_zones(int cpu) | 2661 | static int __cpuinit process_zones(int cpu) |
| 2325 | { | 2662 | { |
| 2326 | struct zone *zone, *dzone; | 2663 | struct zone *zone, *dzone; |
| 2664 | int node = cpu_to_node(cpu); | ||
| 2665 | |||
| 2666 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
| 2327 | 2667 | ||
| 2328 | for_each_zone(zone) { | 2668 | for_each_zone(zone) { |
| 2329 | 2669 | ||
| @@ -2331,7 +2671,7 @@ static int __cpuinit process_zones(int cpu) | |||
| 2331 | continue; | 2671 | continue; |
| 2332 | 2672 | ||
| 2333 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2673 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
| 2334 | GFP_KERNEL, cpu_to_node(cpu)); | 2674 | GFP_KERNEL, node); |
| 2335 | if (!zone_pcp(zone, cpu)) | 2675 | if (!zone_pcp(zone, cpu)) |
| 2336 | goto bad; | 2676 | goto bad; |
| 2337 | 2677 | ||
| @@ -2444,7 +2784,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
| 2444 | * To use this new node's memory, further consideration will be | 2784 | * To use this new node's memory, further consideration will be |
| 2445 | * necessary. | 2785 | * necessary. |
| 2446 | */ | 2786 | */ |
| 2447 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | 2787 | zone->wait_table = vmalloc(alloc_size); |
| 2448 | } | 2788 | } |
| 2449 | if (!zone->wait_table) | 2789 | if (!zone->wait_table) |
| 2450 | return -ENOMEM; | 2790 | return -ENOMEM; |
| @@ -2680,10 +3020,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
| 2680 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 3020 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
| 2681 | } | 3021 | } |
| 2682 | 3022 | ||
| 2683 | if (*start_pfn == -1UL) { | 3023 | if (*start_pfn == -1UL) |
| 2684 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
| 2685 | *start_pfn = 0; | 3024 | *start_pfn = 0; |
| 2686 | } | ||
| 2687 | 3025 | ||
| 2688 | /* Push the node boundaries out if requested */ | 3026 | /* Push the node boundaries out if requested */ |
| 2689 | account_node_boundary(nid, start_pfn, end_pfn); | 3027 | account_node_boundary(nid, start_pfn, end_pfn); |
| @@ -2901,6 +3239,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
| 2901 | realtotalpages); | 3239 | realtotalpages); |
| 2902 | } | 3240 | } |
| 2903 | 3241 | ||
| 3242 | #ifndef CONFIG_SPARSEMEM | ||
| 3243 | /* | ||
| 3244 | * Calculate the size of the zone->blockflags rounded to an unsigned long | ||
| 3245 | * Start by making sure zonesize is a multiple of pageblock_order by rounding | ||
| 3246 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally | ||
| 3247 | * round what is now in bits to nearest long in bits, then return it in | ||
| 3248 | * bytes. | ||
| 3249 | */ | ||
| 3250 | static unsigned long __init usemap_size(unsigned long zonesize) | ||
| 3251 | { | ||
| 3252 | unsigned long usemapsize; | ||
| 3253 | |||
| 3254 | usemapsize = roundup(zonesize, pageblock_nr_pages); | ||
| 3255 | usemapsize = usemapsize >> pageblock_order; | ||
| 3256 | usemapsize *= NR_PAGEBLOCK_BITS; | ||
| 3257 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | ||
| 3258 | |||
| 3259 | return usemapsize / 8; | ||
| 3260 | } | ||
| 3261 | |||
| 3262 | static void __init setup_usemap(struct pglist_data *pgdat, | ||
| 3263 | struct zone *zone, unsigned long zonesize) | ||
| 3264 | { | ||
| 3265 | unsigned long usemapsize = usemap_size(zonesize); | ||
| 3266 | zone->pageblock_flags = NULL; | ||
| 3267 | if (usemapsize) { | ||
| 3268 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | ||
| 3269 | memset(zone->pageblock_flags, 0, usemapsize); | ||
| 3270 | } | ||
| 3271 | } | ||
| 3272 | #else | ||
| 3273 | static void inline setup_usemap(struct pglist_data *pgdat, | ||
| 3274 | struct zone *zone, unsigned long zonesize) {} | ||
| 3275 | #endif /* CONFIG_SPARSEMEM */ | ||
| 3276 | |||
| 3277 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
| 3278 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | ||
| 3279 | static inline void __init set_pageblock_order(unsigned int order) | ||
| 3280 | { | ||
| 3281 | /* Check that pageblock_nr_pages has not already been setup */ | ||
| 3282 | if (pageblock_order) | ||
| 3283 | return; | ||
| 3284 | |||
| 3285 | /* | ||
| 3286 | * Assume the largest contiguous order of interest is a huge page. | ||
| 3287 | * This value may be variable depending on boot parameters on IA64 | ||
| 3288 | */ | ||
| 3289 | pageblock_order = order; | ||
| 3290 | } | ||
| 3291 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
| 3292 | |||
| 3293 | /* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */ | ||
| 3294 | #define set_pageblock_order(x) do {} while (0) | ||
| 3295 | |||
| 3296 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
| 3297 | |||
| 2904 | /* | 3298 | /* |
| 2905 | * Set up the zone data structures: | 3299 | * Set up the zone data structures: |
| 2906 | * - mark all pages reserved | 3300 | * - mark all pages reserved |
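
usemap_size() above works purely in units: round the zone up to whole pageblocks, spend NR_PAGEBLOCK_BITS per block, round the bit count up to a whole unsigned long, and return bytes. The same arithmetic as a standalone program, with assumed example values (NR_PAGEBLOCK_BITS taken as 4, which this hunk does not itself define):

    #include <stdio.h>

    int main(void)
    {
            /* Assumed: a 1GiB zone of 4K pages, pageblock_order = 9. */
            unsigned long zonesize = 262144;            /* pages             */
            unsigned long pageblock_nr_pages = 512;
            unsigned long pageblock_order = 9;
            unsigned long nr_pageblock_bits = 4;        /* assumed           */
            unsigned long bits_per_long = 8 * sizeof(unsigned long);

            unsigned long usemapsize =
                    (zonesize + pageblock_nr_pages - 1) & ~(pageblock_nr_pages - 1);
            usemapsize >>= pageblock_order;             /* 512 pageblocks    */
            usemapsize *= nr_pageblock_bits;            /* 2048 bits         */
            usemapsize = (usemapsize + bits_per_long - 1) & ~(bits_per_long - 1);

            printf("usemap bytes: %lu\n", usemapsize / 8);   /* prints 256 */
            return 0;
    }
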
| @@ -2981,6 +3375,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
| 2981 | if (!size) | 3375 | if (!size) |
| 2982 | continue; | 3376 | continue; |
| 2983 | 3377 | ||
| 3378 | set_pageblock_order(HUGETLB_PAGE_ORDER); | ||
| 3379 | setup_usemap(pgdat, zone, size); | ||
| 2984 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 3380 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
| 2985 | size, MEMMAP_EARLY); | 3381 | size, MEMMAP_EARLY); |
| 2986 | BUG_ON(ret); | 3382 | BUG_ON(ret); |
| @@ -3234,16 +3630,24 @@ unsigned long __init find_max_pfn_with_active_regions(void) | |||
| 3234 | return max_pfn; | 3630 | return max_pfn; |
| 3235 | } | 3631 | } |
| 3236 | 3632 | ||
| 3237 | unsigned long __init early_calculate_totalpages(void) | 3633 | /* |
| 3634 | * early_calculate_totalpages() | ||
| 3635 | * Sum pages in active regions for movable zone. | ||
| 3636 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | ||
| 3637 | */ | ||
| 3638 | static unsigned long __init early_calculate_totalpages(void) | ||
| 3238 | { | 3639 | { |
| 3239 | int i; | 3640 | int i; |
| 3240 | unsigned long totalpages = 0; | 3641 | unsigned long totalpages = 0; |
| 3241 | 3642 | ||
| 3242 | for (i = 0; i < nr_nodemap_entries; i++) | 3643 | for (i = 0; i < nr_nodemap_entries; i++) { |
| 3243 | totalpages += early_node_map[i].end_pfn - | 3644 | unsigned long pages = early_node_map[i].end_pfn - |
| 3244 | early_node_map[i].start_pfn; | 3645 | early_node_map[i].start_pfn; |
| 3245 | 3646 | totalpages += pages; | |
| 3246 | return totalpages; | 3647 | if (pages) |
| 3648 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); | ||
| 3649 | } | ||
| 3650 | return totalpages; | ||
| 3247 | } | 3651 | } |
| 3248 | 3652 | ||
| 3249 | /* | 3653 | /* |
| @@ -3257,7 +3661,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3257 | int i, nid; | 3661 | int i, nid; |
| 3258 | unsigned long usable_startpfn; | 3662 | unsigned long usable_startpfn; |
| 3259 | unsigned long kernelcore_node, kernelcore_remaining; | 3663 | unsigned long kernelcore_node, kernelcore_remaining; |
| 3260 | int usable_nodes = num_online_nodes(); | 3664 | unsigned long totalpages = early_calculate_totalpages(); |
| 3665 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | ||
| 3261 | 3666 | ||
| 3262 | /* | 3667 | /* |
| 3263 | * If movablecore was specified, calculate what size of | 3668 | * If movablecore was specified, calculate what size of |
| @@ -3268,7 +3673,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3268 | * what movablecore would have allowed. | 3673 | * what movablecore would have allowed. |
| 3269 | */ | 3674 | */ |
| 3270 | if (required_movablecore) { | 3675 | if (required_movablecore) { |
| 3271 | unsigned long totalpages = early_calculate_totalpages(); | ||
| 3272 | unsigned long corepages; | 3676 | unsigned long corepages; |
| 3273 | 3677 | ||
| 3274 | /* | 3678 | /* |
| @@ -3293,7 +3697,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3293 | restart: | 3697 | restart: |
| 3294 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 3698 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
| 3295 | kernelcore_node = required_kernelcore / usable_nodes; | 3699 | kernelcore_node = required_kernelcore / usable_nodes; |
| 3296 | for_each_online_node(nid) { | 3700 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 3297 | /* | 3701 | /* |
| 3298 | * Recalculate kernelcore_node if the division per node | 3702 | * Recalculate kernelcore_node if the division per node |
| 3299 | * now exceeds what is necessary to satisfy the requested | 3703 | * now exceeds what is necessary to satisfy the requested |
| @@ -3385,6 +3789,20 @@ restart: | |||
| 3385 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 3789 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
| 3386 | } | 3790 | } |
| 3387 | 3791 | ||
| 3792 | /* Any regular memory on that node? */ | ||
| 3793 | static void check_for_regular_memory(pg_data_t *pgdat) | ||
| 3794 | { | ||
| 3795 | #ifdef CONFIG_HIGHMEM | ||
| 3796 | enum zone_type zone_type; | ||
| 3797 | |||
| 3798 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | ||
| 3799 | struct zone *zone = &pgdat->node_zones[zone_type]; | ||
| 3800 | if (zone->present_pages) | ||
| 3801 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
| 3802 | } | ||
| 3803 | #endif | ||
| 3804 | } | ||
| 3805 | |||
| 3388 | /** | 3806 | /** |
| 3389 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 3807 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
| 3390 | * @max_zone_pfn: an array of max PFNs for each zone | 3808 | * @max_zone_pfn: an array of max PFNs for each zone |
| @@ -3459,6 +3877,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 3459 | pg_data_t *pgdat = NODE_DATA(nid); | 3877 | pg_data_t *pgdat = NODE_DATA(nid); |
| 3460 | free_area_init_node(nid, pgdat, NULL, | 3878 | free_area_init_node(nid, pgdat, NULL, |
| 3461 | find_min_pfn_for_node(nid), NULL); | 3879 | find_min_pfn_for_node(nid), NULL); |
| 3880 | |||
| 3881 | /* Any memory on that node? */ | ||
| 3882 | if (pgdat->node_present_pages) | ||
| 3883 | node_set_state(nid, N_HIGH_MEMORY); | ||
| 3884 | check_for_regular_memory(pgdat); | ||
| 3462 | } | 3885 | } |
| 3463 | } | 3886 | } |
| 3464 | 3887 | ||
| @@ -3673,6 +4096,7 @@ void setup_per_zone_pages_min(void) | |||
| 3673 | 4096 | ||
| 3674 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4097 | zone->pages_low = zone->pages_min + (tmp >> 2); |
| 3675 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4098 | zone->pages_high = zone->pages_min + (tmp >> 1); |
| 4099 | setup_zone_migrate_reserve(zone); | ||
| 3676 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4100 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 3677 | } | 4101 | } |
| 3678 | 4102 | ||
| @@ -3934,4 +4358,169 @@ EXPORT_SYMBOL(pfn_to_page); | |||
| 3934 | EXPORT_SYMBOL(page_to_pfn); | 4358 | EXPORT_SYMBOL(page_to_pfn); |
| 3935 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 4359 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
| 3936 | 4360 | ||
| 4361 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | ||
| 4362 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | ||
| 4363 | unsigned long pfn) | ||
| 4364 | { | ||
| 4365 | #ifdef CONFIG_SPARSEMEM | ||
| 4366 | return __pfn_to_section(pfn)->pageblock_flags; | ||
| 4367 | #else | ||
| 4368 | return zone->pageblock_flags; | ||
| 4369 | #endif /* CONFIG_SPARSEMEM */ | ||
| 4370 | } | ||
| 4371 | |||
| 4372 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | ||
| 4373 | { | ||
| 4374 | #ifdef CONFIG_SPARSEMEM | ||
| 4375 | pfn &= (PAGES_PER_SECTION-1); | ||
| 4376 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
| 4377 | #else | ||
| 4378 | pfn = pfn - zone->zone_start_pfn; | ||
| 4379 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
| 4380 | #endif /* CONFIG_SPARSEMEM */ | ||
| 4381 | } | ||
| 4382 | |||
| 4383 | /** | ||
| 4384 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | ||
| 4385 | * @page: The page within the block of interest | ||
| 4386 | * @start_bitidx: The first bit of interest to retrieve | ||
| 4387 | * @end_bitidx: The last bit of interest | ||
| 4388 | * returns pageblock_bits flags | ||
| 4389 | */ | ||
| 4390 | unsigned long get_pageblock_flags_group(struct page *page, | ||
| 4391 | int start_bitidx, int end_bitidx) | ||
| 4392 | { | ||
| 4393 | struct zone *zone; | ||
| 4394 | unsigned long *bitmap; | ||
| 4395 | unsigned long pfn, bitidx; | ||
| 4396 | unsigned long flags = 0; | ||
| 4397 | unsigned long value = 1; | ||
| 4398 | |||
| 4399 | zone = page_zone(page); | ||
| 4400 | pfn = page_to_pfn(page); | ||
| 4401 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
| 4402 | bitidx = pfn_to_bitidx(zone, pfn); | ||
| 4403 | |||
| 4404 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
| 4405 | if (test_bit(bitidx + start_bitidx, bitmap)) | ||
| 4406 | flags |= value; | ||
| 4407 | |||
| 4408 | return flags; | ||
| 4409 | } | ||
| 3937 | 4410 | ||
| 4411 | /** | ||
| 4412 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | ||
| 4413 | * @page: The page within the block of interest | ||
| 4414 | * @start_bitidx: The first bit of interest | ||
| 4415 | * @end_bitidx: The last bit of interest | ||
| 4416 | * @flags: The flags to set | ||
| 4417 | */ | ||
| 4418 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | ||
| 4419 | int start_bitidx, int end_bitidx) | ||
| 4420 | { | ||
| 4421 | struct zone *zone; | ||
| 4422 | unsigned long *bitmap; | ||
| 4423 | unsigned long pfn, bitidx; | ||
| 4424 | unsigned long value = 1; | ||
| 4425 | |||
| 4426 | zone = page_zone(page); | ||
| 4427 | pfn = page_to_pfn(page); | ||
| 4428 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
| 4429 | bitidx = pfn_to_bitidx(zone, pfn); | ||
| 4430 | |||
| 4431 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
| 4432 | if (flags & value) | ||
| 4433 | __set_bit(bitidx + start_bitidx, bitmap); | ||
| 4434 | else | ||
| 4435 | __clear_bit(bitidx + start_bitidx, bitmap); | ||
| 4436 | } | ||
| 4437 | |||
| 4438 | /* | ||
| 4439 | * This is designed as a sub-function; please see page_isolation.c also. | ||
| 4440 | * Set/clear a page block's type to be ISOLATE. | ||
| 4441 | * The page allocator never allocates memory from an ISOLATE block. | ||
| 4442 | */ | ||
| 4443 | |||
| 4444 | int set_migratetype_isolate(struct page *page) | ||
| 4445 | { | ||
| 4446 | struct zone *zone; | ||
| 4447 | unsigned long flags; | ||
| 4448 | int ret = -EBUSY; | ||
| 4449 | |||
| 4450 | zone = page_zone(page); | ||
| 4451 | spin_lock_irqsave(&zone->lock, flags); | ||
| 4452 | /* | ||
| 4453 | * In the future, more migrate types will be able to be isolation targets. | ||
| 4454 | */ | ||
| 4455 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | ||
| 4456 | goto out; | ||
| 4457 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
| 4458 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
| 4459 | ret = 0; | ||
| 4460 | out: | ||
| 4461 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 4462 | if (!ret) | ||
| 4463 | drain_all_local_pages(); | ||
| 4464 | return ret; | ||
| 4465 | } | ||
| 4466 | |||
| 4467 | void unset_migratetype_isolate(struct page *page) | ||
| 4468 | { | ||
| 4469 | struct zone *zone; | ||
| 4470 | unsigned long flags; | ||
| 4471 | zone = page_zone(page); | ||
| 4472 | spin_lock_irqsave(&zone->lock, flags); | ||
| 4473 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
| 4474 | goto out; | ||
| 4475 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 4476 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
| 4477 | out: | ||
| 4478 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 4479 | } | ||
| 4480 | |||
| 4481 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 4482 | /* | ||
| 4483 | * All pages in the range must be isolated before calling this. | ||
| 4484 | */ | ||
| 4485 | void | ||
| 4486 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
| 4487 | { | ||
| 4488 | struct page *page; | ||
| 4489 | struct zone *zone; | ||
| 4490 | int order, i; | ||
| 4491 | unsigned long pfn; | ||
| 4492 | unsigned long flags; | ||
| 4493 | /* find the first valid pfn */ | ||
| 4494 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
| 4495 | if (pfn_valid(pfn)) | ||
| 4496 | break; | ||
| 4497 | if (pfn == end_pfn) | ||
| 4498 | return; | ||
| 4499 | zone = page_zone(pfn_to_page(pfn)); | ||
| 4500 | spin_lock_irqsave(&zone->lock, flags); | ||
| 4501 | pfn = start_pfn; | ||
| 4502 | while (pfn < end_pfn) { | ||
| 4503 | if (!pfn_valid(pfn)) { | ||
| 4504 | pfn++; | ||
| 4505 | continue; | ||
| 4506 | } | ||
| 4507 | page = pfn_to_page(pfn); | ||
| 4508 | BUG_ON(page_count(page)); | ||
| 4509 | BUG_ON(!PageBuddy(page)); | ||
| 4510 | order = page_order(page); | ||
| 4511 | #ifdef CONFIG_DEBUG_VM | ||
| 4512 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | ||
| 4513 | pfn, 1 << order, end_pfn); | ||
| 4514 | #endif | ||
| 4515 | list_del(&page->lru); | ||
| 4516 | rmv_page_order(page); | ||
| 4517 | zone->free_area[order].nr_free--; | ||
| 4518 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
| 4519 | - (1UL << order)); | ||
| 4520 | for (i = 0; i < (1 << order); i++) | ||
| 4521 | SetPageReserved((page+i)); | ||
| 4522 | pfn += (1 << order); | ||
| 4523 | } | ||
| 4524 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 4525 | } | ||
| 4526 | #endif | ||
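For context, a minimal sketch of how the bit-group accessors above are presumably wrapped into the migratetype helpers used throughout this patch (get_pageblock_migratetype / set_pageblock_migratetype); the real wrappers live in the pageblock-flags and mmzone headers rather than in this file, and PB_migrate/PB_migrate_end are assumed here to name the migrate-type bit range:

/* Sketch only -- not part of the patch above. */
static inline int sketch_get_pageblock_migratetype(struct page *page)
{
	/* read the migrate-type bit group for this 2^pageblock_order block */
	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}

static inline void sketch_set_pageblock_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
				  PB_migrate, PB_migrate_end);
}

Storing the type once per 2^pageblock_order block, rather than per page, is what keeps the usemap calculated earlier so small.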
diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 000000000000..8f92a29695cc --- /dev/null +++ b/mm/page_isolation.c | |||
| @@ -0,0 +1,138 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/page_isolation.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <stddef.h> | ||
| 6 | #include <linux/mm.h> | ||
| 7 | #include <linux/page-isolation.h> | ||
| 8 | #include <linux/pageblock-flags.h> | ||
| 9 | #include "internal.h" | ||
| 10 | |||
| 11 | static inline struct page * | ||
| 12 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | ||
| 13 | { | ||
| 14 | int i; | ||
| 15 | for (i = 0; i < nr_pages; i++) | ||
| 16 | if (pfn_valid_within(pfn + i)) | ||
| 17 | break; | ||
| 18 | if (unlikely(i == nr_pages)) | ||
| 19 | return NULL; | ||
| 20 | return pfn_to_page(pfn + i); | ||
| 21 | } | ||
| 22 | |||
| 23 | /* | ||
| 24 | * start_isolate_page_range() -- set the page-allocation type of a range | ||
| 25 | * of pages to MIGRATE_ISOLATE. | ||
| 26 | * @start_pfn: The lower PFN of the range to be isolated. | ||
| 27 | * @end_pfn: The upper PFN of the range to be isolated. | ||
| 28 | * | ||
| 29 | * Setting the page-allocation type to MIGRATE_ISOLATE means that free pages | ||
| 30 | * in the range will never be handed out: pages that are free now, and pages | ||
| 31 | * freed in the future, will not be allocated again. | ||
| 32 | * | ||
| 33 | * start_pfn/end_pfn must be aligned to pageblock_order. | ||
| 34 | * Returns 0 on success and -EBUSY if any part of the range cannot be isolated. | ||
| 35 | */ | ||
| 36 | int | ||
| 37 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
| 38 | { | ||
| 39 | unsigned long pfn; | ||
| 40 | unsigned long undo_pfn; | ||
| 41 | struct page *page; | ||
| 42 | |||
| 43 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
| 44 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
| 45 | |||
| 46 | for (pfn = start_pfn; | ||
| 47 | pfn < end_pfn; | ||
| 48 | pfn += pageblock_nr_pages) { | ||
| 49 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
| 50 | if (page && set_migratetype_isolate(page)) { | ||
| 51 | undo_pfn = pfn; | ||
| 52 | goto undo; | ||
| 53 | } | ||
| 54 | } | ||
| 55 | return 0; | ||
| 56 | undo: | ||
| 57 | for (pfn = start_pfn; | ||
| 58 | pfn <= undo_pfn; | ||
| 59 | pfn += pageblock_nr_pages) | ||
| 60 | unset_migratetype_isolate(pfn_to_page(pfn)); | ||
| 61 | |||
| 62 | return -EBUSY; | ||
| 63 | } | ||
| 64 | |||
| 65 | /* | ||
| 66 | * Make isolated pages available again. | ||
| 67 | */ | ||
| 68 | int | ||
| 69 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
| 70 | { | ||
| 71 | unsigned long pfn; | ||
| 72 | struct page *page; | ||
| 73 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
| 74 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
| 75 | for (pfn = start_pfn; | ||
| 76 | pfn < end_pfn; | ||
| 77 | pfn += pageblock_nr_pages) { | ||
| 78 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
| 79 | if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
| 80 | continue; | ||
| 81 | unset_migratetype_isolate(page); | ||
| 82 | } | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | /* | ||
| 86 | * Test whether all pages in the range are free (i.e. isolated). | ||
| 87 | * All pages in [start_pfn...end_pfn) must be in the same zone. | ||
| 88 | * zone->lock must be held before calling this. | ||
| 89 | * | ||
| 90 | * Returns 1 if all pages in the range are isolated, 0 otherwise. | ||
| 91 | */ | ||
| 92 | static int | ||
| 93 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | ||
| 94 | { | ||
| 95 | struct page *page; | ||
| 96 | |||
| 97 | while (pfn < end_pfn) { | ||
| 98 | if (!pfn_valid_within(pfn)) { | ||
| 99 | pfn++; | ||
| 100 | continue; | ||
| 101 | } | ||
| 102 | page = pfn_to_page(pfn); | ||
| 103 | if (PageBuddy(page)) | ||
| 104 | pfn += 1 << page_order(page); | ||
| 105 | else if (page_count(page) == 0 && | ||
| 106 | page_private(page) == MIGRATE_ISOLATE) | ||
| 107 | pfn += 1; | ||
| 108 | else | ||
| 109 | break; | ||
| 110 | } | ||
| 111 | if (pfn < end_pfn) | ||
| 112 | return 0; | ||
| 113 | return 1; | ||
| 114 | } | ||
| 115 | |||
| 116 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
| 117 | { | ||
| 118 | unsigned long pfn; | ||
| 119 | struct page *page; | ||
| 120 | |||
| 121 | pfn = start_pfn; | ||
| 122 | /* | ||
| 123 | * Note: pageblock_nr_pages may differ from a MAX_ORDER chunk, so a run of | ||
| 124 | * free pages is not necessarily aligned to pageblock_nr_pages. | ||
| 125 | * We therefore check the pagetype first. | ||
| 126 | */ | ||
| 127 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
| 128 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
| 129 | if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
| 130 | break; | ||
| 131 | } | ||
| 132 | if (pfn < end_pfn) | ||
| 133 | return -EBUSY; | ||
| 134 | /* Check that all pages are free or marked as ISOLATE */ | ||
| 135 | if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) | ||
| 136 | return 0; | ||
| 137 | return -EBUSY; | ||
| 138 | } | ||
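Taken together with __offline_isolated_pages() in page_alloc.c above, the intended hot-remove sequence is roughly the following sketch; it is simplified, the real caller lives in the memory-hotplug code (not shown here), and error handling is omitted:

/* start_pfn/end_pfn must be pageblock-aligned, as the BUG_ON()s above require */
if (start_isolate_page_range(start_pfn, end_pfn))
	return -EBUSY;		/* some pageblock was not MIGRATE_MOVABLE */

/* ... migrate or free every in-use page in [start_pfn, end_pfn) ... */

if (test_pages_isolated(start_pfn, end_pfn) == 0)
	__offline_isolated_pages(start_pfn, end_pfn);	/* pull them off the buddy lists */
else
	undo_isolate_page_range(start_pfn, end_pfn);	/* give the range back */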
diff --git a/mm/readahead.c b/mm/readahead.c index be20c9d699d3..229788884010 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
| 22 | } | 22 | } |
| 23 | EXPORT_SYMBOL(default_unplug_io_fn); | 23 | EXPORT_SYMBOL(default_unplug_io_fn); |
| 24 | 24 | ||
| 25 | /* | ||
| 26 | * Convienent macros for min/max read-ahead pages. | ||
| 27 | * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. | ||
| 28 | * The latter is necessary for systems with large page size(i.e. 64k). | ||
| 29 | */ | ||
| 30 | #define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) | ||
| 31 | #define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) | ||
| 32 | |||
| 33 | struct backing_dev_info default_backing_dev_info = { | 25 | struct backing_dev_info default_backing_dev_info = { |
| 34 | .ra_pages = MAX_RA_PAGES, | 26 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
| 35 | .state = 0, | 27 | .state = 0, |
| 36 | .capabilities = BDI_CAP_MAP_COPY, | 28 | .capabilities = BDI_CAP_MAP_COPY, |
| 37 | .unplug_io_fn = default_unplug_io_fn, | 29 | .unplug_io_fn = default_unplug_io_fn, |
| @@ -46,7 +38,7 @@ void | |||
| 46 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 38 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
| 47 | { | 39 | { |
| 48 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 40 | ra->ra_pages = mapping->backing_dev_info->ra_pages; |
| 49 | ra->prev_index = -1; | 41 | ra->prev_pos = -1; |
| 50 | } | 42 | } |
| 51 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 43 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
| 52 | 44 | ||
| @@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
| 66 | int (*filler)(void *, struct page *), void *data) | 58 | int (*filler)(void *, struct page *), void *data) |
| 67 | { | 59 | { |
| 68 | struct page *page; | 60 | struct page *page; |
| 69 | struct pagevec lru_pvec; | ||
| 70 | int ret = 0; | 61 | int ret = 0; |
| 71 | 62 | ||
| 72 | pagevec_init(&lru_pvec, 0); | ||
| 73 | |||
| 74 | while (!list_empty(pages)) { | 63 | while (!list_empty(pages)) { |
| 75 | page = list_to_page(pages); | 64 | page = list_to_page(pages); |
| 76 | list_del(&page->lru); | 65 | list_del(&page->lru); |
| 77 | if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { | 66 | if (add_to_page_cache_lru(page, mapping, |
| 67 | page->index, GFP_KERNEL)) { | ||
| 78 | page_cache_release(page); | 68 | page_cache_release(page); |
| 79 | continue; | 69 | continue; |
| 80 | } | 70 | } |
| 71 | page_cache_release(page); | ||
| 72 | |||
| 81 | ret = filler(data, page); | 73 | ret = filler(data, page); |
| 82 | if (!pagevec_add(&lru_pvec, page)) | 74 | if (unlikely(ret)) { |
| 83 | __pagevec_lru_add(&lru_pvec); | ||
| 84 | if (ret) { | ||
| 85 | put_pages_list(pages); | 75 | put_pages_list(pages); |
| 86 | break; | 76 | break; |
| 87 | } | 77 | } |
| 88 | task_io_account_read(PAGE_CACHE_SIZE); | 78 | task_io_account_read(PAGE_CACHE_SIZE); |
| 89 | } | 79 | } |
| 90 | pagevec_lru_add(&lru_pvec); | ||
| 91 | return ret; | 80 | return ret; |
| 92 | } | 81 | } |
| 93 | 82 | ||
| @@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 97 | struct list_head *pages, unsigned nr_pages) | 86 | struct list_head *pages, unsigned nr_pages) |
| 98 | { | 87 | { |
| 99 | unsigned page_idx; | 88 | unsigned page_idx; |
| 100 | struct pagevec lru_pvec; | ||
| 101 | int ret; | 89 | int ret; |
| 102 | 90 | ||
| 103 | if (mapping->a_ops->readpages) { | 91 | if (mapping->a_ops->readpages) { |
| @@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 107 | goto out; | 95 | goto out; |
| 108 | } | 96 | } |
| 109 | 97 | ||
| 110 | pagevec_init(&lru_pvec, 0); | ||
| 111 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 98 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
| 112 | struct page *page = list_to_page(pages); | 99 | struct page *page = list_to_page(pages); |
| 113 | list_del(&page->lru); | 100 | list_del(&page->lru); |
| 114 | if (!add_to_page_cache(page, mapping, | 101 | if (!add_to_page_cache_lru(page, mapping, |
| 115 | page->index, GFP_KERNEL)) { | 102 | page->index, GFP_KERNEL)) { |
| 116 | mapping->a_ops->readpage(filp, page); | 103 | mapping->a_ops->readpage(filp, page); |
| 117 | if (!pagevec_add(&lru_pvec, page)) | 104 | } |
| 118 | __pagevec_lru_add(&lru_pvec); | 105 | page_cache_release(page); |
| 119 | } else | ||
| 120 | page_cache_release(page); | ||
| 121 | } | 106 | } |
| 122 | pagevec_lru_add(&lru_pvec); | ||
| 123 | ret = 0; | 107 | ret = 0; |
| 124 | out: | 108 | out: |
| 125 | return ret; | 109 | return ret; |
| @@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 157 | /* | 141 | /* |
| 158 | * Preallocate as many pages as we will need. | 142 | * Preallocate as many pages as we will need. |
| 159 | */ | 143 | */ |
| 160 | read_lock_irq(&mapping->tree_lock); | ||
| 161 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { | 144 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { |
| 162 | pgoff_t page_offset = offset + page_idx; | 145 | pgoff_t page_offset = offset + page_idx; |
| 163 | 146 | ||
| 164 | if (page_offset > end_index) | 147 | if (page_offset > end_index) |
| 165 | break; | 148 | break; |
| 166 | 149 | ||
| 150 | rcu_read_lock(); | ||
| 167 | page = radix_tree_lookup(&mapping->page_tree, page_offset); | 151 | page = radix_tree_lookup(&mapping->page_tree, page_offset); |
| 152 | rcu_read_unlock(); | ||
| 168 | if (page) | 153 | if (page) |
| 169 | continue; | 154 | continue; |
| 170 | 155 | ||
| 171 | read_unlock_irq(&mapping->tree_lock); | ||
| 172 | page = page_cache_alloc_cold(mapping); | 156 | page = page_cache_alloc_cold(mapping); |
| 173 | read_lock_irq(&mapping->tree_lock); | ||
| 174 | if (!page) | 157 | if (!page) |
| 175 | break; | 158 | break; |
| 176 | page->index = page_offset; | 159 | page->index = page_offset; |
| @@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 179 | SetPageReadahead(page); | 162 | SetPageReadahead(page); |
| 180 | ret++; | 163 | ret++; |
| 181 | } | 164 | } |
| 182 | read_unlock_irq(&mapping->tree_lock); | ||
| 183 | 165 | ||
| 184 | /* | 166 | /* |
| 185 | * Now start the IO. We ignore I/O errors - if the page is not | 167 | * Now start the IO. We ignore I/O errors - if the page is not |
| @@ -327,7 +309,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
| 327 | * indicator. The flag won't be set on already cached pages, to avoid the | 309 | * indicator. The flag won't be set on already cached pages, to avoid the |
| 328 | * readahead-for-nothing fuss, saving pointless page cache lookups. | 310 | * readahead-for-nothing fuss, saving pointless page cache lookups. |
| 329 | * | 311 | * |
| 330 | * prev_index tracks the last visited page in the _previous_ read request. | 312 | * prev_pos tracks the last visited byte in the _previous_ read request. |
| 331 | * It should be maintained by the caller, and will be used for detecting | 313 | * It should be maintained by the caller, and will be used for detecting |
| 332 | * small random reads. Note that the readahead algorithm checks loosely | 314 | * small random reads. Note that the readahead algorithm checks loosely |
| 333 | * for sequential patterns. Hence interleaved reads might be served as | 315 | * for sequential patterns. Hence interleaved reads might be served as |
| @@ -351,11 +333,9 @@ ondemand_readahead(struct address_space *mapping, | |||
| 351 | bool hit_readahead_marker, pgoff_t offset, | 333 | bool hit_readahead_marker, pgoff_t offset, |
| 352 | unsigned long req_size) | 334 | unsigned long req_size) |
| 353 | { | 335 | { |
| 354 | unsigned long max; /* max readahead pages */ | 336 | int max = ra->ra_pages; /* max readahead pages */ |
| 355 | int sequential; | 337 | pgoff_t prev_offset; |
| 356 | 338 | int sequential; | |
| 357 | max = ra->ra_pages; | ||
| 358 | sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); | ||
| 359 | 339 | ||
| 360 | /* | 340 | /* |
| 361 | * It's the expected callback offset, assume sequential access. | 341 | * It's the expected callback offset, assume sequential access. |
| @@ -369,6 +349,9 @@ ondemand_readahead(struct address_space *mapping, | |||
| 369 | goto readit; | 349 | goto readit; |
| 370 | } | 350 | } |
| 371 | 351 | ||
| 352 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
| 353 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
| 354 | |||
| 372 | /* | 355 | /* |
| 373 | * Standalone, small read. | 356 | * Standalone, small read. |
| 374 | * Read as is, and do not pollute the readahead state. | 357 | * Read as is, and do not pollute the readahead state. |
| @@ -379,6 +362,29 @@ ondemand_readahead(struct address_space *mapping, | |||
| 379 | } | 362 | } |
| 380 | 363 | ||
| 381 | /* | 364 | /* |
| 365 | * Hit a marked page without valid readahead state. | ||
| 366 | * E.g. interleaved reads. | ||
| 367 | * Query the pagecache for async_size, which normally equals the | ||
| 368 | * readahead size. Ramp it up and use it as the new readahead size. | ||
| 369 | */ | ||
| 370 | if (hit_readahead_marker) { | ||
| 371 | pgoff_t start; | ||
| 372 | |||
| 373 | read_lock_irq(&mapping->tree_lock); | ||
| 374 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | ||
| 375 | read_unlock_irq(&mapping->tree_lock); | ||
| 376 | |||
| 377 | if (!start || start - offset > max) | ||
| 378 | return 0; | ||
| 379 | |||
| 380 | ra->start = start; | ||
| 381 | ra->size = start - offset; /* old async_size */ | ||
| 382 | ra->size = get_next_ra_size(ra, max); | ||
| 383 | ra->async_size = ra->size; | ||
| 384 | goto readit; | ||
| 385 | } | ||
| 386 | |||
| 387 | /* | ||
| 382 | * It may be one of | 388 | * It may be one of |
| 383 | * - first read on start of file | 389 | * - first read on start of file |
| 384 | * - sequential cache miss | 390 | * - sequential cache miss |
| @@ -389,16 +395,6 @@ ondemand_readahead(struct address_space *mapping, | |||
| 389 | ra->size = get_init_ra_size(req_size, max); | 395 | ra->size = get_init_ra_size(req_size, max); |
| 390 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 396 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
| 391 | 397 | ||
| 392 | /* | ||
| 393 | * Hit on a marked page without valid readahead state. | ||
| 394 | * E.g. interleaved reads. | ||
| 395 | * Not knowing its readahead pos/size, bet on the minimal possible one. | ||
| 396 | */ | ||
| 397 | if (hit_readahead_marker) { | ||
| 398 | ra->start++; | ||
| 399 | ra->size = get_next_ra_size(ra, max); | ||
| 400 | } | ||
| 401 | |||
| 402 | readit: | 398 | readit: |
| 403 | return ra_submit(ra, mapping, filp); | 399 | return ra_submit(ra, mapping, filp); |
| 404 | } | 400 | } |
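A short worked example of the new byte-granular sequential check (assuming PAGE_CACHE_SIZE is 4096; the numbers are only illustrative):

/* prev_pos records the last byte touched by the previous read */
prev_offset = 11000 >> PAGE_CACHE_SHIFT;	/* == 2, i.e. page 2 */

/* a read starting at page 3:  3 - 2 == 1 <= 1  ->  treated as sequential      */
/* a read starting at page 7:  7 - 2 == 5 >  1  ->  random, unless req_size    */
/*                             exceeds max, in which case it still ramps up    */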
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
| 436 | entry = pte_wrprotect(entry); | 436 | entry = pte_wrprotect(entry); |
| 437 | entry = pte_mkclean(entry); | 437 | entry = pte_mkclean(entry); |
| 438 | set_pte_at(mm, address, pte, entry); | 438 | set_pte_at(mm, address, pte, entry); |
| 439 | lazy_mmu_prot_update(entry); | ||
| 440 | ret = 1; | 439 | ret = 1; |
| 441 | } | 440 | } |
| 442 | 441 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index fcd19d323f9f..8a82342a8595 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -49,7 +49,6 @@ | |||
| 49 | #include <linux/ctype.h> | 49 | #include <linux/ctype.h> |
| 50 | #include <linux/migrate.h> | 50 | #include <linux/migrate.h> |
| 51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
| 52 | #include <linux/backing-dev.h> | ||
| 53 | 52 | ||
| 54 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
| 55 | #include <asm/div64.h> | 54 | #include <asm/div64.h> |
| @@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | |||
| 96 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | 95 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: |
| 97 | * might be reconsidered if it ever diverges from PAGE_SIZE. | 96 | * might be reconsidered if it ever diverges from PAGE_SIZE. |
| 98 | * | 97 | * |
| 99 | * __GFP_MOVABLE is masked out as swap vectors cannot move | 98 | * Mobility flags are masked out as swap vectors cannot move |
| 100 | */ | 99 | */ |
| 101 | return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, | 100 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, |
| 102 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | 101 | PAGE_CACHE_SHIFT-PAGE_SHIFT); |
| 103 | } | 102 | } |
| 104 | 103 | ||
| @@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
| 972 | *nodelist++ = '\0'; | 971 | *nodelist++ = '\0'; |
| 973 | if (nodelist_parse(nodelist, *policy_nodes)) | 972 | if (nodelist_parse(nodelist, *policy_nodes)) |
| 974 | goto out; | 973 | goto out; |
| 975 | if (!nodes_subset(*policy_nodes, node_online_map)) | 974 | if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) |
| 976 | goto out; | 975 | goto out; |
| 977 | } | 976 | } |
| 978 | if (!strcmp(value, "default")) { | 977 | if (!strcmp(value, "default")) { |
| @@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
| 997 | err = 0; | 996 | err = 0; |
| 998 | } else if (!strcmp(value, "interleave")) { | 997 | } else if (!strcmp(value, "interleave")) { |
| 999 | *policy = MPOL_INTERLEAVE; | 998 | *policy = MPOL_INTERLEAVE; |
| 1000 | /* Default to nodes online if no nodelist */ | 999 | /* |
| 1000 | * Default to online nodes with memory if no nodelist | ||
| 1001 | */ | ||
| 1001 | if (!nodelist) | 1002 | if (!nodelist) |
| 1002 | *policy_nodes = node_online_map; | 1003 | *policy_nodes = node_states[N_HIGH_MEMORY]; |
| 1003 | err = 0; | 1004 | err = 0; |
| 1004 | } | 1005 | } |
| 1005 | out: | 1006 | out: |
| @@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p, | |||
| 1025 | return page; | 1026 | return page; |
| 1026 | } | 1027 | } |
| 1027 | 1028 | ||
| 1028 | struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, | 1029 | static struct page *shmem_swapin(struct shmem_inode_info *info, |
| 1029 | unsigned long idx) | 1030 | swp_entry_t entry, unsigned long idx) |
| 1030 | { | 1031 | { |
| 1031 | struct shared_policy *p = &info->policy; | 1032 | struct shared_policy *p = &info->policy; |
| 1032 | int i, num; | 1033 | int i, num; |
| @@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, | |||
| 1061 | return page; | 1062 | return page; |
| 1062 | } | 1063 | } |
| 1063 | #else | 1064 | #else |
| 1064 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 1065 | static inline int shmem_parse_mpol(char *value, int *policy, |
| 1066 | nodemask_t *policy_nodes) | ||
| 1065 | { | 1067 | { |
| 1066 | return 1; | 1068 | return 1; |
| 1067 | } | 1069 | } |
| @@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
| 1109 | * Normally, filepage is NULL on entry, and either found | 1111 | * Normally, filepage is NULL on entry, and either found |
| 1110 | * uptodate immediately, or allocated and zeroed, or read | 1112 | * uptodate immediately, or allocated and zeroed, or read |
| 1111 | * in under swappage, which is then assigned to filepage. | 1113 | * in under swappage, which is then assigned to filepage. |
| 1112 | * But shmem_readpage and shmem_prepare_write pass in a locked | 1114 | * But shmem_readpage and shmem_write_begin pass in a locked |
| 1113 | * filepage, which may be found not uptodate by other callers | 1115 | * filepage, which may be found not uptodate by other callers |
| 1114 | * too, and may need to be copied from the swappage read in. | 1116 | * too, and may need to be copied from the swappage read in. |
| 1115 | */ | 1117 | */ |
| @@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1327 | } | 1329 | } |
| 1328 | 1330 | ||
| 1329 | #ifdef CONFIG_NUMA | 1331 | #ifdef CONFIG_NUMA |
| 1330 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1332 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
| 1331 | { | 1333 | { |
| 1332 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1334 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
| 1333 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1335 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
| 1334 | } | 1336 | } |
| 1335 | 1337 | ||
| 1336 | struct mempolicy * | 1338 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
| 1337 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1339 | unsigned long addr) |
| 1338 | { | 1340 | { |
| 1339 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1341 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
| 1340 | unsigned long idx; | 1342 | unsigned long idx; |
| @@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations; | |||
| 1446 | static const struct inode_operations shmem_symlink_inline_operations; | 1448 | static const struct inode_operations shmem_symlink_inline_operations; |
| 1447 | 1449 | ||
| 1448 | /* | 1450 | /* |
| 1449 | * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; | 1451 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; |
| 1450 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | 1452 | * but providing them allows a tmpfs file to be used for splice, sendfile, and |
| 1451 | * below the loop driver, in the generic fashion that many filesystems support. | 1453 | * below the loop driver, in the generic fashion that many filesystems support. |
| 1452 | */ | 1454 | */ |
| @@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page) | |||
| 1459 | } | 1461 | } |
| 1460 | 1462 | ||
| 1461 | static int | 1463 | static int |
| 1462 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) | 1464 | shmem_write_begin(struct file *file, struct address_space *mapping, |
| 1465 | loff_t pos, unsigned len, unsigned flags, | ||
| 1466 | struct page **pagep, void **fsdata) | ||
| 1463 | { | 1467 | { |
| 1464 | struct inode *inode = page->mapping->host; | 1468 | struct inode *inode = mapping->host; |
| 1465 | return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); | 1469 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
| 1470 | *pagep = NULL; | ||
| 1471 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | ||
| 1472 | } | ||
| 1473 | |||
| 1474 | static int | ||
| 1475 | shmem_write_end(struct file *file, struct address_space *mapping, | ||
| 1476 | loff_t pos, unsigned len, unsigned copied, | ||
| 1477 | struct page *page, void *fsdata) | ||
| 1478 | { | ||
| 1479 | struct inode *inode = mapping->host; | ||
| 1480 | |||
| 1481 | set_page_dirty(page); | ||
| 1482 | page_cache_release(page); | ||
| 1483 | |||
| 1484 | if (pos+copied > inode->i_size) | ||
| 1485 | i_size_write(inode, pos+copied); | ||
| 1486 | |||
| 1487 | return copied; | ||
| 1466 | } | 1488 | } |
| 1467 | 1489 | ||
| 1468 | static ssize_t | 1490 | static ssize_t |
| @@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb, | |||
| 2219 | unsigned long blocks = 0; | 2241 | unsigned long blocks = 0; |
| 2220 | unsigned long inodes = 0; | 2242 | unsigned long inodes = 0; |
| 2221 | int policy = MPOL_DEFAULT; | 2243 | int policy = MPOL_DEFAULT; |
| 2222 | nodemask_t policy_nodes = node_online_map; | 2244 | nodemask_t policy_nodes = node_states[N_HIGH_MEMORY]; |
| 2223 | 2245 | ||
| 2224 | #ifdef CONFIG_TMPFS | 2246 | #ifdef CONFIG_TMPFS |
| 2225 | /* | 2247 | /* |
| @@ -2338,8 +2360,8 @@ static const struct address_space_operations shmem_aops = { | |||
| 2338 | .set_page_dirty = __set_page_dirty_no_writeback, | 2360 | .set_page_dirty = __set_page_dirty_no_writeback, |
| 2339 | #ifdef CONFIG_TMPFS | 2361 | #ifdef CONFIG_TMPFS |
| 2340 | .readpage = shmem_readpage, | 2362 | .readpage = shmem_readpage, |
| 2341 | .prepare_write = shmem_prepare_write, | 2363 | .write_begin = shmem_write_begin, |
| 2342 | .commit_write = simple_commit_write, | 2364 | .write_end = shmem_write_end, |
| 2343 | #endif | 2365 | #endif |
| 2344 | .migratepage = migrate_page, | 2366 | .migratepage = migrate_page, |
| 2345 | }; | 2367 | }; |
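For reference, a sketch of how a generic buffered-write path is expected to drive the new write_begin/write_end pair; this is simplified, error handling is omitted, and the surrounding variables (file, mapping, pos, len) are assumed from the caller rather than shown:

struct page *page;
void *fsdata;
int status;
unsigned copied;

status = mapping->a_ops->write_begin(file, mapping, pos, len,
				     0 /* flags */, &page, &fsdata);
if (status)
	return status;

/* ... copy len bytes of user data into page at (pos & ~PAGE_CACHE_MASK) ... */
copied = len;

status = mapping->a_ops->write_end(file, mapping, pos, len, copied,
				   page, fsdata);
/* for shmem, write_end dirties the page, releases it and updates i_size */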
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -1568,7 +1568,7 @@ void __init kmem_cache_init(void) | |||
| 1568 | /* Replace the static kmem_list3 structures for the boot cpu */ | 1568 | /* Replace the static kmem_list3 structures for the boot cpu */ |
| 1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); | 1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); |
| 1570 | 1570 | ||
| 1571 | for_each_online_node(nid) { | 1571 | for_each_node_state(nid, N_NORMAL_MEMORY) { |
| 1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
| 1573 | &initkmem_list3[SIZE_AC + nid], nid); | 1573 | &initkmem_list3[SIZE_AC + nid], nid); |
| 1574 | 1574 | ||
| @@ -1643,6 +1643,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1643 | #endif | 1643 | #endif |
| 1644 | 1644 | ||
| 1645 | flags |= cachep->gfpflags; | 1645 | flags |= cachep->gfpflags; |
| 1646 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 1647 | flags |= __GFP_RECLAIMABLE; | ||
| 1646 | 1648 | ||
| 1647 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1649 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
| 1648 | if (!page) | 1650 | if (!page) |
| @@ -1944,7 +1946,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) | |||
| 1944 | { | 1946 | { |
| 1945 | int node; | 1947 | int node; |
| 1946 | 1948 | ||
| 1947 | for_each_online_node(node) { | 1949 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 1948 | cachep->nodelists[node] = &initkmem_list3[index + node]; | 1950 | cachep->nodelists[node] = &initkmem_list3[index + node]; |
| 1949 | cachep->nodelists[node]->next_reap = jiffies + | 1951 | cachep->nodelists[node]->next_reap = jiffies + |
| 1950 | REAPTIMEOUT_LIST3 + | 1952 | REAPTIMEOUT_LIST3 + |
| @@ -2075,7 +2077,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
| 2075 | g_cpucache_up = PARTIAL_L3; | 2077 | g_cpucache_up = PARTIAL_L3; |
| 2076 | } else { | 2078 | } else { |
| 2077 | int node; | 2079 | int node; |
| 2078 | for_each_online_node(node) { | 2080 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2079 | cachep->nodelists[node] = | 2081 | cachep->nodelists[node] = |
| 2080 | kmalloc_node(sizeof(struct kmem_list3), | 2082 | kmalloc_node(sizeof(struct kmem_list3), |
| 2081 | GFP_KERNEL, node); | 2083 | GFP_KERNEL, node); |
| @@ -2746,9 +2748,9 @@ static int cache_grow(struct kmem_cache *cachep, | |||
| 2746 | * Be lazy and only check for valid flags here, keeping it out of the | 2748 | * Be lazy and only check for valid flags here, keeping it out of the |
| 2747 | * critical path in kmem_cache_alloc(). | 2749 | * critical path in kmem_cache_alloc(). |
| 2748 | */ | 2750 | */ |
| 2749 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 2751 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
| 2752 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | ||
| 2750 | 2753 | ||
| 2751 | local_flags = (flags & GFP_LEVEL_MASK); | ||
| 2752 | /* Take the l3 list lock to change the colour_next on this node */ | 2754 | /* Take the l3 list lock to change the colour_next on this node */ |
| 2753 | check_irq_off(); | 2755 | check_irq_off(); |
| 2754 | l3 = cachep->nodelists[nodeid]; | 2756 | l3 = cachep->nodelists[nodeid]; |
| @@ -2785,7 +2787,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
| 2785 | 2787 | ||
| 2786 | /* Get slab management. */ | 2788 | /* Get slab management. */ |
| 2787 | slabp = alloc_slabmgmt(cachep, objp, offset, | 2789 | slabp = alloc_slabmgmt(cachep, objp, offset, |
| 2788 | local_flags & ~GFP_THISNODE, nodeid); | 2790 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); |
| 2789 | if (!slabp) | 2791 | if (!slabp) |
| 2790 | goto opps1; | 2792 | goto opps1; |
| 2791 | 2793 | ||
| @@ -3225,7 +3227,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
| 3225 | 3227 | ||
| 3226 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) | 3228 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) |
| 3227 | ->node_zonelists[gfp_zone(flags)]; | 3229 | ->node_zonelists[gfp_zone(flags)]; |
| 3228 | local_flags = (flags & GFP_LEVEL_MASK); | 3230 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
| 3229 | 3231 | ||
| 3230 | retry: | 3232 | retry: |
| 3231 | /* | 3233 | /* |
| @@ -3792,7 +3794,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3792 | struct array_cache *new_shared; | 3794 | struct array_cache *new_shared; |
| 3793 | struct array_cache **new_alien = NULL; | 3795 | struct array_cache **new_alien = NULL; |
| 3794 | 3796 | ||
| 3795 | for_each_online_node(node) { | 3797 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 3796 | 3798 | ||
| 3797 | if (use_alien_caches) { | 3799 | if (use_alien_caches) { |
| 3798 | new_alien = alloc_alien_cache(node, cachep->limit); | 3800 | new_alien = alloc_alien_cache(node, cachep->limit); |
| @@ -4446,7 +4448,8 @@ const struct seq_operations slabstats_op = { | |||
| 4446 | */ | 4448 | */ |
| 4447 | size_t ksize(const void *objp) | 4449 | size_t ksize(const void *objp) |
| 4448 | { | 4450 | { |
| 4449 | if (unlikely(ZERO_OR_NULL_PTR(objp))) | 4451 | BUG_ON(!objp); |
| 4452 | if (unlikely(objp == ZERO_SIZE_PTR)) | ||
| 4450 | return 0; | 4453 | return 0; |
| 4451 | 4454 | ||
| 4452 | return obj_size(virt_to_cache(objp)); | 4455 | return obj_size(virt_to_cache(objp)); |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
| 360 | slobidx_t units; | 360 | slobidx_t units; |
| 361 | unsigned long flags; | 361 | unsigned long flags; |
| 362 | 362 | ||
| 363 | if (ZERO_OR_NULL_PTR(block)) | 363 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
| 364 | return; | 364 | return; |
| 365 | BUG_ON(!size); | 365 | BUG_ON(!size); |
| 366 | 366 | ||
| @@ -466,7 +466,7 @@ void kfree(const void *block) | |||
| 466 | { | 466 | { |
| 467 | struct slob_page *sp; | 467 | struct slob_page *sp; |
| 468 | 468 | ||
| 469 | if (ZERO_OR_NULL_PTR(block)) | 469 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
| 470 | return; | 470 | return; |
| 471 | 471 | ||
| 472 | sp = (struct slob_page *)virt_to_page(block); | 472 | sp = (struct slob_page *)virt_to_page(block); |
| @@ -484,7 +484,8 @@ size_t ksize(const void *block) | |||
| 484 | { | 484 | { |
| 485 | struct slob_page *sp; | 485 | struct slob_page *sp; |
| 486 | 486 | ||
| 487 | if (ZERO_OR_NULL_PTR(block)) | 487 | BUG_ON(!block); |
| 488 | if (unlikely(block == ZERO_SIZE_PTR)) | ||
| 488 | return 0; | 489 | return 0; |
| 489 | 490 | ||
| 490 | sp = (struct slob_page *)virt_to_page(block); | 491 | sp = (struct slob_page *)virt_to_page(block); |
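The ksize()/kfree() changes above (in both slab and slob) share one convention; a small sketch of the resulting semantics, assuming kmalloc(0) returns ZERO_SIZE_PTR as the ZERO_OR_NULL_PTR() checks suggest:

void *p = kmalloc(0, GFP_KERNEL);	/* assumed to yield ZERO_SIZE_PTR, not NULL */
size_t n = ksize(p);			/* 0: ZERO_SIZE_PTR backs no storage        */
kfree(p);				/* ZERO_OR_NULL_PTR() makes this a no-op     */

ksize(NULL);				/* now hits BUG_ON(): passing NULL to ksize()
					   is treated as a caller error */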
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -90,7 +90,7 @@ | |||
| 90 | * One use of this flag is to mark slabs that are | 90 | * One use of this flag is to mark slabs that are |
| 91 | * used for allocations. Then such a slab becomes a cpu | 91 | * used for allocations. Then such a slab becomes a cpu |
| 92 | * slab. The cpu slab may be equipped with an additional | 92 | * slab. The cpu slab may be equipped with an additional |
| 93 | * lockless_freelist that allows lockless access to | 93 | * freelist that allows lockless access to |
| 94 | * free objects in addition to the regular freelist | 94 | * free objects in addition to the regular freelist |
| 95 | * that requires the slab lock. | 95 | * that requires the slab lock. |
| 96 | * | 96 | * |
| @@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
| 140 | /* | 140 | /* |
| 141 | * Issues still to be resolved: | 141 | * Issues still to be resolved: |
| 142 | * | 142 | * |
| 143 | * - The per cpu array is updated for each new slab and and is a remote | ||
| 144 | * cacheline for most nodes. This could become a bouncing cacheline given | ||
| 145 | * enough frequent updates. There are 16 pointers in a cacheline, so at | ||
| 146 | * max 16 cpus could compete for the cacheline which may be okay. | ||
| 147 | * | ||
| 148 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 143 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
| 149 | * | 144 | * |
| 150 | * - Variable sizing of the per node arrays | 145 | * - Variable sizing of the per node arrays |
| @@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
| 205 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) | 200 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) |
| 206 | #endif | 201 | #endif |
| 207 | 202 | ||
| 208 | /* | ||
| 209 | * The page->inuse field is 16 bit thus we have this limitation | ||
| 210 | */ | ||
| 211 | #define MAX_OBJECTS_PER_SLAB 65535 | ||
| 212 | |||
| 213 | /* Internal SLUB flags */ | 203 | /* Internal SLUB flags */ |
| 214 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 204 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
| 215 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ | 205 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ |
| @@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
| 277 | #endif | 267 | #endif |
| 278 | } | 268 | } |
| 279 | 269 | ||
| 270 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
| 271 | { | ||
| 272 | #ifdef CONFIG_SMP | ||
| 273 | return s->cpu_slab[cpu]; | ||
| 274 | #else | ||
| 275 | return &s->cpu_slab; | ||
| 276 | #endif | ||
| 277 | } | ||
| 278 | |||
| 280 | static inline int check_valid_pointer(struct kmem_cache *s, | 279 | static inline int check_valid_pointer(struct kmem_cache *s, |
| 281 | struct page *page, const void *object) | 280 | struct page *page, const void *object) |
| 282 | { | 281 | { |
| @@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
| 729 | slab_err(s, page, "Not a valid slab page"); | 728 | slab_err(s, page, "Not a valid slab page"); |
| 730 | return 0; | 729 | return 0; |
| 731 | } | 730 | } |
| 732 | if (page->offset * sizeof(void *) != s->offset) { | ||
| 733 | slab_err(s, page, "Corrupted offset %lu", | ||
| 734 | (unsigned long)(page->offset * sizeof(void *))); | ||
| 735 | return 0; | ||
| 736 | } | ||
| 737 | if (page->inuse > s->objects) { | 731 | if (page->inuse > s->objects) { |
| 738 | slab_err(s, page, "inuse %u > max %u", | 732 | slab_err(s, page, "inuse %u > max %u", |
| 739 | s->name, page->inuse, s->objects); | 733 | s->name, page->inuse, s->objects); |
| @@ -872,8 +866,6 @@ bad: | |||
| 872 | slab_fix(s, "Marking all objects used"); | 866 | slab_fix(s, "Marking all objects used"); |
| 873 | page->inuse = s->objects; | 867 | page->inuse = s->objects; |
| 874 | page->freelist = NULL; | 868 | page->freelist = NULL; |
| 875 | /* Fix up fields that may be corrupted */ | ||
| 876 | page->offset = s->offset / sizeof(void *); | ||
| 877 | } | 869 | } |
| 878 | return 0; | 870 | return 0; |
| 879 | } | 871 | } |
| @@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1055 | if (s->flags & SLAB_CACHE_DMA) | 1047 | if (s->flags & SLAB_CACHE_DMA) |
| 1056 | flags |= SLUB_DMA; | 1048 | flags |= SLUB_DMA; |
| 1057 | 1049 | ||
| 1050 | if (s->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 1051 | flags |= __GFP_RECLAIMABLE; | ||
| 1052 | |||
| 1058 | if (node == -1) | 1053 | if (node == -1) |
| 1059 | page = alloc_pages(flags, s->order); | 1054 | page = alloc_pages(flags, s->order); |
| 1060 | else | 1055 | else |
| @@ -1088,19 +1083,19 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1088 | void *last; | 1083 | void *last; |
| 1089 | void *p; | 1084 | void *p; |
| 1090 | 1085 | ||
| 1091 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
| 1092 | 1087 | ||
| 1093 | if (flags & __GFP_WAIT) | 1088 | if (flags & __GFP_WAIT) |
| 1094 | local_irq_enable(); | 1089 | local_irq_enable(); |
| 1095 | 1090 | ||
| 1096 | page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); | 1091 | page = allocate_slab(s, |
| 1092 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
| 1097 | if (!page) | 1093 | if (!page) |
| 1098 | goto out; | 1094 | goto out; |
| 1099 | 1095 | ||
| 1100 | n = get_node(s, page_to_nid(page)); | 1096 | n = get_node(s, page_to_nid(page)); |
| 1101 | if (n) | 1097 | if (n) |
| 1102 | atomic_long_inc(&n->nr_slabs); | 1098 | atomic_long_inc(&n->nr_slabs); |
| 1103 | page->offset = s->offset / sizeof(void *); | ||
| 1104 | page->slab = s; | 1099 | page->slab = s; |
| 1105 | page->flags |= 1 << PG_slab; | 1100 | page->flags |= 1 << PG_slab; |
| 1106 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1101 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
| @@ -1123,7 +1118,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1123 | set_freepointer(s, last, NULL); | 1118 | set_freepointer(s, last, NULL); |
| 1124 | 1119 | ||
| 1125 | page->freelist = start; | 1120 | page->freelist = start; |
| 1126 | page->lockless_freelist = NULL; | ||
| 1127 | page->inuse = 0; | 1121 | page->inuse = 0; |
| 1128 | out: | 1122 | out: |
| 1129 | if (flags & __GFP_WAIT) | 1123 | if (flags & __GFP_WAIT) |
| @@ -1149,7 +1143,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1149 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1143 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
| 1150 | - pages); | 1144 | - pages); |
| 1151 | 1145 | ||
| 1152 | page->mapping = NULL; | ||
| 1153 | __free_pages(page, s->order); | 1146 | __free_pages(page, s->order); |
| 1154 | } | 1147 | } |
| 1155 | 1148 | ||
| @@ -1383,33 +1376,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
| 1383 | /* | 1376 | /* |
| 1384 | * Remove the cpu slab | 1377 | * Remove the cpu slab |
| 1385 | */ | 1378 | */ |
| 1386 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) | 1379 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
| 1387 | { | 1380 | { |
| 1381 | struct page *page = c->page; | ||
| 1388 | /* | 1382 | /* |
| 1389 | * Merge cpu freelist into freelist. Typically we get here | 1383 | * Merge cpu freelist into freelist. Typically we get here |
| 1390 | * because both freelists are empty. So this is unlikely | 1384 | * because both freelists are empty. So this is unlikely |
| 1391 | * to occur. | 1385 | * to occur. |
| 1392 | */ | 1386 | */ |
| 1393 | while (unlikely(page->lockless_freelist)) { | 1387 | while (unlikely(c->freelist)) { |
| 1394 | void **object; | 1388 | void **object; |
| 1395 | 1389 | ||
| 1396 | /* Retrieve object from cpu_freelist */ | 1390 | /* Retrieve object from cpu_freelist */ |
| 1397 | object = page->lockless_freelist; | 1391 | object = c->freelist; |
| 1398 | page->lockless_freelist = page->lockless_freelist[page->offset]; | 1392 | c->freelist = c->freelist[c->offset]; |
| 1399 | 1393 | ||
| 1400 | /* And put onto the regular freelist */ | 1394 | /* And put onto the regular freelist */ |
| 1401 | object[page->offset] = page->freelist; | 1395 | object[c->offset] = page->freelist; |
| 1402 | page->freelist = object; | 1396 | page->freelist = object; |
| 1403 | page->inuse--; | 1397 | page->inuse--; |
| 1404 | } | 1398 | } |
| 1405 | s->cpu_slab[cpu] = NULL; | 1399 | c->page = NULL; |
| 1406 | unfreeze_slab(s, page); | 1400 | unfreeze_slab(s, page); |
| 1407 | } | 1401 | } |
| 1408 | 1402 | ||
| 1409 | static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | 1403 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
| 1410 | { | 1404 | { |
| 1411 | slab_lock(page); | 1405 | slab_lock(c->page); |
| 1412 | deactivate_slab(s, page, cpu); | 1406 | deactivate_slab(s, c); |
| 1413 | } | 1407 | } |
| 1414 | 1408 | ||
| 1415 | /* | 1409 | /* |
| @@ -1418,18 +1412,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | |||
| 1418 | */ | 1412 | */ |
| 1419 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1413 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
| 1420 | { | 1414 | { |
| 1421 | struct page *page = s->cpu_slab[cpu]; | 1415 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
| 1422 | 1416 | ||
| 1423 | if (likely(page)) | 1417 | if (likely(c && c->page)) |
| 1424 | flush_slab(s, page, cpu); | 1418 | flush_slab(s, c); |
| 1425 | } | 1419 | } |
| 1426 | 1420 | ||
| 1427 | static void flush_cpu_slab(void *d) | 1421 | static void flush_cpu_slab(void *d) |
| 1428 | { | 1422 | { |
| 1429 | struct kmem_cache *s = d; | 1423 | struct kmem_cache *s = d; |
| 1430 | int cpu = smp_processor_id(); | ||
| 1431 | 1424 | ||
| 1432 | __flush_cpu_slab(s, cpu); | 1425 | __flush_cpu_slab(s, smp_processor_id()); |
| 1433 | } | 1426 | } |
| 1434 | 1427 | ||
| 1435 | static void flush_all(struct kmem_cache *s) | 1428 | static void flush_all(struct kmem_cache *s) |
| @@ -1446,6 +1439,19 @@ static void flush_all(struct kmem_cache *s) | |||
| 1446 | } | 1439 | } |
| 1447 | 1440 | ||
| 1448 | /* | 1441 | /* |
| 1442 | * Check if the objects in a per cpu structure fit numa | ||
| 1443 | * locality expectations. | ||
| 1444 | */ | ||
| 1445 | static inline int node_match(struct kmem_cache_cpu *c, int node) | ||
| 1446 | { | ||
| 1447 | #ifdef CONFIG_NUMA | ||
| 1448 | if (node != -1 && c->node != node) | ||
| 1449 | return 0; | ||
| 1450 | #endif | ||
| 1451 | return 1; | ||
| 1452 | } | ||
| 1453 | |||
| 1454 | /* | ||
| 1449 | * Slow path. The lockless freelist is empty or we need to perform | 1455 | * Slow path. The lockless freelist is empty or we need to perform |
| 1450 | * debugging duties. | 1456 | * debugging duties. |
| 1451 | * | 1457 | * |
| @@ -1463,45 +1469,46 @@ static void flush_all(struct kmem_cache *s) | |||
| 1463 | * we need to allocate a new slab. This is slowest path since we may sleep. | 1469 | * we need to allocate a new slab. This is slowest path since we may sleep. |
| 1464 | */ | 1470 | */ |
| 1465 | static void *__slab_alloc(struct kmem_cache *s, | 1471 | static void *__slab_alloc(struct kmem_cache *s, |
| 1466 | gfp_t gfpflags, int node, void *addr, struct page *page) | 1472 | gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) |
| 1467 | { | 1473 | { |
| 1468 | void **object; | 1474 | void **object; |
| 1469 | int cpu = smp_processor_id(); | 1475 | struct page *new; |
| 1470 | 1476 | ||
| 1471 | if (!page) | 1477 | if (!c->page) |
| 1472 | goto new_slab; | 1478 | goto new_slab; |
| 1473 | 1479 | ||
| 1474 | slab_lock(page); | 1480 | slab_lock(c->page); |
| 1475 | if (unlikely(node != -1 && page_to_nid(page) != node)) | 1481 | if (unlikely(!node_match(c, node))) |
| 1476 | goto another_slab; | 1482 | goto another_slab; |
| 1477 | load_freelist: | 1483 | load_freelist: |
| 1478 | object = page->freelist; | 1484 | object = c->page->freelist; |
| 1479 | if (unlikely(!object)) | 1485 | if (unlikely(!object)) |
| 1480 | goto another_slab; | 1486 | goto another_slab; |
| 1481 | if (unlikely(SlabDebug(page))) | 1487 | if (unlikely(SlabDebug(c->page))) |
| 1482 | goto debug; | 1488 | goto debug; |
| 1483 | 1489 | ||
| 1484 | object = page->freelist; | 1490 | object = c->page->freelist; |
| 1485 | page->lockless_freelist = object[page->offset]; | 1491 | c->freelist = object[c->offset]; |
| 1486 | page->inuse = s->objects; | 1492 | c->page->inuse = s->objects; |
| 1487 | page->freelist = NULL; | 1493 | c->page->freelist = NULL; |
| 1488 | slab_unlock(page); | 1494 | c->node = page_to_nid(c->page); |
| 1495 | slab_unlock(c->page); | ||
| 1489 | return object; | 1496 | return object; |
| 1490 | 1497 | ||
| 1491 | another_slab: | 1498 | another_slab: |
| 1492 | deactivate_slab(s, page, cpu); | 1499 | deactivate_slab(s, c); |
| 1493 | 1500 | ||
| 1494 | new_slab: | 1501 | new_slab: |
| 1495 | page = get_partial(s, gfpflags, node); | 1502 | new = get_partial(s, gfpflags, node); |
| 1496 | if (page) { | 1503 | if (new) { |
| 1497 | s->cpu_slab[cpu] = page; | 1504 | c->page = new; |
| 1498 | goto load_freelist; | 1505 | goto load_freelist; |
| 1499 | } | 1506 | } |
| 1500 | 1507 | ||
| 1501 | page = new_slab(s, gfpflags, node); | 1508 | new = new_slab(s, gfpflags, node); |
| 1502 | if (page) { | 1509 | if (new) { |
| 1503 | cpu = smp_processor_id(); | 1510 | c = get_cpu_slab(s, smp_processor_id()); |
| 1504 | if (s->cpu_slab[cpu]) { | 1511 | if (c->page) { |
| 1505 | /* | 1512 | /* |
| 1506 | * Someone else populated the cpu_slab while we | 1513 | * Someone else populated the cpu_slab while we |
| 1507 | * enabled interrupts, or we have gotten scheduled | 1514 | * enabled interrupts, or we have gotten scheduled |
| @@ -1509,34 +1516,33 @@ new_slab: | |||
| 1509 | * requested node even if __GFP_THISNODE was | 1516 | * requested node even if __GFP_THISNODE was |
| 1510 | * specified. So we need to recheck. | 1517 | * specified. So we need to recheck. |
| 1511 | */ | 1518 | */ |
| 1512 | if (node == -1 || | 1519 | if (node_match(c, node)) { |
| 1513 | page_to_nid(s->cpu_slab[cpu]) == node) { | ||
| 1514 | /* | 1520 | /* |
| 1515 | * Current cpuslab is acceptable and we | 1521 | * Current cpuslab is acceptable and we |
| 1516 | * want the current one since it's cache hot | 1522 | * want the current one since it's cache hot |
| 1517 | */ | 1523 | */ |
| 1518 | discard_slab(s, page); | 1524 | discard_slab(s, new); |
| 1519 | page = s->cpu_slab[cpu]; | 1525 | slab_lock(c->page); |
| 1520 | slab_lock(page); | ||
| 1521 | goto load_freelist; | 1526 | goto load_freelist; |
| 1522 | } | 1527 | } |
| 1523 | /* New slab does not fit our expectations */ | 1528 | /* New slab does not fit our expectations */ |
| 1524 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1529 | flush_slab(s, c); |
| 1525 | } | 1530 | } |
| 1526 | slab_lock(page); | 1531 | slab_lock(new); |
| 1527 | SetSlabFrozen(page); | 1532 | SetSlabFrozen(new); |
| 1528 | s->cpu_slab[cpu] = page; | 1533 | c->page = new; |
| 1529 | goto load_freelist; | 1534 | goto load_freelist; |
| 1530 | } | 1535 | } |
| 1531 | return NULL; | 1536 | return NULL; |
| 1532 | debug: | 1537 | debug: |
| 1533 | object = page->freelist; | 1538 | object = c->page->freelist; |
| 1534 | if (!alloc_debug_processing(s, page, object, addr)) | 1539 | if (!alloc_debug_processing(s, c->page, object, addr)) |
| 1535 | goto another_slab; | 1540 | goto another_slab; |
| 1536 | 1541 | ||
| 1537 | page->inuse++; | 1542 | c->page->inuse++; |
| 1538 | page->freelist = object[page->offset]; | 1543 | c->page->freelist = object[c->offset]; |
| 1539 | slab_unlock(page); | 1544 | c->node = -1; |
| 1545 | slab_unlock(c->page); | ||
| 1540 | return object; | 1546 | return object; |
| 1541 | } | 1547 | } |
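One detail of the debug path above is easy to miss, so here is my reading of it (a descriptive note, not patch text): storing -1 in c->node is what keeps debug-enabled slabs off the lockless free fast path, because slab_free() below only takes that path when c->node >= 0.

    /*
     * Interplay between the two paths (paraphrased, not literal patch text):
     *
     *   __slab_alloc() debug path:   c->node = -1;
     *   slab_free() fast-path test:  page == c->page && c->node >= 0
     *
     * A debug-tracked cpu slab therefore never satisfies the lockless free
     * path and always drops into __slab_free(), which runs its consistency
     * checks under slab_lock().
     */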
| 1542 | 1548 | ||
| @@ -1553,25 +1559,24 @@ debug: | |||
| 1553 | static void __always_inline *slab_alloc(struct kmem_cache *s, | 1559 | static void __always_inline *slab_alloc(struct kmem_cache *s, |
| 1554 | gfp_t gfpflags, int node, void *addr) | 1560 | gfp_t gfpflags, int node, void *addr) |
| 1555 | { | 1561 | { |
| 1556 | struct page *page; | ||
| 1557 | void **object; | 1562 | void **object; |
| 1558 | unsigned long flags; | 1563 | unsigned long flags; |
| 1564 | struct kmem_cache_cpu *c; | ||
| 1559 | 1565 | ||
| 1560 | local_irq_save(flags); | 1566 | local_irq_save(flags); |
| 1561 | page = s->cpu_slab[smp_processor_id()]; | 1567 | c = get_cpu_slab(s, smp_processor_id()); |
| 1562 | if (unlikely(!page || !page->lockless_freelist || | 1568 | if (unlikely(!c->freelist || !node_match(c, node))) |
| 1563 | (node != -1 && page_to_nid(page) != node))) | ||
| 1564 | 1569 | ||
| 1565 | object = __slab_alloc(s, gfpflags, node, addr, page); | 1570 | object = __slab_alloc(s, gfpflags, node, addr, c); |
| 1566 | 1571 | ||
| 1567 | else { | 1572 | else { |
| 1568 | object = page->lockless_freelist; | 1573 | object = c->freelist; |
| 1569 | page->lockless_freelist = object[page->offset]; | 1574 | c->freelist = object[c->offset]; |
| 1570 | } | 1575 | } |
| 1571 | local_irq_restore(flags); | 1576 | local_irq_restore(flags); |
| 1572 | 1577 | ||
| 1573 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1578 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
| 1574 | memset(object, 0, s->objsize); | 1579 | memset(object, 0, c->objsize); |
| 1575 | 1580 | ||
| 1576 | return object; | 1581 | return object; |
| 1577 | } | 1582 | } |
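For readers new to SLUB's lockless fast path, a short sketch of how the freelist is threaded through the free objects themselves (illustrative only; the offset is in words, as set up by init_kmem_cache_cpu() below):

    /*
     * c->freelist points at the first free object; each free object stores
     * the pointer to the next free object at word index c->offset inside
     * its own memory, which is what object[c->offset] reads:
     */
    void **object = c->freelist;        /* head of the per-cpu freelist */
    void *next = object[c->offset];     /* link embedded in the object  */
    c->freelist = next;                 /* pop: object is handed out    */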
| @@ -1599,7 +1604,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
| 1599 | * handling required then we can return immediately. | 1604 | * handling required then we can return immediately. |
| 1600 | */ | 1605 | */ |
| 1601 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1606 | static void __slab_free(struct kmem_cache *s, struct page *page, |
| 1602 | void *x, void *addr) | 1607 | void *x, void *addr, unsigned int offset) |
| 1603 | { | 1608 | { |
| 1604 | void *prior; | 1609 | void *prior; |
| 1605 | void **object = (void *)x; | 1610 | void **object = (void *)x; |
| @@ -1609,7 +1614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
| 1609 | if (unlikely(SlabDebug(page))) | 1614 | if (unlikely(SlabDebug(page))) |
| 1610 | goto debug; | 1615 | goto debug; |
| 1611 | checks_ok: | 1616 | checks_ok: |
| 1612 | prior = object[page->offset] = page->freelist; | 1617 | prior = object[offset] = page->freelist; |
| 1613 | page->freelist = object; | 1618 | page->freelist = object; |
| 1614 | page->inuse--; | 1619 | page->inuse--; |
| 1615 | 1620 | ||
| @@ -1664,15 +1669,16 @@ static void __always_inline slab_free(struct kmem_cache *s, | |||
| 1664 | { | 1669 | { |
| 1665 | void **object = (void *)x; | 1670 | void **object = (void *)x; |
| 1666 | unsigned long flags; | 1671 | unsigned long flags; |
| 1672 | struct kmem_cache_cpu *c; | ||
| 1667 | 1673 | ||
| 1668 | local_irq_save(flags); | 1674 | local_irq_save(flags); |
| 1669 | debug_check_no_locks_freed(object, s->objsize); | 1675 | debug_check_no_locks_freed(object, s->objsize); |
| 1670 | if (likely(page == s->cpu_slab[smp_processor_id()] && | 1676 | c = get_cpu_slab(s, smp_processor_id()); |
| 1671 | !SlabDebug(page))) { | 1677 | if (likely(page == c->page && c->node >= 0)) { |
| 1672 | object[page->offset] = page->lockless_freelist; | 1678 | object[c->offset] = c->freelist; |
| 1673 | page->lockless_freelist = object; | 1679 | c->freelist = object; |
| 1674 | } else | 1680 | } else |
| 1675 | __slab_free(s, page, x, addr); | 1681 | __slab_free(s, page, x, addr, c->offset); |
| 1676 | 1682 | ||
| 1677 | local_irq_restore(flags); | 1683 | local_irq_restore(flags); |
| 1678 | } | 1684 | } |
| @@ -1759,14 +1765,6 @@ static inline int slab_order(int size, int min_objects, | |||
| 1759 | int rem; | 1765 | int rem; |
| 1760 | int min_order = slub_min_order; | 1766 | int min_order = slub_min_order; |
| 1761 | 1767 | ||
| 1762 | /* | ||
| 1763 | * If we would create too many object per slab then reduce | ||
| 1764 | * the slab order even if it goes below slub_min_order. | ||
| 1765 | */ | ||
| 1766 | while (min_order > 0 && | ||
| 1767 | (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) | ||
| 1768 | min_order--; | ||
| 1769 | |||
| 1770 | for (order = max(min_order, | 1768 | for (order = max(min_order, |
| 1771 | fls(min_objects * size - 1) - PAGE_SHIFT); | 1769 | fls(min_objects * size - 1) - PAGE_SHIFT); |
| 1772 | order <= max_order; order++) { | 1770 | order <= max_order; order++) { |
| @@ -1781,9 +1779,6 @@ static inline int slab_order(int size, int min_objects, | |||
| 1781 | if (rem <= slab_size / fract_leftover) | 1779 | if (rem <= slab_size / fract_leftover) |
| 1782 | break; | 1780 | break; |
| 1783 | 1781 | ||
| 1784 | /* If the next size is too high then exit now */ | ||
| 1785 | if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) | ||
| 1786 | break; | ||
| 1787 | } | 1782 | } |
| 1788 | 1783 | ||
| 1789 | return order; | 1784 | return order; |
| @@ -1858,6 +1853,16 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
| 1858 | return ALIGN(align, sizeof(void *)); | 1853 | return ALIGN(align, sizeof(void *)); |
| 1859 | } | 1854 | } |
| 1860 | 1855 | ||
| 1856 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
| 1857 | struct kmem_cache_cpu *c) | ||
| 1858 | { | ||
| 1859 | c->page = NULL; | ||
| 1860 | c->freelist = NULL; | ||
| 1861 | c->node = 0; | ||
| 1862 | c->offset = s->offset / sizeof(void *); | ||
| 1863 | c->objsize = s->objsize; | ||
| 1864 | } | ||
| 1865 | |||
| 1861 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1866 | static void init_kmem_cache_node(struct kmem_cache_node *n) |
| 1862 | { | 1867 | { |
| 1863 | n->nr_partial = 0; | 1868 | n->nr_partial = 0; |
| @@ -1869,6 +1874,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
| 1869 | #endif | 1874 | #endif |
| 1870 | } | 1875 | } |
| 1871 | 1876 | ||
| 1877 | #ifdef CONFIG_SMP | ||
| 1878 | /* | ||
| 1879 | * Per cpu array for per cpu structures. | ||
| 1880 | * | ||
| 1881 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
| 1882 | * close together meaning that it becomes possible that multiple per cpu | ||
| 1883 | * structures are contained in one cacheline. This may be particularly | ||
| 1884 | * beneficial for the kmalloc caches. | ||
| 1885 | * | ||
| 1886 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
| 1887 | * likely able to get per cpu structures for all caches from the array defined | ||
| 1888 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
| 1889 | * | ||
| 1890 | * If the per cpu array is exhausted then fall back to kmalloc | ||
| 1891 | * of individual cachelines. No sharing is possible then. | ||
| 1892 | */ | ||
| 1893 | #define NR_KMEM_CACHE_CPU 100 | ||
| 1894 | |||
| 1895 | static DEFINE_PER_CPU(struct kmem_cache_cpu, | ||
| 1896 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; | ||
| 1897 | |||
| 1898 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
| 1899 | static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; | ||
| 1900 | |||
| 1901 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
| 1902 | int cpu, gfp_t flags) | ||
| 1903 | { | ||
| 1904 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
| 1905 | |||
| 1906 | if (c) | ||
| 1907 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
| 1908 | (void *)c->freelist; | ||
| 1909 | else { | ||
| 1910 | /* Table overflow: So allocate ourselves */ | ||
| 1911 | c = kmalloc_node( | ||
| 1912 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
| 1913 | flags, cpu_to_node(cpu)); | ||
| 1914 | if (!c) | ||
| 1915 | return NULL; | ||
| 1916 | } | ||
| 1917 | |||
| 1918 | init_kmem_cache_cpu(s, c); | ||
| 1919 | return c; | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
| 1923 | { | ||
| 1924 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
| 1925 | c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
| 1926 | kfree(c); | ||
| 1927 | return; | ||
| 1928 | } | ||
| 1929 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
| 1930 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
| 1931 | } | ||
| 1932 | |||
| 1933 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
| 1934 | { | ||
| 1935 | int cpu; | ||
| 1936 | |||
| 1937 | for_each_online_cpu(cpu) { | ||
| 1938 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 1939 | |||
| 1940 | if (c) { | ||
| 1941 | s->cpu_slab[cpu] = NULL; | ||
| 1942 | free_kmem_cache_cpu(c, cpu); | ||
| 1943 | } | ||
| 1944 | } | ||
| 1945 | } | ||
| 1946 | |||
| 1947 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 1948 | { | ||
| 1949 | int cpu; | ||
| 1950 | |||
| 1951 | for_each_online_cpu(cpu) { | ||
| 1952 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 1953 | |||
| 1954 | if (c) | ||
| 1955 | continue; | ||
| 1956 | |||
| 1957 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
| 1958 | if (!c) { | ||
| 1959 | free_kmem_cache_cpus(s); | ||
| 1960 | return 0; | ||
| 1961 | } | ||
| 1962 | s->cpu_slab[cpu] = c; | ||
| 1963 | } | ||
| 1964 | return 1; | ||
| 1965 | } | ||
| 1966 | |||
| 1967 | /* | ||
| 1968 | * Initialize the per cpu array. | ||
| 1969 | */ | ||
| 1970 | static void init_alloc_cpu_cpu(int cpu) | ||
| 1971 | { | ||
| 1972 | int i; | ||
| 1973 | |||
| 1974 | if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) | ||
| 1975 | return; | ||
| 1976 | |||
| 1977 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
| 1978 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
| 1979 | |||
| 1980 | cpu_set(cpu, kmem_cach_cpu_free_init_once); | ||
| 1981 | } | ||
| 1982 | |||
| 1983 | static void __init init_alloc_cpu(void) | ||
| 1984 | { | ||
| 1985 | int cpu; | ||
| 1986 | |||
| 1987 | for_each_online_cpu(cpu) | ||
| 1988 | init_alloc_cpu_cpu(cpu); | ||
| 1989 | } | ||
| 1990 | |||
| 1991 | #else | ||
| 1992 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | ||
| 1993 | static inline void init_alloc_cpu(void) {} | ||
| 1994 | |||
| 1995 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 1996 | { | ||
| 1997 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
| 1998 | return 1; | ||
| 1999 | } | ||
| 2000 | #endif | ||
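The hunks above use get_cpu_slab() without showing it; it is defined elsewhere in this patch. My presumed shape of the accessor, for orientation only (the UP case matches the embedded s->cpu_slab used in the #else branch above):

    static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
    {
    #ifdef CONFIG_SMP
            return s->cpu_slab[cpu];    /* per-cpu array of pointers  */
    #else
            return &s->cpu_slab;        /* single embedded structure  */
    #endif
    }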
| 2001 | |||
| 1872 | #ifdef CONFIG_NUMA | 2002 | #ifdef CONFIG_NUMA |
| 1873 | /* | 2003 | /* |
| 1874 | * No kmalloc_node yet so do it by hand. We know that this is the first | 2004 | * No kmalloc_node yet so do it by hand. We know that this is the first |
| @@ -1876,10 +2006,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
| 1876 | * possible. | 2006 | * possible. |
| 1877 | * | 2007 | * |
| 1878 | * Note that this function only works on the kmalloc_node_cache | 2008 | * Note that this function only works on the kmalloc_node_cache |
| 1879 | * when allocating for the kmalloc_node_cache. | 2009 | * when allocating for the kmalloc_node_cache. This is used for bootstrapping |
| 2010 | * memory on a fresh node that has no slab structures yet. | ||
| 1880 | */ | 2011 | */ |
| 1881 | static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, | 2012 | static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, |
| 1882 | int node) | 2013 | int node) |
| 1883 | { | 2014 | { |
| 1884 | struct page *page; | 2015 | struct page *page; |
| 1885 | struct kmem_cache_node *n; | 2016 | struct kmem_cache_node *n; |
| @@ -1921,7 +2052,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
| 1921 | { | 2052 | { |
| 1922 | int node; | 2053 | int node; |
| 1923 | 2054 | ||
| 1924 | for_each_online_node(node) { | 2055 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 1925 | struct kmem_cache_node *n = s->node[node]; | 2056 | struct kmem_cache_node *n = s->node[node]; |
| 1926 | if (n && n != &s->local_node) | 2057 | if (n && n != &s->local_node) |
| 1927 | kmem_cache_free(kmalloc_caches, n); | 2058 | kmem_cache_free(kmalloc_caches, n); |
| @@ -1939,7 +2070,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
| 1939 | else | 2070 | else |
| 1940 | local_node = 0; | 2071 | local_node = 0; |
| 1941 | 2072 | ||
| 1942 | for_each_online_node(node) { | 2073 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 1943 | struct kmem_cache_node *n; | 2074 | struct kmem_cache_node *n; |
| 1944 | 2075 | ||
| 1945 | if (local_node == node) | 2076 | if (local_node == node) |
| @@ -2077,14 +2208,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
| 2077 | */ | 2208 | */ |
| 2078 | s->objects = (PAGE_SIZE << s->order) / size; | 2209 | s->objects = (PAGE_SIZE << s->order) / size; |
| 2079 | 2210 | ||
| 2080 | /* | 2211 | return !!s->objects; |
| 2081 | * Verify that the number of objects is within permitted limits. | ||
| 2082 | * The page->inuse field is only 16 bit wide! So we cannot have | ||
| 2083 | * more than 64k objects per slab. | ||
| 2084 | */ | ||
| 2085 | if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) | ||
| 2086 | return 0; | ||
| 2087 | return 1; | ||
| 2088 | 2212 | ||
| 2089 | } | 2213 | } |
| 2090 | 2214 | ||
| @@ -2107,9 +2231,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
| 2107 | #ifdef CONFIG_NUMA | 2231 | #ifdef CONFIG_NUMA |
| 2108 | s->defrag_ratio = 100; | 2232 | s->defrag_ratio = 100; |
| 2109 | #endif | 2233 | #endif |
| 2234 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | ||
| 2235 | goto error; | ||
| 2110 | 2236 | ||
| 2111 | if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2237 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
| 2112 | return 1; | 2238 | return 1; |
| 2239 | free_kmem_cache_nodes(s); | ||
| 2113 | error: | 2240 | error: |
| 2114 | if (flags & SLAB_PANIC) | 2241 | if (flags & SLAB_PANIC) |
| 2115 | panic("Cannot create slab %s size=%lu realsize=%u " | 2242 | panic("Cannot create slab %s size=%lu realsize=%u " |
| @@ -2192,7 +2319,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
| 2192 | flush_all(s); | 2319 | flush_all(s); |
| 2193 | 2320 | ||
| 2194 | /* Attempt to free all objects */ | 2321 | /* Attempt to free all objects */ |
| 2195 | for_each_online_node(node) { | 2322 | free_kmem_cache_cpus(s); |
| 2323 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
| 2196 | struct kmem_cache_node *n = get_node(s, node); | 2324 | struct kmem_cache_node *n = get_node(s, node); |
| 2197 | 2325 | ||
| 2198 | n->nr_partial -= free_list(s, n, &n->partial); | 2326 | n->nr_partial -= free_list(s, n, &n->partial); |
| @@ -2227,11 +2355,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
| 2227 | * Kmalloc subsystem | 2355 | * Kmalloc subsystem |
| 2228 | *******************************************************************/ | 2356 | *******************************************************************/ |
| 2229 | 2357 | ||
| 2230 | struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; | 2358 | struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; |
| 2231 | EXPORT_SYMBOL(kmalloc_caches); | 2359 | EXPORT_SYMBOL(kmalloc_caches); |
| 2232 | 2360 | ||
| 2233 | #ifdef CONFIG_ZONE_DMA | 2361 | #ifdef CONFIG_ZONE_DMA |
| 2234 | static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; | 2362 | static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; |
| 2235 | #endif | 2363 | #endif |
| 2236 | 2364 | ||
| 2237 | static int __init setup_slub_min_order(char *str) | 2365 | static int __init setup_slub_min_order(char *str) |
| @@ -2397,12 +2525,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
| 2397 | return ZERO_SIZE_PTR; | 2525 | return ZERO_SIZE_PTR; |
| 2398 | 2526 | ||
| 2399 | index = size_index[(size - 1) / 8]; | 2527 | index = size_index[(size - 1) / 8]; |
| 2400 | } else { | 2528 | } else |
| 2401 | if (size > KMALLOC_MAX_SIZE) | ||
| 2402 | return NULL; | ||
| 2403 | |||
| 2404 | index = fls(size - 1); | 2529 | index = fls(size - 1); |
| 2405 | } | ||
| 2406 | 2530 | ||
| 2407 | #ifdef CONFIG_ZONE_DMA | 2531 | #ifdef CONFIG_ZONE_DMA |
| 2408 | if (unlikely((flags & SLUB_DMA))) | 2532 | if (unlikely((flags & SLUB_DMA))) |
| @@ -2414,9 +2538,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
| 2414 | 2538 | ||
| 2415 | void *__kmalloc(size_t size, gfp_t flags) | 2539 | void *__kmalloc(size_t size, gfp_t flags) |
| 2416 | { | 2540 | { |
| 2417 | struct kmem_cache *s = get_slab(size, flags); | 2541 | struct kmem_cache *s; |
| 2542 | |||
| 2543 | if (unlikely(size > PAGE_SIZE / 2)) | ||
| 2544 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
| 2545 | get_order(size)); | ||
| 2418 | 2546 | ||
| 2419 | if (ZERO_OR_NULL_PTR(s)) | 2547 | s = get_slab(size, flags); |
| 2548 | |||
| 2549 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
| 2420 | return s; | 2550 | return s; |
| 2421 | 2551 | ||
| 2422 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); | 2552 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); |
| @@ -2426,9 +2556,15 @@ EXPORT_SYMBOL(__kmalloc); | |||
| 2426 | #ifdef CONFIG_NUMA | 2556 | #ifdef CONFIG_NUMA |
| 2427 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 2557 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
| 2428 | { | 2558 | { |
| 2429 | struct kmem_cache *s = get_slab(size, flags); | 2559 | struct kmem_cache *s; |
| 2430 | 2560 | ||
| 2431 | if (ZERO_OR_NULL_PTR(s)) | 2561 | if (unlikely(size > PAGE_SIZE / 2)) |
| 2562 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
| 2563 | get_order(size)); | ||
| 2564 | |||
| 2565 | s = get_slab(size, flags); | ||
| 2566 | |||
| 2567 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
| 2432 | return s; | 2568 | return s; |
| 2433 | 2569 | ||
| 2434 | return slab_alloc(s, flags, node, __builtin_return_address(0)); | 2570 | return slab_alloc(s, flags, node, __builtin_return_address(0)); |
| @@ -2441,7 +2577,8 @@ size_t ksize(const void *object) | |||
| 2441 | struct page *page; | 2577 | struct page *page; |
| 2442 | struct kmem_cache *s; | 2578 | struct kmem_cache *s; |
| 2443 | 2579 | ||
| 2444 | if (ZERO_OR_NULL_PTR(object)) | 2580 | BUG_ON(!object); |
| 2581 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
| 2445 | return 0; | 2582 | return 0; |
| 2446 | 2583 | ||
| 2447 | page = get_object_page(object); | 2584 | page = get_object_page(object); |
| @@ -2473,22 +2610,17 @@ EXPORT_SYMBOL(ksize); | |||
| 2473 | 2610 | ||
| 2474 | void kfree(const void *x) | 2611 | void kfree(const void *x) |
| 2475 | { | 2612 | { |
| 2476 | struct kmem_cache *s; | ||
| 2477 | struct page *page; | 2613 | struct page *page; |
| 2478 | 2614 | ||
| 2479 | /* | 2615 | if (unlikely(ZERO_OR_NULL_PTR(x))) |
| 2480 | * This has to be an unsigned comparison. According to Linus | ||
| 2481 | * some gcc version treat a pointer as a signed entity. Then | ||
| 2482 | * this comparison would be true for all "negative" pointers | ||
| 2483 | * (which would cover the whole upper half of the address space). | ||
| 2484 | */ | ||
| 2485 | if (ZERO_OR_NULL_PTR(x)) | ||
| 2486 | return; | 2616 | return; |
| 2487 | 2617 | ||
| 2488 | page = virt_to_head_page(x); | 2618 | page = virt_to_head_page(x); |
| 2489 | s = page->slab; | 2619 | if (unlikely(!PageSlab(page))) { |
| 2490 | 2620 | put_page(page); | |
| 2491 | slab_free(s, page, (void *)x, __builtin_return_address(0)); | 2621 | return; |
| 2622 | } | ||
| 2623 | slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); | ||
| 2492 | } | 2624 | } |
| 2493 | EXPORT_SYMBOL(kfree); | 2625 | EXPORT_SYMBOL(kfree); |
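A small usage illustration of my own (assuming PAGE_SIZE == 4096) of why kfree() now checks PageSlab: allocations above PAGE_SIZE/2 bypass the slab caches entirely and come straight from the page allocator as compound pages, so the free path must tell the two cases apart:

    void *small = kmalloc(1000, GFP_KERNEL); /* served by a kmalloc slab cache          */
    void *large = kmalloc(8192, GFP_KERNEL); /* > PAGE_SIZE/2: __get_free_pages(),
                                                __GFP_COMP, order 1                     */

    kfree(small);   /* PageSlab(page) set: slab_free()                   */
    kfree(large);   /* not a slab page: put_page() on the compound head  */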
| 2494 | 2626 | ||
| @@ -2517,7 +2649,7 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
| 2517 | return -ENOMEM; | 2649 | return -ENOMEM; |
| 2518 | 2650 | ||
| 2519 | flush_all(s); | 2651 | flush_all(s); |
| 2520 | for_each_online_node(node) { | 2652 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2521 | n = get_node(s, node); | 2653 | n = get_node(s, node); |
| 2522 | 2654 | ||
| 2523 | if (!n->nr_partial) | 2655 | if (!n->nr_partial) |
| @@ -2575,6 +2707,8 @@ void __init kmem_cache_init(void) | |||
| 2575 | int i; | 2707 | int i; |
| 2576 | int caches = 0; | 2708 | int caches = 0; |
| 2577 | 2709 | ||
| 2710 | init_alloc_cpu(); | ||
| 2711 | |||
| 2578 | #ifdef CONFIG_NUMA | 2712 | #ifdef CONFIG_NUMA |
| 2579 | /* | 2713 | /* |
| 2580 | * Must first have the slab cache available for the allocations of the | 2714 | * Must first have the slab cache available for the allocations of the |
| @@ -2602,7 +2736,7 @@ void __init kmem_cache_init(void) | |||
| 2602 | caches++; | 2736 | caches++; |
| 2603 | } | 2737 | } |
| 2604 | 2738 | ||
| 2605 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | 2739 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { |
| 2606 | create_kmalloc_cache(&kmalloc_caches[i], | 2740 | create_kmalloc_cache(&kmalloc_caches[i], |
| 2607 | "kmalloc", 1 << i, GFP_KERNEL); | 2741 | "kmalloc", 1 << i, GFP_KERNEL); |
| 2608 | caches++; | 2742 | caches++; |
| @@ -2629,16 +2763,18 @@ void __init kmem_cache_init(void) | |||
| 2629 | slab_state = UP; | 2763 | slab_state = UP; |
| 2630 | 2764 | ||
| 2631 | /* Provide the correct kmalloc names now that the caches are up */ | 2765 | /* Provide the correct kmalloc names now that the caches are up */ |
| 2632 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) | 2766 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) |
| 2633 | kmalloc_caches[i]. name = | 2767 | kmalloc_caches[i]. name = |
| 2634 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 2768 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); |
| 2635 | 2769 | ||
| 2636 | #ifdef CONFIG_SMP | 2770 | #ifdef CONFIG_SMP |
| 2637 | register_cpu_notifier(&slab_notifier); | 2771 | register_cpu_notifier(&slab_notifier); |
| 2772 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
| 2773 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | ||
| 2774 | #else | ||
| 2775 | kmem_size = sizeof(struct kmem_cache); | ||
| 2638 | #endif | 2776 | #endif |
| 2639 | 2777 | ||
| 2640 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
| 2641 | nr_cpu_ids * sizeof(struct page *); | ||
| 2642 | 2778 | ||
| 2643 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2779 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
| 2644 | " CPUs=%d, Nodes=%d\n", | 2780 | " CPUs=%d, Nodes=%d\n", |
| @@ -2717,12 +2853,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 2717 | down_write(&slub_lock); | 2853 | down_write(&slub_lock); |
| 2718 | s = find_mergeable(size, align, flags, name, ctor); | 2854 | s = find_mergeable(size, align, flags, name, ctor); |
| 2719 | if (s) { | 2855 | if (s) { |
| 2856 | int cpu; | ||
| 2857 | |||
| 2720 | s->refcount++; | 2858 | s->refcount++; |
| 2721 | /* | 2859 | /* |
| 2722 | * Adjust the object sizes so that we clear | 2860 | * Adjust the object sizes so that we clear |
| 2723 | * the complete object on kzalloc. | 2861 | * the complete object on kzalloc. |
| 2724 | */ | 2862 | */ |
| 2725 | s->objsize = max(s->objsize, (int)size); | 2863 | s->objsize = max(s->objsize, (int)size); |
| 2864 | |||
| 2865 | /* | ||
| 2866 | * And then we need to update the object size in the | ||
| 2867 | * per cpu structures | ||
| 2868 | */ | ||
| 2869 | for_each_online_cpu(cpu) | ||
| 2870 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
| 2726 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 2871 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
| 2727 | up_write(&slub_lock); | 2872 | up_write(&slub_lock); |
| 2728 | if (sysfs_slab_alias(s, name)) | 2873 | if (sysfs_slab_alias(s, name)) |
| @@ -2765,15 +2910,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
| 2765 | unsigned long flags; | 2910 | unsigned long flags; |
| 2766 | 2911 | ||
| 2767 | switch (action) { | 2912 | switch (action) { |
| 2913 | case CPU_UP_PREPARE: | ||
| 2914 | case CPU_UP_PREPARE_FROZEN: | ||
| 2915 | init_alloc_cpu_cpu(cpu); | ||
| 2916 | down_read(&slub_lock); | ||
| 2917 | list_for_each_entry(s, &slab_caches, list) | ||
| 2918 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
| 2919 | GFP_KERNEL); | ||
| 2920 | up_read(&slub_lock); | ||
| 2921 | break; | ||
| 2922 | |||
| 2768 | case CPU_UP_CANCELED: | 2923 | case CPU_UP_CANCELED: |
| 2769 | case CPU_UP_CANCELED_FROZEN: | 2924 | case CPU_UP_CANCELED_FROZEN: |
| 2770 | case CPU_DEAD: | 2925 | case CPU_DEAD: |
| 2771 | case CPU_DEAD_FROZEN: | 2926 | case CPU_DEAD_FROZEN: |
| 2772 | down_read(&slub_lock); | 2927 | down_read(&slub_lock); |
| 2773 | list_for_each_entry(s, &slab_caches, list) { | 2928 | list_for_each_entry(s, &slab_caches, list) { |
| 2929 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 2930 | |||
| 2774 | local_irq_save(flags); | 2931 | local_irq_save(flags); |
| 2775 | __flush_cpu_slab(s, cpu); | 2932 | __flush_cpu_slab(s, cpu); |
| 2776 | local_irq_restore(flags); | 2933 | local_irq_restore(flags); |
| 2934 | free_kmem_cache_cpu(c, cpu); | ||
| 2935 | s->cpu_slab[cpu] = NULL; | ||
| 2777 | } | 2936 | } |
| 2778 | up_read(&slub_lock); | 2937 | up_read(&slub_lock); |
| 2779 | break; | 2938 | break; |
| @@ -2790,9 +2949,14 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
| 2790 | 2949 | ||
| 2791 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2950 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
| 2792 | { | 2951 | { |
| 2793 | struct kmem_cache *s = get_slab(size, gfpflags); | 2952 | struct kmem_cache *s; |
| 2953 | |||
| 2954 | if (unlikely(size > PAGE_SIZE / 2)) | ||
| 2955 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
| 2956 | get_order(size)); | ||
| 2957 | s = get_slab(size, gfpflags); | ||
| 2794 | 2958 | ||
| 2795 | if (ZERO_OR_NULL_PTR(s)) | 2959 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 2796 | return s; | 2960 | return s; |
| 2797 | 2961 | ||
| 2798 | return slab_alloc(s, gfpflags, -1, caller); | 2962 | return slab_alloc(s, gfpflags, -1, caller); |
| @@ -2801,9 +2965,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | |||
| 2801 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | 2965 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, |
| 2802 | int node, void *caller) | 2966 | int node, void *caller) |
| 2803 | { | 2967 | { |
| 2804 | struct kmem_cache *s = get_slab(size, gfpflags); | 2968 | struct kmem_cache *s; |
| 2969 | |||
| 2970 | if (unlikely(size > PAGE_SIZE / 2)) | ||
| 2971 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
| 2972 | get_order(size)); | ||
| 2973 | s = get_slab(size, gfpflags); | ||
| 2805 | 2974 | ||
| 2806 | if (ZERO_OR_NULL_PTR(s)) | 2975 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 2807 | return s; | 2976 | return s; |
| 2808 | 2977 | ||
| 2809 | return slab_alloc(s, gfpflags, node, caller); | 2978 | return slab_alloc(s, gfpflags, node, caller); |
| @@ -2902,7 +3071,7 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
| 2902 | return -ENOMEM; | 3071 | return -ENOMEM; |
| 2903 | 3072 | ||
| 2904 | flush_all(s); | 3073 | flush_all(s); |
| 2905 | for_each_online_node(node) { | 3074 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2906 | struct kmem_cache_node *n = get_node(s, node); | 3075 | struct kmem_cache_node *n = get_node(s, node); |
| 2907 | 3076 | ||
| 2908 | count += validate_slab_node(s, n, map); | 3077 | count += validate_slab_node(s, n, map); |
| @@ -3116,13 +3285,13 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 3116 | int node; | 3285 | int node; |
| 3117 | 3286 | ||
| 3118 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 3287 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
| 3119 | GFP_KERNEL)) | 3288 | GFP_TEMPORARY)) |
| 3120 | return sprintf(buf, "Out of memory\n"); | 3289 | return sprintf(buf, "Out of memory\n"); |
| 3121 | 3290 | ||
| 3122 | /* Push back cpu slabs */ | 3291 | /* Push back cpu slabs */ |
| 3123 | flush_all(s); | 3292 | flush_all(s); |
| 3124 | 3293 | ||
| 3125 | for_each_online_node(node) { | 3294 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 3126 | struct kmem_cache_node *n = get_node(s, node); | 3295 | struct kmem_cache_node *n = get_node(s, node); |
| 3127 | unsigned long flags; | 3296 | unsigned long flags; |
| 3128 | struct page *page; | 3297 | struct page *page; |
| @@ -3230,11 +3399,18 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
| 3230 | per_cpu = nodes + nr_node_ids; | 3399 | per_cpu = nodes + nr_node_ids; |
| 3231 | 3400 | ||
| 3232 | for_each_possible_cpu(cpu) { | 3401 | for_each_possible_cpu(cpu) { |
| 3233 | struct page *page = s->cpu_slab[cpu]; | 3402 | struct page *page; |
| 3234 | int node; | 3403 | int node; |
| 3404 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 3235 | 3405 | ||
| 3406 | if (!c) | ||
| 3407 | continue; | ||
| 3408 | |||
| 3409 | page = c->page; | ||
| 3410 | node = c->node; | ||
| 3411 | if (node < 0) | ||
| 3412 | continue; | ||
| 3236 | if (page) { | 3413 | if (page) { |
| 3237 | node = page_to_nid(page); | ||
| 3238 | if (flags & SO_CPU) { | 3414 | if (flags & SO_CPU) { |
| 3239 | int x = 0; | 3415 | int x = 0; |
| 3240 | 3416 | ||
| @@ -3249,7 +3425,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
| 3249 | } | 3425 | } |
| 3250 | } | 3426 | } |
| 3251 | 3427 | ||
| 3252 | for_each_online_node(node) { | 3428 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 3253 | struct kmem_cache_node *n = get_node(s, node); | 3429 | struct kmem_cache_node *n = get_node(s, node); |
| 3254 | 3430 | ||
| 3255 | if (flags & SO_PARTIAL) { | 3431 | if (flags & SO_PARTIAL) { |
| @@ -3277,7 +3453,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
| 3277 | 3453 | ||
| 3278 | x = sprintf(buf, "%lu", total); | 3454 | x = sprintf(buf, "%lu", total); |
| 3279 | #ifdef CONFIG_NUMA | 3455 | #ifdef CONFIG_NUMA |
| 3280 | for_each_online_node(node) | 3456 | for_each_node_state(node, N_NORMAL_MEMORY) |
| 3281 | if (nodes[node]) | 3457 | if (nodes[node]) |
| 3282 | x += sprintf(buf + x, " N%d=%lu", | 3458 | x += sprintf(buf + x, " N%d=%lu", |
| 3283 | node, nodes[node]); | 3459 | node, nodes[node]); |
| @@ -3291,13 +3467,19 @@ static int any_slab_objects(struct kmem_cache *s) | |||
| 3291 | int node; | 3467 | int node; |
| 3292 | int cpu; | 3468 | int cpu; |
| 3293 | 3469 | ||
| 3294 | for_each_possible_cpu(cpu) | 3470 | for_each_possible_cpu(cpu) { |
| 3295 | if (s->cpu_slab[cpu]) | 3471 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
| 3472 | |||
| 3473 | if (c && c->page) | ||
| 3296 | return 1; | 3474 | return 1; |
| 3475 | } | ||
| 3297 | 3476 | ||
| 3298 | for_each_node(node) { | 3477 | for_each_online_node(node) { |
| 3299 | struct kmem_cache_node *n = get_node(s, node); | 3478 | struct kmem_cache_node *n = get_node(s, node); |
| 3300 | 3479 | ||
| 3480 | if (!n) | ||
| 3481 | continue; | ||
| 3482 | |||
| 3301 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) | 3483 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) |
| 3302 | return 1; | 3484 | return 1; |
| 3303 | } | 3485 | } |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 000000000000..d3b718b0c20a --- /dev/null +++ b/mm/sparse-vmemmap.c | |||
| @@ -0,0 +1,148 @@ | |||
| 1 | /* | ||
| 2 | * Virtual Memory Map support | ||
| 3 | * | ||
| 4 | * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. | ||
| 5 | * | ||
| 6 | * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, | ||
| 7 | * virt_to_page, page_address() to be implemented as a base offset | ||
| 8 | * calculation without memory access. | ||
| 9 | * | ||
| 10 | * However, virtual mappings need a page table and TLBs. Many Linux | ||
| 11 | * architectures already map their physical space using 1-1 mappings | ||
| 12 | * via TLBs. For those arches the virtual memory map is essentially | ||
| 13 | * for free if we use the same page size as the 1-1 mappings. In that | ||
| 14 | * case the overhead consists of a few additional pages that are | ||
| 15 | * allocated to create a view of memory for vmemmap. | ||
| 16 | * | ||
| 17 | * The architecture is expected to provide a vmemmap_populate() function | ||
| 18 | * to instantiate the mapping. | ||
| 19 | */ | ||
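As a concrete (hypothetical, arch-dependent) picture of the "base offset calculation" mentioned above: with a virtually contiguous mem_map based at some VMEMMAP_START, the pfn/page conversions reduce to pointer arithmetic, roughly:

    /* Sketch only; the real macros live in the architecture headers. */
    #define vmemmap                 ((struct page *)VMEMMAP_START)
    #define __pfn_to_page(pfn)      (vmemmap + (pfn))
    #define __page_to_pfn(page)     ((unsigned long)((page) - vmemmap))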
| 20 | #include <linux/mm.h> | ||
| 21 | #include <linux/mmzone.h> | ||
| 22 | #include <linux/bootmem.h> | ||
| 23 | #include <linux/highmem.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/spinlock.h> | ||
| 26 | #include <linux/vmalloc.h> | ||
| 27 | #include <asm/dma.h> | ||
| 28 | #include <asm/pgalloc.h> | ||
| 29 | #include <asm/pgtable.h> | ||
| 30 | |||
| 31 | /* | ||
| 32 | * Allocate a block of memory to be used to back the virtual memory map | ||
| 33 | * or to back the page tables that are used to create the mapping. | ||
| 34 | * Uses the main allocators if they are available, else bootmem. | ||
| 35 | */ | ||
| 36 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | ||
| 37 | { | ||
| 38 | /* If the main allocator is up use that, fallback to bootmem. */ | ||
| 39 | if (slab_is_available()) { | ||
| 40 | struct page *page = alloc_pages_node(node, | ||
| 41 | GFP_KERNEL | __GFP_ZERO, get_order(size)); | ||
| 42 | if (page) | ||
| 43 | return page_address(page); | ||
| 44 | return NULL; | ||
| 45 | } else | ||
| 46 | return __alloc_bootmem_node(NODE_DATA(node), size, size, | ||
| 47 | __pa(MAX_DMA_ADDRESS)); | ||
| 48 | } | ||
| 49 | |||
| 50 | void __meminit vmemmap_verify(pte_t *pte, int node, | ||
| 51 | unsigned long start, unsigned long end) | ||
| 52 | { | ||
| 53 | unsigned long pfn = pte_pfn(*pte); | ||
| 54 | int actual_node = early_pfn_to_nid(pfn); | ||
| 55 | |||
| 56 | if (actual_node != node) | ||
| 57 | printk(KERN_WARNING "[%lx-%lx] potential offnode " | ||
| 58 | "page_structs\n", start, end - 1); | ||
| 59 | } | ||
| 60 | |||
| 61 | pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | ||
| 62 | { | ||
| 63 | pte_t *pte = pte_offset_kernel(pmd, addr); | ||
| 64 | if (pte_none(*pte)) { | ||
| 65 | pte_t entry; | ||
| 66 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 67 | if (!p) | ||
| 68 | return NULL; | ||
| 69 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | ||
| 70 | set_pte_at(&init_mm, addr, pte, entry); | ||
| 71 | } | ||
| 72 | return pte; | ||
| 73 | } | ||
| 74 | |||
| 75 | pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) | ||
| 76 | { | ||
| 77 | pmd_t *pmd = pmd_offset(pud, addr); | ||
| 78 | if (pmd_none(*pmd)) { | ||
| 79 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 80 | if (!p) | ||
| 81 | return NULL; | ||
| 82 | pmd_populate_kernel(&init_mm, pmd, p); | ||
| 83 | } | ||
| 84 | return pmd; | ||
| 85 | } | ||
| 86 | |||
| 87 | pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) | ||
| 88 | { | ||
| 89 | pud_t *pud = pud_offset(pgd, addr); | ||
| 90 | if (pud_none(*pud)) { | ||
| 91 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 92 | if (!p) | ||
| 93 | return NULL; | ||
| 94 | pud_populate(&init_mm, pud, p); | ||
| 95 | } | ||
| 96 | return pud; | ||
| 97 | } | ||
| 98 | |||
| 99 | pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) | ||
| 100 | { | ||
| 101 | pgd_t *pgd = pgd_offset_k(addr); | ||
| 102 | if (pgd_none(*pgd)) { | ||
| 103 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 104 | if (!p) | ||
| 105 | return NULL; | ||
| 106 | pgd_populate(&init_mm, pgd, p); | ||
| 107 | } | ||
| 108 | return pgd; | ||
| 109 | } | ||
| 110 | |||
| 111 | int __meminit vmemmap_populate_basepages(struct page *start_page, | ||
| 112 | unsigned long size, int node) | ||
| 113 | { | ||
| 114 | unsigned long addr = (unsigned long)start_page; | ||
| 115 | unsigned long end = (unsigned long)(start_page + size); | ||
| 116 | pgd_t *pgd; | ||
| 117 | pud_t *pud; | ||
| 118 | pmd_t *pmd; | ||
| 119 | pte_t *pte; | ||
| 120 | |||
| 121 | for (; addr < end; addr += PAGE_SIZE) { | ||
| 122 | pgd = vmemmap_pgd_populate(addr, node); | ||
| 123 | if (!pgd) | ||
| 124 | return -ENOMEM; | ||
| 125 | pud = vmemmap_pud_populate(pgd, addr, node); | ||
| 126 | if (!pud) | ||
| 127 | return -ENOMEM; | ||
| 128 | pmd = vmemmap_pmd_populate(pud, addr, node); | ||
| 129 | if (!pmd) | ||
| 130 | return -ENOMEM; | ||
| 131 | pte = vmemmap_pte_populate(pmd, addr, node); | ||
| 132 | if (!pte) | ||
| 133 | return -ENOMEM; | ||
| 134 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); | ||
| 135 | } | ||
| 136 | |||
| 137 | return 0; | ||
| 138 | } | ||
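The header comment says the architecture must supply vmemmap_populate(); an arch content with base-page mappings could presumably implement it as a thin wrapper around the helper above. A hypothetical example (not from this patch):

    int __meminit vmemmap_populate(struct page *start_page,
                                    unsigned long nr_pages, int node)
    {
            /* Map the section's mem_map with ordinary base pages. */
            return vmemmap_populate_basepages(start_page, nr_pages, node);
    }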
| 139 | |||
| 140 | struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | ||
| 141 | { | ||
| 142 | struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); | ||
| 143 | int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); | ||
| 144 | if (error) | ||
| 145 | return NULL; | ||
| 146 | |||
| 147 | return map; | ||
| 148 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 239f5a720d38..08fb14f5eea3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -9,6 +9,8 @@ | |||
| 9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
| 10 | #include <linux/vmalloc.h> | 10 | #include <linux/vmalloc.h> |
| 11 | #include <asm/dma.h> | 11 | #include <asm/dma.h> |
| 12 | #include <asm/pgalloc.h> | ||
| 13 | #include <asm/pgtable.h> | ||
| 12 | 14 | ||
| 13 | /* | 15 | /* |
| 14 | * Permanent SPARSEMEM data: | 16 | * Permanent SPARSEMEM data: |
| @@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
| 106 | 108 | ||
| 107 | /* | 109 | /* |
| 108 | * Although written for the SPARSEMEM_EXTREME case, this happens | 110 | * Although written for the SPARSEMEM_EXTREME case, this happens |
| 109 | * to also work for the flat array case becase | 111 | * to also work for the flat array case because |
| 110 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | 112 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. |
| 111 | */ | 113 | */ |
| 112 | int __section_nr(struct mem_section* ms) | 114 | int __section_nr(struct mem_section* ms) |
| @@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
| 176 | if (nid != early_pfn_to_nid(pfn)) | 178 | if (nid != early_pfn_to_nid(pfn)) |
| 177 | continue; | 179 | continue; |
| 178 | 180 | ||
| 179 | if (pfn_valid(pfn)) | 181 | if (pfn_present(pfn)) |
| 180 | nr_pages += PAGES_PER_SECTION; | 182 | nr_pages += PAGES_PER_SECTION; |
| 181 | } | 183 | } |
| 182 | 184 | ||
| @@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn | |||
| 204 | } | 206 | } |
| 205 | 207 | ||
| 206 | static int __meminit sparse_init_one_section(struct mem_section *ms, | 208 | static int __meminit sparse_init_one_section(struct mem_section *ms, |
| 207 | unsigned long pnum, struct page *mem_map) | 209 | unsigned long pnum, struct page *mem_map, |
| 210 | unsigned long *pageblock_bitmap) | ||
| 208 | { | 211 | { |
| 209 | if (!valid_section(ms)) | 212 | if (!present_section(ms)) |
| 210 | return -EINVAL; | 213 | return -EINVAL; |
| 211 | 214 | ||
| 212 | ms->section_mem_map &= ~SECTION_MAP_MASK; | 215 | ms->section_mem_map &= ~SECTION_MAP_MASK; |
| 213 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 216 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | |
| 217 | SECTION_HAS_MEM_MAP; | ||
| 218 | ms->pageblock_flags = pageblock_bitmap; | ||
| 214 | 219 | ||
| 215 | return 1; | 220 | return 1; |
| 216 | } | 221 | } |
| @@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | |||
| 221 | return NULL; | 226 | return NULL; |
| 222 | } | 227 | } |
| 223 | 228 | ||
| 224 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 229 | static unsigned long usemap_size(void) |
| 225 | { | 230 | { |
| 226 | struct page *map; | 231 | unsigned long size_bytes; |
| 232 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; | ||
| 233 | size_bytes = roundup(size_bytes, sizeof(unsigned long)); | ||
| 234 | return size_bytes; | ||
| 235 | } | ||
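To get a feel for the numbers (SECTION_BLOCKFLAGS_BITS is architecture and config dependent, so the value below is only an assumption):

    /*
     * Assuming SECTION_BLOCKFLAGS_BITS == 256:
     *   roundup(256, 8) / 8                  = 32 bytes
     *   roundup(32, sizeof(unsigned long))   = 32 bytes on 32- and 64-bit
     * i.e. one small usemap per present memory section.
     */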
| 236 | |||
| 237 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 238 | static unsigned long *__kmalloc_section_usemap(void) | ||
| 239 | { | ||
| 240 | return kmalloc(usemap_size(), GFP_KERNEL); | ||
| 241 | } | ||
| 242 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
| 243 | |||
| 244 | static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) | ||
| 245 | { | ||
| 246 | unsigned long *usemap; | ||
| 227 | struct mem_section *ms = __nr_to_section(pnum); | 247 | struct mem_section *ms = __nr_to_section(pnum); |
| 228 | int nid = sparse_early_nid(ms); | 248 | int nid = sparse_early_nid(ms); |
| 229 | 249 | ||
| 250 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | ||
| 251 | if (usemap) | ||
| 252 | return usemap; | ||
| 253 | |||
| 254 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | ||
| 255 | nid = 0; | ||
| 256 | |||
| 257 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | ||
| 258 | return NULL; | ||
| 259 | } | ||
| 260 | |||
| 261 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
| 262 | struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | ||
| 263 | { | ||
| 264 | struct page *map; | ||
| 265 | |||
| 230 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 266 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
| 231 | if (map) | 267 | if (map) |
| 232 | return map; | 268 | return map; |
| @@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
| 238 | 274 | ||
| 239 | map = alloc_bootmem_node(NODE_DATA(nid), | 275 | map = alloc_bootmem_node(NODE_DATA(nid), |
| 240 | sizeof(struct page) * PAGES_PER_SECTION); | 276 | sizeof(struct page) * PAGES_PER_SECTION); |
| 277 | return map; | ||
| 278 | } | ||
| 279 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 280 | |||
| 281 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | ||
| 282 | { | ||
| 283 | struct page *map; | ||
| 284 | struct mem_section *ms = __nr_to_section(pnum); | ||
| 285 | int nid = sparse_early_nid(ms); | ||
| 286 | |||
| 287 | map = sparse_mem_map_populate(pnum, nid); | ||
| 241 | if (map) | 288 | if (map) |
| 242 | return map; | 289 | return map; |
| 243 | 290 | ||
| 244 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | 291 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " |
| 292 | "some memory will not be available.\n", __FUNCTION__); | ||
| 245 | ms->section_mem_map = 0; | 293 | ms->section_mem_map = 0; |
| 246 | return NULL; | 294 | return NULL; |
| 247 | } | 295 | } |
| @@ -254,19 +302,38 @@ void __init sparse_init(void) | |||
| 254 | { | 302 | { |
| 255 | unsigned long pnum; | 303 | unsigned long pnum; |
| 256 | struct page *map; | 304 | struct page *map; |
| 305 | unsigned long *usemap; | ||
| 257 | 306 | ||
| 258 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 307 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
| 259 | if (!valid_section_nr(pnum)) | 308 | if (!present_section_nr(pnum)) |
| 260 | continue; | 309 | continue; |
| 261 | 310 | ||
| 262 | map = sparse_early_mem_map_alloc(pnum); | 311 | map = sparse_early_mem_map_alloc(pnum); |
| 263 | if (!map) | 312 | if (!map) |
| 264 | continue; | 313 | continue; |
| 265 | sparse_init_one_section(__nr_to_section(pnum), pnum, map); | 314 | |
| 315 | usemap = sparse_early_usemap_alloc(pnum); | ||
| 316 | if (!usemap) | ||
| 317 | continue; | ||
| 318 | |||
| 319 | sparse_init_one_section(__nr_to_section(pnum), pnum, map, | ||
| 320 | usemap); | ||
| 266 | } | 321 | } |
| 267 | } | 322 | } |
| 268 | 323 | ||
| 269 | #ifdef CONFIG_MEMORY_HOTPLUG | 324 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 325 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
| 326 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
| 327 | unsigned long nr_pages) | ||
| 328 | { | ||
| 329 | /* This will make the necessary allocations eventually. */ | ||
| 330 | return sparse_mem_map_populate(pnum, nid); | ||
| 331 | } | ||
| 332 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | ||
| 333 | { | ||
| 334 | return; /* XXX: Not implemented yet */ | ||
| 335 | } | ||
| 336 | #else | ||
| 270 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 337 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
| 271 | { | 338 | { |
| 272 | struct page *page, *ret; | 339 | struct page *page, *ret; |
| @@ -289,6 +356,12 @@ got_map_ptr: | |||
| 289 | return ret; | 356 | return ret; |
| 290 | } | 357 | } |
| 291 | 358 | ||
| 359 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
| 360 | unsigned long nr_pages) | ||
| 361 | { | ||
| 362 | return __kmalloc_section_memmap(nr_pages); | ||
| 363 | } | ||
| 364 | |||
| 292 | static int vaddr_in_vmalloc_area(void *addr) | 365 | static int vaddr_in_vmalloc_area(void *addr) |
| 293 | { | 366 | { |
| 294 | if (addr >= (void *)VMALLOC_START && | 367 | if (addr >= (void *)VMALLOC_START && |
| @@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
| 305 | free_pages((unsigned long)memmap, | 378 | free_pages((unsigned long)memmap, |
| 306 | get_order(sizeof(struct page) * nr_pages)); | 379 | get_order(sizeof(struct page) * nr_pages)); |
| 307 | } | 380 | } |
| 381 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 308 | 382 | ||
| 309 | /* | 383 | /* |
| 310 | * returns the number of sections whose mem_maps were properly | 384 | * returns the number of sections whose mem_maps were properly |
| @@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 318 | struct pglist_data *pgdat = zone->zone_pgdat; | 392 | struct pglist_data *pgdat = zone->zone_pgdat; |
| 319 | struct mem_section *ms; | 393 | struct mem_section *ms; |
| 320 | struct page *memmap; | 394 | struct page *memmap; |
| 395 | unsigned long *usemap; | ||
| 321 | unsigned long flags; | 396 | unsigned long flags; |
| 322 | int ret; | 397 | int ret; |
| 323 | 398 | ||
| @@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 326 | * plus, it does a kmalloc | 401 | * plus, it does a kmalloc |
| 327 | */ | 402 | */ |
| 328 | sparse_index_init(section_nr, pgdat->node_id); | 403 | sparse_index_init(section_nr, pgdat->node_id); |
| 329 | memmap = __kmalloc_section_memmap(nr_pages); | 404 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); |
| 405 | usemap = __kmalloc_section_usemap(); | ||
| 330 | 406 | ||
| 331 | pgdat_resize_lock(pgdat, &flags); | 407 | pgdat_resize_lock(pgdat, &flags); |
| 332 | 408 | ||
| @@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 335 | ret = -EEXIST; | 411 | ret = -EEXIST; |
| 336 | goto out; | 412 | goto out; |
| 337 | } | 413 | } |
| 414 | |||
| 415 | if (!usemap) { | ||
| 416 | ret = -ENOMEM; | ||
| 417 | goto out; | ||
| 418 | } | ||
| 338 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 419 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
| 339 | 420 | ||
| 340 | ret = sparse_init_one_section(ms, section_nr, memmap); | 421 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
| 341 | 422 | ||
| 342 | out: | 423 | out: |
| 343 | pgdat_resize_unlock(pgdat, &flags); | 424 | pgdat_resize_unlock(pgdat, &flags); |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -24,16 +24,18 @@ | |||
| 24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
| 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ |
| 27 | #include <linux/module.h> | ||
| 28 | #include <linux/percpu_counter.h> | 27 | #include <linux/percpu_counter.h> |
| 29 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
| 30 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
| 31 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
| 32 | #include <linux/init.h> | ||
| 33 | 31 | ||
| 34 | /* How many pages do we try to swap or page in/out together? */ | 32 | /* How many pages do we try to swap or page in/out together? */ |
| 35 | int page_cluster; | 33 | int page_cluster; |
| 36 | 34 | ||
| 35 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
| 36 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
| 37 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | ||
| 38 | |||
| 37 | /* | 39 | /* |
| 38 | * This path almost never happens for VM activity - pages are normally | 40 | * This path almost never happens for VM activity - pages are normally |
| 39 | * freed via pagevecs. But it gets used by networking. | 41 | * freed via pagevecs. But it gets used by networking. |
| @@ -94,23 +96,47 @@ void put_pages_list(struct list_head *pages) | |||
| 94 | EXPORT_SYMBOL(put_pages_list); | 96 | EXPORT_SYMBOL(put_pages_list); |
| 95 | 97 | ||
| 96 | /* | 98 | /* |
| 99 | * pagevec_move_tail() must be called with IRQ disabled. | ||
| 100 | * Otherwise this may cause nasty races. | ||
| 101 | */ | ||
| 102 | static void pagevec_move_tail(struct pagevec *pvec) | ||
| 103 | { | ||
| 104 | int i; | ||
| 105 | int pgmoved = 0; | ||
| 106 | struct zone *zone = NULL; | ||
| 107 | |||
| 108 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
| 109 | struct page *page = pvec->pages[i]; | ||
| 110 | struct zone *pagezone = page_zone(page); | ||
| 111 | |||
| 112 | if (pagezone != zone) { | ||
| 113 | if (zone) | ||
| 114 | spin_unlock(&zone->lru_lock); | ||
| 115 | zone = pagezone; | ||
| 116 | spin_lock(&zone->lru_lock); | ||
| 117 | } | ||
| 118 | if (PageLRU(page) && !PageActive(page)) { | ||
| 119 | list_move_tail(&page->lru, &zone->inactive_list); | ||
| 120 | pgmoved++; | ||
| 121 | } | ||
| 122 | } | ||
| 123 | if (zone) | ||
| 124 | spin_unlock(&zone->lru_lock); | ||
| 125 | __count_vm_events(PGROTATED, pgmoved); | ||
| 126 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 127 | pagevec_reinit(pvec); | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 97 | * Writeback is about to end against a page which has been marked for immediate | 131 | * Writeback is about to end against a page which has been marked for immediate |
| 98 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 132 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
| 99 | * inactive list. The page still has PageWriteback set, which will pin it. | 133 | * inactive list. |
| 100 | * | ||
| 101 | * We don't expect many pages to come through here, so don't bother batching | ||
| 102 | * things up. | ||
| 103 | * | ||
| 104 | * To avoid placing the page at the tail of the LRU while PG_writeback is still | ||
| 105 | * set, this function will clear PG_writeback before performing the page | ||
| 106 | * motion. Do that inside the lru lock because once PG_writeback is cleared | ||
| 107 | * we may not touch the page. | ||
| 108 | * | 134 | * |
| 109 | * Returns zero if it cleared PG_writeback. | 135 | * Returns zero if it cleared PG_writeback. |
| 110 | */ | 136 | */ |
| 111 | int rotate_reclaimable_page(struct page *page) | 137 | int rotate_reclaimable_page(struct page *page) |
| 112 | { | 138 | { |
| 113 | struct zone *zone; | 139 | struct pagevec *pvec; |
| 114 | unsigned long flags; | 140 | unsigned long flags; |
| 115 | 141 | ||
| 116 | if (PageLocked(page)) | 142 | if (PageLocked(page)) |
| @@ -122,15 +148,16 @@ int rotate_reclaimable_page(struct page *page) | |||
| 122 | if (!PageLRU(page)) | 148 | if (!PageLRU(page)) |
| 123 | return 1; | 149 | return 1; |
| 124 | 150 | ||
| 125 | zone = page_zone(page); | 151 | page_cache_get(page); |
| 126 | spin_lock_irqsave(&zone->lru_lock, flags); | 152 | local_irq_save(flags); |
| 127 | if (PageLRU(page) && !PageActive(page)) { | 153 | pvec = &__get_cpu_var(lru_rotate_pvecs); |
| 128 | list_move_tail(&page->lru, &zone->inactive_list); | 154 | if (!pagevec_add(pvec, page)) |
| 129 | __count_vm_event(PGROTATED); | 155 | pagevec_move_tail(pvec); |
| 130 | } | 156 | local_irq_restore(flags); |
| 157 | |||
| 131 | if (!test_clear_page_writeback(page)) | 158 | if (!test_clear_page_writeback(page)) |
| 132 | BUG(); | 159 | BUG(); |
| 133 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 160 | |
| 134 | return 0; | 161 | return 0; |
| 135 | } | 162 | } |
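A rough sketch of what the batching above buys (my own estimate; PAGEVEC_SIZE is assumed to be 14, its usual value in pagevec.h):

    /*
     * Old behaviour: every rotated page took zone->lru_lock once.
     *     N rotations  ->  N spin_lock_irqsave(&zone->lru_lock) round trips
     *
     * New behaviour: pages are parked in the per-cpu lru_rotate_pvecs and
     * moved in batches by pagevec_move_tail(), which takes the lock once
     * per run of same-zone pages in the batch.
     *     N rotations  ->  roughly N / 14 lock round trips in the best case
     */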
| 136 | 163 | ||
| @@ -174,9 +201,6 @@ EXPORT_SYMBOL(mark_page_accessed); | |||
| 174 | * lru_cache_add: add a page to the page lists | 201 | * lru_cache_add: add a page to the page lists |
| 175 | * @page: the page to add | 202 | * @page: the page to add |
| 176 | */ | 203 | */ |
| 177 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
| 178 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
| 179 | |||
| 180 | void fastcall lru_cache_add(struct page *page) | 204 | void fastcall lru_cache_add(struct page *page) |
| 181 | { | 205 | { |
| 182 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 206 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); |
| @@ -197,21 +221,37 @@ void fastcall lru_cache_add_active(struct page *page) | |||
| 197 | put_cpu_var(lru_add_active_pvecs); | 221 | put_cpu_var(lru_add_active_pvecs); |
| 198 | } | 222 | } |
| 199 | 223 | ||
| 200 | static void __lru_add_drain(int cpu) | 224 | /* |
| 225 | * Drain pages out of the cpu's pagevecs. | ||
| 226 | * Either "cpu" is the current CPU, and preemption has already been | ||
| 227 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | ||
| 228 | */ | ||
| 229 | static void drain_cpu_pagevecs(int cpu) | ||
| 201 | { | 230 | { |
| 202 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | 231 | struct pagevec *pvec; |
| 203 | 232 | ||
| 204 | /* CPU is dead, so no locking needed. */ | 233 | pvec = &per_cpu(lru_add_pvecs, cpu); |
| 205 | if (pagevec_count(pvec)) | 234 | if (pagevec_count(pvec)) |
| 206 | __pagevec_lru_add(pvec); | 235 | __pagevec_lru_add(pvec); |
| 236 | |||
| 207 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | 237 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
| 208 | if (pagevec_count(pvec)) | 238 | if (pagevec_count(pvec)) |
| 209 | __pagevec_lru_add_active(pvec); | 239 | __pagevec_lru_add_active(pvec); |
| 240 | |||
| 241 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | ||
| 242 | if (pagevec_count(pvec)) { | ||
| 243 | unsigned long flags; | ||
| 244 | |||
| 245 | /* No harm done if a racing interrupt already did this */ | ||
| 246 | local_irq_save(flags); | ||
| 247 | pagevec_move_tail(pvec); | ||
| 248 | local_irq_restore(flags); | ||
| 249 | } | ||
| 210 | } | 250 | } |
| 211 | 251 | ||
| 212 | void lru_add_drain(void) | 252 | void lru_add_drain(void) |
| 213 | { | 253 | { |
| 214 | __lru_add_drain(get_cpu()); | 254 | drain_cpu_pagevecs(get_cpu()); |
| 215 | put_cpu(); | 255 | put_cpu(); |
| 216 | } | 256 | } |
| 217 | 257 | ||
| @@ -258,6 +298,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 258 | int i; | 298 | int i; |
| 259 | struct pagevec pages_to_free; | 299 | struct pagevec pages_to_free; |
| 260 | struct zone *zone = NULL; | 300 | struct zone *zone = NULL; |
| 301 | unsigned long uninitialized_var(flags); | ||
| 261 | 302 | ||
| 262 | pagevec_init(&pages_to_free, cold); | 303 | pagevec_init(&pages_to_free, cold); |
| 263 | for (i = 0; i < nr; i++) { | 304 | for (i = 0; i < nr; i++) { |
| @@ -265,7 +306,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 265 | 306 | ||
| 266 | if (unlikely(PageCompound(page))) { | 307 | if (unlikely(PageCompound(page))) { |
| 267 | if (zone) { | 308 | if (zone) { |
| 268 | spin_unlock_irq(&zone->lru_lock); | 309 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 269 | zone = NULL; | 310 | zone = NULL; |
| 270 | } | 311 | } |
| 271 | put_compound_page(page); | 312 | put_compound_page(page); |
| @@ -279,9 +320,10 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 279 | struct zone *pagezone = page_zone(page); | 320 | struct zone *pagezone = page_zone(page); |
| 280 | if (pagezone != zone) { | 321 | if (pagezone != zone) { |
| 281 | if (zone) | 322 | if (zone) |
| 282 | spin_unlock_irq(&zone->lru_lock); | 323 | spin_unlock_irqrestore(&zone->lru_lock, |
| 324 | flags); | ||
| 283 | zone = pagezone; | 325 | zone = pagezone; |
| 284 | spin_lock_irq(&zone->lru_lock); | 326 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 285 | } | 327 | } |
| 286 | VM_BUG_ON(!PageLRU(page)); | 328 | VM_BUG_ON(!PageLRU(page)); |
| 287 | __ClearPageLRU(page); | 329 | __ClearPageLRU(page); |
| @@ -290,7 +332,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 290 | 332 | ||
| 291 | if (!pagevec_add(&pages_to_free, page)) { | 333 | if (!pagevec_add(&pages_to_free, page)) { |
| 292 | if (zone) { | 334 | if (zone) { |
| 293 | spin_unlock_irq(&zone->lru_lock); | 335 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 294 | zone = NULL; | 336 | zone = NULL; |
| 295 | } | 337 | } |
| 296 | __pagevec_free(&pages_to_free); | 338 | __pagevec_free(&pages_to_free); |
| @@ -298,7 +340,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 298 | } | 340 | } |
| 299 | } | 341 | } |
| 300 | if (zone) | 342 | if (zone) |
| 301 | spin_unlock_irq(&zone->lru_lock); | 343 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 302 | 344 | ||
| 303 | pagevec_free(&pages_to_free); | 345 | pagevec_free(&pages_to_free); |
| 304 | } | 346 | } |
| @@ -491,7 +533,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
| 491 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 533 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
| 492 | atomic_add(*committed, &vm_committed_space); | 534 | atomic_add(*committed, &vm_committed_space); |
| 493 | *committed = 0; | 535 | *committed = 0; |
| 494 | __lru_add_drain((long)hcpu); | 536 | drain_cpu_pagevecs((long)hcpu); |
| 495 | } | 537 | } |
| 496 | return NOTIFY_OK; | 538 | return NOTIFY_OK; |
| 497 | } | 539 | } |
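The mm/swap.c hunks above replace the per-page zone->lru_lock acquisition in rotate_reclaimable_page() with a per-CPU pagevec (lru_rotate_pvecs) that pagevec_move_tail() flushes in one go, and drain_cpu_pagevecs() now drains it alongside the lru_add pagevecs. As a rough illustration of that batching idea, here is a minimal userspace sketch; the buffer size, struct and function names are invented for the example and are not the kernel's API.

```c
/* Simplified model of batched LRU rotation: gather pages per CPU and take
 * the (expensive) list lock once per batch instead of once per page.
 * All names here are illustrative; see pagevec_move_tail() above for the
 * real kernel implementation. */
#include <stdio.h>

#define BATCH 14                 /* pagevec-like batch size (illustrative) */

struct batch {
    int nr;
    int pages[BATCH];            /* stand-ins for struct page pointers */
};

static void move_tail_locked(int page)
{
    /* In the kernel this is list_move_tail() under zone->lru_lock. */
    printf("rotate page %d to tail of inactive list\n", page);
}

/* Flush the whole batch under a single (conceptual) lock acquisition. */
static void batch_move_tail(struct batch *b)
{
    /* lock(); */
    for (int i = 0; i < b->nr; i++)
        move_tail_locked(b->pages[i]);
    /* unlock(); */
    b->nr = 0;
}

/* Analogue of rotate_reclaimable_page(): queue, flush only when full. */
static void rotate_page(struct batch *b, int page)
{
    b->pages[b->nr++] = page;
    if (b->nr == BATCH)
        batch_move_tail(b);
}

int main(void)
{
    struct batch b = { 0 };

    for (int page = 0; page < 20; page++)
        rotate_page(&b, page);
    batch_move_tail(&b);         /* drain leftovers, like drain_cpu_pagevecs() */
    return 0;
}
```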
diff --git a/mm/swap_state.c b/mm/swap_state.c index 67daecb6031a..b52635601dfe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
| 74 | { | 74 | { |
| 75 | int error; | 75 | int error; |
| 76 | 76 | ||
| 77 | BUG_ON(!PageLocked(page)); | ||
| 77 | BUG_ON(PageSwapCache(page)); | 78 | BUG_ON(PageSwapCache(page)); |
| 78 | BUG_ON(PagePrivate(page)); | 79 | BUG_ON(PagePrivate(page)); |
| 79 | error = radix_tree_preload(gfp_mask); | 80 | error = radix_tree_preload(gfp_mask); |
| @@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
| 83 | entry.val, page); | 84 | entry.val, page); |
| 84 | if (!error) { | 85 | if (!error) { |
| 85 | page_cache_get(page); | 86 | page_cache_get(page); |
| 86 | SetPageLocked(page); | ||
| 87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); |
| 88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); |
| 89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
| @@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
| 99 | { | 99 | { |
| 100 | int error; | 100 | int error; |
| 101 | 101 | ||
| 102 | BUG_ON(PageLocked(page)); | ||
| 102 | if (!swap_duplicate(entry)) { | 103 | if (!swap_duplicate(entry)) { |
| 103 | INC_CACHE_INFO(noent_race); | 104 | INC_CACHE_INFO(noent_race); |
| 104 | return -ENOENT; | 105 | return -ENOENT; |
| 105 | } | 106 | } |
| 107 | SetPageLocked(page); | ||
| 106 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); | 108 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); |
| 107 | /* | 109 | /* |
| 108 | * Anon pages are already on the LRU, we don't run lru_cache_add here. | 110 | * Anon pages are already on the LRU, we don't run lru_cache_add here. |
| 109 | */ | 111 | */ |
| 110 | if (error) { | 112 | if (error) { |
| 113 | ClearPageLocked(page); | ||
| 111 | swap_free(entry); | 114 | swap_free(entry); |
| 112 | if (error == -EEXIST) | 115 | if (error == -EEXIST) |
| 113 | INC_CACHE_INFO(exist_race); | 116 | INC_CACHE_INFO(exist_race); |
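The mm/swap_state.c hunk moves page locking out of __add_to_swap_cache(): add_to_swap_cache() now sets the page lock before the radix-tree insert, clears it again if the insert fails, and __add_to_swap_cache() merely asserts PageLocked(). A minimal sketch of that lock-before-insert shape, with placeholder names rather than the real page-flag helpers, is below.

```c
/* Sketch of the lock-before-insert pattern adopted above: the caller owns
 * the page lock across the cache insertion and releases it on failure.
 * try_insert() and the flag field are placeholders, not kernel APIs. */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>

struct fake_page { bool locked; };

/* Models __add_to_swap_cache(): insertion requires the page to be locked. */
static int try_insert(struct fake_page *page)
{
    assert(page->locked);        /* BUG_ON(!PageLocked(page)) equivalent */
    return -EEXIST;              /* pretend someone else inserted it first */
}

static int add_to_cache(struct fake_page *page)
{
    page->locked = true;         /* SetPageLocked() before the attempt */
    int error = try_insert(page);
    if (error)
        page->locked = false;    /* ClearPageLocked() on failure */
    return error;
}

int main(void)
{
    struct fake_page page = { false };
    return add_to_cache(&page) == -EEXIST ? 0 : 1;
}
```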
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup); | |||
| 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
| 82 | { | 82 | { |
| 83 | void *ret; | 83 | void *ret; |
| 84 | size_t ks; | 84 | size_t ks = 0; |
| 85 | 85 | ||
| 86 | if (unlikely(!new_size)) { | 86 | if (unlikely(!new_size)) { |
| 87 | kfree(p); | 87 | kfree(p); |
| 88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | ks = ksize(p); | 91 | if (p) |
| 92 | ks = ksize(p); | ||
| 93 | |||
| 92 | if (ks >= new_size) | 94 | if (ks >= new_size) |
| 93 | return (void *)p; | 95 | return (void *)p; |
| 94 | 96 | ||
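The krealloc() change initialises ks to 0 and calls ksize() only when p is non-NULL, so krealloc(NULL, size, flags) degenerates into a fresh allocation instead of dereferencing NULL. A userspace analogue of that control flow is sketched below; my_ksize() and the explicit old_size parameter are stand-ins for the kernel's slab bookkeeping.

```c
/* Userspace analogue of the krealloc() fix: treat a NULL source as a
 * zero-sized allocation so the "reuse or grow" logic still works.
 * my_ksize() is a stand-in; the real ksize() reports the slab object size. */
#include <stdlib.h>
#include <string.h>

static size_t my_ksize(const void *p, size_t tracked_size)
{
    return p ? tracked_size : 0;
}

static void *my_krealloc(void *p, size_t old_size, size_t new_size)
{
    size_t ks = 0;

    if (!new_size) {
        free(p);
        return NULL;             /* the kernel returns ZERO_SIZE_PTR here */
    }

    if (p)                       /* the added NULL check */
        ks = my_ksize(p, old_size);

    if (ks >= new_size)          /* existing object is already big enough */
        return p;

    void *ret = malloc(new_size);
    if (ret && p) {
        memcpy(ret, p, ks);
        free(p);
    }
    return ret;
}

int main(void)
{
    char *buf = my_krealloc(NULL, 0, 16);   /* no NULL dereference */
    buf = my_krealloc(buf, 16, 8);          /* reuses the same object */
    free(buf);
    return 0;
}
```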
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3cee76a8c9f0..2e01af365848 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl | |||
| 190 | if (unlikely(!size)) | 190 | if (unlikely(!size)) |
| 191 | return NULL; | 191 | return NULL; |
| 192 | 192 | ||
| 193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); | 193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
| 194 | |||
| 194 | if (unlikely(!area)) | 195 | if (unlikely(!area)) |
| 195 | return NULL; | 196 | return NULL; |
| 196 | 197 | ||
| @@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
| 439 | area->flags |= VM_VPAGES; | 440 | area->flags |= VM_VPAGES; |
| 440 | } else { | 441 | } else { |
| 441 | pages = kmalloc_node(array_size, | 442 | pages = kmalloc_node(array_size, |
| 442 | (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, | 443 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, |
| 443 | node); | 444 | node); |
| 444 | } | 445 | } |
| 445 | area->pages = pages; | 446 | area->pages = pages; |
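The mm/vmalloc.c hunks switch the internal kmalloc_node() calls from GFP_LEVEL_MASK to GFP_RECLAIM_MASK, i.e. only the reclaim-behaviour bits of the caller's gfp flags are propagated to the bookkeeping allocations. The fragment below illustrates that kind of flag filtering in the abstract; the flag values and mask are invented for the example and do not reflect the real gfp bit layout.

```c
/* Illustration of filtering caller flags down to an allowed subset before
 * passing them to an internal allocation, as the vmalloc hunks do with
 * GFP_RECLAIM_MASK.  Flag values are invented for this example. */
#include <stdio.h>

#define FLAG_WAIT      0x01u     /* may sleep */
#define FLAG_IO        0x02u     /* may start I/O */
#define FLAG_FS        0x04u     /* may call into the filesystem */
#define FLAG_HIGHMEM   0x10u     /* placement hint: meaningless internally */
#define FLAG_ZERO      0x20u     /* zero the allocation */

/* Only the reclaim-behaviour bits are propagated to internal allocations. */
#define RECLAIM_MASK   (FLAG_WAIT | FLAG_IO | FLAG_FS)

static unsigned int internal_alloc_flags(unsigned int caller_flags)
{
    return (caller_flags & RECLAIM_MASK) | FLAG_ZERO;
}

int main(void)
{
    unsigned int caller = FLAG_WAIT | FLAG_IO | FLAG_HIGHMEM;

    printf("caller=%#x internal=%#x\n", caller, internal_alloc_flags(caller));
    return 0;
}
```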
diff --git a/mm/vmscan.c b/mm/vmscan.c index a6e65d024995..bbd194630c5b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 932 | long mapped_ratio; | 932 | long mapped_ratio; |
| 933 | long distress; | 933 | long distress; |
| 934 | long swap_tendency; | 934 | long swap_tendency; |
| 935 | long imbalance; | ||
| 935 | 936 | ||
| 936 | if (zone_is_near_oom(zone)) | 937 | if (zone_is_near_oom(zone)) |
| 937 | goto force_reclaim_mapped; | 938 | goto force_reclaim_mapped; |
| @@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 967 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | 968 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
| 968 | 969 | ||
| 969 | /* | 970 | /* |
| 971 | * If there's a huge imbalance between active and inactive | ||
| 972 | * (think active 100 times larger than inactive) we should | ||
| 973 | * become more permissive, or the system will take too much | ||
| 974 | * CPU before it starts swapping during memory pressure. | ||
| 975 | * Distress is about avoiding early-oom; this is about | ||
| 976 | * keeping swappiness graceful even when it is set to | ||
| 977 | * low values. | ||
| 978 | * | ||
| 979 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
| 980 | * value is vm_total_pages. | ||
| 981 | */ | ||
| 982 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
| 983 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
| 984 | |||
| 985 | /* | ||
| 986 | * Reduce the effect of imbalance if swappiness is low, | ||
| 987 | * this means for a swappiness very low, the imbalance | ||
| 988 | * must be much higher than 100 for this logic to make | ||
| 989 | * the difference. | ||
| 990 | * | ||
| 991 | * Max temporary value is vm_total_pages*100. | ||
| 992 | */ | ||
| 993 | imbalance *= (vm_swappiness + 1); | ||
| 994 | imbalance /= 100; | ||
| 995 | |||
| 996 | /* | ||
| 997 | * If not much of the RAM is mapped, make the imbalance | ||
| 998 | * less relevant; refilling the inactive list with mapped | ||
| 999 | * pages is only a high priority when the ratio of mapped | ||
| 1000 | * pages is high. | ||
| 1001 | * | ||
| 1002 | * Max temporary value is vm_total_pages*100. | ||
| 1003 | */ | ||
| 1004 | imbalance *= mapped_ratio; | ||
| 1005 | imbalance /= 100; | ||
| 1006 | |||
| 1007 | /* apply imbalance feedback to swap_tendency */ | ||
| 1008 | swap_tendency += imbalance; | ||
| 1009 | |||
| 1010 | /* | ||
| 970 | * Now use this metric to decide whether to start moving mapped | 1011 | * Now use this metric to decide whether to start moving mapped |
| 971 | * memory onto the inactive list. | 1012 | * memory onto the inactive list. |
| 972 | */ | 1013 | */ |
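The new block in shrink_active_list() adds an imbalance term to swap_tendency: the active/inactive ratio, damped first by swappiness and then by mapped_ratio, with integer divisions in that order. The standalone function below reproduces that arithmetic so the effect is easy to follow; the sample inputs in main() are made up for illustration.

```c
/* Standalone rendering of the swap_tendency calculation above, including
 * the new imbalance feedback.  Inputs are sample values, not measurements. */
#include <stdio.h>

static long swap_tendency(long mapped_ratio, long distress, long swappiness,
                          long nr_active, long nr_inactive)
{
    long tendency = mapped_ratio / 2 + distress + swappiness;

    /* active/inactive ratio; the +1 avoids a division by zero */
    long imbalance = nr_active / (nr_inactive + 1);

    imbalance *= swappiness + 1;     /* damp when swappiness is low */
    imbalance /= 100;

    imbalance *= mapped_ratio;       /* damp when little RAM is mapped */
    imbalance /= 100;

    return tendency + imbalance;
}

int main(void)
{
    /* active list ~200x the inactive list, 80% mapped, swappiness 10:
     * 80/2 + 0 + 10 = 50, imbalance term = 16, so this prints 66 */
    printf("%ld\n", swap_tendency(80, 0, 10, 200000, 1000));
    return 0;
}
```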
| @@ -1371,7 +1412,13 @@ loop_again: | |||
| 1371 | temp_priority[i] = priority; | 1412 | temp_priority[i] = priority; |
| 1372 | sc.nr_scanned = 0; | 1413 | sc.nr_scanned = 0; |
| 1373 | note_zone_scanning_priority(zone, priority); | 1414 | note_zone_scanning_priority(zone, priority); |
| 1374 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1415 | /* |
| 1416 | * We put equal pressure on every zone, unless one | ||
| 1417 | * zone has way too many pages free already. | ||
| 1418 | */ | ||
| 1419 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | ||
| 1420 | end_zone, 0)) | ||
| 1421 | nr_reclaimed += shrink_zone(priority, zone, &sc); | ||
| 1375 | reclaim_state->reclaimed_slab = 0; | 1422 | reclaim_state->reclaimed_slab = 0; |
| 1376 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1423 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
| 1377 | lru_pages); | 1424 | lru_pages); |
| @@ -1688,9 +1735,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
| 1688 | { | 1735 | { |
| 1689 | pg_data_t *pgdat; | 1736 | pg_data_t *pgdat; |
| 1690 | cpumask_t mask; | 1737 | cpumask_t mask; |
| 1738 | int nid; | ||
| 1691 | 1739 | ||
| 1692 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 1740 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
| 1693 | for_each_online_pgdat(pgdat) { | 1741 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 1742 | pgdat = NODE_DATA(nid); | ||
| 1694 | mask = node_to_cpumask(pgdat->node_id); | 1743 | mask = node_to_cpumask(pgdat->node_id); |
| 1695 | if (any_online_cpu(mask) != NR_CPUS) | 1744 | if (any_online_cpu(mask) != NR_CPUS) |
| 1696 | /* One of our CPUs online: restore mask */ | 1745 | /* One of our CPUs online: restore mask */ |
| @@ -1727,7 +1776,7 @@ static int __init kswapd_init(void) | |||
| 1727 | int nid; | 1776 | int nid; |
| 1728 | 1777 | ||
| 1729 | swap_setup(); | 1778 | swap_setup(); |
| 1730 | for_each_online_node(nid) | 1779 | for_each_node_state(nid, N_HIGH_MEMORY) |
| 1731 | kswapd_run(nid); | 1780 | kswapd_run(nid); |
| 1732 | hotcpu_notifier(cpu_callback, 0); | 1781 | hotcpu_notifier(cpu_callback, 0); |
| 1733 | return 0; | 1782 | return 0; |
| @@ -1847,7 +1896,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1847 | 1896 | ||
| 1848 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1897 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
| 1849 | { | 1898 | { |
| 1850 | cpumask_t mask; | ||
| 1851 | int node_id; | 1899 | int node_id; |
| 1852 | 1900 | ||
| 1853 | /* | 1901 | /* |
| @@ -1884,8 +1932,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1884 | * as wide as possible. | 1932 | * as wide as possible. |
| 1885 | */ | 1933 | */ |
| 1886 | node_id = zone_to_nid(zone); | 1934 | node_id = zone_to_nid(zone); |
| 1887 | mask = node_to_cpumask(node_id); | 1935 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
| 1888 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
| 1889 | return 0; | 1936 | return 0; |
| 1890 | return __zone_reclaim(zone, gfp_mask, order); | 1937 | return __zone_reclaim(zone, gfp_mask, order); |
| 1891 | } | 1938 | } |
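The kswapd and zone_reclaim() hunks stop scanning every online node (or building a cpumask by hand) and instead test node state bits: kswapd is started only on N_HIGH_MEMORY nodes, and zone_reclaim() bails out early for remote nodes that have CPUs (N_CPU). The toy bitmask below mimics that node-state filtering; the state names and helpers are illustrative stand-ins, not the kernel's nodemask API.

```c
/* Toy model of node-state filtering: keep one bitmask per node state and
 * iterate only the nodes whose bit is set, as for_each_node_state() does.
 * State names and helpers here are illustrative, not kernel definitions. */
#include <stdio.h>

#define MAX_NODES 8

enum toy_node_state { TOY_N_ONLINE, TOY_N_HIGH_MEMORY, TOY_N_CPU, TOY_NR_STATES };

static unsigned int node_states[TOY_NR_STATES];

static int node_state(int nid, enum toy_node_state state)
{
    return (node_states[state] >> nid) & 1u;
}

static void set_node_state(int nid, enum toy_node_state state)
{
    node_states[state] |= 1u << nid;
}

int main(void)
{
    /* node 0: memory + CPUs, node 1: memoryless, node 2: memory, no CPUs */
    set_node_state(0, TOY_N_HIGH_MEMORY); set_node_state(0, TOY_N_CPU);
    set_node_state(1, TOY_N_CPU);
    set_node_state(2, TOY_N_HIGH_MEMORY);

    /* start a "kswapd" only on nodes that actually have memory */
    for (int nid = 0; nid < MAX_NODES; nid++)
        if (node_state(nid, TOY_N_HIGH_MEMORY))
            printf("start kswapd on node %d\n", nid);

    /* zone_reclaim()-style check: skip remote nodes that have CPUs */
    int nid = 2, this_node = 0;
    if (node_state(nid, TOY_N_CPU) && nid != this_node)
        printf("skip reclaim on remote node %d\n", nid);
    else
        printf("reclaim locally on node %d\n", nid);
    return 0;
}
```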
diff --git a/mm/vmstat.c b/mm/vmstat.c index c64d169537bf..3b5e9043e7db 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu) | |||
| 353 | } | 353 | } |
| 354 | } | 354 | } |
| 355 | 355 | ||
| 356 | static void __refresh_cpu_vm_stats(void *dummy) | ||
| 357 | { | ||
| 358 | refresh_cpu_vm_stats(smp_processor_id()); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Consolidate all counters. | ||
| 363 | * | ||
| 364 | * Note that the result is less inaccurate but still inaccurate | ||
| 365 | * if concurrent processes are allowed to run. | ||
| 366 | */ | ||
| 367 | void refresh_vm_stats(void) | ||
| 368 | { | ||
| 369 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
| 370 | } | ||
| 371 | EXPORT_SYMBOL(refresh_vm_stats); | ||
| 372 | |||
| 373 | #endif | 356 | #endif |
| 374 | 357 | ||
| 375 | #ifdef CONFIG_NUMA | 358 | #ifdef CONFIG_NUMA |
| @@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
| 398 | 381 | ||
| 399 | #include <linux/seq_file.h> | 382 | #include <linux/seq_file.h> |
| 400 | 383 | ||
| 384 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
| 385 | "Unmovable", | ||
| 386 | "Reclaimable", | ||
| 387 | "Movable", | ||
| 388 | "Reserve", | ||
| 389 | }; | ||
| 390 | |||
| 401 | static void *frag_start(struct seq_file *m, loff_t *pos) | 391 | static void *frag_start(struct seq_file *m, loff_t *pos) |
| 402 | { | 392 | { |
| 403 | pg_data_t *pgdat; | 393 | pg_data_t *pgdat; |
| @@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg) | |||
| 422 | { | 412 | { |
| 423 | } | 413 | } |
| 424 | 414 | ||
| 425 | /* | 415 | /* Walk all the zones in a node and print using a callback */ |
| 426 | * This walks the free areas for each zone. | 416 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, |
| 427 | */ | 417 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) |
| 428 | static int frag_show(struct seq_file *m, void *arg) | ||
| 429 | { | 418 | { |
| 430 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 431 | struct zone *zone; | 419 | struct zone *zone; |
| 432 | struct zone *node_zones = pgdat->node_zones; | 420 | struct zone *node_zones = pgdat->node_zones; |
| 433 | unsigned long flags; | 421 | unsigned long flags; |
| 434 | int order; | ||
| 435 | 422 | ||
| 436 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 423 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
| 437 | if (!populated_zone(zone)) | 424 | if (!populated_zone(zone)) |
| 438 | continue; | 425 | continue; |
| 439 | 426 | ||
| 440 | spin_lock_irqsave(&zone->lock, flags); | 427 | spin_lock_irqsave(&zone->lock, flags); |
| 441 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | 428 | print(m, pgdat, zone); |
| 442 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 443 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 444 | spin_unlock_irqrestore(&zone->lock, flags); | 429 | spin_unlock_irqrestore(&zone->lock, flags); |
| 430 | } | ||
| 431 | } | ||
| 432 | |||
| 433 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | ||
| 434 | struct zone *zone) | ||
| 435 | { | ||
| 436 | int order; | ||
| 437 | |||
| 438 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 439 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 440 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 441 | seq_putc(m, '\n'); | ||
| 442 | } | ||
| 443 | |||
| 444 | /* | ||
| 445 | * This walks the free areas for each zone. | ||
| 446 | */ | ||
| 447 | static int frag_show(struct seq_file *m, void *arg) | ||
| 448 | { | ||
| 449 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 450 | walk_zones_in_node(m, pgdat, frag_show_print); | ||
| 451 | return 0; | ||
| 452 | } | ||
| 453 | |||
| 454 | static void pagetypeinfo_showfree_print(struct seq_file *m, | ||
| 455 | pg_data_t *pgdat, struct zone *zone) | ||
| 456 | { | ||
| 457 | int order, mtype; | ||
| 458 | |||
| 459 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { | ||
| 460 | seq_printf(m, "Node %4d, zone %8s, type %12s ", | ||
| 461 | pgdat->node_id, | ||
| 462 | zone->name, | ||
| 463 | migratetype_names[mtype]); | ||
| 464 | for (order = 0; order < MAX_ORDER; ++order) { | ||
| 465 | unsigned long freecount = 0; | ||
| 466 | struct free_area *area; | ||
| 467 | struct list_head *curr; | ||
| 468 | |||
| 469 | area = &(zone->free_area[order]); | ||
| 470 | |||
| 471 | list_for_each(curr, &area->free_list[mtype]) | ||
| 472 | freecount++; | ||
| 473 | seq_printf(m, "%6lu ", freecount); | ||
| 474 | } | ||
| 445 | seq_putc(m, '\n'); | 475 | seq_putc(m, '\n'); |
| 446 | } | 476 | } |
| 477 | } | ||
| 478 | |||
| 479 | /* Print out the free pages at each order for each migratetype */ | ||
| 480 | static int pagetypeinfo_showfree(struct seq_file *m, void *arg) | ||
| 481 | { | ||
| 482 | int order; | ||
| 483 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 484 | |||
| 485 | /* Print header */ | ||
| 486 | seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); | ||
| 487 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 488 | seq_printf(m, "%6d ", order); | ||
| 489 | seq_putc(m, '\n'); | ||
| 490 | |||
| 491 | walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); | ||
| 492 | |||
| 493 | return 0; | ||
| 494 | } | ||
| 495 | |||
| 496 | static void pagetypeinfo_showblockcount_print(struct seq_file *m, | ||
| 497 | pg_data_t *pgdat, struct zone *zone) | ||
| 498 | { | ||
| 499 | int mtype; | ||
| 500 | unsigned long pfn; | ||
| 501 | unsigned long start_pfn = zone->zone_start_pfn; | ||
| 502 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | ||
| 503 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
| 504 | |||
| 505 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
| 506 | struct page *page; | ||
| 507 | |||
| 508 | if (!pfn_valid(pfn)) | ||
| 509 | continue; | ||
| 510 | |||
| 511 | page = pfn_to_page(pfn); | ||
| 512 | mtype = get_pageblock_migratetype(page); | ||
| 513 | |||
| 514 | count[mtype]++; | ||
| 515 | } | ||
| 516 | |||
| 517 | /* Print counts */ | ||
| 518 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 519 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
| 520 | seq_printf(m, "%12lu ", count[mtype]); | ||
| 521 | seq_putc(m, '\n'); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* Print out the number of pageblocks for each migratetype */ | ||
| 525 | static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | ||
| 526 | { | ||
| 527 | int mtype; | ||
| 528 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 529 | |||
| 530 | seq_printf(m, "\n%-23s", "Number of blocks type "); | ||
| 531 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
| 532 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
| 533 | seq_putc(m, '\n'); | ||
| 534 | walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); | ||
| 535 | |||
| 536 | return 0; | ||
| 537 | } | ||
| 538 | |||
| 539 | /* | ||
| 540 | * This prints out statistics in relation to grouping pages by mobility. | ||
| 541 | * It is expensive to collect so do not constantly read the file. | ||
| 542 | */ | ||
| 543 | static int pagetypeinfo_show(struct seq_file *m, void *arg) | ||
| 544 | { | ||
| 545 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 546 | |||
| 547 | seq_printf(m, "Page block order: %d\n", pageblock_order); | ||
| 548 | seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); | ||
| 549 | seq_putc(m, '\n'); | ||
| 550 | pagetypeinfo_showfree(m, pgdat); | ||
| 551 | pagetypeinfo_showblockcount(m, pgdat); | ||
| 552 | |||
| 447 | return 0; | 553 | return 0; |
| 448 | } | 554 | } |
| 449 | 555 | ||
| @@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = { | |||
| 454 | .show = frag_show, | 560 | .show = frag_show, |
| 455 | }; | 561 | }; |
| 456 | 562 | ||
| 563 | const struct seq_operations pagetypeinfo_op = { | ||
| 564 | .start = frag_start, | ||
| 565 | .next = frag_next, | ||
| 566 | .stop = frag_stop, | ||
| 567 | .show = pagetypeinfo_show, | ||
| 568 | }; | ||
| 569 | |||
| 457 | #ifdef CONFIG_ZONE_DMA | 570 | #ifdef CONFIG_ZONE_DMA |
| 458 | #define TEXT_FOR_DMA(xx) xx "_dma", | 571 | #define TEXT_FOR_DMA(xx) xx "_dma", |
| 459 | #else | 572 | #else |
| @@ -532,84 +645,78 @@ static const char * const vmstat_text[] = { | |||
| 532 | #endif | 645 | #endif |
| 533 | }; | 646 | }; |
| 534 | 647 | ||
| 535 | /* | 648 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
| 536 | * Output information about zones in @pgdat. | 649 | struct zone *zone) |
| 537 | */ | ||
| 538 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
| 539 | { | 650 | { |
| 540 | pg_data_t *pgdat = arg; | 651 | int i; |
| 541 | struct zone *zone; | 652 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); |
| 542 | struct zone *node_zones = pgdat->node_zones; | 653 | seq_printf(m, |
| 543 | unsigned long flags; | 654 | "\n pages free %lu" |
| 544 | 655 | "\n min %lu" | |
| 545 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 656 | "\n low %lu" |
| 546 | int i; | 657 | "\n high %lu" |
| 547 | 658 | "\n scanned %lu (a: %lu i: %lu)" | |
| 548 | if (!populated_zone(zone)) | 659 | "\n spanned %lu" |
| 549 | continue; | 660 | "\n present %lu", |
| 550 | 661 | zone_page_state(zone, NR_FREE_PAGES), | |
| 551 | spin_lock_irqsave(&zone->lock, flags); | 662 | zone->pages_min, |
| 552 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 663 | zone->pages_low, |
| 553 | seq_printf(m, | 664 | zone->pages_high, |
| 554 | "\n pages free %lu" | 665 | zone->pages_scanned, |
| 555 | "\n min %lu" | 666 | zone->nr_scan_active, zone->nr_scan_inactive, |
| 556 | "\n low %lu" | 667 | zone->spanned_pages, |
| 557 | "\n high %lu" | 668 | zone->present_pages); |
| 558 | "\n scanned %lu (a: %lu i: %lu)" | ||
| 559 | "\n spanned %lu" | ||
| 560 | "\n present %lu", | ||
| 561 | zone_page_state(zone, NR_FREE_PAGES), | ||
| 562 | zone->pages_min, | ||
| 563 | zone->pages_low, | ||
| 564 | zone->pages_high, | ||
| 565 | zone->pages_scanned, | ||
| 566 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
| 567 | zone->spanned_pages, | ||
| 568 | zone->present_pages); | ||
| 569 | 669 | ||
| 570 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 670 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
| 571 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 671 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
| 572 | zone_page_state(zone, i)); | 672 | zone_page_state(zone, i)); |
| 573 | 673 | ||
| 574 | seq_printf(m, | 674 | seq_printf(m, |
| 575 | "\n protection: (%lu", | 675 | "\n protection: (%lu", |
| 576 | zone->lowmem_reserve[0]); | 676 | zone->lowmem_reserve[0]); |
| 577 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 677 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
| 578 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 678 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); |
| 579 | seq_printf(m, | 679 | seq_printf(m, |
| 580 | ")" | 680 | ")" |
| 581 | "\n pagesets"); | 681 | "\n pagesets"); |
| 582 | for_each_online_cpu(i) { | 682 | for_each_online_cpu(i) { |
| 583 | struct per_cpu_pageset *pageset; | 683 | struct per_cpu_pageset *pageset; |
| 584 | int j; | 684 | int j; |
| 585 | 685 | ||
| 586 | pageset = zone_pcp(zone, i); | 686 | pageset = zone_pcp(zone, i); |
| 587 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 687 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { |
| 588 | seq_printf(m, | 688 | seq_printf(m, |
| 589 | "\n cpu: %i pcp: %i" | 689 | "\n cpu: %i pcp: %i" |
| 590 | "\n count: %i" | 690 | "\n count: %i" |
| 591 | "\n high: %i" | 691 | "\n high: %i" |
| 592 | "\n batch: %i", | 692 | "\n batch: %i", |
| 593 | i, j, | 693 | i, j, |
| 594 | pageset->pcp[j].count, | 694 | pageset->pcp[j].count, |
| 595 | pageset->pcp[j].high, | 695 | pageset->pcp[j].high, |
| 596 | pageset->pcp[j].batch); | 696 | pageset->pcp[j].batch); |
| 597 | } | 697 | } |
| 598 | #ifdef CONFIG_SMP | 698 | #ifdef CONFIG_SMP |
| 599 | seq_printf(m, "\n vm stats threshold: %d", | 699 | seq_printf(m, "\n vm stats threshold: %d", |
| 600 | pageset->stat_threshold); | 700 | pageset->stat_threshold); |
| 601 | #endif | 701 | #endif |
| 602 | } | ||
| 603 | seq_printf(m, | ||
| 604 | "\n all_unreclaimable: %u" | ||
| 605 | "\n prev_priority: %i" | ||
| 606 | "\n start_pfn: %lu", | ||
| 607 | zone->all_unreclaimable, | ||
| 608 | zone->prev_priority, | ||
| 609 | zone->zone_start_pfn); | ||
| 610 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 611 | seq_putc(m, '\n'); | ||
| 612 | } | 702 | } |
| 703 | seq_printf(m, | ||
| 704 | "\n all_unreclaimable: %u" | ||
| 705 | "\n prev_priority: %i" | ||
| 706 | "\n start_pfn: %lu", | ||
| 707 | zone->all_unreclaimable, | ||
| 708 | zone->prev_priority, | ||
| 709 | zone->zone_start_pfn); | ||
| 710 | seq_putc(m, '\n'); | ||
| 711 | } | ||
| 712 | |||
| 713 | /* | ||
| 714 | * Output information about zones in @pgdat. | ||
| 715 | */ | ||
| 716 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
| 717 | { | ||
| 718 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 719 | walk_zones_in_node(m, pgdat, zoneinfo_show_print); | ||
| 613 | return 0; | 720 | return 0; |
| 614 | } | 721 | } |
| 615 | 722 | ||
| @@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
| 741 | static struct notifier_block __cpuinitdata vmstat_notifier = | 848 | static struct notifier_block __cpuinitdata vmstat_notifier = |
| 742 | { &vmstat_cpuup_callback, NULL, 0 }; | 849 | { &vmstat_cpuup_callback, NULL, 0 }; |
| 743 | 850 | ||
| 744 | int __init setup_vmstat(void) | 851 | static int __init setup_vmstat(void) |
| 745 | { | 852 | { |
| 746 | int cpu; | 853 | int cpu; |
| 747 | 854 | ||
