Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig           |  18
-rw-r--r--   mm/Makefile          |   3
-rw-r--r--   mm/filemap.c         | 766
-rw-r--r--   mm/filemap.h         | 103
-rw-r--r--   mm/filemap_xip.c     |  17
-rw-r--r--   mm/hugetlb.c         | 398
-rw-r--r--   mm/internal.h        |  10
-rw-r--r--   mm/memory.c          | 161
-rw-r--r--   mm/memory_hotplug.c  | 312
-rw-r--r--   mm/mempolicy.c       |  60
-rw-r--r--   mm/migrate.c         |   4
-rw-r--r--   mm/mprotect.c        |   1
-rw-r--r--   mm/oom_kill.c        |   9
-rw-r--r--   mm/page-writeback.c  |  10
-rw-r--r--   mm/page_alloc.c      | 731
-rw-r--r--   mm/page_isolation.c  | 138
-rw-r--r--   mm/readahead.c       |  88
-rw-r--r--   mm/rmap.c            |   1
-rw-r--r--   mm/shmem.c           |  62
-rw-r--r--   mm/slab.c            |  21
-rw-r--r--   mm/slob.c            |   7
-rw-r--r--   mm/slub.c            | 490
-rw-r--r--   mm/sparse-vmemmap.c  | 148
-rw-r--r--   mm/sparse.c          | 105
-rw-r--r--   mm/swap.c            | 106
-rw-r--r--   mm/swap_state.c      |   5
-rw-r--r--   mm/util.c            |   6
-rw-r--r--   mm/vmalloc.c         |   5
-rw-r--r--   mm/vmscan.c          |  59
-rw-r--r--   mm/vmstat.c          | 305
30 files changed, 3071 insertions, 1078 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..1cc6cada2bbf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME | |||
112 | def_bool y | 112 | def_bool y |
113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
114 | 114 | ||
115 | # | ||
116 | # SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page | ||
117 | # and page_to_pfn. The most efficient option where kernel virtual space is | ||
118 | # not under pressure. | ||
119 | # | ||
120 | config SPARSEMEM_VMEMMAP_ENABLE | ||
121 | def_bool n | ||
122 | |||
123 | config SPARSEMEM_VMEMMAP | ||
124 | bool | ||
125 | depends on SPARSEMEM | ||
126 | default y if (SPARSEMEM_VMEMMAP_ENABLE) | ||
127 | |||
115 | # eventually, we can have this option just 'select SPARSEMEM' | 128 | # eventually, we can have this option just 'select SPARSEMEM' |
116 | config MEMORY_HOTPLUG | 129 | config MEMORY_HOTPLUG |
117 | bool "Allow for memory hot-add" | 130 | bool "Allow for memory hot-add" |
@@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE | |||
126 | def_bool y | 139 | def_bool y |
127 | depends on SPARSEMEM && MEMORY_HOTPLUG | 140 | depends on SPARSEMEM && MEMORY_HOTPLUG |
128 | 141 | ||
142 | config MEMORY_HOTREMOVE | ||
143 | bool "Allow for memory hot remove" | ||
144 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | ||
145 | depends on MIGRATION | ||
146 | |||
129 | # Heavily threaded applications may benefit from splitting the mm-wide | 147 | # Heavily threaded applications may benefit from splitting the mm-wide |
130 | # page_table_lock, so that faults on different parts of the user address | 148 | # page_table_lock, so that faults on different parts of the user address |
131 | # space can be handled with less contention: split it at this NR_CPUS. | 149 | # space can be handled with less contention: split it at this NR_CPUS. |
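
The new SPARSEMEM_VMEMMAP help text above says a virtually mapped mem_map makes pfn_to_page and page_to_pfn cheap: the struct page array looks contiguous in kernel virtual space, so the conversion is plain pointer arithmetic instead of a per-section table lookup. A rough userspace sketch of the difference, with illustrative sizes and names rather than the kernel's real definitions:

/*
 * Userspace sketch (not kernel code): contrasts a vmemmap-style
 * pfn_to_page, which is pointer arithmetic into one virtually
 * contiguous array, with a classic sparsemem lookup that must first
 * find the section holding the pfn.  All names and sizes here are
 * illustrative.
 */
#include <stdio.h>

struct page { unsigned long flags; };

#define PAGES_PER_SECTION 64
#define NR_SECTIONS       4

/* classic sparsemem: each section carries its own mem_map fragment */
static struct page section_maps[NR_SECTIONS][PAGES_PER_SECTION];
static struct page *section_mem_map[NR_SECTIONS] = {
	section_maps[0], section_maps[1], section_maps[2], section_maps[3],
};

static struct page *sparse_pfn_to_page(unsigned long pfn)
{
	unsigned long section = pfn / PAGES_PER_SECTION;   /* table lookup */

	return section_mem_map[section] + pfn % PAGES_PER_SECTION;
}

/* vmemmap: the whole mem_map appears as one contiguous virtual array */
static struct page vmemmap_storage[NR_SECTIONS * PAGES_PER_SECTION];
static struct page *vmemmap = vmemmap_storage;

static struct page *vmemmap_pfn_to_page(unsigned long pfn)
{
	return vmemmap + pfn;		/* no lookup, just arithmetic */
}

static unsigned long vmemmap_page_to_pfn(struct page *page)
{
	return (unsigned long)(page - vmemmap);
}

int main(void)
{
	unsigned long pfn = 130;

	printf("sparse : %p\n", (void *)sparse_pfn_to_page(pfn));
	printf("vmemmap: %p (pfn back = %lu)\n",
	       (void *)vmemmap_pfn_to_page(pfn),
	       vmemmap_page_to_pfn(vmemmap_pfn_to_page(pfn)));
	return 0;
}
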
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | $(mmu-y) | 14 | page_isolation.o $(mmu-y) |
15 | 15 | ||
16 | obj-$(CONFIG_BOUNCE) += bounce.o | 16 | obj-$(CONFIG_BOUNCE) += bounce.o |
17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
19 | obj-$(CONFIG_NUMA) += mempolicy.o | 19 | obj-$(CONFIG_NUMA) += mempolicy.o |
20 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 20 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
21 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | ||
21 | obj-$(CONFIG_SHMEM) += shmem.o | 22 | obj-$(CONFIG_SHMEM) += shmem.o |
22 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 23 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
23 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 24 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..c6049e947cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@ | |||
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
33 | #include "filemap.h" | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
34 | #include "internal.h" | 34 | #include "internal.h" |
35 | 35 | ||
36 | /* | 36 | /* |
@@ -593,7 +593,7 @@ void fastcall __lock_page_nosync(struct page *page) | |||
593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
594 | * If yes, increment its refcount and return it; if no, return NULL. | 594 | * If yes, increment its refcount and return it; if no, return NULL. |
595 | */ | 595 | */ |
596 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 596 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) |
597 | { | 597 | { |
598 | struct page *page; | 598 | struct page *page; |
599 | 599 | ||
@@ -617,30 +617,31 @@ EXPORT_SYMBOL(find_get_page); | |||
617 | * Returns zero if the page was not present. find_lock_page() may sleep. | 617 | * Returns zero if the page was not present. find_lock_page() may sleep. |
618 | */ | 618 | */ |
619 | struct page *find_lock_page(struct address_space *mapping, | 619 | struct page *find_lock_page(struct address_space *mapping, |
620 | unsigned long offset) | 620 | pgoff_t offset) |
621 | { | 621 | { |
622 | struct page *page; | 622 | struct page *page; |
623 | 623 | ||
624 | read_lock_irq(&mapping->tree_lock); | ||
625 | repeat: | 624 | repeat: |
625 | read_lock_irq(&mapping->tree_lock); | ||
626 | page = radix_tree_lookup(&mapping->page_tree, offset); | 626 | page = radix_tree_lookup(&mapping->page_tree, offset); |
627 | if (page) { | 627 | if (page) { |
628 | page_cache_get(page); | 628 | page_cache_get(page); |
629 | if (TestSetPageLocked(page)) { | 629 | if (TestSetPageLocked(page)) { |
630 | read_unlock_irq(&mapping->tree_lock); | 630 | read_unlock_irq(&mapping->tree_lock); |
631 | __lock_page(page); | 631 | __lock_page(page); |
632 | read_lock_irq(&mapping->tree_lock); | ||
633 | 632 | ||
634 | /* Has the page been truncated while we slept? */ | 633 | /* Has the page been truncated while we slept? */ |
635 | if (unlikely(page->mapping != mapping || | 634 | if (unlikely(page->mapping != mapping)) { |
636 | page->index != offset)) { | ||
637 | unlock_page(page); | 635 | unlock_page(page); |
638 | page_cache_release(page); | 636 | page_cache_release(page); |
639 | goto repeat; | 637 | goto repeat; |
640 | } | 638 | } |
639 | VM_BUG_ON(page->index != offset); | ||
640 | goto out; | ||
641 | } | 641 | } |
642 | } | 642 | } |
643 | read_unlock_irq(&mapping->tree_lock); | 643 | read_unlock_irq(&mapping->tree_lock); |
644 | out: | ||
644 | return page; | 645 | return page; |
645 | } | 646 | } |
646 | EXPORT_SYMBOL(find_lock_page); | 647 | EXPORT_SYMBOL(find_lock_page); |
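
The reworked find_lock_page() above retakes the tree lock each time around the loop and, after sleeping in __lock_page(), only has to recheck page->mapping: if the page was truncated while we slept it no longer belongs to this mapping, so the lookup is retried. A compact userspace model of that look-up/lock/revalidate pattern; the one-slot "container" and all names are stand-ins, not kernel structures:

/*
 * Userspace model (hypothetical names) of the lookup/lock/revalidate
 * pattern in find_lock_page(): after sleeping for the object's lock,
 * the object may have been removed from the container, so its owner
 * pointer must be rechecked before it is returned.
 */
#include <stdio.h>

struct container;

struct object {
	struct container *owner;	/* cleared once "truncated" */
	int refcount;
	int locked;
};

struct container {
	struct object *slot;		/* toy one-entry "radix tree" */
};

/* Simulate another task truncating the object while we slept. */
static void simulate_truncate(struct container *c)
{
	if (c->slot) {
		c->slot->owner = NULL;
		c->slot = NULL;
	}
}

static struct object *find_lock_object(struct container *c, int racy)
{
	struct object *obj;

	for (;;) {
		obj = c->slot;			/* "radix_tree_lookup" */
		if (!obj)
			return NULL;
		obj->refcount++;		/* "page_cache_get" */
		obj->locked = 1;		/* pretend we slept for the lock */
		if (racy) {			/* truncation raced with us */
			simulate_truncate(c);
			racy = 0;
		}
		if (obj->owner == c)		/* still ours? then done */
			return obj;
		obj->locked = 0;		/* it was truncated: drop and retry */
		obj->refcount--;
	}
}

int main(void)
{
	struct object o = { NULL, 0, 0 };
	struct container c = { &o };

	o.owner = &c;
	printf("clean lookup: %p\n", (void *)find_lock_object(&c, 0));
	o.locked = 0;
	o.refcount--;			/* drop the reference again */
	printf("racy lookup:  %p\n", (void *)find_lock_object(&c, 1));
	return 0;
}
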
@@ -663,29 +664,24 @@ EXPORT_SYMBOL(find_lock_page); | |||
663 | * memory exhaustion. | 664 | * memory exhaustion. |
664 | */ | 665 | */ |
665 | struct page *find_or_create_page(struct address_space *mapping, | 666 | struct page *find_or_create_page(struct address_space *mapping, |
666 | unsigned long index, gfp_t gfp_mask) | 667 | pgoff_t index, gfp_t gfp_mask) |
667 | { | 668 | { |
668 | struct page *page, *cached_page = NULL; | 669 | struct page *page; |
669 | int err; | 670 | int err; |
670 | repeat: | 671 | repeat: |
671 | page = find_lock_page(mapping, index); | 672 | page = find_lock_page(mapping, index); |
672 | if (!page) { | 673 | if (!page) { |
673 | if (!cached_page) { | 674 | page = __page_cache_alloc(gfp_mask); |
674 | cached_page = | 675 | if (!page) |
675 | __page_cache_alloc(gfp_mask); | 676 | return NULL; |
676 | if (!cached_page) | 677 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); |
677 | return NULL; | 678 | if (unlikely(err)) { |
679 | page_cache_release(page); | ||
680 | page = NULL; | ||
681 | if (err == -EEXIST) | ||
682 | goto repeat; | ||
678 | } | 683 | } |
679 | err = add_to_page_cache_lru(cached_page, mapping, | ||
680 | index, gfp_mask); | ||
681 | if (!err) { | ||
682 | page = cached_page; | ||
683 | cached_page = NULL; | ||
684 | } else if (err == -EEXIST) | ||
685 | goto repeat; | ||
686 | } | 684 | } |
687 | if (cached_page) | ||
688 | page_cache_release(cached_page); | ||
689 | return page; | 685 | return page; |
690 | } | 686 | } |
691 | EXPORT_SYMBOL(find_or_create_page); | 687 | EXPORT_SYMBOL(find_or_create_page); |
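
find_or_create_page() now drops the cached_page bookkeeping: allocate a fresh page, try to insert it, and if add_to_page_cache_lru() reports -EEXIST someone else won the race, so the local copy is released and the lookup is retried. A toy userspace version of the same optimistic-insert flow, using a single-slot cache and illustrative names:

/*
 * Userspace model of the reworked find_or_create_page() flow:
 * look up, allocate a fresh object on a miss, try to insert it, and
 * on -EEXIST free ours and retry the lookup.  The "cache" below is a
 * toy single-slot table, not the kernel API.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void *slot;			/* toy one-entry page cache */

static void *cache_lookup(void)
{
	return slot;
}

static int cache_insert(void *obj)
{
	if (slot)
		return -EEXIST;		/* lost the race to another inserter */
	slot = obj;
	return 0;
}

static void *find_or_create(void)
{
	void *obj;
	int err;

	for (;;) {
		obj = cache_lookup();
		if (obj)
			return obj;	/* already there: done */
		obj = malloc(64);	/* stands in for page_cache_alloc */
		if (!obj)
			return NULL;
		err = cache_insert(obj);
		if (!err)
			return obj;	/* we inserted it */
		free(obj);		/* drop our copy */
		if (err != -EEXIST)
			return NULL;	/* real failure */
	}
}

int main(void)
{
	void *a = find_or_create();
	void *b = find_or_create();	/* second call finds the first one */

	printf("same object: %s\n", a == b ? "yes" : "no");
	free(a);
	return 0;
}
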
@@ -797,7 +793,7 @@ EXPORT_SYMBOL(find_get_pages_tag); | |||
797 | * and deadlock against the caller's locked page. | 793 | * and deadlock against the caller's locked page. |
798 | */ | 794 | */ |
799 | struct page * | 795 | struct page * |
800 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | 796 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) |
801 | { | 797 | { |
802 | struct page *page = find_get_page(mapping, index); | 798 | struct page *page = find_get_page(mapping, index); |
803 | 799 | ||
@@ -859,34 +855,29 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
859 | * It may be NULL. | 855 | * It may be NULL. |
860 | */ | 856 | */ |
861 | void do_generic_mapping_read(struct address_space *mapping, | 857 | void do_generic_mapping_read(struct address_space *mapping, |
862 | struct file_ra_state *_ra, | 858 | struct file_ra_state *ra, |
863 | struct file *filp, | 859 | struct file *filp, |
864 | loff_t *ppos, | 860 | loff_t *ppos, |
865 | read_descriptor_t *desc, | 861 | read_descriptor_t *desc, |
866 | read_actor_t actor) | 862 | read_actor_t actor) |
867 | { | 863 | { |
868 | struct inode *inode = mapping->host; | 864 | struct inode *inode = mapping->host; |
869 | unsigned long index; | 865 | pgoff_t index; |
870 | unsigned long offset; | 866 | pgoff_t last_index; |
871 | unsigned long last_index; | 867 | pgoff_t prev_index; |
872 | unsigned long next_index; | 868 | unsigned long offset; /* offset into pagecache page */ |
873 | unsigned long prev_index; | ||
874 | unsigned int prev_offset; | 869 | unsigned int prev_offset; |
875 | struct page *cached_page; | ||
876 | int error; | 870 | int error; |
877 | struct file_ra_state ra = *_ra; | ||
878 | 871 | ||
879 | cached_page = NULL; | ||
880 | index = *ppos >> PAGE_CACHE_SHIFT; | 872 | index = *ppos >> PAGE_CACHE_SHIFT; |
881 | next_index = index; | 873 | prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; |
882 | prev_index = ra.prev_index; | 874 | prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); |
883 | prev_offset = ra.prev_offset; | ||
884 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 875 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
885 | offset = *ppos & ~PAGE_CACHE_MASK; | 876 | offset = *ppos & ~PAGE_CACHE_MASK; |
886 | 877 | ||
887 | for (;;) { | 878 | for (;;) { |
888 | struct page *page; | 879 | struct page *page; |
889 | unsigned long end_index; | 880 | pgoff_t end_index; |
890 | loff_t isize; | 881 | loff_t isize; |
891 | unsigned long nr, ret; | 882 | unsigned long nr, ret; |
892 | 883 | ||
@@ -895,7 +886,7 @@ find_page: | |||
895 | page = find_get_page(mapping, index); | 886 | page = find_get_page(mapping, index); |
896 | if (!page) { | 887 | if (!page) { |
897 | page_cache_sync_readahead(mapping, | 888 | page_cache_sync_readahead(mapping, |
898 | &ra, filp, | 889 | ra, filp, |
899 | index, last_index - index); | 890 | index, last_index - index); |
900 | page = find_get_page(mapping, index); | 891 | page = find_get_page(mapping, index); |
901 | if (unlikely(page == NULL)) | 892 | if (unlikely(page == NULL)) |
@@ -903,7 +894,7 @@ find_page: | |||
903 | } | 894 | } |
904 | if (PageReadahead(page)) { | 895 | if (PageReadahead(page)) { |
905 | page_cache_async_readahead(mapping, | 896 | page_cache_async_readahead(mapping, |
906 | &ra, filp, page, | 897 | ra, filp, page, |
907 | index, last_index - index); | 898 | index, last_index - index); |
908 | } | 899 | } |
909 | if (!PageUptodate(page)) | 900 | if (!PageUptodate(page)) |
@@ -966,7 +957,6 @@ page_ok: | |||
966 | index += offset >> PAGE_CACHE_SHIFT; | 957 | index += offset >> PAGE_CACHE_SHIFT; |
967 | offset &= ~PAGE_CACHE_MASK; | 958 | offset &= ~PAGE_CACHE_MASK; |
968 | prev_offset = offset; | 959 | prev_offset = offset; |
969 | ra.prev_offset = offset; | ||
970 | 960 | ||
971 | page_cache_release(page); | 961 | page_cache_release(page); |
972 | if (ret == nr && desc->count) | 962 | if (ret == nr && desc->count) |
@@ -1015,7 +1005,7 @@ readpage: | |||
1015 | } | 1005 | } |
1016 | unlock_page(page); | 1006 | unlock_page(page); |
1017 | error = -EIO; | 1007 | error = -EIO; |
1018 | shrink_readahead_size_eio(filp, &ra); | 1008 | shrink_readahead_size_eio(filp, ra); |
1019 | goto readpage_error; | 1009 | goto readpage_error; |
1020 | } | 1010 | } |
1021 | unlock_page(page); | 1011 | unlock_page(page); |
@@ -1034,33 +1024,29 @@ no_cached_page: | |||
1034 | * Ok, it wasn't cached, so we need to create a new | 1024 | * Ok, it wasn't cached, so we need to create a new |
1035 | * page.. | 1025 | * page.. |
1036 | */ | 1026 | */ |
1037 | if (!cached_page) { | 1027 | page = page_cache_alloc_cold(mapping); |
1038 | cached_page = page_cache_alloc_cold(mapping); | 1028 | if (!page) { |
1039 | if (!cached_page) { | 1029 | desc->error = -ENOMEM; |
1040 | desc->error = -ENOMEM; | 1030 | goto out; |
1041 | goto out; | ||
1042 | } | ||
1043 | } | 1031 | } |
1044 | error = add_to_page_cache_lru(cached_page, mapping, | 1032 | error = add_to_page_cache_lru(page, mapping, |
1045 | index, GFP_KERNEL); | 1033 | index, GFP_KERNEL); |
1046 | if (error) { | 1034 | if (error) { |
1035 | page_cache_release(page); | ||
1047 | if (error == -EEXIST) | 1036 | if (error == -EEXIST) |
1048 | goto find_page; | 1037 | goto find_page; |
1049 | desc->error = error; | 1038 | desc->error = error; |
1050 | goto out; | 1039 | goto out; |
1051 | } | 1040 | } |
1052 | page = cached_page; | ||
1053 | cached_page = NULL; | ||
1054 | goto readpage; | 1041 | goto readpage; |
1055 | } | 1042 | } |
1056 | 1043 | ||
1057 | out: | 1044 | out: |
1058 | *_ra = ra; | 1045 | ra->prev_pos = prev_index; |
1059 | _ra->prev_index = prev_index; | 1046 | ra->prev_pos <<= PAGE_CACHE_SHIFT; |
1047 | ra->prev_pos |= prev_offset; | ||
1060 | 1048 | ||
1061 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; | 1049 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; |
1062 | if (cached_page) | ||
1063 | page_cache_release(cached_page); | ||
1064 | if (filp) | 1050 | if (filp) |
1065 | file_accessed(filp); | 1051 | file_accessed(filp); |
1066 | } | 1052 | } |
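
The readahead state now records one byte-granular position, prev_pos, instead of the old prev_index/prev_offset pair: it is rebuilt on exit as (index << PAGE_CACHE_SHIFT) | offset and split apart again at the top of the function. A minimal sketch of that encoding, assuming 4K pages purely for the demonstration:

/*
 * Sketch of the prev_pos bookkeeping above: one byte-granular value
 * replaces separate prev_index / prev_offset fields.  PAGE_SHIFT of
 * 12 (4K pages) is assumed here only for the demonstration.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long long encode_prev_pos(unsigned long index,
					  unsigned int offset)
{
	unsigned long long pos = index;

	pos <<= PAGE_SHIFT;	/* page index -> byte address of the page */
	pos |= offset;		/* plus the offset inside that page */
	return pos;
}

int main(void)
{
	unsigned long long prev_pos = encode_prev_pos(3, 0x123);

	/* the decode used at the top of do_generic_mapping_read() */
	printf("prev_index  = %llu\n", prev_pos >> PAGE_SHIFT);
	printf("prev_offset = %llu\n", prev_pos & (PAGE_SIZE - 1));
	return 0;
}
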
@@ -1220,7 +1206,7 @@ EXPORT_SYMBOL(generic_file_aio_read); | |||
1220 | 1206 | ||
1221 | static ssize_t | 1207 | static ssize_t |
1222 | do_readahead(struct address_space *mapping, struct file *filp, | 1208 | do_readahead(struct address_space *mapping, struct file *filp, |
1223 | unsigned long index, unsigned long nr) | 1209 | pgoff_t index, unsigned long nr) |
1224 | { | 1210 | { |
1225 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1211 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
1226 | return -EINVAL; | 1212 | return -EINVAL; |
@@ -1240,8 +1226,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1240 | if (file) { | 1226 | if (file) { |
1241 | if (file->f_mode & FMODE_READ) { | 1227 | if (file->f_mode & FMODE_READ) { |
1242 | struct address_space *mapping = file->f_mapping; | 1228 | struct address_space *mapping = file->f_mapping; |
1243 | unsigned long start = offset >> PAGE_CACHE_SHIFT; | 1229 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
1244 | unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 1230 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
1245 | unsigned long len = end - start + 1; | 1231 | unsigned long len = end - start + 1; |
1246 | ret = do_readahead(mapping, file, start, len); | 1232 | ret = do_readahead(mapping, file, start, len); |
1247 | } | 1233 | } |
@@ -1251,7 +1237,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1251 | } | 1237 | } |
1252 | 1238 | ||
1253 | #ifdef CONFIG_MMU | 1239 | #ifdef CONFIG_MMU |
1254 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
1255 | /** | 1240 | /** |
1256 | * page_cache_read - adds requested page to the page cache if not already there | 1241 | * page_cache_read - adds requested page to the page cache if not already there |
1257 | * @file: file to read | 1242 | * @file: file to read |
@@ -1260,7 +1245,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | |||
1260 | * This adds the requested page to the page cache if it isn't already there, | 1245 | * This adds the requested page to the page cache if it isn't already there, |
1261 | * and schedules an I/O to read in its contents from disk. | 1246 | * and schedules an I/O to read in its contents from disk. |
1262 | */ | 1247 | */ |
1263 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1248 | static int fastcall page_cache_read(struct file * file, pgoff_t offset) |
1264 | { | 1249 | { |
1265 | struct address_space *mapping = file->f_mapping; | 1250 | struct address_space *mapping = file->f_mapping; |
1266 | struct page *page; | 1251 | struct page *page; |
@@ -1349,7 +1334,7 @@ retry_find: | |||
1349 | * Do we miss much more than hit in this file? If so, | 1334 | * Do we miss much more than hit in this file? If so, |
1350 | * stop bothering with read-ahead. It will only hurt. | 1335 | * stop bothering with read-ahead. It will only hurt. |
1351 | */ | 1336 | */ |
1352 | if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) | 1337 | if (ra->mmap_miss > MMAP_LOTSAMISS) |
1353 | goto no_cached_page; | 1338 | goto no_cached_page; |
1354 | 1339 | ||
1355 | /* | 1340 | /* |
@@ -1375,7 +1360,7 @@ retry_find: | |||
1375 | } | 1360 | } |
1376 | 1361 | ||
1377 | if (!did_readaround) | 1362 | if (!did_readaround) |
1378 | ra->mmap_hit++; | 1363 | ra->mmap_miss--; |
1379 | 1364 | ||
1380 | /* | 1365 | /* |
1381 | * We have a locked page in the page cache, now we need to check | 1366 | * We have a locked page in the page cache, now we need to check |
@@ -1396,7 +1381,7 @@ retry_find: | |||
1396 | * Found the page and have a reference on it. | 1381 | * Found the page and have a reference on it. |
1397 | */ | 1382 | */ |
1398 | mark_page_accessed(page); | 1383 | mark_page_accessed(page); |
1399 | ra->prev_index = page->index; | 1384 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
1400 | vmf->page = page; | 1385 | vmf->page = page; |
1401 | return ret | VM_FAULT_LOCKED; | 1386 | return ret | VM_FAULT_LOCKED; |
1402 | 1387 | ||
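
The fault path now keeps a single mmap_miss counter: it is bumped on a pagecache miss (elsewhere in filemap.c), decremented on a hit, and readaround is skipped once it crosses MMAP_LOTSAMISS, instead of comparing separate hit and miss counters. A toy model of that heuristic; the threshold of 100 matches the kernel's define, the rest is illustrative:

/*
 * Toy model of the readaround heuristic: misses push a single counter
 * up, hits pull it back down, and readaround is disabled while the
 * counter sits above the threshold.
 */
#include <stdio.h>

#define MMAP_LOTSAMISS 100

struct ra_state { unsigned int mmap_miss; };

static void note_fault(struct ra_state *ra, int page_was_cached)
{
	if (page_was_cached) {
		if (ra->mmap_miss > 0)
			ra->mmap_miss--;	/* hits slowly forgive misses */
	} else {
		ra->mmap_miss++;
	}
}

static int want_readaround(const struct ra_state *ra)
{
	return ra->mmap_miss <= MMAP_LOTSAMISS;
}

int main(void)
{
	struct ra_state ra = { 0 };
	int i;

	for (i = 0; i < 150; i++)
		note_fault(&ra, 0);		/* a long run of misses */
	printf("readaround after misses: %d\n", want_readaround(&ra));

	for (i = 0; i < 80; i++)
		note_fault(&ra, 1);		/* hits pull it back down */
	printf("readaround after hits:   %d\n", want_readaround(&ra));
	return 0;
}
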
@@ -1501,39 +1486,32 @@ EXPORT_SYMBOL(generic_file_mmap); | |||
1501 | EXPORT_SYMBOL(generic_file_readonly_mmap); | 1486 | EXPORT_SYMBOL(generic_file_readonly_mmap); |
1502 | 1487 | ||
1503 | static struct page *__read_cache_page(struct address_space *mapping, | 1488 | static struct page *__read_cache_page(struct address_space *mapping, |
1504 | unsigned long index, | 1489 | pgoff_t index, |
1505 | int (*filler)(void *,struct page*), | 1490 | int (*filler)(void *,struct page*), |
1506 | void *data) | 1491 | void *data) |
1507 | { | 1492 | { |
1508 | struct page *page, *cached_page = NULL; | 1493 | struct page *page; |
1509 | int err; | 1494 | int err; |
1510 | repeat: | 1495 | repeat: |
1511 | page = find_get_page(mapping, index); | 1496 | page = find_get_page(mapping, index); |
1512 | if (!page) { | 1497 | if (!page) { |
1513 | if (!cached_page) { | 1498 | page = page_cache_alloc_cold(mapping); |
1514 | cached_page = page_cache_alloc_cold(mapping); | 1499 | if (!page) |
1515 | if (!cached_page) | 1500 | return ERR_PTR(-ENOMEM); |
1516 | return ERR_PTR(-ENOMEM); | 1501 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
1517 | } | 1502 | if (unlikely(err)) { |
1518 | err = add_to_page_cache_lru(cached_page, mapping, | 1503 | page_cache_release(page); |
1519 | index, GFP_KERNEL); | 1504 | if (err == -EEXIST) |
1520 | if (err == -EEXIST) | 1505 | goto repeat; |
1521 | goto repeat; | ||
1522 | if (err < 0) { | ||
1523 | /* Presumably ENOMEM for radix tree node */ | 1506 | /* Presumably ENOMEM for radix tree node */ |
1524 | page_cache_release(cached_page); | ||
1525 | return ERR_PTR(err); | 1507 | return ERR_PTR(err); |
1526 | } | 1508 | } |
1527 | page = cached_page; | ||
1528 | cached_page = NULL; | ||
1529 | err = filler(data, page); | 1509 | err = filler(data, page); |
1530 | if (err < 0) { | 1510 | if (err < 0) { |
1531 | page_cache_release(page); | 1511 | page_cache_release(page); |
1532 | page = ERR_PTR(err); | 1512 | page = ERR_PTR(err); |
1533 | } | 1513 | } |
1534 | } | 1514 | } |
1535 | if (cached_page) | ||
1536 | page_cache_release(cached_page); | ||
1537 | return page; | 1515 | return page; |
1538 | } | 1516 | } |
1539 | 1517 | ||
@@ -1542,7 +1520,7 @@ repeat: | |||
1542 | * after submitting it to the filler. | 1520 | * after submitting it to the filler. |
1543 | */ | 1521 | */ |
1544 | struct page *read_cache_page_async(struct address_space *mapping, | 1522 | struct page *read_cache_page_async(struct address_space *mapping, |
1545 | unsigned long index, | 1523 | pgoff_t index, |
1546 | int (*filler)(void *,struct page*), | 1524 | int (*filler)(void *,struct page*), |
1547 | void *data) | 1525 | void *data) |
1548 | { | 1526 | { |
@@ -1590,7 +1568,7 @@ EXPORT_SYMBOL(read_cache_page_async); | |||
1590 | * If the page does not get brought uptodate, return -EIO. | 1568 | * If the page does not get brought uptodate, return -EIO. |
1591 | */ | 1569 | */ |
1592 | struct page *read_cache_page(struct address_space *mapping, | 1570 | struct page *read_cache_page(struct address_space *mapping, |
1593 | unsigned long index, | 1571 | pgoff_t index, |
1594 | int (*filler)(void *,struct page*), | 1572 | int (*filler)(void *,struct page*), |
1595 | void *data) | 1573 | void *data) |
1596 | { | 1574 | { |
@@ -1610,40 +1588,6 @@ struct page *read_cache_page(struct address_space *mapping, | |||
1610 | EXPORT_SYMBOL(read_cache_page); | 1588 | EXPORT_SYMBOL(read_cache_page); |
1611 | 1589 | ||
1612 | /* | 1590 | /* |
1613 | * If the page was newly created, increment its refcount and add it to the | ||
1614 | * caller's lru-buffering pagevec. This function is specifically for | ||
1615 | * generic_file_write(). | ||
1616 | */ | ||
1617 | static inline struct page * | ||
1618 | __grab_cache_page(struct address_space *mapping, unsigned long index, | ||
1619 | struct page **cached_page, struct pagevec *lru_pvec) | ||
1620 | { | ||
1621 | int err; | ||
1622 | struct page *page; | ||
1623 | repeat: | ||
1624 | page = find_lock_page(mapping, index); | ||
1625 | if (!page) { | ||
1626 | if (!*cached_page) { | ||
1627 | *cached_page = page_cache_alloc(mapping); | ||
1628 | if (!*cached_page) | ||
1629 | return NULL; | ||
1630 | } | ||
1631 | err = add_to_page_cache(*cached_page, mapping, | ||
1632 | index, GFP_KERNEL); | ||
1633 | if (err == -EEXIST) | ||
1634 | goto repeat; | ||
1635 | if (err == 0) { | ||
1636 | page = *cached_page; | ||
1637 | page_cache_get(page); | ||
1638 | if (!pagevec_add(lru_pvec, page)) | ||
1639 | __pagevec_lru_add(lru_pvec); | ||
1640 | *cached_page = NULL; | ||
1641 | } | ||
1642 | } | ||
1643 | return page; | ||
1644 | } | ||
1645 | |||
1646 | /* | ||
1647 | * The logic we want is | 1591 | * The logic we want is |
1648 | * | 1592 | * |
1649 | * if suid or (sgid and xgrp) | 1593 | * if suid or (sgid and xgrp) |
@@ -1691,8 +1635,7 @@ int remove_suid(struct dentry *dentry) | |||
1691 | } | 1635 | } |
1692 | EXPORT_SYMBOL(remove_suid); | 1636 | EXPORT_SYMBOL(remove_suid); |
1693 | 1637 | ||
1694 | size_t | 1638 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
1695 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
1696 | const struct iovec *iov, size_t base, size_t bytes) | 1639 | const struct iovec *iov, size_t base, size_t bytes) |
1697 | { | 1640 | { |
1698 | size_t copied = 0, left = 0; | 1641 | size_t copied = 0, left = 0; |
@@ -1715,6 +1658,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, | |||
1715 | } | 1658 | } |
1716 | 1659 | ||
1717 | /* | 1660 | /* |
1661 | * Copy as much as we can into the page and return the number of bytes which | ||
1662 | * were successfully copied. If a fault is encountered then return the number of | ||
1663 | * bytes which were copied. | ||
1664 | */ | ||
1665 | size_t iov_iter_copy_from_user_atomic(struct page *page, | ||
1666 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
1667 | { | ||
1668 | char *kaddr; | ||
1669 | size_t copied; | ||
1670 | |||
1671 | BUG_ON(!in_atomic()); | ||
1672 | kaddr = kmap_atomic(page, KM_USER0); | ||
1673 | if (likely(i->nr_segs == 1)) { | ||
1674 | int left; | ||
1675 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
1676 | left = __copy_from_user_inatomic_nocache(kaddr + offset, | ||
1677 | buf, bytes); | ||
1678 | copied = bytes - left; | ||
1679 | } else { | ||
1680 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
1681 | i->iov, i->iov_offset, bytes); | ||
1682 | } | ||
1683 | kunmap_atomic(kaddr, KM_USER0); | ||
1684 | |||
1685 | return copied; | ||
1686 | } | ||
1687 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); | ||
1688 | |||
1689 | /* | ||
1690 | * This has the same side effects and return value as | ||
1691 | * iov_iter_copy_from_user_atomic(). | ||
1692 | * The difference is that it attempts to resolve faults. | ||
1693 | * Page must not be locked. | ||
1694 | */ | ||
1695 | size_t iov_iter_copy_from_user(struct page *page, | ||
1696 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
1697 | { | ||
1698 | char *kaddr; | ||
1699 | size_t copied; | ||
1700 | |||
1701 | kaddr = kmap(page); | ||
1702 | if (likely(i->nr_segs == 1)) { | ||
1703 | int left; | ||
1704 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
1705 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
1706 | copied = bytes - left; | ||
1707 | } else { | ||
1708 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
1709 | i->iov, i->iov_offset, bytes); | ||
1710 | } | ||
1711 | kunmap(page); | ||
1712 | return copied; | ||
1713 | } | ||
1714 | EXPORT_SYMBOL(iov_iter_copy_from_user); | ||
1715 | |||
1716 | static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) | ||
1717 | { | ||
1718 | if (likely(i->nr_segs == 1)) { | ||
1719 | i->iov_offset += bytes; | ||
1720 | } else { | ||
1721 | const struct iovec *iov = i->iov; | ||
1722 | size_t base = i->iov_offset; | ||
1723 | |||
1724 | while (bytes) { | ||
1725 | int copy = min(bytes, iov->iov_len - base); | ||
1726 | |||
1727 | bytes -= copy; | ||
1728 | base += copy; | ||
1729 | if (iov->iov_len == base) { | ||
1730 | iov++; | ||
1731 | base = 0; | ||
1732 | } | ||
1733 | } | ||
1734 | i->iov = iov; | ||
1735 | i->iov_offset = base; | ||
1736 | } | ||
1737 | } | ||
1738 | |||
1739 | void iov_iter_advance(struct iov_iter *i, size_t bytes) | ||
1740 | { | ||
1741 | BUG_ON(i->count < bytes); | ||
1742 | |||
1743 | __iov_iter_advance_iov(i, bytes); | ||
1744 | i->count -= bytes; | ||
1745 | } | ||
1746 | EXPORT_SYMBOL(iov_iter_advance); | ||
1747 | |||
1748 | /* | ||
1749 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
1750 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
1751 | * accessed (ie. because it is an invalid address). | ||
1752 | * | ||
1753 | * writev-intensive code may want this to prefault several iovecs -- that | ||
1754 | * would be possible (callers must not rely on the fact that _only_ the | ||
1755 | * first iovec will be faulted with the current implementation). | ||
1756 | */ | ||
1757 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
1758 | { | ||
1759 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
1760 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); | ||
1761 | return fault_in_pages_readable(buf, bytes); | ||
1762 | } | ||
1763 | EXPORT_SYMBOL(iov_iter_fault_in_readable); | ||
1764 | |||
1765 | /* | ||
1766 | * Return the count of just the current iov_iter segment. | ||
1767 | */ | ||
1768 | size_t iov_iter_single_seg_count(struct iov_iter *i) | ||
1769 | { | ||
1770 | const struct iovec *iov = i->iov; | ||
1771 | if (i->nr_segs == 1) | ||
1772 | return i->count; | ||
1773 | else | ||
1774 | return min(i->count, iov->iov_len - i->iov_offset); | ||
1775 | } | ||
1776 | EXPORT_SYMBOL(iov_iter_single_seg_count); | ||
1777 | |||
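
The iov_iter helpers above keep a cursor of (iov, iov_offset) into the current segment plus a total byte count. The following standalone program mirrors the advance and single-segment-count logic from the patch so the segment walking can be run and inspected in userspace:

/*
 * Standalone illustration of the iov_iter cursor: advance consumes
 * bytes across segments, single_seg_count reports how much of the
 * current segment is still usable.
 */
#include <stdio.h>
#include <sys/uio.h>

struct iov_iter {
	const struct iovec *iov;
	unsigned long nr_segs;
	size_t iov_offset;
	size_t count;
};

static void iov_iter_advance(struct iov_iter *i, size_t bytes)
{
	const struct iovec *iov = i->iov;
	size_t base = i->iov_offset;

	i->count -= bytes;
	while (bytes) {
		size_t copy = iov->iov_len - base;

		if (copy > bytes)
			copy = bytes;
		bytes -= copy;
		base += copy;
		if (base == iov->iov_len) {	/* segment exhausted */
			iov++;
			base = 0;
		}
	}
	i->iov = iov;
	i->iov_offset = base;
}

static size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	size_t seg = i->iov->iov_len - i->iov_offset;

	return seg < i->count ? seg : i->count;
}

int main(void)
{
	char a[5], b[3], c[8];
	struct iovec iov[3] = {
		{ a, sizeof(a) }, { b, sizeof(b) }, { c, sizeof(c) },
	};
	struct iov_iter i = { iov, 3, 0, sizeof(a) + sizeof(b) + sizeof(c) };

	iov_iter_advance(&i, 6);	/* consume all of a[] plus 1 byte of b[] */
	printf("remaining total %zu, in current segment %zu\n",
	       i.count, iov_iter_single_seg_count(&i));
	return 0;
}
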
1778 | /* | ||
1718 | * Performs necessary checks before doing a write | 1779 | * Performs necessary checks before doing a write |
1719 | * | 1780 | * |
1720 | * Can adjust writing position or amount of bytes to write. | 1781 | * Can adjust writing position or amount of bytes to write. |
@@ -1796,6 +1857,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
1796 | } | 1857 | } |
1797 | EXPORT_SYMBOL(generic_write_checks); | 1858 | EXPORT_SYMBOL(generic_write_checks); |
1798 | 1859 | ||
1860 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | ||
1861 | loff_t pos, unsigned len, unsigned flags, | ||
1862 | struct page **pagep, void **fsdata) | ||
1863 | { | ||
1864 | const struct address_space_operations *aops = mapping->a_ops; | ||
1865 | |||
1866 | if (aops->write_begin) { | ||
1867 | return aops->write_begin(file, mapping, pos, len, flags, | ||
1868 | pagep, fsdata); | ||
1869 | } else { | ||
1870 | int ret; | ||
1871 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
1872 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1873 | struct inode *inode = mapping->host; | ||
1874 | struct page *page; | ||
1875 | again: | ||
1876 | page = __grab_cache_page(mapping, index); | ||
1877 | *pagep = page; | ||
1878 | if (!page) | ||
1879 | return -ENOMEM; | ||
1880 | |||
1881 | if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { | ||
1882 | /* | ||
1883 | * There is no way to resolve a short write situation | ||
1884 | * for a !Uptodate page (except by double copying in | ||
1885 | * the caller done by generic_perform_write_2copy). | ||
1886 | * | ||
1887 | * Instead, we have to bring it uptodate here. | ||
1888 | */ | ||
1889 | ret = aops->readpage(file, page); | ||
1890 | page_cache_release(page); | ||
1891 | if (ret) { | ||
1892 | if (ret == AOP_TRUNCATED_PAGE) | ||
1893 | goto again; | ||
1894 | return ret; | ||
1895 | } | ||
1896 | goto again; | ||
1897 | } | ||
1898 | |||
1899 | ret = aops->prepare_write(file, page, offset, offset+len); | ||
1900 | if (ret) { | ||
1901 | unlock_page(page); | ||
1902 | page_cache_release(page); | ||
1903 | if (pos + len > inode->i_size) | ||
1904 | vmtruncate(inode, inode->i_size); | ||
1905 | } | ||
1906 | return ret; | ||
1907 | } | ||
1908 | } | ||
1909 | EXPORT_SYMBOL(pagecache_write_begin); | ||
1910 | |||
1911 | int pagecache_write_end(struct file *file, struct address_space *mapping, | ||
1912 | loff_t pos, unsigned len, unsigned copied, | ||
1913 | struct page *page, void *fsdata) | ||
1914 | { | ||
1915 | const struct address_space_operations *aops = mapping->a_ops; | ||
1916 | int ret; | ||
1917 | |||
1918 | if (aops->write_end) { | ||
1919 | mark_page_accessed(page); | ||
1920 | ret = aops->write_end(file, mapping, pos, len, copied, | ||
1921 | page, fsdata); | ||
1922 | } else { | ||
1923 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1924 | struct inode *inode = mapping->host; | ||
1925 | |||
1926 | flush_dcache_page(page); | ||
1927 | ret = aops->commit_write(file, page, offset, offset+len); | ||
1928 | unlock_page(page); | ||
1929 | mark_page_accessed(page); | ||
1930 | page_cache_release(page); | ||
1931 | |||
1932 | if (ret < 0) { | ||
1933 | if (pos + len > inode->i_size) | ||
1934 | vmtruncate(inode, inode->i_size); | ||
1935 | } else if (ret > 0) | ||
1936 | ret = min_t(size_t, copied, ret); | ||
1937 | else | ||
1938 | ret = copied; | ||
1939 | } | ||
1940 | |||
1941 | return ret; | ||
1942 | } | ||
1943 | EXPORT_SYMBOL(pagecache_write_end); | ||
1944 | |||
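
pagecache_write_begin()/pagecache_write_end() give callers one entry point: use the filesystem's new ->write_begin/->write_end pair when it exists, otherwise emulate it on top of the legacy prepare_write/commit_write pair. Reduced to its dispatch shape, and with a toy ops structure standing in for address_space_operations, the idea looks like this:

/*
 * Toy dispatch sketch: prefer the new begin/end pair when the ops
 * table provides it, otherwise fall back to the legacy pair.  The
 * ops structure and flat buffer are stand-ins, not the kernel API.
 */
#include <stdio.h>
#include <string.h>

struct toy_aops {
	/* new-style: hand back a buffer, then accept the copied length */
	int (*write_begin)(char **buf);
	int (*write_end)(char *buf, size_t copied);
	/* legacy pair, used only when write_begin is NULL */
	int (*prepare_write)(char **buf);
	int (*commit_write)(char *buf, size_t copied);
};

static char backing[64];

static int legacy_prepare(char **buf)
{
	*buf = backing;
	return 0;
}

static int legacy_commit(char *buf, size_t copied)
{
	printf("legacy commit of %zu bytes: %.*s\n", copied, (int)copied, buf);
	return 0;
}

static int toy_write(const struct toy_aops *aops, const char *src, size_t len)
{
	char *buf;
	int err;

	if (aops->write_begin)
		err = aops->write_begin(&buf);
	else
		err = aops->prepare_write(&buf);
	if (err)
		return err;

	memcpy(buf, src, len);			/* the actual data copy */

	if (aops->write_end)
		return aops->write_end(buf, len);
	return aops->commit_write(buf, len);
}

int main(void)
{
	struct toy_aops legacy_only = {
		.prepare_write = legacy_prepare,
		.commit_write = legacy_commit,
	};

	return toy_write(&legacy_only, "hello", 5);
}
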
1799 | ssize_t | 1945 | ssize_t |
1800 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 1946 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
1801 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 1947 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
@@ -1835,151 +1981,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
1835 | } | 1981 | } |
1836 | EXPORT_SYMBOL(generic_file_direct_write); | 1982 | EXPORT_SYMBOL(generic_file_direct_write); |
1837 | 1983 | ||
1838 | ssize_t | 1984 | /* |
1839 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 1985 | * Find or create a page at the given pagecache position. Return the locked |
1840 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | 1986 | * page. This function is specifically for buffered writes. |
1841 | size_t count, ssize_t written) | 1987 | */ |
1988 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) | ||
1842 | { | 1989 | { |
1843 | struct file *file = iocb->ki_filp; | 1990 | int status; |
1844 | struct address_space * mapping = file->f_mapping; | 1991 | struct page *page; |
1845 | const struct address_space_operations *a_ops = mapping->a_ops; | 1992 | repeat: |
1846 | struct inode *inode = mapping->host; | 1993 | page = find_lock_page(mapping, index); |
1847 | long status = 0; | 1994 | if (likely(page)) |
1848 | struct page *page; | 1995 | return page; |
1849 | struct page *cached_page = NULL; | ||
1850 | size_t bytes; | ||
1851 | struct pagevec lru_pvec; | ||
1852 | const struct iovec *cur_iov = iov; /* current iovec */ | ||
1853 | size_t iov_base = 0; /* offset in the current iovec */ | ||
1854 | char __user *buf; | ||
1855 | |||
1856 | pagevec_init(&lru_pvec, 0); | ||
1857 | 1996 | ||
1858 | /* | 1997 | page = page_cache_alloc(mapping); |
1859 | * handle partial DIO write. Adjust cur_iov if needed. | 1998 | if (!page) |
1860 | */ | 1999 | return NULL; |
1861 | if (likely(nr_segs == 1)) | 2000 | status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
1862 | buf = iov->iov_base + written; | 2001 | if (unlikely(status)) { |
1863 | else { | 2002 | page_cache_release(page); |
1864 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | 2003 | if (status == -EEXIST) |
1865 | buf = cur_iov->iov_base + iov_base; | 2004 | goto repeat; |
2005 | return NULL; | ||
1866 | } | 2006 | } |
2007 | return page; | ||
2008 | } | ||
2009 | EXPORT_SYMBOL(__grab_cache_page); | ||
2010 | |||
2011 | static ssize_t generic_perform_write_2copy(struct file *file, | ||
2012 | struct iov_iter *i, loff_t pos) | ||
2013 | { | ||
2014 | struct address_space *mapping = file->f_mapping; | ||
2015 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2016 | struct inode *inode = mapping->host; | ||
2017 | long status = 0; | ||
2018 | ssize_t written = 0; | ||
1867 | 2019 | ||
1868 | do { | 2020 | do { |
1869 | unsigned long index; | 2021 | struct page *src_page; |
1870 | unsigned long offset; | 2022 | struct page *page; |
1871 | size_t copied; | 2023 | pgoff_t index; /* Pagecache index for current page */ |
2024 | unsigned long offset; /* Offset into pagecache page */ | ||
2025 | unsigned long bytes; /* Bytes to write to page */ | ||
2026 | size_t copied; /* Bytes copied from user */ | ||
1872 | 2027 | ||
1873 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2028 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
1874 | index = pos >> PAGE_CACHE_SHIFT; | 2029 | index = pos >> PAGE_CACHE_SHIFT; |
1875 | bytes = PAGE_CACHE_SIZE - offset; | 2030 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
1876 | 2031 | iov_iter_count(i)); | |
1877 | /* Limit the size of the copy to the caller's write size */ | ||
1878 | bytes = min(bytes, count); | ||
1879 | 2032 | ||
1880 | /* We only need to worry about prefaulting when writes are from | 2033 | /* |
1881 | * user-space. NFSd uses vfs_writev with several non-aligned | 2034 | * a non-NULL src_page indicates that we're doing the |
1882 | * segments in the vector, and limiting to one segment a time is | 2035 | * copy via get_user_pages and kmap. |
1883 | * a noticeable performance for re-write | ||
1884 | */ | 2036 | */ |
1885 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 2037 | src_page = NULL; |
1886 | /* | ||
1887 | * Limit the size of the copy to that of the current | ||
1888 | * segment, because fault_in_pages_readable() doesn't | ||
1889 | * know how to walk segments. | ||
1890 | */ | ||
1891 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
1892 | 2038 | ||
1893 | /* | 2039 | /* |
1894 | * Bring in the user page that we will copy from | 2040 | * Bring in the user page that we will copy from _first_. |
1895 | * _first_. Otherwise there's a nasty deadlock on | 2041 | * Otherwise there's a nasty deadlock on copying from the |
1896 | * copying from the same page as we're writing to, | 2042 | * same page as we're writing to, without it being marked |
1897 | * without it being marked up-to-date. | 2043 | * up-to-date. |
1898 | */ | 2044 | * |
1899 | fault_in_pages_readable(buf, bytes); | 2045 | * Not only is this an optimisation, but it is also required |
2046 | * to check that the address is actually valid, when atomic | ||
2047 | * usercopies are used, below. | ||
2048 | */ | ||
2049 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
2050 | status = -EFAULT; | ||
2051 | break; | ||
1900 | } | 2052 | } |
1901 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2053 | |
2054 | page = __grab_cache_page(mapping, index); | ||
1902 | if (!page) { | 2055 | if (!page) { |
1903 | status = -ENOMEM; | 2056 | status = -ENOMEM; |
1904 | break; | 2057 | break; |
1905 | } | 2058 | } |
1906 | 2059 | ||
1907 | if (unlikely(bytes == 0)) { | 2060 | /* |
1908 | status = 0; | 2061 | * non-uptodate pages cannot cope with short copies, and we |
1909 | copied = 0; | 2062 | * cannot take a pagefault with the destination page locked. |
1910 | goto zero_length_segment; | 2063 | * So pin the source page to copy it. |
1911 | } | 2064 | */ |
2065 | if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { | ||
2066 | unlock_page(page); | ||
1912 | 2067 | ||
1913 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2068 | src_page = alloc_page(GFP_KERNEL); |
1914 | if (unlikely(status)) { | 2069 | if (!src_page) { |
1915 | loff_t isize = i_size_read(inode); | 2070 | page_cache_release(page); |
2071 | status = -ENOMEM; | ||
2072 | break; | ||
2073 | } | ||
2074 | |||
2075 | /* | ||
2076 | * Cannot get_user_pages with a page locked for the | ||
2077 | * same reason as we can't take a page fault with a | ||
2078 | * page locked (as explained below). | ||
2079 | */ | ||
2080 | copied = iov_iter_copy_from_user(src_page, i, | ||
2081 | offset, bytes); | ||
2082 | if (unlikely(copied == 0)) { | ||
2083 | status = -EFAULT; | ||
2084 | page_cache_release(page); | ||
2085 | page_cache_release(src_page); | ||
2086 | break; | ||
2087 | } | ||
2088 | bytes = copied; | ||
1916 | 2089 | ||
1917 | if (status != AOP_TRUNCATED_PAGE) | 2090 | lock_page(page); |
2091 | /* | ||
2092 | * Can't handle the page going uptodate here, because | ||
2093 | * that means we would use non-atomic usercopies, which | ||
2094 | * zero out the tail of the page, which can cause | ||
2095 | * zeroes to become transiently visible. We could just | ||
2096 | * use a non-zeroing copy, but the APIs aren't too | ||
2097 | * consistent. | ||
2098 | */ | ||
2099 | if (unlikely(!page->mapping || PageUptodate(page))) { | ||
1918 | unlock_page(page); | 2100 | unlock_page(page); |
1919 | page_cache_release(page); | 2101 | page_cache_release(page); |
1920 | if (status == AOP_TRUNCATED_PAGE) | 2102 | page_cache_release(src_page); |
1921 | continue; | 2103 | continue; |
2104 | } | ||
2105 | } | ||
2106 | |||
2107 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | ||
2108 | if (unlikely(status)) | ||
2109 | goto fs_write_aop_error; | ||
2110 | |||
2111 | if (!src_page) { | ||
1922 | /* | 2112 | /* |
1923 | * prepare_write() may have instantiated a few blocks | 2113 | * Must not enter the pagefault handler here, because |
1924 | * outside i_size. Trim these off again. | 2114 | * we hold the page lock, so we might recursively |
2115 | * deadlock on the same lock, or get an ABBA deadlock | ||
2116 | * against a different lock, or against the mmap_sem | ||
2117 | * (which nests outside the page lock). So increment | ||
2118 | * preempt count, and use _atomic usercopies. | ||
2119 | * | ||
2120 | * The page is uptodate so we are OK to encounter a | ||
2121 | * short copy: if unmodified parts of the page are | ||
2122 | * marked dirty and written out to disk, it doesn't | ||
2123 | * really matter. | ||
1925 | */ | 2124 | */ |
1926 | if (pos + bytes > isize) | 2125 | pagefault_disable(); |
1927 | vmtruncate(inode, isize); | 2126 | copied = iov_iter_copy_from_user_atomic(page, i, |
1928 | break; | 2127 | offset, bytes); |
2128 | pagefault_enable(); | ||
2129 | } else { | ||
2130 | void *src, *dst; | ||
2131 | src = kmap_atomic(src_page, KM_USER0); | ||
2132 | dst = kmap_atomic(page, KM_USER1); | ||
2133 | memcpy(dst + offset, src + offset, bytes); | ||
2134 | kunmap_atomic(dst, KM_USER1); | ||
2135 | kunmap_atomic(src, KM_USER0); | ||
2136 | copied = bytes; | ||
1929 | } | 2137 | } |
1930 | if (likely(nr_segs == 1)) | ||
1931 | copied = filemap_copy_from_user(page, offset, | ||
1932 | buf, bytes); | ||
1933 | else | ||
1934 | copied = filemap_copy_from_user_iovec(page, offset, | ||
1935 | cur_iov, iov_base, bytes); | ||
1936 | flush_dcache_page(page); | 2138 | flush_dcache_page(page); |
2139 | |||
1937 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 2140 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
1938 | if (status == AOP_TRUNCATED_PAGE) { | 2141 | if (unlikely(status < 0)) |
1939 | page_cache_release(page); | 2142 | goto fs_write_aop_error; |
1940 | continue; | 2143 | if (unlikely(status > 0)) /* filesystem did partial write */ |
1941 | } | 2144 | copied = min_t(size_t, copied, status); |
1942 | zero_length_segment: | 2145 | |
1943 | if (likely(copied >= 0)) { | ||
1944 | if (!status) | ||
1945 | status = copied; | ||
1946 | |||
1947 | if (status >= 0) { | ||
1948 | written += status; | ||
1949 | count -= status; | ||
1950 | pos += status; | ||
1951 | buf += status; | ||
1952 | if (unlikely(nr_segs > 1)) { | ||
1953 | filemap_set_next_iovec(&cur_iov, | ||
1954 | &iov_base, status); | ||
1955 | if (count) | ||
1956 | buf = cur_iov->iov_base + | ||
1957 | iov_base; | ||
1958 | } else { | ||
1959 | iov_base += status; | ||
1960 | } | ||
1961 | } | ||
1962 | } | ||
1963 | if (unlikely(copied != bytes)) | ||
1964 | if (status >= 0) | ||
1965 | status = -EFAULT; | ||
1966 | unlock_page(page); | 2146 | unlock_page(page); |
1967 | mark_page_accessed(page); | 2147 | mark_page_accessed(page); |
1968 | page_cache_release(page); | 2148 | page_cache_release(page); |
1969 | if (status < 0) | 2149 | if (src_page) |
1970 | break; | 2150 | page_cache_release(src_page); |
2151 | |||
2152 | iov_iter_advance(i, copied); | ||
2153 | pos += copied; | ||
2154 | written += copied; | ||
2155 | |||
1971 | balance_dirty_pages_ratelimited(mapping); | 2156 | balance_dirty_pages_ratelimited(mapping); |
1972 | cond_resched(); | 2157 | cond_resched(); |
1973 | } while (count); | 2158 | continue; |
1974 | *ppos = pos; | ||
1975 | 2159 | ||
1976 | if (cached_page) | 2160 | fs_write_aop_error: |
1977 | page_cache_release(cached_page); | 2161 | unlock_page(page); |
2162 | page_cache_release(page); | ||
2163 | if (src_page) | ||
2164 | page_cache_release(src_page); | ||
2165 | |||
2166 | /* | ||
2167 | * prepare_write() may have instantiated a few blocks | ||
2168 | * outside i_size. Trim these off again. Don't need | ||
2169 | * i_size_read because we hold i_mutex. | ||
2170 | */ | ||
2171 | if (pos + bytes > inode->i_size) | ||
2172 | vmtruncate(inode, inode->i_size); | ||
2173 | break; | ||
2174 | } while (iov_iter_count(i)); | ||
2175 | |||
2176 | return written ? written : status; | ||
2177 | } | ||
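
The two-copy path above cannot take a page fault while it holds the page lock, and a not-uptodate page cannot tolerate a short copy, so it stages the user data in a separate source page first and only memcpy()s it into the pagecache page once the data is safely in hand. A simplified, all-or-nothing userspace sketch of that bounce-copy idea; copy_maybe_short() stands in for an atomic usercopy that can stop early:

/*
 * Userspace sketch of the bounce-copy idea: when the destination
 * cannot tolerate a partial write and the copier is not allowed to
 * fault, stage the data in a scratch buffer first and commit it in
 * whole.  This is deliberately simpler than the kernel path.
 */
#include <stdio.h>
#include <string.h>

/* Pretend copy that may "fault" partway through and return a short count. */
static size_t copy_maybe_short(char *dst, const char *src, size_t len,
			       size_t fault_after)
{
	size_t n = len < fault_after ? len : fault_after;

	memcpy(dst, src, n);
	return n;
}

static int write_all_or_nothing(char *dst, const char *src, size_t len,
				size_t fault_after)
{
	char bounce[64];
	size_t copied;

	if (len > sizeof(bounce))
		return -1;
	/* stage the copy where a short result does no harm */
	copied = copy_maybe_short(bounce, src, len, fault_after);
	if (copied != len)
		return -1;		/* caller can fault pages in and retry */
	memcpy(dst, bounce, len);	/* destination only sees a full write */
	return 0;
}

int main(void)
{
	char page[32] = "................";

	printf("short copy -> %d, page \"%s\"\n",
	       write_all_or_nothing(page, "new data!", 9, 4), page);
	printf("full copy  -> %d, page \"%s\"\n",
	       write_all_or_nothing(page, "new data!", 9, 64), page);
	return 0;
}
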
2178 | |||
2179 | static ssize_t generic_perform_write(struct file *file, | ||
2180 | struct iov_iter *i, loff_t pos) | ||
2181 | { | ||
2182 | struct address_space *mapping = file->f_mapping; | ||
2183 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2184 | long status = 0; | ||
2185 | ssize_t written = 0; | ||
2186 | unsigned int flags = 0; | ||
1978 | 2187 | ||
1979 | /* | 2188 | /* |
1980 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | 2189 | * Copies from kernel address space cannot fail (NFSD is a big user). |
1981 | */ | 2190 | */ |
2191 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
2192 | flags |= AOP_FLAG_UNINTERRUPTIBLE; | ||
2193 | |||
2194 | do { | ||
2195 | struct page *page; | ||
2196 | pgoff_t index; /* Pagecache index for current page */ | ||
2197 | unsigned long offset; /* Offset into pagecache page */ | ||
2198 | unsigned long bytes; /* Bytes to write to page */ | ||
2199 | size_t copied; /* Bytes copied from user */ | ||
2200 | void *fsdata; | ||
2201 | |||
2202 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | ||
2203 | index = pos >> PAGE_CACHE_SHIFT; | ||
2204 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
2205 | iov_iter_count(i)); | ||
2206 | |||
2207 | again: | ||
2208 | |||
2209 | /* | ||
2210 | * Bring in the user page that we will copy from _first_. | ||
2211 | * Otherwise there's a nasty deadlock on copying from the | ||
2212 | * same page as we're writing to, without it being marked | ||
2213 | * up-to-date. | ||
2214 | * | ||
2215 | * Not only is this an optimisation, but it is also required | ||
2216 | * to check that the address is actually valid, when atomic | ||
2217 | * usercopies are used, below. | ||
2218 | */ | ||
2219 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
2220 | status = -EFAULT; | ||
2221 | break; | ||
2222 | } | ||
2223 | |||
2224 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | ||
2225 | &page, &fsdata); | ||
2226 | if (unlikely(status)) | ||
2227 | break; | ||
2228 | |||
2229 | pagefault_disable(); | ||
2230 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | ||
2231 | pagefault_enable(); | ||
2232 | flush_dcache_page(page); | ||
2233 | |||
2234 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | ||
2235 | page, fsdata); | ||
2236 | if (unlikely(status < 0)) | ||
2237 | break; | ||
2238 | copied = status; | ||
2239 | |||
2240 | cond_resched(); | ||
2241 | |||
2242 | if (unlikely(copied == 0)) { | ||
2243 | /* | ||
2244 | * If we were unable to copy any data at all, we must | ||
2245 | * fall back to a single segment length write. | ||
2246 | * | ||
2247 | * If we didn't fallback here, we could livelock | ||
2248 | * because not all segments in the iov can be copied at | ||
2249 | * once without a pagefault. | ||
2250 | */ | ||
2251 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
2252 | iov_iter_single_seg_count(i)); | ||
2253 | goto again; | ||
2254 | } | ||
2255 | iov_iter_advance(i, copied); | ||
2256 | pos += copied; | ||
2257 | written += copied; | ||
2258 | |||
2259 | balance_dirty_pages_ratelimited(mapping); | ||
2260 | |||
2261 | } while (iov_iter_count(i)); | ||
2262 | |||
2263 | return written ? written : status; | ||
2264 | } | ||
2265 | |||
2266 | ssize_t | ||
2267 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
2268 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
2269 | size_t count, ssize_t written) | ||
2270 | { | ||
2271 | struct file *file = iocb->ki_filp; | ||
2272 | struct address_space *mapping = file->f_mapping; | ||
2273 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2274 | struct inode *inode = mapping->host; | ||
2275 | ssize_t status; | ||
2276 | struct iov_iter i; | ||
2277 | |||
2278 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
2279 | if (a_ops->write_begin) | ||
2280 | status = generic_perform_write(file, &i, pos); | ||
2281 | else | ||
2282 | status = generic_perform_write_2copy(file, &i, pos); | ||
2283 | |||
1982 | if (likely(status >= 0)) { | 2284 | if (likely(status >= 0)) { |
2285 | written += status; | ||
2286 | *ppos = pos + status; | ||
2287 | |||
2288 | /* | ||
2289 | * For now, when the user asks for O_SYNC, we'll actually give | ||
2290 | * O_DSYNC | ||
2291 | */ | ||
1983 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2292 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
1984 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | 2293 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) |
1985 | status = generic_osync_inode(inode, mapping, | 2294 | status = generic_osync_inode(inode, mapping, |
@@ -1995,7 +2304,6 @@ zero_length_segment: | |||
1995 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2304 | if (unlikely(file->f_flags & O_DIRECT) && written) |
1996 | status = filemap_write_and_wait(mapping); | 2305 | status = filemap_write_and_wait(mapping); |
1997 | 2306 | ||
1998 | pagevec_lru_add(&lru_pvec); | ||
1999 | return written ? written : status; | 2307 | return written ? written : status; |
2000 | } | 2308 | } |
2001 | EXPORT_SYMBOL(generic_file_buffered_write); | 2309 | EXPORT_SYMBOL(generic_file_buffered_write); |
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@ | |||
1 | /* | ||
2 | * linux/mm/filemap.h | ||
3 | * | ||
4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #ifndef __FILEMAP_H | ||
8 | #define __FILEMAP_H | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/highmem.h> | ||
14 | #include <linux/uio.h> | ||
15 | #include <linux/uaccess.h> | ||
16 | |||
17 | size_t | ||
18 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
19 | const struct iovec *iov, | ||
20 | size_t base, | ||
21 | size_t bytes); | ||
22 | |||
23 | /* | ||
24 | * Copy as much as we can into the page and return the number of bytes which | ||
25 | * were sucessfully copied. If a fault is encountered then clear the page | ||
26 | * out to (offset+bytes) and return the number of bytes which were copied. | ||
27 | * | ||
28 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
29 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
30 | * and if the following non-atomic copy succeeds, then there is a small window | ||
31 | * where the target page contains neither the data before the write, nor the | ||
32 | * data after the write (it contains zero). A read at this time will see | ||
33 | * data that is inconsistent with any ordering of the read and the write. | ||
34 | * (This has been detected in practice). | ||
35 | */ | ||
36 | static inline size_t | ||
37 | filemap_copy_from_user(struct page *page, unsigned long offset, | ||
38 | const char __user *buf, unsigned bytes) | ||
39 | { | ||
40 | char *kaddr; | ||
41 | int left; | ||
42 | |||
43 | kaddr = kmap_atomic(page, KM_USER0); | ||
44 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); | ||
45 | kunmap_atomic(kaddr, KM_USER0); | ||
46 | |||
47 | if (left != 0) { | ||
48 | /* Do it the slow way */ | ||
49 | kaddr = kmap(page); | ||
50 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
51 | kunmap(page); | ||
52 | } | ||
53 | return bytes - left; | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * This has the same sideeffects and return value as filemap_copy_from_user(). | ||
58 | * The difference is that on a fault we need to memset the remainder of the | ||
59 | * page (out to offset+bytes), to emulate filemap_copy_from_user()'s | ||
60 | * single-segment behaviour. | ||
61 | */ | ||
62 | static inline size_t | ||
63 | filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | ||
64 | const struct iovec *iov, size_t base, size_t bytes) | ||
65 | { | ||
66 | char *kaddr; | ||
67 | size_t copied; | ||
68 | |||
69 | kaddr = kmap_atomic(page, KM_USER0); | ||
70 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
71 | base, bytes); | ||
72 | kunmap_atomic(kaddr, KM_USER0); | ||
73 | if (copied != bytes) { | ||
74 | kaddr = kmap(page); | ||
75 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
76 | base, bytes); | ||
77 | if (bytes - copied) | ||
78 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
79 | kunmap(page); | ||
80 | } | ||
81 | return copied; | ||
82 | } | ||
83 | |||
84 | static inline void | ||
85 | filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
86 | { | ||
87 | const struct iovec *iov = *iovp; | ||
88 | size_t base = *basep; | ||
89 | |||
90 | do { | ||
91 | int copy = min(bytes, iov->iov_len - base); | ||
92 | |||
93 | bytes -= copy; | ||
94 | base += copy; | ||
95 | if (iov->iov_len == base) { | ||
96 | iov++; | ||
97 | base = 0; | ||
98 | } | ||
99 | } while (bytes); | ||
100 | *iovp = iov; | ||
101 | *basep = base; | ||
102 | } | ||
103 | #endif | ||
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 53ee6a299635..32132f3cd641 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
18 | #include "filemap.h" | ||
19 | 18 | ||
20 | /* | 19 | /* |
21 | * We do use our own empty page to avoid interference with other users | 20 | * We do use our own empty page to avoid interference with other users |
@@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
288 | unsigned long index; | 287 | unsigned long index; |
289 | unsigned long offset; | 288 | unsigned long offset; |
290 | size_t copied; | 289 | size_t copied; |
290 | char *kaddr; | ||
291 | 291 | ||
292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
293 | index = pos >> PAGE_CACHE_SHIFT; | 293 | index = pos >> PAGE_CACHE_SHIFT; |
@@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
295 | if (bytes > count) | 295 | if (bytes > count) |
296 | bytes = count; | 296 | bytes = count; |
297 | 297 | ||
298 | /* | ||
299 | * Bring in the user page that we will copy from _first_. | ||
300 | * Otherwise there's a nasty deadlock on copying from the | ||
301 | * same page as we're writing to, without it being marked | ||
302 | * up-to-date. | ||
303 | */ | ||
304 | fault_in_pages_readable(buf, bytes); | ||
305 | |||
306 | page = a_ops->get_xip_page(mapping, | 298 | page = a_ops->get_xip_page(mapping, |
307 | index*(PAGE_SIZE/512), 0); | 299 | index*(PAGE_SIZE/512), 0); |
308 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { | 300 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { |
@@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
319 | break; | 311 | break; |
320 | } | 312 | } |
321 | 313 | ||
322 | copied = filemap_copy_from_user(page, offset, buf, bytes); | 314 | fault_in_pages_readable(buf, bytes); |
315 | kaddr = kmap_atomic(page, KM_USER0); | ||
316 | copied = bytes - | ||
317 | __copy_from_user_inatomic_nocache(kaddr, buf, bytes); | ||
318 | kunmap_atomic(kaddr, KM_USER0); | ||
323 | flush_dcache_page(page); | 319 | flush_dcache_page(page); |
320 | |||
324 | if (likely(copied > 0)) { | 321 | if (likely(copied > 0)) { |
325 | status = copied; | 322 | status = copied; |
326 | 323 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eab8c428cc93..ae2959bb59cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,12 +23,16 @@ | |||
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | static unsigned long surplus_huge_pages; | ||
26 | unsigned long max_huge_pages; | 27 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 28 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 30 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
31 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
30 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 32 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
31 | unsigned long hugepages_treat_as_movable; | 33 | unsigned long hugepages_treat_as_movable; |
34 | int hugetlb_dynamic_pool; | ||
35 | static int hugetlb_next_nid; | ||
32 | 36 | ||
33 | /* | 37 | /* |
34 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 38 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
85 | list_del(&page->lru); | 89 | list_del(&page->lru); |
86 | free_huge_pages--; | 90 | free_huge_pages--; |
87 | free_huge_pages_node[nid]--; | 91 | free_huge_pages_node[nid]--; |
92 | if (vma && vma->vm_flags & VM_MAYSHARE) | ||
93 | resv_huge_pages--; | ||
88 | break; | 94 | break; |
89 | } | 95 | } |
90 | } | 96 | } |
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
92 | return page; | 98 | return page; |
93 | } | 99 | } |
94 | 100 | ||
101 | static void update_and_free_page(struct page *page) | ||
102 | { | ||
103 | int i; | ||
104 | nr_huge_pages--; | ||
105 | nr_huge_pages_node[page_to_nid(page)]--; | ||
106 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
107 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
108 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
109 | 1 << PG_private | 1<< PG_writeback); | ||
110 | } | ||
111 | set_compound_page_dtor(page, NULL); | ||
112 | set_page_refcounted(page); | ||
113 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
114 | } | ||
115 | |||
95 | static void free_huge_page(struct page *page) | 116 | static void free_huge_page(struct page *page) |
96 | { | 117 | { |
97 | BUG_ON(page_count(page)); | 118 | int nid = page_to_nid(page); |
98 | 119 | ||
120 | BUG_ON(page_count(page)); | ||
99 | INIT_LIST_HEAD(&page->lru); | 121 | INIT_LIST_HEAD(&page->lru); |
100 | 122 | ||
101 | spin_lock(&hugetlb_lock); | 123 | spin_lock(&hugetlb_lock); |
102 | enqueue_huge_page(page); | 124 | if (surplus_huge_pages_node[nid]) { |
125 | update_and_free_page(page); | ||
126 | surplus_huge_pages--; | ||
127 | surplus_huge_pages_node[nid]--; | ||
128 | } else { | ||
129 | enqueue_huge_page(page); | ||
130 | } | ||
103 | spin_unlock(&hugetlb_lock); | 131 | spin_unlock(&hugetlb_lock); |
104 | } | 132 | } |
105 | 133 | ||
106 | static int alloc_fresh_huge_page(void) | 134 | /* |
135 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
136 | * balanced by operating on them in a round-robin fashion. | ||
137 | * Returns 1 if an adjustment was made. | ||
138 | */ | ||
139 | static int adjust_pool_surplus(int delta) | ||
107 | { | 140 | { |
108 | static int prev_nid; | 141 | static int prev_nid; |
109 | struct page *page; | 142 | int nid = prev_nid; |
110 | int nid; | 143 | int ret = 0; |
144 | |||
145 | VM_BUG_ON(delta != -1 && delta != 1); | ||
146 | do { | ||
147 | nid = next_node(nid, node_online_map); | ||
148 | if (nid == MAX_NUMNODES) | ||
149 | nid = first_node(node_online_map); | ||
150 | |||
151 | /* To shrink on this node, there must be a surplus page */ | ||
152 | if (delta < 0 && !surplus_huge_pages_node[nid]) | ||
153 | continue; | ||
154 | /* Surplus cannot exceed the total number of pages */ | ||
155 | if (delta > 0 && surplus_huge_pages_node[nid] >= | ||
156 | nr_huge_pages_node[nid]) | ||
157 | continue; | ||
158 | |||
159 | surplus_huge_pages += delta; | ||
160 | surplus_huge_pages_node[nid] += delta; | ||
161 | ret = 1; | ||
162 | break; | ||
163 | } while (nid != prev_nid); | ||
111 | 164 | ||
112 | /* | ||
113 | * Copy static prev_nid to local nid, work on that, then copy it | ||
114 | * back to prev_nid afterwards: otherwise there's a window in which | ||
115 | * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
116 | * But we don't need to use a spin_lock here: it really doesn't | ||
117 | * matter if occasionally a racer chooses the same nid as we do. | ||
118 | */ | ||
119 | nid = next_node(prev_nid, node_online_map); | ||
120 | if (nid == MAX_NUMNODES) | ||
121 | nid = first_node(node_online_map); | ||
122 | prev_nid = nid; | 165 | prev_nid = nid; |
166 | return ret; | ||
167 | } | ||
168 | |||
169 | static struct page *alloc_fresh_huge_page_node(int nid) | ||
170 | { | ||
171 | struct page *page; | ||
123 | 172 | ||
124 | page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | 173 | page = alloc_pages_node(nid, |
174 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, | ||
175 | HUGETLB_PAGE_ORDER); | ||
176 | if (page) { | ||
177 | set_compound_page_dtor(page, free_huge_page); | ||
178 | spin_lock(&hugetlb_lock); | ||
179 | nr_huge_pages++; | ||
180 | nr_huge_pages_node[nid]++; | ||
181 | spin_unlock(&hugetlb_lock); | ||
182 | put_page(page); /* free it into the hugepage allocator */ | ||
183 | } | ||
184 | |||
185 | return page; | ||
186 | } | ||
187 | |||
188 | static int alloc_fresh_huge_page(void) | ||
189 | { | ||
190 | struct page *page; | ||
191 | int start_nid; | ||
192 | int next_nid; | ||
193 | int ret = 0; | ||
194 | |||
195 | start_nid = hugetlb_next_nid; | ||
196 | |||
197 | do { | ||
198 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | ||
199 | if (page) | ||
200 | ret = 1; | ||
201 | /* | ||
202 | * Use a helper variable to find the next node and then | ||
203 | * copy it back to hugetlb_next_nid afterwards: | ||
204 | * otherwise there's a window in which a racer might | ||
205 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
206 | * But we don't need to use a spin_lock here: it really | ||
207 | * doesn't matter if occasionally a racer chooses the | ||
208 | * same nid as we do. Move nid forward in the mask even | ||
209 | * if we just successfully allocated a hugepage so that | ||
210 | * the next caller gets hugepages on the next node. | ||
211 | */ | ||
212 | next_nid = next_node(hugetlb_next_nid, node_online_map); | ||
213 | if (next_nid == MAX_NUMNODES) | ||
214 | next_nid = first_node(node_online_map); | ||
215 | hugetlb_next_nid = next_nid; | ||
216 | } while (!page && hugetlb_next_nid != start_nid); | ||
217 | |||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | ||
222 | unsigned long address) | ||
223 | { | ||
224 | struct page *page; | ||
225 | |||
226 | /* Check if the dynamic pool is enabled */ | ||
227 | if (!hugetlb_dynamic_pool) | ||
228 | return NULL; | ||
229 | |||
230 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | ||
125 | HUGETLB_PAGE_ORDER); | 231 | HUGETLB_PAGE_ORDER); |
126 | if (page) { | 232 | if (page) { |
127 | set_compound_page_dtor(page, free_huge_page); | 233 | set_compound_page_dtor(page, free_huge_page); |
128 | spin_lock(&hugetlb_lock); | 234 | spin_lock(&hugetlb_lock); |
129 | nr_huge_pages++; | 235 | nr_huge_pages++; |
130 | nr_huge_pages_node[page_to_nid(page)]++; | 236 | nr_huge_pages_node[page_to_nid(page)]++; |
237 | surplus_huge_pages++; | ||
238 | surplus_huge_pages_node[page_to_nid(page)]++; | ||
131 | spin_unlock(&hugetlb_lock); | 239 | spin_unlock(&hugetlb_lock); |
132 | put_page(page); /* free it into the hugepage allocator */ | ||
133 | return 1; | ||
134 | } | 240 | } |
135 | return 0; | 241 | |
242 | return page; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Increase the hugetlb pool such that it can accommodate a reservation | ||

247 | * of size 'delta'. | ||
248 | */ | ||
249 | static int gather_surplus_pages(int delta) | ||
250 | { | ||
251 | struct list_head surplus_list; | ||
252 | struct page *page, *tmp; | ||
253 | int ret, i; | ||
254 | int needed, allocated; | ||
255 | |||
256 | needed = (resv_huge_pages + delta) - free_huge_pages; | ||
257 | if (needed <= 0) | ||
258 | return 0; | ||
259 | |||
260 | allocated = 0; | ||
261 | INIT_LIST_HEAD(&surplus_list); | ||
262 | |||
263 | ret = -ENOMEM; | ||
264 | retry: | ||
265 | spin_unlock(&hugetlb_lock); | ||
266 | for (i = 0; i < needed; i++) { | ||
267 | page = alloc_buddy_huge_page(NULL, 0); | ||
268 | if (!page) { | ||
269 | /* | ||
270 | * We were not able to allocate enough pages to | ||
271 | * satisfy the entire reservation so we free what | ||
272 | * we've allocated so far. | ||
273 | */ | ||
274 | spin_lock(&hugetlb_lock); | ||
275 | needed = 0; | ||
276 | goto free; | ||
277 | } | ||
278 | |||
279 | list_add(&page->lru, &surplus_list); | ||
280 | } | ||
281 | allocated += needed; | ||
282 | |||
283 | /* | ||
284 | * After retaking hugetlb_lock, we need to recalculate 'needed' | ||
285 | * because either resv_huge_pages or free_huge_pages may have changed. | ||
286 | */ | ||
287 | spin_lock(&hugetlb_lock); | ||
288 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | ||
289 | if (needed > 0) | ||
290 | goto retry; | ||
291 | |||
292 | /* | ||
293 | * The surplus_list now contains _at_least_ the number of extra pages | ||
294 | * needed to accommodate the reservation. Add the appropriate number | ||
295 | * of pages to the hugetlb pool and free the extras back to the buddy | ||
296 | * allocator. | ||
297 | */ | ||
298 | needed += allocated; | ||
299 | ret = 0; | ||
300 | free: | ||
301 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | ||
302 | list_del(&page->lru); | ||
303 | if ((--needed) >= 0) | ||
304 | enqueue_huge_page(page); | ||
305 | else { | ||
306 | /* | ||
307 | * Decrement the refcount and free the page using its | ||
308 | * destructor. This must be done with hugetlb_lock | ||
309 | * unlocked which is safe because free_huge_page takes | ||
310 | * hugetlb_lock before deciding how to free the page. | ||
311 | */ | ||
312 | spin_unlock(&hugetlb_lock); | ||
313 | put_page(page); | ||
314 | spin_lock(&hugetlb_lock); | ||
315 | } | ||
316 | } | ||
317 | |||
318 | return ret; | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * When releasing a hugetlb pool reservation, any surplus pages that were | ||
323 | * allocated to satisfy the reservation must be explicitly freed if they were | ||
324 | * never used. | ||
325 | */ | ||
326 | void return_unused_surplus_pages(unsigned long unused_resv_pages) | ||
327 | { | ||
328 | static int nid = -1; | ||
329 | struct page *page; | ||
330 | unsigned long nr_pages; | ||
331 | |||
332 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | ||
333 | |||
334 | while (nr_pages) { | ||
335 | nid = next_node(nid, node_online_map); | ||
336 | if (nid == MAX_NUMNODES) | ||
337 | nid = first_node(node_online_map); | ||
338 | |||
339 | if (!surplus_huge_pages_node[nid]) | ||
340 | continue; | ||
341 | |||
342 | if (!list_empty(&hugepage_freelists[nid])) { | ||
343 | page = list_entry(hugepage_freelists[nid].next, | ||
344 | struct page, lru); | ||
345 | list_del(&page->lru); | ||
346 | update_and_free_page(page); | ||
347 | free_huge_pages--; | ||
348 | free_huge_pages_node[nid]--; | ||
349 | surplus_huge_pages--; | ||
350 | surplus_huge_pages_node[nid]--; | ||
351 | nr_pages--; | ||
352 | } | ||
353 | } | ||
136 | } | 354 | } |
137 | 355 | ||
138 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 356 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
139 | unsigned long addr) | 357 | unsigned long addr) |
140 | { | 358 | { |
141 | struct page *page; | 359 | struct page *page = NULL; |
360 | int use_reserved_page = vma->vm_flags & VM_MAYSHARE; | ||
142 | 361 | ||
143 | spin_lock(&hugetlb_lock); | 362 | spin_lock(&hugetlb_lock); |
144 | if (vma->vm_flags & VM_MAYSHARE) | 363 | if (!use_reserved_page && (free_huge_pages <= resv_huge_pages)) |
145 | resv_huge_pages--; | ||
146 | else if (free_huge_pages <= resv_huge_pages) | ||
147 | goto fail; | 364 | goto fail; |
148 | 365 | ||
149 | page = dequeue_huge_page(vma, addr); | 366 | page = dequeue_huge_page(vma, addr); |
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
155 | return page; | 372 | return page; |
156 | 373 | ||
157 | fail: | 374 | fail: |
158 | if (vma->vm_flags & VM_MAYSHARE) | ||
159 | resv_huge_pages++; | ||
160 | spin_unlock(&hugetlb_lock); | 375 | spin_unlock(&hugetlb_lock); |
161 | return NULL; | 376 | |
377 | /* | ||
378 | * Private mappings do not use reserved huge pages so the allocation | ||
379 | * may have failed due to an undersized hugetlb pool. Try to grab a | ||
380 | * surplus huge page from the buddy allocator. | ||
381 | */ | ||
382 | if (!use_reserved_page) | ||
383 | page = alloc_buddy_huge_page(vma, addr); | ||
384 | |||
385 | return page; | ||
162 | } | 386 | } |
163 | 387 | ||
164 | static int __init hugetlb_init(void) | 388 | static int __init hugetlb_init(void) |
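The surplus-page machinery added above is, at its core, bookkeeping over four counters under hugetlb_lock. The standalone model below (plain longs, no locking, allocation always succeeds) sketches how gather_surplus_pages() sizes its over-allocation against a reservation and how return_unused_surplus_pages() unwinds it when the reservation is released unused:

#include <stdio.h>

/* Toy pool counters; the kernel keeps these under hugetlb_lock. */
static long nr_pages, free_pages, resv_pages, surplus_pages;

/* Model of gather_surplus_pages(): grow the pool from the "buddy
 * allocator" until the free pages can cover the reservation. */
static int gather_surplus(long delta)
{
        long needed = (resv_pages + delta) - free_pages;

        if (needed <= 0)
                return 0;
        /* each surplus page is both a new pool page and a free page */
        nr_pages += needed;
        free_pages += needed;
        surplus_pages += needed;
        return 0;
}

/* Model of return_unused_surplus_pages(): give unused surplus back. */
static void return_unused_surplus(long unused)
{
        long give_back = unused < surplus_pages ? unused : surplus_pages;

        nr_pages -= give_back;
        free_pages -= give_back;
        surplus_pages -= give_back;
}

int main(void)
{
        nr_pages = free_pages = 4;      /* persistent pool of 4 huge pages */

        gather_surplus(6);              /* reserve 6: need 2 surplus pages */
        resv_pages += 6;                /* the caller bumps resv, as in hugetlb_acct_memory() */
        printf("nr=%ld free=%ld resv=%ld surplus=%ld\n",
               nr_pages, free_pages, resv_pages, surplus_pages);

        resv_pages -= 6;                /* reservation released unused */
        return_unused_surplus(6);
        printf("nr=%ld free=%ld resv=%ld surplus=%ld\n",
               nr_pages, free_pages, resv_pages, surplus_pages);
        return 0;
}

Running it prints nr=6 free=6 resv=6 surplus=2 after the reservation, and the pool falls back to nr=4 free=4 surplus=0 once the reservation is dropped.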
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void) | |||
171 | for (i = 0; i < MAX_NUMNODES; ++i) | 395 | for (i = 0; i < MAX_NUMNODES; ++i) |
172 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 396 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
173 | 397 | ||
398 | hugetlb_next_nid = first_node(node_online_map); | ||
399 | |||
174 | for (i = 0; i < max_huge_pages; ++i) { | 400 | for (i = 0; i < max_huge_pages; ++i) { |
175 | if (!alloc_fresh_huge_page()) | 401 | if (!alloc_fresh_huge_page()) |
176 | break; | 402 | break; |
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
201 | } | 427 | } |
202 | 428 | ||
203 | #ifdef CONFIG_SYSCTL | 429 | #ifdef CONFIG_SYSCTL |
204 | static void update_and_free_page(struct page *page) | ||
205 | { | ||
206 | int i; | ||
207 | nr_huge_pages--; | ||
208 | nr_huge_pages_node[page_to_nid(page)]--; | ||
209 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
210 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
211 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
212 | 1 << PG_private | 1<< PG_writeback); | ||
213 | } | ||
214 | set_compound_page_dtor(page, NULL); | ||
215 | set_page_refcounted(page); | ||
216 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
217 | } | ||
218 | |||
219 | #ifdef CONFIG_HIGHMEM | 430 | #ifdef CONFIG_HIGHMEM |
220 | static void try_to_free_low(unsigned long count) | 431 | static void try_to_free_low(unsigned long count) |
221 | { | 432 | { |
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count) | |||
224 | for (i = 0; i < MAX_NUMNODES; ++i) { | 435 | for (i = 0; i < MAX_NUMNODES; ++i) { |
225 | struct page *page, *next; | 436 | struct page *page, *next; |
226 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 437 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { |
438 | if (count >= nr_huge_pages) | ||
439 | return; | ||
227 | if (PageHighMem(page)) | 440 | if (PageHighMem(page)) |
228 | continue; | 441 | continue; |
229 | list_del(&page->lru); | 442 | list_del(&page->lru); |
230 | update_and_free_page(page); | 443 | update_and_free_page(page); |
231 | free_huge_pages--; | 444 | free_huge_pages--; |
232 | free_huge_pages_node[page_to_nid(page)]--; | 445 | free_huge_pages_node[page_to_nid(page)]--; |
233 | if (count >= nr_huge_pages) | ||
234 | return; | ||
235 | } | 446 | } |
236 | } | 447 | } |
237 | } | 448 | } |
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count) | |||
241 | } | 452 | } |
242 | #endif | 453 | #endif |
243 | 454 | ||
455 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | ||
244 | static unsigned long set_max_huge_pages(unsigned long count) | 456 | static unsigned long set_max_huge_pages(unsigned long count) |
245 | { | 457 | { |
246 | while (count > nr_huge_pages) { | 458 | unsigned long min_count, ret; |
247 | if (!alloc_fresh_huge_page()) | ||
248 | return nr_huge_pages; | ||
249 | } | ||
250 | if (count >= nr_huge_pages) | ||
251 | return nr_huge_pages; | ||
252 | 459 | ||
460 | /* | ||
461 | * Increase the pool size | ||
462 | * First take pages out of surplus state. Then make up the | ||
463 | * remaining difference by allocating fresh huge pages. | ||
464 | */ | ||
253 | spin_lock(&hugetlb_lock); | 465 | spin_lock(&hugetlb_lock); |
254 | count = max(count, resv_huge_pages); | 466 | while (surplus_huge_pages && count > persistent_huge_pages) { |
255 | try_to_free_low(count); | 467 | if (!adjust_pool_surplus(-1)) |
256 | while (count < nr_huge_pages) { | 468 | break; |
469 | } | ||
470 | |||
471 | while (count > persistent_huge_pages) { | ||
472 | int ret; | ||
473 | /* | ||
474 | * If this allocation races such that we no longer need the | ||
475 | * page, free_huge_page will handle it by freeing the page | ||
476 | * and reducing the surplus. | ||
477 | */ | ||
478 | spin_unlock(&hugetlb_lock); | ||
479 | ret = alloc_fresh_huge_page(); | ||
480 | spin_lock(&hugetlb_lock); | ||
481 | if (!ret) | ||
482 | goto out; | ||
483 | |||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Decrease the pool size | ||
488 | * First return free pages to the buddy allocator (being careful | ||
489 | * to keep enough around to satisfy reservations). Then place | ||
490 | * pages into surplus state as needed so the pool will shrink | ||
491 | * to the desired size as pages become free. | ||
492 | */ | ||
493 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | ||
494 | min_count = max(count, min_count); | ||
495 | try_to_free_low(min_count); | ||
496 | while (min_count < persistent_huge_pages) { | ||
257 | struct page *page = dequeue_huge_page(NULL, 0); | 497 | struct page *page = dequeue_huge_page(NULL, 0); |
258 | if (!page) | 498 | if (!page) |
259 | break; | 499 | break; |
260 | update_and_free_page(page); | 500 | update_and_free_page(page); |
261 | } | 501 | } |
502 | while (count < persistent_huge_pages) { | ||
503 | if (!adjust_pool_surplus(1)) | ||
504 | break; | ||
505 | } | ||
506 | out: | ||
507 | ret = persistent_huge_pages; | ||
262 | spin_unlock(&hugetlb_lock); | 508 | spin_unlock(&hugetlb_lock); |
263 | return nr_huge_pages; | 509 | return ret; |
264 | } | 510 | } |
265 | 511 | ||
266 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 512 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
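set_max_huge_pages() now tunes the persistent pool (nr_huge_pages minus surplus_huge_pages) in two phases: when growing it first reclassifies surplus pages, then allocates fresh ones; when shrinking it frees what reservations allow and marks the remainder surplus so it drains as pages are released. A counter-only sketch of that ordering, with no locking, no per-node handling, and stand-ins for the real allocation helpers:

#include <stdio.h>

static long nr_pages, free_pages, resv_pages, surplus_pages;

#define persistent()    (nr_pages - surplus_pages)

/* Shrink or grow the persistent pool toward 'count', mimicking the
 * order of operations in set_max_huge_pages(). */
static long set_max(long count)
{
        /* Grow: reclassify surplus pages first, then "allocate" fresh ones. */
        while (surplus_pages && count > persistent())
                surplus_pages--;                /* adjust_pool_surplus(-1) */
        while (count > persistent()) {
                nr_pages++;                     /* alloc_fresh_huge_page() stand-in */
                free_pages++;
        }

        /* Shrink: free what reservations allow, surplus the rest. */
        long min_count = resv_pages + nr_pages - free_pages;
        if (count > min_count)
                min_count = count;
        while (min_count < persistent() && free_pages) {
                nr_pages--;                     /* update_and_free_page() stand-in */
                free_pages--;
        }
        while (count < persistent())
                surplus_pages++;                /* adjust_pool_surplus(1) */

        return persistent();
}

int main(void)
{
        nr_pages = free_pages = 8;
        surplus_pages = 2;              /* 2 of the 8 are surplus, 6 persistent */

        printf("grown to %ld persistent\n", set_max(10));
        printf("shrunk to %ld persistent\n", set_max(3));
        return 0;
}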
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf) | |||
292 | "HugePages_Total: %5lu\n" | 538 | "HugePages_Total: %5lu\n" |
293 | "HugePages_Free: %5lu\n" | 539 | "HugePages_Free: %5lu\n" |
294 | "HugePages_Rsvd: %5lu\n" | 540 | "HugePages_Rsvd: %5lu\n" |
541 | "HugePages_Surp: %5lu\n" | ||
295 | "Hugepagesize: %5lu kB\n", | 542 | "Hugepagesize: %5lu kB\n", |
296 | nr_huge_pages, | 543 | nr_huge_pages, |
297 | free_huge_pages, | 544 | free_huge_pages, |
298 | resv_huge_pages, | 545 | resv_huge_pages, |
546 | surplus_huge_pages, | ||
299 | HPAGE_SIZE/1024); | 547 | HPAGE_SIZE/1024); |
300 | } | 548 | } |
301 | 549 | ||
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
355 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | 603 | entry = pte_mkwrite(pte_mkdirty(*ptep)); |
356 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 604 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
357 | update_mmu_cache(vma, address, entry); | 605 | update_mmu_cache(vma, address, entry); |
358 | lazy_mmu_prot_update(entry); | ||
359 | } | 606 | } |
360 | } | 607 | } |
361 | 608 | ||
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
708 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 955 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
709 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 956 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
710 | set_huge_pte_at(mm, address, ptep, pte); | 957 | set_huge_pte_at(mm, address, ptep, pte); |
711 | lazy_mmu_prot_update(pte); | ||
712 | } | 958 | } |
713 | } | 959 | } |
714 | spin_unlock(&mm->page_table_lock); | 960 | spin_unlock(&mm->page_table_lock); |
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta) | |||
843 | int ret = -ENOMEM; | 1089 | int ret = -ENOMEM; |
844 | 1090 | ||
845 | spin_lock(&hugetlb_lock); | 1091 | spin_lock(&hugetlb_lock); |
846 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
847 | resv_huge_pages += delta; | ||
848 | ret = 0; | ||
849 | } | ||
850 | spin_unlock(&hugetlb_lock); | ||
851 | return ret; | ||
852 | } | ||
853 | |||
854 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
855 | { | ||
856 | long ret, chg; | ||
857 | |||
858 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
859 | if (chg < 0) | ||
860 | return chg; | ||
861 | /* | 1092 | /* |
862 | * When cpuset is configured, it breaks the strict hugetlb page | 1093 | * When cpuset is configured, it breaks the strict hugetlb page |
863 | * reservation as the accounting is done on a global variable. Such | 1094 | * reservation as the accounting is done on a global variable. Such |
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) | |||
875 | * a best attempt and hopefully to minimize the impact of changing | 1106 | * a best attempt and hopefully to minimize the impact of changing |
876 | * semantics that cpuset has. | 1107 | * semantics that cpuset has. |
877 | */ | 1108 | */ |
878 | if (chg > cpuset_mems_nr(free_huge_pages_node)) | 1109 | if (delta > 0) { |
879 | return -ENOMEM; | 1110 | if (gather_surplus_pages(delta) < 0) |
1111 | goto out; | ||
1112 | |||
1113 | if (delta > cpuset_mems_nr(free_huge_pages_node)) | ||
1114 | goto out; | ||
1115 | } | ||
1116 | |||
1117 | ret = 0; | ||
1118 | resv_huge_pages += delta; | ||
1119 | if (delta < 0) | ||
1120 | return_unused_surplus_pages((unsigned long) -delta); | ||
1121 | |||
1122 | out: | ||
1123 | spin_unlock(&hugetlb_lock); | ||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1127 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
1128 | { | ||
1129 | long ret, chg; | ||
1130 | |||
1131 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
1132 | if (chg < 0) | ||
1133 | return chg; | ||
880 | 1134 | ||
881 | ret = hugetlb_acct_memory(chg); | 1135 | ret = hugetlb_acct_memory(chg); |
882 | if (ret < 0) | 1136 | if (ret < 0) |
diff --git a/mm/internal.h b/mm/internal.h index a3110c02aea7..953f941ea867 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -37,4 +37,14 @@ static inline void __put_page(struct page *page) | |||
37 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
38 | unsigned int order); | 38 | unsigned int order); |
39 | 39 | ||
40 | /* | ||
41 | * function for dealing with page's order in buddy system. | ||
42 | * zone->lock is already acquired when we use these. | ||
43 | * So, we don't need atomic page->flags operations here. | ||
44 | */ | ||
45 | static inline unsigned long page_order(struct page *page) | ||
46 | { | ||
47 | VM_BUG_ON(!PageBuddy(page)); | ||
48 | return page_private(page); | ||
49 | } | ||
40 | #endif | 50 | #endif |
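page_order() exposes the order recorded in page_private() for a page that is free in the buddy allocator, which lets callers such as the isolation code step over an entire free block at once. The step size is just a power of two, as this trivial demo with a made-up order shows:

#include <stdio.h>

int main(void)
{
        unsigned long order = 4;        /* hypothetical value read via page_order() */
        unsigned long pages = 1UL << order;

        /* A free buddy block of order 4 covers 16 contiguous pages, so a
         * scanner can advance its pfn by that many in one step. */
        printf("order %lu block spans %lu pages\n", order, pages);
        return 0;
}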
diff --git a/mm/memory.c b/mm/memory.c index f82b359b2745..bd16dcaeefb8 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -966,7 +966,7 @@ no_page_table: | |||
966 | * has touched so far, we don't want to allocate page tables. | 966 | * has touched so far, we don't want to allocate page tables. |
967 | */ | 967 | */ |
968 | if (flags & FOLL_ANON) { | 968 | if (flags & FOLL_ANON) { |
969 | page = ZERO_PAGE(address); | 969 | page = ZERO_PAGE(0); |
970 | if (flags & FOLL_GET) | 970 | if (flags & FOLL_GET) |
971 | get_page(page); | 971 | get_page(page); |
972 | BUG_ON(flags & FOLL_WRITE); | 972 | BUG_ON(flags & FOLL_WRITE); |
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1111 | } | 1111 | } |
1112 | EXPORT_SYMBOL(get_user_pages); | 1112 | EXPORT_SYMBOL(get_user_pages); |
1113 | 1113 | ||
1114 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
1115 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
1116 | { | ||
1117 | pte_t *pte; | ||
1118 | spinlock_t *ptl; | ||
1119 | int err = 0; | ||
1120 | |||
1121 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | ||
1122 | if (!pte) | ||
1123 | return -EAGAIN; | ||
1124 | arch_enter_lazy_mmu_mode(); | ||
1125 | do { | ||
1126 | struct page *page = ZERO_PAGE(addr); | ||
1127 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | ||
1128 | |||
1129 | if (unlikely(!pte_none(*pte))) { | ||
1130 | err = -EEXIST; | ||
1131 | pte++; | ||
1132 | break; | ||
1133 | } | ||
1134 | page_cache_get(page); | ||
1135 | page_add_file_rmap(page); | ||
1136 | inc_mm_counter(mm, file_rss); | ||
1137 | set_pte_at(mm, addr, pte, zero_pte); | ||
1138 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
1139 | arch_leave_lazy_mmu_mode(); | ||
1140 | pte_unmap_unlock(pte - 1, ptl); | ||
1141 | return err; | ||
1142 | } | ||
1143 | |||
1144 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
1145 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
1146 | { | ||
1147 | pmd_t *pmd; | ||
1148 | unsigned long next; | ||
1149 | int err; | ||
1150 | |||
1151 | pmd = pmd_alloc(mm, pud, addr); | ||
1152 | if (!pmd) | ||
1153 | return -EAGAIN; | ||
1154 | do { | ||
1155 | next = pmd_addr_end(addr, end); | ||
1156 | err = zeromap_pte_range(mm, pmd, addr, next, prot); | ||
1157 | if (err) | ||
1158 | break; | ||
1159 | } while (pmd++, addr = next, addr != end); | ||
1160 | return err; | ||
1161 | } | ||
1162 | |||
1163 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
1164 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
1165 | { | ||
1166 | pud_t *pud; | ||
1167 | unsigned long next; | ||
1168 | int err; | ||
1169 | |||
1170 | pud = pud_alloc(mm, pgd, addr); | ||
1171 | if (!pud) | ||
1172 | return -EAGAIN; | ||
1173 | do { | ||
1174 | next = pud_addr_end(addr, end); | ||
1175 | err = zeromap_pmd_range(mm, pud, addr, next, prot); | ||
1176 | if (err) | ||
1177 | break; | ||
1178 | } while (pud++, addr = next, addr != end); | ||
1179 | return err; | ||
1180 | } | ||
1181 | |||
1182 | int zeromap_page_range(struct vm_area_struct *vma, | ||
1183 | unsigned long addr, unsigned long size, pgprot_t prot) | ||
1184 | { | ||
1185 | pgd_t *pgd; | ||
1186 | unsigned long next; | ||
1187 | unsigned long end = addr + size; | ||
1188 | struct mm_struct *mm = vma->vm_mm; | ||
1189 | int err; | ||
1190 | |||
1191 | BUG_ON(addr >= end); | ||
1192 | pgd = pgd_offset(mm, addr); | ||
1193 | flush_cache_range(vma, addr, end); | ||
1194 | do { | ||
1195 | next = pgd_addr_end(addr, end); | ||
1196 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | ||
1197 | if (err) | ||
1198 | break; | ||
1199 | } while (pgd++, addr = next, addr != end); | ||
1200 | return err; | ||
1201 | } | ||
1202 | |||
1203 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | 1114 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) |
1204 | { | 1115 | { |
1205 | pgd_t * pgd = pgd_offset(mm, addr); | 1116 | pgd_t * pgd = pgd_offset(mm, addr); |
@@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1700 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1611 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
1701 | entry = pte_mkyoung(orig_pte); | 1612 | entry = pte_mkyoung(orig_pte); |
1702 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1613 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1703 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) { | 1614 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
1704 | update_mmu_cache(vma, address, entry); | 1615 | update_mmu_cache(vma, address, entry); |
1705 | lazy_mmu_prot_update(entry); | ||
1706 | } | ||
1707 | ret |= VM_FAULT_WRITE; | 1616 | ret |= VM_FAULT_WRITE; |
1708 | goto unlock; | 1617 | goto unlock; |
1709 | } | 1618 | } |
@@ -1717,16 +1626,11 @@ gotten: | |||
1717 | 1626 | ||
1718 | if (unlikely(anon_vma_prepare(vma))) | 1627 | if (unlikely(anon_vma_prepare(vma))) |
1719 | goto oom; | 1628 | goto oom; |
1720 | if (old_page == ZERO_PAGE(address)) { | 1629 | VM_BUG_ON(old_page == ZERO_PAGE(0)); |
1721 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 1630 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1722 | if (!new_page) | 1631 | if (!new_page) |
1723 | goto oom; | 1632 | goto oom; |
1724 | } else { | 1633 | cow_user_page(new_page, old_page, address, vma); |
1725 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
1726 | if (!new_page) | ||
1727 | goto oom; | ||
1728 | cow_user_page(new_page, old_page, address, vma); | ||
1729 | } | ||
1730 | 1634 | ||
1731 | /* | 1635 | /* |
1732 | * Re-check the pte - we dropped the lock | 1636 | * Re-check the pte - we dropped the lock |
@@ -1744,7 +1648,6 @@ gotten: | |||
1744 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1648 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
1745 | entry = mk_pte(new_page, vma->vm_page_prot); | 1649 | entry = mk_pte(new_page, vma->vm_page_prot); |
1746 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1650 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1747 | lazy_mmu_prot_update(entry); | ||
1748 | /* | 1651 | /* |
1749 | * Clear the pte entry and flush it first, before updating the | 1652 | * Clear the pte entry and flush it first, before updating the |
1750 | * pte with the new entry. This will avoid a race condition | 1653 | * pte with the new entry. This will avoid a race condition |
@@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2252 | spinlock_t *ptl; | 2155 | spinlock_t *ptl; |
2253 | pte_t entry; | 2156 | pte_t entry; |
2254 | 2157 | ||
2255 | if (write_access) { | 2158 | /* Allocate our own private page. */ |
2256 | /* Allocate our own private page. */ | 2159 | pte_unmap(page_table); |
2257 | pte_unmap(page_table); | ||
2258 | |||
2259 | if (unlikely(anon_vma_prepare(vma))) | ||
2260 | goto oom; | ||
2261 | page = alloc_zeroed_user_highpage_movable(vma, address); | ||
2262 | if (!page) | ||
2263 | goto oom; | ||
2264 | |||
2265 | entry = mk_pte(page, vma->vm_page_prot); | ||
2266 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
2267 | 2160 | ||
2268 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2161 | if (unlikely(anon_vma_prepare(vma))) |
2269 | if (!pte_none(*page_table)) | 2162 | goto oom; |
2270 | goto release; | 2163 | page = alloc_zeroed_user_highpage_movable(vma, address); |
2271 | inc_mm_counter(mm, anon_rss); | 2164 | if (!page) |
2272 | lru_cache_add_active(page); | 2165 | goto oom; |
2273 | page_add_new_anon_rmap(page, vma, address); | ||
2274 | } else { | ||
2275 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | ||
2276 | page = ZERO_PAGE(address); | ||
2277 | page_cache_get(page); | ||
2278 | entry = mk_pte(page, vma->vm_page_prot); | ||
2279 | 2166 | ||
2280 | ptl = pte_lockptr(mm, pmd); | 2167 | entry = mk_pte(page, vma->vm_page_prot); |
2281 | spin_lock(ptl); | 2168 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2282 | if (!pte_none(*page_table)) | ||
2283 | goto release; | ||
2284 | inc_mm_counter(mm, file_rss); | ||
2285 | page_add_file_rmap(page); | ||
2286 | } | ||
2287 | 2169 | ||
2170 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2171 | if (!pte_none(*page_table)) | ||
2172 | goto release; | ||
2173 | inc_mm_counter(mm, anon_rss); | ||
2174 | lru_cache_add_active(page); | ||
2175 | page_add_new_anon_rmap(page, vma, address); | ||
2288 | set_pte_at(mm, address, page_table, entry); | 2176 | set_pte_at(mm, address, page_table, entry); |
2289 | 2177 | ||
2290 | /* No need to invalidate - it was non-present before */ | 2178 | /* No need to invalidate - it was non-present before */ |
2291 | update_mmu_cache(vma, address, entry); | 2179 | update_mmu_cache(vma, address, entry); |
2292 | lazy_mmu_prot_update(entry); | ||
2293 | unlock: | 2180 | unlock: |
2294 | pte_unmap_unlock(page_table, ptl); | 2181 | pte_unmap_unlock(page_table, ptl); |
2295 | return 0; | 2182 | return 0; |
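The reworked do_anonymous_page() keeps the usual fault discipline: drop the pte map, allocate a zeroed page (possibly sleeping), then retake the pte lock and recheck that the slot is still empty before installing it. The same optimistic allocate-then-recheck pattern, reduced to a hypothetical one-slot cache in plain pthreads:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;              /* stands in for the pte we are trying to fill */

/* Allocate outside the lock, then install only if nobody raced us. */
static void *fill_slot(void)
{
        void *new = calloc(1, 4096);    /* zeroed-page allocation stand-in */
        void *ret;

        if (!new)
                return NULL;

        pthread_mutex_lock(&slot_lock);
        if (slot == NULL) {
                slot = new;             /* we won: install our page */
                new = NULL;
        }
        ret = slot;
        pthread_mutex_unlock(&slot_lock);

        free(new);                      /* lost the race: release our copy */
        return ret;
}

int main(void)
{
        printf("slot filled at %p\n", fill_slot());
        return 0;
}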
@@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2442 | 2329 | ||
2443 | /* no need to invalidate: a not-present page won't be cached */ | 2330 | /* no need to invalidate: a not-present page won't be cached */ |
2444 | update_mmu_cache(vma, address, entry); | 2331 | update_mmu_cache(vma, address, entry); |
2445 | lazy_mmu_prot_update(entry); | ||
2446 | } else { | 2332 | } else { |
2447 | if (anon) | 2333 | if (anon) |
2448 | page_cache_release(page); | 2334 | page_cache_release(page); |
@@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2470 | int write_access, pte_t orig_pte) | 2356 | int write_access, pte_t orig_pte) |
2471 | { | 2357 | { |
2472 | pgoff_t pgoff = (((address & PAGE_MASK) | 2358 | pgoff_t pgoff = (((address & PAGE_MASK) |
2473 | - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; | 2359 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
2474 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | 2360 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); |
2475 | 2361 | ||
2476 | pte_unmap(page_table); | 2362 | pte_unmap(page_table); |
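The hunk above derives the linear fault's file offset with PAGE_SHIFT instead of PAGE_CACHE_SHIFT; the two constants are currently equal, but the quantity really is measured in page-sized units from the start of the VMA. A tiny standalone demo of the formula with made-up addresses:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long vm_start = 0x400000;      /* hypothetical mapping start */
        unsigned long vm_pgoff = 16;            /* mapping begins 16 pages into the file */
        unsigned long address = 0x403abc;       /* faulting address inside the VMA */

        unsigned long pgoff = (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT)
                              + vm_pgoff;

        /* 0x403abc rounds down to 0x403000, 3 pages past vm_start, so pgoff = 19 */
        printf("fault maps to file page %lu\n", pgoff);
        return 0;
}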
@@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2614 | entry = pte_mkyoung(entry); | 2500 | entry = pte_mkyoung(entry); |
2615 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2501 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { |
2616 | update_mmu_cache(vma, address, entry); | 2502 | update_mmu_cache(vma, address, entry); |
2617 | lazy_mmu_prot_update(entry); | ||
2618 | } else { | 2503 | } else { |
2619 | /* | 2504 | /* |
2620 | * This is needed only for protection faults but the arch code | 2505 | * This is needed only for protection faults but the arch code |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index df9d554bea30..091b9c6c2529 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -23,6 +23,9 @@ | |||
23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
24 | #include <linux/ioport.h> | 24 | #include <linux/ioport.h> |
25 | #include <linux/cpuset.h> | 25 | #include <linux/cpuset.h> |
26 | #include <linux/delay.h> | ||
27 | #include <linux/migrate.h> | ||
28 | #include <linux/page-isolation.h> | ||
26 | 29 | ||
27 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
28 | 31 | ||
@@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat, | |||
161 | pgdat->node_start_pfn; | 164 | pgdat->node_start_pfn; |
162 | } | 165 | } |
163 | 166 | ||
164 | int online_pages(unsigned long pfn, unsigned long nr_pages) | 167 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
168 | void *arg) | ||
165 | { | 169 | { |
166 | unsigned long i; | 170 | unsigned long i; |
171 | unsigned long onlined_pages = *(unsigned long *)arg; | ||
172 | struct page *page; | ||
173 | if (PageReserved(pfn_to_page(start_pfn))) | ||
174 | for (i = 0; i < nr_pages; i++) { | ||
175 | page = pfn_to_page(start_pfn + i); | ||
176 | online_page(page); | ||
177 | onlined_pages++; | ||
178 | } | ||
179 | *(unsigned long *)arg = onlined_pages; | ||
180 | return 0; | ||
181 | } | ||
182 | |||
183 | |||
184 | int online_pages(unsigned long pfn, unsigned long nr_pages) | ||
185 | { | ||
167 | unsigned long flags; | 186 | unsigned long flags; |
168 | unsigned long onlined_pages = 0; | 187 | unsigned long onlined_pages = 0; |
169 | struct resource res; | ||
170 | u64 section_end; | ||
171 | unsigned long start_pfn; | ||
172 | struct zone *zone; | 188 | struct zone *zone; |
173 | int need_zonelists_rebuild = 0; | 189 | int need_zonelists_rebuild = 0; |
174 | 190 | ||
@@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
191 | if (!populated_zone(zone)) | 207 | if (!populated_zone(zone)) |
192 | need_zonelists_rebuild = 1; | 208 | need_zonelists_rebuild = 1; |
193 | 209 | ||
194 | res.start = (u64)pfn << PAGE_SHIFT; | 210 | walk_memory_resource(pfn, nr_pages, &onlined_pages, |
195 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | 211 | online_pages_range); |
196 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | ||
197 | section_end = res.end; | ||
198 | |||
199 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { | ||
200 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
201 | nr_pages = (unsigned long) | ||
202 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
203 | |||
204 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
205 | /* this region's page is not onlined now */ | ||
206 | for (i = 0; i < nr_pages; i++) { | ||
207 | struct page *page = pfn_to_page(start_pfn + i); | ||
208 | online_page(page); | ||
209 | onlined_pages++; | ||
210 | } | ||
211 | } | ||
212 | |||
213 | res.start = res.end + 1; | ||
214 | res.end = section_end; | ||
215 | } | ||
216 | zone->present_pages += onlined_pages; | 212 | zone->present_pages += onlined_pages; |
217 | zone->zone_pgdat->node_present_pages += onlined_pages; | 213 | zone->zone_pgdat->node_present_pages += onlined_pages; |
218 | 214 | ||
219 | setup_per_zone_pages_min(); | 215 | setup_per_zone_pages_min(); |
216 | if (onlined_pages) { | ||
217 | kswapd_run(zone_to_nid(zone)); | ||
218 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
219 | } | ||
220 | 220 | ||
221 | if (need_zonelists_rebuild) | 221 | if (need_zonelists_rebuild) |
222 | build_all_zonelists(); | 222 | build_all_zonelists(); |
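online_pages() now hands the open-coded resource scan to walk_memory_resource(), passing a callback plus an opaque accumulator. The sketch below invents a walker over a plain array of ranges to show how a callback in the style of online_pages_range() threads its running count through the void * argument:

#include <stdio.h>

struct range { unsigned long start_pfn, nr_pages; };

typedef int (*range_cb)(unsigned long start_pfn, unsigned long nr_pages, void *arg);

/* Stand-in for walk_memory_resource(): call 'cb' for each sub-range. */
static int walk_ranges(const struct range *r, int n, void *arg, range_cb cb)
{
        for (int i = 0; i < n; i++) {
                int ret = cb(r[i].start_pfn, r[i].nr_pages, arg);
                if (ret)
                        return ret;
        }
        return 0;
}

/* Mirrors online_pages_range(): accumulate a count through *arg. */
static int count_pages_cb(unsigned long start_pfn, unsigned long nr_pages, void *arg)
{
        unsigned long *onlined = arg;

        *onlined += nr_pages;           /* the kernel also calls online_page() per page here */
        return 0;
}

int main(void)
{
        struct range system_ram[] = { { 0x10000, 256 }, { 0x10200, 128 } };
        unsigned long onlined = 0;

        walk_ranges(system_ram, 2, &onlined, count_pages_cb);
        printf("onlined %lu pages\n", onlined);         /* 384 */
        return 0;
}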
@@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size) | |||
271 | if (!pgdat) | 271 | if (!pgdat) |
272 | return -ENOMEM; | 272 | return -ENOMEM; |
273 | new_pgdat = 1; | 273 | new_pgdat = 1; |
274 | ret = kswapd_run(nid); | ||
275 | if (ret) | ||
276 | goto error; | ||
277 | } | 274 | } |
278 | 275 | ||
279 | /* call arch's memory hotadd */ | 276 | /* call arch's memory hotadd */ |
@@ -308,3 +305,260 @@ error: | |||
308 | return ret; | 305 | return ret; |
309 | } | 306 | } |
310 | EXPORT_SYMBOL_GPL(add_memory); | 307 | EXPORT_SYMBOL_GPL(add_memory); |
308 | |||
309 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
310 | /* | ||
311 | * Confirm that all pages in the range [start, end) belong to the same zone. | ||
312 | */ | ||
313 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | ||
314 | { | ||
315 | unsigned long pfn; | ||
316 | struct zone *zone = NULL; | ||
317 | struct page *page; | ||
318 | int i; | ||
319 | for (pfn = start_pfn; | ||
320 | pfn < end_pfn; | ||
321 | pfn += MAX_ORDER_NR_PAGES) { | ||
322 | i = 0; | ||
323 | /* This is just a CONFIG_HOLES_IN_ZONE check.*/ | ||
324 | while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) | ||
325 | i++; | ||
326 | if (i == MAX_ORDER_NR_PAGES) | ||
327 | continue; | ||
328 | page = pfn_to_page(pfn + i); | ||
329 | if (zone && page_zone(page) != zone) | ||
330 | return 0; | ||
331 | zone = page_zone(page); | ||
332 | } | ||
333 | return 1; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Scanning pfns is much easier than scanning the LRU list. | ||
338 | * Scan pfns from start to end and return the first LRU page found. | ||
339 | */ | ||
340 | int scan_lru_pages(unsigned long start, unsigned long end) | ||
341 | { | ||
342 | unsigned long pfn; | ||
343 | struct page *page; | ||
344 | for (pfn = start; pfn < end; pfn++) { | ||
345 | if (pfn_valid(pfn)) { | ||
346 | page = pfn_to_page(pfn); | ||
347 | if (PageLRU(page)) | ||
348 | return pfn; | ||
349 | } | ||
350 | } | ||
351 | return 0; | ||
352 | } | ||
353 | |||
354 | static struct page * | ||
355 | hotremove_migrate_alloc(struct page *page, | ||
356 | unsigned long private, | ||
357 | int **x) | ||
358 | { | ||
359 | /* This should be improoooooved!! */ | ||
360 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | ||
361 | } | ||
362 | |||
363 | |||
364 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | ||
365 | static int | ||
366 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | ||
367 | { | ||
368 | unsigned long pfn; | ||
369 | struct page *page; | ||
370 | int move_pages = NR_OFFLINE_AT_ONCE_PAGES; | ||
371 | int not_managed = 0; | ||
372 | int ret = 0; | ||
373 | LIST_HEAD(source); | ||
374 | |||
375 | for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { | ||
376 | if (!pfn_valid(pfn)) | ||
377 | continue; | ||
378 | page = pfn_to_page(pfn); | ||
379 | if (!page_count(page)) | ||
380 | continue; | ||
381 | /* | ||
382 | * We can skip free pages. And we can only deal with pages on | ||
383 | * LRU. | ||
384 | */ | ||
385 | ret = isolate_lru_page(page, &source); | ||
386 | if (!ret) { /* Success */ | ||
387 | move_pages--; | ||
388 | } else { | ||
389 | /* Because we don't hold a big zone->lock, we should | ||
390 | check this again here. */ | ||
391 | if (page_count(page)) | ||
392 | not_managed++; | ||
393 | #ifdef CONFIG_DEBUG_VM | ||
394 | printk(KERN_INFO "removing from LRU failed" | ||
395 | " %lx/%d/%lx\n", | ||
396 | pfn, page_count(page), page->flags); | ||
397 | #endif | ||
398 | } | ||
399 | } | ||
400 | ret = -EBUSY; | ||
401 | if (not_managed) { | ||
402 | if (!list_empty(&source)) | ||
403 | putback_lru_pages(&source); | ||
404 | goto out; | ||
405 | } | ||
406 | ret = 0; | ||
407 | if (list_empty(&source)) | ||
408 | goto out; | ||
409 | /* this function returns # of failed pages */ | ||
410 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | ||
411 | |||
412 | out: | ||
413 | return ret; | ||
414 | } | ||
415 | |||
416 | /* | ||
417 | * remove from free_area[] and mark all as Reserved. | ||
418 | */ | ||
419 | static int | ||
420 | offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, | ||
421 | void *data) | ||
422 | { | ||
423 | __offline_isolated_pages(start, start + nr_pages); | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | static void | ||
428 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
429 | { | ||
430 | walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, | ||
431 | offline_isolated_pages_cb); | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Check that all pages in the range, recorded as memory resources, are isolated. | ||
436 | */ | ||
437 | static int | ||
438 | check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | ||
439 | void *data) | ||
440 | { | ||
441 | int ret; | ||
442 | long offlined = *(long *)data; | ||
443 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | ||
444 | offlined = nr_pages; | ||
445 | if (!ret) | ||
446 | *(long *)data += offlined; | ||
447 | return ret; | ||
448 | } | ||
449 | |||
450 | static long | ||
451 | check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
452 | { | ||
453 | long offlined = 0; | ||
454 | int ret; | ||
455 | |||
456 | ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, | ||
457 | check_pages_isolated_cb); | ||
458 | if (ret < 0) | ||
459 | offlined = (long)ret; | ||
460 | return offlined; | ||
461 | } | ||
462 | |||
463 | extern void drain_all_local_pages(void); | ||
464 | |||
465 | int offline_pages(unsigned long start_pfn, | ||
466 | unsigned long end_pfn, unsigned long timeout) | ||
467 | { | ||
468 | unsigned long pfn, nr_pages, expire; | ||
469 | long offlined_pages; | ||
470 | int ret, drain, retry_max; | ||
471 | struct zone *zone; | ||
472 | |||
473 | BUG_ON(start_pfn >= end_pfn); | ||
474 | /* at least, alignment against pageblock is necessary */ | ||
475 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) | ||
476 | return -EINVAL; | ||
477 | if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) | ||
478 | return -EINVAL; | ||
479 | /* This makes hotplug much easier...and readable. | ||
480 | We assume this for now. */ | ||
481 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | ||
482 | return -EINVAL; | ||
483 | /* set above range as isolated */ | ||
484 | ret = start_isolate_page_range(start_pfn, end_pfn); | ||
485 | if (ret) | ||
486 | return ret; | ||
487 | nr_pages = end_pfn - start_pfn; | ||
488 | pfn = start_pfn; | ||
489 | expire = jiffies + timeout; | ||
490 | drain = 0; | ||
491 | retry_max = 5; | ||
492 | repeat: | ||
493 | /* start memory hot removal */ | ||
494 | ret = -EAGAIN; | ||
495 | if (time_after(jiffies, expire)) | ||
496 | goto failed_removal; | ||
497 | ret = -EINTR; | ||
498 | if (signal_pending(current)) | ||
499 | goto failed_removal; | ||
500 | ret = 0; | ||
501 | if (drain) { | ||
502 | lru_add_drain_all(); | ||
503 | flush_scheduled_work(); | ||
504 | cond_resched(); | ||
505 | drain_all_local_pages(); | ||
506 | } | ||
507 | |||
508 | pfn = scan_lru_pages(start_pfn, end_pfn); | ||
509 | if (pfn) { /* We have page on LRU */ | ||
510 | ret = do_migrate_range(pfn, end_pfn); | ||
511 | if (!ret) { | ||
512 | drain = 1; | ||
513 | goto repeat; | ||
514 | } else { | ||
515 | if (ret < 0) | ||
516 | if (--retry_max == 0) | ||
517 | goto failed_removal; | ||
518 | yield(); | ||
519 | drain = 1; | ||
520 | goto repeat; | ||
521 | } | ||
522 | } | ||
523 | /* drain every zone's lru pagevecs; this is asynchronous... */ | ||
524 | lru_add_drain_all(); | ||
525 | flush_scheduled_work(); | ||
526 | yield(); | ||
527 | /* drain pcp pages; this is synchronous. */ | ||
528 | drain_all_local_pages(); | ||
529 | /* check again */ | ||
530 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | ||
531 | if (offlined_pages < 0) { | ||
532 | ret = -EBUSY; | ||
533 | goto failed_removal; | ||
534 | } | ||
535 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); | ||
536 | /* Ok, all of our target range is isolated. | ||
537 | We cannot do rollback at this point. */ | ||
538 | offline_isolated_pages(start_pfn, end_pfn); | ||
539 | /* reset pagetype flags */ | ||
540 | start_isolate_page_range(start_pfn, end_pfn); | ||
541 | /* removal success */ | ||
542 | zone = page_zone(pfn_to_page(start_pfn)); | ||
543 | zone->present_pages -= offlined_pages; | ||
544 | zone->zone_pgdat->node_present_pages -= offlined_pages; | ||
545 | totalram_pages -= offlined_pages; | ||
546 | num_physpages -= offlined_pages; | ||
547 | vm_total_pages = nr_free_pagecache_pages(); | ||
548 | writeback_set_ratelimit(); | ||
549 | return 0; | ||
550 | |||
551 | failed_removal: | ||
552 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | ||
553 | start_pfn, end_pfn); | ||
554 | /* pushback to free area */ | ||
555 | undo_isolate_page_range(start_pfn, end_pfn); | ||
556 | return ret; | ||
557 | } | ||
558 | #else | ||
559 | int remove_memory(u64 start, u64 size) | ||
560 | { | ||
561 | return -EINVAL; | ||
562 | } | ||
563 | EXPORT_SYMBOL_GPL(remove_memory); | ||
564 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
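test_pages_in_a_zone() above samples one valid pfn per MAX_ORDER_NR_PAGES block and requires every sample to land in the same zone. A userspace sketch of that sampling strategy, with the zone of each pfn faked by a lookup and the pfn-hole handling omitted:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      8       /* small value, just for the demo */

/* Fake zone lookup: pfns 0..31 in zone 0, 32..63 in zone 1. */
static int zone_of(unsigned long pfn)
{
        return pfn < 32 ? 0 : 1;
}

/* Mirrors test_pages_in_a_zone(): sample one pfn per block and insist
 * that every sample belongs to the same zone. */
static int range_in_one_zone(unsigned long start_pfn, unsigned long end_pfn)
{
        int zone = -1;

        for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) {
                int z = zone_of(pfn);

                if (zone >= 0 && z != zone)
                        return 0;
                zone = z;
        }
        return 1;
}

int main(void)
{
        printf("[8, 32)  in one zone: %d\n", range_in_one_zone(8, 32));         /* 1 */
        printf("[24, 48) in one zone: %d\n", range_in_one_zone(24, 48));        /* 0 */
        return 0;
}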
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d6ac9505d07..568152ae6caf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -72,7 +72,6 @@ | |||
72 | #include <linux/hugetlb.h> | 72 | #include <linux/hugetlb.h> |
73 | #include <linux/kernel.h> | 73 | #include <linux/kernel.h> |
74 | #include <linux/sched.h> | 74 | #include <linux/sched.h> |
75 | #include <linux/mm.h> | ||
76 | #include <linux/nodemask.h> | 75 | #include <linux/nodemask.h> |
77 | #include <linux/cpuset.h> | 76 | #include <linux/cpuset.h> |
78 | #include <linux/gfp.h> | 77 | #include <linux/gfp.h> |
@@ -82,13 +81,13 @@ | |||
82 | #include <linux/interrupt.h> | 81 | #include <linux/interrupt.h> |
83 | #include <linux/init.h> | 82 | #include <linux/init.h> |
84 | #include <linux/compat.h> | 83 | #include <linux/compat.h> |
85 | #include <linux/mempolicy.h> | ||
86 | #include <linux/swap.h> | 84 | #include <linux/swap.h> |
87 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
90 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
91 | #include <linux/security.h> | 89 | #include <linux/security.h> |
90 | #include <linux/syscalls.h> | ||
92 | 91 | ||
93 | #include <asm/tlbflush.h> | 92 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 93 | #include <asm/uaccess.h> |
@@ -110,6 +109,9 @@ struct mempolicy default_policy = { | |||
110 | .policy = MPOL_DEFAULT, | 109 | .policy = MPOL_DEFAULT, |
111 | }; | 110 | }; |
112 | 111 | ||
112 | static void mpol_rebind_policy(struct mempolicy *pol, | ||
113 | const nodemask_t *newmask); | ||
114 | |||
113 | /* Do sanity checking on a policy */ | 115 | /* Do sanity checking on a policy */ |
114 | static int mpol_check_policy(int mode, nodemask_t *nodes) | 116 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
115 | { | 117 | { |
@@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
128 | return -EINVAL; | 130 | return -EINVAL; |
129 | break; | 131 | break; |
130 | } | 132 | } |
131 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; | 133 | return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL; |
132 | } | 134 | } |
133 | 135 | ||
134 | /* Generate a custom zonelist for the BIND policy. */ | 136 | /* Generate a custom zonelist for the BIND policy. */ |
@@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
185 | switch (mode) { | 187 | switch (mode) { |
186 | case MPOL_INTERLEAVE: | 188 | case MPOL_INTERLEAVE: |
187 | policy->v.nodes = *nodes; | 189 | policy->v.nodes = *nodes; |
188 | if (nodes_weight(*nodes) == 0) { | 190 | nodes_and(policy->v.nodes, policy->v.nodes, |
191 | node_states[N_HIGH_MEMORY]); | ||
192 | if (nodes_weight(policy->v.nodes) == 0) { | ||
189 | kmem_cache_free(policy_cache, policy); | 193 | kmem_cache_free(policy_cache, policy); |
190 | return ERR_PTR(-EINVAL); | 194 | return ERR_PTR(-EINVAL); |
191 | } | 195 | } |
@@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void) | |||
459 | } | 463 | } |
460 | 464 | ||
461 | /* Set the process memory policy */ | 465 | /* Set the process memory policy */ |
462 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 466 | static long do_set_mempolicy(int mode, nodemask_t *nodes) |
463 | { | 467 | { |
464 | struct mempolicy *new; | 468 | struct mempolicy *new; |
465 | 469 | ||
@@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | |||
494 | *nodes = p->v.nodes; | 498 | *nodes = p->v.nodes; |
495 | break; | 499 | break; |
496 | case MPOL_PREFERRED: | 500 | case MPOL_PREFERRED: |
497 | /* or use current node instead of online map? */ | 501 | /* or use current node instead of memory_map? */ |
498 | if (p->v.preferred_node < 0) | 502 | if (p->v.preferred_node < 0) |
499 | *nodes = node_online_map; | 503 | *nodes = node_states[N_HIGH_MEMORY]; |
500 | else | 504 | else |
501 | node_set(p->v.preferred_node, *nodes); | 505 | node_set(p->v.preferred_node, *nodes); |
502 | break; | 506 | break; |
@@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
519 | } | 523 | } |
520 | 524 | ||
521 | /* Retrieve NUMA policy */ | 525 | /* Retrieve NUMA policy */ |
522 | long do_get_mempolicy(int *policy, nodemask_t *nmask, | 526 | static long do_get_mempolicy(int *policy, nodemask_t *nmask, |
523 | unsigned long addr, unsigned long flags) | 527 | unsigned long addr, unsigned long flags) |
524 | { | 528 | { |
525 | int err; | 529 | int err; |
526 | struct mm_struct *mm = current->mm; | 530 | struct mm_struct *mm = current->mm; |
@@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
528 | struct mempolicy *pol = current->mempolicy; | 532 | struct mempolicy *pol = current->mempolicy; |
529 | 533 | ||
530 | cpuset_update_task_memory_state(); | 534 | cpuset_update_task_memory_state(); |
531 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 535 | if (flags & |
536 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | ||
532 | return -EINVAL; | 537 | return -EINVAL; |
538 | |||
539 | if (flags & MPOL_F_MEMS_ALLOWED) { | ||
540 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | ||
541 | return -EINVAL; | ||
542 | *policy = 0; /* just so it's initialized */ | ||
543 | *nmask = cpuset_current_mems_allowed; | ||
544 | return 0; | ||
545 | } | ||
546 | |||
533 | if (flags & MPOL_F_ADDR) { | 547 | if (flags & MPOL_F_ADDR) { |
534 | down_read(&mm->mmap_sem); | 548 | down_read(&mm->mmap_sem); |
535 | vma = find_vma_intersection(mm, addr, addr+1); | 549 | vma = find_vma_intersection(mm, addr, addr+1); |
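The do_get_mempolicy() change adds MPOL_F_MEMS_ALLOWED and rejects it when combined with MPOL_F_NODE or MPOL_F_ADDR. The validation style (mask off the known bits, then enforce mutual exclusion) is compact enough to show standalone; the flag values below are illustrative, not the real uapi constants:

#include <stdio.h>
#include <errno.h>

#define F_NODE          0x1     /* illustrative values only */
#define F_ADDR          0x2
#define F_MEMS_ALLOWED  0x4

static int check_flags(unsigned long flags)
{
        /* reject any bit we do not know about */
        if (flags & ~(unsigned long)(F_NODE | F_ADDR | F_MEMS_ALLOWED))
                return -EINVAL;
        /* MEMS_ALLOWED is a standalone query */
        if ((flags & F_MEMS_ALLOWED) && (flags & (F_NODE | F_ADDR)))
                return -EINVAL;
        return 0;
}

int main(void)
{
        printf("MEMS_ALLOWED alone: %d\n", check_flags(F_MEMS_ALLOWED));          /* 0 */
        printf("MEMS_ALLOWED|ADDR:  %d\n", check_flags(F_MEMS_ALLOWED | F_ADDR)); /* -EINVAL */
        printf("unknown bit:        %d\n", check_flags(0x10));                    /* -EINVAL */
        return 0;
}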
@@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
601 | * Migrate pages from one node to a target node. | 615 | * Migrate pages from one node to a target node. |
602 | * Returns error or the number of pages not migrated. | 616 | * Returns error or the number of pages not migrated. |
603 | */ | 617 | */ |
604 | int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | 618 | static int migrate_to_node(struct mm_struct *mm, int source, int dest, |
619 | int flags) | ||
605 | { | 620 | { |
606 | nodemask_t nmask; | 621 | nodemask_t nmask; |
607 | LIST_HEAD(pagelist); | 622 | LIST_HEAD(pagelist); |
@@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
732 | } | 747 | } |
733 | #endif | 748 | #endif |
734 | 749 | ||
735 | long do_mbind(unsigned long start, unsigned long len, | 750 | static long do_mbind(unsigned long start, unsigned long len, |
736 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 751 | unsigned long mode, nodemask_t *nmask, |
752 | unsigned long flags) | ||
737 | { | 753 | { |
738 | struct vm_area_struct *vma; | 754 | struct vm_area_struct *vma; |
739 | struct mm_struct *mm = current->mm; | 755 | struct mm_struct *mm = current->mm; |
@@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
955 | goto out; | 971 | goto out; |
956 | } | 972 | } |
957 | 973 | ||
958 | if (!nodes_subset(new, node_online_map)) { | 974 | if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { |
959 | err = -EINVAL; | 975 | err = -EINVAL; |
960 | goto out; | 976 | goto out; |
961 | } | 977 | } |
@@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
978 | unsigned long maxnode, | 994 | unsigned long maxnode, |
979 | unsigned long addr, unsigned long flags) | 995 | unsigned long addr, unsigned long flags) |
980 | { | 996 | { |
981 | int err, pval; | 997 | int err; |
998 | int uninitialized_var(pval); | ||
982 | nodemask_t nodes; | 999 | nodemask_t nodes; |
983 | 1000 | ||
984 | if (nmask != NULL && maxnode < MAX_NUMNODES) | 1001 | if (nmask != NULL && maxnode < MAX_NUMNODES) |
@@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) | |||
1527 | kmem_cache_free(sn_cache, n); | 1544 | kmem_cache_free(sn_cache, n); |
1528 | } | 1545 | } |
1529 | 1546 | ||
1530 | struct sp_node * | 1547 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
1531 | sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) | 1548 | struct mempolicy *pol) |
1532 | { | 1549 | { |
1533 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 1550 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); |
1534 | 1551 | ||
@@ -1677,7 +1694,7 @@ void __init numa_policy_init(void) | |||
1677 | * fall back to the largest node if they're all smaller. | 1694 | * fall back to the largest node if they're all smaller. |
1678 | */ | 1695 | */ |
1679 | nodes_clear(interleave_nodes); | 1696 | nodes_clear(interleave_nodes); |
1680 | for_each_online_node(nid) { | 1697 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1681 | unsigned long total_pages = node_present_pages(nid); | 1698 | unsigned long total_pages = node_present_pages(nid); |
1682 | 1699 | ||
1683 | /* Preserve the largest node */ | 1700 | /* Preserve the largest node */ |
@@ -1706,7 +1723,8 @@ void numa_default_policy(void) | |||
1706 | } | 1723 | } |
1707 | 1724 | ||
1708 | /* Migrate a policy to a different set of nodes */ | 1725 | /* Migrate a policy to a different set of nodes */ |
1709 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | 1726 | static void mpol_rebind_policy(struct mempolicy *pol, |
1727 | const nodemask_t *newmask) | ||
1710 | { | 1728 | { |
1711 | nodemask_t *mpolmask; | 1729 | nodemask_t *mpolmask; |
1712 | nodemask_t tmp; | 1730 | nodemask_t tmp; |
@@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1963 | seq_printf(m, " huge"); | 1981 | seq_printf(m, " huge"); |
1964 | } else { | 1982 | } else { |
1965 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | 1983 | check_pgd_range(vma, vma->vm_start, vma->vm_end, |
1966 | &node_online_map, MPOL_MF_STATS, md); | 1984 | &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); |
1967 | } | 1985 | } |
1968 | 1986 | ||
1969 | if (!md->pages) | 1987 | if (!md->pages) |
@@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1990 | if (md->writeback) | 2008 | if (md->writeback) |
1991 | seq_printf(m," writeback=%lu", md->writeback); | 2009 | seq_printf(m," writeback=%lu", md->writeback); |
1992 | 2010 | ||
1993 | for_each_online_node(n) | 2011 | for_each_node_state(n, N_HIGH_MEMORY) |
1994 | if (md->node[n]) | 2012 | if (md->node[n]) |
1995 | seq_printf(m, " N%d=%lu", n, md->node[n]); | 2013 | seq_printf(m, " N%d=%lu", n, md->node[n]); |
1996 | out: | 2014 | out: |
diff --git a/mm/migrate.c b/mm/migrate.c index 07f22d4a431f..06d0877a66ef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
172 | if (is_write_migration_entry(entry)) | 172 | if (is_write_migration_entry(entry)) |
173 | pte = pte_mkwrite(pte); | 173 | pte = pte_mkwrite(pte); |
174 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
174 | set_pte_at(mm, addr, ptep, pte); | 175 | set_pte_at(mm, addr, ptep, pte); |
175 | 176 | ||
176 | if (PageAnon(new)) | 177 | if (PageAnon(new)) |
@@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
180 | 181 | ||
181 | /* No need to invalidate - it was non-present before */ | 182 | /* No need to invalidate - it was non-present before */ |
182 | update_mmu_cache(vma, addr, pte); | 183 | update_mmu_cache(vma, addr, pte); |
183 | lazy_mmu_prot_update(pte); | ||
184 | 184 | ||
185 | out: | 185 | out: |
186 | pte_unmap_unlock(ptep, ptl); | 186 | pte_unmap_unlock(ptep, ptl); |
@@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
986 | goto out; | 986 | goto out; |
987 | 987 | ||
988 | err = -ENODEV; | 988 | err = -ENODEV; |
989 | if (!node_online(node)) | 989 | if (!node_state(node, N_HIGH_MEMORY)) |
990 | goto out; | 990 | goto out; |
991 | 991 | ||
992 | err = -EACCES; | 992 | err = -EACCES; |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8346c30abec..1d4d69790e59 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
53 | if (dirty_accountable && pte_dirty(ptent)) | 53 | if (dirty_accountable && pte_dirty(ptent)) |
54 | ptent = pte_mkwrite(ptent); | 54 | ptent = pte_mkwrite(ptent); |
55 | set_pte_at(mm, addr, pte, ptent); | 55 | set_pte_at(mm, addr, pte, ptent); |
56 | lazy_mmu_prot_update(ptent); | ||
57 | #ifdef CONFIG_MIGRATION | 56 | #ifdef CONFIG_MIGRATION |
58 | } else if (!pte_file(oldpte)) { | 57 | } else if (!pte_file(oldpte)) { |
59 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 58 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9b82ad5047f..41b4e362221d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -177,14 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
177 | { | 177 | { |
178 | #ifdef CONFIG_NUMA | 178 | #ifdef CONFIG_NUMA |
179 | struct zone **z; | 179 | struct zone **z; |
180 | nodemask_t nodes; | 180 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; |
181 | int node; | ||
182 | |||
183 | nodes_clear(nodes); | ||
184 | /* node has memory ? */ | ||
185 | for_each_online_node(node) | ||
186 | if (NODE_DATA(node)->node_present_pages) | ||
187 | node_set(node, nodes); | ||
188 | 181 | ||
189 | for (z = zonelist->zones; *z; z++) | 182 | for (z = zonelist->zones; *z; z++) |
190 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) | 183 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 44720363374c..d821321326e3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -126,7 +126,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
126 | int node; | 126 | int node; |
127 | unsigned long x = 0; | 127 | unsigned long x = 0; |
128 | 128 | ||
129 | for_each_online_node(node) { | 129 | for_each_node_state(node, N_HIGH_MEMORY) { |
130 | struct zone *z = | 130 | struct zone *z = |
131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
132 | 132 | ||
@@ -1022,17 +1022,15 @@ int test_set_page_writeback(struct page *page) | |||
1022 | EXPORT_SYMBOL(test_set_page_writeback); | 1022 | EXPORT_SYMBOL(test_set_page_writeback); |
1023 | 1023 | ||
1024 | /* | 1024 | /* |
1025 | * Return true if any of the pages in the mapping are marged with the | 1025 | * Return true if any of the pages in the mapping are marked with the |
1026 | * passed tag. | 1026 | * passed tag. |
1027 | */ | 1027 | */ |
1028 | int mapping_tagged(struct address_space *mapping, int tag) | 1028 | int mapping_tagged(struct address_space *mapping, int tag) |
1029 | { | 1029 | { |
1030 | unsigned long flags; | ||
1031 | int ret; | 1030 | int ret; |
1032 | 1031 | rcu_read_lock(); | |
1033 | read_lock_irqsave(&mapping->tree_lock, flags); | ||
1034 | ret = radix_tree_tagged(&mapping->page_tree, tag); | 1032 | ret = radix_tree_tagged(&mapping->page_tree, tag); |
1035 | read_unlock_irqrestore(&mapping->tree_lock, flags); | 1033 | rcu_read_unlock(); |
1036 | return ret; | 1034 | return ret; |
1037 | } | 1035 | } |
1038 | EXPORT_SYMBOL(mapping_tagged); | 1036 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a8c59571cb7..d315e1127dc9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -41,24 +41,37 @@ | |||
41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
43 | #include <linux/fault-inject.h> | 43 | #include <linux/fault-inject.h> |
44 | #include <linux/page-isolation.h> | ||
44 | 45 | ||
45 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
46 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
47 | #include "internal.h" | 48 | #include "internal.h" |
48 | 49 | ||
49 | /* | 50 | /* |
50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 51 | * Array of node states. |
51 | * initializer cleaner | ||
52 | */ | 52 | */ |
53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 53 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
54 | EXPORT_SYMBOL(node_online_map); | 54 | [N_POSSIBLE] = NODE_MASK_ALL, |
55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 55 | [N_ONLINE] = { { [0] = 1UL } }, |
56 | EXPORT_SYMBOL(node_possible_map); | 56 | #ifndef CONFIG_NUMA |
57 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | ||
58 | #ifdef CONFIG_HIGHMEM | ||
59 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | ||
60 | #endif | ||
61 | [N_CPU] = { { [0] = 1UL } }, | ||
62 | #endif /* NUMA */ | ||
63 | }; | ||
64 | EXPORT_SYMBOL(node_states); | ||
65 | |||
57 | unsigned long totalram_pages __read_mostly; | 66 | unsigned long totalram_pages __read_mostly; |
58 | unsigned long totalreserve_pages __read_mostly; | 67 | unsigned long totalreserve_pages __read_mostly; |
59 | long nr_swap_pages; | 68 | long nr_swap_pages; |
60 | int percpu_pagelist_fraction; | 69 | int percpu_pagelist_fraction; |
61 | 70 | ||
71 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
72 | int pageblock_order __read_mostly; | ||
73 | #endif | ||
74 | |||
62 | static void __free_pages_ok(struct page *page, unsigned int order); | 75 | static void __free_pages_ok(struct page *page, unsigned int order); |
63 | 76 | ||
64 | /* | 77 | /* |
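The node_states[] hunk above replaces the separate node_online_map/node_possible_map globals with a single array of nodemasks indexed by node state, so callers can ask "which nodes have memory?" rather than only "which nodes are online?". A minimal userspace sketch of that idea, using a plain bitmask per state instead of the kernel's nodemask_t (all names below are illustrative, not the kernel API):

#include <stdio.h>

/* Simplified model: one bitmask per node state (illustrative names only). */
enum node_state_model { N_POSSIBLE_M, N_ONLINE_M, N_HIGH_MEMORY_M, N_CPU_M, NR_STATES_M };

static unsigned long state_mask[NR_STATES_M];	/* supports up to 64 nodes */

static void set_state(int nid, enum node_state_model s) { state_mask[s] |= 1UL << nid; }
static int  has_state(int nid, enum node_state_model s) { return !!(state_mask[s] & (1UL << nid)); }

int main(void)
{
	int nid;

	set_state(0, N_ONLINE_M);
	set_state(0, N_HIGH_MEMORY_M);	/* node 0: online and has memory */
	set_state(1, N_ONLINE_M);	/* node 1: online but memoryless */

	/* Walk only nodes that actually have memory, as the patched callers
	 * do with for_each_node_state(nid, N_HIGH_MEMORY). */
	for (nid = 0; nid < 2; nid++)
		if (has_state(nid, N_HIGH_MEMORY_M))
			printf("node %d has memory\n", nid);
	return 0;
}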
@@ -137,7 +150,7 @@ static unsigned long __meminitdata dma_reserve; | |||
137 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 150 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
138 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 151 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
139 | unsigned long __initdata required_kernelcore; | 152 | unsigned long __initdata required_kernelcore; |
140 | unsigned long __initdata required_movablecore; | 153 | static unsigned long __initdata required_movablecore; |
141 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 154 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
142 | 155 | ||
143 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 156 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
@@ -150,6 +163,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES; | |||
150 | EXPORT_SYMBOL(nr_node_ids); | 163 | EXPORT_SYMBOL(nr_node_ids); |
151 | #endif | 164 | #endif |
152 | 165 | ||
166 | int page_group_by_mobility_disabled __read_mostly; | ||
167 | |||
168 | static void set_pageblock_migratetype(struct page *page, int migratetype) | ||
169 | { | ||
170 | set_pageblock_flags_group(page, (unsigned long)migratetype, | ||
171 | PB_migrate, PB_migrate_end); | ||
172 | } | ||
173 | |||
153 | #ifdef CONFIG_DEBUG_VM | 174 | #ifdef CONFIG_DEBUG_VM |
154 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 175 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
155 | { | 176 | { |
@@ -293,16 +314,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
293 | clear_highpage(page + i); | 314 | clear_highpage(page + i); |
294 | } | 315 | } |
295 | 316 | ||
296 | /* | ||
297 | * function for dealing with page's order in buddy system. | ||
298 | * zone->lock is already acquired when we use these. | ||
299 | * So, we don't need atomic page->flags operations here. | ||
300 | */ | ||
301 | static inline unsigned long page_order(struct page *page) | ||
302 | { | ||
303 | return page_private(page); | ||
304 | } | ||
305 | |||
306 | static inline void set_page_order(struct page *page, int order) | 317 | static inline void set_page_order(struct page *page, int order) |
307 | { | 318 | { |
308 | set_page_private(page, order); | 319 | set_page_private(page, order); |
@@ -404,6 +415,7 @@ static inline void __free_one_page(struct page *page, | |||
404 | { | 415 | { |
405 | unsigned long page_idx; | 416 | unsigned long page_idx; |
406 | int order_size = 1 << order; | 417 | int order_size = 1 << order; |
418 | int migratetype = get_pageblock_migratetype(page); | ||
407 | 419 | ||
408 | if (unlikely(PageCompound(page))) | 420 | if (unlikely(PageCompound(page))) |
409 | destroy_compound_page(page, order); | 421 | destroy_compound_page(page, order); |
@@ -416,7 +428,6 @@ static inline void __free_one_page(struct page *page, | |||
416 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 428 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
417 | while (order < MAX_ORDER-1) { | 429 | while (order < MAX_ORDER-1) { |
418 | unsigned long combined_idx; | 430 | unsigned long combined_idx; |
419 | struct free_area *area; | ||
420 | struct page *buddy; | 431 | struct page *buddy; |
421 | 432 | ||
422 | buddy = __page_find_buddy(page, page_idx, order); | 433 | buddy = __page_find_buddy(page, page_idx, order); |
@@ -424,8 +435,7 @@ static inline void __free_one_page(struct page *page, | |||
424 | break; /* Move the buddy up one level. */ | 435 | break; /* Move the buddy up one level. */ |
425 | 436 | ||
426 | list_del(&buddy->lru); | 437 | list_del(&buddy->lru); |
427 | area = zone->free_area + order; | 438 | zone->free_area[order].nr_free--; |
428 | area->nr_free--; | ||
429 | rmv_page_order(buddy); | 439 | rmv_page_order(buddy); |
430 | combined_idx = __find_combined_index(page_idx, order); | 440 | combined_idx = __find_combined_index(page_idx, order); |
431 | page = page + (combined_idx - page_idx); | 441 | page = page + (combined_idx - page_idx); |
@@ -433,7 +443,8 @@ static inline void __free_one_page(struct page *page, | |||
433 | order++; | 443 | order++; |
434 | } | 444 | } |
435 | set_page_order(page, order); | 445 | set_page_order(page, order); |
436 | list_add(&page->lru, &zone->free_area[order].free_list); | 446 | list_add(&page->lru, |
447 | &zone->free_area[order].free_list[migratetype]); | ||
437 | zone->free_area[order].nr_free++; | 448 | zone->free_area[order].nr_free++; |
438 | } | 449 | } |
439 | 450 | ||
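The merge loop above is unchanged apart from placing the merged page on the per-migratetype free list. For reference, the buddy arithmetic the loop relies on (via __page_find_buddy() and __find_combined_index(), not shown in this hunk) is plain bit manipulation of the page index; a standalone sketch with hypothetical helper names, not the kernel functions:

#include <stdio.h>

/* Buddy-index arithmetic used by the allocator (simplified, userspace). */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);	/* flip the bit for this order */
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);	/* lower of block and its buddy */
}

int main(void)
{
	unsigned long idx = 12;			/* block starting at index 12 */
	unsigned int order = 2;			/* 4-page block */

	printf("buddy of %lu at order %u is %lu, merged block starts at %lu\n",
	       idx, order, buddy_index(idx, order),
	       combined_index(idx, order));	/* buddy 8, merged start 8 */
	return 0;
}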
@@ -567,7 +578,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
567 | * -- wli | 578 | * -- wli |
568 | */ | 579 | */ |
569 | static inline void expand(struct zone *zone, struct page *page, | 580 | static inline void expand(struct zone *zone, struct page *page, |
570 | int low, int high, struct free_area *area) | 581 | int low, int high, struct free_area *area, |
582 | int migratetype) | ||
571 | { | 583 | { |
572 | unsigned long size = 1 << high; | 584 | unsigned long size = 1 << high; |
573 | 585 | ||
@@ -576,7 +588,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
576 | high--; | 588 | high--; |
577 | size >>= 1; | 589 | size >>= 1; |
578 | VM_BUG_ON(bad_range(zone, &page[size])); | 590 | VM_BUG_ON(bad_range(zone, &page[size])); |
579 | list_add(&page[size].lru, &area->free_list); | 591 | list_add(&page[size].lru, &area->free_list[migratetype]); |
580 | area->nr_free++; | 592 | area->nr_free++; |
581 | set_page_order(&page[size], high); | 593 | set_page_order(&page[size], high); |
582 | } | 594 | } |
@@ -628,49 +640,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
628 | return 0; | 640 | return 0; |
629 | } | 641 | } |
630 | 642 | ||
631 | /* | 643 | /* |
632 | * Do the hard work of removing an element from the buddy allocator. | 644 | * Go through the free lists for the given migratetype and remove |
633 | * Call me with the zone->lock already held. | 645 | * the smallest available page from the freelists |
634 | */ | 646 | */ |
635 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 647 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
648 | int migratetype) | ||
636 | { | 649 | { |
637 | struct free_area * area; | ||
638 | unsigned int current_order; | 650 | unsigned int current_order; |
651 | struct free_area * area; | ||
639 | struct page *page; | 652 | struct page *page; |
640 | 653 | ||
654 | /* Find a page of the appropriate size in the preferred list */ | ||
641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 655 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
642 | area = zone->free_area + current_order; | 656 | area = &(zone->free_area[current_order]); |
643 | if (list_empty(&area->free_list)) | 657 | if (list_empty(&area->free_list[migratetype])) |
644 | continue; | 658 | continue; |
645 | 659 | ||
646 | page = list_entry(area->free_list.next, struct page, lru); | 660 | page = list_entry(area->free_list[migratetype].next, |
661 | struct page, lru); | ||
647 | list_del(&page->lru); | 662 | list_del(&page->lru); |
648 | rmv_page_order(page); | 663 | rmv_page_order(page); |
649 | area->nr_free--; | 664 | area->nr_free--; |
650 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 665 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
651 | expand(zone, page, order, current_order, area); | 666 | expand(zone, page, order, current_order, area, migratetype); |
652 | return page; | 667 | return page; |
653 | } | 668 | } |
654 | 669 | ||
655 | return NULL; | 670 | return NULL; |
656 | } | 671 | } |
657 | 672 | ||
673 | |||
674 | /* | ||
675 | * This array describes the order in which other free lists are fallen back | ||
676 | * on when the free lists for the desired migrate type are depleted | ||
677 | */ | ||
678 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | ||
679 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
680 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
681 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
682 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | ||
683 | }; | ||
684 | |||
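Given the fallbacks[] table above, the allocator tries each alternative migrate type in order until a free page turns up. A compact userspace sketch of that lookup order (the table values mirror the hunk; everything else here is illustrative):

#include <stdio.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, TYPES };

static const char *name[TYPES] = { "unmovable", "reclaimable", "movable", "reserve" };

/* Same fallback order as the kernel table in the hunk above. */
static const int fb[TYPES][TYPES - 1] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE,   RESERVE },
	[RECLAIMABLE] = { UNMOVABLE,   MOVABLE,   RESERVE },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
	[RESERVE]     = { RESERVE,     RESERVE,   RESERVE },
};

int main(void)
{
	int start = RECLAIMABLE, i;

	/* Print the order in which __rmqueue_fallback() would try the lists. */
	printf("allocation for %s falls back to:", name[start]);
	for (i = 0; i < TYPES - 1; i++)
		printf(" %s", name[fb[start][i]]);
	printf("\n");
	return 0;
}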
685 | /* | ||
686 | * Move the free pages in a range to the free lists of the requested type. | ||
687 | * Note that start_page and end_page are not aligned on a pageblock | ||
688 | * boundary. If alignment is required, use move_freepages_block() | ||
689 | */ | ||
690 | int move_freepages(struct zone *zone, | ||
691 | struct page *start_page, struct page *end_page, | ||
692 | int migratetype) | ||
693 | { | ||
694 | struct page *page; | ||
695 | unsigned long order; | ||
696 | int pages_moved = 0; | ||
697 | |||
698 | #ifndef CONFIG_HOLES_IN_ZONE | ||
699 | /* | ||
700 | * page_zone is not safe to call in this context when | ||
701 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | ||
702 | * anyway as we check zone boundaries in move_freepages_block(). | ||
703 | * Remove at a later date when no bug reports exist related to | ||
704 | * grouping pages by mobility | ||
705 | */ | ||
706 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | ||
707 | #endif | ||
708 | |||
709 | for (page = start_page; page <= end_page;) { | ||
710 | if (!pfn_valid_within(page_to_pfn(page))) { | ||
711 | page++; | ||
712 | continue; | ||
713 | } | ||
714 | |||
715 | if (!PageBuddy(page)) { | ||
716 | page++; | ||
717 | continue; | ||
718 | } | ||
719 | |||
720 | order = page_order(page); | ||
721 | list_del(&page->lru); | ||
722 | list_add(&page->lru, | ||
723 | &zone->free_area[order].free_list[migratetype]); | ||
724 | page += 1 << order; | ||
725 | pages_moved += 1 << order; | ||
726 | } | ||
727 | |||
728 | return pages_moved; | ||
729 | } | ||
730 | |||
731 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | ||
732 | { | ||
733 | unsigned long start_pfn, end_pfn; | ||
734 | struct page *start_page, *end_page; | ||
735 | |||
736 | start_pfn = page_to_pfn(page); | ||
737 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | ||
738 | start_page = pfn_to_page(start_pfn); | ||
739 | end_page = start_page + pageblock_nr_pages - 1; | ||
740 | end_pfn = start_pfn + pageblock_nr_pages - 1; | ||
741 | |||
742 | /* Do not cross zone boundaries */ | ||
743 | if (start_pfn < zone->zone_start_pfn) | ||
744 | start_page = page; | ||
745 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
746 | return 0; | ||
747 | |||
748 | return move_freepages(zone, start_page, end_page, migratetype); | ||
749 | } | ||
750 | |||
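move_freepages_block() above rounds the pfn down to the start of its pageblock with start_pfn & ~(pageblock_nr_pages - 1), which works only because pageblock_nr_pages is a power of two. A quick arithmetic check of that masking, with values chosen purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 1UL << 10;	/* assumed order-10 pageblocks */
	unsigned long pfn = 1234567;

	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* 1234567 rounds down to 1233920 and the block ends at 1234943. */
	printf("pfn %lu lies in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}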
751 | /* Return the page with the lowest PFN in the list */ | ||
752 | static struct page *min_page(struct list_head *list) | ||
753 | { | ||
754 | unsigned long min_pfn = -1UL; | ||
755 | struct page *min_page = NULL, *page; | ||
756 | |||
757 | list_for_each_entry(page, list, lru) { | ||
758 | unsigned long pfn = page_to_pfn(page); | ||
759 | if (pfn < min_pfn) { | ||
760 | min_pfn = pfn; | ||
761 | min_page = page; | ||
762 | } | ||
763 | } | ||
764 | |||
765 | return min_page; | ||
766 | } | ||
767 | |||
768 | /* Remove an element from the buddy allocator from the fallback list */ | ||
769 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | ||
770 | int start_migratetype) | ||
771 | { | ||
772 | struct free_area * area; | ||
773 | int current_order; | ||
774 | struct page *page; | ||
775 | int migratetype, i; | ||
776 | |||
777 | /* Find the largest possible block of pages in the other list */ | ||
778 | for (current_order = MAX_ORDER-1; current_order >= order; | ||
779 | --current_order) { | ||
780 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | ||
781 | migratetype = fallbacks[start_migratetype][i]; | ||
782 | |||
783 | /* MIGRATE_RESERVE handled later if necessary */ | ||
784 | if (migratetype == MIGRATE_RESERVE) | ||
785 | continue; | ||
786 | |||
787 | area = &(zone->free_area[current_order]); | ||
788 | if (list_empty(&area->free_list[migratetype])) | ||
789 | continue; | ||
790 | |||
791 | /* Bias kernel allocations towards low pfns */ | ||
792 | page = list_entry(area->free_list[migratetype].next, | ||
793 | struct page, lru); | ||
794 | if (unlikely(start_migratetype != MIGRATE_MOVABLE)) | ||
795 | page = min_page(&area->free_list[migratetype]); | ||
796 | area->nr_free--; | ||
797 | |||
798 | /* | ||
799 | * If breaking a large block of pages, move all free | ||
800 | * pages to the preferred allocation list. If falling | ||
801 | * back for a reclaimable kernel allocation, be more | ||
802 | * aggressive about taking ownership of free pages | ||
803 | */ | ||
804 | if (unlikely(current_order >= (pageblock_order >> 1)) || | ||
805 | start_migratetype == MIGRATE_RECLAIMABLE) { | ||
806 | unsigned long pages; | ||
807 | pages = move_freepages_block(zone, page, | ||
808 | start_migratetype); | ||
809 | |||
810 | /* Claim the whole block if over half of it is free */ | ||
811 | if (pages >= (1 << (pageblock_order-1))) | ||
812 | set_pageblock_migratetype(page, | ||
813 | start_migratetype); | ||
814 | |||
815 | migratetype = start_migratetype; | ||
816 | } | ||
817 | |||
818 | /* Remove the page from the freelists */ | ||
819 | list_del(&page->lru); | ||
820 | rmv_page_order(page); | ||
821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
822 | -(1UL << order)); | ||
823 | |||
824 | if (current_order == pageblock_order) | ||
825 | set_pageblock_migratetype(page, | ||
826 | start_migratetype); | ||
827 | |||
828 | expand(zone, page, order, current_order, area, migratetype); | ||
829 | return page; | ||
830 | } | ||
831 | } | ||
832 | |||
833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | ||
834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
835 | } | ||
836 | |||
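The fallback path above decides whether to take ownership of the whole pageblock rather than just the page it needed: it moves the block's free pages when the split block is at least half a pageblock, or when the request was for a reclaimable allocation, and it rewrites the block's migrate type once more than half the block's pages have moved. A small sketch of just those decisions (a simplification, not the kernel code):

#include <stdio.h>

#define PAGEBLOCK_ORDER 10	/* assumed */

enum { UNMOVABLE, RECLAIMABLE, MOVABLE };

/* Should the fallback path try to claim the whole pageblock? */
static int should_steal_block(int current_order, int start_migratetype)
{
	return current_order >= (PAGEBLOCK_ORDER >> 1) ||
	       start_migratetype == RECLAIMABLE;
}

/* After moving pages, claim the block's type if over half of it was free. */
static int should_rewrite_blocktype(unsigned long pages_moved)
{
	return pages_moved >= (1UL << (PAGEBLOCK_ORDER - 1));
}

int main(void)
{
	printf("order-9 unmovable fallback steals block: %d\n",
	       should_steal_block(9, UNMOVABLE));		/* 1 */
	printf("order-3 reclaimable fallback steals block: %d\n",
	       should_steal_block(3, RECLAIMABLE));		/* 1 */
	printf("order-3 unmovable fallback steals block: %d\n",
	       should_steal_block(3, UNMOVABLE));		/* 0 */
	printf("600 of 1024 pages moved -> rewrite type: %d\n",
	       should_rewrite_blocktype(600));			/* 1 */
	return 0;
}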
837 | /* | ||
838 | * Do the hard work of removing an element from the buddy allocator. | ||
839 | * Call me with the zone->lock already held. | ||
840 | */ | ||
841 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | ||
842 | int migratetype) | ||
843 | { | ||
844 | struct page *page; | ||
845 | |||
846 | page = __rmqueue_smallest(zone, order, migratetype); | ||
847 | |||
848 | if (unlikely(!page)) | ||
849 | page = __rmqueue_fallback(zone, order, migratetype); | ||
850 | |||
851 | return page; | ||
852 | } | ||
853 | |||
658 | /* | 854 | /* |
659 | * Obtain a specified number of elements from the buddy allocator, all under | 855 | * Obtain a specified number of elements from the buddy allocator, all under |
660 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 856 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
661 | * Returns the number of new pages which were placed at *list. | 857 | * Returns the number of new pages which were placed at *list. |
662 | */ | 858 | */ |
663 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 859 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
664 | unsigned long count, struct list_head *list) | 860 | unsigned long count, struct list_head *list, |
861 | int migratetype) | ||
665 | { | 862 | { |
666 | int i; | 863 | int i; |
667 | 864 | ||
668 | spin_lock(&zone->lock); | 865 | spin_lock(&zone->lock); |
669 | for (i = 0; i < count; ++i) { | 866 | for (i = 0; i < count; ++i) { |
670 | struct page *page = __rmqueue(zone, order); | 867 | struct page *page = __rmqueue(zone, order, migratetype); |
671 | if (unlikely(page == NULL)) | 868 | if (unlikely(page == NULL)) |
672 | break; | 869 | break; |
673 | list_add_tail(&page->lru, list); | 870 | list_add(&page->lru, list); |
871 | set_page_private(page, migratetype); | ||
674 | } | 872 | } |
675 | spin_unlock(&zone->lock); | 873 | spin_unlock(&zone->lock); |
676 | return i; | 874 | return i; |
@@ -732,7 +930,7 @@ void mark_free_pages(struct zone *zone) | |||
732 | { | 930 | { |
733 | unsigned long pfn, max_zone_pfn; | 931 | unsigned long pfn, max_zone_pfn; |
734 | unsigned long flags; | 932 | unsigned long flags; |
735 | int order; | 933 | int order, t; |
736 | struct list_head *curr; | 934 | struct list_head *curr; |
737 | 935 | ||
738 | if (!zone->spanned_pages) | 936 | if (!zone->spanned_pages) |
@@ -749,17 +947,18 @@ void mark_free_pages(struct zone *zone) | |||
749 | swsusp_unset_page_free(page); | 947 | swsusp_unset_page_free(page); |
750 | } | 948 | } |
751 | 949 | ||
752 | for (order = MAX_ORDER - 1; order >= 0; --order) | 950 | for_each_migratetype_order(order, t) { |
753 | list_for_each(curr, &zone->free_area[order].free_list) { | 951 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
754 | unsigned long i; | 952 | unsigned long i; |
755 | 953 | ||
756 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 954 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
757 | for (i = 0; i < (1UL << order); i++) | 955 | for (i = 0; i < (1UL << order); i++) |
758 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 956 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
759 | } | 957 | } |
760 | 958 | } | |
761 | spin_unlock_irqrestore(&zone->lock, flags); | 959 | spin_unlock_irqrestore(&zone->lock, flags); |
762 | } | 960 | } |
961 | #endif /* CONFIG_PM */ | ||
763 | 962 | ||
764 | /* | 963 | /* |
765 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 964 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
@@ -772,7 +971,25 @@ void drain_local_pages(void) | |||
772 | __drain_pages(smp_processor_id()); | 971 | __drain_pages(smp_processor_id()); |
773 | local_irq_restore(flags); | 972 | local_irq_restore(flags); |
774 | } | 973 | } |
775 | #endif /* CONFIG_HIBERNATION */ | 974 | |
975 | void smp_drain_local_pages(void *arg) | ||
976 | { | ||
977 | drain_local_pages(); | ||
978 | } | ||
979 | |||
980 | /* | ||
981 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
982 | */ | ||
983 | void drain_all_local_pages(void) | ||
984 | { | ||
985 | unsigned long flags; | ||
986 | |||
987 | local_irq_save(flags); | ||
988 | __drain_pages(smp_processor_id()); | ||
989 | local_irq_restore(flags); | ||
990 | |||
991 | smp_call_function(smp_drain_local_pages, NULL, 0, 1); | ||
992 | } | ||
776 | 993 | ||
777 | /* | 994 | /* |
778 | * Free a 0-order page | 995 | * Free a 0-order page |
@@ -797,6 +1014,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
797 | local_irq_save(flags); | 1014 | local_irq_save(flags); |
798 | __count_vm_event(PGFREE); | 1015 | __count_vm_event(PGFREE); |
799 | list_add(&page->lru, &pcp->list); | 1016 | list_add(&page->lru, &pcp->list); |
1017 | set_page_private(page, get_pageblock_migratetype(page)); | ||
800 | pcp->count++; | 1018 | pcp->count++; |
801 | if (pcp->count >= pcp->high) { | 1019 | if (pcp->count >= pcp->high) { |
802 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1020 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
@@ -846,6 +1064,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, | |||
846 | struct page *page; | 1064 | struct page *page; |
847 | int cold = !!(gfp_flags & __GFP_COLD); | 1065 | int cold = !!(gfp_flags & __GFP_COLD); |
848 | int cpu; | 1066 | int cpu; |
1067 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
849 | 1068 | ||
850 | again: | 1069 | again: |
851 | cpu = get_cpu(); | 1070 | cpu = get_cpu(); |
@@ -856,16 +1075,28 @@ again: | |||
856 | local_irq_save(flags); | 1075 | local_irq_save(flags); |
857 | if (!pcp->count) { | 1076 | if (!pcp->count) { |
858 | pcp->count = rmqueue_bulk(zone, 0, | 1077 | pcp->count = rmqueue_bulk(zone, 0, |
859 | pcp->batch, &pcp->list); | 1078 | pcp->batch, &pcp->list, migratetype); |
860 | if (unlikely(!pcp->count)) | 1079 | if (unlikely(!pcp->count)) |
861 | goto failed; | 1080 | goto failed; |
862 | } | 1081 | } |
863 | page = list_entry(pcp->list.next, struct page, lru); | 1082 | |
1083 | /* Find a page of the appropriate migrate type */ | ||
1084 | list_for_each_entry(page, &pcp->list, lru) | ||
1085 | if (page_private(page) == migratetype) | ||
1086 | break; | ||
1087 | |||
1088 | /* Allocate more to the pcp list if necessary */ | ||
1089 | if (unlikely(&page->lru == &pcp->list)) { | ||
1090 | pcp->count += rmqueue_bulk(zone, 0, | ||
1091 | pcp->batch, &pcp->list, migratetype); | ||
1092 | page = list_entry(pcp->list.next, struct page, lru); | ||
1093 | } | ||
1094 | |||
864 | list_del(&page->lru); | 1095 | list_del(&page->lru); |
865 | pcp->count--; | 1096 | pcp->count--; |
866 | } else { | 1097 | } else { |
867 | spin_lock_irqsave(&zone->lock, flags); | 1098 | spin_lock_irqsave(&zone->lock, flags); |
868 | page = __rmqueue(zone, order); | 1099 | page = __rmqueue(zone, order, migratetype); |
869 | spin_unlock(&zone->lock); | 1100 | spin_unlock(&zone->lock); |
870 | if (!page) | 1101 | if (!page) |
871 | goto failed; | 1102 | goto failed; |
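With the per-cpu list now holding pages of mixed migrate types (the type is stashed in page_private when a page is queued), the order-0 path above walks the list looking for a matching entry and refills the list only if none is found. A userspace sketch of that list walk, using a plain singly linked list in place of the kernel's list_head/page machinery (all names hypothetical):

#include <stdio.h>
#include <stddef.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE };

struct fake_page {
	int migratetype;		/* stands in for page_private(page) */
	struct fake_page *next;
};

/* Return the first queued page of the wanted type, or NULL if the
 * per-cpu list would need refilling (rmqueue_bulk in the kernel). */
static struct fake_page *pcp_find(struct fake_page *head, int wanted)
{
	struct fake_page *p;

	for (p = head; p; p = p->next)
		if (p->migratetype == wanted)
			return p;
	return NULL;
}

int main(void)
{
	struct fake_page c = { MOVABLE,     NULL };
	struct fake_page b = { UNMOVABLE,   &c };
	struct fake_page a = { RECLAIMABLE, &b };

	struct fake_page *hit = pcp_find(&a, MOVABLE);
	printf("found movable page: %s\n", hit ? "yes" : "no, refill list");
	return 0;
}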
@@ -1032,7 +1263,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1032 | * | 1263 | * |
1033 | * If the zonelist cache is present in the passed in zonelist, then | 1264 | * If the zonelist cache is present in the passed in zonelist, then |
1034 | * returns a pointer to the allowed node mask (either the current | 1265 | * returns a pointer to the allowed node mask (either the current |
1035 | * tasks mems_allowed, or node_online_map.) | 1266 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) |
1036 | * | 1267 | * |
1037 | * If the zonelist cache is not available for this zonelist, does | 1268 | * If the zonelist cache is not available for this zonelist, does |
1038 | * nothing and returns NULL. | 1269 | * nothing and returns NULL. |
@@ -1061,7 +1292,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1061 | 1292 | ||
1062 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1293 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1063 | &cpuset_current_mems_allowed : | 1294 | &cpuset_current_mems_allowed : |
1064 | &node_online_map; | 1295 | &node_states[N_HIGH_MEMORY]; |
1065 | return allowednodes; | 1296 | return allowednodes; |
1066 | } | 1297 | } |
1067 | 1298 | ||
@@ -1183,9 +1414,6 @@ zonelist_scan: | |||
1183 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1414 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1184 | continue; | 1415 | continue; |
1185 | zone = *z; | 1416 | zone = *z; |
1186 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
1187 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
1188 | break; | ||
1189 | if ((alloc_flags & ALLOC_CPUSET) && | 1417 | if ((alloc_flags & ALLOC_CPUSET) && |
1190 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1191 | goto try_next_zone; | 1419 | goto try_next_zone; |
@@ -1254,7 +1482,10 @@ restart: | |||
1254 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1482 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
1255 | 1483 | ||
1256 | if (unlikely(*z == NULL)) { | 1484 | if (unlikely(*z == NULL)) { |
1257 | /* Should this ever happen?? */ | 1485 | /* |
1486 | * Happens if we have an empty zonelist as a result of | ||
1487 | * GFP_THISNODE being used on a memoryless node | ||
1488 | */ | ||
1258 | return NULL; | 1489 | return NULL; |
1259 | } | 1490 | } |
1260 | 1491 | ||
@@ -1346,6 +1577,9 @@ nofail_alloc: | |||
1346 | 1577 | ||
1347 | cond_resched(); | 1578 | cond_resched(); |
1348 | 1579 | ||
1580 | if (order != 0) | ||
1581 | drain_all_local_pages(); | ||
1582 | |||
1349 | if (likely(did_some_progress)) { | 1583 | if (likely(did_some_progress)) { |
1350 | page = get_page_from_freelist(gfp_mask, order, | 1584 | page = get_page_from_freelist(gfp_mask, order, |
1351 | zonelist, alloc_flags); | 1585 | zonelist, alloc_flags); |
@@ -1794,7 +2028,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1794 | return node; | 2028 | return node; |
1795 | } | 2029 | } |
1796 | 2030 | ||
1797 | for_each_online_node(n) { | 2031 | for_each_node_state(n, N_HIGH_MEMORY) { |
1798 | cpumask_t tmp; | 2032 | cpumask_t tmp; |
1799 | 2033 | ||
1800 | /* Don't want a node to appear more than once */ | 2034 | /* Don't want a node to appear more than once */ |
@@ -1850,6 +2084,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
1850 | } | 2084 | } |
1851 | 2085 | ||
1852 | /* | 2086 | /* |
2087 | * Build gfp_thisnode zonelists | ||
2088 | */ | ||
2089 | static void build_thisnode_zonelists(pg_data_t *pgdat) | ||
2090 | { | ||
2091 | enum zone_type i; | ||
2092 | int j; | ||
2093 | struct zonelist *zonelist; | ||
2094 | |||
2095 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
2096 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | ||
2097 | j = build_zonelists_node(pgdat, zonelist, 0, i); | ||
2098 | zonelist->zones[j] = NULL; | ||
2099 | } | ||
2100 | } | ||
2101 | |||
2102 | /* | ||
1853 | * Build zonelists ordered by zone and nodes within zones. | 2103 | * Build zonelists ordered by zone and nodes within zones. |
1854 | * This results in conserving DMA zone[s] until all Normal memory is | 2104 | * This results in conserving DMA zone[s] until all Normal memory is |
1855 | * exhausted, but results in overflowing to remote node while memory | 2105 | * exhausted, but results in overflowing to remote node while memory |
@@ -1915,7 +2165,8 @@ static int default_zonelist_order(void) | |||
1915 | * If there is a node whose DMA/DMA32 memory is very big area on | 2165 | * If there is a node whose DMA/DMA32 memory is very big area on |
1916 | * local memory, NODE_ORDER may be suitable. | 2166 | * local memory, NODE_ORDER may be suitable. |
1917 | */ | 2167 | */ |
1918 | average_size = total_size / (num_online_nodes() + 1); | 2168 | average_size = total_size / |
2169 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | ||
1919 | for_each_online_node(nid) { | 2170 | for_each_online_node(nid) { |
1920 | low_kmem_size = 0; | 2171 | low_kmem_size = 0; |
1921 | total_size = 0; | 2172 | total_size = 0; |
@@ -1953,7 +2204,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
1953 | int order = current_zonelist_order; | 2204 | int order = current_zonelist_order; |
1954 | 2205 | ||
1955 | /* initialize zonelists */ | 2206 | /* initialize zonelists */ |
1956 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2207 | for (i = 0; i < MAX_ZONELISTS; i++) { |
1957 | zonelist = pgdat->node_zonelists + i; | 2208 | zonelist = pgdat->node_zonelists + i; |
1958 | zonelist->zones[0] = NULL; | 2209 | zonelist->zones[0] = NULL; |
1959 | } | 2210 | } |
@@ -1998,6 +2249,8 @@ static void build_zonelists(pg_data_t *pgdat) | |||
1998 | /* calculate node order -- i.e., DMA last! */ | 2249 | /* calculate node order -- i.e., DMA last! */ |
1999 | build_zonelists_in_zone_order(pgdat, j); | 2250 | build_zonelists_in_zone_order(pgdat, j); |
2000 | } | 2251 | } |
2252 | |||
2253 | build_thisnode_zonelists(pgdat); | ||
2001 | } | 2254 | } |
2002 | 2255 | ||
2003 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2256 | /* Construct the zonelist performance cache - see further mmzone.h */ |
@@ -2078,8 +2331,10 @@ static int __build_all_zonelists(void *dummy) | |||
2078 | int nid; | 2331 | int nid; |
2079 | 2332 | ||
2080 | for_each_online_node(nid) { | 2333 | for_each_online_node(nid) { |
2081 | build_zonelists(NODE_DATA(nid)); | 2334 | pg_data_t *pgdat = NODE_DATA(nid); |
2082 | build_zonelist_cache(NODE_DATA(nid)); | 2335 | |
2336 | build_zonelists(pgdat); | ||
2337 | build_zonelist_cache(pgdat); | ||
2083 | } | 2338 | } |
2084 | return 0; | 2339 | return 0; |
2085 | } | 2340 | } |
@@ -2098,9 +2353,23 @@ void build_all_zonelists(void) | |||
2098 | /* cpuset refresh routine should be here */ | 2353 | /* cpuset refresh routine should be here */ |
2099 | } | 2354 | } |
2100 | vm_total_pages = nr_free_pagecache_pages(); | 2355 | vm_total_pages = nr_free_pagecache_pages(); |
2101 | printk("Built %i zonelists in %s order. Total pages: %ld\n", | 2356 | /* |
2357 | * Disable grouping by mobility if the number of pages in the | ||
2358 | * system is too low to allow the mechanism to work. It would be | ||
2359 | * more accurate, but expensive to check per-zone. This check is | ||
2360 | * made on memory-hotadd so a system can start with mobility | ||
2361 | * disabled and enable it later | ||
2362 | */ | ||
2363 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | ||
2364 | page_group_by_mobility_disabled = 1; | ||
2365 | else | ||
2366 | page_group_by_mobility_disabled = 0; | ||
2367 | |||
2368 | printk("Built %i zonelists in %s order, mobility grouping %s. " | ||
2369 | "Total pages: %ld\n", | ||
2102 | num_online_nodes(), | 2370 | num_online_nodes(), |
2103 | zonelist_order_name[current_zonelist_order], | 2371 | zonelist_order_name[current_zonelist_order], |
2372 | page_group_by_mobility_disabled ? "off" : "on", | ||
2104 | vm_total_pages); | 2373 | vm_total_pages); |
2105 | #ifdef CONFIG_NUMA | 2374 | #ifdef CONFIG_NUMA |
2106 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 2375 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
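The threshold used above to disable grouping, vm_total_pages < pageblock_nr_pages * MIGRATE_TYPES, is tiny in practice: assuming 4 KiB pages, order-10 pageblocks and the five migrate types in this series, grouping is only switched off on systems with fewer than 5120 pages, roughly 20 MiB of RAM. The arithmetic, under those example assumptions:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed 4 KiB pages */
	unsigned long pageblock_nr_pages = 1UL << 10;	/* assumed order-10 blocks */
	unsigned long migrate_types = 5;		/* unmovable, reclaimable,
							   movable, reserve, isolate */

	unsigned long threshold_pages = pageblock_nr_pages * migrate_types;

	/* Grouping by mobility is disabled below this amount of memory. */
	printf("threshold: %lu pages (%lu MiB)\n", threshold_pages,
	       threshold_pages * page_size >> 20);
	return 0;
}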
@@ -2176,6 +2445,61 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
2176 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 2445 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
2177 | 2446 | ||
2178 | /* | 2447 | /* |
2448 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | ||
2449 | * of blocks reserved is based on zone->pages_min. The memory within the | ||
2450 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | ||
2451 | * higher will lead to a bigger reserve which will get freed as contiguous | ||
2452 | * blocks as reclaim kicks in | ||
2453 | */ | ||
2454 | static void setup_zone_migrate_reserve(struct zone *zone) | ||
2455 | { | ||
2456 | unsigned long start_pfn, pfn, end_pfn; | ||
2457 | struct page *page; | ||
2458 | unsigned long reserve, block_migratetype; | ||
2459 | |||
2460 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | ||
2461 | start_pfn = zone->zone_start_pfn; | ||
2462 | end_pfn = start_pfn + zone->spanned_pages; | ||
2463 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | ||
2464 | pageblock_order; | ||
2465 | |||
2466 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
2467 | if (!pfn_valid(pfn)) | ||
2468 | continue; | ||
2469 | page = pfn_to_page(pfn); | ||
2470 | |||
2471 | /* Blocks with reserved pages will never free, skip them. */ | ||
2472 | if (PageReserved(page)) | ||
2473 | continue; | ||
2474 | |||
2475 | block_migratetype = get_pageblock_migratetype(page); | ||
2476 | |||
2477 | /* If this block is reserved, account for it */ | ||
2478 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | ||
2479 | reserve--; | ||
2480 | continue; | ||
2481 | } | ||
2482 | |||
2483 | /* Suitable for reserving if this block is movable */ | ||
2484 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | ||
2485 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | ||
2486 | move_freepages_block(zone, page, MIGRATE_RESERVE); | ||
2487 | reserve--; | ||
2488 | continue; | ||
2489 | } | ||
2490 | |||
2491 | /* | ||
2492 | * If the reserve is met and this is a previous reserved block, | ||
2493 | * take it back | ||
2494 | */ | ||
2495 | if (block_migratetype == MIGRATE_RESERVE) { | ||
2496 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
2497 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
2498 | } | ||
2499 | } | ||
2500 | } | ||
2501 | |||
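The number of reserved blocks computed above is zone->pages_min rounded up to whole pageblocks and then expressed in blocks, so the reserve roughly covers the min watermark in contiguous MIGRATE_RESERVE pageblocks. Worked through with assumed example values:

#include <stdio.h>

/* Round x up to a multiple of y, as the kernel's roundup() macro does. */
static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	unsigned long pageblock_order = 10;		/* assumed */
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long pages_min = 1800;			/* assumed zone->pages_min */

	unsigned long reserve = roundup_ul(pages_min, pageblock_nr_pages)
					>> pageblock_order;

	/* 1800 pages rounds up to 2048, i.e. 2 reserved pageblocks. */
	printf("%lu pageblock(s) marked MIGRATE_RESERVE\n", reserve);
	return 0;
}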
2502 | /* | ||
2179 | * Initially all pages are reserved - free ones are freed | 2503 | * Initially all pages are reserved - free ones are freed |
2180 | * up by free_all_bootmem() once the early boot process is | 2504 | * up by free_all_bootmem() once the early boot process is |
2181 | * done. Non-atomic initialization, single-pass. | 2505 | * done. Non-atomic initialization, single-pass. |
@@ -2204,6 +2528,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2204 | init_page_count(page); | 2528 | init_page_count(page); |
2205 | reset_page_mapcount(page); | 2529 | reset_page_mapcount(page); |
2206 | SetPageReserved(page); | 2530 | SetPageReserved(page); |
2531 | |||
2532 | /* | ||
2533 | * Mark the block movable so that blocks are reserved for | ||
2534 | * movable at startup. This will force kernel allocations | ||
2535 | * to reserve their blocks rather than leaking throughout | ||
2536 | * the address space during boot when many long-lived | ||
2537 | * kernel allocations are made. Later some blocks near | ||
2538 | * the start are marked MIGRATE_RESERVE by | ||
2539 | * setup_zone_migrate_reserve() | ||
2540 | */ | ||
2541 | if ((pfn & (pageblock_nr_pages-1))) | ||
2542 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
2543 | |||
2207 | INIT_LIST_HEAD(&page->lru); | 2544 | INIT_LIST_HEAD(&page->lru); |
2208 | #ifdef WANT_PAGE_VIRTUAL | 2545 | #ifdef WANT_PAGE_VIRTUAL |
2209 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 2546 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
@@ -2216,9 +2553,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2216 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, | 2553 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
2217 | struct zone *zone, unsigned long size) | 2554 | struct zone *zone, unsigned long size) |
2218 | { | 2555 | { |
2219 | int order; | 2556 | int order, t; |
2220 | for (order = 0; order < MAX_ORDER ; order++) { | 2557 | for_each_migratetype_order(order, t) { |
2221 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 2558 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
2222 | zone->free_area[order].nr_free = 0; | 2559 | zone->free_area[order].nr_free = 0; |
2223 | } | 2560 | } |
2224 | } | 2561 | } |
@@ -2324,6 +2661,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS]; | |||
2324 | static int __cpuinit process_zones(int cpu) | 2661 | static int __cpuinit process_zones(int cpu) |
2325 | { | 2662 | { |
2326 | struct zone *zone, *dzone; | 2663 | struct zone *zone, *dzone; |
2664 | int node = cpu_to_node(cpu); | ||
2665 | |||
2666 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
2327 | 2667 | ||
2328 | for_each_zone(zone) { | 2668 | for_each_zone(zone) { |
2329 | 2669 | ||
@@ -2331,7 +2671,7 @@ static int __cpuinit process_zones(int cpu) | |||
2331 | continue; | 2671 | continue; |
2332 | 2672 | ||
2333 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2673 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
2334 | GFP_KERNEL, cpu_to_node(cpu)); | 2674 | GFP_KERNEL, node); |
2335 | if (!zone_pcp(zone, cpu)) | 2675 | if (!zone_pcp(zone, cpu)) |
2336 | goto bad; | 2676 | goto bad; |
2337 | 2677 | ||
@@ -2444,7 +2784,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
2444 | * To use this new node's memory, further consideration will be | 2784 | * To use this new node's memory, further consideration will be |
2445 | * necessary. | 2785 | * necessary. |
2446 | */ | 2786 | */ |
2447 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | 2787 | zone->wait_table = vmalloc(alloc_size); |
2448 | } | 2788 | } |
2449 | if (!zone->wait_table) | 2789 | if (!zone->wait_table) |
2450 | return -ENOMEM; | 2790 | return -ENOMEM; |
@@ -2680,10 +3020,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
2680 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 3020 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
2681 | } | 3021 | } |
2682 | 3022 | ||
2683 | if (*start_pfn == -1UL) { | 3023 | if (*start_pfn == -1UL) |
2684 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
2685 | *start_pfn = 0; | 3024 | *start_pfn = 0; |
2686 | } | ||
2687 | 3025 | ||
2688 | /* Push the node boundaries out if requested */ | 3026 | /* Push the node boundaries out if requested */ |
2689 | account_node_boundary(nid, start_pfn, end_pfn); | 3027 | account_node_boundary(nid, start_pfn, end_pfn); |
@@ -2901,6 +3239,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
2901 | realtotalpages); | 3239 | realtotalpages); |
2902 | } | 3240 | } |
2903 | 3241 | ||
3242 | #ifndef CONFIG_SPARSEMEM | ||
3243 | /* | ||
3244 | * Calculate the size of the zone->pageblock_flags bitmap, rounded to an | ||
3245 | * unsigned long. Start by making sure zonesize is a multiple of pageblock_order | ||
3246 | * by rounding up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, | ||
3247 | * finally round what is now in bits to the nearest long in bits, then return it in | ||
3248 | * bytes. | ||
3249 | */ | ||
3250 | static unsigned long __init usemap_size(unsigned long zonesize) | ||
3251 | { | ||
3252 | unsigned long usemapsize; | ||
3253 | |||
3254 | usemapsize = roundup(zonesize, pageblock_nr_pages); | ||
3255 | usemapsize = usemapsize >> pageblock_order; | ||
3256 | usemapsize *= NR_PAGEBLOCK_BITS; | ||
3257 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | ||
3258 | |||
3259 | return usemapsize / 8; | ||
3260 | } | ||
3261 | |||
3262 | static void __init setup_usemap(struct pglist_data *pgdat, | ||
3263 | struct zone *zone, unsigned long zonesize) | ||
3264 | { | ||
3265 | unsigned long usemapsize = usemap_size(zonesize); | ||
3266 | zone->pageblock_flags = NULL; | ||
3267 | if (usemapsize) { | ||
3268 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | ||
3269 | memset(zone->pageblock_flags, 0, usemapsize); | ||
3270 | } | ||
3271 | } | ||
3272 | #else | ||
3273 | static void inline setup_usemap(struct pglist_data *pgdat, | ||
3274 | struct zone *zone, unsigned long zonesize) {} | ||
3275 | #endif /* CONFIG_SPARSEMEM */ | ||
3276 | |||
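For the flat (non-SPARSEMEM) case, the usemap_size() calculation above is easy to check with a standalone computation: a 1 GiB zone of 4 KiB pages with order-10 pageblocks and 3 bits per block needs well under a kilobyte of pageblock flags. The values below are assumptions chosen for illustration:

#include <stdio.h>

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	unsigned long pageblock_order = 10;		/* assumed */
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long nr_pageblock_bits = 3;		/* assumed bits per pageblock */
	unsigned long zonesize = 262144;		/* 1 GiB of 4 KiB pages */

	unsigned long bits;

	bits = roundup_ul(zonesize, pageblock_nr_pages);	/* whole pageblocks */
	bits = (bits >> pageblock_order) * nr_pageblock_bits;
	bits = roundup_ul(bits, 8 * sizeof(unsigned long));	/* whole longs */

	/* 256 pageblocks * 3 bits = 768 bits -> 96 bytes of usemap. */
	printf("usemap size: %lu bytes\n", bits / 8);
	return 0;
}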
3277 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
3278 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | ||
3279 | static inline void __init set_pageblock_order(unsigned int order) | ||
3280 | { | ||
3281 | /* Check that pageblock_nr_pages has not already been setup */ | ||
3282 | if (pageblock_order) | ||
3283 | return; | ||
3284 | |||
3285 | /* | ||
3286 | * Assume the largest contiguous order of interest is a huge page. | ||
3287 | * This value may be variable depending on boot parameters on IA64 | ||
3288 | */ | ||
3289 | pageblock_order = order; | ||
3290 | } | ||
3291 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
3292 | |||
3293 | /* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */ | ||
3294 | #define set_pageblock_order(x) do {} while (0) | ||
3295 | |||
3296 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
3297 | |||
2904 | /* | 3298 | /* |
2905 | * Set up the zone data structures: | 3299 | * Set up the zone data structures: |
2906 | * - mark all pages reserved | 3300 | * - mark all pages reserved |
@@ -2981,6 +3375,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2981 | if (!size) | 3375 | if (!size) |
2982 | continue; | 3376 | continue; |
2983 | 3377 | ||
3378 | set_pageblock_order(HUGETLB_PAGE_ORDER); | ||
3379 | setup_usemap(pgdat, zone, size); | ||
2984 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 3380 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
2985 | size, MEMMAP_EARLY); | 3381 | size, MEMMAP_EARLY); |
2986 | BUG_ON(ret); | 3382 | BUG_ON(ret); |
@@ -3234,16 +3630,24 @@ unsigned long __init find_max_pfn_with_active_regions(void) | |||
3234 | return max_pfn; | 3630 | return max_pfn; |
3235 | } | 3631 | } |
3236 | 3632 | ||
3237 | unsigned long __init early_calculate_totalpages(void) | 3633 | /* |
3634 | * early_calculate_totalpages() | ||
3635 | * Sum pages in active regions for movable zone. | ||
3636 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | ||
3637 | */ | ||
3638 | static unsigned long __init early_calculate_totalpages(void) | ||
3238 | { | 3639 | { |
3239 | int i; | 3640 | int i; |
3240 | unsigned long totalpages = 0; | 3641 | unsigned long totalpages = 0; |
3241 | 3642 | ||
3242 | for (i = 0; i < nr_nodemap_entries; i++) | 3643 | for (i = 0; i < nr_nodemap_entries; i++) { |
3243 | totalpages += early_node_map[i].end_pfn - | 3644 | unsigned long pages = early_node_map[i].end_pfn - |
3244 | early_node_map[i].start_pfn; | 3645 | early_node_map[i].start_pfn; |
3245 | 3646 | totalpages += pages; | |
3246 | return totalpages; | 3647 | if (pages) |
3648 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); | ||
3649 | } | ||
3650 | return totalpages; | ||
3247 | } | 3651 | } |
3248 | 3652 | ||
3249 | /* | 3653 | /* |
@@ -3257,7 +3661,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3257 | int i, nid; | 3661 | int i, nid; |
3258 | unsigned long usable_startpfn; | 3662 | unsigned long usable_startpfn; |
3259 | unsigned long kernelcore_node, kernelcore_remaining; | 3663 | unsigned long kernelcore_node, kernelcore_remaining; |
3260 | int usable_nodes = num_online_nodes(); | 3664 | unsigned long totalpages = early_calculate_totalpages(); |
3665 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | ||
3261 | 3666 | ||
3262 | /* | 3667 | /* |
3263 | * If movablecore was specified, calculate what size of | 3668 | * If movablecore was specified, calculate what size of |
@@ -3268,7 +3673,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3268 | * what movablecore would have allowed. | 3673 | * what movablecore would have allowed. |
3269 | */ | 3674 | */ |
3270 | if (required_movablecore) { | 3675 | if (required_movablecore) { |
3271 | unsigned long totalpages = early_calculate_totalpages(); | ||
3272 | unsigned long corepages; | 3676 | unsigned long corepages; |
3273 | 3677 | ||
3274 | /* | 3678 | /* |
@@ -3293,7 +3697,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3293 | restart: | 3697 | restart: |
3294 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 3698 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
3295 | kernelcore_node = required_kernelcore / usable_nodes; | 3699 | kernelcore_node = required_kernelcore / usable_nodes; |
3296 | for_each_online_node(nid) { | 3700 | for_each_node_state(nid, N_HIGH_MEMORY) { |
3297 | /* | 3701 | /* |
3298 | * Recalculate kernelcore_node if the division per node | 3702 | * Recalculate kernelcore_node if the division per node |
3299 | * now exceeds what is necessary to satisfy the requested | 3703 | * now exceeds what is necessary to satisfy the requested |
@@ -3385,6 +3789,20 @@ restart: | |||
3385 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 3789 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
3386 | } | 3790 | } |
3387 | 3791 | ||
3792 | /* Any regular memory on that node ? */ | ||
3793 | static void check_for_regular_memory(pg_data_t *pgdat) | ||
3794 | { | ||
3795 | #ifdef CONFIG_HIGHMEM | ||
3796 | enum zone_type zone_type; | ||
3797 | |||
3798 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | ||
3799 | struct zone *zone = &pgdat->node_zones[zone_type]; | ||
3800 | if (zone->present_pages) | ||
3801 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
3802 | } | ||
3803 | #endif | ||
3804 | } | ||
3805 | |||
3388 | /** | 3806 | /** |
3389 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 3807 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
3390 | * @max_zone_pfn: an array of max PFNs for each zone | 3808 | * @max_zone_pfn: an array of max PFNs for each zone |
@@ -3459,6 +3877,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
3459 | pg_data_t *pgdat = NODE_DATA(nid); | 3877 | pg_data_t *pgdat = NODE_DATA(nid); |
3460 | free_area_init_node(nid, pgdat, NULL, | 3878 | free_area_init_node(nid, pgdat, NULL, |
3461 | find_min_pfn_for_node(nid), NULL); | 3879 | find_min_pfn_for_node(nid), NULL); |
3880 | |||
3881 | /* Any memory on that node */ | ||
3882 | if (pgdat->node_present_pages) | ||
3883 | node_set_state(nid, N_HIGH_MEMORY); | ||
3884 | check_for_regular_memory(pgdat); | ||
3462 | } | 3885 | } |
3463 | } | 3886 | } |
3464 | 3887 | ||
@@ -3673,6 +4096,7 @@ void setup_per_zone_pages_min(void) | |||
3673 | 4096 | ||
3674 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4097 | zone->pages_low = zone->pages_min + (tmp >> 2); |
3675 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4098 | zone->pages_high = zone->pages_min + (tmp >> 1); |
4099 | setup_zone_migrate_reserve(zone); | ||
3676 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4100 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3677 | } | 4101 | } |
3678 | 4102 | ||
@@ -3934,4 +4358,169 @@ EXPORT_SYMBOL(pfn_to_page); | |||
3934 | EXPORT_SYMBOL(page_to_pfn); | 4358 | EXPORT_SYMBOL(page_to_pfn); |
3935 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 4359 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
3936 | 4360 | ||
4361 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | ||
4362 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | ||
4363 | unsigned long pfn) | ||
4364 | { | ||
4365 | #ifdef CONFIG_SPARSEMEM | ||
4366 | return __pfn_to_section(pfn)->pageblock_flags; | ||
4367 | #else | ||
4368 | return zone->pageblock_flags; | ||
4369 | #endif /* CONFIG_SPARSEMEM */ | ||
4370 | } | ||
4371 | |||
4372 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | ||
4373 | { | ||
4374 | #ifdef CONFIG_SPARSEMEM | ||
4375 | pfn &= (PAGES_PER_SECTION-1); | ||
4376 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
4377 | #else | ||
4378 | pfn = pfn - zone->zone_start_pfn; | ||
4379 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
4380 | #endif /* CONFIG_SPARSEMEM */ | ||
4381 | } | ||
4382 | |||
4383 | /** | ||
4384 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | ||
4385 | * @page: The page within the block of interest | ||
4386 | * @start_bitidx: The first bit of interest to retrieve | ||
4387 | * @end_bitidx: The last bit of interest | ||
4388 | * returns pageblock_bits flags | ||
4389 | */ | ||
4390 | unsigned long get_pageblock_flags_group(struct page *page, | ||
4391 | int start_bitidx, int end_bitidx) | ||
4392 | { | ||
4393 | struct zone *zone; | ||
4394 | unsigned long *bitmap; | ||
4395 | unsigned long pfn, bitidx; | ||
4396 | unsigned long flags = 0; | ||
4397 | unsigned long value = 1; | ||
4398 | |||
4399 | zone = page_zone(page); | ||
4400 | pfn = page_to_pfn(page); | ||
4401 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
4402 | bitidx = pfn_to_bitidx(zone, pfn); | ||
4403 | |||
4404 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
4405 | if (test_bit(bitidx + start_bitidx, bitmap)) | ||
4406 | flags |= value; | ||
4407 | |||
4408 | return flags; | ||
4409 | } | ||
3937 | 4410 | ||
4411 | /** | ||
4412 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | ||
4413 | * @page: The page within the block of interest | ||
4414 | * @start_bitidx: The first bit of interest | ||
4415 | * @end_bitidx: The last bit of interest | ||
4416 | * @flags: The flags to set | ||
4417 | */ | ||
4418 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | ||
4419 | int start_bitidx, int end_bitidx) | ||
4420 | { | ||
4421 | struct zone *zone; | ||
4422 | unsigned long *bitmap; | ||
4423 | unsigned long pfn, bitidx; | ||
4424 | unsigned long value = 1; | ||
4425 | |||
4426 | zone = page_zone(page); | ||
4427 | pfn = page_to_pfn(page); | ||
4428 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
4429 | bitidx = pfn_to_bitidx(zone, pfn); | ||
4430 | |||
4431 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
4432 | if (flags & value) | ||
4433 | __set_bit(bitidx + start_bitidx, bitmap); | ||
4434 | else | ||
4435 | __clear_bit(bitidx + start_bitidx, bitmap); | ||
4436 | } | ||
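The two helpers above only shift raw bits in the per-zone (or per-section) pageblock bitmap; callers normally go through thin migratetype wrappers. A minimal sketch of such wrappers follows, assuming the PB_migrate/PB_migrate_end bit indices from include/linux/pageblock-flags.h; the example_ names are illustrative only, the kernel's own equivalents live in page_alloc.c and pageblock-flags.h.

#include <linux/mm.h>
#include <linux/pageblock-flags.h>

/* Sketch: read the migratetype stored in the flag bits of @page's pageblock. */
static int example_get_pageblock_migratetype(struct page *page)
{
        return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}

/* Sketch: record a new migratetype for the pageblock containing @page. */
static void example_set_pageblock_migratetype(struct page *page, int migratetype)
{
        set_pageblock_flags_group(page, (unsigned long)migratetype,
                                  PB_migrate, PB_migrate_end);
}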
4437 | |||
4438 | /* | ||
4439 | * These are designed as helper functions; please see page_isolation.c as well. | ||
4440 | * They set/clear a pageblock's type to/from MIGRATE_ISOLATE. | ||
4441 | * The page allocator never allocates memory from an ISOLATE block. | ||
4442 | */ | ||
4443 | |||
4444 | int set_migratetype_isolate(struct page *page) | ||
4445 | { | ||
4446 | struct zone *zone; | ||
4447 | unsigned long flags; | ||
4448 | int ret = -EBUSY; | ||
4449 | |||
4450 | zone = page_zone(page); | ||
4451 | spin_lock_irqsave(&zone->lock, flags); | ||
4452 | /* | ||
4453 | * In the future, more migrate types will be able to be isolation targets. | ||
4454 | */ | ||
4455 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | ||
4456 | goto out; | ||
4457 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
4458 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
4459 | ret = 0; | ||
4460 | out: | ||
4461 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4462 | if (!ret) | ||
4463 | drain_all_local_pages(); | ||
4464 | return ret; | ||
4465 | } | ||
4466 | |||
4467 | void unset_migratetype_isolate(struct page *page) | ||
4468 | { | ||
4469 | struct zone *zone; | ||
4470 | unsigned long flags; | ||
4471 | zone = page_zone(page); | ||
4472 | spin_lock_irqsave(&zone->lock, flags); | ||
4473 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
4474 | goto out; | ||
4475 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
4476 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
4477 | out: | ||
4478 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4479 | } | ||
4480 | |||
4481 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
4482 | /* | ||
4483 | * All pages in the range must be isolated before calling this. | ||
4484 | */ | ||
4485 | void | ||
4486 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
4487 | { | ||
4488 | struct page *page; | ||
4489 | struct zone *zone; | ||
4490 | int order, i; | ||
4491 | unsigned long pfn; | ||
4492 | unsigned long flags; | ||
4493 | /* find the first valid pfn */ | ||
4494 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
4495 | if (pfn_valid(pfn)) | ||
4496 | break; | ||
4497 | if (pfn == end_pfn) | ||
4498 | return; | ||
4499 | zone = page_zone(pfn_to_page(pfn)); | ||
4500 | spin_lock_irqsave(&zone->lock, flags); | ||
4501 | pfn = start_pfn; | ||
4502 | while (pfn < end_pfn) { | ||
4503 | if (!pfn_valid(pfn)) { | ||
4504 | pfn++; | ||
4505 | continue; | ||
4506 | } | ||
4507 | page = pfn_to_page(pfn); | ||
4508 | BUG_ON(page_count(page)); | ||
4509 | BUG_ON(!PageBuddy(page)); | ||
4510 | order = page_order(page); | ||
4511 | #ifdef CONFIG_DEBUG_VM | ||
4512 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | ||
4513 | pfn, 1 << order, end_pfn); | ||
4514 | #endif | ||
4515 | list_del(&page->lru); | ||
4516 | rmv_page_order(page); | ||
4517 | zone->free_area[order].nr_free--; | ||
4518 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
4519 | - (1UL << order)); | ||
4520 | for (i = 0; i < (1 << order); i++) | ||
4521 | SetPageReserved((page+i)); | ||
4522 | pfn += (1 << order); | ||
4523 | } | ||
4524 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4525 | } | ||
4526 | #endif | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 000000000000..8f92a29695cc --- /dev/null +++ b/mm/page_isolation.c | |||
@@ -0,0 +1,138 @@ | |||
1 | /* | ||
2 | * linux/mm/page_isolation.c | ||
3 | */ | ||
4 | |||
5 | #include <stddef.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/page-isolation.h> | ||
8 | #include <linux/pageblock-flags.h> | ||
9 | #include "internal.h" | ||
10 | |||
11 | static inline struct page * | ||
12 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | ||
13 | { | ||
14 | int i; | ||
15 | for (i = 0; i < nr_pages; i++) | ||
16 | if (pfn_valid_within(pfn + i)) | ||
17 | break; | ||
18 | if (unlikely(i == nr_pages)) | ||
19 | return NULL; | ||
20 | return pfn_to_page(pfn + i); | ||
21 | } | ||
22 | |||
23 | /* | ||
24 | * start_isolate_page_range() -- set the page-allocation type of a range of | ||
25 | * pages to MIGRATE_ISOLATE. | ||
26 | * @start_pfn: The lower PFN of the range to be isolated. | ||
27 | * @end_pfn: The upper PFN of the range to be isolated. | ||
28 | * | ||
29 | * Setting the page-allocation type to MIGRATE_ISOLATE means that free pages | ||
30 | * in the range will never be handed out by the allocator: pages that are free | ||
31 | * now, and pages freed in the future, will not be allocated again. | ||
32 | * | ||
33 | * start_pfn/end_pfn must be aligned to pageblock_order. | ||
34 | * Returns 0 on success and -EBUSY if any part of the range cannot be isolated. | ||
35 | */ | ||
36 | int | ||
37 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
38 | { | ||
39 | unsigned long pfn; | ||
40 | unsigned long undo_pfn; | ||
41 | struct page *page; | ||
42 | |||
43 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
44 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
45 | |||
46 | for (pfn = start_pfn; | ||
47 | pfn < end_pfn; | ||
48 | pfn += pageblock_nr_pages) { | ||
49 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
50 | if (page && set_migratetype_isolate(page)) { | ||
51 | undo_pfn = pfn; | ||
52 | goto undo; | ||
53 | } | ||
54 | } | ||
55 | return 0; | ||
56 | undo: | ||
57 | for (pfn = start_pfn; | ||
58 | pfn <= undo_pfn; | ||
59 | pfn += pageblock_nr_pages) | ||
60 | unset_migratetype_isolate(pfn_to_page(pfn)); | ||
61 | |||
62 | return -EBUSY; | ||
63 | } | ||
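Both BUG_ON checks above require the caller to pass pageblock-aligned pfns. A hedged sketch of the rounding a caller would do first; the helper name is made up for illustration, and ALIGN is the usual kernel macro.

/* Sketch: widen [start_pfn, end_pfn) to pageblock boundaries before
 * asking for isolation, so the BUG_ON alignment checks cannot trigger. */
static int example_isolate_aligned(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long aligned_start = start_pfn & ~(pageblock_nr_pages - 1);
        unsigned long aligned_end = ALIGN(end_pfn, pageblock_nr_pages);

        return start_isolate_page_range(aligned_start, aligned_end);
}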
64 | |||
65 | /* | ||
66 | * Make isolated pages available again. | ||
67 | */ | ||
68 | int | ||
69 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
70 | { | ||
71 | unsigned long pfn; | ||
72 | struct page *page; | ||
73 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
74 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
75 | for (pfn = start_pfn; | ||
76 | pfn < end_pfn; | ||
77 | pfn += pageblock_nr_pages) { | ||
78 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
79 | if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
80 | continue; | ||
81 | unset_migratetype_isolate(page); | ||
82 | } | ||
83 | return 0; | ||
84 | } | ||
85 | /* | ||
86 | * Test whether all pages in the range are free (i.e. isolated). | ||
87 | * All pages in [start_pfn...end_pfn) must be in the same zone. | ||
88 | * zone->lock must be held before calling this. | ||
89 | * | ||
90 | * Returns 1 if all pages in the range are isolated, 0 otherwise. | ||
91 | */ | ||
92 | static int | ||
93 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | ||
94 | { | ||
95 | struct page *page; | ||
96 | |||
97 | while (pfn < end_pfn) { | ||
98 | if (!pfn_valid_within(pfn)) { | ||
99 | pfn++; | ||
100 | continue; | ||
101 | } | ||
102 | page = pfn_to_page(pfn); | ||
103 | if (PageBuddy(page)) | ||
104 | pfn += 1 << page_order(page); | ||
105 | else if (page_count(page) == 0 && | ||
106 | page_private(page) == MIGRATE_ISOLATE) | ||
107 | pfn += 1; | ||
108 | else | ||
109 | break; | ||
110 | } | ||
111 | if (pfn < end_pfn) | ||
112 | return 0; | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
117 | { | ||
118 | unsigned long pfn; | ||
119 | struct page *page; | ||
120 | |||
121 | pfn = start_pfn; | ||
122 | /* | ||
123 | * Note: pageblock_nr_pages != MAX_ORDER, so a chunk of free pages | ||
124 | * is not necessarily aligned to pageblock_nr_pages. | ||
125 | * Therefore we just check the pagetype first. | ||
126 | */ | ||
127 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
128 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
129 | if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
130 | break; | ||
131 | } | ||
132 | if (pfn < end_pfn) | ||
133 | return -EBUSY; | ||
134 | /* Check that all pages are free or marked as ISOLATE */ | ||
135 | if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) | ||
136 | return 0; | ||
137 | return -EBUSY; | ||
138 | } | ||
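Taken together with __offline_isolated_pages() from page_alloc.c above, these routines form the hot-remove protocol: isolate the range, migrate whatever is still in use, drain the per-cpu lists, verify, and only then pull the pages out of the buddy allocator, or roll back. The sketch below strings the steps together; migrate_in_range() is a hypothetical stand-in for the real migration pass in memory_hotplug.c, and error handling is reduced to the essentials.

/* Sketch of the offline protocol built from the helpers in this file.
 * migrate_in_range() stands in for the real migration step. */
static int example_offline_range(unsigned long start_pfn, unsigned long end_pfn)
{
        int ret;

        ret = start_isolate_page_range(start_pfn, end_pfn);
        if (ret)
                return ret;             /* a pageblock was not MIGRATE_MOVABLE */

        ret = migrate_in_range(start_pfn, end_pfn);     /* hypothetical */
        if (ret)
                goto rollback;

        drain_all_local_pages();        /* flush per-cpu pages back to the buddy lists */

        if (test_pages_isolated(start_pfn, end_pfn)) {
                ret = -EBUSY;           /* something in the range is still in use */
                goto rollback;
        }

        __offline_isolated_pages(start_pfn, end_pfn);
        return 0;

rollback:
        undo_isolate_page_range(start_pfn, end_pfn);
        return ret;
}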
diff --git a/mm/readahead.c b/mm/readahead.c index be20c9d699d3..229788884010 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
22 | } | 22 | } |
23 | EXPORT_SYMBOL(default_unplug_io_fn); | 23 | EXPORT_SYMBOL(default_unplug_io_fn); |
24 | 24 | ||
25 | /* | ||
26 | * Convienent macros for min/max read-ahead pages. | ||
27 | * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. | ||
28 | * The latter is necessary for systems with large page size(i.e. 64k). | ||
29 | */ | ||
30 | #define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) | ||
31 | #define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) | ||
32 | |||
33 | struct backing_dev_info default_backing_dev_info = { | 25 | struct backing_dev_info default_backing_dev_info = { |
34 | .ra_pages = MAX_RA_PAGES, | 26 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
35 | .state = 0, | 27 | .state = 0, |
36 | .capabilities = BDI_CAP_MAP_COPY, | 28 | .capabilities = BDI_CAP_MAP_COPY, |
37 | .unplug_io_fn = default_unplug_io_fn, | 29 | .unplug_io_fn = default_unplug_io_fn, |
@@ -46,7 +38,7 @@ void | |||
46 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 38 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
47 | { | 39 | { |
48 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 40 | ra->ra_pages = mapping->backing_dev_info->ra_pages; |
49 | ra->prev_index = -1; | 41 | ra->prev_pos = -1; |
50 | } | 42 | } |
51 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 43 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
52 | 44 | ||
@@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
66 | int (*filler)(void *, struct page *), void *data) | 58 | int (*filler)(void *, struct page *), void *data) |
67 | { | 59 | { |
68 | struct page *page; | 60 | struct page *page; |
69 | struct pagevec lru_pvec; | ||
70 | int ret = 0; | 61 | int ret = 0; |
71 | 62 | ||
72 | pagevec_init(&lru_pvec, 0); | ||
73 | |||
74 | while (!list_empty(pages)) { | 63 | while (!list_empty(pages)) { |
75 | page = list_to_page(pages); | 64 | page = list_to_page(pages); |
76 | list_del(&page->lru); | 65 | list_del(&page->lru); |
77 | if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { | 66 | if (add_to_page_cache_lru(page, mapping, |
67 | page->index, GFP_KERNEL)) { | ||
78 | page_cache_release(page); | 68 | page_cache_release(page); |
79 | continue; | 69 | continue; |
80 | } | 70 | } |
71 | page_cache_release(page); | ||
72 | |||
81 | ret = filler(data, page); | 73 | ret = filler(data, page); |
82 | if (!pagevec_add(&lru_pvec, page)) | 74 | if (unlikely(ret)) { |
83 | __pagevec_lru_add(&lru_pvec); | ||
84 | if (ret) { | ||
85 | put_pages_list(pages); | 75 | put_pages_list(pages); |
86 | break; | 76 | break; |
87 | } | 77 | } |
88 | task_io_account_read(PAGE_CACHE_SIZE); | 78 | task_io_account_read(PAGE_CACHE_SIZE); |
89 | } | 79 | } |
90 | pagevec_lru_add(&lru_pvec); | ||
91 | return ret; | 80 | return ret; |
92 | } | 81 | } |
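With the pagevec bookkeeping removed, read_cache_pages() now leans on add_to_page_cache_lru() for LRU placement and drops its own page reference right away; the filler callback is what actually issues the read. A minimal sketch of a caller follows, assuming a filesystem with a low-level my_readpage() routine (both my_ names are hypothetical).

#include <linux/fs.h>
#include <linux/pagemap.h>

static int my_readpage(struct file *filp, struct page *page);  /* hypothetical */

/* Filler with the signature read_cache_pages() expects; it simply forwards
 * to the filesystem's low-level readpage routine. */
static int my_filler(void *data, struct page *page)
{
        return my_readpage(data, page);
}

/* Hypothetical ->readpages implementation: let read_cache_pages() handle
 * page-cache insertion, LRU placement and error unwinding. */
static int my_readpages(struct file *filp, struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages)
{
        return read_cache_pages(mapping, pages, my_filler, filp);
}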
93 | 82 | ||
@@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
97 | struct list_head *pages, unsigned nr_pages) | 86 | struct list_head *pages, unsigned nr_pages) |
98 | { | 87 | { |
99 | unsigned page_idx; | 88 | unsigned page_idx; |
100 | struct pagevec lru_pvec; | ||
101 | int ret; | 89 | int ret; |
102 | 90 | ||
103 | if (mapping->a_ops->readpages) { | 91 | if (mapping->a_ops->readpages) { |
@@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
107 | goto out; | 95 | goto out; |
108 | } | 96 | } |
109 | 97 | ||
110 | pagevec_init(&lru_pvec, 0); | ||
111 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 98 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
112 | struct page *page = list_to_page(pages); | 99 | struct page *page = list_to_page(pages); |
113 | list_del(&page->lru); | 100 | list_del(&page->lru); |
114 | if (!add_to_page_cache(page, mapping, | 101 | if (!add_to_page_cache_lru(page, mapping, |
115 | page->index, GFP_KERNEL)) { | 102 | page->index, GFP_KERNEL)) { |
116 | mapping->a_ops->readpage(filp, page); | 103 | mapping->a_ops->readpage(filp, page); |
117 | if (!pagevec_add(&lru_pvec, page)) | 104 | } |
118 | __pagevec_lru_add(&lru_pvec); | 105 | page_cache_release(page); |
119 | } else | ||
120 | page_cache_release(page); | ||
121 | } | 106 | } |
122 | pagevec_lru_add(&lru_pvec); | ||
123 | ret = 0; | 107 | ret = 0; |
124 | out: | 108 | out: |
125 | return ret; | 109 | return ret; |
@@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
157 | /* | 141 | /* |
158 | * Preallocate as many pages as we will need. | 142 | * Preallocate as many pages as we will need. |
159 | */ | 143 | */ |
160 | read_lock_irq(&mapping->tree_lock); | ||
161 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { | 144 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { |
162 | pgoff_t page_offset = offset + page_idx; | 145 | pgoff_t page_offset = offset + page_idx; |
163 | 146 | ||
164 | if (page_offset > end_index) | 147 | if (page_offset > end_index) |
165 | break; | 148 | break; |
166 | 149 | ||
150 | rcu_read_lock(); | ||
167 | page = radix_tree_lookup(&mapping->page_tree, page_offset); | 151 | page = radix_tree_lookup(&mapping->page_tree, page_offset); |
152 | rcu_read_unlock(); | ||
168 | if (page) | 153 | if (page) |
169 | continue; | 154 | continue; |
170 | 155 | ||
171 | read_unlock_irq(&mapping->tree_lock); | ||
172 | page = page_cache_alloc_cold(mapping); | 156 | page = page_cache_alloc_cold(mapping); |
173 | read_lock_irq(&mapping->tree_lock); | ||
174 | if (!page) | 157 | if (!page) |
175 | break; | 158 | break; |
176 | page->index = page_offset; | 159 | page->index = page_offset; |
@@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
179 | SetPageReadahead(page); | 162 | SetPageReadahead(page); |
180 | ret++; | 163 | ret++; |
181 | } | 164 | } |
182 | read_unlock_irq(&mapping->tree_lock); | ||
183 | 165 | ||
184 | /* | 166 | /* |
185 | * Now start the IO. We ignore I/O errors - if the page is not | 167 | * Now start the IO. We ignore I/O errors - if the page is not |
@@ -327,7 +309,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
327 | * indicator. The flag won't be set on already cached pages, to avoid the | 309 | * indicator. The flag won't be set on already cached pages, to avoid the |
328 | * readahead-for-nothing fuss, saving pointless page cache lookups. | 310 | * readahead-for-nothing fuss, saving pointless page cache lookups. |
329 | * | 311 | * |
330 | * prev_index tracks the last visited page in the _previous_ read request. | 312 | * prev_pos tracks the last visited byte in the _previous_ read request. |
331 | * It should be maintained by the caller, and will be used for detecting | 313 | * It should be maintained by the caller, and will be used for detecting |
332 | * small random reads. Note that the readahead algorithm checks loosely | 314 | * small random reads. Note that the readahead algorithm checks loosely |
333 | * for sequential patterns. Hence interleaved reads might be served as | 315 | * for sequential patterns. Hence interleaved reads might be served as |
@@ -351,11 +333,9 @@ ondemand_readahead(struct address_space *mapping, | |||
351 | bool hit_readahead_marker, pgoff_t offset, | 333 | bool hit_readahead_marker, pgoff_t offset, |
352 | unsigned long req_size) | 334 | unsigned long req_size) |
353 | { | 335 | { |
354 | unsigned long max; /* max readahead pages */ | 336 | int max = ra->ra_pages; /* max readahead pages */ |
355 | int sequential; | 337 | pgoff_t prev_offset; |
356 | 338 | int sequential; | |
357 | max = ra->ra_pages; | ||
358 | sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); | ||
359 | 339 | ||
360 | /* | 340 | /* |
361 | * It's the expected callback offset, assume sequential access. | 341 | * It's the expected callback offset, assume sequential access. |
@@ -369,6 +349,9 @@ ondemand_readahead(struct address_space *mapping, | |||
369 | goto readit; | 349 | goto readit; |
370 | } | 350 | } |
371 | 351 | ||
352 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
353 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
354 | |||
372 | /* | 355 | /* |
373 | * Standalone, small read. | 356 | * Standalone, small read. |
374 | * Read as is, and do not pollute the readahead state. | 357 | * Read as is, and do not pollute the readahead state. |
@@ -379,6 +362,29 @@ ondemand_readahead(struct address_space *mapping, | |||
379 | } | 362 | } |
380 | 363 | ||
381 | /* | 364 | /* |
365 | * Hit a marked page without valid readahead state. | ||
366 | * E.g. interleaved reads. | ||
367 | * Query the pagecache for async_size, which normally equals the | ||
368 | * readahead size. Ramp it up and use it as the new readahead size. | ||
369 | */ | ||
370 | if (hit_readahead_marker) { | ||
371 | pgoff_t start; | ||
372 | |||
373 | read_lock_irq(&mapping->tree_lock); | ||
374 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | ||
375 | read_unlock_irq(&mapping->tree_lock); | ||
376 | |||
377 | if (!start || start - offset > max) | ||
378 | return 0; | ||
379 | |||
380 | ra->start = start; | ||
381 | ra->size = start - offset; /* old async_size */ | ||
382 | ra->size = get_next_ra_size(ra, max); | ||
383 | ra->async_size = ra->size; | ||
384 | goto readit; | ||
385 | } | ||
386 | |||
387 | /* | ||
382 | * It may be one of | 388 | * It may be one of |
383 | * - first read on start of file | 389 | * - first read on start of file |
384 | * - sequential cache miss | 390 | * - sequential cache miss |
@@ -389,16 +395,6 @@ ondemand_readahead(struct address_space *mapping, | |||
389 | ra->size = get_init_ra_size(req_size, max); | 395 | ra->size = get_init_ra_size(req_size, max); |
390 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 396 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
391 | 397 | ||
392 | /* | ||
393 | * Hit on a marked page without valid readahead state. | ||
394 | * E.g. interleaved reads. | ||
395 | * Not knowing its readahead pos/size, bet on the minimal possible one. | ||
396 | */ | ||
397 | if (hit_readahead_marker) { | ||
398 | ra->start++; | ||
399 | ra->size = get_next_ra_size(ra, max); | ||
400 | } | ||
401 | |||
402 | readit: | 398 | readit: |
403 | return ra_submit(ra, mapping, filp); | 399 | return ra_submit(ra, mapping, filp); |
404 | } | 400 | } |
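The new hit_readahead_marker branch no longer guesses the next window; it asks the page cache how far the cached run extends past the current offset by looking for the first hole, then ramps that up with get_next_ra_size(). The probe is the interesting step and is isolated in the sketch below; example_cached_run() is a made-up wrapper around the radix_tree_next_hole() call used above.

/* Sketch: length of the contiguous cached run starting at @offset,
 * bounded by @max; 0 means there is nothing useful to extend. */
static pgoff_t example_cached_run(struct address_space *mapping,
                                  pgoff_t offset, unsigned long max)
{
        pgoff_t start;

        read_lock_irq(&mapping->tree_lock);
        start = radix_tree_next_hole(&mapping->page_tree, offset, max + 1);
        read_unlock_irq(&mapping->tree_lock);

        if (!start || start - offset > max)
                return 0;
        return start - offset;  /* pages already cached from @offset onwards */
}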
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
436 | entry = pte_wrprotect(entry); | 436 | entry = pte_wrprotect(entry); |
437 | entry = pte_mkclean(entry); | 437 | entry = pte_mkclean(entry); |
438 | set_pte_at(mm, address, pte, entry); | 438 | set_pte_at(mm, address, pte, entry); |
439 | lazy_mmu_prot_update(entry); | ||
440 | ret = 1; | 439 | ret = 1; |
441 | } | 440 | } |
442 | 441 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index fcd19d323f9f..8a82342a8595 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -49,7 +49,6 @@ | |||
49 | #include <linux/ctype.h> | 49 | #include <linux/ctype.h> |
50 | #include <linux/migrate.h> | 50 | #include <linux/migrate.h> |
51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
52 | #include <linux/backing-dev.h> | ||
53 | 52 | ||
54 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
55 | #include <asm/div64.h> | 54 | #include <asm/div64.h> |
@@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | |||
96 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | 95 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: |
97 | * might be reconsidered if it ever diverges from PAGE_SIZE. | 96 | * might be reconsidered if it ever diverges from PAGE_SIZE. |
98 | * | 97 | * |
99 | * __GFP_MOVABLE is masked out as swap vectors cannot move | 98 | * Mobility flags are masked out as swap vectors cannot move |
100 | */ | 99 | */ |
101 | return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, | 100 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, |
102 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | 101 | PAGE_CACHE_SHIFT-PAGE_SHIFT); |
103 | } | 102 | } |
104 | 103 | ||
@@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
972 | *nodelist++ = '\0'; | 971 | *nodelist++ = '\0'; |
973 | if (nodelist_parse(nodelist, *policy_nodes)) | 972 | if (nodelist_parse(nodelist, *policy_nodes)) |
974 | goto out; | 973 | goto out; |
975 | if (!nodes_subset(*policy_nodes, node_online_map)) | 974 | if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) |
976 | goto out; | 975 | goto out; |
977 | } | 976 | } |
978 | if (!strcmp(value, "default")) { | 977 | if (!strcmp(value, "default")) { |
@@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
997 | err = 0; | 996 | err = 0; |
998 | } else if (!strcmp(value, "interleave")) { | 997 | } else if (!strcmp(value, "interleave")) { |
999 | *policy = MPOL_INTERLEAVE; | 998 | *policy = MPOL_INTERLEAVE; |
1000 | /* Default to nodes online if no nodelist */ | 999 | /* |
1000 | * Default to online nodes with memory if no nodelist | ||
1001 | */ | ||
1001 | if (!nodelist) | 1002 | if (!nodelist) |
1002 | *policy_nodes = node_online_map; | 1003 | *policy_nodes = node_states[N_HIGH_MEMORY]; |
1003 | err = 0; | 1004 | err = 0; |
1004 | } | 1005 | } |
1005 | out: | 1006 | out: |
@@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p, | |||
1025 | return page; | 1026 | return page; |
1026 | } | 1027 | } |
1027 | 1028 | ||
1028 | struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, | 1029 | static struct page *shmem_swapin(struct shmem_inode_info *info, |
1029 | unsigned long idx) | 1030 | swp_entry_t entry, unsigned long idx) |
1030 | { | 1031 | { |
1031 | struct shared_policy *p = &info->policy; | 1032 | struct shared_policy *p = &info->policy; |
1032 | int i, num; | 1033 | int i, num; |
@@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, | |||
1061 | return page; | 1062 | return page; |
1062 | } | 1063 | } |
1063 | #else | 1064 | #else |
1064 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 1065 | static inline int shmem_parse_mpol(char *value, int *policy, |
1066 | nodemask_t *policy_nodes) | ||
1065 | { | 1067 | { |
1066 | return 1; | 1068 | return 1; |
1067 | } | 1069 | } |
@@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
1109 | * Normally, filepage is NULL on entry, and either found | 1111 | * Normally, filepage is NULL on entry, and either found |
1110 | * uptodate immediately, or allocated and zeroed, or read | 1112 | * uptodate immediately, or allocated and zeroed, or read |
1111 | * in under swappage, which is then assigned to filepage. | 1113 | * in under swappage, which is then assigned to filepage. |
1112 | * But shmem_readpage and shmem_prepare_write pass in a locked | 1114 | * But shmem_readpage and shmem_write_begin pass in a locked |
1113 | * filepage, which may be found not uptodate by other callers | 1115 | * filepage, which may be found not uptodate by other callers |
1114 | * too, and may need to be copied from the swappage read in. | 1116 | * too, and may need to be copied from the swappage read in. |
1115 | */ | 1117 | */ |
@@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1327 | } | 1329 | } |
1328 | 1330 | ||
1329 | #ifdef CONFIG_NUMA | 1331 | #ifdef CONFIG_NUMA |
1330 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1332 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
1331 | { | 1333 | { |
1332 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1334 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1333 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1335 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
1334 | } | 1336 | } |
1335 | 1337 | ||
1336 | struct mempolicy * | 1338 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1337 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1339 | unsigned long addr) |
1338 | { | 1340 | { |
1339 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1341 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1340 | unsigned long idx; | 1342 | unsigned long idx; |
@@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations; | |||
1446 | static const struct inode_operations shmem_symlink_inline_operations; | 1448 | static const struct inode_operations shmem_symlink_inline_operations; |
1447 | 1449 | ||
1448 | /* | 1450 | /* |
1449 | * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; | 1451 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; |
1450 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | 1452 | * but providing them allows a tmpfs file to be used for splice, sendfile, and |
1451 | * below the loop driver, in the generic fashion that many filesystems support. | 1453 | * below the loop driver, in the generic fashion that many filesystems support. |
1452 | */ | 1454 | */ |
@@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page) | |||
1459 | } | 1461 | } |
1460 | 1462 | ||
1461 | static int | 1463 | static int |
1462 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) | 1464 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1465 | loff_t pos, unsigned len, unsigned flags, | ||
1466 | struct page **pagep, void **fsdata) | ||
1463 | { | 1467 | { |
1464 | struct inode *inode = page->mapping->host; | 1468 | struct inode *inode = mapping->host; |
1465 | return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); | 1469 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1470 | *pagep = NULL; | ||
1471 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | ||
1472 | } | ||
1473 | |||
1474 | static int | ||
1475 | shmem_write_end(struct file *file, struct address_space *mapping, | ||
1476 | loff_t pos, unsigned len, unsigned copied, | ||
1477 | struct page *page, void *fsdata) | ||
1478 | { | ||
1479 | struct inode *inode = mapping->host; | ||
1480 | |||
1481 | set_page_dirty(page); | ||
1482 | page_cache_release(page); | ||
1483 | |||
1484 | if (pos+copied > inode->i_size) | ||
1485 | i_size_write(inode, pos+copied); | ||
1486 | |||
1487 | return copied; | ||
1466 | } | 1488 | } |
1467 | 1489 | ||
1468 | static ssize_t | 1490 | static ssize_t |
@@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb, | |||
2219 | unsigned long blocks = 0; | 2241 | unsigned long blocks = 0; |
2220 | unsigned long inodes = 0; | 2242 | unsigned long inodes = 0; |
2221 | int policy = MPOL_DEFAULT; | 2243 | int policy = MPOL_DEFAULT; |
2222 | nodemask_t policy_nodes = node_online_map; | 2244 | nodemask_t policy_nodes = node_states[N_HIGH_MEMORY]; |
2223 | 2245 | ||
2224 | #ifdef CONFIG_TMPFS | 2246 | #ifdef CONFIG_TMPFS |
2225 | /* | 2247 | /* |
@@ -2338,8 +2360,8 @@ static const struct address_space_operations shmem_aops = { | |||
2338 | .set_page_dirty = __set_page_dirty_no_writeback, | 2360 | .set_page_dirty = __set_page_dirty_no_writeback, |
2339 | #ifdef CONFIG_TMPFS | 2361 | #ifdef CONFIG_TMPFS |
2340 | .readpage = shmem_readpage, | 2362 | .readpage = shmem_readpage, |
2341 | .prepare_write = shmem_prepare_write, | 2363 | .write_begin = shmem_write_begin, |
2342 | .commit_write = simple_commit_write, | 2364 | .write_end = shmem_write_end, |
2343 | #endif | 2365 | #endif |
2344 | .migratepage = migrate_page, | 2366 | .migratepage = migrate_page, |
2345 | }; | 2367 | }; |
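Replacing prepare_write/commit_write with write_begin/write_end changes the calling convention: the filesystem now looks up the page itself in write_begin and returns it through *pagep, and write_end dirties it, releases it and updates i_size (as shmem_write_end does above). The simplified sketch below shows the shape of a buffered-write step driving the pair; the real generic helper uses atomic user copies and far more error handling, so treat this only as an illustration of the contract.

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>

/* Sketch of one iteration of a buffered write driven through the new
 * ->write_begin/->write_end address_space operations. */
static ssize_t example_buffered_write(struct file *file, const char __user *buf,
                                      size_t count, loff_t pos)
{
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
        unsigned len = min_t(size_t, count, PAGE_CACHE_SIZE - offset);
        struct page *page;
        void *fsdata;
        unsigned long left;
        int status;

        status = a_ops->write_begin(file, mapping, pos, len, 0, &page, &fsdata);
        if (status)
                return status;

        /* Copy user data into the page handed back by write_begin(). */
        left = __copy_from_user(kmap(page) + offset, buf, len);
        kunmap(page);

        /* write_end() dirties the page, drops the reference and grows i_size. */
        return a_ops->write_end(file, mapping, pos, len, len - left,
                                page, fsdata);
}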
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -1568,7 +1568,7 @@ void __init kmem_cache_init(void) | |||
1568 | /* Replace the static kmem_list3 structures for the boot cpu */ | 1568 | /* Replace the static kmem_list3 structures for the boot cpu */ |
1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); | 1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); |
1570 | 1570 | ||
1571 | for_each_online_node(nid) { | 1571 | for_each_node_state(nid, N_NORMAL_MEMORY) { |
1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
1573 | &initkmem_list3[SIZE_AC + nid], nid); | 1573 | &initkmem_list3[SIZE_AC + nid], nid); |
1574 | 1574 | ||
@@ -1643,6 +1643,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1643 | #endif | 1643 | #endif |
1644 | 1644 | ||
1645 | flags |= cachep->gfpflags; | 1645 | flags |= cachep->gfpflags; |
1646 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
1647 | flags |= __GFP_RECLAIMABLE; | ||
1646 | 1648 | ||
1647 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1649 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
1648 | if (!page) | 1650 | if (!page) |
@@ -1944,7 +1946,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) | |||
1944 | { | 1946 | { |
1945 | int node; | 1947 | int node; |
1946 | 1948 | ||
1947 | for_each_online_node(node) { | 1949 | for_each_node_state(node, N_NORMAL_MEMORY) { |
1948 | cachep->nodelists[node] = &initkmem_list3[index + node]; | 1950 | cachep->nodelists[node] = &initkmem_list3[index + node]; |
1949 | cachep->nodelists[node]->next_reap = jiffies + | 1951 | cachep->nodelists[node]->next_reap = jiffies + |
1950 | REAPTIMEOUT_LIST3 + | 1952 | REAPTIMEOUT_LIST3 + |
@@ -2075,7 +2077,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2075 | g_cpucache_up = PARTIAL_L3; | 2077 | g_cpucache_up = PARTIAL_L3; |
2076 | } else { | 2078 | } else { |
2077 | int node; | 2079 | int node; |
2078 | for_each_online_node(node) { | 2080 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2079 | cachep->nodelists[node] = | 2081 | cachep->nodelists[node] = |
2080 | kmalloc_node(sizeof(struct kmem_list3), | 2082 | kmalloc_node(sizeof(struct kmem_list3), |
2081 | GFP_KERNEL, node); | 2083 | GFP_KERNEL, node); |
@@ -2746,9 +2748,9 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2746 | * Be lazy and only check for valid flags here, keeping it out of the | 2748 | * Be lazy and only check for valid flags here, keeping it out of the |
2747 | * critical path in kmem_cache_alloc(). | 2749 | * critical path in kmem_cache_alloc(). |
2748 | */ | 2750 | */ |
2749 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 2751 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
2752 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | ||
2750 | 2753 | ||
2751 | local_flags = (flags & GFP_LEVEL_MASK); | ||
2752 | /* Take the l3 list lock to change the colour_next on this node */ | 2754 | /* Take the l3 list lock to change the colour_next on this node */ |
2753 | check_irq_off(); | 2755 | check_irq_off(); |
2754 | l3 = cachep->nodelists[nodeid]; | 2756 | l3 = cachep->nodelists[nodeid]; |
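The old GFP_LEVEL_MASK filter is gone; flags are now grouped by purpose, with GFP_RECLAIM_MASK covering reclaim behaviour, GFP_CONSTRAINT_MASK covering placement constraints such as __GFP_THISNODE, and GFP_SLAB_BUG_MASK naming bits the slab layer must never be handed. A hedged sketch of the filtering idiom the hunk above applies; the helper name is hypothetical and only mask names that appear in this diff are used.

/* Sketch: validate caller flags and keep only the bits that make sense to
 * forward to the page allocator from a slab allocator. */
static gfp_t example_filter_slab_flags(gfp_t flags)
{
        BUG_ON(flags & GFP_SLAB_BUG_MASK);      /* e.g. highmem/DMA32 placement bits */

        return flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK);
}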
@@ -2785,7 +2787,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2785 | 2787 | ||
2786 | /* Get slab management. */ | 2788 | /* Get slab management. */ |
2787 | slabp = alloc_slabmgmt(cachep, objp, offset, | 2789 | slabp = alloc_slabmgmt(cachep, objp, offset, |
2788 | local_flags & ~GFP_THISNODE, nodeid); | 2790 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); |
2789 | if (!slabp) | 2791 | if (!slabp) |
2790 | goto opps1; | 2792 | goto opps1; |
2791 | 2793 | ||
@@ -3225,7 +3227,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3225 | 3227 | ||
3226 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) | 3228 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) |
3227 | ->node_zonelists[gfp_zone(flags)]; | 3229 | ->node_zonelists[gfp_zone(flags)]; |
3228 | local_flags = (flags & GFP_LEVEL_MASK); | 3230 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3229 | 3231 | ||
3230 | retry: | 3232 | retry: |
3231 | /* | 3233 | /* |
@@ -3792,7 +3794,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3792 | struct array_cache *new_shared; | 3794 | struct array_cache *new_shared; |
3793 | struct array_cache **new_alien = NULL; | 3795 | struct array_cache **new_alien = NULL; |
3794 | 3796 | ||
3795 | for_each_online_node(node) { | 3797 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3796 | 3798 | ||
3797 | if (use_alien_caches) { | 3799 | if (use_alien_caches) { |
3798 | new_alien = alloc_alien_cache(node, cachep->limit); | 3800 | new_alien = alloc_alien_cache(node, cachep->limit); |
@@ -4446,7 +4448,8 @@ const struct seq_operations slabstats_op = { | |||
4446 | */ | 4448 | */ |
4447 | size_t ksize(const void *objp) | 4449 | size_t ksize(const void *objp) |
4448 | { | 4450 | { |
4449 | if (unlikely(ZERO_OR_NULL_PTR(objp))) | 4451 | BUG_ON(!objp); |
4452 | if (unlikely(objp == ZERO_SIZE_PTR)) | ||
4450 | return 0; | 4453 | return 0; |
4451 | 4454 | ||
4452 | return obj_size(virt_to_cache(objp)); | 4455 | return obj_size(virt_to_cache(objp)); |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
360 | slobidx_t units; | 360 | slobidx_t units; |
361 | unsigned long flags; | 361 | unsigned long flags; |
362 | 362 | ||
363 | if (ZERO_OR_NULL_PTR(block)) | 363 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
364 | return; | 364 | return; |
365 | BUG_ON(!size); | 365 | BUG_ON(!size); |
366 | 366 | ||
@@ -466,7 +466,7 @@ void kfree(const void *block) | |||
466 | { | 466 | { |
467 | struct slob_page *sp; | 467 | struct slob_page *sp; |
468 | 468 | ||
469 | if (ZERO_OR_NULL_PTR(block)) | 469 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
470 | return; | 470 | return; |
471 | 471 | ||
472 | sp = (struct slob_page *)virt_to_page(block); | 472 | sp = (struct slob_page *)virt_to_page(block); |
@@ -484,7 +484,8 @@ size_t ksize(const void *block) | |||
484 | { | 484 | { |
485 | struct slob_page *sp; | 485 | struct slob_page *sp; |
486 | 486 | ||
487 | if (ZERO_OR_NULL_PTR(block)) | 487 | BUG_ON(!block); |
488 | if (unlikely(block == ZERO_SIZE_PTR)) | ||
488 | return 0; | 489 | return 0; |
489 | 490 | ||
490 | sp = (struct slob_page *)virt_to_page(block); | 491 | sp = (struct slob_page *)virt_to_page(block); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -90,7 +90,7 @@ | |||
90 | * One use of this flag is to mark slabs that are | 90 | * One use of this flag is to mark slabs that are |
91 | * used for allocations. Then such a slab becomes a cpu | 91 | * used for allocations. Then such a slab becomes a cpu |
92 | * slab. The cpu slab may be equipped with an additional | 92 | * slab. The cpu slab may be equipped with an additional |
93 | * lockless_freelist that allows lockless access to | 93 | * freelist that allows lockless access to |
94 | * free objects in addition to the regular freelist | 94 | * free objects in addition to the regular freelist |
95 | * that requires the slab lock. | 95 | * that requires the slab lock. |
96 | * | 96 | * |
@@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
140 | /* | 140 | /* |
141 | * Issues still to be resolved: | 141 | * Issues still to be resolved: |
142 | * | 142 | * |
143 | * - The per cpu array is updated for each new slab and and is a remote | ||
144 | * cacheline for most nodes. This could become a bouncing cacheline given | ||
145 | * enough frequent updates. There are 16 pointers in a cacheline, so at | ||
146 | * max 16 cpus could compete for the cacheline which may be okay. | ||
147 | * | ||
148 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 143 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
149 | * | 144 | * |
150 | * - Variable sizing of the per node arrays | 145 | * - Variable sizing of the per node arrays |
@@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
205 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) | 200 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) |
206 | #endif | 201 | #endif |
207 | 202 | ||
208 | /* | ||
209 | * The page->inuse field is 16 bit thus we have this limitation | ||
210 | */ | ||
211 | #define MAX_OBJECTS_PER_SLAB 65535 | ||
212 | |||
213 | /* Internal SLUB flags */ | 203 | /* Internal SLUB flags */ |
214 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 204 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
215 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ | 205 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ |
@@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
277 | #endif | 267 | #endif |
278 | } | 268 | } |
279 | 269 | ||
270 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
271 | { | ||
272 | #ifdef CONFIG_SMP | ||
273 | return s->cpu_slab[cpu]; | ||
274 | #else | ||
275 | return &s->cpu_slab; | ||
276 | #endif | ||
277 | } | ||
278 | |||
280 | static inline int check_valid_pointer(struct kmem_cache *s, | 279 | static inline int check_valid_pointer(struct kmem_cache *s, |
281 | struct page *page, const void *object) | 280 | struct page *page, const void *object) |
282 | { | 281 | { |
@@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
729 | slab_err(s, page, "Not a valid slab page"); | 728 | slab_err(s, page, "Not a valid slab page"); |
730 | return 0; | 729 | return 0; |
731 | } | 730 | } |
732 | if (page->offset * sizeof(void *) != s->offset) { | ||
733 | slab_err(s, page, "Corrupted offset %lu", | ||
734 | (unsigned long)(page->offset * sizeof(void *))); | ||
735 | return 0; | ||
736 | } | ||
737 | if (page->inuse > s->objects) { | 731 | if (page->inuse > s->objects) { |
738 | slab_err(s, page, "inuse %u > max %u", | 732 | slab_err(s, page, "inuse %u > max %u", |
739 | s->name, page->inuse, s->objects); | 733 | s->name, page->inuse, s->objects); |
@@ -872,8 +866,6 @@ bad: | |||
872 | slab_fix(s, "Marking all objects used"); | 866 | slab_fix(s, "Marking all objects used"); |
873 | page->inuse = s->objects; | 867 | page->inuse = s->objects; |
874 | page->freelist = NULL; | 868 | page->freelist = NULL; |
875 | /* Fix up fields that may be corrupted */ | ||
876 | page->offset = s->offset / sizeof(void *); | ||
877 | } | 869 | } |
878 | return 0; | 870 | return 0; |
879 | } | 871 | } |
@@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1055 | if (s->flags & SLAB_CACHE_DMA) | 1047 | if (s->flags & SLAB_CACHE_DMA) |
1056 | flags |= SLUB_DMA; | 1048 | flags |= SLUB_DMA; |
1057 | 1049 | ||
1050 | if (s->flags & SLAB_RECLAIM_ACCOUNT) | ||
1051 | flags |= __GFP_RECLAIMABLE; | ||
1052 | |||
1058 | if (node == -1) | 1053 | if (node == -1) |
1059 | page = alloc_pages(flags, s->order); | 1054 | page = alloc_pages(flags, s->order); |
1060 | else | 1055 | else |
@@ -1088,19 +1083,19 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1088 | void *last; | 1083 | void *last; |
1089 | void *p; | 1084 | void *p; |
1090 | 1085 | ||
1091 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1092 | 1087 | ||
1093 | if (flags & __GFP_WAIT) | 1088 | if (flags & __GFP_WAIT) |
1094 | local_irq_enable(); | 1089 | local_irq_enable(); |
1095 | 1090 | ||
1096 | page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); | 1091 | page = allocate_slab(s, |
1092 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
1097 | if (!page) | 1093 | if (!page) |
1098 | goto out; | 1094 | goto out; |
1099 | 1095 | ||
1100 | n = get_node(s, page_to_nid(page)); | 1096 | n = get_node(s, page_to_nid(page)); |
1101 | if (n) | 1097 | if (n) |
1102 | atomic_long_inc(&n->nr_slabs); | 1098 | atomic_long_inc(&n->nr_slabs); |
1103 | page->offset = s->offset / sizeof(void *); | ||
1104 | page->slab = s; | 1099 | page->slab = s; |
1105 | page->flags |= 1 << PG_slab; | 1100 | page->flags |= 1 << PG_slab; |
1106 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1101 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
@@ -1123,7 +1118,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1123 | set_freepointer(s, last, NULL); | 1118 | set_freepointer(s, last, NULL); |
1124 | 1119 | ||
1125 | page->freelist = start; | 1120 | page->freelist = start; |
1126 | page->lockless_freelist = NULL; | ||
1127 | page->inuse = 0; | 1121 | page->inuse = 0; |
1128 | out: | 1122 | out: |
1129 | if (flags & __GFP_WAIT) | 1123 | if (flags & __GFP_WAIT) |
@@ -1149,7 +1143,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1149 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1143 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1150 | - pages); | 1144 | - pages); |
1151 | 1145 | ||
1152 | page->mapping = NULL; | ||
1153 | __free_pages(page, s->order); | 1146 | __free_pages(page, s->order); |
1154 | } | 1147 | } |
1155 | 1148 | ||
@@ -1383,33 +1376,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
1383 | /* | 1376 | /* |
1384 | * Remove the cpu slab | 1377 | * Remove the cpu slab |
1385 | */ | 1378 | */ |
1386 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) | 1379 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1387 | { | 1380 | { |
1381 | struct page *page = c->page; | ||
1388 | /* | 1382 | /* |
1389 | * Merge cpu freelist into freelist. Typically we get here | 1383 | * Merge cpu freelist into freelist. Typically we get here |
1390 | * because both freelists are empty. So this is unlikely | 1384 | * because both freelists are empty. So this is unlikely |
1391 | * to occur. | 1385 | * to occur. |
1392 | */ | 1386 | */ |
1393 | while (unlikely(page->lockless_freelist)) { | 1387 | while (unlikely(c->freelist)) { |
1394 | void **object; | 1388 | void **object; |
1395 | 1389 | ||
1396 | /* Retrieve object from cpu_freelist */ | 1390 | /* Retrieve object from cpu_freelist */ |
1397 | object = page->lockless_freelist; | 1391 | object = c->freelist; |
1398 | page->lockless_freelist = page->lockless_freelist[page->offset]; | 1392 | c->freelist = c->freelist[c->offset]; |
1399 | 1393 | ||
1400 | /* And put onto the regular freelist */ | 1394 | /* And put onto the regular freelist */ |
1401 | object[page->offset] = page->freelist; | 1395 | object[c->offset] = page->freelist; |
1402 | page->freelist = object; | 1396 | page->freelist = object; |
1403 | page->inuse--; | 1397 | page->inuse--; |
1404 | } | 1398 | } |
1405 | s->cpu_slab[cpu] = NULL; | 1399 | c->page = NULL; |
1406 | unfreeze_slab(s, page); | 1400 | unfreeze_slab(s, page); |
1407 | } | 1401 | } |
1408 | 1402 | ||
1409 | static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | 1403 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1410 | { | 1404 | { |
1411 | slab_lock(page); | 1405 | slab_lock(c->page); |
1412 | deactivate_slab(s, page, cpu); | 1406 | deactivate_slab(s, c); |
1413 | } | 1407 | } |
1414 | 1408 | ||
1415 | /* | 1409 | /* |
@@ -1418,18 +1412,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | |||
1418 | */ | 1412 | */ |
1419 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1413 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
1420 | { | 1414 | { |
1421 | struct page *page = s->cpu_slab[cpu]; | 1415 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
1422 | 1416 | ||
1423 | if (likely(page)) | 1417 | if (likely(c && c->page)) |
1424 | flush_slab(s, page, cpu); | 1418 | flush_slab(s, c); |
1425 | } | 1419 | } |
1426 | 1420 | ||
1427 | static void flush_cpu_slab(void *d) | 1421 | static void flush_cpu_slab(void *d) |
1428 | { | 1422 | { |
1429 | struct kmem_cache *s = d; | 1423 | struct kmem_cache *s = d; |
1430 | int cpu = smp_processor_id(); | ||
1431 | 1424 | ||
1432 | __flush_cpu_slab(s, cpu); | 1425 | __flush_cpu_slab(s, smp_processor_id()); |
1433 | } | 1426 | } |
1434 | 1427 | ||
1435 | static void flush_all(struct kmem_cache *s) | 1428 | static void flush_all(struct kmem_cache *s) |
@@ -1446,6 +1439,19 @@ static void flush_all(struct kmem_cache *s) | |||
1446 | } | 1439 | } |
1447 | 1440 | ||
1448 | /* | 1441 | /* |
1442 | * Check if the objects in a per cpu structure fit numa | ||
1443 | * locality expectations. | ||
1444 | */ | ||
1445 | static inline int node_match(struct kmem_cache_cpu *c, int node) | ||
1446 | { | ||
1447 | #ifdef CONFIG_NUMA | ||
1448 | if (node != -1 && c->node != node) | ||
1449 | return 0; | ||
1450 | #endif | ||
1451 | return 1; | ||
1452 | } | ||
1453 | |||
1454 | /* | ||
1449 | * Slow path. The lockless freelist is empty or we need to perform | 1455 | * Slow path. The lockless freelist is empty or we need to perform |
1450 | * debugging duties. | 1456 | * debugging duties. |
1451 | * | 1457 | * |
@@ -1463,45 +1469,46 @@ static void flush_all(struct kmem_cache *s) | |||
1463 | * we need to allocate a new slab. This is slowest path since we may sleep. | 1469 | * we need to allocate a new slab. This is slowest path since we may sleep. |
1464 | */ | 1470 | */ |
1465 | static void *__slab_alloc(struct kmem_cache *s, | 1471 | static void *__slab_alloc(struct kmem_cache *s, |
1466 | gfp_t gfpflags, int node, void *addr, struct page *page) | 1472 | gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) |
1467 | { | 1473 | { |
1468 | void **object; | 1474 | void **object; |
1469 | int cpu = smp_processor_id(); | 1475 | struct page *new; |
1470 | 1476 | ||
1471 | if (!page) | 1477 | if (!c->page) |
1472 | goto new_slab; | 1478 | goto new_slab; |
1473 | 1479 | ||
1474 | slab_lock(page); | 1480 | slab_lock(c->page); |
1475 | if (unlikely(node != -1 && page_to_nid(page) != node)) | 1481 | if (unlikely(!node_match(c, node))) |
1476 | goto another_slab; | 1482 | goto another_slab; |
1477 | load_freelist: | 1483 | load_freelist: |
1478 | object = page->freelist; | 1484 | object = c->page->freelist; |
1479 | if (unlikely(!object)) | 1485 | if (unlikely(!object)) |
1480 | goto another_slab; | 1486 | goto another_slab; |
1481 | if (unlikely(SlabDebug(page))) | 1487 | if (unlikely(SlabDebug(c->page))) |
1482 | goto debug; | 1488 | goto debug; |
1483 | 1489 | ||
1484 | object = page->freelist; | 1490 | object = c->page->freelist; |
1485 | page->lockless_freelist = object[page->offset]; | 1491 | c->freelist = object[c->offset]; |
1486 | page->inuse = s->objects; | 1492 | c->page->inuse = s->objects; |
1487 | page->freelist = NULL; | 1493 | c->page->freelist = NULL; |
1488 | slab_unlock(page); | 1494 | c->node = page_to_nid(c->page); |
1495 | slab_unlock(c->page); | ||
1489 | return object; | 1496 | return object; |
1490 | 1497 | ||
1491 | another_slab: | 1498 | another_slab: |
1492 | deactivate_slab(s, page, cpu); | 1499 | deactivate_slab(s, c); |
1493 | 1500 | ||
1494 | new_slab: | 1501 | new_slab: |
1495 | page = get_partial(s, gfpflags, node); | 1502 | new = get_partial(s, gfpflags, node); |
1496 | if (page) { | 1503 | if (new) { |
1497 | s->cpu_slab[cpu] = page; | 1504 | c->page = new; |
1498 | goto load_freelist; | 1505 | goto load_freelist; |
1499 | } | 1506 | } |
1500 | 1507 | ||
1501 | page = new_slab(s, gfpflags, node); | 1508 | new = new_slab(s, gfpflags, node); |
1502 | if (page) { | 1509 | if (new) { |
1503 | cpu = smp_processor_id(); | 1510 | c = get_cpu_slab(s, smp_processor_id()); |
1504 | if (s->cpu_slab[cpu]) { | 1511 | if (c->page) { |
1505 | /* | 1512 | /* |
1506 | * Someone else populated the cpu_slab while we | 1513 | * Someone else populated the cpu_slab while we |
1507 | * enabled interrupts, or we have gotten scheduled | 1514 | * enabled interrupts, or we have gotten scheduled |
@@ -1509,34 +1516,33 @@ new_slab: | |||
1509 | * requested node even if __GFP_THISNODE was | 1516 | * requested node even if __GFP_THISNODE was |
1510 | * specified. So we need to recheck. | 1517 | * specified. So we need to recheck. |
1511 | */ | 1518 | */ |
1512 | if (node == -1 || | 1519 | if (node_match(c, node)) { |
1513 | page_to_nid(s->cpu_slab[cpu]) == node) { | ||
1514 | /* | 1520 | /* |
1515 | * Current cpuslab is acceptable and we | 1521 | * Current cpuslab is acceptable and we |
1516 | * want the current one since its cache hot | 1522 | * want the current one since its cache hot |
1517 | */ | 1523 | */ |
1518 | discard_slab(s, page); | 1524 | discard_slab(s, new); |
1519 | page = s->cpu_slab[cpu]; | 1525 | slab_lock(c->page); |
1520 | slab_lock(page); | ||
1521 | goto load_freelist; | 1526 | goto load_freelist; |
1522 | } | 1527 | } |
1523 | /* New slab does not fit our expectations */ | 1528 | /* New slab does not fit our expectations */ |
1524 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1529 | flush_slab(s, c); |
1525 | } | 1530 | } |
1526 | slab_lock(page); | 1531 | slab_lock(new); |
1527 | SetSlabFrozen(page); | 1532 | SetSlabFrozen(new); |
1528 | s->cpu_slab[cpu] = page; | 1533 | c->page = new; |
1529 | goto load_freelist; | 1534 | goto load_freelist; |
1530 | } | 1535 | } |
1531 | return NULL; | 1536 | return NULL; |
1532 | debug: | 1537 | debug: |
1533 | object = page->freelist; | 1538 | object = c->page->freelist; |
1534 | if (!alloc_debug_processing(s, page, object, addr)) | 1539 | if (!alloc_debug_processing(s, c->page, object, addr)) |
1535 | goto another_slab; | 1540 | goto another_slab; |
1536 | 1541 | ||
1537 | page->inuse++; | 1542 | c->page->inuse++; |
1538 | page->freelist = object[page->offset]; | 1543 | c->page->freelist = object[c->offset]; |
1539 | slab_unlock(page); | 1544 | c->node = -1; |
1545 | slab_unlock(c->page); | ||
1540 | return object; | 1546 | return object; |
1541 | } | 1547 | } |
1542 | 1548 | ||
@@ -1553,25 +1559,24 @@ debug: | |||
1553 | static void __always_inline *slab_alloc(struct kmem_cache *s, | 1559 | static void __always_inline *slab_alloc(struct kmem_cache *s, |
1554 | gfp_t gfpflags, int node, void *addr) | 1560 | gfp_t gfpflags, int node, void *addr) |
1555 | { | 1561 | { |
1556 | struct page *page; | ||
1557 | void **object; | 1562 | void **object; |
1558 | unsigned long flags; | 1563 | unsigned long flags; |
1564 | struct kmem_cache_cpu *c; | ||
1559 | 1565 | ||
1560 | local_irq_save(flags); | 1566 | local_irq_save(flags); |
1561 | page = s->cpu_slab[smp_processor_id()]; | 1567 | c = get_cpu_slab(s, smp_processor_id()); |
1562 | if (unlikely(!page || !page->lockless_freelist || | 1568 | if (unlikely(!c->freelist || !node_match(c, node))) |
1563 | (node != -1 && page_to_nid(page) != node))) | ||
1564 | 1569 | ||
1565 | object = __slab_alloc(s, gfpflags, node, addr, page); | 1570 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1566 | 1571 | ||
1567 | else { | 1572 | else { |
1568 | object = page->lockless_freelist; | 1573 | object = c->freelist; |
1569 | page->lockless_freelist = object[page->offset]; | 1574 | c->freelist = object[c->offset]; |
1570 | } | 1575 | } |
1571 | local_irq_restore(flags); | 1576 | local_irq_restore(flags); |
1572 | 1577 | ||
1573 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1578 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
1574 | memset(object, 0, s->objsize); | 1579 | memset(object, 0, c->objsize); |
1575 | 1580 | ||
1576 | return object; | 1581 | return object; |
1577 | } | 1582 | } |
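Pulled out of the diff noise, the new allocation fastpath is just a per-cpu freelist pop inside an interrupt-off section, with __slab_alloc() as the fallback when the freelist is empty or the NUMA node does not match. The sketch below restates the logic of slab_alloc() above in one piece; it is not a drop-in replacement.

/* Sketch of the SLUB allocation fastpath after this change: pop one object
 * off the per-cpu freelist with interrupts disabled, and fall back to
 * __slab_alloc() when the freelist is empty or the node does not match. */
static void *example_fastpath_alloc(struct kmem_cache *s, gfp_t gfpflags,
                                    int node, void *addr)
{
        struct kmem_cache_cpu *c;
        void **object;
        unsigned long flags;

        local_irq_save(flags);
        c = get_cpu_slab(s, smp_processor_id());
        if (unlikely(!c->freelist || !node_match(c, node))) {
                object = __slab_alloc(s, gfpflags, node, addr, c);
        } else {
                object = c->freelist;            /* head of the per-cpu freelist */
                c->freelist = object[c->offset]; /* next pointer lives inside the object */
        }
        local_irq_restore(flags);

        if (unlikely((gfpflags & __GFP_ZERO) && object))
                memset(object, 0, c->objsize);
        return object;
}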
@@ -1599,7 +1604,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
1599 | * handling required then we can return immediately. | 1604 | * handling required then we can return immediately. |
1600 | */ | 1605 | */ |
1601 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1606 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1602 | void *x, void *addr) | 1607 | void *x, void *addr, unsigned int offset) |
1603 | { | 1608 | { |
1604 | void *prior; | 1609 | void *prior; |
1605 | void **object = (void *)x; | 1610 | void **object = (void *)x; |
@@ -1609,7 +1614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
1609 | if (unlikely(SlabDebug(page))) | 1614 | if (unlikely(SlabDebug(page))) |
1610 | goto debug; | 1615 | goto debug; |
1611 | checks_ok: | 1616 | checks_ok: |
1612 | prior = object[page->offset] = page->freelist; | 1617 | prior = object[offset] = page->freelist; |
1613 | page->freelist = object; | 1618 | page->freelist = object; |
1614 | page->inuse--; | 1619 | page->inuse--; |
1615 | 1620 | ||
@@ -1664,15 +1669,16 @@ static void __always_inline slab_free(struct kmem_cache *s, | |||
1664 | { | 1669 | { |
1665 | void **object = (void *)x; | 1670 | void **object = (void *)x; |
1666 | unsigned long flags; | 1671 | unsigned long flags; |
1672 | struct kmem_cache_cpu *c; | ||
1667 | 1673 | ||
1668 | local_irq_save(flags); | 1674 | local_irq_save(flags); |
1669 | debug_check_no_locks_freed(object, s->objsize); | 1675 | debug_check_no_locks_freed(object, s->objsize); |
1670 | if (likely(page == s->cpu_slab[smp_processor_id()] && | 1676 | c = get_cpu_slab(s, smp_processor_id()); |
1671 | !SlabDebug(page))) { | 1677 | if (likely(page == c->page && c->node >= 0)) { |
1672 | object[page->offset] = page->lockless_freelist; | 1678 | object[c->offset] = c->freelist; |
1673 | page->lockless_freelist = object; | 1679 | c->freelist = object; |
1674 | } else | 1680 | } else |
1675 | __slab_free(s, page, x, addr); | 1681 | __slab_free(s, page, x, addr, c->offset); |
1676 | 1682 | ||
1677 | local_irq_restore(flags); | 1683 | local_irq_restore(flags); |
1678 | } | 1684 | } |
@@ -1759,14 +1765,6 @@ static inline int slab_order(int size, int min_objects, | |||
1759 | int rem; | 1765 | int rem; |
1760 | int min_order = slub_min_order; | 1766 | int min_order = slub_min_order; |
1761 | 1767 | ||
1762 | /* | ||
1763 | * If we would create too many object per slab then reduce | ||
1764 | * the slab order even if it goes below slub_min_order. | ||
1765 | */ | ||
1766 | while (min_order > 0 && | ||
1767 | (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) | ||
1768 | min_order--; | ||
1769 | |||
1770 | for (order = max(min_order, | 1768 | for (order = max(min_order, |
1771 | fls(min_objects * size - 1) - PAGE_SHIFT); | 1769 | fls(min_objects * size - 1) - PAGE_SHIFT); |
1772 | order <= max_order; order++) { | 1770 | order <= max_order; order++) { |
@@ -1781,9 +1779,6 @@ static inline int slab_order(int size, int min_objects, | |||
1781 | if (rem <= slab_size / fract_leftover) | 1779 | if (rem <= slab_size / fract_leftover) |
1782 | break; | 1780 | break; |
1783 | 1781 | ||
1784 | /* If the next size is too high then exit now */ | ||
1785 | if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) | ||
1786 | break; | ||
1787 | } | 1782 | } |
1788 | 1783 | ||
1789 | return order; | 1784 | return order; |
@@ -1858,6 +1853,16 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
1858 | return ALIGN(align, sizeof(void *)); | 1853 | return ALIGN(align, sizeof(void *)); |
1859 | } | 1854 | } |
1860 | 1855 | ||
1856 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
1857 | struct kmem_cache_cpu *c) | ||
1858 | { | ||
1859 | c->page = NULL; | ||
1860 | c->freelist = NULL; | ||
1861 | c->node = 0; | ||
1862 | c->offset = s->offset / sizeof(void *); | ||
1863 | c->objsize = s->objsize; | ||
1864 | } | ||
1865 | |||
1861 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1866 | static void init_kmem_cache_node(struct kmem_cache_node *n) |
1862 | { | 1867 | { |
1863 | n->nr_partial = 0; | 1868 | n->nr_partial = 0; |
@@ -1869,6 +1874,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
1869 | #endif | 1874 | #endif |
1870 | } | 1875 | } |
1871 | 1876 | ||
1877 | #ifdef CONFIG_SMP | ||
1878 | /* | ||
1879 | * Per cpu array for per cpu structures. | ||
1880 | * | ||
1881 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
1882 | * close together meaning that it becomes possible that multiple per cpu | ||
1883 | * structures are contained in one cacheline. This may be particularly | ||
1884 | * beneficial for the kmalloc caches. | ||
1885 | * | ||
1886 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
1887 | * likely able to get per cpu structures for all caches from the array defined | ||
1888 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
1889 | * | ||
1890 | * If the per cpu array is exhausted then fall back to kmalloc | ||
1891 | * of individual cachelines. No sharing is possible then. | ||
1892 | */ | ||
1893 | #define NR_KMEM_CACHE_CPU 100 | ||
1894 | |||
1895 | static DEFINE_PER_CPU(struct kmem_cache_cpu, | ||
1896 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; | ||
1897 | |||
1898 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
1899 | static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; | ||
1900 | |||
1901 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
1902 | int cpu, gfp_t flags) | ||
1903 | { | ||
1904 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
1905 | |||
1906 | if (c) | ||
1907 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
1908 | (void *)c->freelist; | ||
1909 | else { | ||
1910 | /* Table overflow: So allocate ourselves */ | ||
1911 | c = kmalloc_node( | ||
1912 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
1913 | flags, cpu_to_node(cpu)); | ||
1914 | if (!c) | ||
1915 | return NULL; | ||
1916 | } | ||
1917 | |||
1918 | init_kmem_cache_cpu(s, c); | ||
1919 | return c; | ||
1920 | } | ||
1921 | |||
1922 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
1923 | { | ||
1924 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
1925 | c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
1926 | kfree(c); | ||
1927 | return; | ||
1928 | } | ||
1929 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
1930 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
1931 | } | ||
1932 | |||
1933 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
1934 | { | ||
1935 | int cpu; | ||
1936 | |||
1937 | for_each_online_cpu(cpu) { | ||
1938 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
1939 | |||
1940 | if (c) { | ||
1941 | s->cpu_slab[cpu] = NULL; | ||
1942 | free_kmem_cache_cpu(c, cpu); | ||
1943 | } | ||
1944 | } | ||
1945 | } | ||
1946 | |||
1947 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
1948 | { | ||
1949 | int cpu; | ||
1950 | |||
1951 | for_each_online_cpu(cpu) { | ||
1952 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
1953 | |||
1954 | if (c) | ||
1955 | continue; | ||
1956 | |||
1957 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
1958 | if (!c) { | ||
1959 | free_kmem_cache_cpus(s); | ||
1960 | return 0; | ||
1961 | } | ||
1962 | s->cpu_slab[cpu] = c; | ||
1963 | } | ||
1964 | return 1; | ||
1965 | } | ||
1966 | |||
1967 | /* | ||
1968 | * Initialize the per cpu array. | ||
1969 | */ | ||
1970 | static void init_alloc_cpu_cpu(int cpu) | ||
1971 | { | ||
1972 | int i; | ||
1973 | |||
1974 | if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) | ||
1975 | return; | ||
1976 | |||
1977 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
1978 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
1979 | |||
1980 | cpu_set(cpu, kmem_cach_cpu_free_init_once); | ||
1981 | } | ||
1982 | |||
1983 | static void __init init_alloc_cpu(void) | ||
1984 | { | ||
1985 | int cpu; | ||
1986 | |||
1987 | for_each_online_cpu(cpu) | ||
1988 | init_alloc_cpu_cpu(cpu); | ||
1989 | } | ||
1990 | |||
1991 | #else | ||
1992 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | ||
1993 | static inline void init_alloc_cpu(void) {} | ||
1994 | |||
1995 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
1996 | { | ||
1997 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
1998 | return 1; | ||
1999 | } | ||
2000 | #endif | ||
2001 | |||
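The comment block above describes a bootstrap pool: each CPU gets a small static array of kmem_cache_cpu structures threaded onto a per-cpu free list (the unused freelist field doubles as the link), and alloc_kmem_cache_cpu() only falls back to kmalloc_node() once that pool runs dry. The userspace sketch below models that allocate/free discipline for a single CPU; the names and the pool size are illustrative stand-ins, not the kernel's.

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_POOL 4                        /* stand-in for NR_KMEM_CACHE_CPU */

    struct pcpu_struct {                     /* stand-in for struct kmem_cache_cpu */
            void *freelist;                  /* doubles as the free-list link while pooled */
    };

    static struct pcpu_struct pool[NR_POOL]; /* models the static per-cpu array */
    static struct pcpu_struct *pool_free;    /* models per_cpu(kmem_cache_cpu_free, cpu) */

    static void pool_init(void)
    {
            /* Thread every entry onto the free list, as init_alloc_cpu_cpu() does. */
            for (int i = NR_POOL - 1; i >= 0; i--) {
                    pool[i].freelist = pool_free;
                    pool_free = &pool[i];
            }
    }

    static struct pcpu_struct *pool_alloc(void)
    {
            struct pcpu_struct *c = pool_free;

            if (c) {
                    pool_free = c->freelist;     /* pop from the static pool */
                    return c;
            }
            return malloc(sizeof(*c));           /* pool exhausted: fall back to the heap */
    }

    static void pool_release(struct pcpu_struct *c)
    {
            if (c < pool || c >= pool + NR_POOL) {
                    free(c);                     /* came from the heap, return it there */
                    return;
            }
            c->freelist = pool_free;             /* push back onto the per-cpu free list */
            pool_free = c;
    }

    int main(void)
    {
            pool_init();
            struct pcpu_struct *a = pool_alloc();
            struct pcpu_struct *b = pool_alloc();
            printf("a pooled: %d, b pooled: %d\n",
                   a >= pool && a < pool + NR_POOL,
                   b >= pool && b < pool + NR_POOL);
            pool_release(b);
            pool_release(a);
            return 0;
    }

The benefit the comment points at is locality: pool entries for different caches on the same CPU sit next to each other, so several of them can share a cacheline, which a per-cache kmalloc of one structure per CPU cannot guarantee.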
1872 | #ifdef CONFIG_NUMA | 2002 | #ifdef CONFIG_NUMA |
1873 | /* | 2003 | /* |
1874 | * No kmalloc_node yet so do it by hand. We know that this is the first | 2004 | * No kmalloc_node yet so do it by hand. We know that this is the first |
@@ -1876,10 +2006,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
1876 | * possible. | 2006 | * possible. |
1877 | * | 2007 | * |
1878 | * Note that this function only works on the kmalloc_node_cache | 2008 | * Note that this function only works on the kmalloc_node_cache |
1879 | * when allocating for the kmalloc_node_cache. | 2009 | * when allocating for the kmalloc_node_cache. This is used for bootstrapping |
2010 | * memory on a fresh node that has no slab structures yet. | ||
1880 | */ | 2011 | */ |
1881 | static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, | 2012 | static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, |
1882 | int node) | 2013 | int node) |
1883 | { | 2014 | { |
1884 | struct page *page; | 2015 | struct page *page; |
1885 | struct kmem_cache_node *n; | 2016 | struct kmem_cache_node *n; |
@@ -1921,7 +2052,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
1921 | { | 2052 | { |
1922 | int node; | 2053 | int node; |
1923 | 2054 | ||
1924 | for_each_online_node(node) { | 2055 | for_each_node_state(node, N_NORMAL_MEMORY) { |
1925 | struct kmem_cache_node *n = s->node[node]; | 2056 | struct kmem_cache_node *n = s->node[node]; |
1926 | if (n && n != &s->local_node) | 2057 | if (n && n != &s->local_node) |
1927 | kmem_cache_free(kmalloc_caches, n); | 2058 | kmem_cache_free(kmalloc_caches, n); |
@@ -1939,7 +2070,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
1939 | else | 2070 | else |
1940 | local_node = 0; | 2071 | local_node = 0; |
1941 | 2072 | ||
1942 | for_each_online_node(node) { | 2073 | for_each_node_state(node, N_NORMAL_MEMORY) { |
1943 | struct kmem_cache_node *n; | 2074 | struct kmem_cache_node *n; |
1944 | 2075 | ||
1945 | if (local_node == node) | 2076 | if (local_node == node) |
@@ -2077,14 +2208,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
2077 | */ | 2208 | */ |
2078 | s->objects = (PAGE_SIZE << s->order) / size; | 2209 | s->objects = (PAGE_SIZE << s->order) / size; |
2079 | 2210 | ||
2080 | /* | 2211 | return !!s->objects; |
2081 | * Verify that the number of objects is within permitted limits. | ||
2082 | * The page->inuse field is only 16 bit wide! So we cannot have | ||
2083 | * more than 64k objects per slab. | ||
2084 | */ | ||
2085 | if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) | ||
2086 | return 0; | ||
2087 | return 1; | ||
2088 | 2212 | ||
2089 | } | 2213 | } |
2090 | 2214 | ||
@@ -2107,9 +2231,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2107 | #ifdef CONFIG_NUMA | 2231 | #ifdef CONFIG_NUMA |
2108 | s->defrag_ratio = 100; | 2232 | s->defrag_ratio = 100; |
2109 | #endif | 2233 | #endif |
2234 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | ||
2235 | goto error; | ||
2110 | 2236 | ||
2111 | if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2237 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
2112 | return 1; | 2238 | return 1; |
2239 | free_kmem_cache_nodes(s); | ||
2113 | error: | 2240 | error: |
2114 | if (flags & SLAB_PANIC) | 2241 | if (flags & SLAB_PANIC) |
2115 | panic("Cannot create slab %s size=%lu realsize=%u " | 2242 | panic("Cannot create slab %s size=%lu realsize=%u " |
@@ -2192,7 +2319,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
2192 | flush_all(s); | 2319 | flush_all(s); |
2193 | 2320 | ||
2194 | /* Attempt to free all objects */ | 2321 | /* Attempt to free all objects */ |
2195 | for_each_online_node(node) { | 2322 | free_kmem_cache_cpus(s); |
2323 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2196 | struct kmem_cache_node *n = get_node(s, node); | 2324 | struct kmem_cache_node *n = get_node(s, node); |
2197 | 2325 | ||
2198 | n->nr_partial -= free_list(s, n, &n->partial); | 2326 | n->nr_partial -= free_list(s, n, &n->partial); |
@@ -2227,11 +2355,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
2227 | * Kmalloc subsystem | 2355 | * Kmalloc subsystem |
2228 | *******************************************************************/ | 2356 | *******************************************************************/ |
2229 | 2357 | ||
2230 | struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; | 2358 | struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; |
2231 | EXPORT_SYMBOL(kmalloc_caches); | 2359 | EXPORT_SYMBOL(kmalloc_caches); |
2232 | 2360 | ||
2233 | #ifdef CONFIG_ZONE_DMA | 2361 | #ifdef CONFIG_ZONE_DMA |
2234 | static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; | 2362 | static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; |
2235 | #endif | 2363 | #endif |
2236 | 2364 | ||
2237 | static int __init setup_slub_min_order(char *str) | 2365 | static int __init setup_slub_min_order(char *str) |
@@ -2397,12 +2525,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
2397 | return ZERO_SIZE_PTR; | 2525 | return ZERO_SIZE_PTR; |
2398 | 2526 | ||
2399 | index = size_index[(size - 1) / 8]; | 2527 | index = size_index[(size - 1) / 8]; |
2400 | } else { | 2528 | } else |
2401 | if (size > KMALLOC_MAX_SIZE) | ||
2402 | return NULL; | ||
2403 | |||
2404 | index = fls(size - 1); | 2529 | index = fls(size - 1); |
2405 | } | ||
2406 | 2530 | ||
2407 | #ifdef CONFIG_ZONE_DMA | 2531 | #ifdef CONFIG_ZONE_DMA |
2408 | if (unlikely((flags & SLUB_DMA))) | 2532 | if (unlikely((flags & SLUB_DMA))) |
@@ -2414,9 +2538,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
2414 | 2538 | ||
2415 | void *__kmalloc(size_t size, gfp_t flags) | 2539 | void *__kmalloc(size_t size, gfp_t flags) |
2416 | { | 2540 | { |
2417 | struct kmem_cache *s = get_slab(size, flags); | 2541 | struct kmem_cache *s; |
2542 | |||
2543 | if (unlikely(size > PAGE_SIZE / 2)) | ||
2544 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
2545 | get_order(size)); | ||
2418 | 2546 | ||
2419 | if (ZERO_OR_NULL_PTR(s)) | 2547 | s = get_slab(size, flags); |
2548 | |||
2549 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
2420 | return s; | 2550 | return s; |
2421 | 2551 | ||
2422 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); | 2552 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); |
@@ -2426,9 +2556,15 @@ EXPORT_SYMBOL(__kmalloc); | |||
2426 | #ifdef CONFIG_NUMA | 2556 | #ifdef CONFIG_NUMA |
2427 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 2557 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
2428 | { | 2558 | { |
2429 | struct kmem_cache *s = get_slab(size, flags); | 2559 | struct kmem_cache *s; |
2430 | 2560 | ||
2431 | if (ZERO_OR_NULL_PTR(s)) | 2561 | if (unlikely(size > PAGE_SIZE / 2)) |
2562 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
2563 | get_order(size)); | ||
2564 | |||
2565 | s = get_slab(size, flags); | ||
2566 | |||
2567 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
2432 | return s; | 2568 | return s; |
2433 | 2569 | ||
2434 | return slab_alloc(s, flags, node, __builtin_return_address(0)); | 2570 | return slab_alloc(s, flags, node, __builtin_return_address(0)); |
@@ -2441,7 +2577,8 @@ size_t ksize(const void *object) | |||
2441 | struct page *page; | 2577 | struct page *page; |
2442 | struct kmem_cache *s; | 2578 | struct kmem_cache *s; |
2443 | 2579 | ||
2444 | if (ZERO_OR_NULL_PTR(object)) | 2580 | BUG_ON(!object); |
2581 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
2445 | return 0; | 2582 | return 0; |
2446 | 2583 | ||
2447 | page = get_object_page(object); | 2584 | page = get_object_page(object); |
@@ -2473,22 +2610,17 @@ EXPORT_SYMBOL(ksize); | |||
2473 | 2610 | ||
2474 | void kfree(const void *x) | 2611 | void kfree(const void *x) |
2475 | { | 2612 | { |
2476 | struct kmem_cache *s; | ||
2477 | struct page *page; | 2613 | struct page *page; |
2478 | 2614 | ||
2479 | /* | 2615 | if (unlikely(ZERO_OR_NULL_PTR(x))) |
2480 | * This has to be an unsigned comparison. According to Linus | ||
2481 | * some gcc version treat a pointer as a signed entity. Then | ||
2482 | * this comparison would be true for all "negative" pointers | ||
2483 | * (which would cover the whole upper half of the address space). | ||
2484 | */ | ||
2485 | if (ZERO_OR_NULL_PTR(x)) | ||
2486 | return; | 2616 | return; |
2487 | 2617 | ||
2488 | page = virt_to_head_page(x); | 2618 | page = virt_to_head_page(x); |
2489 | s = page->slab; | 2619 | if (unlikely(!PageSlab(page))) { |
2490 | 2620 | put_page(page); | |
2491 | slab_free(s, page, (void *)x, __builtin_return_address(0)); | 2621 | return; |
2622 | } | ||
2623 | slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); | ||
2492 | } | 2624 | } |
2493 | EXPORT_SYMBOL(kfree); | 2625 | EXPORT_SYMBOL(kfree); |
2494 | 2626 | ||
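Taken together, the __kmalloc and kfree hunks above split allocations at PAGE_SIZE/2: larger requests bypass the kmalloc caches and go straight to the page allocator as compound pages, and kfree tells the two cases apart by checking PageSlab on the head page, using put_page for page-allocator memory and slab_free otherwise. A userspace model of that size-based dispatch; the helper names here are invented for the sketch:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    /* Invented stand-ins for the two back ends. */
    static void *page_backend_alloc(size_t size) { return malloc(size); } /* models __get_free_pages */
    static void *slab_backend_alloc(size_t size) { return malloc(size); } /* models slab_alloc via get_slab */

    static void *sketch_kmalloc(size_t size)
    {
            /* Anything bigger than half a page skips the slab caches entirely. */
            if (size > PAGE_SIZE / 2)
                    return page_backend_alloc(size);
            return slab_backend_alloc(size);
    }

    int main(void)
    {
            void *small = sketch_kmalloc(64);    /* would be served by a kmalloc cache */
            void *big   = sketch_kmalloc(8192);  /* would be served by the page allocator */
            printf("small=%p big=%p\n", small, big);
            free(small);
            free(big);
            return 0;
    }

One consequence visible in later hunks is that the kmalloc_caches array only needs PAGE_SHIFT entries, since the largest cache now covers objects up to PAGE_SIZE/2.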
@@ -2517,7 +2649,7 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2517 | return -ENOMEM; | 2649 | return -ENOMEM; |
2518 | 2650 | ||
2519 | flush_all(s); | 2651 | flush_all(s); |
2520 | for_each_online_node(node) { | 2652 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2521 | n = get_node(s, node); | 2653 | n = get_node(s, node); |
2522 | 2654 | ||
2523 | if (!n->nr_partial) | 2655 | if (!n->nr_partial) |
@@ -2575,6 +2707,8 @@ void __init kmem_cache_init(void) | |||
2575 | int i; | 2707 | int i; |
2576 | int caches = 0; | 2708 | int caches = 0; |
2577 | 2709 | ||
2710 | init_alloc_cpu(); | ||
2711 | |||
2578 | #ifdef CONFIG_NUMA | 2712 | #ifdef CONFIG_NUMA |
2579 | /* | 2713 | /* |
2580 | * Must first have the slab cache available for the allocations of the | 2714 | * Must first have the slab cache available for the allocations of the |
@@ -2602,7 +2736,7 @@ void __init kmem_cache_init(void) | |||
2602 | caches++; | 2736 | caches++; |
2603 | } | 2737 | } |
2604 | 2738 | ||
2605 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | 2739 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { |
2606 | create_kmalloc_cache(&kmalloc_caches[i], | 2740 | create_kmalloc_cache(&kmalloc_caches[i], |
2607 | "kmalloc", 1 << i, GFP_KERNEL); | 2741 | "kmalloc", 1 << i, GFP_KERNEL); |
2608 | caches++; | 2742 | caches++; |
@@ -2629,16 +2763,18 @@ void __init kmem_cache_init(void) | |||
2629 | slab_state = UP; | 2763 | slab_state = UP; |
2630 | 2764 | ||
2631 | /* Provide the correct kmalloc names now that the caches are up */ | 2765 | /* Provide the correct kmalloc names now that the caches are up */ |
2632 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) | 2766 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) |
2633 | kmalloc_caches[i]. name = | 2767 | kmalloc_caches[i]. name = |
2634 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 2768 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); |
2635 | 2769 | ||
2636 | #ifdef CONFIG_SMP | 2770 | #ifdef CONFIG_SMP |
2637 | register_cpu_notifier(&slab_notifier); | 2771 | register_cpu_notifier(&slab_notifier); |
2772 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
2773 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | ||
2774 | #else | ||
2775 | kmem_size = sizeof(struct kmem_cache); | ||
2638 | #endif | 2776 | #endif |
2639 | 2777 | ||
2640 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
2641 | nr_cpu_ids * sizeof(struct page *); | ||
2642 | 2778 | ||
2643 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2779 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
2644 | " CPUs=%d, Nodes=%d\n", | 2780 | " CPUs=%d, Nodes=%d\n", |
@@ -2717,12 +2853,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
2717 | down_write(&slub_lock); | 2853 | down_write(&slub_lock); |
2718 | s = find_mergeable(size, align, flags, name, ctor); | 2854 | s = find_mergeable(size, align, flags, name, ctor); |
2719 | if (s) { | 2855 | if (s) { |
2856 | int cpu; | ||
2857 | |||
2720 | s->refcount++; | 2858 | s->refcount++; |
2721 | /* | 2859 | /* |
2722 | * Adjust the object sizes so that we clear | 2860 | * Adjust the object sizes so that we clear |
2723 | * the complete object on kzalloc. | 2861 | * the complete object on kzalloc. |
2724 | */ | 2862 | */ |
2725 | s->objsize = max(s->objsize, (int)size); | 2863 | s->objsize = max(s->objsize, (int)size); |
2864 | |||
2865 | /* | ||
2866 | * And then we need to update the object size in the | ||
2867 | * per cpu structures | ||
2868 | */ | ||
2869 | for_each_online_cpu(cpu) | ||
2870 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
2726 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 2871 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
2727 | up_write(&slub_lock); | 2872 | up_write(&slub_lock); |
2728 | if (sysfs_slab_alias(s, name)) | 2873 | if (sysfs_slab_alias(s, name)) |
@@ -2765,15 +2910,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
2765 | unsigned long flags; | 2910 | unsigned long flags; |
2766 | 2911 | ||
2767 | switch (action) { | 2912 | switch (action) { |
2913 | case CPU_UP_PREPARE: | ||
2914 | case CPU_UP_PREPARE_FROZEN: | ||
2915 | init_alloc_cpu_cpu(cpu); | ||
2916 | down_read(&slub_lock); | ||
2917 | list_for_each_entry(s, &slab_caches, list) | ||
2918 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
2919 | GFP_KERNEL); | ||
2920 | up_read(&slub_lock); | ||
2921 | break; | ||
2922 | |||
2768 | case CPU_UP_CANCELED: | 2923 | case CPU_UP_CANCELED: |
2769 | case CPU_UP_CANCELED_FROZEN: | 2924 | case CPU_UP_CANCELED_FROZEN: |
2770 | case CPU_DEAD: | 2925 | case CPU_DEAD: |
2771 | case CPU_DEAD_FROZEN: | 2926 | case CPU_DEAD_FROZEN: |
2772 | down_read(&slub_lock); | 2927 | down_read(&slub_lock); |
2773 | list_for_each_entry(s, &slab_caches, list) { | 2928 | list_for_each_entry(s, &slab_caches, list) { |
2929 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2930 | |||
2774 | local_irq_save(flags); | 2931 | local_irq_save(flags); |
2775 | __flush_cpu_slab(s, cpu); | 2932 | __flush_cpu_slab(s, cpu); |
2776 | local_irq_restore(flags); | 2933 | local_irq_restore(flags); |
2934 | free_kmem_cache_cpu(c, cpu); | ||
2935 | s->cpu_slab[cpu] = NULL; | ||
2777 | } | 2936 | } |
2778 | up_read(&slub_lock); | 2937 | up_read(&slub_lock); |
2779 | break; | 2938 | break; |
@@ -2790,9 +2949,14 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
2790 | 2949 | ||
2791 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2950 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
2792 | { | 2951 | { |
2793 | struct kmem_cache *s = get_slab(size, gfpflags); | 2952 | struct kmem_cache *s; |
2953 | |||
2954 | if (unlikely(size > PAGE_SIZE / 2)) | ||
2955 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
2956 | get_order(size)); | ||
2957 | s = get_slab(size, gfpflags); | ||
2794 | 2958 | ||
2795 | if (ZERO_OR_NULL_PTR(s)) | 2959 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
2796 | return s; | 2960 | return s; |
2797 | 2961 | ||
2798 | return slab_alloc(s, gfpflags, -1, caller); | 2962 | return slab_alloc(s, gfpflags, -1, caller); |
@@ -2801,9 +2965,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | |||
2801 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | 2965 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, |
2802 | int node, void *caller) | 2966 | int node, void *caller) |
2803 | { | 2967 | { |
2804 | struct kmem_cache *s = get_slab(size, gfpflags); | 2968 | struct kmem_cache *s; |
2969 | |||
2970 | if (unlikely(size > PAGE_SIZE / 2)) | ||
2971 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
2972 | get_order(size)); | ||
2973 | s = get_slab(size, gfpflags); | ||
2805 | 2974 | ||
2806 | if (ZERO_OR_NULL_PTR(s)) | 2975 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
2807 | return s; | 2976 | return s; |
2808 | 2977 | ||
2809 | return slab_alloc(s, gfpflags, node, caller); | 2978 | return slab_alloc(s, gfpflags, node, caller); |
@@ -2902,7 +3071,7 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
2902 | return -ENOMEM; | 3071 | return -ENOMEM; |
2903 | 3072 | ||
2904 | flush_all(s); | 3073 | flush_all(s); |
2905 | for_each_online_node(node) { | 3074 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2906 | struct kmem_cache_node *n = get_node(s, node); | 3075 | struct kmem_cache_node *n = get_node(s, node); |
2907 | 3076 | ||
2908 | count += validate_slab_node(s, n, map); | 3077 | count += validate_slab_node(s, n, map); |
@@ -3116,13 +3285,13 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3116 | int node; | 3285 | int node; |
3117 | 3286 | ||
3118 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 3287 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
3119 | GFP_KERNEL)) | 3288 | GFP_TEMPORARY)) |
3120 | return sprintf(buf, "Out of memory\n"); | 3289 | return sprintf(buf, "Out of memory\n"); |
3121 | 3290 | ||
3122 | /* Push back cpu slabs */ | 3291 | /* Push back cpu slabs */ |
3123 | flush_all(s); | 3292 | flush_all(s); |
3124 | 3293 | ||
3125 | for_each_online_node(node) { | 3294 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3126 | struct kmem_cache_node *n = get_node(s, node); | 3295 | struct kmem_cache_node *n = get_node(s, node); |
3127 | unsigned long flags; | 3296 | unsigned long flags; |
3128 | struct page *page; | 3297 | struct page *page; |
@@ -3230,11 +3399,18 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3230 | per_cpu = nodes + nr_node_ids; | 3399 | per_cpu = nodes + nr_node_ids; |
3231 | 3400 | ||
3232 | for_each_possible_cpu(cpu) { | 3401 | for_each_possible_cpu(cpu) { |
3233 | struct page *page = s->cpu_slab[cpu]; | 3402 | struct page *page; |
3234 | int node; | 3403 | int node; |
3404 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3235 | 3405 | ||
3406 | if (!c) | ||
3407 | continue; | ||
3408 | |||
3409 | page = c->page; | ||
3410 | node = c->node; | ||
3411 | if (node < 0) | ||
3412 | continue; | ||
3236 | if (page) { | 3413 | if (page) { |
3237 | node = page_to_nid(page); | ||
3238 | if (flags & SO_CPU) { | 3414 | if (flags & SO_CPU) { |
3239 | int x = 0; | 3415 | int x = 0; |
3240 | 3416 | ||
@@ -3249,7 +3425,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3249 | } | 3425 | } |
3250 | } | 3426 | } |
3251 | 3427 | ||
3252 | for_each_online_node(node) { | 3428 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3253 | struct kmem_cache_node *n = get_node(s, node); | 3429 | struct kmem_cache_node *n = get_node(s, node); |
3254 | 3430 | ||
3255 | if (flags & SO_PARTIAL) { | 3431 | if (flags & SO_PARTIAL) { |
@@ -3277,7 +3453,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3277 | 3453 | ||
3278 | x = sprintf(buf, "%lu", total); | 3454 | x = sprintf(buf, "%lu", total); |
3279 | #ifdef CONFIG_NUMA | 3455 | #ifdef CONFIG_NUMA |
3280 | for_each_online_node(node) | 3456 | for_each_node_state(node, N_NORMAL_MEMORY) |
3281 | if (nodes[node]) | 3457 | if (nodes[node]) |
3282 | x += sprintf(buf + x, " N%d=%lu", | 3458 | x += sprintf(buf + x, " N%d=%lu", |
3283 | node, nodes[node]); | 3459 | node, nodes[node]); |
@@ -3291,13 +3467,19 @@ static int any_slab_objects(struct kmem_cache *s) | |||
3291 | int node; | 3467 | int node; |
3292 | int cpu; | 3468 | int cpu; |
3293 | 3469 | ||
3294 | for_each_possible_cpu(cpu) | 3470 | for_each_possible_cpu(cpu) { |
3295 | if (s->cpu_slab[cpu]) | 3471 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
3472 | |||
3473 | if (c && c->page) | ||
3296 | return 1; | 3474 | return 1; |
3475 | } | ||
3297 | 3476 | ||
3298 | for_each_node(node) { | 3477 | for_each_online_node(node) { |
3299 | struct kmem_cache_node *n = get_node(s, node); | 3478 | struct kmem_cache_node *n = get_node(s, node); |
3300 | 3479 | ||
3480 | if (!n) | ||
3481 | continue; | ||
3482 | |||
3301 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) | 3483 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) |
3302 | return 1; | 3484 | return 1; |
3303 | } | 3485 | } |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 000000000000..d3b718b0c20a --- /dev/null +++ b/mm/sparse-vmemmap.c | |||
@@ -0,0 +1,148 @@ | |||
1 | /* | ||
2 | * Virtual Memory Map support | ||
3 | * | ||
4 | * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. | ||
5 | * | ||
6 | * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, | ||
7 | * virt_to_page, page_address() to be implemented as a base offset | ||
8 | * calculation without memory access. | ||
9 | * | ||
10 | * However, virtual mappings need a page table and TLBs. Many Linux | ||
11 | * architectures already map their physical space using 1-1 mappings | ||
12 | * via TLBs. For those arches the virtual memory map is essentially | ||
13 | * for free if we use the same page size as the 1-1 mappings. In that | ||
14 | * case the overhead consists of a few additional pages that are | ||
15 | * allocated to create a view of memory for vmemmap. | ||
16 | * | ||
17 | * The architecture is expected to provide a vmemmap_populate() function | ||
18 | * to instantiate the mapping. | ||
19 | */ | ||
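The point of the header comment is that once the struct page array is mapped at a fixed virtual base, pfn_to_page() and page_to_pfn() reduce to pointer arithmetic with no table walks at run time. A tiny illustration of that arithmetic; the base address and the struct layout are placeholders, not any particular architecture's values:

    #include <stdio.h>

    struct page { unsigned long flags; };              /* stand-in for the real struct page */

    /* Hypothetical fixed virtual base where the memmap would be mapped. */
    #define VMEMMAP_BASE ((struct page *)0xffffea0000000000UL)

    #define pfn_to_page(pfn)  (VMEMMAP_BASE + (pfn))
    #define page_to_pfn(page) ((unsigned long)((page) - VMEMMAP_BASE))

    int main(void)
    {
            unsigned long pfn = 0x12345;
            struct page *p = pfn_to_page(pfn);          /* pure offset calculation */

            printf("pfn %#lx -> page %p -> pfn %#lx\n", pfn, (void *)p, page_to_pfn(p));
            return 0;
    }

The cost of this convenience is the page tables backing the virtual range, which is exactly what the vmemmap_*_populate() helpers below allocate.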
20 | #include <linux/mm.h> | ||
21 | #include <linux/mmzone.h> | ||
22 | #include <linux/bootmem.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/spinlock.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <asm/dma.h> | ||
28 | #include <asm/pgalloc.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | |||
31 | /* | ||
32 | * Allocate a block of memory to be used to back the virtual memory map | ||
33 | * or to back the page tables that are used to create the mapping. | ||
34 | * Uses the main allocators if they are available, else bootmem. | ||
35 | */ | ||
36 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | ||
37 | { | ||
38 | /* If the main allocator is up use that, fallback to bootmem. */ | ||
39 | if (slab_is_available()) { | ||
40 | struct page *page = alloc_pages_node(node, | ||
41 | GFP_KERNEL | __GFP_ZERO, get_order(size)); | ||
42 | if (page) | ||
43 | return page_address(page); | ||
44 | return NULL; | ||
45 | } else | ||
46 | return __alloc_bootmem_node(NODE_DATA(node), size, size, | ||
47 | __pa(MAX_DMA_ADDRESS)); | ||
48 | } | ||
49 | |||
50 | void __meminit vmemmap_verify(pte_t *pte, int node, | ||
51 | unsigned long start, unsigned long end) | ||
52 | { | ||
53 | unsigned long pfn = pte_pfn(*pte); | ||
54 | int actual_node = early_pfn_to_nid(pfn); | ||
55 | |||
56 | if (actual_node != node) | ||
57 | printk(KERN_WARNING "[%lx-%lx] potential offnode " | ||
58 | "page_structs\n", start, end - 1); | ||
59 | } | ||
60 | |||
61 | pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | ||
62 | { | ||
63 | pte_t *pte = pte_offset_kernel(pmd, addr); | ||
64 | if (pte_none(*pte)) { | ||
65 | pte_t entry; | ||
66 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
67 | if (!p) | ||
68 | return 0; | ||
69 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | ||
70 | set_pte_at(&init_mm, addr, pte, entry); | ||
71 | } | ||
72 | return pte; | ||
73 | } | ||
74 | |||
75 | pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) | ||
76 | { | ||
77 | pmd_t *pmd = pmd_offset(pud, addr); | ||
78 | if (pmd_none(*pmd)) { | ||
79 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
80 | if (!p) | ||
81 | return 0; | ||
82 | pmd_populate_kernel(&init_mm, pmd, p); | ||
83 | } | ||
84 | return pmd; | ||
85 | } | ||
86 | |||
87 | pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) | ||
88 | { | ||
89 | pud_t *pud = pud_offset(pgd, addr); | ||
90 | if (pud_none(*pud)) { | ||
91 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
92 | if (!p) | ||
93 | return 0; | ||
94 | pud_populate(&init_mm, pud, p); | ||
95 | } | ||
96 | return pud; | ||
97 | } | ||
98 | |||
99 | pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) | ||
100 | { | ||
101 | pgd_t *pgd = pgd_offset_k(addr); | ||
102 | if (pgd_none(*pgd)) { | ||
103 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
104 | if (!p) | ||
105 | return 0; | ||
106 | pgd_populate(&init_mm, pgd, p); | ||
107 | } | ||
108 | return pgd; | ||
109 | } | ||
110 | |||
111 | int __meminit vmemmap_populate_basepages(struct page *start_page, | ||
112 | unsigned long size, int node) | ||
113 | { | ||
114 | unsigned long addr = (unsigned long)start_page; | ||
115 | unsigned long end = (unsigned long)(start_page + size); | ||
116 | pgd_t *pgd; | ||
117 | pud_t *pud; | ||
118 | pmd_t *pmd; | ||
119 | pte_t *pte; | ||
120 | |||
121 | for (; addr < end; addr += PAGE_SIZE) { | ||
122 | pgd = vmemmap_pgd_populate(addr, node); | ||
123 | if (!pgd) | ||
124 | return -ENOMEM; | ||
125 | pud = vmemmap_pud_populate(pgd, addr, node); | ||
126 | if (!pud) | ||
127 | return -ENOMEM; | ||
128 | pmd = vmemmap_pmd_populate(pud, addr, node); | ||
129 | if (!pmd) | ||
130 | return -ENOMEM; | ||
131 | pte = vmemmap_pte_populate(pmd, addr, node); | ||
132 | if (!pte) | ||
133 | return -ENOMEM; | ||
134 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); | ||
135 | } | ||
136 | |||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | ||
141 | { | ||
142 | struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); | ||
143 | int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); | ||
144 | if (error) | ||
145 | return NULL; | ||
146 | |||
147 | return map; | ||
148 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 239f5a720d38..08fb14f5eea3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
10 | #include <linux/vmalloc.h> | 10 | #include <linux/vmalloc.h> |
11 | #include <asm/dma.h> | 11 | #include <asm/dma.h> |
12 | #include <asm/pgalloc.h> | ||
13 | #include <asm/pgtable.h> | ||
12 | 14 | ||
13 | /* | 15 | /* |
14 | * Permanent SPARSEMEM data: | 16 | * Permanent SPARSEMEM data: |
@@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
106 | 108 | ||
107 | /* | 109 | /* |
108 | * Although written for the SPARSEMEM_EXTREME case, this happens | 110 | * Although written for the SPARSEMEM_EXTREME case, this happens |
109 | * to also work for the flat array case becase | 111 | * to also work for the flat array case because |
110 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | 112 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. |
111 | */ | 113 | */ |
112 | int __section_nr(struct mem_section* ms) | 114 | int __section_nr(struct mem_section* ms) |
@@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
176 | if (nid != early_pfn_to_nid(pfn)) | 178 | if (nid != early_pfn_to_nid(pfn)) |
177 | continue; | 179 | continue; |
178 | 180 | ||
179 | if (pfn_valid(pfn)) | 181 | if (pfn_present(pfn)) |
180 | nr_pages += PAGES_PER_SECTION; | 182 | nr_pages += PAGES_PER_SECTION; |
181 | } | 183 | } |
182 | 184 | ||
@@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn | |||
204 | } | 206 | } |
205 | 207 | ||
206 | static int __meminit sparse_init_one_section(struct mem_section *ms, | 208 | static int __meminit sparse_init_one_section(struct mem_section *ms, |
207 | unsigned long pnum, struct page *mem_map) | 209 | unsigned long pnum, struct page *mem_map, |
210 | unsigned long *pageblock_bitmap) | ||
208 | { | 211 | { |
209 | if (!valid_section(ms)) | 212 | if (!present_section(ms)) |
210 | return -EINVAL; | 213 | return -EINVAL; |
211 | 214 | ||
212 | ms->section_mem_map &= ~SECTION_MAP_MASK; | 215 | ms->section_mem_map &= ~SECTION_MAP_MASK; |
213 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 216 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | |
217 | SECTION_HAS_MEM_MAP; | ||
218 | ms->pageblock_flags = pageblock_bitmap; | ||
214 | 219 | ||
215 | return 1; | 220 | return 1; |
216 | } | 221 | } |
@@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | |||
221 | return NULL; | 226 | return NULL; |
222 | } | 227 | } |
223 | 228 | ||
224 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 229 | static unsigned long usemap_size(void) |
225 | { | 230 | { |
226 | struct page *map; | 231 | unsigned long size_bytes; |
232 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; | ||
233 | size_bytes = roundup(size_bytes, sizeof(unsigned long)); | ||
234 | return size_bytes; | ||
235 | } | ||
236 | |||
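usemap_size() above converts the per-section pageblock flag bits into a byte count rounded up to whole unsigned longs, so each section's bitmap can be handled with word-sized operations. A quick check of that rounding; the bit count passed in below is arbitrary, not the real SECTION_BLOCKFLAGS_BITS:

    #include <stdio.h>

    #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

    static unsigned long usemap_size(unsigned long blockflag_bits)
    {
            unsigned long size_bytes = ROUNDUP(blockflag_bits, 8) / 8;   /* bits -> bytes */
            return ROUNDUP(size_bytes, sizeof(unsigned long));           /* bytes -> whole longs */
    }

    int main(void)
    {
            /* e.g. 78 flag bits -> 10 bytes -> 16 bytes on a 64-bit machine */
            printf("%lu\n", usemap_size(78));
            return 0;
    }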
237 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
238 | static unsigned long *__kmalloc_section_usemap(void) | ||
239 | { | ||
240 | return kmalloc(usemap_size(), GFP_KERNEL); | ||
241 | } | ||
242 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
243 | |||
244 | static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) | ||
245 | { | ||
246 | unsigned long *usemap; | ||
227 | struct mem_section *ms = __nr_to_section(pnum); | 247 | struct mem_section *ms = __nr_to_section(pnum); |
228 | int nid = sparse_early_nid(ms); | 248 | int nid = sparse_early_nid(ms); |
229 | 249 | ||
250 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | ||
251 | if (usemap) | ||
252 | return usemap; | ||
253 | |||
254 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | ||
255 | nid = 0; | ||
256 | |||
257 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | ||
258 | return NULL; | ||
259 | } | ||
260 | |||
261 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
262 | struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | ||
263 | { | ||
264 | struct page *map; | ||
265 | |||
230 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 266 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
231 | if (map) | 267 | if (map) |
232 | return map; | 268 | return map; |
@@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
238 | 274 | ||
239 | map = alloc_bootmem_node(NODE_DATA(nid), | 275 | map = alloc_bootmem_node(NODE_DATA(nid), |
240 | sizeof(struct page) * PAGES_PER_SECTION); | 276 | sizeof(struct page) * PAGES_PER_SECTION); |
277 | return map; | ||
278 | } | ||
279 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
280 | |||
281 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | ||
282 | { | ||
283 | struct page *map; | ||
284 | struct mem_section *ms = __nr_to_section(pnum); | ||
285 | int nid = sparse_early_nid(ms); | ||
286 | |||
287 | map = sparse_mem_map_populate(pnum, nid); | ||
241 | if (map) | 288 | if (map) |
242 | return map; | 289 | return map; |
243 | 290 | ||
244 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | 291 | printk(KERN_ERR "%s: sparsemem memory map backing failed " |
292 | "some memory will not be available.\n", __FUNCTION__); | ||
245 | ms->section_mem_map = 0; | 293 | ms->section_mem_map = 0; |
246 | return NULL; | 294 | return NULL; |
247 | } | 295 | } |
@@ -254,19 +302,38 @@ void __init sparse_init(void) | |||
254 | { | 302 | { |
255 | unsigned long pnum; | 303 | unsigned long pnum; |
256 | struct page *map; | 304 | struct page *map; |
305 | unsigned long *usemap; | ||
257 | 306 | ||
258 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 307 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
259 | if (!valid_section_nr(pnum)) | 308 | if (!present_section_nr(pnum)) |
260 | continue; | 309 | continue; |
261 | 310 | ||
262 | map = sparse_early_mem_map_alloc(pnum); | 311 | map = sparse_early_mem_map_alloc(pnum); |
263 | if (!map) | 312 | if (!map) |
264 | continue; | 313 | continue; |
265 | sparse_init_one_section(__nr_to_section(pnum), pnum, map); | 314 | |
315 | usemap = sparse_early_usemap_alloc(pnum); | ||
316 | if (!usemap) | ||
317 | continue; | ||
318 | |||
319 | sparse_init_one_section(__nr_to_section(pnum), pnum, map, | ||
320 | usemap); | ||
266 | } | 321 | } |
267 | } | 322 | } |
268 | 323 | ||
269 | #ifdef CONFIG_MEMORY_HOTPLUG | 324 | #ifdef CONFIG_MEMORY_HOTPLUG |
325 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
326 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
327 | unsigned long nr_pages) | ||
328 | { | ||
329 | /* This will make the necessary allocations eventually. */ | ||
330 | return sparse_mem_map_populate(pnum, nid); | ||
331 | } | ||
332 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | ||
333 | { | ||
334 | return; /* XXX: Not implemented yet */ | ||
335 | } | ||
336 | #else | ||
270 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 337 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
271 | { | 338 | { |
272 | struct page *page, *ret; | 339 | struct page *page, *ret; |
@@ -289,6 +356,12 @@ got_map_ptr: | |||
289 | return ret; | 356 | return ret; |
290 | } | 357 | } |
291 | 358 | ||
359 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
360 | unsigned long nr_pages) | ||
361 | { | ||
362 | return __kmalloc_section_memmap(nr_pages); | ||
363 | } | ||
364 | |||
292 | static int vaddr_in_vmalloc_area(void *addr) | 365 | static int vaddr_in_vmalloc_area(void *addr) |
293 | { | 366 | { |
294 | if (addr >= (void *)VMALLOC_START && | 367 | if (addr >= (void *)VMALLOC_START && |
@@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
305 | free_pages((unsigned long)memmap, | 378 | free_pages((unsigned long)memmap, |
306 | get_order(sizeof(struct page) * nr_pages)); | 379 | get_order(sizeof(struct page) * nr_pages)); |
307 | } | 380 | } |
381 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
308 | 382 | ||
309 | /* | 383 | /* |
310 | * returns the number of sections whose mem_maps were properly | 384 | * returns the number of sections whose mem_maps were properly |
@@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
318 | struct pglist_data *pgdat = zone->zone_pgdat; | 392 | struct pglist_data *pgdat = zone->zone_pgdat; |
319 | struct mem_section *ms; | 393 | struct mem_section *ms; |
320 | struct page *memmap; | 394 | struct page *memmap; |
395 | unsigned long *usemap; | ||
321 | unsigned long flags; | 396 | unsigned long flags; |
322 | int ret; | 397 | int ret; |
323 | 398 | ||
@@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
326 | * plus, it does a kmalloc | 401 | * plus, it does a kmalloc |
327 | */ | 402 | */ |
328 | sparse_index_init(section_nr, pgdat->node_id); | 403 | sparse_index_init(section_nr, pgdat->node_id); |
329 | memmap = __kmalloc_section_memmap(nr_pages); | 404 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); |
405 | usemap = __kmalloc_section_usemap(); | ||
330 | 406 | ||
331 | pgdat_resize_lock(pgdat, &flags); | 407 | pgdat_resize_lock(pgdat, &flags); |
332 | 408 | ||
@@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
335 | ret = -EEXIST; | 411 | ret = -EEXIST; |
336 | goto out; | 412 | goto out; |
337 | } | 413 | } |
414 | |||
415 | if (!usemap) { | ||
416 | ret = -ENOMEM; | ||
417 | goto out; | ||
418 | } | ||
338 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 419 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
339 | 420 | ||
340 | ret = sparse_init_one_section(ms, section_nr, memmap); | 421 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
341 | 422 | ||
342 | out: | 423 | out: |
343 | pgdat_resize_unlock(pgdat, &flags); | 424 | pgdat_resize_unlock(pgdat, &flags); |
@@ -24,16 +24,18 @@ | |||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ |
27 | #include <linux/module.h> | ||
28 | #include <linux/percpu_counter.h> | 27 | #include <linux/percpu_counter.h> |
29 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
30 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
31 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
32 | #include <linux/init.h> | ||
33 | 31 | ||
34 | /* How many pages do we try to swap or page in/out together? */ | 32 | /* How many pages do we try to swap or page in/out together? */ |
35 | int page_cluster; | 33 | int page_cluster; |
36 | 34 | ||
35 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
36 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
37 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | ||
38 | |||
37 | /* | 39 | /* |
38 | * This path almost never happens for VM activity - pages are normally | 40 | * This path almost never happens for VM activity - pages are normally |
39 | * freed via pagevecs. But it gets used by networking. | 41 | * freed via pagevecs. But it gets used by networking. |
@@ -94,23 +96,47 @@ void put_pages_list(struct list_head *pages) | |||
94 | EXPORT_SYMBOL(put_pages_list); | 96 | EXPORT_SYMBOL(put_pages_list); |
95 | 97 | ||
96 | /* | 98 | /* |
99 | * pagevec_move_tail() must be called with IRQ disabled. | ||
100 | * Otherwise this may cause nasty races. | ||
101 | */ | ||
102 | static void pagevec_move_tail(struct pagevec *pvec) | ||
103 | { | ||
104 | int i; | ||
105 | int pgmoved = 0; | ||
106 | struct zone *zone = NULL; | ||
107 | |||
108 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
109 | struct page *page = pvec->pages[i]; | ||
110 | struct zone *pagezone = page_zone(page); | ||
111 | |||
112 | if (pagezone != zone) { | ||
113 | if (zone) | ||
114 | spin_unlock(&zone->lru_lock); | ||
115 | zone = pagezone; | ||
116 | spin_lock(&zone->lru_lock); | ||
117 | } | ||
118 | if (PageLRU(page) && !PageActive(page)) { | ||
119 | list_move_tail(&page->lru, &zone->inactive_list); | ||
120 | pgmoved++; | ||
121 | } | ||
122 | } | ||
123 | if (zone) | ||
124 | spin_unlock(&zone->lru_lock); | ||
125 | __count_vm_events(PGROTATED, pgmoved); | ||
126 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
127 | pagevec_reinit(pvec); | ||
128 | } | ||
129 | |||
130 | /* | ||
97 | * Writeback is about to end against a page which has been marked for immediate | 131 | * Writeback is about to end against a page which has been marked for immediate |
98 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 132 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
99 | * inactive list. The page still has PageWriteback set, which will pin it. | 133 | * inactive list. |
100 | * | ||
101 | * We don't expect many pages to come through here, so don't bother batching | ||
102 | * things up. | ||
103 | * | ||
104 | * To avoid placing the page at the tail of the LRU while PG_writeback is still | ||
105 | * set, this function will clear PG_writeback before performing the page | ||
106 | * motion. Do that inside the lru lock because once PG_writeback is cleared | ||
107 | * we may not touch the page. | ||
108 | * | 134 | * |
109 | * Returns zero if it cleared PG_writeback. | 135 | * Returns zero if it cleared PG_writeback. |
110 | */ | 136 | */ |
111 | int rotate_reclaimable_page(struct page *page) | 137 | int rotate_reclaimable_page(struct page *page) |
112 | { | 138 | { |
113 | struct zone *zone; | 139 | struct pagevec *pvec; |
114 | unsigned long flags; | 140 | unsigned long flags; |
115 | 141 | ||
116 | if (PageLocked(page)) | 142 | if (PageLocked(page)) |
@@ -122,15 +148,16 @@ int rotate_reclaimable_page(struct page *page) | |||
122 | if (!PageLRU(page)) | 148 | if (!PageLRU(page)) |
123 | return 1; | 149 | return 1; |
124 | 150 | ||
125 | zone = page_zone(page); | 151 | page_cache_get(page); |
126 | spin_lock_irqsave(&zone->lru_lock, flags); | 152 | local_irq_save(flags); |
127 | if (PageLRU(page) && !PageActive(page)) { | 153 | pvec = &__get_cpu_var(lru_rotate_pvecs); |
128 | list_move_tail(&page->lru, &zone->inactive_list); | 154 | if (!pagevec_add(pvec, page)) |
129 | __count_vm_event(PGROTATED); | 155 | pagevec_move_tail(pvec); |
130 | } | 156 | local_irq_restore(flags); |
157 | |||
131 | if (!test_clear_page_writeback(page)) | 158 | if (!test_clear_page_writeback(page)) |
132 | BUG(); | 159 | BUG(); |
133 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 160 | |
134 | return 0; | 161 | return 0; |
135 | } | 162 | } |
136 | 163 | ||
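rotate_reclaimable_page() above no longer takes zone->lru_lock per page: it grabs a reference, stashes the page in the per-cpu lru_rotate_pvecs pagevec with interrupts off, and only when the pagevec fills does pagevec_move_tail() walk the batch, reacquiring a zone lock only when the zone changes between consecutive pages. A compressed userspace model of that batch-then-flush pattern; the batch size and types are illustrative:

    #include <stdio.h>

    #define PVEC_SIZE 14                        /* models PAGEVEC_SIZE */

    struct pagevec {
            unsigned int nr;
            int pages[PVEC_SIZE];               /* page identifiers for the model */
    };

    /* Returns the space left after adding, so 0 means "now full" (as in the kernel). */
    static int pagevec_add(struct pagevec *pvec, int page)
    {
            pvec->pages[pvec->nr++] = page;
            return PVEC_SIZE - pvec->nr;
    }

    static void pagevec_move_tail(struct pagevec *pvec)
    {
            /* Flush the whole batch under one conceptual lock acquisition. */
            printf("flushing %u rotations\n", pvec->nr);
            pvec->nr = 0;
    }

    static struct pagevec lru_rotate_pvec;      /* one of these exists per CPU in the kernel */

    static void rotate_page(int page)
    {
            /* The kernel does this with local interrupts disabled. */
            if (!pagevec_add(&lru_rotate_pvec, page))
                    pagevec_move_tail(&lru_rotate_pvec);
    }

    int main(void)
    {
            for (int i = 0; i < 30; i++)
                    rotate_page(i);                 /* flushes once per PVEC_SIZE pages */
            pagevec_move_tail(&lru_rotate_pvec);    /* final drain, as drain_cpu_pagevecs() does */
            return 0;
    }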
@@ -174,9 +201,6 @@ EXPORT_SYMBOL(mark_page_accessed); | |||
174 | * lru_cache_add: add a page to the page lists | 201 | * lru_cache_add: add a page to the page lists |
175 | * @page: the page to add | 202 | * @page: the page to add |
176 | */ | 203 | */ |
177 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
178 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
179 | |||
180 | void fastcall lru_cache_add(struct page *page) | 204 | void fastcall lru_cache_add(struct page *page) |
181 | { | 205 | { |
182 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 206 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); |
@@ -197,21 +221,37 @@ void fastcall lru_cache_add_active(struct page *page) | |||
197 | put_cpu_var(lru_add_active_pvecs); | 221 | put_cpu_var(lru_add_active_pvecs); |
198 | } | 222 | } |
199 | 223 | ||
200 | static void __lru_add_drain(int cpu) | 224 | /* |
225 | * Drain pages out of the cpu's pagevecs. | ||
226 | * Either "cpu" is the current CPU, and preemption has already been | ||
227 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | ||
228 | */ | ||
229 | static void drain_cpu_pagevecs(int cpu) | ||
201 | { | 230 | { |
202 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | 231 | struct pagevec *pvec; |
203 | 232 | ||
204 | /* CPU is dead, so no locking needed. */ | 233 | pvec = &per_cpu(lru_add_pvecs, cpu); |
205 | if (pagevec_count(pvec)) | 234 | if (pagevec_count(pvec)) |
206 | __pagevec_lru_add(pvec); | 235 | __pagevec_lru_add(pvec); |
236 | |||
207 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | 237 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
208 | if (pagevec_count(pvec)) | 238 | if (pagevec_count(pvec)) |
209 | __pagevec_lru_add_active(pvec); | 239 | __pagevec_lru_add_active(pvec); |
240 | |||
241 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | ||
242 | if (pagevec_count(pvec)) { | ||
243 | unsigned long flags; | ||
244 | |||
245 | /* No harm done if a racing interrupt already did this */ | ||
246 | local_irq_save(flags); | ||
247 | pagevec_move_tail(pvec); | ||
248 | local_irq_restore(flags); | ||
249 | } | ||
210 | } | 250 | } |
211 | 251 | ||
212 | void lru_add_drain(void) | 252 | void lru_add_drain(void) |
213 | { | 253 | { |
214 | __lru_add_drain(get_cpu()); | 254 | drain_cpu_pagevecs(get_cpu()); |
215 | put_cpu(); | 255 | put_cpu(); |
216 | } | 256 | } |
217 | 257 | ||
@@ -258,6 +298,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
258 | int i; | 298 | int i; |
259 | struct pagevec pages_to_free; | 299 | struct pagevec pages_to_free; |
260 | struct zone *zone = NULL; | 300 | struct zone *zone = NULL; |
301 | unsigned long uninitialized_var(flags); | ||
261 | 302 | ||
262 | pagevec_init(&pages_to_free, cold); | 303 | pagevec_init(&pages_to_free, cold); |
263 | for (i = 0; i < nr; i++) { | 304 | for (i = 0; i < nr; i++) { |
@@ -265,7 +306,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
265 | 306 | ||
266 | if (unlikely(PageCompound(page))) { | 307 | if (unlikely(PageCompound(page))) { |
267 | if (zone) { | 308 | if (zone) { |
268 | spin_unlock_irq(&zone->lru_lock); | 309 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
269 | zone = NULL; | 310 | zone = NULL; |
270 | } | 311 | } |
271 | put_compound_page(page); | 312 | put_compound_page(page); |
@@ -279,9 +320,10 @@ void release_pages(struct page **pages, int nr, int cold) | |||
279 | struct zone *pagezone = page_zone(page); | 320 | struct zone *pagezone = page_zone(page); |
280 | if (pagezone != zone) { | 321 | if (pagezone != zone) { |
281 | if (zone) | 322 | if (zone) |
282 | spin_unlock_irq(&zone->lru_lock); | 323 | spin_unlock_irqrestore(&zone->lru_lock, |
324 | flags); | ||
283 | zone = pagezone; | 325 | zone = pagezone; |
284 | spin_lock_irq(&zone->lru_lock); | 326 | spin_lock_irqsave(&zone->lru_lock, flags); |
285 | } | 327 | } |
286 | VM_BUG_ON(!PageLRU(page)); | 328 | VM_BUG_ON(!PageLRU(page)); |
287 | __ClearPageLRU(page); | 329 | __ClearPageLRU(page); |
@@ -290,7 +332,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
290 | 332 | ||
291 | if (!pagevec_add(&pages_to_free, page)) { | 333 | if (!pagevec_add(&pages_to_free, page)) { |
292 | if (zone) { | 334 | if (zone) { |
293 | spin_unlock_irq(&zone->lru_lock); | 335 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
294 | zone = NULL; | 336 | zone = NULL; |
295 | } | 337 | } |
296 | __pagevec_free(&pages_to_free); | 338 | __pagevec_free(&pages_to_free); |
@@ -298,7 +340,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
298 | } | 340 | } |
299 | } | 341 | } |
300 | if (zone) | 342 | if (zone) |
301 | spin_unlock_irq(&zone->lru_lock); | 343 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
302 | 344 | ||
303 | pagevec_free(&pages_to_free); | 345 | pagevec_free(&pages_to_free); |
304 | } | 346 | } |
@@ -491,7 +533,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
491 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 533 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
492 | atomic_add(*committed, &vm_committed_space); | 534 | atomic_add(*committed, &vm_committed_space); |
493 | *committed = 0; | 535 | *committed = 0; |
494 | __lru_add_drain((long)hcpu); | 536 | drain_cpu_pagevecs((long)hcpu); |
495 | } | 537 | } |
496 | return NOTIFY_OK; | 538 | return NOTIFY_OK; |
497 | } | 539 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 67daecb6031a..b52635601dfe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
74 | { | 74 | { |
75 | int error; | 75 | int error; |
76 | 76 | ||
77 | BUG_ON(!PageLocked(page)); | ||
77 | BUG_ON(PageSwapCache(page)); | 78 | BUG_ON(PageSwapCache(page)); |
78 | BUG_ON(PagePrivate(page)); | 79 | BUG_ON(PagePrivate(page)); |
79 | error = radix_tree_preload(gfp_mask); | 80 | error = radix_tree_preload(gfp_mask); |
@@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
83 | entry.val, page); | 84 | entry.val, page); |
84 | if (!error) { | 85 | if (!error) { |
85 | page_cache_get(page); | 86 | page_cache_get(page); |
86 | SetPageLocked(page); | ||
87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); |
88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); |
89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
@@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
99 | { | 99 | { |
100 | int error; | 100 | int error; |
101 | 101 | ||
102 | BUG_ON(PageLocked(page)); | ||
102 | if (!swap_duplicate(entry)) { | 103 | if (!swap_duplicate(entry)) { |
103 | INC_CACHE_INFO(noent_race); | 104 | INC_CACHE_INFO(noent_race); |
104 | return -ENOENT; | 105 | return -ENOENT; |
105 | } | 106 | } |
107 | SetPageLocked(page); | ||
106 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); | 108 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); |
107 | /* | 109 | /* |
108 | * Anon pages are already on the LRU, we don't run lru_cache_add here. | 110 | * Anon pages are already on the LRU, we don't run lru_cache_add here. |
109 | */ | 111 | */ |
110 | if (error) { | 112 | if (error) { |
113 | ClearPageLocked(page); | ||
111 | swap_free(entry); | 114 | swap_free(entry); |
112 | if (error == -EEXIST) | 115 | if (error == -EEXIST) |
113 | INC_CACHE_INFO(exist_race); | 116 | INC_CACHE_INFO(exist_race); |
@@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup); | |||
81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
82 | { | 82 | { |
83 | void *ret; | 83 | void *ret; |
84 | size_t ks; | 84 | size_t ks = 0; |
85 | 85 | ||
86 | if (unlikely(!new_size)) { | 86 | if (unlikely(!new_size)) { |
87 | kfree(p); | 87 | kfree(p); |
88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
89 | } | 89 | } |
90 | 90 | ||
91 | ks = ksize(p); | 91 | if (p) |
92 | ks = ksize(p); | ||
93 | |||
92 | if (ks >= new_size) | 94 | if (ks >= new_size) |
93 | return (void *)p; | 95 | return (void *)p; |
94 | 96 | ||
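The krealloc hunk above initializes ks to 0 and only queries ksize() when p is non-NULL, so krealloc(NULL, size, flags) degenerates to a plain allocation instead of hitting the new BUG_ON(!object) in ksize(). A userspace illustration of the fixed control flow; the reported old size is a made-up constant standing in for ksize():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define OLD_SIZE_STANDIN 42   /* pretend the allocator reports 42 usable bytes */

    static void *sketch_krealloc(void *p, size_t new_size)
    {
            size_t ks = 0;

            if (!new_size) {      /* krealloc(p, 0) frees p and returns ZERO_SIZE_PTR */
                    free(p);
                    return (void *)16;                    /* stand-in for ZERO_SIZE_PTR */
            }

            if (p)                /* the fix: never ask for the size of a NULL pointer */
                    ks = OLD_SIZE_STANDIN;

            if (ks >= new_size)
                    return p;     /* the existing object is already big enough */

            void *ret = malloc(new_size);
            if (ret && p) {
                    memcpy(ret, p, ks);
                    free(p);
            }
            return ret;
    }

    int main(void)
    {
            char *buf = sketch_krealloc(NULL, 32);        /* behaves like a fresh allocation */
            printf("%p\n", (void *)buf);
            free(buf);
            return 0;
    }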
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3cee76a8c9f0..2e01af365848 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl | |||
190 | if (unlikely(!size)) | 190 | if (unlikely(!size)) |
191 | return NULL; | 191 | return NULL; |
192 | 192 | ||
193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); | 193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
194 | |||
194 | if (unlikely(!area)) | 195 | if (unlikely(!area)) |
195 | return NULL; | 196 | return NULL; |
196 | 197 | ||
@@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
439 | area->flags |= VM_VPAGES; | 440 | area->flags |= VM_VPAGES; |
440 | } else { | 441 | } else { |
441 | pages = kmalloc_node(array_size, | 442 | pages = kmalloc_node(array_size, |
442 | (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, | 443 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, |
443 | node); | 444 | node); |
444 | } | 445 | } |
445 | area->pages = pages; | 446 | area->pages = pages; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index a6e65d024995..bbd194630c5b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
932 | long mapped_ratio; | 932 | long mapped_ratio; |
933 | long distress; | 933 | long distress; |
934 | long swap_tendency; | 934 | long swap_tendency; |
935 | long imbalance; | ||
935 | 936 | ||
936 | if (zone_is_near_oom(zone)) | 937 | if (zone_is_near_oom(zone)) |
937 | goto force_reclaim_mapped; | 938 | goto force_reclaim_mapped; |
@@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
967 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | 968 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
968 | 969 | ||
969 | /* | 970 | /* |
971 | * If there's a huge imbalance between active and inactive | ||
972 | * (think active 100 times larger than inactive) we should | ||
973 | * become more permissive, or the system will take too much | ||
974 | * cpu before it starts swapping during memory pressure. | ||
975 | * Distress is about avoiding an early OOM; this is about | ||
976 | * making swappiness graceful despite setting it to low | ||
977 | * values. | ||
978 | * | ||
979 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
980 | * value is vm_total_pages. | ||
981 | */ | ||
982 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
983 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
984 | |||
985 | /* | ||
986 | * Reduce the effect of the imbalance if swappiness is low. | ||
987 | * This means that for a very low swappiness the imbalance | ||
988 | * must be much higher than 100 for this logic to make | ||
989 | * a difference. | ||
990 | * | ||
991 | * Max temporary value is vm_total_pages*100. | ||
992 | */ | ||
993 | imbalance *= (vm_swappiness + 1); | ||
994 | imbalance /= 100; | ||
995 | |||
996 | /* | ||
997 | * If not much of the RAM is mapped, make the imbalance | ||
998 | * less relevant; refilling the inactive list with mapped | ||
999 | * pages is only a priority when the ratio of mapped | ||
1000 | * pages is high. | ||
1001 | * | ||
1002 | * Max temporary value is vm_total_pages*100. | ||
1003 | */ | ||
1004 | imbalance *= mapped_ratio; | ||
1005 | imbalance /= 100; | ||
1006 | |||
1007 | /* apply imbalance feedback to swap_tendency */ | ||
1008 | swap_tendency += imbalance; | ||
1009 | |||
1010 | /* | ||
970 | * Now use this metric to decide whether to start moving mapped | 1011 | * Now use this metric to decide whether to start moving mapped |
971 | * memory onto the inactive list. | 1012 | * memory onto the inactive list. |
972 | */ | 1013 | */ |
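Putting the three comment blocks together, the added term is roughly (active / (inactive + 1)) * (swappiness + 1) / 100 * mapped_ratio / 100, so a hugely lopsided active list only raises swap_tendency when both swappiness and the mapped ratio are non-trivial. A quick worked run of that integer arithmetic; the counter values are made up for illustration:

    #include <stdio.h>

    int main(void)
    {
            long nr_active     = 1000000;   /* hypothetical zone counters */
            long nr_inactive   = 5000;
            long vm_swappiness = 60;
            long mapped_ratio  = 40;        /* percent of pages that are mapped */

            long imbalance = nr_active / (nr_inactive + 1);     /* 199 */
            imbalance = imbalance * (vm_swappiness + 1) / 100;  /* 121 */
            imbalance = imbalance * mapped_ratio / 100;         /* 48  */

            printf("imbalance contribution to swap_tendency: %ld\n", imbalance);
            return 0;
    }

With vm_swappiness dropped to 10 the same counters contribute only 8, which is the graceful damping the first comment asks for.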
@@ -1371,7 +1412,13 @@ loop_again: | |||
1371 | temp_priority[i] = priority; | 1412 | temp_priority[i] = priority; |
1372 | sc.nr_scanned = 0; | 1413 | sc.nr_scanned = 0; |
1373 | note_zone_scanning_priority(zone, priority); | 1414 | note_zone_scanning_priority(zone, priority); |
1374 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1415 | /* |
1416 | * We put equal pressure on every zone, unless one | ||
1417 | * zone has way too many pages free already. | ||
1418 | */ | ||
1419 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | ||
1420 | end_zone, 0)) | ||
1421 | nr_reclaimed += shrink_zone(priority, zone, &sc); | ||
1375 | reclaim_state->reclaimed_slab = 0; | 1422 | reclaim_state->reclaimed_slab = 0; |
1376 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1423 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1377 | lru_pages); | 1424 | lru_pages); |
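The new watermark test is the only per-zone gate kswapd applies here: every zone in the pgdat gets equal pressure unless it already holds roughly eight times its high watermark in free pages. A condensed sketch of that check (hypothetical helper name; the patch calls zone_watermark_ok() inline in balance_pgdat()):

/* Hypothetical predicate mirroring the check added above. */
static int zone_needs_pressure(struct zone *zone, int order, int classzone_idx)
{
	return !zone_watermark_ok(zone, order, 8 * zone->pages_high,
				  classzone_idx, 0);
}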
@@ -1688,9 +1735,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1688 | { | 1735 | { |
1689 | pg_data_t *pgdat; | 1736 | pg_data_t *pgdat; |
1690 | cpumask_t mask; | 1737 | cpumask_t mask; |
1738 | int nid; | ||
1691 | 1739 | ||
1692 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 1740 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
1693 | for_each_online_pgdat(pgdat) { | 1741 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1742 | pgdat = NODE_DATA(nid); | ||
1694 | mask = node_to_cpumask(pgdat->node_id); | 1743 | mask = node_to_cpumask(pgdat->node_id); |
1695 | if (any_online_cpu(mask) != NR_CPUS) | 1744 | if (any_online_cpu(mask) != NR_CPUS) |
1696 | /* One of our CPUs online: restore mask */ | 1745 | /* One of our CPUs online: restore mask */ |
@@ -1727,7 +1776,7 @@ static int __init kswapd_init(void) | |||
1727 | int nid; | 1776 | int nid; |
1728 | 1777 | ||
1729 | swap_setup(); | 1778 | swap_setup(); |
1730 | for_each_online_node(nid) | 1779 | for_each_node_state(nid, N_HIGH_MEMORY) |
1731 | kswapd_run(nid); | 1780 | kswapd_run(nid); |
1732 | hotcpu_notifier(cpu_callback, 0); | 1781 | hotcpu_notifier(cpu_callback, 0); |
1733 | return 0; | 1782 | return 0; |
@@ -1847,7 +1896,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1847 | 1896 | ||
1848 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1897 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1849 | { | 1898 | { |
1850 | cpumask_t mask; | ||
1851 | int node_id; | 1899 | int node_id; |
1852 | 1900 | ||
1853 | /* | 1901 | /* |
@@ -1884,8 +1932,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1884 | * as wide as possible. | 1932 | * as wide as possible. |
1885 | */ | 1933 | */ |
1886 | node_id = zone_to_nid(zone); | 1934 | node_id = zone_to_nid(zone); |
1887 | mask = node_to_cpumask(node_id); | 1935 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
1888 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
1889 | return 0; | 1936 | return 0; |
1890 | return __zone_reclaim(zone, gfp_mask, order); | 1937 | return __zone_reclaim(zone, gfp_mask, order); |
1891 | } | 1938 | } |
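The zone_reclaim() change replaces building a per-node cpumask with a node_states test: reclaim proceeds locally, and remote zones are reclaimed only when their node has no CPUs (a memory-only node). A small sketch of the resulting decision (hypothetical helper; the patch keeps the test inline):

/* Hypothetical helper equivalent to the gate above. */
static int may_zone_reclaim(int node_id)
{
	return !node_state(node_id, N_CPU) || node_id == numa_node_id();
}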
diff --git a/mm/vmstat.c b/mm/vmstat.c index c64d169537bf..3b5e9043e7db 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu) | |||
353 | } | 353 | } |
354 | } | 354 | } |
355 | 355 | ||
356 | static void __refresh_cpu_vm_stats(void *dummy) | ||
357 | { | ||
358 | refresh_cpu_vm_stats(smp_processor_id()); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Consolidate all counters. | ||
363 | * | ||
364 | * Note that the result is less inaccurate but still inaccurate | ||
365 | * if concurrent processes are allowed to run. | ||
366 | */ | ||
367 | void refresh_vm_stats(void) | ||
368 | { | ||
369 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
370 | } | ||
371 | EXPORT_SYMBOL(refresh_vm_stats); | ||
372 | |||
373 | #endif | 356 | #endif |
374 | 357 | ||
375 | #ifdef CONFIG_NUMA | 358 | #ifdef CONFIG_NUMA |
@@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
398 | 381 | ||
399 | #include <linux/seq_file.h> | 382 | #include <linux/seq_file.h> |
400 | 383 | ||
384 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
385 | "Unmovable", | ||
386 | "Reclaimable", | ||
387 | "Movable", | ||
388 | "Reserve", | ||
389 | }; | ||
390 | |||
401 | static void *frag_start(struct seq_file *m, loff_t *pos) | 391 | static void *frag_start(struct seq_file *m, loff_t *pos) |
402 | { | 392 | { |
403 | pg_data_t *pgdat; | 393 | pg_data_t *pgdat; |
@@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg) | |||
422 | { | 412 | { |
423 | } | 413 | } |
424 | 414 | ||
425 | /* | 415 | /* Walk all the zones in a node and print using a callback */ |
426 | * This walks the free areas for each zone. | 416 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, |
427 | */ | 417 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) |
428 | static int frag_show(struct seq_file *m, void *arg) | ||
429 | { | 418 | { |
430 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
431 | struct zone *zone; | 419 | struct zone *zone; |
432 | struct zone *node_zones = pgdat->node_zones; | 420 | struct zone *node_zones = pgdat->node_zones; |
433 | unsigned long flags; | 421 | unsigned long flags; |
434 | int order; | ||
435 | 422 | ||
436 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 423 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
437 | if (!populated_zone(zone)) | 424 | if (!populated_zone(zone)) |
438 | continue; | 425 | continue; |
439 | 426 | ||
440 | spin_lock_irqsave(&zone->lock, flags); | 427 | spin_lock_irqsave(&zone->lock, flags); |
441 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | 428 | print(m, pgdat, zone); |
442 | for (order = 0; order < MAX_ORDER; ++order) | ||
443 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
444 | spin_unlock_irqrestore(&zone->lock, flags); | 429 | spin_unlock_irqrestore(&zone->lock, flags); |
430 | } | ||
431 | } | ||
432 | |||
433 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | ||
434 | struct zone *zone) | ||
435 | { | ||
436 | int order; | ||
437 | |||
438 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
439 | for (order = 0; order < MAX_ORDER; ++order) | ||
440 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
441 | seq_putc(m, '\n'); | ||
442 | } | ||
443 | |||
444 | /* | ||
445 | * This walks the free areas for each zone. | ||
446 | */ | ||
447 | static int frag_show(struct seq_file *m, void *arg) | ||
448 | { | ||
449 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
450 | walk_zones_in_node(m, pgdat, frag_show_print); | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | static void pagetypeinfo_showfree_print(struct seq_file *m, | ||
455 | pg_data_t *pgdat, struct zone *zone) | ||
456 | { | ||
457 | int order, mtype; | ||
458 | |||
459 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { | ||
460 | seq_printf(m, "Node %4d, zone %8s, type %12s ", | ||
461 | pgdat->node_id, | ||
462 | zone->name, | ||
463 | migratetype_names[mtype]); | ||
464 | for (order = 0; order < MAX_ORDER; ++order) { | ||
465 | unsigned long freecount = 0; | ||
466 | struct free_area *area; | ||
467 | struct list_head *curr; | ||
468 | |||
469 | area = &(zone->free_area[order]); | ||
470 | |||
471 | list_for_each(curr, &area->free_list[mtype]) | ||
472 | freecount++; | ||
473 | seq_printf(m, "%6lu ", freecount); | ||
474 | } | ||
445 | seq_putc(m, '\n'); | 475 | seq_putc(m, '\n'); |
446 | } | 476 | } |
477 | } | ||
478 | |||
479 | /* Print out the free pages at each order for each migratetype */ | ||
480 | static int pagetypeinfo_showfree(struct seq_file *m, void *arg) | ||
481 | { | ||
482 | int order; | ||
483 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
484 | |||
485 | /* Print header */ | ||
486 | seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); | ||
487 | for (order = 0; order < MAX_ORDER; ++order) | ||
488 | seq_printf(m, "%6d ", order); | ||
489 | seq_putc(m, '\n'); | ||
490 | |||
491 | walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); | ||
492 | |||
493 | return 0; | ||
494 | } | ||
495 | |||
496 | static void pagetypeinfo_showblockcount_print(struct seq_file *m, | ||
497 | pg_data_t *pgdat, struct zone *zone) | ||
498 | { | ||
499 | int mtype; | ||
500 | unsigned long pfn; | ||
501 | unsigned long start_pfn = zone->zone_start_pfn; | ||
502 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | ||
503 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
504 | |||
505 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
506 | struct page *page; | ||
507 | |||
508 | if (!pfn_valid(pfn)) | ||
509 | continue; | ||
510 | |||
511 | page = pfn_to_page(pfn); | ||
512 | mtype = get_pageblock_migratetype(page); | ||
513 | |||
514 | count[mtype]++; | ||
515 | } | ||
516 | |||
517 | /* Print counts */ | ||
518 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
519 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
520 | seq_printf(m, "%12lu ", count[mtype]); | ||
521 | seq_putc(m, '\n'); | ||
522 | } | ||
523 | |||
524 | /* Print out the number of pageblocks for each migratetype */ | ||
525 | static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | ||
526 | { | ||
527 | int mtype; | ||
528 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
529 | |||
530 | seq_printf(m, "\n%-23s", "Number of blocks type "); | ||
531 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
532 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
533 | seq_putc(m, '\n'); | ||
534 | walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); | ||
535 | |||
536 | return 0; | ||
537 | } | ||
538 | |||
539 | /* | ||
540 | * This prints out statistics in relation to grouping pages by mobility. | ||
541 | * It is expensive to collect so do not constantly read the file. | ||
542 | */ | ||
543 | static int pagetypeinfo_show(struct seq_file *m, void *arg) | ||
544 | { | ||
545 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
546 | |||
547 | seq_printf(m, "Page block order: %d\n", pageblock_order); | ||
548 | seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); | ||
549 | seq_putc(m, '\n'); | ||
550 | pagetypeinfo_showfree(m, pgdat); | ||
551 | pagetypeinfo_showblockcount(m, pgdat); | ||
552 | |||
447 | return 0; | 553 | return 0; |
448 | } | 554 | } |
449 | 555 | ||
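walk_zones_in_node() above factors out the loop that every per-zone /proc report shares: it skips unpopulated zones and calls the printer with zone->lock held. A hypothetical extra reporter (not in this patch) would plug in as follows:

/* Hypothetical per-zone printer using the new callback scheme.
 * Called by walk_zones_in_node() with zone->lock held. */
static void example_show_print(struct seq_file *m, pg_data_t *pgdat,
			       struct zone *zone)
{
	seq_printf(m, "Node %d, zone %8s: %lu free pages\n",
		   pgdat->node_id, zone->name,
		   zone_page_state(zone, NR_FREE_PAGES));
}

static int example_show(struct seq_file *m, void *arg)
{
	walk_zones_in_node(m, (pg_data_t *)arg, example_show_print);
	return 0;
}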
@@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = { | |||
454 | .show = frag_show, | 560 | .show = frag_show, |
455 | }; | 561 | }; |
456 | 562 | ||
563 | const struct seq_operations pagetypeinfo_op = { | ||
564 | .start = frag_start, | ||
565 | .next = frag_next, | ||
566 | .stop = frag_stop, | ||
567 | .show = pagetypeinfo_show, | ||
568 | }; | ||
569 | |||
457 | #ifdef CONFIG_ZONE_DMA | 570 | #ifdef CONFIG_ZONE_DMA |
458 | #define TEXT_FOR_DMA(xx) xx "_dma", | 571 | #define TEXT_FOR_DMA(xx) xx "_dma", |
459 | #else | 572 | #else |
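pagetypeinfo_op reuses frag_start/frag_next/frag_stop, so only the show callback differs from fragmentation_op. The /proc registration itself is not in this file; assuming the usual seq_file wiring (hypothetical names, <linux/seq_file.h> and <linux/fs.h> assumed), it would look roughly like:

/* Hypothetical /proc hookup for pagetypeinfo_op; the actual
 * registration happens outside mm/vmstat.c. */
static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &pagetypeinfo_op);
}

static const struct file_operations pagetypeinfo_fops = {
	.open		= pagetypeinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};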
@@ -532,84 +645,78 @@ static const char * const vmstat_text[] = { | |||
532 | #endif | 645 | #endif |
533 | }; | 646 | }; |
534 | 647 | ||
535 | /* | 648 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
536 | * Output information about zones in @pgdat. | 649 | struct zone *zone) |
537 | */ | ||
538 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
539 | { | 650 | { |
540 | pg_data_t *pgdat = arg; | 651 | int i; |
541 | struct zone *zone; | 652 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); |
542 | struct zone *node_zones = pgdat->node_zones; | 653 | seq_printf(m, |
543 | unsigned long flags; | 654 | "\n pages free %lu" |
544 | 655 | "\n min %lu" | |
545 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 656 | "\n low %lu" |
546 | int i; | 657 | "\n high %lu" |
547 | 658 | "\n scanned %lu (a: %lu i: %lu)" | |
548 | if (!populated_zone(zone)) | 659 | "\n spanned %lu" |
549 | continue; | 660 | "\n present %lu", |
550 | 661 | zone_page_state(zone, NR_FREE_PAGES), | |
551 | spin_lock_irqsave(&zone->lock, flags); | 662 | zone->pages_min, |
552 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 663 | zone->pages_low, |
553 | seq_printf(m, | 664 | zone->pages_high, |
554 | "\n pages free %lu" | 665 | zone->pages_scanned, |
555 | "\n min %lu" | 666 | zone->nr_scan_active, zone->nr_scan_inactive, |
556 | "\n low %lu" | 667 | zone->spanned_pages, |
557 | "\n high %lu" | 668 | zone->present_pages); |
558 | "\n scanned %lu (a: %lu i: %lu)" | ||
559 | "\n spanned %lu" | ||
560 | "\n present %lu", | ||
561 | zone_page_state(zone, NR_FREE_PAGES), | ||
562 | zone->pages_min, | ||
563 | zone->pages_low, | ||
564 | zone->pages_high, | ||
565 | zone->pages_scanned, | ||
566 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
567 | zone->spanned_pages, | ||
568 | zone->present_pages); | ||
569 | 669 | ||
570 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 670 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
571 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 671 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
572 | zone_page_state(zone, i)); | 672 | zone_page_state(zone, i)); |
573 | 673 | ||
574 | seq_printf(m, | 674 | seq_printf(m, |
575 | "\n protection: (%lu", | 675 | "\n protection: (%lu", |
576 | zone->lowmem_reserve[0]); | 676 | zone->lowmem_reserve[0]); |
577 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 677 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
578 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 678 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); |
579 | seq_printf(m, | 679 | seq_printf(m, |
580 | ")" | 680 | ")" |
581 | "\n pagesets"); | 681 | "\n pagesets"); |
582 | for_each_online_cpu(i) { | 682 | for_each_online_cpu(i) { |
583 | struct per_cpu_pageset *pageset; | 683 | struct per_cpu_pageset *pageset; |
584 | int j; | 684 | int j; |
585 | 685 | ||
586 | pageset = zone_pcp(zone, i); | 686 | pageset = zone_pcp(zone, i); |
587 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 687 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { |
588 | seq_printf(m, | 688 | seq_printf(m, |
589 | "\n cpu: %i pcp: %i" | 689 | "\n cpu: %i pcp: %i" |
590 | "\n count: %i" | 690 | "\n count: %i" |
591 | "\n high: %i" | 691 | "\n high: %i" |
592 | "\n batch: %i", | 692 | "\n batch: %i", |
593 | i, j, | 693 | i, j, |
594 | pageset->pcp[j].count, | 694 | pageset->pcp[j].count, |
595 | pageset->pcp[j].high, | 695 | pageset->pcp[j].high, |
596 | pageset->pcp[j].batch); | 696 | pageset->pcp[j].batch); |
597 | } | 697 | } |
598 | #ifdef CONFIG_SMP | 698 | #ifdef CONFIG_SMP |
599 | seq_printf(m, "\n vm stats threshold: %d", | 699 | seq_printf(m, "\n vm stats threshold: %d", |
600 | pageset->stat_threshold); | 700 | pageset->stat_threshold); |
601 | #endif | 701 | #endif |
602 | } | ||
603 | seq_printf(m, | ||
604 | "\n all_unreclaimable: %u" | ||
605 | "\n prev_priority: %i" | ||
606 | "\n start_pfn: %lu", | ||
607 | zone->all_unreclaimable, | ||
608 | zone->prev_priority, | ||
609 | zone->zone_start_pfn); | ||
610 | spin_unlock_irqrestore(&zone->lock, flags); | ||
611 | seq_putc(m, '\n'); | ||
612 | } | 702 | } |
703 | seq_printf(m, | ||
704 | "\n all_unreclaimable: %u" | ||
705 | "\n prev_priority: %i" | ||
706 | "\n start_pfn: %lu", | ||
707 | zone->all_unreclaimable, | ||
708 | zone->prev_priority, | ||
709 | zone->zone_start_pfn); | ||
710 | seq_putc(m, '\n'); | ||
711 | } | ||
712 | |||
713 | /* | ||
714 | * Output information about zones in @pgdat. | ||
715 | */ | ||
716 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
717 | { | ||
718 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
719 | walk_zones_in_node(m, pgdat, zoneinfo_show_print); | ||
613 | return 0; | 720 | return 0; |
614 | } | 721 | } |
615 | 722 | ||
@@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
741 | static struct notifier_block __cpuinitdata vmstat_notifier = | 848 | static struct notifier_block __cpuinitdata vmstat_notifier = |
742 | { &vmstat_cpuup_callback, NULL, 0 }; | 849 | { &vmstat_cpuup_callback, NULL, 0 }; |
743 | 850 | ||
744 | int __init setup_vmstat(void) | 851 | static int __init setup_vmstat(void) |
745 | { | 852 | { |
746 | int cpu; | 853 | int cpu; |
747 | 854 | ||