Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig          |  18
-rw-r--r-- | mm/Makefile         |   3
-rw-r--r-- | mm/backing-dev.c    |  47
-rw-r--r-- | mm/bounce.c         |   6
-rw-r--r-- | mm/filemap.c        | 781
-rw-r--r-- | mm/filemap.h        | 103
-rw-r--r-- | mm/filemap_xip.c    |  17
-rw-r--r-- | mm/fremap.c         |  26
-rw-r--r-- | mm/hugetlb.c        | 398
-rw-r--r-- | mm/internal.h       |  10
-rw-r--r-- | mm/memory.c         | 161
-rw-r--r-- | mm/memory_hotplug.c | 312
-rw-r--r-- | mm/mempolicy.c      |  60
-rw-r--r-- | mm/migrate.c        |   6
-rw-r--r-- | mm/mmap.c           |   3
-rw-r--r-- | mm/mprotect.c       |   1
-rw-r--r-- | mm/nommu.c          |   1
-rw-r--r-- | mm/oom_kill.c       | 116
-rw-r--r-- | mm/page-writeback.c | 310
-rw-r--r-- | mm/page_alloc.c     | 754
-rw-r--r-- | mm/page_isolation.c | 138
-rw-r--r-- | mm/readahead.c      |  94
-rw-r--r-- | mm/rmap.c           |   5
-rw-r--r-- | mm/shmem.c          |  82
-rw-r--r-- | mm/slab.c           |  35
-rw-r--r-- | mm/slob.c           |  13
-rw-r--r-- | mm/slub.c           | 520
-rw-r--r-- | mm/sparse-vmemmap.c | 148
-rw-r--r-- | mm/sparse.c         | 105
-rw-r--r-- | mm/swap.c           | 111
-rw-r--r-- | mm/swap_state.c     |   5
-rw-r--r-- | mm/tiny-shmem.c     |  19
-rw-r--r-- | mm/truncate.c       |   3
-rw-r--r-- | mm/util.c           |   6
-rw-r--r-- | mm/vmalloc.c        |   5
-rw-r--r-- | mm/vmscan.c         |  99
-rw-r--r-- | mm/vmstat.c         | 305
37 files changed, 3561 insertions, 1265 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348083c3..b1f03b0eb7f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME | |||
112 | def_bool y | 112 | def_bool y |
113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
114 | 114 | ||
115 | # | ||
116 | # SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page | ||
117 | # and page_to_pfn. The most efficient option where kernel virtual space is | ||
118 | # not under pressure. | ||
119 | # | ||
120 | config SPARSEMEM_VMEMMAP_ENABLE | ||
121 | def_bool n | ||
122 | |||
123 | config SPARSEMEM_VMEMMAP | ||
124 | bool | ||
125 | depends on SPARSEMEM | ||
126 | default y if (SPARSEMEM_VMEMMAP_ENABLE) | ||
127 | |||
115 | # eventually, we can have this option just 'select SPARSEMEM' | 128 | # eventually, we can have this option just 'select SPARSEMEM' |
116 | config MEMORY_HOTPLUG | 129 | config MEMORY_HOTPLUG |
117 | bool "Allow for memory hot-add" | 130 | bool "Allow for memory hot-add" |
@@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE | |||
126 | def_bool y | 139 | def_bool y |
127 | depends on SPARSEMEM && MEMORY_HOTPLUG | 140 | depends on SPARSEMEM && MEMORY_HOTPLUG |
128 | 141 | ||
142 | config MEMORY_HOTREMOVE | ||
143 | bool "Allow for memory hot remove" | ||
144 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | ||
145 | depends on MIGRATION | ||
146 | |||
129 | # Heavily threaded applications may benefit from splitting the mm-wide | 147 | # Heavily threaded applications may benefit from splitting the mm-wide |
130 | # page_table_lock, so that faults on different parts of the user address | 148 | # page_table_lock, so that faults on different parts of the user address |
131 | # space can be handled with less contention: split it at this NR_CPUS. | 149 | # space can be handled with less contention: split it at this NR_CPUS. |
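[Annotation] The SPARSEMEM_VMEMMAP help text above says the virtually mapped mem_map makes pfn_to_page and page_to_pfn cheap. A minimal sketch of why, modelled on the generic memory_model.h definitions rather than quoted from them (the vmemmap base address is provided per architecture):

	/*
	 * With a virtually mapped mem_map, every pfn has a struct page slot at
	 * a fixed virtual offset, so both conversions reduce to pointer
	 * arithmetic with no memory-section lookup on the hot path.
	 */
	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
	#define __page_to_pfn(page)	((unsigned long)((page) - vmemmap))
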
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
14 | $(mmu-y) | 14 | page_isolation.o $(mmu-y) |
15 | 15 | ||
16 | obj-$(CONFIG_BOUNCE) += bounce.o | 16 | obj-$(CONFIG_BOUNCE) += bounce.o |
17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
19 | obj-$(CONFIG_NUMA) += mempolicy.o | 19 | obj-$(CONFIG_NUMA) += mempolicy.o |
20 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 20 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
21 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | ||
21 | obj-$(CONFIG_SHMEM) += shmem.o | 22 | obj-$(CONFIG_SHMEM) += shmem.o |
22 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 23 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
23 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 24 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@ | |||
5 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | #include <linux/module.h> | 6 | #include <linux/module.h> |
7 | 7 | ||
8 | int bdi_init(struct backing_dev_info *bdi) | ||
9 | { | ||
10 | int i, j; | ||
11 | int err; | ||
12 | |||
13 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | ||
14 | err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); | ||
15 | if (err) | ||
16 | goto err; | ||
17 | } | ||
18 | |||
19 | bdi->dirty_exceeded = 0; | ||
20 | err = prop_local_init_percpu(&bdi->completions); | ||
21 | |||
22 | if (err) { | ||
23 | err: | ||
24 | for (j = 0; j < i; j++) | ||
25 | percpu_counter_destroy(&bdi->bdi_stat[i]); | ||
26 | } | ||
27 | |||
28 | return err; | ||
29 | } | ||
30 | EXPORT_SYMBOL(bdi_init); | ||
31 | |||
32 | void bdi_destroy(struct backing_dev_info *bdi) | ||
33 | { | ||
34 | int i; | ||
35 | |||
36 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | ||
37 | percpu_counter_destroy(&bdi->bdi_stat[i]); | ||
38 | |||
39 | prop_local_destroy_percpu(&bdi->completions); | ||
40 | } | ||
41 | EXPORT_SYMBOL(bdi_destroy); | ||
42 | |||
8 | static wait_queue_head_t congestion_wqh[2] = { | 43 | static wait_queue_head_t congestion_wqh[2] = { |
9 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), | 44 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), |
10 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) | 45 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) |
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout) | |||
55 | } | 90 | } |
56 | EXPORT_SYMBOL(congestion_wait); | 91 | EXPORT_SYMBOL(congestion_wait); |
57 | 92 | ||
58 | /** | ||
59 | * congestion_end - wake up sleepers on a congested backing_dev_info | ||
60 | * @rw: READ or WRITE | ||
61 | */ | ||
62 | void congestion_end(int rw) | ||
63 | { | ||
64 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | ||
65 | |||
66 | if (waitqueue_active(wqh)) | ||
67 | wake_up(wqh); | ||
68 | } | ||
69 | EXPORT_SYMBOL(congestion_end); | ||
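[Annotation] The bdi_init()/bdi_destroy() pair added above is needed because the per-bdi statistics (bdi_stat[]) and the completions proportion now live in percpu allocations, so setting up a backing_dev_info can fail and must be undone explicitly. A hedged sketch of the caller side, using a hypothetical driver that is not part of this diff:

	#include <linux/backing-dev.h>
	#include <linux/module.h>

	static struct backing_dev_info example_bdi = {
		.ra_pages = 32,			/* hypothetical read-ahead window */
	};

	static int __init example_init(void)
	{
		int err;

		err = bdi_init(&example_bdi);	/* allocates the percpu counters */
		if (err)
			return err;
		/* ... register the device whose mapping points at example_bdi ... */
		return 0;
	}

	static void __exit example_exit(void)
	{
		/* ... unregister the device ... */
		bdi_destroy(&example_bdi);	/* frees the percpu counters */
	}

	module_init(example_init);
	module_exit(example_exit);
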
diff --git a/mm/bounce.c b/mm/bounce.c
index 3b549bf31f7d..b6d2d0f1019b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -265,6 +265,12 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
265 | mempool_t *pool; | 265 | mempool_t *pool; |
266 | 266 | ||
267 | /* | 267 | /* |
268 | * Data-less bio, nothing to bounce | ||
269 | */ | ||
270 | if (bio_empty_barrier(*bio_orig)) | ||
271 | return; | ||
272 | |||
273 | /* | ||
268 | * for non-isa bounce case, just check if the bounce pfn is equal | 274 | * for non-isa bounce case, just check if the bounce pfn is equal |
269 | * to or bigger than the highest pfn in the system -- in that case, | 275 | * to or bigger than the highest pfn in the system -- in that case, |
270 | * don't waste time iterating over bio segments | 276 | * don't waste time iterating over bio segments |
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..79f24a969cb4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@ | |||
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
33 | #include "filemap.h" | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
34 | #include "internal.h" | 34 | #include "internal.h" |
35 | 35 | ||
36 | /* | 36 | /* |
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
64 | * ->swap_lock (exclusive_swap_page, others) | 64 | * ->swap_lock (exclusive_swap_page, others) |
65 | * ->mapping->tree_lock | 65 | * ->mapping->tree_lock |
66 | * ->zone.lock | ||
66 | * | 67 | * |
67 | * ->i_mutex | 68 | * ->i_mutex |
68 | * ->i_mmap_lock (truncate->unmap_mapping_range) | 69 | * ->i_mmap_lock (truncate->unmap_mapping_range) |
@@ -593,7 +594,7 @@ void fastcall __lock_page_nosync(struct page *page) | |||
593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 594 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
594 | * If yes, increment its refcount and return it; if no, return NULL. | 595 | * If yes, increment its refcount and return it; if no, return NULL. |
595 | */ | 596 | */ |
596 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 597 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) |
597 | { | 598 | { |
598 | struct page *page; | 599 | struct page *page; |
599 | 600 | ||
@@ -617,30 +618,31 @@ EXPORT_SYMBOL(find_get_page); | |||
617 | * Returns zero if the page was not present. find_lock_page() may sleep. | 618 | * Returns zero if the page was not present. find_lock_page() may sleep. |
618 | */ | 619 | */ |
619 | struct page *find_lock_page(struct address_space *mapping, | 620 | struct page *find_lock_page(struct address_space *mapping, |
620 | unsigned long offset) | 621 | pgoff_t offset) |
621 | { | 622 | { |
622 | struct page *page; | 623 | struct page *page; |
623 | 624 | ||
624 | read_lock_irq(&mapping->tree_lock); | ||
625 | repeat: | 625 | repeat: |
626 | read_lock_irq(&mapping->tree_lock); | ||
626 | page = radix_tree_lookup(&mapping->page_tree, offset); | 627 | page = radix_tree_lookup(&mapping->page_tree, offset); |
627 | if (page) { | 628 | if (page) { |
628 | page_cache_get(page); | 629 | page_cache_get(page); |
629 | if (TestSetPageLocked(page)) { | 630 | if (TestSetPageLocked(page)) { |
630 | read_unlock_irq(&mapping->tree_lock); | 631 | read_unlock_irq(&mapping->tree_lock); |
631 | __lock_page(page); | 632 | __lock_page(page); |
632 | read_lock_irq(&mapping->tree_lock); | ||
633 | 633 | ||
634 | /* Has the page been truncated while we slept? */ | 634 | /* Has the page been truncated while we slept? */ |
635 | if (unlikely(page->mapping != mapping || | 635 | if (unlikely(page->mapping != mapping)) { |
636 | page->index != offset)) { | ||
637 | unlock_page(page); | 636 | unlock_page(page); |
638 | page_cache_release(page); | 637 | page_cache_release(page); |
639 | goto repeat; | 638 | goto repeat; |
640 | } | 639 | } |
640 | VM_BUG_ON(page->index != offset); | ||
641 | goto out; | ||
641 | } | 642 | } |
642 | } | 643 | } |
643 | read_unlock_irq(&mapping->tree_lock); | 644 | read_unlock_irq(&mapping->tree_lock); |
645 | out: | ||
644 | return page; | 646 | return page; |
645 | } | 647 | } |
646 | EXPORT_SYMBOL(find_lock_page); | 648 | EXPORT_SYMBOL(find_lock_page); |
@@ -663,29 +665,24 @@ EXPORT_SYMBOL(find_lock_page); | |||
663 | * memory exhaustion. | 665 | * memory exhaustion. |
664 | */ | 666 | */ |
665 | struct page *find_or_create_page(struct address_space *mapping, | 667 | struct page *find_or_create_page(struct address_space *mapping, |
666 | unsigned long index, gfp_t gfp_mask) | 668 | pgoff_t index, gfp_t gfp_mask) |
667 | { | 669 | { |
668 | struct page *page, *cached_page = NULL; | 670 | struct page *page; |
669 | int err; | 671 | int err; |
670 | repeat: | 672 | repeat: |
671 | page = find_lock_page(mapping, index); | 673 | page = find_lock_page(mapping, index); |
672 | if (!page) { | 674 | if (!page) { |
673 | if (!cached_page) { | 675 | page = __page_cache_alloc(gfp_mask); |
674 | cached_page = | 676 | if (!page) |
675 | __page_cache_alloc(gfp_mask); | 677 | return NULL; |
676 | if (!cached_page) | 678 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); |
677 | return NULL; | 679 | if (unlikely(err)) { |
680 | page_cache_release(page); | ||
681 | page = NULL; | ||
682 | if (err == -EEXIST) | ||
683 | goto repeat; | ||
678 | } | 684 | } |
679 | err = add_to_page_cache_lru(cached_page, mapping, | ||
680 | index, gfp_mask); | ||
681 | if (!err) { | ||
682 | page = cached_page; | ||
683 | cached_page = NULL; | ||
684 | } else if (err == -EEXIST) | ||
685 | goto repeat; | ||
686 | } | 685 | } |
687 | if (cached_page) | ||
688 | page_cache_release(cached_page); | ||
689 | return page; | 686 | return page; |
690 | } | 687 | } |
691 | EXPORT_SYMBOL(find_or_create_page); | 688 | EXPORT_SYMBOL(find_or_create_page); |
@@ -797,7 +794,7 @@ EXPORT_SYMBOL(find_get_pages_tag); | |||
797 | * and deadlock against the caller's locked page. | 794 | * and deadlock against the caller's locked page. |
798 | */ | 795 | */ |
799 | struct page * | 796 | struct page * |
800 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | 797 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) |
801 | { | 798 | { |
802 | struct page *page = find_get_page(mapping, index); | 799 | struct page *page = find_get_page(mapping, index); |
803 | 800 | ||
@@ -859,34 +856,29 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
859 | * It may be NULL. | 856 | * It may be NULL. |
860 | */ | 857 | */ |
861 | void do_generic_mapping_read(struct address_space *mapping, | 858 | void do_generic_mapping_read(struct address_space *mapping, |
862 | struct file_ra_state *_ra, | 859 | struct file_ra_state *ra, |
863 | struct file *filp, | 860 | struct file *filp, |
864 | loff_t *ppos, | 861 | loff_t *ppos, |
865 | read_descriptor_t *desc, | 862 | read_descriptor_t *desc, |
866 | read_actor_t actor) | 863 | read_actor_t actor) |
867 | { | 864 | { |
868 | struct inode *inode = mapping->host; | 865 | struct inode *inode = mapping->host; |
869 | unsigned long index; | 866 | pgoff_t index; |
870 | unsigned long offset; | 867 | pgoff_t last_index; |
871 | unsigned long last_index; | 868 | pgoff_t prev_index; |
872 | unsigned long next_index; | 869 | unsigned long offset; /* offset into pagecache page */ |
873 | unsigned long prev_index; | ||
874 | unsigned int prev_offset; | 870 | unsigned int prev_offset; |
875 | struct page *cached_page; | ||
876 | int error; | 871 | int error; |
877 | struct file_ra_state ra = *_ra; | ||
878 | 872 | ||
879 | cached_page = NULL; | ||
880 | index = *ppos >> PAGE_CACHE_SHIFT; | 873 | index = *ppos >> PAGE_CACHE_SHIFT; |
881 | next_index = index; | 874 | prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; |
882 | prev_index = ra.prev_index; | 875 | prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); |
883 | prev_offset = ra.prev_offset; | ||
884 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 876 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
885 | offset = *ppos & ~PAGE_CACHE_MASK; | 877 | offset = *ppos & ~PAGE_CACHE_MASK; |
886 | 878 | ||
887 | for (;;) { | 879 | for (;;) { |
888 | struct page *page; | 880 | struct page *page; |
889 | unsigned long end_index; | 881 | pgoff_t end_index; |
890 | loff_t isize; | 882 | loff_t isize; |
891 | unsigned long nr, ret; | 883 | unsigned long nr, ret; |
892 | 884 | ||
@@ -895,7 +887,7 @@ find_page: | |||
895 | page = find_get_page(mapping, index); | 887 | page = find_get_page(mapping, index); |
896 | if (!page) { | 888 | if (!page) { |
897 | page_cache_sync_readahead(mapping, | 889 | page_cache_sync_readahead(mapping, |
898 | &ra, filp, | 890 | ra, filp, |
899 | index, last_index - index); | 891 | index, last_index - index); |
900 | page = find_get_page(mapping, index); | 892 | page = find_get_page(mapping, index); |
901 | if (unlikely(page == NULL)) | 893 | if (unlikely(page == NULL)) |
@@ -903,7 +895,7 @@ find_page: | |||
903 | } | 895 | } |
904 | if (PageReadahead(page)) { | 896 | if (PageReadahead(page)) { |
905 | page_cache_async_readahead(mapping, | 897 | page_cache_async_readahead(mapping, |
906 | &ra, filp, page, | 898 | ra, filp, page, |
907 | index, last_index - index); | 899 | index, last_index - index); |
908 | } | 900 | } |
909 | if (!PageUptodate(page)) | 901 | if (!PageUptodate(page)) |
@@ -966,7 +958,6 @@ page_ok: | |||
966 | index += offset >> PAGE_CACHE_SHIFT; | 958 | index += offset >> PAGE_CACHE_SHIFT; |
967 | offset &= ~PAGE_CACHE_MASK; | 959 | offset &= ~PAGE_CACHE_MASK; |
968 | prev_offset = offset; | 960 | prev_offset = offset; |
969 | ra.prev_offset = offset; | ||
970 | 961 | ||
971 | page_cache_release(page); | 962 | page_cache_release(page); |
972 | if (ret == nr && desc->count) | 963 | if (ret == nr && desc->count) |
@@ -1015,7 +1006,7 @@ readpage: | |||
1015 | } | 1006 | } |
1016 | unlock_page(page); | 1007 | unlock_page(page); |
1017 | error = -EIO; | 1008 | error = -EIO; |
1018 | shrink_readahead_size_eio(filp, &ra); | 1009 | shrink_readahead_size_eio(filp, ra); |
1019 | goto readpage_error; | 1010 | goto readpage_error; |
1020 | } | 1011 | } |
1021 | unlock_page(page); | 1012 | unlock_page(page); |
@@ -1034,33 +1025,29 @@ no_cached_page: | |||
1034 | * Ok, it wasn't cached, so we need to create a new | 1025 | * Ok, it wasn't cached, so we need to create a new |
1035 | * page.. | 1026 | * page.. |
1036 | */ | 1027 | */ |
1037 | if (!cached_page) { | 1028 | page = page_cache_alloc_cold(mapping); |
1038 | cached_page = page_cache_alloc_cold(mapping); | 1029 | if (!page) { |
1039 | if (!cached_page) { | 1030 | desc->error = -ENOMEM; |
1040 | desc->error = -ENOMEM; | 1031 | goto out; |
1041 | goto out; | ||
1042 | } | ||
1043 | } | 1032 | } |
1044 | error = add_to_page_cache_lru(cached_page, mapping, | 1033 | error = add_to_page_cache_lru(page, mapping, |
1045 | index, GFP_KERNEL); | 1034 | index, GFP_KERNEL); |
1046 | if (error) { | 1035 | if (error) { |
1036 | page_cache_release(page); | ||
1047 | if (error == -EEXIST) | 1037 | if (error == -EEXIST) |
1048 | goto find_page; | 1038 | goto find_page; |
1049 | desc->error = error; | 1039 | desc->error = error; |
1050 | goto out; | 1040 | goto out; |
1051 | } | 1041 | } |
1052 | page = cached_page; | ||
1053 | cached_page = NULL; | ||
1054 | goto readpage; | 1042 | goto readpage; |
1055 | } | 1043 | } |
1056 | 1044 | ||
1057 | out: | 1045 | out: |
1058 | *_ra = ra; | 1046 | ra->prev_pos = prev_index; |
1059 | _ra->prev_index = prev_index; | 1047 | ra->prev_pos <<= PAGE_CACHE_SHIFT; |
1048 | ra->prev_pos |= prev_offset; | ||
1060 | 1049 | ||
1061 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; | 1050 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; |
1062 | if (cached_page) | ||
1063 | page_cache_release(cached_page); | ||
1064 | if (filp) | 1051 | if (filp) |
1065 | file_accessed(filp); | 1052 | file_accessed(filp); |
1066 | } | 1053 | } |
@@ -1220,7 +1207,7 @@ EXPORT_SYMBOL(generic_file_aio_read); | |||
1220 | 1207 | ||
1221 | static ssize_t | 1208 | static ssize_t |
1222 | do_readahead(struct address_space *mapping, struct file *filp, | 1209 | do_readahead(struct address_space *mapping, struct file *filp, |
1223 | unsigned long index, unsigned long nr) | 1210 | pgoff_t index, unsigned long nr) |
1224 | { | 1211 | { |
1225 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1212 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
1226 | return -EINVAL; | 1213 | return -EINVAL; |
@@ -1240,8 +1227,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1240 | if (file) { | 1227 | if (file) { |
1241 | if (file->f_mode & FMODE_READ) { | 1228 | if (file->f_mode & FMODE_READ) { |
1242 | struct address_space *mapping = file->f_mapping; | 1229 | struct address_space *mapping = file->f_mapping; |
1243 | unsigned long start = offset >> PAGE_CACHE_SHIFT; | 1230 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
1244 | unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 1231 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
1245 | unsigned long len = end - start + 1; | 1232 | unsigned long len = end - start + 1; |
1246 | ret = do_readahead(mapping, file, start, len); | 1233 | ret = do_readahead(mapping, file, start, len); |
1247 | } | 1234 | } |
@@ -1251,7 +1238,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1251 | } | 1238 | } |
1252 | 1239 | ||
1253 | #ifdef CONFIG_MMU | 1240 | #ifdef CONFIG_MMU |
1254 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
1255 | /** | 1241 | /** |
1256 | * page_cache_read - adds requested page to the page cache if not already there | 1242 | * page_cache_read - adds requested page to the page cache if not already there |
1257 | * @file: file to read | 1243 | * @file: file to read |
@@ -1260,7 +1246,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | |||
1260 | * This adds the requested page to the page cache if it isn't already there, | 1246 | * This adds the requested page to the page cache if it isn't already there, |
1261 | * and schedules an I/O to read in its contents from disk. | 1247 | * and schedules an I/O to read in its contents from disk. |
1262 | */ | 1248 | */ |
1263 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1249 | static int fastcall page_cache_read(struct file * file, pgoff_t offset) |
1264 | { | 1250 | { |
1265 | struct address_space *mapping = file->f_mapping; | 1251 | struct address_space *mapping = file->f_mapping; |
1266 | struct page *page; | 1252 | struct page *page; |
@@ -1349,7 +1335,7 @@ retry_find: | |||
1349 | * Do we miss much more than hit in this file? If so, | 1335 | * Do we miss much more than hit in this file? If so, |
1350 | * stop bothering with read-ahead. It will only hurt. | 1336 | * stop bothering with read-ahead. It will only hurt. |
1351 | */ | 1337 | */ |
1352 | if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) | 1338 | if (ra->mmap_miss > MMAP_LOTSAMISS) |
1353 | goto no_cached_page; | 1339 | goto no_cached_page; |
1354 | 1340 | ||
1355 | /* | 1341 | /* |
@@ -1375,7 +1361,7 @@ retry_find: | |||
1375 | } | 1361 | } |
1376 | 1362 | ||
1377 | if (!did_readaround) | 1363 | if (!did_readaround) |
1378 | ra->mmap_hit++; | 1364 | ra->mmap_miss--; |
1379 | 1365 | ||
1380 | /* | 1366 | /* |
1381 | * We have a locked page in the page cache, now we need to check | 1367 | * We have a locked page in the page cache, now we need to check |
@@ -1396,7 +1382,7 @@ retry_find: | |||
1396 | * Found the page and have a reference on it. | 1382 | * Found the page and have a reference on it. |
1397 | */ | 1383 | */ |
1398 | mark_page_accessed(page); | 1384 | mark_page_accessed(page); |
1399 | ra->prev_index = page->index; | 1385 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
1400 | vmf->page = page; | 1386 | vmf->page = page; |
1401 | return ret | VM_FAULT_LOCKED; | 1387 | return ret | VM_FAULT_LOCKED; |
1402 | 1388 | ||
@@ -1501,39 +1487,32 @@ EXPORT_SYMBOL(generic_file_mmap); | |||
1501 | EXPORT_SYMBOL(generic_file_readonly_mmap); | 1487 | EXPORT_SYMBOL(generic_file_readonly_mmap); |
1502 | 1488 | ||
1503 | static struct page *__read_cache_page(struct address_space *mapping, | 1489 | static struct page *__read_cache_page(struct address_space *mapping, |
1504 | unsigned long index, | 1490 | pgoff_t index, |
1505 | int (*filler)(void *,struct page*), | 1491 | int (*filler)(void *,struct page*), |
1506 | void *data) | 1492 | void *data) |
1507 | { | 1493 | { |
1508 | struct page *page, *cached_page = NULL; | 1494 | struct page *page; |
1509 | int err; | 1495 | int err; |
1510 | repeat: | 1496 | repeat: |
1511 | page = find_get_page(mapping, index); | 1497 | page = find_get_page(mapping, index); |
1512 | if (!page) { | 1498 | if (!page) { |
1513 | if (!cached_page) { | 1499 | page = page_cache_alloc_cold(mapping); |
1514 | cached_page = page_cache_alloc_cold(mapping); | 1500 | if (!page) |
1515 | if (!cached_page) | 1501 | return ERR_PTR(-ENOMEM); |
1516 | return ERR_PTR(-ENOMEM); | 1502 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
1517 | } | 1503 | if (unlikely(err)) { |
1518 | err = add_to_page_cache_lru(cached_page, mapping, | 1504 | page_cache_release(page); |
1519 | index, GFP_KERNEL); | 1505 | if (err == -EEXIST) |
1520 | if (err == -EEXIST) | 1506 | goto repeat; |
1521 | goto repeat; | ||
1522 | if (err < 0) { | ||
1523 | /* Presumably ENOMEM for radix tree node */ | 1507 | /* Presumably ENOMEM for radix tree node */ |
1524 | page_cache_release(cached_page); | ||
1525 | return ERR_PTR(err); | 1508 | return ERR_PTR(err); |
1526 | } | 1509 | } |
1527 | page = cached_page; | ||
1528 | cached_page = NULL; | ||
1529 | err = filler(data, page); | 1510 | err = filler(data, page); |
1530 | if (err < 0) { | 1511 | if (err < 0) { |
1531 | page_cache_release(page); | 1512 | page_cache_release(page); |
1532 | page = ERR_PTR(err); | 1513 | page = ERR_PTR(err); |
1533 | } | 1514 | } |
1534 | } | 1515 | } |
1535 | if (cached_page) | ||
1536 | page_cache_release(cached_page); | ||
1537 | return page; | 1516 | return page; |
1538 | } | 1517 | } |
1539 | 1518 | ||
@@ -1542,7 +1521,7 @@ repeat: | |||
1542 | * after submitting it to the filler. | 1521 | * after submitting it to the filler. |
1543 | */ | 1522 | */ |
1544 | struct page *read_cache_page_async(struct address_space *mapping, | 1523 | struct page *read_cache_page_async(struct address_space *mapping, |
1545 | unsigned long index, | 1524 | pgoff_t index, |
1546 | int (*filler)(void *,struct page*), | 1525 | int (*filler)(void *,struct page*), |
1547 | void *data) | 1526 | void *data) |
1548 | { | 1527 | { |
@@ -1590,7 +1569,7 @@ EXPORT_SYMBOL(read_cache_page_async); | |||
1590 | * If the page does not get brought uptodate, return -EIO. | 1569 | * If the page does not get brought uptodate, return -EIO. |
1591 | */ | 1570 | */ |
1592 | struct page *read_cache_page(struct address_space *mapping, | 1571 | struct page *read_cache_page(struct address_space *mapping, |
1593 | unsigned long index, | 1572 | pgoff_t index, |
1594 | int (*filler)(void *,struct page*), | 1573 | int (*filler)(void *,struct page*), |
1595 | void *data) | 1574 | void *data) |
1596 | { | 1575 | { |
@@ -1610,40 +1589,6 @@ struct page *read_cache_page(struct address_space *mapping, | |||
1610 | EXPORT_SYMBOL(read_cache_page); | 1589 | EXPORT_SYMBOL(read_cache_page); |
1611 | 1590 | ||
1612 | /* | 1591 | /* |
1613 | * If the page was newly created, increment its refcount and add it to the | ||
1614 | * caller's lru-buffering pagevec. This function is specifically for | ||
1615 | * generic_file_write(). | ||
1616 | */ | ||
1617 | static inline struct page * | ||
1618 | __grab_cache_page(struct address_space *mapping, unsigned long index, | ||
1619 | struct page **cached_page, struct pagevec *lru_pvec) | ||
1620 | { | ||
1621 | int err; | ||
1622 | struct page *page; | ||
1623 | repeat: | ||
1624 | page = find_lock_page(mapping, index); | ||
1625 | if (!page) { | ||
1626 | if (!*cached_page) { | ||
1627 | *cached_page = page_cache_alloc(mapping); | ||
1628 | if (!*cached_page) | ||
1629 | return NULL; | ||
1630 | } | ||
1631 | err = add_to_page_cache(*cached_page, mapping, | ||
1632 | index, GFP_KERNEL); | ||
1633 | if (err == -EEXIST) | ||
1634 | goto repeat; | ||
1635 | if (err == 0) { | ||
1636 | page = *cached_page; | ||
1637 | page_cache_get(page); | ||
1638 | if (!pagevec_add(lru_pvec, page)) | ||
1639 | __pagevec_lru_add(lru_pvec); | ||
1640 | *cached_page = NULL; | ||
1641 | } | ||
1642 | } | ||
1643 | return page; | ||
1644 | } | ||
1645 | |||
1646 | /* | ||
1647 | * The logic we want is | 1592 | * The logic we want is |
1648 | * | 1593 | * |
1649 | * if suid or (sgid and xgrp) | 1594 | * if suid or (sgid and xgrp) |
@@ -1682,17 +1627,22 @@ int __remove_suid(struct dentry *dentry, int kill) | |||
1682 | 1627 | ||
1683 | int remove_suid(struct dentry *dentry) | 1628 | int remove_suid(struct dentry *dentry) |
1684 | { | 1629 | { |
1685 | int kill = should_remove_suid(dentry); | 1630 | int killsuid = should_remove_suid(dentry); |
1631 | int killpriv = security_inode_need_killpriv(dentry); | ||
1632 | int error = 0; | ||
1686 | 1633 | ||
1687 | if (unlikely(kill)) | 1634 | if (killpriv < 0) |
1688 | return __remove_suid(dentry, kill); | 1635 | return killpriv; |
1636 | if (killpriv) | ||
1637 | error = security_inode_killpriv(dentry); | ||
1638 | if (!error && killsuid) | ||
1639 | error = __remove_suid(dentry, killsuid); | ||
1689 | 1640 | ||
1690 | return 0; | 1641 | return error; |
1691 | } | 1642 | } |
1692 | EXPORT_SYMBOL(remove_suid); | 1643 | EXPORT_SYMBOL(remove_suid); |
1693 | 1644 | ||
1694 | size_t | 1645 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
1695 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
1696 | const struct iovec *iov, size_t base, size_t bytes) | 1646 | const struct iovec *iov, size_t base, size_t bytes) |
1697 | { | 1647 | { |
1698 | size_t copied = 0, left = 0; | 1648 | size_t copied = 0, left = 0; |
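[Annotation] A caller-side note on the remove_suid() rework earlier in this hunk: it now folds in security_inode_need_killpriv()/security_inode_killpriv(), so a write path gets both setuid/setgid stripping and capability stripping from one call and must propagate the error. A hedged paraphrase of how an ->aio_write path consumes it (example_copy_data is hypothetical):

	static ssize_t example_aio_write(struct kiocb *iocb, const struct iovec *iov,
					 unsigned long nr_segs, loff_t pos)
	{
		struct file *file = iocb->ki_filp;
		struct inode *inode = file->f_mapping->host;
		ssize_t ret;

		mutex_lock(&inode->i_mutex);
		ret = remove_suid(file->f_path.dentry);	/* drops suid/sgid and capabilities */
		if (!ret)
			ret = example_copy_data(iocb, iov, nr_segs, pos);
		mutex_unlock(&inode->i_mutex);
		return ret;
	}
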
@@ -1715,6 +1665,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, | |||
1715 | } | 1665 | } |
1716 | 1666 | ||
1717 | /* | 1667 | /* |
1668 | * Copy as much as we can into the page and return the number of bytes which | ||
1669 | * were sucessfully copied. If a fault is encountered then return the number of | ||
1670 | * bytes which were copied. | ||
1671 | */ | ||
1672 | size_t iov_iter_copy_from_user_atomic(struct page *page, | ||
1673 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
1674 | { | ||
1675 | char *kaddr; | ||
1676 | size_t copied; | ||
1677 | |||
1678 | BUG_ON(!in_atomic()); | ||
1679 | kaddr = kmap_atomic(page, KM_USER0); | ||
1680 | if (likely(i->nr_segs == 1)) { | ||
1681 | int left; | ||
1682 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
1683 | left = __copy_from_user_inatomic_nocache(kaddr + offset, | ||
1684 | buf, bytes); | ||
1685 | copied = bytes - left; | ||
1686 | } else { | ||
1687 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
1688 | i->iov, i->iov_offset, bytes); | ||
1689 | } | ||
1690 | kunmap_atomic(kaddr, KM_USER0); | ||
1691 | |||
1692 | return copied; | ||
1693 | } | ||
1694 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); | ||
1695 | |||
1696 | /* | ||
1697 | * This has the same sideeffects and return value as | ||
1698 | * iov_iter_copy_from_user_atomic(). | ||
1699 | * The difference is that it attempts to resolve faults. | ||
1700 | * Page must not be locked. | ||
1701 | */ | ||
1702 | size_t iov_iter_copy_from_user(struct page *page, | ||
1703 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
1704 | { | ||
1705 | char *kaddr; | ||
1706 | size_t copied; | ||
1707 | |||
1708 | kaddr = kmap(page); | ||
1709 | if (likely(i->nr_segs == 1)) { | ||
1710 | int left; | ||
1711 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
1712 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
1713 | copied = bytes - left; | ||
1714 | } else { | ||
1715 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
1716 | i->iov, i->iov_offset, bytes); | ||
1717 | } | ||
1718 | kunmap(page); | ||
1719 | return copied; | ||
1720 | } | ||
1721 | EXPORT_SYMBOL(iov_iter_copy_from_user); | ||
1722 | |||
1723 | static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) | ||
1724 | { | ||
1725 | if (likely(i->nr_segs == 1)) { | ||
1726 | i->iov_offset += bytes; | ||
1727 | } else { | ||
1728 | const struct iovec *iov = i->iov; | ||
1729 | size_t base = i->iov_offset; | ||
1730 | |||
1731 | while (bytes) { | ||
1732 | int copy = min(bytes, iov->iov_len - base); | ||
1733 | |||
1734 | bytes -= copy; | ||
1735 | base += copy; | ||
1736 | if (iov->iov_len == base) { | ||
1737 | iov++; | ||
1738 | base = 0; | ||
1739 | } | ||
1740 | } | ||
1741 | i->iov = iov; | ||
1742 | i->iov_offset = base; | ||
1743 | } | ||
1744 | } | ||
1745 | |||
1746 | void iov_iter_advance(struct iov_iter *i, size_t bytes) | ||
1747 | { | ||
1748 | BUG_ON(i->count < bytes); | ||
1749 | |||
1750 | __iov_iter_advance_iov(i, bytes); | ||
1751 | i->count -= bytes; | ||
1752 | } | ||
1753 | EXPORT_SYMBOL(iov_iter_advance); | ||
1754 | |||
1755 | /* | ||
1756 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
1757 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
1758 | * accessed (ie. because it is an invalid address). | ||
1759 | * | ||
1760 | * writev-intensive code may want this to prefault several iovecs -- that | ||
1761 | * would be possible (callers must not rely on the fact that _only_ the | ||
1762 | * first iovec will be faulted with the current implementation). | ||
1763 | */ | ||
1764 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
1765 | { | ||
1766 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
1767 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); | ||
1768 | return fault_in_pages_readable(buf, bytes); | ||
1769 | } | ||
1770 | EXPORT_SYMBOL(iov_iter_fault_in_readable); | ||
1771 | |||
1772 | /* | ||
1773 | * Return the count of just the current iov_iter segment. | ||
1774 | */ | ||
1775 | size_t iov_iter_single_seg_count(struct iov_iter *i) | ||
1776 | { | ||
1777 | const struct iovec *iov = i->iov; | ||
1778 | if (i->nr_segs == 1) | ||
1779 | return i->count; | ||
1780 | else | ||
1781 | return min(i->count, iov->iov_len - i->iov_offset); | ||
1782 | } | ||
1783 | EXPORT_SYMBOL(iov_iter_single_seg_count); | ||
1784 | |||
1785 | /* | ||
1718 | * Performs necessary checks before doing a write | 1786 | * Performs necessary checks before doing a write |
1719 | * | 1787 | * |
1720 | * Can adjust writing position or amount of bytes to write. | 1788 | * Can adjust writing position or amount of bytes to write. |
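[Annotation] The helpers just added (iov_iter_fault_in_readable(), iov_iter_copy_from_user_atomic(), iov_iter_advance(), iov_iter_single_seg_count()) are meant to be used as a pre-fault / atomic-copy / advance cycle; generic_perform_write() later in this file's diff is the real consumer. A trimmed, hedged sketch of the pattern (example_copy_into_page is illustrative only):

	static size_t example_copy_into_page(struct page *page, struct iov_iter *i,
					     unsigned long offset, size_t bytes)
	{
		size_t copied;

		/* Fault the user memory in first; zero means it is readable. */
		if (iov_iter_fault_in_readable(i, bytes))
			return 0;

		/* Copy with pagefaults disabled; a short copy is allowed. */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();

		/* Consume only what actually made it into the page. */
		iov_iter_advance(i, copied);
		return copied;
	}
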
@@ -1796,6 +1864,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
1796 | } | 1864 | } |
1797 | EXPORT_SYMBOL(generic_write_checks); | 1865 | EXPORT_SYMBOL(generic_write_checks); |
1798 | 1866 | ||
1867 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | ||
1868 | loff_t pos, unsigned len, unsigned flags, | ||
1869 | struct page **pagep, void **fsdata) | ||
1870 | { | ||
1871 | const struct address_space_operations *aops = mapping->a_ops; | ||
1872 | |||
1873 | if (aops->write_begin) { | ||
1874 | return aops->write_begin(file, mapping, pos, len, flags, | ||
1875 | pagep, fsdata); | ||
1876 | } else { | ||
1877 | int ret; | ||
1878 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
1879 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1880 | struct inode *inode = mapping->host; | ||
1881 | struct page *page; | ||
1882 | again: | ||
1883 | page = __grab_cache_page(mapping, index); | ||
1884 | *pagep = page; | ||
1885 | if (!page) | ||
1886 | return -ENOMEM; | ||
1887 | |||
1888 | if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { | ||
1889 | /* | ||
1890 | * There is no way to resolve a short write situation | ||
1891 | * for a !Uptodate page (except by double copying in | ||
1892 | * the caller done by generic_perform_write_2copy). | ||
1893 | * | ||
1894 | * Instead, we have to bring it uptodate here. | ||
1895 | */ | ||
1896 | ret = aops->readpage(file, page); | ||
1897 | page_cache_release(page); | ||
1898 | if (ret) { | ||
1899 | if (ret == AOP_TRUNCATED_PAGE) | ||
1900 | goto again; | ||
1901 | return ret; | ||
1902 | } | ||
1903 | goto again; | ||
1904 | } | ||
1905 | |||
1906 | ret = aops->prepare_write(file, page, offset, offset+len); | ||
1907 | if (ret) { | ||
1908 | unlock_page(page); | ||
1909 | page_cache_release(page); | ||
1910 | if (pos + len > inode->i_size) | ||
1911 | vmtruncate(inode, inode->i_size); | ||
1912 | } | ||
1913 | return ret; | ||
1914 | } | ||
1915 | } | ||
1916 | EXPORT_SYMBOL(pagecache_write_begin); | ||
1917 | |||
1918 | int pagecache_write_end(struct file *file, struct address_space *mapping, | ||
1919 | loff_t pos, unsigned len, unsigned copied, | ||
1920 | struct page *page, void *fsdata) | ||
1921 | { | ||
1922 | const struct address_space_operations *aops = mapping->a_ops; | ||
1923 | int ret; | ||
1924 | |||
1925 | if (aops->write_end) { | ||
1926 | mark_page_accessed(page); | ||
1927 | ret = aops->write_end(file, mapping, pos, len, copied, | ||
1928 | page, fsdata); | ||
1929 | } else { | ||
1930 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1931 | struct inode *inode = mapping->host; | ||
1932 | |||
1933 | flush_dcache_page(page); | ||
1934 | ret = aops->commit_write(file, page, offset, offset+len); | ||
1935 | unlock_page(page); | ||
1936 | mark_page_accessed(page); | ||
1937 | page_cache_release(page); | ||
1938 | |||
1939 | if (ret < 0) { | ||
1940 | if (pos + len > inode->i_size) | ||
1941 | vmtruncate(inode, inode->i_size); | ||
1942 | } else if (ret > 0) | ||
1943 | ret = min_t(size_t, copied, ret); | ||
1944 | else | ||
1945 | ret = copied; | ||
1946 | } | ||
1947 | |||
1948 | return ret; | ||
1949 | } | ||
1950 | EXPORT_SYMBOL(pagecache_write_end); | ||
1951 | |||
1799 | ssize_t | 1952 | ssize_t |
1800 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 1953 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
1801 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 1954 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
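[Annotation] pagecache_write_begin()/pagecache_write_end() above give in-kernel writers a single entry point whether the filesystem implements the new write_begin/write_end hooks or still provides prepare_write/commit_write. A hedged sketch of a caller copying a kernel buffer into the pagecache (hypothetical helper; assumes [pos, pos + len) stays inside one page):

	static int example_write_chunk(struct file *file, loff_t pos,
				       const void *buf, unsigned len)
	{
		struct address_space *mapping = file->f_mapping;
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		struct page *page;
		void *fsdata;
		void *kaddr;
		int ret;

		ret = pagecache_write_begin(file, mapping, pos, len,
					    AOP_FLAG_UNINTERRUPTIBLE,
					    &page, &fsdata);
		if (ret)
			return ret;

		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr + offset, buf, len);	/* kernel source, cannot fault */
		kunmap_atomic(kaddr, KM_USER0);

		ret = pagecache_write_end(file, mapping, pos, len, len,
					  page, fsdata);
		return ret < 0 ? ret : 0;
	}
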
@@ -1835,151 +1988,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
1835 | } | 1988 | } |
1836 | EXPORT_SYMBOL(generic_file_direct_write); | 1989 | EXPORT_SYMBOL(generic_file_direct_write); |
1837 | 1990 | ||
1838 | ssize_t | 1991 | /* |
1839 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 1992 | * Find or create a page at the given pagecache position. Return the locked |
1840 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | 1993 | * page. This function is specifically for buffered writes. |
1841 | size_t count, ssize_t written) | 1994 | */ |
1995 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) | ||
1842 | { | 1996 | { |
1843 | struct file *file = iocb->ki_filp; | 1997 | int status; |
1844 | struct address_space * mapping = file->f_mapping; | 1998 | struct page *page; |
1845 | const struct address_space_operations *a_ops = mapping->a_ops; | 1999 | repeat: |
1846 | struct inode *inode = mapping->host; | 2000 | page = find_lock_page(mapping, index); |
1847 | long status = 0; | 2001 | if (likely(page)) |
1848 | struct page *page; | 2002 | return page; |
1849 | struct page *cached_page = NULL; | ||
1850 | size_t bytes; | ||
1851 | struct pagevec lru_pvec; | ||
1852 | const struct iovec *cur_iov = iov; /* current iovec */ | ||
1853 | size_t iov_base = 0; /* offset in the current iovec */ | ||
1854 | char __user *buf; | ||
1855 | |||
1856 | pagevec_init(&lru_pvec, 0); | ||
1857 | 2003 | ||
1858 | /* | 2004 | page = page_cache_alloc(mapping); |
1859 | * handle partial DIO write. Adjust cur_iov if needed. | 2005 | if (!page) |
1860 | */ | 2006 | return NULL; |
1861 | if (likely(nr_segs == 1)) | 2007 | status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
1862 | buf = iov->iov_base + written; | 2008 | if (unlikely(status)) { |
1863 | else { | 2009 | page_cache_release(page); |
1864 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | 2010 | if (status == -EEXIST) |
1865 | buf = cur_iov->iov_base + iov_base; | 2011 | goto repeat; |
2012 | return NULL; | ||
1866 | } | 2013 | } |
2014 | return page; | ||
2015 | } | ||
2016 | EXPORT_SYMBOL(__grab_cache_page); | ||
2017 | |||
2018 | static ssize_t generic_perform_write_2copy(struct file *file, | ||
2019 | struct iov_iter *i, loff_t pos) | ||
2020 | { | ||
2021 | struct address_space *mapping = file->f_mapping; | ||
2022 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2023 | struct inode *inode = mapping->host; | ||
2024 | long status = 0; | ||
2025 | ssize_t written = 0; | ||
1867 | 2026 | ||
1868 | do { | 2027 | do { |
1869 | unsigned long index; | 2028 | struct page *src_page; |
1870 | unsigned long offset; | 2029 | struct page *page; |
1871 | size_t copied; | 2030 | pgoff_t index; /* Pagecache index for current page */ |
2031 | unsigned long offset; /* Offset into pagecache page */ | ||
2032 | unsigned long bytes; /* Bytes to write to page */ | ||
2033 | size_t copied; /* Bytes copied from user */ | ||
1872 | 2034 | ||
1873 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2035 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
1874 | index = pos >> PAGE_CACHE_SHIFT; | 2036 | index = pos >> PAGE_CACHE_SHIFT; |
1875 | bytes = PAGE_CACHE_SIZE - offset; | 2037 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
2038 | iov_iter_count(i)); | ||
1876 | 2039 | ||
1877 | /* Limit the size of the copy to the caller's write size */ | 2040 | /* |
1878 | bytes = min(bytes, count); | 2041 | * a non-NULL src_page indicates that we're doing the |
1879 | 2042 | * copy via get_user_pages and kmap. | |
1880 | /* We only need to worry about prefaulting when writes are from | ||
1881 | * user-space. NFSd uses vfs_writev with several non-aligned | ||
1882 | * segments in the vector, and limiting to one segment a time is | ||
1883 | * a noticeable performance for re-write | ||
1884 | */ | 2043 | */ |
1885 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 2044 | src_page = NULL; |
1886 | /* | ||
1887 | * Limit the size of the copy to that of the current | ||
1888 | * segment, because fault_in_pages_readable() doesn't | ||
1889 | * know how to walk segments. | ||
1890 | */ | ||
1891 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
1892 | 2045 | ||
1893 | /* | 2046 | /* |
1894 | * Bring in the user page that we will copy from | 2047 | * Bring in the user page that we will copy from _first_. |
1895 | * _first_. Otherwise there's a nasty deadlock on | 2048 | * Otherwise there's a nasty deadlock on copying from the |
1896 | * copying from the same page as we're writing to, | 2049 | * same page as we're writing to, without it being marked |
1897 | * without it being marked up-to-date. | 2050 | * up-to-date. |
1898 | */ | 2051 | * |
1899 | fault_in_pages_readable(buf, bytes); | 2052 | * Not only is this an optimisation, but it is also required |
2053 | * to check that the address is actually valid, when atomic | ||
2054 | * usercopies are used, below. | ||
2055 | */ | ||
2056 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
2057 | status = -EFAULT; | ||
2058 | break; | ||
1900 | } | 2059 | } |
1901 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2060 | |
2061 | page = __grab_cache_page(mapping, index); | ||
1902 | if (!page) { | 2062 | if (!page) { |
1903 | status = -ENOMEM; | 2063 | status = -ENOMEM; |
1904 | break; | 2064 | break; |
1905 | } | 2065 | } |
1906 | 2066 | ||
1907 | if (unlikely(bytes == 0)) { | 2067 | /* |
1908 | status = 0; | 2068 | * non-uptodate pages cannot cope with short copies, and we |
1909 | copied = 0; | 2069 | * cannot take a pagefault with the destination page locked. |
1910 | goto zero_length_segment; | 2070 | * So pin the source page to copy it. |
1911 | } | 2071 | */ |
2072 | if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { | ||
2073 | unlock_page(page); | ||
1912 | 2074 | ||
1913 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2075 | src_page = alloc_page(GFP_KERNEL); |
1914 | if (unlikely(status)) { | 2076 | if (!src_page) { |
1915 | loff_t isize = i_size_read(inode); | 2077 | page_cache_release(page); |
2078 | status = -ENOMEM; | ||
2079 | break; | ||
2080 | } | ||
1916 | 2081 | ||
1917 | if (status != AOP_TRUNCATED_PAGE) | 2082 | /* |
2083 | * Cannot get_user_pages with a page locked for the | ||
2084 | * same reason as we can't take a page fault with a | ||
2085 | * page locked (as explained below). | ||
2086 | */ | ||
2087 | copied = iov_iter_copy_from_user(src_page, i, | ||
2088 | offset, bytes); | ||
2089 | if (unlikely(copied == 0)) { | ||
2090 | status = -EFAULT; | ||
2091 | page_cache_release(page); | ||
2092 | page_cache_release(src_page); | ||
2093 | break; | ||
2094 | } | ||
2095 | bytes = copied; | ||
2096 | |||
2097 | lock_page(page); | ||
2098 | /* | ||
2099 | * Can't handle the page going uptodate here, because | ||
2100 | * that means we would use non-atomic usercopies, which | ||
2101 | * zero out the tail of the page, which can cause | ||
2102 | * zeroes to become transiently visible. We could just | ||
2103 | * use a non-zeroing copy, but the APIs aren't too | ||
2104 | * consistent. | ||
2105 | */ | ||
2106 | if (unlikely(!page->mapping || PageUptodate(page))) { | ||
1918 | unlock_page(page); | 2107 | unlock_page(page); |
1919 | page_cache_release(page); | 2108 | page_cache_release(page); |
1920 | if (status == AOP_TRUNCATED_PAGE) | 2109 | page_cache_release(src_page); |
1921 | continue; | 2110 | continue; |
2111 | } | ||
2112 | } | ||
2113 | |||
2114 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | ||
2115 | if (unlikely(status)) | ||
2116 | goto fs_write_aop_error; | ||
2117 | |||
2118 | if (!src_page) { | ||
1922 | /* | 2119 | /* |
1923 | * prepare_write() may have instantiated a few blocks | 2120 | * Must not enter the pagefault handler here, because |
1924 | * outside i_size. Trim these off again. | 2121 | * we hold the page lock, so we might recursively |
2122 | * deadlock on the same lock, or get an ABBA deadlock | ||
2123 | * against a different lock, or against the mmap_sem | ||
2124 | * (which nests outside the page lock). So increment | ||
2125 | * preempt count, and use _atomic usercopies. | ||
2126 | * | ||
2127 | * The page is uptodate so we are OK to encounter a | ||
2128 | * short copy: if unmodified parts of the page are | ||
2129 | * marked dirty and written out to disk, it doesn't | ||
2130 | * really matter. | ||
1925 | */ | 2131 | */ |
1926 | if (pos + bytes > isize) | 2132 | pagefault_disable(); |
1927 | vmtruncate(inode, isize); | 2133 | copied = iov_iter_copy_from_user_atomic(page, i, |
1928 | break; | 2134 | offset, bytes); |
2135 | pagefault_enable(); | ||
2136 | } else { | ||
2137 | void *src, *dst; | ||
2138 | src = kmap_atomic(src_page, KM_USER0); | ||
2139 | dst = kmap_atomic(page, KM_USER1); | ||
2140 | memcpy(dst + offset, src + offset, bytes); | ||
2141 | kunmap_atomic(dst, KM_USER1); | ||
2142 | kunmap_atomic(src, KM_USER0); | ||
2143 | copied = bytes; | ||
1929 | } | 2144 | } |
1930 | if (likely(nr_segs == 1)) | ||
1931 | copied = filemap_copy_from_user(page, offset, | ||
1932 | buf, bytes); | ||
1933 | else | ||
1934 | copied = filemap_copy_from_user_iovec(page, offset, | ||
1935 | cur_iov, iov_base, bytes); | ||
1936 | flush_dcache_page(page); | 2145 | flush_dcache_page(page); |
2146 | |||
1937 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 2147 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
1938 | if (status == AOP_TRUNCATED_PAGE) { | 2148 | if (unlikely(status < 0)) |
1939 | page_cache_release(page); | 2149 | goto fs_write_aop_error; |
1940 | continue; | 2150 | if (unlikely(status > 0)) /* filesystem did partial write */ |
1941 | } | 2151 | copied = min_t(size_t, copied, status); |
1942 | zero_length_segment: | 2152 | |
1943 | if (likely(copied >= 0)) { | ||
1944 | if (!status) | ||
1945 | status = copied; | ||
1946 | |||
1947 | if (status >= 0) { | ||
1948 | written += status; | ||
1949 | count -= status; | ||
1950 | pos += status; | ||
1951 | buf += status; | ||
1952 | if (unlikely(nr_segs > 1)) { | ||
1953 | filemap_set_next_iovec(&cur_iov, | ||
1954 | &iov_base, status); | ||
1955 | if (count) | ||
1956 | buf = cur_iov->iov_base + | ||
1957 | iov_base; | ||
1958 | } else { | ||
1959 | iov_base += status; | ||
1960 | } | ||
1961 | } | ||
1962 | } | ||
1963 | if (unlikely(copied != bytes)) | ||
1964 | if (status >= 0) | ||
1965 | status = -EFAULT; | ||
1966 | unlock_page(page); | 2153 | unlock_page(page); |
1967 | mark_page_accessed(page); | 2154 | mark_page_accessed(page); |
1968 | page_cache_release(page); | 2155 | page_cache_release(page); |
1969 | if (status < 0) | 2156 | if (src_page) |
1970 | break; | 2157 | page_cache_release(src_page); |
2158 | |||
2159 | iov_iter_advance(i, copied); | ||
2160 | pos += copied; | ||
2161 | written += copied; | ||
2162 | |||
1971 | balance_dirty_pages_ratelimited(mapping); | 2163 | balance_dirty_pages_ratelimited(mapping); |
1972 | cond_resched(); | 2164 | cond_resched(); |
1973 | } while (count); | 2165 | continue; |
1974 | *ppos = pos; | ||
1975 | 2166 | ||
1976 | if (cached_page) | 2167 | fs_write_aop_error: |
1977 | page_cache_release(cached_page); | 2168 | unlock_page(page); |
2169 | page_cache_release(page); | ||
2170 | if (src_page) | ||
2171 | page_cache_release(src_page); | ||
2172 | |||
2173 | /* | ||
2174 | * prepare_write() may have instantiated a few blocks | ||
2175 | * outside i_size. Trim these off again. Don't need | ||
2176 | * i_size_read because we hold i_mutex. | ||
2177 | */ | ||
2178 | if (pos + bytes > inode->i_size) | ||
2179 | vmtruncate(inode, inode->i_size); | ||
2180 | break; | ||
2181 | } while (iov_iter_count(i)); | ||
2182 | |||
2183 | return written ? written : status; | ||
2184 | } | ||
2185 | |||
2186 | static ssize_t generic_perform_write(struct file *file, | ||
2187 | struct iov_iter *i, loff_t pos) | ||
2188 | { | ||
2189 | struct address_space *mapping = file->f_mapping; | ||
2190 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2191 | long status = 0; | ||
2192 | ssize_t written = 0; | ||
2193 | unsigned int flags = 0; | ||
1978 | 2194 | ||
1979 | /* | 2195 | /* |
1980 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | 2196 | * Copies from kernel address space cannot fail (NFSD is a big user). |
1981 | */ | 2197 | */ |
2198 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
2199 | flags |= AOP_FLAG_UNINTERRUPTIBLE; | ||
2200 | |||
2201 | do { | ||
2202 | struct page *page; | ||
2203 | pgoff_t index; /* Pagecache index for current page */ | ||
2204 | unsigned long offset; /* Offset into pagecache page */ | ||
2205 | unsigned long bytes; /* Bytes to write to page */ | ||
2206 | size_t copied; /* Bytes copied from user */ | ||
2207 | void *fsdata; | ||
2208 | |||
2209 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | ||
2210 | index = pos >> PAGE_CACHE_SHIFT; | ||
2211 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
2212 | iov_iter_count(i)); | ||
2213 | |||
2214 | again: | ||
2215 | |||
2216 | /* | ||
2217 | * Bring in the user page that we will copy from _first_. | ||
2218 | * Otherwise there's a nasty deadlock on copying from the | ||
2219 | * same page as we're writing to, without it being marked | ||
2220 | * up-to-date. | ||
2221 | * | ||
2222 | * Not only is this an optimisation, but it is also required | ||
2223 | * to check that the address is actually valid, when atomic | ||
2224 | * usercopies are used, below. | ||
2225 | */ | ||
2226 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
2227 | status = -EFAULT; | ||
2228 | break; | ||
2229 | } | ||
2230 | |||
2231 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | ||
2232 | &page, &fsdata); | ||
2233 | if (unlikely(status)) | ||
2234 | break; | ||
2235 | |||
2236 | pagefault_disable(); | ||
2237 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | ||
2238 | pagefault_enable(); | ||
2239 | flush_dcache_page(page); | ||
2240 | |||
2241 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | ||
2242 | page, fsdata); | ||
2243 | if (unlikely(status < 0)) | ||
2244 | break; | ||
2245 | copied = status; | ||
2246 | |||
2247 | cond_resched(); | ||
2248 | |||
2249 | if (unlikely(copied == 0)) { | ||
2250 | /* | ||
2251 | * If we were unable to copy any data at all, we must | ||
2252 | * fall back to a single segment length write. | ||
2253 | * | ||
2254 | * If we didn't fallback here, we could livelock | ||
2255 | * because not all segments in the iov can be copied at | ||
2256 | * once without a pagefault. | ||
2257 | */ | ||
2258 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
2259 | iov_iter_single_seg_count(i)); | ||
2260 | goto again; | ||
2261 | } | ||
2262 | iov_iter_advance(i, copied); | ||
2263 | pos += copied; | ||
2264 | written += copied; | ||
2265 | |||
2266 | balance_dirty_pages_ratelimited(mapping); | ||
2267 | |||
2268 | } while (iov_iter_count(i)); | ||
2269 | |||
2270 | return written ? written : status; | ||
2271 | } | ||
2272 | |||
2273 | ssize_t | ||
2274 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
2275 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
2276 | size_t count, ssize_t written) | ||
2277 | { | ||
2278 | struct file *file = iocb->ki_filp; | ||
2279 | struct address_space *mapping = file->f_mapping; | ||
2280 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2281 | struct inode *inode = mapping->host; | ||
2282 | ssize_t status; | ||
2283 | struct iov_iter i; | ||
2284 | |||
2285 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
2286 | if (a_ops->write_begin) | ||
2287 | status = generic_perform_write(file, &i, pos); | ||
2288 | else | ||
2289 | status = generic_perform_write_2copy(file, &i, pos); | ||
2290 | |||
1982 | if (likely(status >= 0)) { | 2291 | if (likely(status >= 0)) { |
2292 | written += status; | ||
2293 | *ppos = pos + status; | ||
2294 | |||
2295 | /* | ||
2296 | * For now, when the user asks for O_SYNC, we'll actually give | ||
2297 | * O_DSYNC | ||
2298 | */ | ||
1983 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2299 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
1984 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | 2300 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) |
1985 | status = generic_osync_inode(inode, mapping, | 2301 | status = generic_osync_inode(inode, mapping, |
@@ -1995,7 +2311,6 @@ zero_length_segment: | |||
1995 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2311 | if (unlikely(file->f_flags & O_DIRECT) && written) |
1996 | status = filemap_write_and_wait(mapping); | 2312 | status = filemap_write_and_wait(mapping); |
1997 | 2313 | ||
1998 | pagevec_lru_add(&lru_pvec); | ||
1999 | return written ? written : status; | 2314 | return written ? written : status; |
2000 | } | 2315 | } |
2001 | EXPORT_SYMBOL(generic_file_buffered_write); | 2316 | EXPORT_SYMBOL(generic_file_buffered_write); |
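[Annotation] For completeness, the filesystem side that generic_perform_write() relies on: a blockdev-backed filesystem can move off prepare_write/commit_write by wrapping its get_block routine with the block_write_begin()/generic_write_end() helpers introduced alongside this series. Hedged sketch for a hypothetical "examplefs" (examplefs_get_block, examplefs_readpage and examplefs_writepage are assumed to exist already):

	static int examplefs_write_begin(struct file *file,
			struct address_space *mapping, loff_t pos, unsigned len,
			unsigned flags, struct page **pagep, void **fsdata)
	{
		*pagep = NULL;	/* let block_write_begin find or create the page */
		return block_write_begin(file, mapping, pos, len, flags,
					 pagep, fsdata, examplefs_get_block);
	}

	static const struct address_space_operations examplefs_aops = {
		.readpage	= examplefs_readpage,
		.writepage	= examplefs_writepage,
		.write_begin	= examplefs_write_begin,
		.write_end	= generic_write_end,
	};
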
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@ | |||
1 | /* | ||
2 | * linux/mm/filemap.h | ||
3 | * | ||
4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #ifndef __FILEMAP_H | ||
8 | #define __FILEMAP_H | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/highmem.h> | ||
14 | #include <linux/uio.h> | ||
15 | #include <linux/uaccess.h> | ||
16 | |||
17 | size_t | ||
18 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
19 | const struct iovec *iov, | ||
20 | size_t base, | ||
21 | size_t bytes); | ||
22 | |||
23 | /* | ||
24 | * Copy as much as we can into the page and return the number of bytes which | ||
25 | * were sucessfully copied. If a fault is encountered then clear the page | ||
26 | * out to (offset+bytes) and return the number of bytes which were copied. | ||
27 | * | ||
28 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
29 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
30 | * and if the following non-atomic copy succeeds, then there is a small window | ||
31 | * where the target page contains neither the data before the write, nor the | ||
32 | * data after the write (it contains zero). A read at this time will see | ||
33 | * data that is inconsistent with any ordering of the read and the write. | ||
34 | * (This has been detected in practice). | ||
35 | */ | ||
36 | static inline size_t | ||
37 | filemap_copy_from_user(struct page *page, unsigned long offset, | ||
38 | const char __user *buf, unsigned bytes) | ||
39 | { | ||
40 | char *kaddr; | ||
41 | int left; | ||
42 | |||
43 | kaddr = kmap_atomic(page, KM_USER0); | ||
44 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); | ||
45 | kunmap_atomic(kaddr, KM_USER0); | ||
46 | |||
47 | if (left != 0) { | ||
48 | /* Do it the slow way */ | ||
49 | kaddr = kmap(page); | ||
50 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
51 | kunmap(page); | ||
52 | } | ||
53 | return bytes - left; | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * This has the same side effects and return value as filemap_copy_from_user(). | ||
58 | * The difference is that on a fault we need to memset the remainder of the | ||
59 | * page (out to offset+bytes), to emulate filemap_copy_from_user()'s | ||
60 | * single-segment behaviour. | ||
61 | */ | ||
62 | static inline size_t | ||
63 | filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | ||
64 | const struct iovec *iov, size_t base, size_t bytes) | ||
65 | { | ||
66 | char *kaddr; | ||
67 | size_t copied; | ||
68 | |||
69 | kaddr = kmap_atomic(page, KM_USER0); | ||
70 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
71 | base, bytes); | ||
72 | kunmap_atomic(kaddr, KM_USER0); | ||
73 | if (copied != bytes) { | ||
74 | kaddr = kmap(page); | ||
75 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
76 | base, bytes); | ||
77 | if (bytes - copied) | ||
78 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
79 | kunmap(page); | ||
80 | } | ||
81 | return copied; | ||
82 | } | ||
83 | |||
84 | static inline void | ||
85 | filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
86 | { | ||
87 | const struct iovec *iov = *iovp; | ||
88 | size_t base = *basep; | ||
89 | |||
90 | do { | ||
91 | int copy = min(bytes, iov->iov_len - base); | ||
92 | |||
93 | bytes -= copy; | ||
94 | base += copy; | ||
95 | if (iov->iov_len == base) { | ||
96 | iov++; | ||
97 | base = 0; | ||
98 | } | ||
99 | } while (bytes); | ||
100 | *iovp = iov; | ||
101 | *basep = base; | ||
102 | } | ||
103 | #endif | ||
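The deleted helpers above walked an iovec array by hand; that bookkeeping now lives in the iov_iter helpers that generic_file_buffered_write() feeds via iov_iter_init(). For reference, the segment-advance logic that filemap_set_next_iovec() implemented looks like this as a self-contained userspace sketch (struct iovec comes from <sys/uio.h>; the function name is made up):

/* Advance (*iovp, *basep) by 'bytes' across an iovec array, exactly the
 * job the deleted filemap_set_next_iovec() did.  Caller must not advance
 * past the total length of the array. */
#include <stdio.h>
#include <sys/uio.h>

static void advance_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
        const struct iovec *iov = *iovp;
        size_t base = *basep;

        while (bytes) {
                size_t copy = iov->iov_len - base;

                if (copy > bytes)
                        copy = bytes;
                bytes -= copy;
                base += copy;
                if (base == iov->iov_len) {     /* segment exhausted */
                        iov++;
                        base = 0;
                }
        }
        *iovp = iov;
        *basep = base;
}

int main(void)
{
        char a[4], b[8];
        struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        const struct iovec *cur = vec;
        size_t base = 0;

        advance_iovec(&cur, &base, 6);  /* all of a[] plus 2 bytes of b[] */
        printf("now in segment %td at offset %zu\n", cur - vec, base);
        return 0;
}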
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 53ee6a299635..32132f3cd641 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
18 | #include "filemap.h" | ||
19 | 18 | ||
20 | /* | 19 | /* |
21 | * We do use our own empty page to avoid interference with other users | 20 | * We do use our own empty page to avoid interference with other users |
@@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
288 | unsigned long index; | 287 | unsigned long index; |
289 | unsigned long offset; | 288 | unsigned long offset; |
290 | size_t copied; | 289 | size_t copied; |
290 | char *kaddr; | ||
291 | 291 | ||
292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
293 | index = pos >> PAGE_CACHE_SHIFT; | 293 | index = pos >> PAGE_CACHE_SHIFT; |
@@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
295 | if (bytes > count) | 295 | if (bytes > count) |
296 | bytes = count; | 296 | bytes = count; |
297 | 297 | ||
298 | /* | ||
299 | * Bring in the user page that we will copy from _first_. | ||
300 | * Otherwise there's a nasty deadlock on copying from the | ||
301 | * same page as we're writing to, without it being marked | ||
302 | * up-to-date. | ||
303 | */ | ||
304 | fault_in_pages_readable(buf, bytes); | ||
305 | |||
306 | page = a_ops->get_xip_page(mapping, | 298 | page = a_ops->get_xip_page(mapping, |
307 | index*(PAGE_SIZE/512), 0); | 299 | index*(PAGE_SIZE/512), 0); |
308 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { | 300 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { |
@@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
319 | break; | 311 | break; |
320 | } | 312 | } |
321 | 313 | ||
322 | copied = filemap_copy_from_user(page, offset, buf, bytes); | 314 | fault_in_pages_readable(buf, bytes); |
315 | kaddr = kmap_atomic(page, KM_USER0); | ||
316 | copied = bytes - | ||
317 | __copy_from_user_inatomic_nocache(kaddr, buf, bytes); | ||
318 | kunmap_atomic(kaddr, KM_USER0); | ||
323 | flush_dcache_page(page); | 319 | flush_dcache_page(page); |
320 | |||
324 | if (likely(copied > 0)) { | 321 | if (likely(copied > 0)) { |
325 | status = copied; | 322 | status = copied; |
326 | 323 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index 95bcb5641c72..14bd3bf7826e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | 8 | #include <linux/backing-dev.h> | |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/file.h> | 11 | #include <linux/file.h> |
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
97 | 97 | ||
98 | } | 98 | } |
99 | 99 | ||
100 | /*** | 100 | /** |
101 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store | 101 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
102 | * file within an existing vma. | ||
103 | * @start: start of the remapped virtual memory range | 102 | * @start: start of the remapped virtual memory range |
104 | * @size: size of the remapped virtual memory range | 103 | * @size: size of the remapped virtual memory range |
105 | * @prot: new protection bits of the range | 104 | * @prot: new protection bits of the range (see NOTE) |
106 | * @pgoff: to be mapped page of the backing store file | 105 | * @pgoff: to-be-mapped page of the backing store file |
107 | * @flags: 0 or MAP_NONBLOCKED - the latter will cause no IO. | 106 | * @flags: 0 or MAP_NONBLOCKED - the latter will cause no IO. |
108 | * | 107 | * |
109 | * this syscall works purely via pagetables, so it's the most efficient | 108 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma |
109 | * (shared backing store file). | ||
110 | * | ||
111 | * This syscall works purely via pagetables, so it's the most efficient | ||
110 | * way to map the same (large) file into a given virtual window. Unlike | 112 | * way to map the same (large) file into a given virtual window. Unlike |
111 | * mmap()/mremap() it does not create any new vmas. The new mappings are | 113 | * mmap()/mremap() it does not create any new vmas. The new mappings are |
112 | * also safe across swapout. | 114 | * also safe across swapout. |
113 | * | 115 | * |
114 | * NOTE: the 'prot' parameter right now is ignored, and the vma's default | 116 | * NOTE: the 'prot' parameter right now is ignored (but must be zero), |
115 | * protection is used. Arbitrary protections might be implemented in the | 117 | * and the vma's default protection is used. Arbitrary protections |
116 | * future. | 118 | * might be implemented in the future. |
117 | */ | 119 | */ |
118 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | 120 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, |
119 | unsigned long __prot, unsigned long pgoff, unsigned long flags) | 121 | unsigned long prot, unsigned long pgoff, unsigned long flags) |
120 | { | 122 | { |
121 | struct mm_struct *mm = current->mm; | 123 | struct mm_struct *mm = current->mm; |
122 | struct address_space *mapping; | 124 | struct address_space *mapping; |
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
125 | int err = -EINVAL; | 127 | int err = -EINVAL; |
126 | int has_write_lock = 0; | 128 | int has_write_lock = 0; |
127 | 129 | ||
128 | if (__prot) | 130 | if (prot) |
129 | return err; | 131 | return err; |
130 | /* | 132 | /* |
131 | * Sanitize the syscall parameters: | 133 | * Sanitize the syscall parameters: |
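The reworked kerneldoc above stresses that 'prot' must be zero. A small userspace illustration of the call, assuming a glibc that exposes the remap_file_pages(2) wrapper in <sys/mman.h> under _GNU_SOURCE:

/* Map a two-page temporary file MAP_SHARED, then rewire the first virtual
 * page to file page 1 without creating a new vma.  prot is passed as 0,
 * as the syscall requires. */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        FILE *f = tmpfile();
        char *map;

        if (!f) { perror("tmpfile"); return 1; }
        /* two file pages: page 0 = 'A's, page 1 = 'B's */
        for (long i = 0; i < 2 * psz; i++)
                fputc(i < psz ? 'A' : 'B', f);
        fflush(f);

        map = mmap(NULL, 2 * psz, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fileno(f), 0);
        if (map == MAP_FAILED) { perror("mmap"); return 1; }

        /* point the first virtual page at file page 1; prot must be 0 */
        if (remap_file_pages(map, psz, 0, 1, 0)) {
                perror("remap_file_pages");
                return 1;
        }
        printf("first byte now reads '%c'\n", map[0]);  /* expect 'B' */
        return 0;
}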
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eab8c428cc93..ae2959bb59cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -23,12 +23,16 @@ | |||
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | static unsigned long surplus_huge_pages; | ||
26 | unsigned long max_huge_pages; | 27 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 28 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 30 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
31 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
30 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 32 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
31 | unsigned long hugepages_treat_as_movable; | 33 | unsigned long hugepages_treat_as_movable; |
34 | int hugetlb_dynamic_pool; | ||
35 | static int hugetlb_next_nid; | ||
32 | 36 | ||
33 | /* | 37 | /* |
34 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 38 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
85 | list_del(&page->lru); | 89 | list_del(&page->lru); |
86 | free_huge_pages--; | 90 | free_huge_pages--; |
87 | free_huge_pages_node[nid]--; | 91 | free_huge_pages_node[nid]--; |
92 | if (vma && vma->vm_flags & VM_MAYSHARE) | ||
93 | resv_huge_pages--; | ||
88 | break; | 94 | break; |
89 | } | 95 | } |
90 | } | 96 | } |
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
92 | return page; | 98 | return page; |
93 | } | 99 | } |
94 | 100 | ||
101 | static void update_and_free_page(struct page *page) | ||
102 | { | ||
103 | int i; | ||
104 | nr_huge_pages--; | ||
105 | nr_huge_pages_node[page_to_nid(page)]--; | ||
106 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
107 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
108 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
109 | 1 << PG_private | 1<< PG_writeback); | ||
110 | } | ||
111 | set_compound_page_dtor(page, NULL); | ||
112 | set_page_refcounted(page); | ||
113 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
114 | } | ||
115 | |||
95 | static void free_huge_page(struct page *page) | 116 | static void free_huge_page(struct page *page) |
96 | { | 117 | { |
97 | BUG_ON(page_count(page)); | 118 | int nid = page_to_nid(page); |
98 | 119 | ||
120 | BUG_ON(page_count(page)); | ||
99 | INIT_LIST_HEAD(&page->lru); | 121 | INIT_LIST_HEAD(&page->lru); |
100 | 122 | ||
101 | spin_lock(&hugetlb_lock); | 123 | spin_lock(&hugetlb_lock); |
102 | enqueue_huge_page(page); | 124 | if (surplus_huge_pages_node[nid]) { |
125 | update_and_free_page(page); | ||
126 | surplus_huge_pages--; | ||
127 | surplus_huge_pages_node[nid]--; | ||
128 | } else { | ||
129 | enqueue_huge_page(page); | ||
130 | } | ||
103 | spin_unlock(&hugetlb_lock); | 131 | spin_unlock(&hugetlb_lock); |
104 | } | 132 | } |
105 | 133 | ||
106 | static int alloc_fresh_huge_page(void) | 134 | /* |
135 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
136 | * balanced by operating on them in a round-robin fashion. | ||
137 | * Returns 1 if an adjustment was made. | ||
138 | */ | ||
139 | static int adjust_pool_surplus(int delta) | ||
107 | { | 140 | { |
108 | static int prev_nid; | 141 | static int prev_nid; |
109 | struct page *page; | 142 | int nid = prev_nid; |
110 | int nid; | 143 | int ret = 0; |
144 | |||
145 | VM_BUG_ON(delta != -1 && delta != 1); | ||
146 | do { | ||
147 | nid = next_node(nid, node_online_map); | ||
148 | if (nid == MAX_NUMNODES) | ||
149 | nid = first_node(node_online_map); | ||
150 | |||
151 | /* To shrink on this node, there must be a surplus page */ | ||
152 | if (delta < 0 && !surplus_huge_pages_node[nid]) | ||
153 | continue; | ||
154 | /* Surplus cannot exceed the total number of pages */ | ||
155 | if (delta > 0 && surplus_huge_pages_node[nid] >= | ||
156 | nr_huge_pages_node[nid]) | ||
157 | continue; | ||
158 | |||
159 | surplus_huge_pages += delta; | ||
160 | surplus_huge_pages_node[nid] += delta; | ||
161 | ret = 1; | ||
162 | break; | ||
163 | } while (nid != prev_nid); | ||
111 | 164 | ||
112 | /* | ||
113 | * Copy static prev_nid to local nid, work on that, then copy it | ||
114 | * back to prev_nid afterwards: otherwise there's a window in which | ||
115 | * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
116 | * But we don't need to use a spin_lock here: it really doesn't | ||
117 | * matter if occasionally a racer chooses the same nid as we do. | ||
118 | */ | ||
119 | nid = next_node(prev_nid, node_online_map); | ||
120 | if (nid == MAX_NUMNODES) | ||
121 | nid = first_node(node_online_map); | ||
122 | prev_nid = nid; | 165 | prev_nid = nid; |
166 | return ret; | ||
167 | } | ||
168 | |||
169 | static struct page *alloc_fresh_huge_page_node(int nid) | ||
170 | { | ||
171 | struct page *page; | ||
123 | 172 | ||
124 | page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | 173 | page = alloc_pages_node(nid, |
174 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, | ||
175 | HUGETLB_PAGE_ORDER); | ||
176 | if (page) { | ||
177 | set_compound_page_dtor(page, free_huge_page); | ||
178 | spin_lock(&hugetlb_lock); | ||
179 | nr_huge_pages++; | ||
180 | nr_huge_pages_node[nid]++; | ||
181 | spin_unlock(&hugetlb_lock); | ||
182 | put_page(page); /* free it into the hugepage allocator */ | ||
183 | } | ||
184 | |||
185 | return page; | ||
186 | } | ||
187 | |||
188 | static int alloc_fresh_huge_page(void) | ||
189 | { | ||
190 | struct page *page; | ||
191 | int start_nid; | ||
192 | int next_nid; | ||
193 | int ret = 0; | ||
194 | |||
195 | start_nid = hugetlb_next_nid; | ||
196 | |||
197 | do { | ||
198 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | ||
199 | if (page) | ||
200 | ret = 1; | ||
201 | /* | ||
202 | * Use a helper variable to find the next node and then | ||
203 | * copy it back to hugetlb_next_nid afterwards: | ||
204 | * otherwise there's a window in which a racer might | ||
205 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
206 | * But we don't need to use a spin_lock here: it really | ||
207 | * doesn't matter if occasionally a racer chooses the | ||
208 | * same nid as we do. Move nid forward in the mask even | ||
209 | * if we just successfully allocated a hugepage so that | ||
210 | * the next caller gets hugepages on the next node. | ||
211 | */ | ||
212 | next_nid = next_node(hugetlb_next_nid, node_online_map); | ||
213 | if (next_nid == MAX_NUMNODES) | ||
214 | next_nid = first_node(node_online_map); | ||
215 | hugetlb_next_nid = next_nid; | ||
216 | } while (!page && hugetlb_next_nid != start_nid); | ||
217 | |||
218 | return ret; | ||
219 | } | ||
220 | |||
221 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | ||
222 | unsigned long address) | ||
223 | { | ||
224 | struct page *page; | ||
225 | |||
226 | /* Check if the dynamic pool is enabled */ | ||
227 | if (!hugetlb_dynamic_pool) | ||
228 | return NULL; | ||
229 | |||
230 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | ||
125 | HUGETLB_PAGE_ORDER); | 231 | HUGETLB_PAGE_ORDER); |
126 | if (page) { | 232 | if (page) { |
127 | set_compound_page_dtor(page, free_huge_page); | 233 | set_compound_page_dtor(page, free_huge_page); |
128 | spin_lock(&hugetlb_lock); | 234 | spin_lock(&hugetlb_lock); |
129 | nr_huge_pages++; | 235 | nr_huge_pages++; |
130 | nr_huge_pages_node[page_to_nid(page)]++; | 236 | nr_huge_pages_node[page_to_nid(page)]++; |
237 | surplus_huge_pages++; | ||
238 | surplus_huge_pages_node[page_to_nid(page)]++; | ||
131 | spin_unlock(&hugetlb_lock); | 239 | spin_unlock(&hugetlb_lock); |
132 | put_page(page); /* free it into the hugepage allocator */ | ||
133 | return 1; | ||
134 | } | 240 | } |
135 | return 0; | 241 | |
242 | return page; | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Increase the hugetlb pool such that it can accommodate a reservation | ||
247 | * of size 'delta'. | ||
248 | */ | ||
249 | static int gather_surplus_pages(int delta) | ||
250 | { | ||
251 | struct list_head surplus_list; | ||
252 | struct page *page, *tmp; | ||
253 | int ret, i; | ||
254 | int needed, allocated; | ||
255 | |||
256 | needed = (resv_huge_pages + delta) - free_huge_pages; | ||
257 | if (needed <= 0) | ||
258 | return 0; | ||
259 | |||
260 | allocated = 0; | ||
261 | INIT_LIST_HEAD(&surplus_list); | ||
262 | |||
263 | ret = -ENOMEM; | ||
264 | retry: | ||
265 | spin_unlock(&hugetlb_lock); | ||
266 | for (i = 0; i < needed; i++) { | ||
267 | page = alloc_buddy_huge_page(NULL, 0); | ||
268 | if (!page) { | ||
269 | /* | ||
270 | * We were not able to allocate enough pages to | ||
271 | * satisfy the entire reservation so we free what | ||
272 | * we've allocated so far. | ||
273 | */ | ||
274 | spin_lock(&hugetlb_lock); | ||
275 | needed = 0; | ||
276 | goto free; | ||
277 | } | ||
278 | |||
279 | list_add(&page->lru, &surplus_list); | ||
280 | } | ||
281 | allocated += needed; | ||
282 | |||
283 | /* | ||
284 | * After retaking hugetlb_lock, we need to recalculate 'needed' | ||
285 | * because either resv_huge_pages or free_huge_pages may have changed. | ||
286 | */ | ||
287 | spin_lock(&hugetlb_lock); | ||
288 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | ||
289 | if (needed > 0) | ||
290 | goto retry; | ||
291 | |||
292 | /* | ||
293 | * The surplus_list now contains _at_least_ the number of extra pages | ||
294 | * needed to accommodate the reservation. Add the appropriate number | ||
295 | * of pages to the hugetlb pool and free the extras back to the buddy | ||
296 | * allocator. | ||
297 | */ | ||
298 | needed += allocated; | ||
299 | ret = 0; | ||
300 | free: | ||
301 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | ||
302 | list_del(&page->lru); | ||
303 | if ((--needed) >= 0) | ||
304 | enqueue_huge_page(page); | ||
305 | else { | ||
306 | /* | ||
307 | * Decrement the refcount and free the page using its | ||
308 | * destructor. This must be done with hugetlb_lock | ||
309 | * unlocked which is safe because free_huge_page takes | ||
310 | * hugetlb_lock before deciding how to free the page. | ||
311 | */ | ||
312 | spin_unlock(&hugetlb_lock); | ||
313 | put_page(page); | ||
314 | spin_lock(&hugetlb_lock); | ||
315 | } | ||
316 | } | ||
317 | |||
318 | return ret; | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * When releasing a hugetlb pool reservation, any surplus pages that were | ||
323 | * allocated to satisfy the reservation must be explicitly freed if they were | ||
324 | * never used. | ||
325 | */ | ||
326 | void return_unused_surplus_pages(unsigned long unused_resv_pages) | ||
327 | { | ||
328 | static int nid = -1; | ||
329 | struct page *page; | ||
330 | unsigned long nr_pages; | ||
331 | |||
332 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | ||
333 | |||
334 | while (nr_pages) { | ||
335 | nid = next_node(nid, node_online_map); | ||
336 | if (nid == MAX_NUMNODES) | ||
337 | nid = first_node(node_online_map); | ||
338 | |||
339 | if (!surplus_huge_pages_node[nid]) | ||
340 | continue; | ||
341 | |||
342 | if (!list_empty(&hugepage_freelists[nid])) { | ||
343 | page = list_entry(hugepage_freelists[nid].next, | ||
344 | struct page, lru); | ||
345 | list_del(&page->lru); | ||
346 | update_and_free_page(page); | ||
347 | free_huge_pages--; | ||
348 | free_huge_pages_node[nid]--; | ||
349 | surplus_huge_pages--; | ||
350 | surplus_huge_pages_node[nid]--; | ||
351 | nr_pages--; | ||
352 | } | ||
353 | } | ||
136 | } | 354 | } |
137 | 355 | ||
138 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 356 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
139 | unsigned long addr) | 357 | unsigned long addr) |
140 | { | 358 | { |
141 | struct page *page; | 359 | struct page *page = NULL; |
360 | int use_reserved_page = vma->vm_flags & VM_MAYSHARE; | ||
142 | 361 | ||
143 | spin_lock(&hugetlb_lock); | 362 | spin_lock(&hugetlb_lock); |
144 | if (vma->vm_flags & VM_MAYSHARE) | 363 | if (!use_reserved_page && (free_huge_pages <= resv_huge_pages)) |
145 | resv_huge_pages--; | ||
146 | else if (free_huge_pages <= resv_huge_pages) | ||
147 | goto fail; | 364 | goto fail; |
148 | 365 | ||
149 | page = dequeue_huge_page(vma, addr); | 366 | page = dequeue_huge_page(vma, addr); |
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
155 | return page; | 372 | return page; |
156 | 373 | ||
157 | fail: | 374 | fail: |
158 | if (vma->vm_flags & VM_MAYSHARE) | ||
159 | resv_huge_pages++; | ||
160 | spin_unlock(&hugetlb_lock); | 375 | spin_unlock(&hugetlb_lock); |
161 | return NULL; | 376 | |
377 | /* | ||
378 | * Private mappings do not use reserved huge pages so the allocation | ||
379 | * may have failed due to an undersized hugetlb pool. Try to grab a | ||
380 | * surplus huge page from the buddy allocator. | ||
381 | */ | ||
382 | if (!use_reserved_page) | ||
383 | page = alloc_buddy_huge_page(vma, addr); | ||
384 | |||
385 | return page; | ||
162 | } | 386 | } |
163 | 387 | ||
164 | static int __init hugetlb_init(void) | 388 | static int __init hugetlb_init(void) |
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void) | |||
171 | for (i = 0; i < MAX_NUMNODES; ++i) | 395 | for (i = 0; i < MAX_NUMNODES; ++i) |
172 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 396 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
173 | 397 | ||
398 | hugetlb_next_nid = first_node(node_online_map); | ||
399 | |||
174 | for (i = 0; i < max_huge_pages; ++i) { | 400 | for (i = 0; i < max_huge_pages; ++i) { |
175 | if (!alloc_fresh_huge_page()) | 401 | if (!alloc_fresh_huge_page()) |
176 | break; | 402 | break; |
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
201 | } | 427 | } |
202 | 428 | ||
203 | #ifdef CONFIG_SYSCTL | 429 | #ifdef CONFIG_SYSCTL |
204 | static void update_and_free_page(struct page *page) | ||
205 | { | ||
206 | int i; | ||
207 | nr_huge_pages--; | ||
208 | nr_huge_pages_node[page_to_nid(page)]--; | ||
209 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
210 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
211 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
212 | 1 << PG_private | 1<< PG_writeback); | ||
213 | } | ||
214 | set_compound_page_dtor(page, NULL); | ||
215 | set_page_refcounted(page); | ||
216 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
217 | } | ||
218 | |||
219 | #ifdef CONFIG_HIGHMEM | 430 | #ifdef CONFIG_HIGHMEM |
220 | static void try_to_free_low(unsigned long count) | 431 | static void try_to_free_low(unsigned long count) |
221 | { | 432 | { |
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count) | |||
224 | for (i = 0; i < MAX_NUMNODES; ++i) { | 435 | for (i = 0; i < MAX_NUMNODES; ++i) { |
225 | struct page *page, *next; | 436 | struct page *page, *next; |
226 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 437 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { |
438 | if (count >= nr_huge_pages) | ||
439 | return; | ||
227 | if (PageHighMem(page)) | 440 | if (PageHighMem(page)) |
228 | continue; | 441 | continue; |
229 | list_del(&page->lru); | 442 | list_del(&page->lru); |
230 | update_and_free_page(page); | 443 | update_and_free_page(page); |
231 | free_huge_pages--; | 444 | free_huge_pages--; |
232 | free_huge_pages_node[page_to_nid(page)]--; | 445 | free_huge_pages_node[page_to_nid(page)]--; |
233 | if (count >= nr_huge_pages) | ||
234 | return; | ||
235 | } | 446 | } |
236 | } | 447 | } |
237 | } | 448 | } |
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count) | |||
241 | } | 452 | } |
242 | #endif | 453 | #endif |
243 | 454 | ||
455 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | ||
244 | static unsigned long set_max_huge_pages(unsigned long count) | 456 | static unsigned long set_max_huge_pages(unsigned long count) |
245 | { | 457 | { |
246 | while (count > nr_huge_pages) { | 458 | unsigned long min_count, ret; |
247 | if (!alloc_fresh_huge_page()) | ||
248 | return nr_huge_pages; | ||
249 | } | ||
250 | if (count >= nr_huge_pages) | ||
251 | return nr_huge_pages; | ||
252 | 459 | ||
460 | /* | ||
461 | * Increase the pool size | ||
462 | * First take pages out of surplus state. Then make up the | ||
463 | * remaining difference by allocating fresh huge pages. | ||
464 | */ | ||
253 | spin_lock(&hugetlb_lock); | 465 | spin_lock(&hugetlb_lock); |
254 | count = max(count, resv_huge_pages); | 466 | while (surplus_huge_pages && count > persistent_huge_pages) { |
255 | try_to_free_low(count); | 467 | if (!adjust_pool_surplus(-1)) |
256 | while (count < nr_huge_pages) { | 468 | break; |
469 | } | ||
470 | |||
471 | while (count > persistent_huge_pages) { | ||
472 | int ret; | ||
473 | /* | ||
474 | * If this allocation races such that we no longer need the | ||
475 | * page, free_huge_page will handle it by freeing the page | ||
476 | * and reducing the surplus. | ||
477 | */ | ||
478 | spin_unlock(&hugetlb_lock); | ||
479 | ret = alloc_fresh_huge_page(); | ||
480 | spin_lock(&hugetlb_lock); | ||
481 | if (!ret) | ||
482 | goto out; | ||
483 | |||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Decrease the pool size | ||
488 | * First return free pages to the buddy allocator (being careful | ||
489 | * to keep enough around to satisfy reservations). Then place | ||
490 | * pages into surplus state as needed so the pool will shrink | ||
491 | * to the desired size as pages become free. | ||
492 | */ | ||
493 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | ||
494 | min_count = max(count, min_count); | ||
495 | try_to_free_low(min_count); | ||
496 | while (min_count < persistent_huge_pages) { | ||
257 | struct page *page = dequeue_huge_page(NULL, 0); | 497 | struct page *page = dequeue_huge_page(NULL, 0); |
258 | if (!page) | 498 | if (!page) |
259 | break; | 499 | break; |
260 | update_and_free_page(page); | 500 | update_and_free_page(page); |
261 | } | 501 | } |
502 | while (count < persistent_huge_pages) { | ||
503 | if (!adjust_pool_surplus(1)) | ||
504 | break; | ||
505 | } | ||
506 | out: | ||
507 | ret = persistent_huge_pages; | ||
262 | spin_unlock(&hugetlb_lock); | 508 | spin_unlock(&hugetlb_lock); |
263 | return nr_huge_pages; | 509 | return ret; |
264 | } | 510 | } |
265 | 511 | ||
266 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 512 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf) | |||
292 | "HugePages_Total: %5lu\n" | 538 | "HugePages_Total: %5lu\n" |
293 | "HugePages_Free: %5lu\n" | 539 | "HugePages_Free: %5lu\n" |
294 | "HugePages_Rsvd: %5lu\n" | 540 | "HugePages_Rsvd: %5lu\n" |
541 | "HugePages_Surp: %5lu\n" | ||
295 | "Hugepagesize: %5lu kB\n", | 542 | "Hugepagesize: %5lu kB\n", |
296 | nr_huge_pages, | 543 | nr_huge_pages, |
297 | free_huge_pages, | 544 | free_huge_pages, |
298 | resv_huge_pages, | 545 | resv_huge_pages, |
546 | surplus_huge_pages, | ||
299 | HPAGE_SIZE/1024); | 547 | HPAGE_SIZE/1024); |
300 | } | 548 | } |
301 | 549 | ||
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
355 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | 603 | entry = pte_mkwrite(pte_mkdirty(*ptep)); |
356 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 604 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
357 | update_mmu_cache(vma, address, entry); | 605 | update_mmu_cache(vma, address, entry); |
358 | lazy_mmu_prot_update(entry); | ||
359 | } | 606 | } |
360 | } | 607 | } |
361 | 608 | ||
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
708 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 955 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
709 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 956 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
710 | set_huge_pte_at(mm, address, ptep, pte); | 957 | set_huge_pte_at(mm, address, ptep, pte); |
711 | lazy_mmu_prot_update(pte); | ||
712 | } | 958 | } |
713 | } | 959 | } |
714 | spin_unlock(&mm->page_table_lock); | 960 | spin_unlock(&mm->page_table_lock); |
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta) | |||
843 | int ret = -ENOMEM; | 1089 | int ret = -ENOMEM; |
844 | 1090 | ||
845 | spin_lock(&hugetlb_lock); | 1091 | spin_lock(&hugetlb_lock); |
846 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
847 | resv_huge_pages += delta; | ||
848 | ret = 0; | ||
849 | } | ||
850 | spin_unlock(&hugetlb_lock); | ||
851 | return ret; | ||
852 | } | ||
853 | |||
854 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
855 | { | ||
856 | long ret, chg; | ||
857 | |||
858 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
859 | if (chg < 0) | ||
860 | return chg; | ||
861 | /* | 1092 | /* |
862 | * When cpuset is configured, it breaks the strict hugetlb page | 1093 | * When cpuset is configured, it breaks the strict hugetlb page |
863 | * reservation as the accounting is done on a global variable. Such | 1094 | * reservation as the accounting is done on a global variable. Such |
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) | |||
875 | * a best attempt and hopefully to minimize the impact of changing | 1106 | * a best attempt and hopefully to minimize the impact of changing |
876 | * semantics that cpuset has. | 1107 | * semantics that cpuset has. |
877 | */ | 1108 | */ |
878 | if (chg > cpuset_mems_nr(free_huge_pages_node)) | 1109 | if (delta > 0) { |
879 | return -ENOMEM; | 1110 | if (gather_surplus_pages(delta) < 0) |
1111 | goto out; | ||
1112 | |||
1113 | if (delta > cpuset_mems_nr(free_huge_pages_node)) | ||
1114 | goto out; | ||
1115 | } | ||
1116 | |||
1117 | ret = 0; | ||
1118 | resv_huge_pages += delta; | ||
1119 | if (delta < 0) | ||
1120 | return_unused_surplus_pages((unsigned long) -delta); | ||
1121 | |||
1122 | out: | ||
1123 | spin_unlock(&hugetlb_lock); | ||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1127 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
1128 | { | ||
1129 | long ret, chg; | ||
1130 | |||
1131 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
1132 | if (chg < 0) | ||
1133 | return chg; | ||
880 | 1134 | ||
881 | ret = hugetlb_acct_memory(chg); | 1135 | ret = hugetlb_acct_memory(chg); |
882 | if (ret < 0) | 1136 | if (ret < 0) |
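The surplus-page machinery added above hangs together through one invariant: the persistent pool is nr_huge_pages - surplus_huge_pages, and a freed huge page either re-enters that pool or, if its node still has surplus pages outstanding, goes straight back to the buddy allocator. A toy userspace model of that accounting follows (illustrative only: no locking, invented function names).

/* Toy model of the surplus accounting: a freed huge page either returns
 * to the persistent pool or, if its node still has surplus pages
 * outstanding, is handed back to the "buddy" allocator while the surplus
 * counters shrink. */
#include <stdio.h>

#define NODES 2
static unsigned long nr_huge_pages, free_huge_pages, surplus_huge_pages;
static unsigned long nr_node[NODES], free_node[NODES], surplus_node[NODES];

static void alloc_surplus(int nid)              /* cf. alloc_buddy_huge_page() */
{
        nr_huge_pages++;      nr_node[nid]++;
        surplus_huge_pages++; surplus_node[nid]++;
}

static void free_huge(int nid)                  /* cf. free_huge_page() */
{
        if (surplus_node[nid]) {                /* back to the buddy allocator */
                nr_huge_pages--;      nr_node[nid]--;
                surplus_huge_pages--; surplus_node[nid]--;
        } else {                                /* keep it in the pool */
                free_huge_pages++;    free_node[nid]++;
        }
}

static void show(const char *when)
{
        printf("%-14s total=%lu free=%lu surplus=%lu persistent=%lu\n",
               when, nr_huge_pages, free_huge_pages, surplus_huge_pages,
               nr_huge_pages - surplus_huge_pages);
}

int main(void)
{
        nr_huge_pages = 4;                      /* boot-time pool, all in use */
        nr_node[0] = 4;

        show("boot");
        alloc_surplus(1);                       /* demand exceeded the pool */
        show("after surplus");
        free_huge(1);                           /* surplus page disappears */
        free_huge(0);                           /* persistent page is kept */
        show("after frees");
        return 0;
}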
diff --git a/mm/internal.h b/mm/internal.h index a3110c02aea7..953f941ea867 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -37,4 +37,14 @@ static inline void __put_page(struct page *page) | |||
37 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
38 | unsigned int order); | 38 | unsigned int order); |
39 | 39 | ||
40 | /* | ||
41 | * Function for dealing with a page's order in the buddy system. | ||
42 | * zone->lock is already acquired when we use these. | ||
43 | * So, we don't need atomic page->flags operations here. | ||
44 | */ | ||
45 | static inline unsigned long page_order(struct page *page) | ||
46 | { | ||
47 | VM_BUG_ON(!PageBuddy(page)); | ||
48 | return page_private(page); | ||
49 | } | ||
40 | #endif | 50 | #endif |
diff --git a/mm/memory.c b/mm/memory.c index f82b359b2745..bd16dcaeefb8 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -966,7 +966,7 @@ no_page_table: | |||
966 | * has touched so far, we don't want to allocate page tables. | 966 | * has touched so far, we don't want to allocate page tables. |
967 | */ | 967 | */ |
968 | if (flags & FOLL_ANON) { | 968 | if (flags & FOLL_ANON) { |
969 | page = ZERO_PAGE(address); | 969 | page = ZERO_PAGE(0); |
970 | if (flags & FOLL_GET) | 970 | if (flags & FOLL_GET) |
971 | get_page(page); | 971 | get_page(page); |
972 | BUG_ON(flags & FOLL_WRITE); | 972 | BUG_ON(flags & FOLL_WRITE); |
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1111 | } | 1111 | } |
1112 | EXPORT_SYMBOL(get_user_pages); | 1112 | EXPORT_SYMBOL(get_user_pages); |
1113 | 1113 | ||
1114 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
1115 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
1116 | { | ||
1117 | pte_t *pte; | ||
1118 | spinlock_t *ptl; | ||
1119 | int err = 0; | ||
1120 | |||
1121 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | ||
1122 | if (!pte) | ||
1123 | return -EAGAIN; | ||
1124 | arch_enter_lazy_mmu_mode(); | ||
1125 | do { | ||
1126 | struct page *page = ZERO_PAGE(addr); | ||
1127 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | ||
1128 | |||
1129 | if (unlikely(!pte_none(*pte))) { | ||
1130 | err = -EEXIST; | ||
1131 | pte++; | ||
1132 | break; | ||
1133 | } | ||
1134 | page_cache_get(page); | ||
1135 | page_add_file_rmap(page); | ||
1136 | inc_mm_counter(mm, file_rss); | ||
1137 | set_pte_at(mm, addr, pte, zero_pte); | ||
1138 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
1139 | arch_leave_lazy_mmu_mode(); | ||
1140 | pte_unmap_unlock(pte - 1, ptl); | ||
1141 | return err; | ||
1142 | } | ||
1143 | |||
1144 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
1145 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
1146 | { | ||
1147 | pmd_t *pmd; | ||
1148 | unsigned long next; | ||
1149 | int err; | ||
1150 | |||
1151 | pmd = pmd_alloc(mm, pud, addr); | ||
1152 | if (!pmd) | ||
1153 | return -EAGAIN; | ||
1154 | do { | ||
1155 | next = pmd_addr_end(addr, end); | ||
1156 | err = zeromap_pte_range(mm, pmd, addr, next, prot); | ||
1157 | if (err) | ||
1158 | break; | ||
1159 | } while (pmd++, addr = next, addr != end); | ||
1160 | return err; | ||
1161 | } | ||
1162 | |||
1163 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
1164 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
1165 | { | ||
1166 | pud_t *pud; | ||
1167 | unsigned long next; | ||
1168 | int err; | ||
1169 | |||
1170 | pud = pud_alloc(mm, pgd, addr); | ||
1171 | if (!pud) | ||
1172 | return -EAGAIN; | ||
1173 | do { | ||
1174 | next = pud_addr_end(addr, end); | ||
1175 | err = zeromap_pmd_range(mm, pud, addr, next, prot); | ||
1176 | if (err) | ||
1177 | break; | ||
1178 | } while (pud++, addr = next, addr != end); | ||
1179 | return err; | ||
1180 | } | ||
1181 | |||
1182 | int zeromap_page_range(struct vm_area_struct *vma, | ||
1183 | unsigned long addr, unsigned long size, pgprot_t prot) | ||
1184 | { | ||
1185 | pgd_t *pgd; | ||
1186 | unsigned long next; | ||
1187 | unsigned long end = addr + size; | ||
1188 | struct mm_struct *mm = vma->vm_mm; | ||
1189 | int err; | ||
1190 | |||
1191 | BUG_ON(addr >= end); | ||
1192 | pgd = pgd_offset(mm, addr); | ||
1193 | flush_cache_range(vma, addr, end); | ||
1194 | do { | ||
1195 | next = pgd_addr_end(addr, end); | ||
1196 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | ||
1197 | if (err) | ||
1198 | break; | ||
1199 | } while (pgd++, addr = next, addr != end); | ||
1200 | return err; | ||
1201 | } | ||
1202 | |||
1203 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | 1114 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) |
1204 | { | 1115 | { |
1205 | pgd_t * pgd = pgd_offset(mm, addr); | 1116 | pgd_t * pgd = pgd_offset(mm, addr); |
@@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1700 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1611 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
1701 | entry = pte_mkyoung(orig_pte); | 1612 | entry = pte_mkyoung(orig_pte); |
1702 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1613 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1703 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) { | 1614 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
1704 | update_mmu_cache(vma, address, entry); | 1615 | update_mmu_cache(vma, address, entry); |
1705 | lazy_mmu_prot_update(entry); | ||
1706 | } | ||
1707 | ret |= VM_FAULT_WRITE; | 1616 | ret |= VM_FAULT_WRITE; |
1708 | goto unlock; | 1617 | goto unlock; |
1709 | } | 1618 | } |
@@ -1717,16 +1626,11 @@ gotten: | |||
1717 | 1626 | ||
1718 | if (unlikely(anon_vma_prepare(vma))) | 1627 | if (unlikely(anon_vma_prepare(vma))) |
1719 | goto oom; | 1628 | goto oom; |
1720 | if (old_page == ZERO_PAGE(address)) { | 1629 | VM_BUG_ON(old_page == ZERO_PAGE(0)); |
1721 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 1630 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1722 | if (!new_page) | 1631 | if (!new_page) |
1723 | goto oom; | 1632 | goto oom; |
1724 | } else { | 1633 | cow_user_page(new_page, old_page, address, vma); |
1725 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
1726 | if (!new_page) | ||
1727 | goto oom; | ||
1728 | cow_user_page(new_page, old_page, address, vma); | ||
1729 | } | ||
1730 | 1634 | ||
1731 | /* | 1635 | /* |
1732 | * Re-check the pte - we dropped the lock | 1636 | * Re-check the pte - we dropped the lock |
@@ -1744,7 +1648,6 @@ gotten: | |||
1744 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1648 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
1745 | entry = mk_pte(new_page, vma->vm_page_prot); | 1649 | entry = mk_pte(new_page, vma->vm_page_prot); |
1746 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1650 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1747 | lazy_mmu_prot_update(entry); | ||
1748 | /* | 1651 | /* |
1749 | * Clear the pte entry and flush it first, before updating the | 1652 | * Clear the pte entry and flush it first, before updating the |
1750 | * pte with the new entry. This will avoid a race condition | 1653 | * pte with the new entry. This will avoid a race condition |
@@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2252 | spinlock_t *ptl; | 2155 | spinlock_t *ptl; |
2253 | pte_t entry; | 2156 | pte_t entry; |
2254 | 2157 | ||
2255 | if (write_access) { | 2158 | /* Allocate our own private page. */ |
2256 | /* Allocate our own private page. */ | 2159 | pte_unmap(page_table); |
2257 | pte_unmap(page_table); | ||
2258 | |||
2259 | if (unlikely(anon_vma_prepare(vma))) | ||
2260 | goto oom; | ||
2261 | page = alloc_zeroed_user_highpage_movable(vma, address); | ||
2262 | if (!page) | ||
2263 | goto oom; | ||
2264 | |||
2265 | entry = mk_pte(page, vma->vm_page_prot); | ||
2266 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
2267 | 2160 | ||
2268 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2161 | if (unlikely(anon_vma_prepare(vma))) |
2269 | if (!pte_none(*page_table)) | 2162 | goto oom; |
2270 | goto release; | 2163 | page = alloc_zeroed_user_highpage_movable(vma, address); |
2271 | inc_mm_counter(mm, anon_rss); | 2164 | if (!page) |
2272 | lru_cache_add_active(page); | 2165 | goto oom; |
2273 | page_add_new_anon_rmap(page, vma, address); | ||
2274 | } else { | ||
2275 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | ||
2276 | page = ZERO_PAGE(address); | ||
2277 | page_cache_get(page); | ||
2278 | entry = mk_pte(page, vma->vm_page_prot); | ||
2279 | 2166 | ||
2280 | ptl = pte_lockptr(mm, pmd); | 2167 | entry = mk_pte(page, vma->vm_page_prot); |
2281 | spin_lock(ptl); | 2168 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2282 | if (!pte_none(*page_table)) | ||
2283 | goto release; | ||
2284 | inc_mm_counter(mm, file_rss); | ||
2285 | page_add_file_rmap(page); | ||
2286 | } | ||
2287 | 2169 | ||
2170 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2171 | if (!pte_none(*page_table)) | ||
2172 | goto release; | ||
2173 | inc_mm_counter(mm, anon_rss); | ||
2174 | lru_cache_add_active(page); | ||
2175 | page_add_new_anon_rmap(page, vma, address); | ||
2288 | set_pte_at(mm, address, page_table, entry); | 2176 | set_pte_at(mm, address, page_table, entry); |
2289 | 2177 | ||
2290 | /* No need to invalidate - it was non-present before */ | 2178 | /* No need to invalidate - it was non-present before */ |
2291 | update_mmu_cache(vma, address, entry); | 2179 | update_mmu_cache(vma, address, entry); |
2292 | lazy_mmu_prot_update(entry); | ||
2293 | unlock: | 2180 | unlock: |
2294 | pte_unmap_unlock(page_table, ptl); | 2181 | pte_unmap_unlock(page_table, ptl); |
2295 | return 0; | 2182 | return 0; |
@@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2442 | 2329 | ||
2443 | /* no need to invalidate: a not-present page won't be cached */ | 2330 | /* no need to invalidate: a not-present page won't be cached */ |
2444 | update_mmu_cache(vma, address, entry); | 2331 | update_mmu_cache(vma, address, entry); |
2445 | lazy_mmu_prot_update(entry); | ||
2446 | } else { | 2332 | } else { |
2447 | if (anon) | 2333 | if (anon) |
2448 | page_cache_release(page); | 2334 | page_cache_release(page); |
@@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2470 | int write_access, pte_t orig_pte) | 2356 | int write_access, pte_t orig_pte) |
2471 | { | 2357 | { |
2472 | pgoff_t pgoff = (((address & PAGE_MASK) | 2358 | pgoff_t pgoff = (((address & PAGE_MASK) |
2473 | - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; | 2359 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
2474 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | 2360 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); |
2475 | 2361 | ||
2476 | pte_unmap(page_table); | 2362 | pte_unmap(page_table); |
@@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2614 | entry = pte_mkyoung(entry); | 2500 | entry = pte_mkyoung(entry); |
2615 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2501 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { |
2616 | update_mmu_cache(vma, address, entry); | 2502 | update_mmu_cache(vma, address, entry); |
2617 | lazy_mmu_prot_update(entry); | ||
2618 | } else { | 2503 | } else { |
2619 | /* | 2504 | /* |
2620 | * This is needed only for protection faults but the arch code | 2505 | * This is needed only for protection faults but the arch code |
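Dropping ZERO_PAGE from do_anonymous_page() and do_wp_page() changes only how the kernel backs a read fault on untouched anonymous memory (a freshly zeroed page instead of a shared zero page); userspace still sees zeroes either way. A quick demo of that invariant:

/* Never-written anonymous memory reads back as zeroes; only the backing
 * differs after this patch. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        unsigned long sum = 0;

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        for (size_t i = 0; i < len; i++)        /* read-only touch */
                sum += p[i];
        printf("sum over 1 MiB of never-written anon memory: %lu\n", sum);
        munmap(p, len);
        return 0;
}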
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index df9d554bea30..091b9c6c2529 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -23,6 +23,9 @@ | |||
23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
24 | #include <linux/ioport.h> | 24 | #include <linux/ioport.h> |
25 | #include <linux/cpuset.h> | 25 | #include <linux/cpuset.h> |
26 | #include <linux/delay.h> | ||
27 | #include <linux/migrate.h> | ||
28 | #include <linux/page-isolation.h> | ||
26 | 29 | ||
27 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
28 | 31 | ||
@@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat, | |||
161 | pgdat->node_start_pfn; | 164 | pgdat->node_start_pfn; |
162 | } | 165 | } |
163 | 166 | ||
164 | int online_pages(unsigned long pfn, unsigned long nr_pages) | 167 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
168 | void *arg) | ||
165 | { | 169 | { |
166 | unsigned long i; | 170 | unsigned long i; |
171 | unsigned long onlined_pages = *(unsigned long *)arg; | ||
172 | struct page *page; | ||
173 | if (PageReserved(pfn_to_page(start_pfn))) | ||
174 | for (i = 0; i < nr_pages; i++) { | ||
175 | page = pfn_to_page(start_pfn + i); | ||
176 | online_page(page); | ||
177 | onlined_pages++; | ||
178 | } | ||
179 | *(unsigned long *)arg = onlined_pages; | ||
180 | return 0; | ||
181 | } | ||
182 | |||
183 | |||
184 | int online_pages(unsigned long pfn, unsigned long nr_pages) | ||
185 | { | ||
167 | unsigned long flags; | 186 | unsigned long flags; |
168 | unsigned long onlined_pages = 0; | 187 | unsigned long onlined_pages = 0; |
169 | struct resource res; | ||
170 | u64 section_end; | ||
171 | unsigned long start_pfn; | ||
172 | struct zone *zone; | 188 | struct zone *zone; |
173 | int need_zonelists_rebuild = 0; | 189 | int need_zonelists_rebuild = 0; |
174 | 190 | ||
@@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
191 | if (!populated_zone(zone)) | 207 | if (!populated_zone(zone)) |
192 | need_zonelists_rebuild = 1; | 208 | need_zonelists_rebuild = 1; |
193 | 209 | ||
194 | res.start = (u64)pfn << PAGE_SHIFT; | 210 | walk_memory_resource(pfn, nr_pages, &onlined_pages, |
195 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | 211 | online_pages_range); |
196 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | ||
197 | section_end = res.end; | ||
198 | |||
199 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { | ||
200 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
201 | nr_pages = (unsigned long) | ||
202 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
203 | |||
204 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
205 | /* this region's page is not onlined now */ | ||
206 | for (i = 0; i < nr_pages; i++) { | ||
207 | struct page *page = pfn_to_page(start_pfn + i); | ||
208 | online_page(page); | ||
209 | onlined_pages++; | ||
210 | } | ||
211 | } | ||
212 | |||
213 | res.start = res.end + 1; | ||
214 | res.end = section_end; | ||
215 | } | ||
216 | zone->present_pages += onlined_pages; | 212 | zone->present_pages += onlined_pages; |
217 | zone->zone_pgdat->node_present_pages += onlined_pages; | 213 | zone->zone_pgdat->node_present_pages += onlined_pages; |
218 | 214 | ||
219 | setup_per_zone_pages_min(); | 215 | setup_per_zone_pages_min(); |
216 | if (onlined_pages) { | ||
217 | kswapd_run(zone_to_nid(zone)); | ||
218 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
219 | } | ||
220 | 220 | ||
221 | if (need_zonelists_rebuild) | 221 | if (need_zonelists_rebuild) |
222 | build_all_zonelists(); | 222 | build_all_zonelists(); |
@@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size) | |||
271 | if (!pgdat) | 271 | if (!pgdat) |
272 | return -ENOMEM; | 272 | return -ENOMEM; |
273 | new_pgdat = 1; | 273 | new_pgdat = 1; |
274 | ret = kswapd_run(nid); | ||
275 | if (ret) | ||
276 | goto error; | ||
277 | } | 274 | } |
278 | 275 | ||
279 | /* call arch's memory hotadd */ | 276 | /* call arch's memory hotadd */ |
@@ -308,3 +305,260 @@ error: | |||
308 | return ret; | 305 | return ret; |
309 | } | 306 | } |
310 | EXPORT_SYMBOL_GPL(add_memory); | 307 | EXPORT_SYMBOL_GPL(add_memory); |
308 | |||
309 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
310 | /* | ||
311 | * Confirm that all pages in a range [start, end) belong to the same zone. | ||
312 | */ | ||
313 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | ||
314 | { | ||
315 | unsigned long pfn; | ||
316 | struct zone *zone = NULL; | ||
317 | struct page *page; | ||
318 | int i; | ||
319 | for (pfn = start_pfn; | ||
320 | pfn < end_pfn; | ||
321 | pfn += MAX_ORDER_NR_PAGES) { | ||
322 | i = 0; | ||
323 | /* This is just a CONFIG_HOLES_IN_ZONE check.*/ | ||
324 | while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) | ||
325 | i++; | ||
326 | if (i == MAX_ORDER_NR_PAGES) | ||
327 | continue; | ||
328 | page = pfn_to_page(pfn + i); | ||
329 | if (zone && page_zone(page) != zone) | ||
330 | return 0; | ||
331 | zone = page_zone(page); | ||
332 | } | ||
333 | return 1; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Scanning pfn is much easier than scanning lru list. | ||
338 | * Scan pfn from start to end and Find LRU page. | ||
339 | */ | ||
340 | int scan_lru_pages(unsigned long start, unsigned long end) | ||
341 | { | ||
342 | unsigned long pfn; | ||
343 | struct page *page; | ||
344 | for (pfn = start; pfn < end; pfn++) { | ||
345 | if (pfn_valid(pfn)) { | ||
346 | page = pfn_to_page(pfn); | ||
347 | if (PageLRU(page)) | ||
348 | return pfn; | ||
349 | } | ||
350 | } | ||
351 | return 0; | ||
352 | } | ||
353 | |||
354 | static struct page * | ||
355 | hotremove_migrate_alloc(struct page *page, | ||
356 | unsigned long private, | ||
357 | int **x) | ||
358 | { | ||
359 | /* This should be improoooooved!! */ | ||
360 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | ||
361 | } | ||
362 | |||
363 | |||
364 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | ||
365 | static int | ||
366 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | ||
367 | { | ||
368 | unsigned long pfn; | ||
369 | struct page *page; | ||
370 | int move_pages = NR_OFFLINE_AT_ONCE_PAGES; | ||
371 | int not_managed = 0; | ||
372 | int ret = 0; | ||
373 | LIST_HEAD(source); | ||
374 | |||
375 | for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { | ||
376 | if (!pfn_valid(pfn)) | ||
377 | continue; | ||
378 | page = pfn_to_page(pfn); | ||
379 | if (!page_count(page)) | ||
380 | continue; | ||
381 | /* | ||
382 | * We can skip free pages. And we can only deal with pages on | ||
383 | * LRU. | ||
384 | */ | ||
385 | ret = isolate_lru_page(page, &source); | ||
386 | if (!ret) { /* Success */ | ||
387 | move_pages--; | ||
388 | } else { | ||
389 | /* Because we don't hold the big zone->lock, we should | ||
390 | check this again here. */ | ||
391 | if (page_count(page)) | ||
392 | not_managed++; | ||
393 | #ifdef CONFIG_DEBUG_VM | ||
394 | printk(KERN_INFO "removing from LRU failed" | ||
395 | " %lx/%d/%lx\n", | ||
396 | pfn, page_count(page), page->flags); | ||
397 | #endif | ||
398 | } | ||
399 | } | ||
400 | ret = -EBUSY; | ||
401 | if (not_managed) { | ||
402 | if (!list_empty(&source)) | ||
403 | putback_lru_pages(&source); | ||
404 | goto out; | ||
405 | } | ||
406 | ret = 0; | ||
407 | if (list_empty(&source)) | ||
408 | goto out; | ||
409 | /* this function returns # of failed pages */ | ||
410 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | ||
411 | |||
412 | out: | ||
413 | return ret; | ||
414 | } | ||
415 | |||
416 | /* | ||
417 | * remove from free_area[] and mark all as Reserved. | ||
418 | */ | ||
419 | static int | ||
420 | offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, | ||
421 | void *data) | ||
422 | { | ||
423 | __offline_isolated_pages(start, start + nr_pages); | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | static void | ||
428 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
429 | { | ||
430 | walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, | ||
431 | offline_isolated_pages_cb); | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Check that all pages in the range, recorded as a memory resource, are isolated. | ||
436 | */ | ||
437 | static int | ||
438 | check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | ||
439 | void *data) | ||
440 | { | ||
441 | int ret; | ||
442 | long offlined = *(long *)data; | ||
443 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | ||
444 | offlined = nr_pages; | ||
445 | if (!ret) | ||
446 | *(long *)data += offlined; | ||
447 | return ret; | ||
448 | } | ||
449 | |||
450 | static long | ||
451 | check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
452 | { | ||
453 | long offlined = 0; | ||
454 | int ret; | ||
455 | |||
456 | ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, | ||
457 | check_pages_isolated_cb); | ||
458 | if (ret < 0) | ||
459 | offlined = (long)ret; | ||
460 | return offlined; | ||
461 | } | ||
462 | |||
463 | extern void drain_all_local_pages(void); | ||
464 | |||
465 | int offline_pages(unsigned long start_pfn, | ||
466 | unsigned long end_pfn, unsigned long timeout) | ||
467 | { | ||
468 | unsigned long pfn, nr_pages, expire; | ||
469 | long offlined_pages; | ||
470 | int ret, drain, retry_max; | ||
471 | struct zone *zone; | ||
472 | |||
473 | BUG_ON(start_pfn >= end_pfn); | ||
474 | /* at least, alignment against pageblock is necessary */ | ||
475 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) | ||
476 | return -EINVAL; | ||
477 | if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) | ||
478 | return -EINVAL; | ||
479 | /* This makes hotplug much easier... and more readable. | ||
480 | We assume this for now. */ | ||
481 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | ||
482 | return -EINVAL; | ||
483 | /* set above range as isolated */ | ||
484 | ret = start_isolate_page_range(start_pfn, end_pfn); | ||
485 | if (ret) | ||
486 | return ret; | ||
487 | nr_pages = end_pfn - start_pfn; | ||
488 | pfn = start_pfn; | ||
489 | expire = jiffies + timeout; | ||
490 | drain = 0; | ||
491 | retry_max = 5; | ||
492 | repeat: | ||
493 | /* start memory hot removal */ | ||
494 | ret = -EAGAIN; | ||
495 | if (time_after(jiffies, expire)) | ||
496 | goto failed_removal; | ||
497 | ret = -EINTR; | ||
498 | if (signal_pending(current)) | ||
499 | goto failed_removal; | ||
500 | ret = 0; | ||
501 | if (drain) { | ||
502 | lru_add_drain_all(); | ||
503 | flush_scheduled_work(); | ||
504 | cond_resched(); | ||
505 | drain_all_local_pages(); | ||
506 | } | ||
507 | |||
508 | pfn = scan_lru_pages(start_pfn, end_pfn); | ||
509 | if (pfn) { /* We have page on LRU */ | ||
510 | ret = do_migrate_range(pfn, end_pfn); | ||
511 | if (!ret) { | ||
512 | drain = 1; | ||
513 | goto repeat; | ||
514 | } else { | ||
515 | if (ret < 0) | ||
516 | if (--retry_max == 0) | ||
517 | goto failed_removal; | ||
518 | yield(); | ||
519 | drain = 1; | ||
520 | goto repeat; | ||
521 | } | ||
522 | } | ||
523 | /* drain every zone's lru pagevecs, this is asynchronous... */ | ||
524 | lru_add_drain_all(); | ||
525 | flush_scheduled_work(); | ||
526 | yield(); | ||
527 | /* drain pcp pages, this is synchronous. */ | ||
528 | drain_all_local_pages(); | ||
529 | /* check again */ | ||
530 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | ||
531 | if (offlined_pages < 0) { | ||
532 | ret = -EBUSY; | ||
533 | goto failed_removal; | ||
534 | } | ||
535 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); | ||
536 | /* Ok, all of our target is isolated. | ||
537 | We cannot do rollback at this point. */ | ||
538 | offline_isolated_pages(start_pfn, end_pfn); | ||
539 | /* reset pagetype flags */ | ||
540 | start_isolate_page_range(start_pfn, end_pfn); | ||
541 | /* removal success */ | ||
542 | zone = page_zone(pfn_to_page(start_pfn)); | ||
543 | zone->present_pages -= offlined_pages; | ||
544 | zone->zone_pgdat->node_present_pages -= offlined_pages; | ||
545 | totalram_pages -= offlined_pages; | ||
546 | num_physpages -= offlined_pages; | ||
547 | vm_total_pages = nr_free_pagecache_pages(); | ||
548 | writeback_set_ratelimit(); | ||
549 | return 0; | ||
550 | |||
551 | failed_removal: | ||
552 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | ||
553 | start_pfn, end_pfn); | ||
554 | /* pushback to free area */ | ||
555 | undo_isolate_page_range(start_pfn, end_pfn); | ||
556 | return ret; | ||
557 | } | ||
558 | #else | ||
559 | int remove_memory(u64 start, u64 size) | ||
560 | { | ||
561 | return -EINVAL; | ||
562 | } | ||
563 | EXPORT_SYMBOL_GPL(remove_memory); | ||
564 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
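The new offline_pages() path is normally reached from the memory-hotplug sysfs interface, i.e. writing "offline" to a memory block's state file. A small helper that does just that (the path below is the standard memory-block sysfs layout; requires root and a kernel built with CONFIG_MEMORY_HOTREMOVE):

/* Request offlining of one memory block; errors such as -EBUSY from
 * offline_pages() surface as a failed write to the state file. */
#include <stdio.h>

int main(int argc, char **argv)
{
        char path[128];
        FILE *f;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <memory-block-number>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path),
                 "/sys/devices/system/memory/memory%s/state", argv[1]);
        f = fopen(path, "w");
        if (!f) { perror(path); return 1; }
        if (fputs("offline", f) == EOF || fclose(f) == EOF) {
                perror("offline");
                return 1;
        }
        printf("%s: offline requested\n", path);
        return 0;
}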
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d6ac9505d07..568152ae6caf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -72,7 +72,6 @@ | |||
72 | #include <linux/hugetlb.h> | 72 | #include <linux/hugetlb.h> |
73 | #include <linux/kernel.h> | 73 | #include <linux/kernel.h> |
74 | #include <linux/sched.h> | 74 | #include <linux/sched.h> |
75 | #include <linux/mm.h> | ||
76 | #include <linux/nodemask.h> | 75 | #include <linux/nodemask.h> |
77 | #include <linux/cpuset.h> | 76 | #include <linux/cpuset.h> |
78 | #include <linux/gfp.h> | 77 | #include <linux/gfp.h> |
@@ -82,13 +81,13 @@ | |||
82 | #include <linux/interrupt.h> | 81 | #include <linux/interrupt.h> |
83 | #include <linux/init.h> | 82 | #include <linux/init.h> |
84 | #include <linux/compat.h> | 83 | #include <linux/compat.h> |
85 | #include <linux/mempolicy.h> | ||
86 | #include <linux/swap.h> | 84 | #include <linux/swap.h> |
87 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
90 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
91 | #include <linux/security.h> | 89 | #include <linux/security.h> |
90 | #include <linux/syscalls.h> | ||
92 | 91 | ||
93 | #include <asm/tlbflush.h> | 92 | #include <asm/tlbflush.h> |
94 | #include <asm/uaccess.h> | 93 | #include <asm/uaccess.h> |
@@ -110,6 +109,9 @@ struct mempolicy default_policy = { | |||
110 | .policy = MPOL_DEFAULT, | 109 | .policy = MPOL_DEFAULT, |
111 | }; | 110 | }; |
112 | 111 | ||
112 | static void mpol_rebind_policy(struct mempolicy *pol, | ||
113 | const nodemask_t *newmask); | ||
114 | |||
113 | /* Do sanity checking on a policy */ | 115 | /* Do sanity checking on a policy */ |
114 | static int mpol_check_policy(int mode, nodemask_t *nodes) | 116 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
115 | { | 117 | { |
@@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
128 | return -EINVAL; | 130 | return -EINVAL; |
129 | break; | 131 | break; |
130 | } | 132 | } |
131 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; | 133 | return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL; |
132 | } | 134 | } |
133 | 135 | ||
134 | /* Generate a custom zonelist for the BIND policy. */ | 136 | /* Generate a custom zonelist for the BIND policy. */ |
@@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
185 | switch (mode) { | 187 | switch (mode) { |
186 | case MPOL_INTERLEAVE: | 188 | case MPOL_INTERLEAVE: |
187 | policy->v.nodes = *nodes; | 189 | policy->v.nodes = *nodes; |
188 | if (nodes_weight(*nodes) == 0) { | 190 | nodes_and(policy->v.nodes, policy->v.nodes, |
191 | node_states[N_HIGH_MEMORY]); | ||
192 | if (nodes_weight(policy->v.nodes) == 0) { | ||
189 | kmem_cache_free(policy_cache, policy); | 193 | kmem_cache_free(policy_cache, policy); |
190 | return ERR_PTR(-EINVAL); | 194 | return ERR_PTR(-EINVAL); |
191 | } | 195 | } |
@@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void) | |||
459 | } | 463 | } |
460 | 464 | ||
461 | /* Set the process memory policy */ | 465 | /* Set the process memory policy */ |
462 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 466 | static long do_set_mempolicy(int mode, nodemask_t *nodes) |
463 | { | 467 | { |
464 | struct mempolicy *new; | 468 | struct mempolicy *new; |
465 | 469 | ||
@@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | |||
494 | *nodes = p->v.nodes; | 498 | *nodes = p->v.nodes; |
495 | break; | 499 | break; |
496 | case MPOL_PREFERRED: | 500 | case MPOL_PREFERRED: |
497 | /* or use current node instead of online map? */ | 501 | /* or use current node instead of memory_map? */ |
498 | if (p->v.preferred_node < 0) | 502 | if (p->v.preferred_node < 0) |
499 | *nodes = node_online_map; | 503 | *nodes = node_states[N_HIGH_MEMORY]; |
500 | else | 504 | else |
501 | node_set(p->v.preferred_node, *nodes); | 505 | node_set(p->v.preferred_node, *nodes); |
502 | break; | 506 | break; |
@@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
519 | } | 523 | } |
520 | 524 | ||
521 | /* Retrieve NUMA policy */ | 525 | /* Retrieve NUMA policy */ |
522 | long do_get_mempolicy(int *policy, nodemask_t *nmask, | 526 | static long do_get_mempolicy(int *policy, nodemask_t *nmask, |
523 | unsigned long addr, unsigned long flags) | 527 | unsigned long addr, unsigned long flags) |
524 | { | 528 | { |
525 | int err; | 529 | int err; |
526 | struct mm_struct *mm = current->mm; | 530 | struct mm_struct *mm = current->mm; |
@@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
528 | struct mempolicy *pol = current->mempolicy; | 532 | struct mempolicy *pol = current->mempolicy; |
529 | 533 | ||
530 | cpuset_update_task_memory_state(); | 534 | cpuset_update_task_memory_state(); |
531 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 535 | if (flags & |
536 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | ||
532 | return -EINVAL; | 537 | return -EINVAL; |
538 | |||
539 | if (flags & MPOL_F_MEMS_ALLOWED) { | ||
540 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | ||
541 | return -EINVAL; | ||
542 | *policy = 0; /* just so it's initialized */ | ||
543 | *nmask = cpuset_current_mems_allowed; | ||
544 | return 0; | ||
545 | } | ||
546 | |||
533 | if (flags & MPOL_F_ADDR) { | 547 | if (flags & MPOL_F_ADDR) { |
534 | down_read(&mm->mmap_sem); | 548 | down_read(&mm->mmap_sem); |
535 | vma = find_vma_intersection(mm, addr, addr+1); | 549 | vma = find_vma_intersection(mm, addr, addr+1); |
@@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
601 | * Migrate pages from one node to a target node. | 615 | * Migrate pages from one node to a target node. |
602 | * Returns error or the number of pages not migrated. | 616 | * Returns error or the number of pages not migrated. |
603 | */ | 617 | */ |
604 | int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | 618 | static int migrate_to_node(struct mm_struct *mm, int source, int dest, |
619 | int flags) | ||
605 | { | 620 | { |
606 | nodemask_t nmask; | 621 | nodemask_t nmask; |
607 | LIST_HEAD(pagelist); | 622 | LIST_HEAD(pagelist); |
@@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
732 | } | 747 | } |
733 | #endif | 748 | #endif |
734 | 749 | ||
735 | long do_mbind(unsigned long start, unsigned long len, | 750 | static long do_mbind(unsigned long start, unsigned long len, |
736 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 751 | unsigned long mode, nodemask_t *nmask, |
752 | unsigned long flags) | ||
737 | { | 753 | { |
738 | struct vm_area_struct *vma; | 754 | struct vm_area_struct *vma; |
739 | struct mm_struct *mm = current->mm; | 755 | struct mm_struct *mm = current->mm; |
@@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
955 | goto out; | 971 | goto out; |
956 | } | 972 | } |
957 | 973 | ||
958 | if (!nodes_subset(new, node_online_map)) { | 974 | if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { |
959 | err = -EINVAL; | 975 | err = -EINVAL; |
960 | goto out; | 976 | goto out; |
961 | } | 977 | } |
@@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
978 | unsigned long maxnode, | 994 | unsigned long maxnode, |
979 | unsigned long addr, unsigned long flags) | 995 | unsigned long addr, unsigned long flags) |
980 | { | 996 | { |
981 | int err, pval; | 997 | int err; |
998 | int uninitialized_var(pval); | ||
982 | nodemask_t nodes; | 999 | nodemask_t nodes; |
983 | 1000 | ||
984 | if (nmask != NULL && maxnode < MAX_NUMNODES) | 1001 | if (nmask != NULL && maxnode < MAX_NUMNODES) |
@@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) | |||
1527 | kmem_cache_free(sn_cache, n); | 1544 | kmem_cache_free(sn_cache, n); |
1528 | } | 1545 | } |
1529 | 1546 | ||
1530 | struct sp_node * | 1547 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
1531 | sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) | 1548 | struct mempolicy *pol) |
1532 | { | 1549 | { |
1533 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 1550 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); |
1534 | 1551 | ||
@@ -1677,7 +1694,7 @@ void __init numa_policy_init(void) | |||
1677 | * fall back to the largest node if they're all smaller. | 1694 | * fall back to the largest node if they're all smaller. |
1678 | */ | 1695 | */ |
1679 | nodes_clear(interleave_nodes); | 1696 | nodes_clear(interleave_nodes); |
1680 | for_each_online_node(nid) { | 1697 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1681 | unsigned long total_pages = node_present_pages(nid); | 1698 | unsigned long total_pages = node_present_pages(nid); |
1682 | 1699 | ||
1683 | /* Preserve the largest node */ | 1700 | /* Preserve the largest node */ |
@@ -1706,7 +1723,8 @@ void numa_default_policy(void) | |||
1706 | } | 1723 | } |
1707 | 1724 | ||
1708 | /* Migrate a policy to a different set of nodes */ | 1725 | /* Migrate a policy to a different set of nodes */ |
1709 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | 1726 | static void mpol_rebind_policy(struct mempolicy *pol, |
1727 | const nodemask_t *newmask) | ||
1710 | { | 1728 | { |
1711 | nodemask_t *mpolmask; | 1729 | nodemask_t *mpolmask; |
1712 | nodemask_t tmp; | 1730 | nodemask_t tmp; |
@@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1963 | seq_printf(m, " huge"); | 1981 | seq_printf(m, " huge"); |
1964 | } else { | 1982 | } else { |
1965 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | 1983 | check_pgd_range(vma, vma->vm_start, vma->vm_end, |
1966 | &node_online_map, MPOL_MF_STATS, md); | 1984 | &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); |
1967 | } | 1985 | } |
1968 | 1986 | ||
1969 | if (!md->pages) | 1987 | if (!md->pages) |
@@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1990 | if (md->writeback) | 2008 | if (md->writeback) |
1991 | seq_printf(m," writeback=%lu", md->writeback); | 2009 | seq_printf(m," writeback=%lu", md->writeback); |
1992 | 2010 | ||
1993 | for_each_online_node(n) | 2011 | for_each_node_state(n, N_HIGH_MEMORY) |
1994 | if (md->node[n]) | 2012 | if (md->node[n]) |
1995 | seq_printf(m, " N%d=%lu", n, md->node[n]); | 2013 | seq_printf(m, " N%d=%lu", n, md->node[n]); |
1996 | out: | 2014 | out: |
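The MPOL_F_MEMS_ALLOWED flag added to do_get_mempolicy() above lets a task read back its cpuset mems_allowed through the get_mempolicy() syscall. A hedged user-space sketch: it assumes libnuma's numaif.h (link with -lnuma), and the fallback flag value is an assumption mirroring the kernel header of this era:

#include <stdio.h>
#include <string.h>
#include <numaif.h>          /* get_mempolicy(); link with -lnuma */

#ifndef MPOL_F_MEMS_ALLOWED  /* assumption: matches the kernel's (1 << 2) definition */
#define MPOL_F_MEMS_ALLOWED (1 << 2)
#endif

#define MAX_NODES 1024       /* generous upper bound for the nodemask we pass in */

int main(void)
{
	unsigned long nodemask[MAX_NODES / (8 * sizeof(unsigned long))];
	int mode = 0;

	memset(nodemask, 0, sizeof(nodemask));
	/* With MPOL_F_MEMS_ALLOWED, no addr and no other flags may be given. */
	if (get_mempolicy(&mode, nodemask, MAX_NODES, NULL, MPOL_F_MEMS_ALLOWED) != 0) {
		perror("get_mempolicy");
		return 1;
	}

	for (unsigned long node = 0; node < MAX_NODES; node++)
		if (nodemask[node / (8 * sizeof(unsigned long))] &
		    (1UL << (node % (8 * sizeof(unsigned long)))))
			printf("node %lu is in mems_allowed\n", node);
	return 0;
}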
diff --git a/mm/migrate.c b/mm/migrate.c index e2fdbce1874b..06d0877a66ef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
172 | if (is_write_migration_entry(entry)) | 172 | if (is_write_migration_entry(entry)) |
173 | pte = pte_mkwrite(pte); | 173 | pte = pte_mkwrite(pte); |
174 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
174 | set_pte_at(mm, addr, ptep, pte); | 175 | set_pte_at(mm, addr, ptep, pte); |
175 | 176 | ||
176 | if (PageAnon(new)) | 177 | if (PageAnon(new)) |
@@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
180 | 181 | ||
181 | /* No need to invalidate - it was non-present before */ | 182 | /* No need to invalidate - it was non-present before */ |
182 | update_mmu_cache(vma, addr, pte); | 183 | update_mmu_cache(vma, addr, pte); |
183 | lazy_mmu_prot_update(pte); | ||
184 | 184 | ||
185 | out: | 185 | out: |
186 | pte_unmap_unlock(ptep, ptl); | 186 | pte_unmap_unlock(ptep, ptl); |
@@ -972,7 +972,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
972 | * array. Return various errors if the user did something wrong. | 972 | * array. Return various errors if the user did something wrong. |
973 | */ | 973 | */ |
974 | for (i = 0; i < nr_pages; i++) { | 974 | for (i = 0; i < nr_pages; i++) { |
975 | const void *p; | 975 | const void __user *p; |
976 | 976 | ||
977 | err = -EFAULT; | 977 | err = -EFAULT; |
978 | if (get_user(p, pages + i)) | 978 | if (get_user(p, pages + i)) |
@@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
986 | goto out; | 986 | goto out; |
987 | 987 | ||
988 | err = -ENODEV; | 988 | err = -ENODEV; |
989 | if (!node_online(node)) | 989 | if (!node_state(node, N_HIGH_MEMORY)) |
990 | goto out; | 990 | goto out; |
991 | 991 | ||
992 | err = -EACCES; | 992 | err = -EACCES; |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -7,6 +7,7 @@ | |||
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/backing-dev.h> | ||
10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
11 | #include <linux/shm.h> | 12 | #include <linux/shm.h> |
12 | #include <linux/mman.h> | 13 | #include <linux/mman.h> |
@@ -180,8 +181,6 @@ error: | |||
180 | return -ENOMEM; | 181 | return -ENOMEM; |
181 | } | 182 | } |
182 | 183 | ||
183 | EXPORT_SYMBOL(__vm_enough_memory); | ||
184 | |||
185 | /* | 184 | /* |
186 | * Requires inode->i_mapping->i_mmap_lock | 185 | * Requires inode->i_mapping->i_mmap_lock |
187 | */ | 186 | */ |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8346c30abec..1d4d69790e59 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
53 | if (dirty_accountable && pte_dirty(ptent)) | 53 | if (dirty_accountable && pte_dirty(ptent)) |
54 | ptent = pte_mkwrite(ptent); | 54 | ptent = pte_mkwrite(ptent); |
55 | set_pte_at(mm, addr, pte, ptent); | 55 | set_pte_at(mm, addr, pte, ptent); |
56 | lazy_mmu_prot_update(ptent); | ||
57 | #ifdef CONFIG_MIGRATION | 56 | #ifdef CONFIG_MIGRATION |
58 | } else if (!pte_file(oldpte)) { | 57 | } else if (!pte_file(oldpte)) { |
59 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 58 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
diff --git a/mm/nommu.c b/mm/nommu.c index 8ed0cb43118a..42fb84e9e815 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |||
44 | int heap_stack_gap = 0; | 44 | int heap_stack_gap = 0; |
45 | 45 | ||
46 | EXPORT_SYMBOL(mem_map); | 46 | EXPORT_SYMBOL(mem_map); |
47 | EXPORT_SYMBOL(__vm_enough_memory); | ||
48 | EXPORT_SYMBOL(num_physpages); | 47 | EXPORT_SYMBOL(num_physpages); |
49 | 48 | ||
50 | /* list of shareable VMAs */ | 49 | /* list of shareable VMAs */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9b82ad5047f..a64decb5b13f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
28 | 28 | ||
29 | int sysctl_panic_on_oom; | 29 | int sysctl_panic_on_oom; |
30 | int sysctl_oom_kill_allocating_task; | ||
31 | static DEFINE_SPINLOCK(zone_scan_mutex); | ||
30 | /* #define DEBUG */ | 32 | /* #define DEBUG */ |
31 | 33 | ||
32 | /** | 34 | /** |
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
141 | * because p may have allocated or otherwise mapped memory on | 143 | * because p may have allocated or otherwise mapped memory on |
142 | * this node before. However it will be less likely. | 144 | * this node before. However it will be less likely. |
143 | */ | 145 | */ |
144 | if (!cpuset_excl_nodes_overlap(p)) | 146 | if (!cpuset_mems_allowed_intersects(current, p)) |
145 | points /= 8; | 147 | points /= 8; |
146 | 148 | ||
147 | /* | 149 | /* |
@@ -164,27 +166,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
164 | } | 166 | } |
165 | 167 | ||
166 | /* | 168 | /* |
167 | * Types of limitations to the nodes from which allocations may occur | ||
168 | */ | ||
169 | #define CONSTRAINT_NONE 1 | ||
170 | #define CONSTRAINT_MEMORY_POLICY 2 | ||
171 | #define CONSTRAINT_CPUSET 3 | ||
172 | |||
173 | /* | ||
174 | * Determine the type of allocation constraint. | 169 | * Determine the type of allocation constraint. |
175 | */ | 170 | */ |
176 | static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | 171 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, |
172 | gfp_t gfp_mask) | ||
177 | { | 173 | { |
178 | #ifdef CONFIG_NUMA | 174 | #ifdef CONFIG_NUMA |
179 | struct zone **z; | 175 | struct zone **z; |
180 | nodemask_t nodes; | 176 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; |
181 | int node; | ||
182 | |||
183 | nodes_clear(nodes); | ||
184 | /* node has memory ? */ | ||
185 | for_each_online_node(node) | ||
186 | if (NODE_DATA(node)->node_present_pages) | ||
187 | node_set(node, nodes); | ||
188 | 177 | ||
189 | for (z = zonelist->zones; *z; z++) | 178 | for (z = zonelist->zones; *z; z++) |
190 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) | 179 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) |
@@ -344,12 +333,20 @@ static int oom_kill_task(struct task_struct *p) | |||
344 | return 0; | 333 | return 0; |
345 | } | 334 | } |
346 | 335 | ||
347 | static int oom_kill_process(struct task_struct *p, unsigned long points, | 336 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
348 | const char *message) | 337 | unsigned long points, const char *message) |
349 | { | 338 | { |
350 | struct task_struct *c; | 339 | struct task_struct *c; |
351 | struct list_head *tsk; | 340 | struct list_head *tsk; |
352 | 341 | ||
342 | if (printk_ratelimit()) { | ||
343 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
344 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
345 | current->comm, gfp_mask, order, current->oomkilladj); | ||
346 | dump_stack(); | ||
347 | show_mem(); | ||
348 | } | ||
349 | |||
353 | /* | 350 | /* |
354 | * If the task is already exiting, don't alarm the sysadmin or kill | 351 | * If the task is already exiting, don't alarm the sysadmin or kill |
355 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 352 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
@@ -387,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb) | |||
387 | } | 384 | } |
388 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | 385 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); |
389 | 386 | ||
387 | /* | ||
388 | * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero | ||
389 | * if a parallel OOM killing is already taking place that includes a zone in | ||
390 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. | ||
391 | */ | ||
392 | int try_set_zone_oom(struct zonelist *zonelist) | ||
393 | { | ||
394 | struct zone **z; | ||
395 | int ret = 1; | ||
396 | |||
397 | z = zonelist->zones; | ||
398 | |||
399 | spin_lock(&zone_scan_mutex); | ||
400 | do { | ||
401 | if (zone_is_oom_locked(*z)) { | ||
402 | ret = 0; | ||
403 | goto out; | ||
404 | } | ||
405 | } while (*(++z) != NULL); | ||
406 | |||
407 | /* | ||
408 | * Lock each zone in the zonelist under zone_scan_mutex so a parallel | ||
409 | * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. | ||
410 | */ | ||
411 | z = zonelist->zones; | ||
412 | do { | ||
413 | zone_set_flag(*z, ZONE_OOM_LOCKED); | ||
414 | } while (*(++z) != NULL); | ||
415 | out: | ||
416 | spin_unlock(&zone_scan_mutex); | ||
417 | return ret; | ||
418 | } | ||
419 | |||
420 | /* | ||
421 | * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed | ||
422 | * allocation attempts with zonelists containing them may now recall the OOM | ||
423 | * killer, if necessary. | ||
424 | */ | ||
425 | void clear_zonelist_oom(struct zonelist *zonelist) | ||
426 | { | ||
427 | struct zone **z; | ||
428 | |||
429 | z = zonelist->zones; | ||
430 | |||
431 | spin_lock(&zone_scan_mutex); | ||
432 | do { | ||
433 | zone_clear_flag(*z, ZONE_OOM_LOCKED); | ||
434 | } while (*(++z) != NULL); | ||
435 | spin_unlock(&zone_scan_mutex); | ||
436 | } | ||
437 | |||
390 | /** | 438 | /** |
391 | * out_of_memory - kill the "best" process when we run out of memory | 439 | * out_of_memory - kill the "best" process when we run out of memory |
392 | * | 440 | * |
@@ -400,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
400 | struct task_struct *p; | 448 | struct task_struct *p; |
401 | unsigned long points = 0; | 449 | unsigned long points = 0; |
402 | unsigned long freed = 0; | 450 | unsigned long freed = 0; |
403 | int constraint; | 451 | enum oom_constraint constraint; |
404 | 452 | ||
405 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 453 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
406 | if (freed > 0) | 454 | if (freed > 0) |
407 | /* Got some memory back in the last second. */ | 455 | /* Got some memory back in the last second. */ |
408 | return; | 456 | return; |
409 | 457 | ||
410 | if (printk_ratelimit()) { | ||
411 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
412 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
413 | current->comm, gfp_mask, order, current->oomkilladj); | ||
414 | dump_stack(); | ||
415 | show_mem(); | ||
416 | } | ||
417 | |||
418 | if (sysctl_panic_on_oom == 2) | 458 | if (sysctl_panic_on_oom == 2) |
419 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 459 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
420 | 460 | ||
@@ -423,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
423 | * NUMA) that may require different handling. | 463 | * NUMA) that may require different handling. |
424 | */ | 464 | */ |
425 | constraint = constrained_alloc(zonelist, gfp_mask); | 465 | constraint = constrained_alloc(zonelist, gfp_mask); |
426 | cpuset_lock(); | ||
427 | read_lock(&tasklist_lock); | 466 | read_lock(&tasklist_lock); |
428 | 467 | ||
429 | switch (constraint) { | 468 | switch (constraint) { |
430 | case CONSTRAINT_MEMORY_POLICY: | 469 | case CONSTRAINT_MEMORY_POLICY: |
431 | oom_kill_process(current, points, | 470 | oom_kill_process(current, gfp_mask, order, points, |
432 | "No available memory (MPOL_BIND)"); | 471 | "No available memory (MPOL_BIND)"); |
433 | break; | 472 | break; |
434 | 473 | ||
435 | case CONSTRAINT_CPUSET: | ||
436 | oom_kill_process(current, points, | ||
437 | "No available memory in cpuset"); | ||
438 | break; | ||
439 | |||
440 | case CONSTRAINT_NONE: | 474 | case CONSTRAINT_NONE: |
441 | if (sysctl_panic_on_oom) | 475 | if (sysctl_panic_on_oom) |
442 | panic("out of memory. panic_on_oom is selected\n"); | 476 | panic("out of memory. panic_on_oom is selected\n"); |
477 | /* Fall-through */ | ||
478 | case CONSTRAINT_CPUSET: | ||
479 | if (sysctl_oom_kill_allocating_task) { | ||
480 | oom_kill_process(current, gfp_mask, order, points, | ||
481 | "Out of memory (oom_kill_allocating_task)"); | ||
482 | break; | ||
483 | } | ||
443 | retry: | 484 | retry: |
444 | /* | 485 | /* |
445 | * Rambo mode: Shoot down a process and hope it solves whatever | 486 | * Rambo mode: Shoot down a process and hope it solves whatever |
@@ -453,11 +494,11 @@ retry: | |||
453 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 494 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
454 | if (!p) { | 495 | if (!p) { |
455 | read_unlock(&tasklist_lock); | 496 | read_unlock(&tasklist_lock); |
456 | cpuset_unlock(); | ||
457 | panic("Out of memory and no killable processes...\n"); | 497 | panic("Out of memory and no killable processes...\n"); |
458 | } | 498 | } |
459 | 499 | ||
460 | if (oom_kill_process(p, points, "Out of memory")) | 500 | if (oom_kill_process(p, gfp_mask, order, points, |
501 | "Out of memory")) | ||
461 | goto retry; | 502 | goto retry; |
462 | 503 | ||
463 | break; | 504 | break; |
@@ -465,7 +506,6 @@ retry: | |||
465 | 506 | ||
466 | out: | 507 | out: |
467 | read_unlock(&tasklist_lock); | 508 | read_unlock(&tasklist_lock); |
468 | cpuset_unlock(); | ||
469 | 509 | ||
470 | /* | 510 | /* |
471 | * Give "p" a good chance of killing itself before we | 511 | * Give "p" a good chance of killing itself before we |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 44720363374c..7845462064f4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2,6 +2,7 @@ | |||
2 | * mm/page-writeback.c | 2 | * mm/page-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
5 | * | 6 | * |
6 | * Contains functions related to writing back dirty pages at the | 7 | * Contains functions related to writing back dirty pages at the |
7 | * address_space level. | 8 | * address_space level. |
@@ -36,7 +37,7 @@ | |||
36 | 37 | ||
37 | /* | 38 | /* |
38 | * The maximum number of pages to writeout in a single bdflush/kupdate | 39 | * The maximum number of pages to writeout in a single bdflush/kupdate |
39 | * operation. We do this so we don't hold I_LOCK against an inode for | 40 | * operation. We do this so we don't hold I_SYNC against an inode for |
40 | * enormous amounts of time, which would block a userspace task which has | 41 | * enormous amounts of time, which would block a userspace task which has |
41 | * been forced to throttle against that inode. Also, the code reevaluates | 42 | * been forced to throttle against that inode. Also, the code reevaluates |
42 | * the dirty each time it has written this many pages. | 43 | * the dirty each time it has written this many pages. |
@@ -49,8 +50,6 @@ | |||
49 | */ | 50 | */ |
50 | static long ratelimit_pages = 32; | 51 | static long ratelimit_pages = 32; |
51 | 52 | ||
52 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ | ||
53 | |||
54 | /* | 53 | /* |
55 | * When balance_dirty_pages decides that the caller needs to perform some | 54 | * When balance_dirty_pages decides that the caller needs to perform some |
56 | * non-background writeback, this is how many pages it will attempt to write. | 55 | * non-background writeback, this is how many pages it will attempt to write. |
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode); | |||
103 | static void background_writeout(unsigned long _min_pages); | 102 | static void background_writeout(unsigned long _min_pages); |
104 | 103 | ||
105 | /* | 104 | /* |
105 | * Scale the writeback cache size proportional to the relative writeout speeds. | ||
106 | * | ||
107 | * We do this by keeping a floating proportion between BDIs, based on page | ||
108 | * writeback completions [end_page_writeback()]. Those devices that write out | ||
109 | * pages fastest will get the larger share, while the slower will get a smaller | ||
110 | * share. | ||
111 | * | ||
112 | * We use page writeout completions because we are interested in getting rid of | ||
113 | * dirty pages. Having them written out is the primary goal. | ||
114 | * | ||
115 | * We introduce a concept of time, a period over which we measure these events, | ||
116 | * because demand can/will vary over time. The length of this period itself is | ||
117 | * measured in page writeback completions. | ||
118 | * | ||
119 | */ | ||
120 | static struct prop_descriptor vm_completions; | ||
121 | static struct prop_descriptor vm_dirties; | ||
122 | |||
123 | static unsigned long determine_dirtyable_memory(void); | ||
124 | |||
125 | /* | ||
126 | * couple the period to the dirty_ratio: | ||
127 | * | ||
128 | * period/2 ~ roundup_pow_of_two(dirty limit) | ||
129 | */ | ||
130 | static int calc_period_shift(void) | ||
131 | { | ||
132 | unsigned long dirty_total; | ||
133 | |||
134 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; | ||
135 | return 2 + ilog2(dirty_total - 1); | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * update the period when the dirty ratio changes. | ||
140 | */ | ||
141 | int dirty_ratio_handler(struct ctl_table *table, int write, | ||
142 | struct file *filp, void __user *buffer, size_t *lenp, | ||
143 | loff_t *ppos) | ||
144 | { | ||
145 | int old_ratio = vm_dirty_ratio; | ||
146 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
147 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | ||
148 | int shift = calc_period_shift(); | ||
149 | prop_change_shift(&vm_completions, shift); | ||
150 | prop_change_shift(&vm_dirties, shift); | ||
151 | } | ||
152 | return ret; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Increment the BDI's writeout completion count and the global writeout | ||
157 | * completion count. Called from test_clear_page_writeback(). | ||
158 | */ | ||
159 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | ||
160 | { | ||
161 | __prop_inc_percpu(&vm_completions, &bdi->completions); | ||
162 | } | ||
163 | |||
164 | static inline void task_dirty_inc(struct task_struct *tsk) | ||
165 | { | ||
166 | prop_inc_single(&vm_dirties, &tsk->dirties); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Obtain an accurate fraction of the BDI's portion. | ||
171 | */ | ||
172 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | ||
173 | long *numerator, long *denominator) | ||
174 | { | ||
175 | if (bdi_cap_writeback_dirty(bdi)) { | ||
176 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
177 | numerator, denominator); | ||
178 | } else { | ||
179 | *numerator = 0; | ||
180 | *denominator = 1; | ||
181 | } | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * Clip the earned share of dirty pages to that which is actually available. | ||
186 | * This avoids exceeding the total dirty_limit when the floating averages | ||
187 | * fluctuate too quickly. | ||
188 | */ | ||
189 | static void | ||
190 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | ||
191 | { | ||
192 | long avail_dirty; | ||
193 | |||
194 | avail_dirty = dirty - | ||
195 | (global_page_state(NR_FILE_DIRTY) + | ||
196 | global_page_state(NR_WRITEBACK) + | ||
197 | global_page_state(NR_UNSTABLE_NFS)); | ||
198 | |||
199 | if (avail_dirty < 0) | ||
200 | avail_dirty = 0; | ||
201 | |||
202 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | ||
203 | bdi_stat(bdi, BDI_WRITEBACK); | ||
204 | |||
205 | *pbdi_dirty = min(*pbdi_dirty, avail_dirty); | ||
206 | } | ||
207 | |||
208 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
209 | long *numerator, long *denominator) | ||
210 | { | ||
211 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
212 | numerator, denominator); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * scale the dirty limit | ||
217 | * | ||
218 | * task specific dirty limit: | ||
219 | * | ||
220 | * dirty -= (dirty/8) * p_{t} | ||
221 | */ | ||
222 | void task_dirty_limit(struct task_struct *tsk, long *pdirty) | ||
223 | { | ||
224 | long numerator, denominator; | ||
225 | long dirty = *pdirty; | ||
226 | u64 inv = dirty >> 3; | ||
227 | |||
228 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
229 | inv *= numerator; | ||
230 | do_div(inv, denominator); | ||
231 | |||
232 | dirty -= inv; | ||
233 | if (dirty < *pdirty/2) | ||
234 | dirty = *pdirty/2; | ||
235 | |||
236 | *pdirty = dirty; | ||
237 | } | ||
238 | |||
239 | /* | ||
106 | * Work out the current dirty-memory clamping and background writeout | 240 | * Work out the current dirty-memory clamping and background writeout |
107 | * thresholds. | 241 | * thresholds. |
108 | * | 242 | * |
@@ -126,7 +260,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
126 | int node; | 260 | int node; |
127 | unsigned long x = 0; | 261 | unsigned long x = 0; |
128 | 262 | ||
129 | for_each_online_node(node) { | 263 | for_each_node_state(node, N_HIGH_MEMORY) { |
130 | struct zone *z = | 264 | struct zone *z = |
131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 265 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
132 | 266 | ||
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void) | |||
158 | } | 292 | } |
159 | 293 | ||
160 | static void | 294 | static void |
161 | get_dirty_limits(long *pbackground, long *pdirty, | 295 | get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, |
162 | struct address_space *mapping) | 296 | struct backing_dev_info *bdi) |
163 | { | 297 | { |
164 | int background_ratio; /* Percentages */ | 298 | int background_ratio; /* Percentages */ |
165 | int dirty_ratio; | 299 | int dirty_ratio; |
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
193 | } | 327 | } |
194 | *pbackground = background; | 328 | *pbackground = background; |
195 | *pdirty = dirty; | 329 | *pdirty = dirty; |
330 | |||
331 | if (bdi) { | ||
332 | u64 bdi_dirty = dirty; | ||
333 | long numerator, denominator; | ||
334 | |||
335 | /* | ||
336 | * Calculate this BDI's share of the dirty ratio. | ||
337 | */ | ||
338 | bdi_writeout_fraction(bdi, &numerator, &denominator); | ||
339 | |||
340 | bdi_dirty *= numerator; | ||
341 | do_div(bdi_dirty, denominator); | ||
342 | |||
343 | *pbdi_dirty = bdi_dirty; | ||
344 | clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); | ||
345 | task_dirty_limit(current, pbdi_dirty); | ||
346 | } | ||
196 | } | 347 | } |
197 | 348 | ||
198 | /* | 349 | /* |
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
204 | */ | 355 | */ |
205 | static void balance_dirty_pages(struct address_space *mapping) | 356 | static void balance_dirty_pages(struct address_space *mapping) |
206 | { | 357 | { |
207 | long nr_reclaimable; | 358 | long bdi_nr_reclaimable; |
359 | long bdi_nr_writeback; | ||
208 | long background_thresh; | 360 | long background_thresh; |
209 | long dirty_thresh; | 361 | long dirty_thresh; |
362 | long bdi_thresh; | ||
210 | unsigned long pages_written = 0; | 363 | unsigned long pages_written = 0; |
211 | unsigned long write_chunk = sync_writeback_pages(); | 364 | unsigned long write_chunk = sync_writeback_pages(); |
212 | 365 | ||
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
221 | .range_cyclic = 1, | 374 | .range_cyclic = 1, |
222 | }; | 375 | }; |
223 | 376 | ||
224 | get_dirty_limits(&background_thresh, &dirty_thresh, mapping); | 377 | get_dirty_limits(&background_thresh, &dirty_thresh, |
225 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 378 | &bdi_thresh, bdi); |
226 | global_page_state(NR_UNSTABLE_NFS); | 379 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
227 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= | 380 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); |
228 | dirty_thresh) | 381 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) |
229 | break; | 382 | break; |
230 | 383 | ||
231 | if (!dirty_exceeded) | 384 | if (!bdi->dirty_exceeded) |
232 | dirty_exceeded = 1; | 385 | bdi->dirty_exceeded = 1; |
233 | 386 | ||
234 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 387 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
235 | * Unstable writes are a feature of certain networked | 388 | * Unstable writes are a feature of certain networked |
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
237 | * written to the server's write cache, but has not yet | 390 | * written to the server's write cache, but has not yet |
238 | * been flushed to permanent storage. | 391 | * been flushed to permanent storage. |
239 | */ | 392 | */ |
240 | if (nr_reclaimable) { | 393 | if (bdi_nr_reclaimable) { |
241 | writeback_inodes(&wbc); | 394 | writeback_inodes(&wbc); |
242 | get_dirty_limits(&background_thresh, | ||
243 | &dirty_thresh, mapping); | ||
244 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | ||
245 | global_page_state(NR_UNSTABLE_NFS); | ||
246 | if (nr_reclaimable + | ||
247 | global_page_state(NR_WRITEBACK) | ||
248 | <= dirty_thresh) | ||
249 | break; | ||
250 | pages_written += write_chunk - wbc.nr_to_write; | 395 | pages_written += write_chunk - wbc.nr_to_write; |
251 | if (pages_written >= write_chunk) | 396 | get_dirty_limits(&background_thresh, &dirty_thresh, |
252 | break; /* We've done our duty */ | 397 | &bdi_thresh, bdi); |
253 | } | 398 | } |
399 | |||
400 | /* | ||
401 | * In order to avoid the stacked BDI deadlock we need | ||
402 | * to ensure we accurately count the 'dirty' pages when | ||
403 | * the threshold is low. | ||
404 | * | ||
405 | * Otherwise it would be possible to get thresh+n pages | ||
406 | * reported dirty, even though there are thresh-m pages | ||
407 | * actually dirty; with m+n sitting in the percpu | ||
408 | * deltas. | ||
409 | */ | ||
410 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | ||
411 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | ||
412 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
413 | } else if (bdi_nr_reclaimable) { | ||
414 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | ||
415 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | ||
416 | } | ||
417 | |||
418 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) | ||
419 | break; | ||
420 | if (pages_written >= write_chunk) | ||
421 | break; /* We've done our duty */ | ||
422 | |||
254 | congestion_wait(WRITE, HZ/10); | 423 | congestion_wait(WRITE, HZ/10); |
255 | } | 424 | } |
256 | 425 | ||
257 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) | 426 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && |
258 | <= dirty_thresh && dirty_exceeded) | 427 | bdi->dirty_exceeded) |
259 | dirty_exceeded = 0; | 428 | bdi->dirty_exceeded = 0; |
260 | 429 | ||
261 | if (writeback_in_progress(bdi)) | 430 | if (writeback_in_progress(bdi)) |
262 | return; /* pdflush is already working this queue */ | 431 | return; /* pdflush is already working this queue */ |
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
270 | * background_thresh, to keep the amount of dirty memory low. | 439 | * background_thresh, to keep the amount of dirty memory low. |
271 | */ | 440 | */ |
272 | if ((laptop_mode && pages_written) || | 441 | if ((laptop_mode && pages_written) || |
273 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 442 | (!laptop_mode && (global_page_state(NR_FILE_DIRTY) |
443 | + global_page_state(NR_UNSTABLE_NFS) | ||
444 | > background_thresh))) | ||
274 | pdflush_operation(background_writeout, 0); | 445 | pdflush_operation(background_writeout, 0); |
275 | } | 446 | } |
276 | 447 | ||
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
306 | unsigned long *p; | 477 | unsigned long *p; |
307 | 478 | ||
308 | ratelimit = ratelimit_pages; | 479 | ratelimit = ratelimit_pages; |
309 | if (dirty_exceeded) | 480 | if (mapping->backing_dev_info->dirty_exceeded) |
310 | ratelimit = 8; | 481 | ratelimit = 8; |
311 | 482 | ||
312 | /* | 483 | /* |
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
331 | long background_thresh; | 502 | long background_thresh; |
332 | long dirty_thresh; | 503 | long dirty_thresh; |
333 | 504 | ||
334 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) { | ||
335 | /* | ||
336 | * The caller might hold locks which can prevent IO completion | ||
337 | * or progress in the filesystem. So we cannot just sit here | ||
338 | * waiting for IO to complete. | ||
339 | */ | ||
340 | congestion_wait(WRITE, HZ/10); | ||
341 | return; | ||
342 | } | ||
343 | |||
344 | for ( ; ; ) { | 505 | for ( ; ; ) { |
345 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 506 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
346 | 507 | ||
347 | /* | 508 | /* |
348 | * Boost the allowable dirty threshold a bit for page | 509 | * Boost the allowable dirty threshold a bit for page |
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
354 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 515 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
355 | break; | 516 | break; |
356 | congestion_wait(WRITE, HZ/10); | 517 | congestion_wait(WRITE, HZ/10); |
518 | |||
519 | /* | ||
520 | * The caller might hold locks which can prevent IO completion | ||
521 | * or progress in the filesystem. So we cannot just sit here | ||
522 | * waiting for IO to complete. | ||
523 | */ | ||
524 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) | ||
525 | break; | ||
357 | } | 526 | } |
358 | } | 527 | } |
359 | 528 | ||
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages) | |||
377 | long background_thresh; | 546 | long background_thresh; |
378 | long dirty_thresh; | 547 | long dirty_thresh; |
379 | 548 | ||
380 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 549 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
381 | if (global_page_state(NR_FILE_DIRTY) + | 550 | if (global_page_state(NR_FILE_DIRTY) + |
382 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | 551 | global_page_state(NR_UNSTABLE_NFS) < background_thresh |
383 | && min_pages <= 0) | 552 | && min_pages <= 0) |
384 | break; | 553 | break; |
554 | wbc.more_io = 0; | ||
385 | wbc.encountered_congestion = 0; | 555 | wbc.encountered_congestion = 0; |
386 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 556 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
387 | wbc.pages_skipped = 0; | 557 | wbc.pages_skipped = 0; |
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages) | |||
389 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 559 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
390 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | 560 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { |
391 | /* Wrote less than expected */ | 561 | /* Wrote less than expected */ |
392 | congestion_wait(WRITE, HZ/10); | 562 | if (wbc.encountered_congestion || wbc.more_io) |
393 | if (!wbc.encountered_congestion) | 563 | congestion_wait(WRITE, HZ/10); |
564 | else | ||
394 | break; | 565 | break; |
395 | } | 566 | } |
396 | } | 567 | } |
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg) | |||
455 | global_page_state(NR_UNSTABLE_NFS) + | 626 | global_page_state(NR_UNSTABLE_NFS) + |
456 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 627 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
457 | while (nr_to_write > 0) { | 628 | while (nr_to_write > 0) { |
629 | wbc.more_io = 0; | ||
458 | wbc.encountered_congestion = 0; | 630 | wbc.encountered_congestion = 0; |
459 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 631 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
460 | writeback_inodes(&wbc); | 632 | writeback_inodes(&wbc); |
461 | if (wbc.nr_to_write > 0) { | 633 | if (wbc.nr_to_write > 0) { |
462 | if (wbc.encountered_congestion) | 634 | if (wbc.encountered_congestion || wbc.more_io) |
463 | congestion_wait(WRITE, HZ/10); | 635 | congestion_wait(WRITE, HZ/10); |
464 | else | 636 | else |
465 | break; /* All the old data is written */ | 637 | break; /* All the old data is written */ |
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
580 | */ | 752 | */ |
581 | void __init page_writeback_init(void) | 753 | void __init page_writeback_init(void) |
582 | { | 754 | { |
755 | int shift; | ||
756 | |||
583 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 757 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
584 | writeback_set_ratelimit(); | 758 | writeback_set_ratelimit(); |
585 | register_cpu_notifier(&ratelimit_nb); | 759 | register_cpu_notifier(&ratelimit_nb); |
760 | |||
761 | shift = calc_period_shift(); | ||
762 | prop_descriptor_init(&vm_completions, shift); | ||
763 | prop_descriptor_init(&vm_dirties, shift); | ||
586 | } | 764 | } |
587 | 765 | ||
588 | /** | 766 | /** |
@@ -672,8 +850,10 @@ retry: | |||
672 | 850 | ||
673 | ret = (*writepage)(page, wbc, data); | 851 | ret = (*writepage)(page, wbc, data); |
674 | 852 | ||
675 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) | 853 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { |
676 | unlock_page(page); | 854 | unlock_page(page); |
855 | ret = 0; | ||
856 | } | ||
677 | if (ret || (--(wbc->nr_to_write) <= 0)) | 857 | if (ret || (--(wbc->nr_to_write) <= 0)) |
678 | done = 1; | 858 | done = 1; |
679 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 859 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
827 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | 1007 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
828 | if (mapping_cap_account_dirty(mapping)) { | 1008 | if (mapping_cap_account_dirty(mapping)) { |
829 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1009 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1010 | __inc_bdi_stat(mapping->backing_dev_info, | ||
1011 | BDI_RECLAIMABLE); | ||
830 | task_io_account_write(PAGE_CACHE_SIZE); | 1012 | task_io_account_write(PAGE_CACHE_SIZE); |
831 | } | 1013 | } |
832 | radix_tree_tag_set(&mapping->page_tree, | 1014 | radix_tree_tag_set(&mapping->page_tree, |
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); | |||
859 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1041 | * If the mapping doesn't provide a set_page_dirty a_op, then |
860 | * just fall through and assume that it wants buffer_heads. | 1042 | * just fall through and assume that it wants buffer_heads. |
861 | */ | 1043 | */ |
862 | int fastcall set_page_dirty(struct page *page) | 1044 | static int __set_page_dirty(struct page *page) |
863 | { | 1045 | { |
864 | struct address_space *mapping = page_mapping(page); | 1046 | struct address_space *mapping = page_mapping(page); |
865 | 1047 | ||
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page) | |||
877 | } | 1059 | } |
878 | return 0; | 1060 | return 0; |
879 | } | 1061 | } |
1062 | |||
1063 | int fastcall set_page_dirty(struct page *page) | ||
1064 | { | ||
1065 | int ret = __set_page_dirty(page); | ||
1066 | if (ret) | ||
1067 | task_dirty_inc(current); | ||
1068 | return ret; | ||
1069 | } | ||
880 | EXPORT_SYMBOL(set_page_dirty); | 1070 | EXPORT_SYMBOL(set_page_dirty); |
881 | 1071 | ||
882 | /* | 1072 | /* |
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page) | |||
961 | */ | 1151 | */ |
962 | if (TestClearPageDirty(page)) { | 1152 | if (TestClearPageDirty(page)) { |
963 | dec_zone_page_state(page, NR_FILE_DIRTY); | 1153 | dec_zone_page_state(page, NR_FILE_DIRTY); |
1154 | dec_bdi_stat(mapping->backing_dev_info, | ||
1155 | BDI_RECLAIMABLE); | ||
964 | return 1; | 1156 | return 1; |
965 | } | 1157 | } |
966 | return 0; | 1158 | return 0; |
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page) | |||
975 | int ret; | 1167 | int ret; |
976 | 1168 | ||
977 | if (mapping) { | 1169 | if (mapping) { |
1170 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
978 | unsigned long flags; | 1171 | unsigned long flags; |
979 | 1172 | ||
980 | write_lock_irqsave(&mapping->tree_lock, flags); | 1173 | write_lock_irqsave(&mapping->tree_lock, flags); |
981 | ret = TestClearPageWriteback(page); | 1174 | ret = TestClearPageWriteback(page); |
982 | if (ret) | 1175 | if (ret) { |
983 | radix_tree_tag_clear(&mapping->page_tree, | 1176 | radix_tree_tag_clear(&mapping->page_tree, |
984 | page_index(page), | 1177 | page_index(page), |
985 | PAGECACHE_TAG_WRITEBACK); | 1178 | PAGECACHE_TAG_WRITEBACK); |
1179 | if (bdi_cap_writeback_dirty(bdi)) { | ||
1180 | __dec_bdi_stat(bdi, BDI_WRITEBACK); | ||
1181 | __bdi_writeout_inc(bdi); | ||
1182 | } | ||
1183 | } | ||
986 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1184 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
987 | } else { | 1185 | } else { |
988 | ret = TestClearPageWriteback(page); | 1186 | ret = TestClearPageWriteback(page); |
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page) | |||
998 | int ret; | 1196 | int ret; |
999 | 1197 | ||
1000 | if (mapping) { | 1198 | if (mapping) { |
1199 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
1001 | unsigned long flags; | 1200 | unsigned long flags; |
1002 | 1201 | ||
1003 | write_lock_irqsave(&mapping->tree_lock, flags); | 1202 | write_lock_irqsave(&mapping->tree_lock, flags); |
1004 | ret = TestSetPageWriteback(page); | 1203 | ret = TestSetPageWriteback(page); |
1005 | if (!ret) | 1204 | if (!ret) { |
1006 | radix_tree_tag_set(&mapping->page_tree, | 1205 | radix_tree_tag_set(&mapping->page_tree, |
1007 | page_index(page), | 1206 | page_index(page), |
1008 | PAGECACHE_TAG_WRITEBACK); | 1207 | PAGECACHE_TAG_WRITEBACK); |
1208 | if (bdi_cap_writeback_dirty(bdi)) | ||
1209 | __inc_bdi_stat(bdi, BDI_WRITEBACK); | ||
1210 | } | ||
1009 | if (!PageDirty(page)) | 1211 | if (!PageDirty(page)) |
1010 | radix_tree_tag_clear(&mapping->page_tree, | 1212 | radix_tree_tag_clear(&mapping->page_tree, |
1011 | page_index(page), | 1213 | page_index(page), |
@@ -1022,17 +1224,15 @@ int test_set_page_writeback(struct page *page) | |||
1022 | EXPORT_SYMBOL(test_set_page_writeback); | 1224 | EXPORT_SYMBOL(test_set_page_writeback); |
1023 | 1225 | ||
1024 | /* | 1226 | /* |
1025 | * Return true if any of the pages in the mapping are marged with the | 1227 | * Return true if any of the pages in the mapping are marked with the |
1026 | * passed tag. | 1228 | * passed tag. |
1027 | */ | 1229 | */ |
1028 | int mapping_tagged(struct address_space *mapping, int tag) | 1230 | int mapping_tagged(struct address_space *mapping, int tag) |
1029 | { | 1231 | { |
1030 | unsigned long flags; | ||
1031 | int ret; | 1232 | int ret; |
1032 | 1233 | rcu_read_lock(); | |
1033 | read_lock_irqsave(&mapping->tree_lock, flags); | ||
1034 | ret = radix_tree_tagged(&mapping->page_tree, tag); | 1234 | ret = radix_tree_tagged(&mapping->page_tree, tag); |
1035 | read_unlock_irqrestore(&mapping->tree_lock, flags); | 1235 | rcu_read_unlock(); |
1036 | return ret; | 1236 | return ret; |
1037 | } | 1237 | } |
1038 | EXPORT_SYMBOL(mapping_tagged); | 1238 | EXPORT_SYMBOL(mapping_tagged); |
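get_dirty_limits() now scales the global dirty threshold by each BDI's share of recent writeout completions, and task_dirty_limit() then trims up to an eighth of that threshold in proportion to how much of the recent dirtying the task itself did. A worked user-space sketch of the arithmetic, with made-up numbers rather than kernel statistics:

#include <stdio.h>

/* bdi_thresh = dirty_thresh * bdi_completions / total_completions,
 * then the task-specific clamp: dirty -= (dirty/8) * task_fraction,
 * never dropping below half of the original value. */
static long task_dirty_limit(long dirty, long task_num, long task_den)
{
	long inv = (dirty >> 3) * task_num / task_den;
	long clamped = dirty - inv;

	return clamped < dirty / 2 ? dirty / 2 : clamped;
}

int main(void)
{
	long dirty_thresh = 40000;          /* global dirty limit, in pages */
	long bdi_num = 3, bdi_den = 4;      /* this BDI did 3/4 of recent writeout */
	long bdi_thresh = dirty_thresh * bdi_num / bdi_den;

	printf("bdi_thresh            = %ld\n", bdi_thresh);          /* 30000 */
	printf("heavy dirtier (1/1)   = %ld\n",
	       task_dirty_limit(bdi_thresh, 1, 1));                   /* 30000 - 3750 = 26250 */
	printf("light dirtier (1/100) = %ld\n",
	       task_dirty_limit(bdi_thresh, 1, 100));                 /* 30000 - 37 = 29963 */
	return 0;
}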
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a8c59571cb7..43f757fcf30f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/pagevec.h> | 27 | #include <linux/pagevec.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/oom.h> | ||
30 | #include <linux/notifier.h> | 31 | #include <linux/notifier.h> |
31 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
32 | #include <linux/sysctl.h> | 33 | #include <linux/sysctl.h> |
@@ -41,24 +42,37 @@ | |||
41 | #include <linux/pfn.h> | 42 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
43 | #include <linux/fault-inject.h> | 44 | #include <linux/fault-inject.h> |
45 | #include <linux/page-isolation.h> | ||
44 | 46 | ||
45 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
46 | #include <asm/div64.h> | 48 | #include <asm/div64.h> |
47 | #include "internal.h" | 49 | #include "internal.h" |
48 | 50 | ||
49 | /* | 51 | /* |
50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 52 | * Array of node states. |
51 | * initializer cleaner | ||
52 | */ | 53 | */ |
53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 54 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
54 | EXPORT_SYMBOL(node_online_map); | 55 | [N_POSSIBLE] = NODE_MASK_ALL, |
55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 56 | [N_ONLINE] = { { [0] = 1UL } }, |
56 | EXPORT_SYMBOL(node_possible_map); | 57 | #ifndef CONFIG_NUMA |
58 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | ||
59 | #ifdef CONFIG_HIGHMEM | ||
60 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | ||
61 | #endif | ||
62 | [N_CPU] = { { [0] = 1UL } }, | ||
63 | #endif /* NUMA */ | ||
64 | }; | ||
65 | EXPORT_SYMBOL(node_states); | ||
66 | |||
57 | unsigned long totalram_pages __read_mostly; | 67 | unsigned long totalram_pages __read_mostly; |
58 | unsigned long totalreserve_pages __read_mostly; | 68 | unsigned long totalreserve_pages __read_mostly; |
59 | long nr_swap_pages; | 69 | long nr_swap_pages; |
60 | int percpu_pagelist_fraction; | 70 | int percpu_pagelist_fraction; |
61 | 71 | ||
72 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
73 | int pageblock_order __read_mostly; | ||
74 | #endif | ||
75 | |||
62 | static void __free_pages_ok(struct page *page, unsigned int order); | 76 | static void __free_pages_ok(struct page *page, unsigned int order); |
63 | 77 | ||
64 | /* | 78 | /* |
@@ -137,7 +151,7 @@ static unsigned long __meminitdata dma_reserve; | |||
137 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 151 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
138 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 152 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
139 | unsigned long __initdata required_kernelcore; | 153 | unsigned long __initdata required_kernelcore; |
140 | unsigned long __initdata required_movablecore; | 154 | static unsigned long __initdata required_movablecore; |
141 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 155 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
142 | 156 | ||
143 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 157 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
@@ -150,6 +164,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES; | |||
150 | EXPORT_SYMBOL(nr_node_ids); | 164 | EXPORT_SYMBOL(nr_node_ids); |
151 | #endif | 165 | #endif |
152 | 166 | ||
167 | int page_group_by_mobility_disabled __read_mostly; | ||
168 | |||
169 | static void set_pageblock_migratetype(struct page *page, int migratetype) | ||
170 | { | ||
171 | set_pageblock_flags_group(page, (unsigned long)migratetype, | ||
172 | PB_migrate, PB_migrate_end); | ||
173 | } | ||
174 | |||
153 | #ifdef CONFIG_DEBUG_VM | 175 | #ifdef CONFIG_DEBUG_VM |
154 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 176 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
155 | { | 177 | { |
@@ -293,16 +315,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
293 | clear_highpage(page + i); | 315 | clear_highpage(page + i); |
294 | } | 316 | } |
295 | 317 | ||
296 | /* | ||
297 | * function for dealing with page's order in buddy system. | ||
298 | * zone->lock is already acquired when we use these. | ||
299 | * So, we don't need atomic page->flags operations here. | ||
300 | */ | ||
301 | static inline unsigned long page_order(struct page *page) | ||
302 | { | ||
303 | return page_private(page); | ||
304 | } | ||
305 | |||
306 | static inline void set_page_order(struct page *page, int order) | 318 | static inline void set_page_order(struct page *page, int order) |
307 | { | 319 | { |
308 | set_page_private(page, order); | 320 | set_page_private(page, order); |
@@ -404,6 +416,7 @@ static inline void __free_one_page(struct page *page, | |||
404 | { | 416 | { |
405 | unsigned long page_idx; | 417 | unsigned long page_idx; |
406 | int order_size = 1 << order; | 418 | int order_size = 1 << order; |
419 | int migratetype = get_pageblock_migratetype(page); | ||
407 | 420 | ||
408 | if (unlikely(PageCompound(page))) | 421 | if (unlikely(PageCompound(page))) |
409 | destroy_compound_page(page, order); | 422 | destroy_compound_page(page, order); |
@@ -416,7 +429,6 @@ static inline void __free_one_page(struct page *page, | |||
416 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 429 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
417 | while (order < MAX_ORDER-1) { | 430 | while (order < MAX_ORDER-1) { |
418 | unsigned long combined_idx; | 431 | unsigned long combined_idx; |
419 | struct free_area *area; | ||
420 | struct page *buddy; | 432 | struct page *buddy; |
421 | 433 | ||
422 | buddy = __page_find_buddy(page, page_idx, order); | 434 | buddy = __page_find_buddy(page, page_idx, order); |
@@ -424,8 +436,7 @@ static inline void __free_one_page(struct page *page, | |||
424 | break; /* Move the buddy up one level. */ | 436 | break; /* Move the buddy up one level. */ |
425 | 437 | ||
426 | list_del(&buddy->lru); | 438 | list_del(&buddy->lru); |
427 | area = zone->free_area + order; | 439 | zone->free_area[order].nr_free--; |
428 | area->nr_free--; | ||
429 | rmv_page_order(buddy); | 440 | rmv_page_order(buddy); |
430 | combined_idx = __find_combined_index(page_idx, order); | 441 | combined_idx = __find_combined_index(page_idx, order); |
431 | page = page + (combined_idx - page_idx); | 442 | page = page + (combined_idx - page_idx); |
@@ -433,7 +444,8 @@ static inline void __free_one_page(struct page *page, | |||
433 | order++; | 444 | order++; |
434 | } | 445 | } |
435 | set_page_order(page, order); | 446 | set_page_order(page, order); |
436 | list_add(&page->lru, &zone->free_area[order].free_list); | 447 | list_add(&page->lru, |
448 | &zone->free_area[order].free_list[migratetype]); | ||
437 | zone->free_area[order].nr_free++; | 449 | zone->free_area[order].nr_free++; |
438 | } | 450 | } |
439 | 451 | ||
@@ -478,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
478 | struct list_head *list, int order) | 490 | struct list_head *list, int order) |
479 | { | 491 | { |
480 | spin_lock(&zone->lock); | 492 | spin_lock(&zone->lock); |
481 | zone->all_unreclaimable = 0; | 493 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
482 | zone->pages_scanned = 0; | 494 | zone->pages_scanned = 0; |
483 | while (count--) { | 495 | while (count--) { |
484 | struct page *page; | 496 | struct page *page; |
@@ -495,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
495 | static void free_one_page(struct zone *zone, struct page *page, int order) | 507 | static void free_one_page(struct zone *zone, struct page *page, int order) |
496 | { | 508 | { |
497 | spin_lock(&zone->lock); | 509 | spin_lock(&zone->lock); |
498 | zone->all_unreclaimable = 0; | 510 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
499 | zone->pages_scanned = 0; | 511 | zone->pages_scanned = 0; |
500 | __free_one_page(page, zone, order); | 512 | __free_one_page(page, zone, order); |
501 | spin_unlock(&zone->lock); | 513 | spin_unlock(&zone->lock); |
@@ -567,7 +579,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
567 | * -- wli | 579 | * -- wli |
568 | */ | 580 | */ |
569 | static inline void expand(struct zone *zone, struct page *page, | 581 | static inline void expand(struct zone *zone, struct page *page, |
570 | int low, int high, struct free_area *area) | 582 | int low, int high, struct free_area *area, |
583 | int migratetype) | ||
571 | { | 584 | { |
572 | unsigned long size = 1 << high; | 585 | unsigned long size = 1 << high; |
573 | 586 | ||
@@ -576,7 +589,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
576 | high--; | 589 | high--; |
577 | size >>= 1; | 590 | size >>= 1; |
578 | VM_BUG_ON(bad_range(zone, &page[size])); | 591 | VM_BUG_ON(bad_range(zone, &page[size])); |
579 | list_add(&page[size].lru, &area->free_list); | 592 | list_add(&page[size].lru, &area->free_list[migratetype]); |
580 | area->nr_free++; | 593 | area->nr_free++; |
581 | set_page_order(&page[size], high); | 594 | set_page_order(&page[size], high); |
582 | } | 595 | } |
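
The expand() helper above splits a larger free block down to the requested order, returning each trimmed-off half to the free list of its own order (and, with this change, to the per-migratetype list). A minimal userspace sketch of the split arithmetic follows; the order values and the program itself are illustrative only, not kernel code.

/* Toy model of expand(): satisfying an order-2 request from an order-5
 * block leaves order-4, order-3 and order-2 remainders on the free lists. */
#include <stdio.h>

int main(void)
{
	unsigned int low = 2, high = 5;	/* requested order, order actually found */
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("free remainder: order %u (%lu pages)\n", high, size);
	}
	printf("allocated block: order %u (%lu pages)\n", low, 1UL << low);
	return 0;
}

The remainders plus the allocated block add back up to the original 2^5 pages, which is the invariant the real code preserves.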
@@ -628,49 +641,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
628 | return 0; | 641 | return 0; |
629 | } | 642 | } |
630 | 643 | ||
631 | /* | 644 | /* |
632 | * Do the hard work of removing an element from the buddy allocator. | 645 | * Go through the free lists for the given migratetype and remove |
633 | * Call me with the zone->lock already held. | 646 | * the smallest available page from the freelists |
634 | */ | 647 | */ |
635 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 648 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
649 | int migratetype) | ||
636 | { | 650 | { |
637 | struct free_area * area; | ||
638 | unsigned int current_order; | 651 | unsigned int current_order; |
652 | struct free_area * area; | ||
639 | struct page *page; | 653 | struct page *page; |
640 | 654 | ||
655 | /* Find a page of the appropriate size in the preferred list */ | ||
641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 656 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
642 | area = zone->free_area + current_order; | 657 | area = &(zone->free_area[current_order]); |
643 | if (list_empty(&area->free_list)) | 658 | if (list_empty(&area->free_list[migratetype])) |
644 | continue; | 659 | continue; |
645 | 660 | ||
646 | page = list_entry(area->free_list.next, struct page, lru); | 661 | page = list_entry(area->free_list[migratetype].next, |
662 | struct page, lru); | ||
647 | list_del(&page->lru); | 663 | list_del(&page->lru); |
648 | rmv_page_order(page); | 664 | rmv_page_order(page); |
649 | area->nr_free--; | 665 | area->nr_free--; |
650 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 666 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
651 | expand(zone, page, order, current_order, area); | 667 | expand(zone, page, order, current_order, area, migratetype); |
652 | return page; | 668 | return page; |
653 | } | 669 | } |
654 | 670 | ||
655 | return NULL; | 671 | return NULL; |
656 | } | 672 | } |
657 | 673 | ||
674 | |||
675 | /* | ||
676 | * This array describes the order in which the free lists are fallen | ||
677 | * back on when the free lists for the desired migrate type are depleted | ||
678 | */ | ||
679 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | ||
680 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
681 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
682 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
683 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | ||
684 | }; | ||
685 | |||
686 | /* | ||
687 | * Move the free pages in a range to the free lists of the requested type. | ||
688 | * Note that start_page and end_page are not required to be aligned on a | ||
689 | * pageblock boundary. If alignment is required, use move_freepages_block() | ||
690 | */ | ||
691 | int move_freepages(struct zone *zone, | ||
692 | struct page *start_page, struct page *end_page, | ||
693 | int migratetype) | ||
694 | { | ||
695 | struct page *page; | ||
696 | unsigned long order; | ||
697 | int pages_moved = 0; | ||
698 | |||
699 | #ifndef CONFIG_HOLES_IN_ZONE | ||
700 | /* | ||
701 | * page_zone is not safe to call in this context when | ||
702 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | ||
703 | * anyway as we check zone boundaries in move_freepages_block(). | ||
704 | * Remove at a later date when no bug reports exist related to | ||
705 | * grouping pages by mobility | ||
706 | */ | ||
707 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | ||
708 | #endif | ||
709 | |||
710 | for (page = start_page; page <= end_page;) { | ||
711 | if (!pfn_valid_within(page_to_pfn(page))) { | ||
712 | page++; | ||
713 | continue; | ||
714 | } | ||
715 | |||
716 | if (!PageBuddy(page)) { | ||
717 | page++; | ||
718 | continue; | ||
719 | } | ||
720 | |||
721 | order = page_order(page); | ||
722 | list_del(&page->lru); | ||
723 | list_add(&page->lru, | ||
724 | &zone->free_area[order].free_list[migratetype]); | ||
725 | page += 1 << order; | ||
726 | pages_moved += 1 << order; | ||
727 | } | ||
728 | |||
729 | return pages_moved; | ||
730 | } | ||
731 | |||
732 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | ||
733 | { | ||
734 | unsigned long start_pfn, end_pfn; | ||
735 | struct page *start_page, *end_page; | ||
736 | |||
737 | start_pfn = page_to_pfn(page); | ||
738 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | ||
739 | start_page = pfn_to_page(start_pfn); | ||
740 | end_page = start_page + pageblock_nr_pages - 1; | ||
741 | end_pfn = start_pfn + pageblock_nr_pages - 1; | ||
742 | |||
743 | /* Do not cross zone boundaries */ | ||
744 | if (start_pfn < zone->zone_start_pfn) | ||
745 | start_page = page; | ||
746 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
747 | return 0; | ||
748 | |||
749 | return move_freepages(zone, start_page, end_page, migratetype); | ||
750 | } | ||
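
move_freepages_block() relies on simple mask arithmetic to find the pageblock containing a given pfn. The sketch below shows that arithmetic in isolation; pageblock_order = 9 and the sample pfn are assumptions chosen for the example, not values taken from a particular configuration.

/* Rounding an arbitrary pfn down to its pageblock, as move_freepages_block()
 * does above; pageblock_order = 9 (512 pages per block) is assumed. */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_order = 9;
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long pfn = 123456;

	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

	printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}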
751 | |||
752 | /* Return the page with the lowest PFN in the list */ | ||
753 | static struct page *min_page(struct list_head *list) | ||
754 | { | ||
755 | unsigned long min_pfn = -1UL; | ||
756 | struct page *min_page = NULL, *page; | ||
757 | |||
758 | list_for_each_entry(page, list, lru) { | ||
759 | unsigned long pfn = page_to_pfn(page); | ||
760 | if (pfn < min_pfn) { | ||
761 | min_pfn = pfn; | ||
762 | min_page = page; | ||
763 | } | ||
764 | } | ||
765 | |||
766 | return min_page; | ||
767 | } | ||
768 | |||
769 | /* Remove an element from the buddy allocator from the fallback list */ | ||
770 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | ||
771 | int start_migratetype) | ||
772 | { | ||
773 | struct free_area * area; | ||
774 | int current_order; | ||
775 | struct page *page; | ||
776 | int migratetype, i; | ||
777 | |||
778 | /* Find the largest possible block of pages in the other list */ | ||
779 | for (current_order = MAX_ORDER-1; current_order >= order; | ||
780 | --current_order) { | ||
781 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | ||
782 | migratetype = fallbacks[start_migratetype][i]; | ||
783 | |||
784 | /* MIGRATE_RESERVE handled later if necessary */ | ||
785 | if (migratetype == MIGRATE_RESERVE) | ||
786 | continue; | ||
787 | |||
788 | area = &(zone->free_area[current_order]); | ||
789 | if (list_empty(&area->free_list[migratetype])) | ||
790 | continue; | ||
791 | |||
792 | /* Bias kernel allocations towards low pfns */ | ||
793 | page = list_entry(area->free_list[migratetype].next, | ||
794 | struct page, lru); | ||
795 | if (unlikely(start_migratetype != MIGRATE_MOVABLE)) | ||
796 | page = min_page(&area->free_list[migratetype]); | ||
797 | area->nr_free--; | ||
798 | |||
799 | /* | ||
800 | * If breaking a large block of pages, move all free | ||
801 | * pages to the preferred allocation list. If falling | ||
802 | * back for a reclaimable kernel allocation, be more | ||
803 | * aggressive about taking ownership of free pages | ||
804 | */ | ||
805 | if (unlikely(current_order >= (pageblock_order >> 1)) || | ||
806 | start_migratetype == MIGRATE_RECLAIMABLE) { | ||
807 | unsigned long pages; | ||
808 | pages = move_freepages_block(zone, page, | ||
809 | start_migratetype); | ||
810 | |||
811 | /* Claim the whole block if over half of it is free */ | ||
812 | if (pages >= (1 << (pageblock_order-1))) | ||
813 | set_pageblock_migratetype(page, | ||
814 | start_migratetype); | ||
815 | |||
816 | migratetype = start_migratetype; | ||
817 | } | ||
818 | |||
819 | /* Remove the page from the freelists */ | ||
820 | list_del(&page->lru); | ||
821 | rmv_page_order(page); | ||
822 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
823 | -(1UL << order)); | ||
824 | |||
825 | if (current_order == pageblock_order) | ||
826 | set_pageblock_migratetype(page, | ||
827 | start_migratetype); | ||
828 | |||
829 | expand(zone, page, order, current_order, area, migratetype); | ||
830 | return page; | ||
831 | } | ||
832 | } | ||
833 | |||
834 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | ||
835 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
836 | } | ||
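
__rmqueue_fallback() walks the fallbacks[] table defined earlier, trying each alternative migrate type per order from the largest order downwards before finally dipping into MIGRATE_RESERVE. The toy program below only prints the order in which the alternatives are consulted; its enum and names[] table are simplified stand-ins, not the kernel's definitions.

/* Print the fallback order __rmqueue_fallback() would follow for each
 * starting migrate type; mirrors the fallbacks[] table shown above. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char *names[MIGRATE_TYPES] = {
	"unmovable", "reclaimable", "movable", "reserve"
};

static const int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE },
};

int main(void)
{
	for (int start = 0; start < MIGRATE_TYPES; start++) {
		printf("%-11s falls back to:", names[start]);
		for (int i = 0; i < MIGRATE_TYPES - 1; i++)
			printf(" %s", names[fallbacks[start][i]]);
		printf("\n");
	}
	return 0;
}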
837 | |||
838 | /* | ||
839 | * Do the hard work of removing an element from the buddy allocator. | ||
840 | * Call me with the zone->lock already held. | ||
841 | */ | ||
842 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | ||
843 | int migratetype) | ||
844 | { | ||
845 | struct page *page; | ||
846 | |||
847 | page = __rmqueue_smallest(zone, order, migratetype); | ||
848 | |||
849 | if (unlikely(!page)) | ||
850 | page = __rmqueue_fallback(zone, order, migratetype); | ||
851 | |||
852 | return page; | ||
853 | } | ||
854 | |||
658 | /* | 855 | /* |
659 | * Obtain a specified number of elements from the buddy allocator, all under | 856 | * Obtain a specified number of elements from the buddy allocator, all under |
660 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 857 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
661 | * Returns the number of new pages which were placed at *list. | 858 | * Returns the number of new pages which were placed at *list. |
662 | */ | 859 | */ |
663 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 860 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
664 | unsigned long count, struct list_head *list) | 861 | unsigned long count, struct list_head *list, |
862 | int migratetype) | ||
665 | { | 863 | { |
666 | int i; | 864 | int i; |
667 | 865 | ||
668 | spin_lock(&zone->lock); | 866 | spin_lock(&zone->lock); |
669 | for (i = 0; i < count; ++i) { | 867 | for (i = 0; i < count; ++i) { |
670 | struct page *page = __rmqueue(zone, order); | 868 | struct page *page = __rmqueue(zone, order, migratetype); |
671 | if (unlikely(page == NULL)) | 869 | if (unlikely(page == NULL)) |
672 | break; | 870 | break; |
673 | list_add_tail(&page->lru, list); | 871 | list_add(&page->lru, list); |
872 | set_page_private(page, migratetype); | ||
674 | } | 873 | } |
675 | spin_unlock(&zone->lock); | 874 | spin_unlock(&zone->lock); |
676 | return i; | 875 | return i; |
@@ -732,7 +931,7 @@ void mark_free_pages(struct zone *zone) | |||
732 | { | 931 | { |
733 | unsigned long pfn, max_zone_pfn; | 932 | unsigned long pfn, max_zone_pfn; |
734 | unsigned long flags; | 933 | unsigned long flags; |
735 | int order; | 934 | int order, t; |
736 | struct list_head *curr; | 935 | struct list_head *curr; |
737 | 936 | ||
738 | if (!zone->spanned_pages) | 937 | if (!zone->spanned_pages) |
@@ -749,17 +948,18 @@ void mark_free_pages(struct zone *zone) | |||
749 | swsusp_unset_page_free(page); | 948 | swsusp_unset_page_free(page); |
750 | } | 949 | } |
751 | 950 | ||
752 | for (order = MAX_ORDER - 1; order >= 0; --order) | 951 | for_each_migratetype_order(order, t) { |
753 | list_for_each(curr, &zone->free_area[order].free_list) { | 952 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
754 | unsigned long i; | 953 | unsigned long i; |
755 | 954 | ||
756 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 955 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
757 | for (i = 0; i < (1UL << order); i++) | 956 | for (i = 0; i < (1UL << order); i++) |
758 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 957 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
759 | } | 958 | } |
760 | 959 | } | |
761 | spin_unlock_irqrestore(&zone->lock, flags); | 960 | spin_unlock_irqrestore(&zone->lock, flags); |
762 | } | 961 | } |
962 | #endif /* CONFIG_PM */ | ||
763 | 963 | ||
764 | /* | 964 | /* |
765 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 965 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
@@ -772,7 +972,25 @@ void drain_local_pages(void) | |||
772 | __drain_pages(smp_processor_id()); | 972 | __drain_pages(smp_processor_id()); |
773 | local_irq_restore(flags); | 973 | local_irq_restore(flags); |
774 | } | 974 | } |
775 | #endif /* CONFIG_HIBERNATION */ | 975 | |
976 | void smp_drain_local_pages(void *arg) | ||
977 | { | ||
978 | drain_local_pages(); | ||
979 | } | ||
980 | |||
981 | /* | ||
982 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
983 | */ | ||
984 | void drain_all_local_pages(void) | ||
985 | { | ||
986 | unsigned long flags; | ||
987 | |||
988 | local_irq_save(flags); | ||
989 | __drain_pages(smp_processor_id()); | ||
990 | local_irq_restore(flags); | ||
991 | |||
992 | smp_call_function(smp_drain_local_pages, NULL, 0, 1); | ||
993 | } | ||
776 | 994 | ||
777 | /* | 995 | /* |
778 | * Free a 0-order page | 996 | * Free a 0-order page |
@@ -797,6 +1015,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
797 | local_irq_save(flags); | 1015 | local_irq_save(flags); |
798 | __count_vm_event(PGFREE); | 1016 | __count_vm_event(PGFREE); |
799 | list_add(&page->lru, &pcp->list); | 1017 | list_add(&page->lru, &pcp->list); |
1018 | set_page_private(page, get_pageblock_migratetype(page)); | ||
800 | pcp->count++; | 1019 | pcp->count++; |
801 | if (pcp->count >= pcp->high) { | 1020 | if (pcp->count >= pcp->high) { |
802 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1021 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
@@ -846,6 +1065,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, | |||
846 | struct page *page; | 1065 | struct page *page; |
847 | int cold = !!(gfp_flags & __GFP_COLD); | 1066 | int cold = !!(gfp_flags & __GFP_COLD); |
848 | int cpu; | 1067 | int cpu; |
1068 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
849 | 1069 | ||
850 | again: | 1070 | again: |
851 | cpu = get_cpu(); | 1071 | cpu = get_cpu(); |
@@ -856,16 +1076,28 @@ again: | |||
856 | local_irq_save(flags); | 1076 | local_irq_save(flags); |
857 | if (!pcp->count) { | 1077 | if (!pcp->count) { |
858 | pcp->count = rmqueue_bulk(zone, 0, | 1078 | pcp->count = rmqueue_bulk(zone, 0, |
859 | pcp->batch, &pcp->list); | 1079 | pcp->batch, &pcp->list, migratetype); |
860 | if (unlikely(!pcp->count)) | 1080 | if (unlikely(!pcp->count)) |
861 | goto failed; | 1081 | goto failed; |
862 | } | 1082 | } |
863 | page = list_entry(pcp->list.next, struct page, lru); | 1083 | |
1084 | /* Find a page of the appropriate migrate type */ | ||
1085 | list_for_each_entry(page, &pcp->list, lru) | ||
1086 | if (page_private(page) == migratetype) | ||
1087 | break; | ||
1088 | |||
1089 | /* Allocate more to the pcp list if necessary */ | ||
1090 | if (unlikely(&page->lru == &pcp->list)) { | ||
1091 | pcp->count += rmqueue_bulk(zone, 0, | ||
1092 | pcp->batch, &pcp->list, migratetype); | ||
1093 | page = list_entry(pcp->list.next, struct page, lru); | ||
1094 | } | ||
1095 | |||
864 | list_del(&page->lru); | 1096 | list_del(&page->lru); |
865 | pcp->count--; | 1097 | pcp->count--; |
866 | } else { | 1098 | } else { |
867 | spin_lock_irqsave(&zone->lock, flags); | 1099 | spin_lock_irqsave(&zone->lock, flags); |
868 | page = __rmqueue(zone, order); | 1100 | page = __rmqueue(zone, order, migratetype); |
869 | spin_unlock(&zone->lock); | 1101 | spin_unlock(&zone->lock); |
870 | if (!page) | 1102 | if (!page) |
871 | goto failed; | 1103 | goto failed; |
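
When the per-cpu list holds no page of the wanted migrate type, the list_for_each_entry() walk above ends with the iterator pointing back at the list head, which is what the "&page->lru == &pcp->list" test detects before refilling the list. A self-contained userspace sketch of that sentinel idiom, with invented structure and helper names:

/* The "did the walk hit the list head?" check used to detect that no page
 * of the wanted type sits on the per-cpu list.  Minimal intrusive list. */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_page { int migratetype; struct list_head lru; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev; new->next = head;
	head->prev->next = new; head->prev = new;
}

int main(void)
{
	struct list_head pcp = { &pcp, &pcp };
	struct fake_page pages[3] = { { 0 }, { 2 }, { 0 } };
	int wanted = 1;	/* no page of this type is on the list */
	struct fake_page *page = NULL;
	struct list_head *pos;

	for (int i = 0; i < 3; i++)
		list_add_tail(&pages[i].lru, &pcp);

	for (pos = pcp.next; pos != &pcp; pos = pos->next) {
		page = container_of(pos, struct fake_page, lru);
		if (page->migratetype == wanted)
			break;
	}

	if (pos == &pcp)	/* walked off the end: nothing matched */
		printf("no page of type %d on the pcp list, refill needed\n", wanted);
	else
		printf("found page of type %d\n", page->migratetype);
	return 0;
}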
@@ -1032,7 +1264,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1032 | * | 1264 | * |
1033 | * If the zonelist cache is present in the passed in zonelist, then | 1265 | * If the zonelist cache is present in the passed in zonelist, then |
1034 | * returns a pointer to the allowed node mask (either the current | 1266 | * returns a pointer to the allowed node mask (either the current |
1035 | * tasks mems_allowed, or node_online_map.) | 1267 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) |
1036 | * | 1268 | * |
1037 | * If the zonelist cache is not available for this zonelist, does | 1269 | * If the zonelist cache is not available for this zonelist, does |
1038 | * nothing and returns NULL. | 1270 | * nothing and returns NULL. |
@@ -1061,7 +1293,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1061 | 1293 | ||
1062 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1294 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1063 | &cpuset_current_mems_allowed : | 1295 | &cpuset_current_mems_allowed : |
1064 | &node_online_map; | 1296 | &node_states[N_HIGH_MEMORY]; |
1065 | return allowednodes; | 1297 | return allowednodes; |
1066 | } | 1298 | } |
1067 | 1299 | ||
@@ -1183,9 +1415,6 @@ zonelist_scan: | |||
1183 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1415 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1184 | continue; | 1416 | continue; |
1185 | zone = *z; | 1417 | zone = *z; |
1186 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
1187 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
1188 | break; | ||
1189 | if ((alloc_flags & ALLOC_CPUSET) && | 1418 | if ((alloc_flags & ALLOC_CPUSET) && |
1190 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1419 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1191 | goto try_next_zone; | 1420 | goto try_next_zone; |
@@ -1254,7 +1483,10 @@ restart: | |||
1254 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1483 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
1255 | 1484 | ||
1256 | if (unlikely(*z == NULL)) { | 1485 | if (unlikely(*z == NULL)) { |
1257 | /* Should this ever happen?? */ | 1486 | /* |
1487 | * Happens if we have an empty zonelist as a result of | ||
1488 | * GFP_THISNODE being used on a memoryless node | ||
1489 | */ | ||
1258 | return NULL; | 1490 | return NULL; |
1259 | } | 1491 | } |
1260 | 1492 | ||
@@ -1346,12 +1578,20 @@ nofail_alloc: | |||
1346 | 1578 | ||
1347 | cond_resched(); | 1579 | cond_resched(); |
1348 | 1580 | ||
1581 | if (order != 0) | ||
1582 | drain_all_local_pages(); | ||
1583 | |||
1349 | if (likely(did_some_progress)) { | 1584 | if (likely(did_some_progress)) { |
1350 | page = get_page_from_freelist(gfp_mask, order, | 1585 | page = get_page_from_freelist(gfp_mask, order, |
1351 | zonelist, alloc_flags); | 1586 | zonelist, alloc_flags); |
1352 | if (page) | 1587 | if (page) |
1353 | goto got_pg; | 1588 | goto got_pg; |
1354 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1589 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1590 | if (!try_set_zone_oom(zonelist)) { | ||
1591 | schedule_timeout_uninterruptible(1); | ||
1592 | goto restart; | ||
1593 | } | ||
1594 | |||
1355 | /* | 1595 | /* |
1356 | * Go through the zonelist yet one more time, keep | 1596 | * Go through the zonelist yet one more time, keep |
1357 | * very high watermark here, this is only to catch | 1597 | * very high watermark here, this is only to catch |
@@ -1360,14 +1600,19 @@ nofail_alloc: | |||
1360 | */ | 1600 | */ |
1361 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1601 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1362 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1602 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
1363 | if (page) | 1603 | if (page) { |
1604 | clear_zonelist_oom(zonelist); | ||
1364 | goto got_pg; | 1605 | goto got_pg; |
1606 | } | ||
1365 | 1607 | ||
1366 | /* The OOM killer will not help higher order allocs so fail */ | 1608 | /* The OOM killer will not help higher order allocs so fail */ |
1367 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 1609 | if (order > PAGE_ALLOC_COSTLY_ORDER) { |
1610 | clear_zonelist_oom(zonelist); | ||
1368 | goto nopage; | 1611 | goto nopage; |
1612 | } | ||
1369 | 1613 | ||
1370 | out_of_memory(zonelist, gfp_mask, order); | 1614 | out_of_memory(zonelist, gfp_mask, order); |
1615 | clear_zonelist_oom(zonelist); | ||
1371 | goto restart; | 1616 | goto restart; |
1372 | } | 1617 | } |
1373 | 1618 | ||
@@ -1616,7 +1861,7 @@ void show_free_areas(void) | |||
1616 | K(zone_page_state(zone, NR_INACTIVE)), | 1861 | K(zone_page_state(zone, NR_INACTIVE)), |
1617 | K(zone->present_pages), | 1862 | K(zone->present_pages), |
1618 | zone->pages_scanned, | 1863 | zone->pages_scanned, |
1619 | (zone->all_unreclaimable ? "yes" : "no") | 1864 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
1620 | ); | 1865 | ); |
1621 | printk("lowmem_reserve[]:"); | 1866 | printk("lowmem_reserve[]:"); |
1622 | for (i = 0; i < MAX_NR_ZONES; i++) | 1867 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -1794,7 +2039,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1794 | return node; | 2039 | return node; |
1795 | } | 2040 | } |
1796 | 2041 | ||
1797 | for_each_online_node(n) { | 2042 | for_each_node_state(n, N_HIGH_MEMORY) { |
1798 | cpumask_t tmp; | 2043 | cpumask_t tmp; |
1799 | 2044 | ||
1800 | /* Don't want a node to appear more than once */ | 2045 | /* Don't want a node to appear more than once */ |
@@ -1850,6 +2095,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
1850 | } | 2095 | } |
1851 | 2096 | ||
1852 | /* | 2097 | /* |
2098 | * Build gfp_thisnode zonelists | ||
2099 | */ | ||
2100 | static void build_thisnode_zonelists(pg_data_t *pgdat) | ||
2101 | { | ||
2102 | enum zone_type i; | ||
2103 | int j; | ||
2104 | struct zonelist *zonelist; | ||
2105 | |||
2106 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
2107 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | ||
2108 | j = build_zonelists_node(pgdat, zonelist, 0, i); | ||
2109 | zonelist->zones[j] = NULL; | ||
2110 | } | ||
2111 | } | ||
2112 | |||
2113 | /* | ||
1853 | * Build zonelists ordered by zone and nodes within zones. | 2114 | * Build zonelists ordered by zone and nodes within zones. |
1854 | * This results in conserving DMA zone[s] until all Normal memory is | 2115 | * This results in conserving DMA zone[s] until all Normal memory is |
1855 | * exhausted, but results in overflowing to remote node while memory | 2116 | * exhausted, but results in overflowing to remote node while memory |
@@ -1915,7 +2176,8 @@ static int default_zonelist_order(void) | |||
1915 | * If there is a node whose DMA/DMA32 memory is very big area on | 2176 | * If there is a node whose DMA/DMA32 memory is very big area on |
1916 | * local memory, NODE_ORDER may be suitable. | 2177 | * local memory, NODE_ORDER may be suitable. |
1917 | */ | 2178 | */ |
1918 | average_size = total_size / (num_online_nodes() + 1); | 2179 | average_size = total_size / |
2180 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | ||
1919 | for_each_online_node(nid) { | 2181 | for_each_online_node(nid) { |
1920 | low_kmem_size = 0; | 2182 | low_kmem_size = 0; |
1921 | total_size = 0; | 2183 | total_size = 0; |
@@ -1953,7 +2215,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
1953 | int order = current_zonelist_order; | 2215 | int order = current_zonelist_order; |
1954 | 2216 | ||
1955 | /* initialize zonelists */ | 2217 | /* initialize zonelists */ |
1956 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2218 | for (i = 0; i < MAX_ZONELISTS; i++) { |
1957 | zonelist = pgdat->node_zonelists + i; | 2219 | zonelist = pgdat->node_zonelists + i; |
1958 | zonelist->zones[0] = NULL; | 2220 | zonelist->zones[0] = NULL; |
1959 | } | 2221 | } |
@@ -1998,6 +2260,8 @@ static void build_zonelists(pg_data_t *pgdat) | |||
1998 | /* calculate node order -- i.e., DMA last! */ | 2260 | /* calculate node order -- i.e., DMA last! */ |
1999 | build_zonelists_in_zone_order(pgdat, j); | 2261 | build_zonelists_in_zone_order(pgdat, j); |
2000 | } | 2262 | } |
2263 | |||
2264 | build_thisnode_zonelists(pgdat); | ||
2001 | } | 2265 | } |
2002 | 2266 | ||
2003 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2267 | /* Construct the zonelist performance cache - see further mmzone.h */ |
@@ -2078,8 +2342,10 @@ static int __build_all_zonelists(void *dummy) | |||
2078 | int nid; | 2342 | int nid; |
2079 | 2343 | ||
2080 | for_each_online_node(nid) { | 2344 | for_each_online_node(nid) { |
2081 | build_zonelists(NODE_DATA(nid)); | 2345 | pg_data_t *pgdat = NODE_DATA(nid); |
2082 | build_zonelist_cache(NODE_DATA(nid)); | 2346 | |
2347 | build_zonelists(pgdat); | ||
2348 | build_zonelist_cache(pgdat); | ||
2083 | } | 2349 | } |
2084 | return 0; | 2350 | return 0; |
2085 | } | 2351 | } |
@@ -2098,9 +2364,23 @@ void build_all_zonelists(void) | |||
2098 | /* cpuset refresh routine should be here */ | 2364 | /* cpuset refresh routine should be here */ |
2099 | } | 2365 | } |
2100 | vm_total_pages = nr_free_pagecache_pages(); | 2366 | vm_total_pages = nr_free_pagecache_pages(); |
2101 | printk("Built %i zonelists in %s order. Total pages: %ld\n", | 2367 | /* |
2368 | * Disable grouping by mobility if the number of pages in the | ||
2369 | * system is too low to allow the mechanism to work. It would be | ||
2370 | * more accurate, but expensive to check per-zone. This check is | ||
2371 | * made on memory-hotadd so a system can start with mobility | ||
2372 | * disabled and enable it later | ||
2373 | */ | ||
2374 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | ||
2375 | page_group_by_mobility_disabled = 1; | ||
2376 | else | ||
2377 | page_group_by_mobility_disabled = 0; | ||
2378 | |||
2379 | printk("Built %i zonelists in %s order, mobility grouping %s. " | ||
2380 | "Total pages: %ld\n", | ||
2102 | num_online_nodes(), | 2381 | num_online_nodes(), |
2103 | zonelist_order_name[current_zonelist_order], | 2382 | zonelist_order_name[current_zonelist_order], |
2383 | page_group_by_mobility_disabled ? "off" : "on", | ||
2104 | vm_total_pages); | 2384 | vm_total_pages); |
2105 | #ifdef CONFIG_NUMA | 2385 | #ifdef CONFIG_NUMA |
2106 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 2386 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
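
The threshold used above to disable grouping by mobility is pageblock_nr_pages * MIGRATE_TYPES. A rough sense of its size, assuming 4K pages, 512 pages per pageblock and five migrate types (all three numbers are illustrative, not taken from a specific config):

/* Rough size of the cutoff below which mobility grouping is disabled. */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* assumed */
	unsigned long migrate_types = 5;	/* assumed */
	unsigned long page_size = 4096;		/* assumed */

	unsigned long threshold_pages = pageblock_nr_pages * migrate_types;
	printf("threshold: %lu pages (~%lu MB)\n",
	       threshold_pages, threshold_pages * page_size >> 20);
	return 0;
}

With these values the cutoff is about 10MB, so only very small systems start with grouping disabled.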
@@ -2176,6 +2456,61 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
2176 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 2456 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
2177 | 2457 | ||
2178 | /* | 2458 | /* |
2459 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | ||
2460 | * of blocks reserved is based on zone->pages_min. The memory within the | ||
2461 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | ||
2462 | * higher will lead to a bigger reserve which will get freed as contiguous | ||
2463 | * blocks as reclaim kicks in | ||
2464 | */ | ||
2465 | static void setup_zone_migrate_reserve(struct zone *zone) | ||
2466 | { | ||
2467 | unsigned long start_pfn, pfn, end_pfn; | ||
2468 | struct page *page; | ||
2469 | unsigned long reserve, block_migratetype; | ||
2470 | |||
2471 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | ||
2472 | start_pfn = zone->zone_start_pfn; | ||
2473 | end_pfn = start_pfn + zone->spanned_pages; | ||
2474 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | ||
2475 | pageblock_order; | ||
2476 | |||
2477 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
2478 | if (!pfn_valid(pfn)) | ||
2479 | continue; | ||
2480 | page = pfn_to_page(pfn); | ||
2481 | |||
2482 | /* Blocks with reserved pages will never be freed, so skip them. */ | ||
2483 | if (PageReserved(page)) | ||
2484 | continue; | ||
2485 | |||
2486 | block_migratetype = get_pageblock_migratetype(page); | ||
2487 | |||
2488 | /* If this block is reserved, account for it */ | ||
2489 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | ||
2490 | reserve--; | ||
2491 | continue; | ||
2492 | } | ||
2493 | |||
2494 | /* Suitable for reserving if this block is movable */ | ||
2495 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | ||
2496 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | ||
2497 | move_freepages_block(zone, page, MIGRATE_RESERVE); | ||
2498 | reserve--; | ||
2499 | continue; | ||
2500 | } | ||
2501 | |||
2502 | /* | ||
2503 | * If the reserve is met and this is a previous reserved block, | ||
2504 | * take it back | ||
2505 | */ | ||
2506 | if (block_migratetype == MIGRATE_RESERVE) { | ||
2507 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
2508 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
2509 | } | ||
2510 | } | ||
2511 | } | ||
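
The number of pageblocks kept in MIGRATE_RESERVE is derived from zone->pages_min rounded up to whole pageblocks. A worked example of that calculation, with an assumed pages_min and pageblock geometry:

/* How many pageblocks setup_zone_migrate_reserve() tries to keep reserved,
 * for assumed pages_min = 1379 and pageblock_order = 9. */
#include <stdio.h>

static unsigned long roundup_to(unsigned long x, unsigned long step)
{
	return ((x + step - 1) / step) * step;
}

int main(void)
{
	unsigned long pageblock_order = 9;		/* assumed */
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long pages_min = 1379;			/* assumed zone->pages_min */

	unsigned long reserve = roundup_to(pages_min, pageblock_nr_pages)
						>> pageblock_order;
	printf("reserve %lu pageblock(s) (%lu pages)\n",
	       reserve, reserve * pageblock_nr_pages);
	return 0;
}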
2512 | |||
2513 | /* | ||
2179 | * Initially all pages are reserved - free ones are freed | 2514 | * Initially all pages are reserved - free ones are freed |
2180 | * up by free_all_bootmem() once the early boot process is | 2515 | * up by free_all_bootmem() once the early boot process is |
2181 | * done. Non-atomic initialization, single-pass. | 2516 | * done. Non-atomic initialization, single-pass. |
@@ -2204,6 +2539,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2204 | init_page_count(page); | 2539 | init_page_count(page); |
2205 | reset_page_mapcount(page); | 2540 | reset_page_mapcount(page); |
2206 | SetPageReserved(page); | 2541 | SetPageReserved(page); |
2542 | |||
2543 | /* | ||
2544 | * Mark the block movable so that blocks are reserved for | ||
2545 | * movable allocations at startup. This will force kernel allocations | ||
2546 | * to reserve their blocks rather than leaking throughout | ||
2547 | * the address space during boot when many long-lived | ||
2548 | * kernel allocations are made. Later some blocks near | ||
2549 | * the start are marked MIGRATE_RESERVE by | ||
2550 | * setup_zone_migrate_reserve() | ||
2551 | */ | ||
2552 | if ((pfn & (pageblock_nr_pages-1))) | ||
2553 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
2554 | |||
2207 | INIT_LIST_HEAD(&page->lru); | 2555 | INIT_LIST_HEAD(&page->lru); |
2208 | #ifdef WANT_PAGE_VIRTUAL | 2556 | #ifdef WANT_PAGE_VIRTUAL |
2209 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 2557 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
@@ -2216,9 +2564,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2216 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, | 2564 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
2217 | struct zone *zone, unsigned long size) | 2565 | struct zone *zone, unsigned long size) |
2218 | { | 2566 | { |
2219 | int order; | 2567 | int order, t; |
2220 | for (order = 0; order < MAX_ORDER ; order++) { | 2568 | for_each_migratetype_order(order, t) { |
2221 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 2569 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
2222 | zone->free_area[order].nr_free = 0; | 2570 | zone->free_area[order].nr_free = 0; |
2223 | } | 2571 | } |
2224 | } | 2572 | } |
@@ -2324,6 +2672,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS]; | |||
2324 | static int __cpuinit process_zones(int cpu) | 2672 | static int __cpuinit process_zones(int cpu) |
2325 | { | 2673 | { |
2326 | struct zone *zone, *dzone; | 2674 | struct zone *zone, *dzone; |
2675 | int node = cpu_to_node(cpu); | ||
2676 | |||
2677 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
2327 | 2678 | ||
2328 | for_each_zone(zone) { | 2679 | for_each_zone(zone) { |
2329 | 2680 | ||
@@ -2331,7 +2682,7 @@ static int __cpuinit process_zones(int cpu) | |||
2331 | continue; | 2682 | continue; |
2332 | 2683 | ||
2333 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2684 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
2334 | GFP_KERNEL, cpu_to_node(cpu)); | 2685 | GFP_KERNEL, node); |
2335 | if (!zone_pcp(zone, cpu)) | 2686 | if (!zone_pcp(zone, cpu)) |
2336 | goto bad; | 2687 | goto bad; |
2337 | 2688 | ||
@@ -2444,7 +2795,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
2444 | * To use this new node's memory, further consideration will be | 2795 | * To use this new node's memory, further consideration will be |
2445 | * necessary. | 2796 | * necessary. |
2446 | */ | 2797 | */ |
2447 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | 2798 | zone->wait_table = vmalloc(alloc_size); |
2448 | } | 2799 | } |
2449 | if (!zone->wait_table) | 2800 | if (!zone->wait_table) |
2450 | return -ENOMEM; | 2801 | return -ENOMEM; |
@@ -2680,10 +3031,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
2680 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 3031 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
2681 | } | 3032 | } |
2682 | 3033 | ||
2683 | if (*start_pfn == -1UL) { | 3034 | if (*start_pfn == -1UL) |
2684 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
2685 | *start_pfn = 0; | 3035 | *start_pfn = 0; |
2686 | } | ||
2687 | 3036 | ||
2688 | /* Push the node boundaries out if requested */ | 3037 | /* Push the node boundaries out if requested */ |
2689 | account_node_boundary(nid, start_pfn, end_pfn); | 3038 | account_node_boundary(nid, start_pfn, end_pfn); |
@@ -2901,6 +3250,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
2901 | realtotalpages); | 3250 | realtotalpages); |
2902 | } | 3251 | } |
2903 | 3252 | ||
3253 | #ifndef CONFIG_SPARSEMEM | ||
3254 | /* | ||
3255 | * Calculate the size of the zone->blockflags rounded to an unsigned long | ||
3256 | * Start by making sure zonesize is a multiple of pageblock_order by rounding | ||
3257 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally | ||
3258 | * round what is now in bits to nearest long in bits, then return it in | ||
3259 | * bytes. | ||
3260 | */ | ||
3261 | static unsigned long __init usemap_size(unsigned long zonesize) | ||
3262 | { | ||
3263 | unsigned long usemapsize; | ||
3264 | |||
3265 | usemapsize = roundup(zonesize, pageblock_nr_pages); | ||
3266 | usemapsize = usemapsize >> pageblock_order; | ||
3267 | usemapsize *= NR_PAGEBLOCK_BITS; | ||
3268 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | ||
3269 | |||
3270 | return usemapsize / 8; | ||
3271 | } | ||
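
usemap_size() works purely in units: pages, then pageblocks, then bits, then longs, then bytes. A worked example follows; the zone size, pageblock_order and bits-per-pageblock are all assumed for illustration.

/* Worked example of the usemap_size() arithmetic: a zone of 2^20 pages with
 * pageblock_order = 9 and 4 bits per pageblock needs 1KB of flag storage. */
#include <stdio.h>

static unsigned long roundup_to(unsigned long x, unsigned long step)
{
	return ((x + step - 1) / step) * step;
}

int main(void)
{
	unsigned long zonesize = 1UL << 20;		/* pages in the zone, assumed */
	unsigned long pageblock_order = 9;		/* assumed */
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long nr_pageblock_bits = 4;		/* assumed */

	unsigned long usemapsize = roundup_to(zonesize, pageblock_nr_pages);
	usemapsize >>= pageblock_order;			/* pageblocks */
	usemapsize *= nr_pageblock_bits;		/* bits */
	usemapsize = roundup_to(usemapsize, 8 * sizeof(unsigned long));

	printf("%lu bytes of pageblock flags\n", usemapsize / 8);
	return 0;
}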
3272 | |||
3273 | static void __init setup_usemap(struct pglist_data *pgdat, | ||
3274 | struct zone *zone, unsigned long zonesize) | ||
3275 | { | ||
3276 | unsigned long usemapsize = usemap_size(zonesize); | ||
3277 | zone->pageblock_flags = NULL; | ||
3278 | if (usemapsize) { | ||
3279 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | ||
3280 | memset(zone->pageblock_flags, 0, usemapsize); | ||
3281 | } | ||
3282 | } | ||
3283 | #else | ||
3284 | static void inline setup_usemap(struct pglist_data *pgdat, | ||
3285 | struct zone *zone, unsigned long zonesize) {} | ||
3286 | #endif /* CONFIG_SPARSEMEM */ | ||
3287 | |||
3288 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
3289 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | ||
3290 | static inline void __init set_pageblock_order(unsigned int order) | ||
3291 | { | ||
3292 | /* Check that pageblock_nr_pages has not already been setup */ | ||
3293 | if (pageblock_order) | ||
3294 | return; | ||
3295 | |||
3296 | /* | ||
3297 | * Assume the largest contiguous order of interest is a huge page. | ||
3298 | * This value may be variable depending on boot parameters on IA64 | ||
3299 | */ | ||
3300 | pageblock_order = order; | ||
3301 | } | ||
3302 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
3303 | |||
3304 | /* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */ | ||
3305 | #define set_pageblock_order(x) do {} while (0) | ||
3306 | |||
3307 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
3308 | |||
2904 | /* | 3309 | /* |
2905 | * Set up the zone data structures: | 3310 | * Set up the zone data structures: |
2906 | * - mark all pages reserved | 3311 | * - mark all pages reserved |
@@ -2977,10 +3382,12 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2977 | zone->nr_scan_active = 0; | 3382 | zone->nr_scan_active = 0; |
2978 | zone->nr_scan_inactive = 0; | 3383 | zone->nr_scan_inactive = 0; |
2979 | zap_zone_vm_stats(zone); | 3384 | zap_zone_vm_stats(zone); |
2980 | atomic_set(&zone->reclaim_in_progress, 0); | 3385 | zone->flags = 0; |
2981 | if (!size) | 3386 | if (!size) |
2982 | continue; | 3387 | continue; |
2983 | 3388 | ||
3389 | set_pageblock_order(HUGETLB_PAGE_ORDER); | ||
3390 | setup_usemap(pgdat, zone, size); | ||
2984 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 3391 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
2985 | size, MEMMAP_EARLY); | 3392 | size, MEMMAP_EARLY); |
2986 | BUG_ON(ret); | 3393 | BUG_ON(ret); |
@@ -3234,16 +3641,24 @@ unsigned long __init find_max_pfn_with_active_regions(void) | |||
3234 | return max_pfn; | 3641 | return max_pfn; |
3235 | } | 3642 | } |
3236 | 3643 | ||
3237 | unsigned long __init early_calculate_totalpages(void) | 3644 | /* |
3645 | * early_calculate_totalpages() | ||
3646 | * Sum pages in active regions for movable zone. | ||
3647 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | ||
3648 | */ | ||
3649 | static unsigned long __init early_calculate_totalpages(void) | ||
3238 | { | 3650 | { |
3239 | int i; | 3651 | int i; |
3240 | unsigned long totalpages = 0; | 3652 | unsigned long totalpages = 0; |
3241 | 3653 | ||
3242 | for (i = 0; i < nr_nodemap_entries; i++) | 3654 | for (i = 0; i < nr_nodemap_entries; i++) { |
3243 | totalpages += early_node_map[i].end_pfn - | 3655 | unsigned long pages = early_node_map[i].end_pfn - |
3244 | early_node_map[i].start_pfn; | 3656 | early_node_map[i].start_pfn; |
3245 | 3657 | totalpages += pages; | |
3246 | return totalpages; | 3658 | if (pages) |
3659 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); | ||
3660 | } | ||
3661 | return totalpages; | ||
3247 | } | 3662 | } |
3248 | 3663 | ||
3249 | /* | 3664 | /* |
@@ -3257,7 +3672,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3257 | int i, nid; | 3672 | int i, nid; |
3258 | unsigned long usable_startpfn; | 3673 | unsigned long usable_startpfn; |
3259 | unsigned long kernelcore_node, kernelcore_remaining; | 3674 | unsigned long kernelcore_node, kernelcore_remaining; |
3260 | int usable_nodes = num_online_nodes(); | 3675 | unsigned long totalpages = early_calculate_totalpages(); |
3676 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | ||
3261 | 3677 | ||
3262 | /* | 3678 | /* |
3263 | * If movablecore was specified, calculate what size of | 3679 | * If movablecore was specified, calculate what size of |
@@ -3268,7 +3684,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3268 | * what movablecore would have allowed. | 3684 | * what movablecore would have allowed. |
3269 | */ | 3685 | */ |
3270 | if (required_movablecore) { | 3686 | if (required_movablecore) { |
3271 | unsigned long totalpages = early_calculate_totalpages(); | ||
3272 | unsigned long corepages; | 3687 | unsigned long corepages; |
3273 | 3688 | ||
3274 | /* | 3689 | /* |
@@ -3293,7 +3708,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3293 | restart: | 3708 | restart: |
3294 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 3709 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
3295 | kernelcore_node = required_kernelcore / usable_nodes; | 3710 | kernelcore_node = required_kernelcore / usable_nodes; |
3296 | for_each_online_node(nid) { | 3711 | for_each_node_state(nid, N_HIGH_MEMORY) { |
3297 | /* | 3712 | /* |
3298 | * Recalculate kernelcore_node if the division per node | 3713 | * Recalculate kernelcore_node if the division per node |
3299 | * now exceeds what is necessary to satisfy the requested | 3714 | * now exceeds what is necessary to satisfy the requested |
@@ -3385,6 +3800,20 @@ restart: | |||
3385 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 3800 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
3386 | } | 3801 | } |
3387 | 3802 | ||
3803 | /* Any regular memory on that node ? */ | ||
3804 | static void check_for_regular_memory(pg_data_t *pgdat) | ||
3805 | { | ||
3806 | #ifdef CONFIG_HIGHMEM | ||
3807 | enum zone_type zone_type; | ||
3808 | |||
3809 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | ||
3810 | struct zone *zone = &pgdat->node_zones[zone_type]; | ||
3811 | if (zone->present_pages) | ||
3812 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
3813 | } | ||
3814 | #endif | ||
3815 | } | ||
3816 | |||
3388 | /** | 3817 | /** |
3389 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 3818 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
3390 | * @max_zone_pfn: an array of max PFNs for each zone | 3819 | * @max_zone_pfn: an array of max PFNs for each zone |
@@ -3459,6 +3888,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
3459 | pg_data_t *pgdat = NODE_DATA(nid); | 3888 | pg_data_t *pgdat = NODE_DATA(nid); |
3460 | free_area_init_node(nid, pgdat, NULL, | 3889 | free_area_init_node(nid, pgdat, NULL, |
3461 | find_min_pfn_for_node(nid), NULL); | 3890 | find_min_pfn_for_node(nid), NULL); |
3891 | |||
3892 | /* Any memory on that node */ | ||
3893 | if (pgdat->node_present_pages) | ||
3894 | node_set_state(nid, N_HIGH_MEMORY); | ||
3895 | check_for_regular_memory(pgdat); | ||
3462 | } | 3896 | } |
3463 | } | 3897 | } |
3464 | 3898 | ||
@@ -3673,6 +4107,7 @@ void setup_per_zone_pages_min(void) | |||
3673 | 4107 | ||
3674 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4108 | zone->pages_low = zone->pages_min + (tmp >> 2); |
3675 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4109 | zone->pages_high = zone->pages_min + (tmp >> 1); |
4110 | setup_zone_migrate_reserve(zone); | ||
3676 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4111 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3677 | } | 4112 | } |
3678 | 4113 | ||
@@ -3934,4 +4369,169 @@ EXPORT_SYMBOL(pfn_to_page); | |||
3934 | EXPORT_SYMBOL(page_to_pfn); | 4369 | EXPORT_SYMBOL(page_to_pfn); |
3935 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 4370 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
3936 | 4371 | ||
4372 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | ||
4373 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | ||
4374 | unsigned long pfn) | ||
4375 | { | ||
4376 | #ifdef CONFIG_SPARSEMEM | ||
4377 | return __pfn_to_section(pfn)->pageblock_flags; | ||
4378 | #else | ||
4379 | return zone->pageblock_flags; | ||
4380 | #endif /* CONFIG_SPARSEMEM */ | ||
4381 | } | ||
4382 | |||
4383 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | ||
4384 | { | ||
4385 | #ifdef CONFIG_SPARSEMEM | ||
4386 | pfn &= (PAGES_PER_SECTION-1); | ||
4387 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
4388 | #else | ||
4389 | pfn = pfn - zone->zone_start_pfn; | ||
4390 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
4391 | #endif /* CONFIG_SPARSEMEM */ | ||
4392 | } | ||
4393 | |||
4394 | /** | ||
4395 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | ||
4396 | * @page: The page within the block of interest | ||
4397 | * @start_bitidx: The first bit of interest to retrieve | ||
4398 | * @end_bitidx: The last bit of interest | ||
4399 | * returns pageblock_bits flags | ||
4400 | */ | ||
4401 | unsigned long get_pageblock_flags_group(struct page *page, | ||
4402 | int start_bitidx, int end_bitidx) | ||
4403 | { | ||
4404 | struct zone *zone; | ||
4405 | unsigned long *bitmap; | ||
4406 | unsigned long pfn, bitidx; | ||
4407 | unsigned long flags = 0; | ||
4408 | unsigned long value = 1; | ||
4409 | |||
4410 | zone = page_zone(page); | ||
4411 | pfn = page_to_pfn(page); | ||
4412 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
4413 | bitidx = pfn_to_bitidx(zone, pfn); | ||
4414 | |||
4415 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
4416 | if (test_bit(bitidx + start_bitidx, bitmap)) | ||
4417 | flags |= value; | ||
4418 | |||
4419 | return flags; | ||
4420 | } | ||
4421 | |||
4422 | /** | ||
4423 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | ||
4424 | * @page: The page within the block of interest | ||
4425 | * @start_bitidx: The first bit of interest | ||
4426 | * @end_bitidx: The last bit of interest | ||
4427 | * @flags: The flags to set | ||
4428 | */ | ||
4429 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | ||
4430 | int start_bitidx, int end_bitidx) | ||
4431 | { | ||
4432 | struct zone *zone; | ||
4433 | unsigned long *bitmap; | ||
4434 | unsigned long pfn, bitidx; | ||
4435 | unsigned long value = 1; | ||
4436 | |||
4437 | zone = page_zone(page); | ||
4438 | pfn = page_to_pfn(page); | ||
4439 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
4440 | bitidx = pfn_to_bitidx(zone, pfn); | ||
4441 | |||
4442 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
4443 | if (flags & value) | ||
4444 | __set_bit(bitidx + start_bitidx, bitmap); | ||
4445 | else | ||
4446 | __clear_bit(bitidx + start_bitidx, bitmap); | ||
4447 | } | ||
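
get_pageblock_flags_group() and set_pageblock_flags_group() pack a small group of consecutive bits per pageblock into a flat bitmap and read or write them one bit at a time. A toy userspace version of that packing; the 4-bits-per-block figure and the helper names are assumptions made for the example.

/* Toy version of the per-pageblock bit packing: each block owns a group of
 * consecutive bits inside a flat array of longs. */
#include <stdio.h>

#define BITS_PER_BLOCK	4UL
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

static void set_block_flags(unsigned long *bitmap, unsigned long block,
			    unsigned long flags)
{
	unsigned long bitidx = block * BITS_PER_BLOCK;

	for (unsigned long value = 1, i = 0; i < BITS_PER_BLOCK; i++, value <<= 1) {
		if (flags & value)
			bitmap[(bitidx + i) / BITS_PER_LONG] |=
					1UL << ((bitidx + i) % BITS_PER_LONG);
		else
			bitmap[(bitidx + i) / BITS_PER_LONG] &=
					~(1UL << ((bitidx + i) % BITS_PER_LONG));
	}
}

static unsigned long get_block_flags(const unsigned long *bitmap,
				     unsigned long block)
{
	unsigned long bitidx = block * BITS_PER_BLOCK, flags = 0;

	for (unsigned long value = 1, i = 0; i < BITS_PER_BLOCK; i++, value <<= 1)
		if (bitmap[(bitidx + i) / BITS_PER_LONG] &
		    (1UL << ((bitidx + i) % BITS_PER_LONG)))
			flags |= value;
	return flags;
}

int main(void)
{
	unsigned long bitmap[4] = { 0 };

	set_block_flags(bitmap, 17, 2);	/* e.g. mark block 17 as type 2 */
	printf("block 17 flags: %lu\n", get_block_flags(bitmap, 17));
	printf("block 16 flags: %lu\n", get_block_flags(bitmap, 16));
	return 0;
}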
4448 | |||
4449 | /* | ||
4450 | * This is a helper function; please see page_isolation.c as well. | ||
4451 | * It sets/clears a pageblock's type to ISOLATE. | ||
4452 | * The page allocator never allocates memory from an ISOLATE block. | ||
4453 | */ | ||
4454 | |||
4455 | int set_migratetype_isolate(struct page *page) | ||
4456 | { | ||
4457 | struct zone *zone; | ||
4458 | unsigned long flags; | ||
4459 | int ret = -EBUSY; | ||
4460 | |||
4461 | zone = page_zone(page); | ||
4462 | spin_lock_irqsave(&zone->lock, flags); | ||
4463 | /* | ||
4464 | * In future, more migrate types will be able to be isolation target. | ||
4465 | */ | ||
4466 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | ||
4467 | goto out; | ||
4468 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
4469 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
4470 | ret = 0; | ||
4471 | out: | ||
4472 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4473 | if (!ret) | ||
4474 | drain_all_local_pages(); | ||
4475 | return ret; | ||
4476 | } | ||
3937 | 4477 | ||
4478 | void unset_migratetype_isolate(struct page *page) | ||
4479 | { | ||
4480 | struct zone *zone; | ||
4481 | unsigned long flags; | ||
4482 | zone = page_zone(page); | ||
4483 | spin_lock_irqsave(&zone->lock, flags); | ||
4484 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
4485 | goto out; | ||
4486 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
4487 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
4488 | out: | ||
4489 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4490 | } | ||
4491 | |||
4492 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
4493 | /* | ||
4494 | * All pages in the range must be isolated before calling this. | ||
4495 | */ | ||
4496 | void | ||
4497 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
4498 | { | ||
4499 | struct page *page; | ||
4500 | struct zone *zone; | ||
4501 | int order, i; | ||
4502 | unsigned long pfn; | ||
4503 | unsigned long flags; | ||
4504 | /* find the first valid pfn */ | ||
4505 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
4506 | if (pfn_valid(pfn)) | ||
4507 | break; | ||
4508 | if (pfn == end_pfn) | ||
4509 | return; | ||
4510 | zone = page_zone(pfn_to_page(pfn)); | ||
4511 | spin_lock_irqsave(&zone->lock, flags); | ||
4512 | pfn = start_pfn; | ||
4513 | while (pfn < end_pfn) { | ||
4514 | if (!pfn_valid(pfn)) { | ||
4515 | pfn++; | ||
4516 | continue; | ||
4517 | } | ||
4518 | page = pfn_to_page(pfn); | ||
4519 | BUG_ON(page_count(page)); | ||
4520 | BUG_ON(!PageBuddy(page)); | ||
4521 | order = page_order(page); | ||
4522 | #ifdef CONFIG_DEBUG_VM | ||
4523 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | ||
4524 | pfn, 1 << order, end_pfn); | ||
4525 | #endif | ||
4526 | list_del(&page->lru); | ||
4527 | rmv_page_order(page); | ||
4528 | zone->free_area[order].nr_free--; | ||
4529 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
4530 | - (1UL << order)); | ||
4531 | for (i = 0; i < (1 << order); i++) | ||
4532 | SetPageReserved((page+i)); | ||
4533 | pfn += (1 << order); | ||
4534 | } | ||
4535 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4536 | } | ||
4537 | #endif | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 000000000000..8f92a29695cc --- /dev/null +++ b/mm/page_isolation.c | |||
@@ -0,0 +1,138 @@ | |||
1 | /* | ||
2 | * linux/mm/page_isolation.c | ||
3 | */ | ||
4 | |||
5 | #include <stddef.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/page-isolation.h> | ||
8 | #include <linux/pageblock-flags.h> | ||
9 | #include "internal.h" | ||
10 | |||
11 | static inline struct page * | ||
12 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | ||
13 | { | ||
14 | int i; | ||
15 | for (i = 0; i < nr_pages; i++) | ||
16 | if (pfn_valid_within(pfn + i)) | ||
17 | break; | ||
18 | if (unlikely(i == nr_pages)) | ||
19 | return NULL; | ||
20 | return pfn_to_page(pfn + i); | ||
21 | } | ||
22 | |||
23 | /* | ||
24 | * start_isolate_page_range() -- make the page-allocation type of a range | ||
25 | * of pages MIGRATE_ISOLATE. | ||
26 | * @start_pfn: The lower PFN of the range to be isolated. | ||
27 | * @end_pfn: The upper PFN of the range to be isolated. | ||
28 | * | ||
29 | * Setting the page-allocation type to MIGRATE_ISOLATE means that free pages | ||
30 | * in the range will never be allocated, and neither will any pages freed | ||
31 | * into the range in the future. | ||
32 | * | ||
33 | * start_pfn/end_pfn must be aligned to pageblock_nr_pages. | ||
34 | * Returns 0 on success and -EBUSY if any part of the range cannot be isolated. | ||
35 | */ | ||
36 | int | ||
37 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
38 | { | ||
39 | unsigned long pfn; | ||
40 | unsigned long undo_pfn; | ||
41 | struct page *page; | ||
42 | |||
43 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
44 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
45 | |||
46 | for (pfn = start_pfn; | ||
47 | pfn < end_pfn; | ||
48 | pfn += pageblock_nr_pages) { | ||
49 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
50 | if (page && set_migratetype_isolate(page)) { | ||
51 | undo_pfn = pfn; | ||
52 | goto undo; | ||
53 | } | ||
54 | } | ||
55 | return 0; | ||
56 | undo: | ||
57 | for (pfn = start_pfn; | ||
58 | pfn <= undo_pfn; | ||
59 | pfn += pageblock_nr_pages) | ||
60 | unset_migratetype_isolate(pfn_to_page(pfn)); | ||
61 | |||
62 | return -EBUSY; | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * Make isolated pages available again. | ||
67 | */ | ||
68 | int | ||
69 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
70 | { | ||
71 | unsigned long pfn; | ||
72 | struct page *page; | ||
73 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
74 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
75 | for (pfn = start_pfn; | ||
76 | pfn < end_pfn; | ||
77 | pfn += pageblock_nr_pages) { | ||
78 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
79 | if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
80 | continue; | ||
81 | unset_migratetype_isolate(page); | ||
82 | } | ||
83 | return 0; | ||
84 | } | ||
85 | /* | ||
86 | * Test whether all pages in the range are free (i.e. isolated). | ||
87 | * All pages in [start_pfn...end_pfn) must be in the same zone. | ||
88 | * zone->lock must be held before calling this. | ||
89 | * | ||
90 | * Returns 1 if all pages in the range are isolated. | ||
91 | */ | ||
92 | static int | ||
93 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | ||
94 | { | ||
95 | struct page *page; | ||
96 | |||
97 | while (pfn < end_pfn) { | ||
98 | if (!pfn_valid_within(pfn)) { | ||
99 | pfn++; | ||
100 | continue; | ||
101 | } | ||
102 | page = pfn_to_page(pfn); | ||
103 | if (PageBuddy(page)) | ||
104 | pfn += 1 << page_order(page); | ||
105 | else if (page_count(page) == 0 && | ||
106 | page_private(page) == MIGRATE_ISOLATE) | ||
107 | pfn += 1; | ||
108 | else | ||
109 | break; | ||
110 | } | ||
111 | if (pfn < end_pfn) | ||
112 | return 0; | ||
113 | return 1; | ||
114 | } | ||
115 | |||
116 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
117 | { | ||
118 | unsigned long pfn; | ||
119 | struct page *page; | ||
120 | |||
121 | pfn = start_pfn; | ||
122 | /* | ||
123 | * Note: pageblock_nr_pages != MAX_ORDER_NR_PAGES, so a chunk of free | ||
124 | * pages is not necessarily aligned to pageblock_nr_pages. | ||
125 | * Therefore the pageblock type is checked first. | ||
126 | */ | ||
127 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
128 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
129 | if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
130 | break; | ||
131 | } | ||
132 | if (pfn < end_pfn) | ||
133 | return -EBUSY; | ||
134 | /* Check all pages are free or Marked as ISOLATED */ | ||
135 | if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) | ||
136 | return 0; | ||
137 | return -EBUSY; | ||
138 | } | ||
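
The offline test above skips whole buddy chunks by their order and lone free pages that were freed onto the MIGRATE_ISOLATE list, and gives up on anything else. A userspace model of that scan, with an invented page representation (the kernel checks PageBuddy, page_count and page_private instead):

/* Model of the walk in __test_page_isolated_in_pageblock(): skip buddy
 * chunks and isolated free pages, fail on the first in-use page. */
#include <stdio.h>

struct toy_page {
	int buddy_order;	/* >= 0: head of a free buddy chunk of this order */
	int free_isolated;	/* 1: free page sitting on the isolate list */
};

static int range_is_isolated(const struct toy_page *pages,
			     unsigned long start, unsigned long end)
{
	unsigned long pfn = start;

	while (pfn < end) {
		if (pages[pfn].buddy_order >= 0)
			pfn += 1UL << pages[pfn].buddy_order;
		else if (pages[pfn].free_isolated)
			pfn += 1;
		else
			return 0;	/* an in-use page blocks the offline */
	}
	return 1;
}

int main(void)
{
	struct toy_page pages[16];

	for (int i = 0; i < 16; i++)
		pages[i] = (struct toy_page){ -1, 0 };	/* default: in use */

	pages[0]  = (struct toy_page){ 3, 0 };	/* order-3 chunk: pfns 0-7 */
	pages[8]  = (struct toy_page){ -1, 1 };
	pages[9]  = (struct toy_page){ -1, 1 };
	pages[10] = (struct toy_page){ 2, 0 };	/* order-2 chunk: pfns 10-13 */
	pages[14] = (struct toy_page){ -1, 1 };
	pages[15] = (struct toy_page){ -1, 1 };

	printf("range isolated: %s\n",
	       range_is_isolated(pages, 0, 16) ? "yes" : "no");
	return 0;
}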
diff --git a/mm/readahead.c b/mm/readahead.c index be20c9d699d3..c9c50ca1ec38 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
22 | } | 22 | } |
23 | EXPORT_SYMBOL(default_unplug_io_fn); | 23 | EXPORT_SYMBOL(default_unplug_io_fn); |
24 | 24 | ||
25 | /* | ||
26 | * Convienent macros for min/max read-ahead pages. | ||
27 | * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. | ||
28 | * The latter is necessary for systems with large page size(i.e. 64k). | ||
29 | */ | ||
30 | #define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) | ||
31 | #define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) | ||
32 | |||
33 | struct backing_dev_info default_backing_dev_info = { | 25 | struct backing_dev_info default_backing_dev_info = { |
34 | .ra_pages = MAX_RA_PAGES, | 26 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
35 | .state = 0, | 27 | .state = 0, |
36 | .capabilities = BDI_CAP_MAP_COPY, | 28 | .capabilities = BDI_CAP_MAP_COPY, |
37 | .unplug_io_fn = default_unplug_io_fn, | 29 | .unplug_io_fn = default_unplug_io_fn, |
@@ -46,7 +38,7 @@ void | |||
46 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 38 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
47 | { | 39 | { |
48 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 40 | ra->ra_pages = mapping->backing_dev_info->ra_pages; |
49 | ra->prev_index = -1; | 41 | ra->prev_pos = -1; |
50 | } | 42 | } |
51 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 43 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
52 | 44 | ||
@@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
66 | int (*filler)(void *, struct page *), void *data) | 58 | int (*filler)(void *, struct page *), void *data) |
67 | { | 59 | { |
68 | struct page *page; | 60 | struct page *page; |
69 | struct pagevec lru_pvec; | ||
70 | int ret = 0; | 61 | int ret = 0; |
71 | 62 | ||
72 | pagevec_init(&lru_pvec, 0); | ||
73 | |||
74 | while (!list_empty(pages)) { | 63 | while (!list_empty(pages)) { |
75 | page = list_to_page(pages); | 64 | page = list_to_page(pages); |
76 | list_del(&page->lru); | 65 | list_del(&page->lru); |
77 | if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { | 66 | if (add_to_page_cache_lru(page, mapping, |
67 | page->index, GFP_KERNEL)) { | ||
78 | page_cache_release(page); | 68 | page_cache_release(page); |
79 | continue; | 69 | continue; |
80 | } | 70 | } |
71 | page_cache_release(page); | ||
72 | |||
81 | ret = filler(data, page); | 73 | ret = filler(data, page); |
82 | if (!pagevec_add(&lru_pvec, page)) | 74 | if (unlikely(ret)) { |
83 | __pagevec_lru_add(&lru_pvec); | ||
84 | if (ret) { | ||
85 | put_pages_list(pages); | 75 | put_pages_list(pages); |
86 | break; | 76 | break; |
87 | } | 77 | } |
88 | task_io_account_read(PAGE_CACHE_SIZE); | 78 | task_io_account_read(PAGE_CACHE_SIZE); |
89 | } | 79 | } |
90 | pagevec_lru_add(&lru_pvec); | ||
91 | return ret; | 80 | return ret; |
92 | } | 81 | } |
93 | 82 | ||
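
The reworked read_cache_pages() relies on a reference-count convention: add_to_page_cache_lru() takes its own reference, so the caller can drop the list reference immediately and still hand the page to filler(), because the page cache keeps it alive. Below is a pure userspace model of that convention; struct obj and cache_insert() are invented stand-ins.

#include <assert.h>
#include <stdio.h>

struct obj { int refs; };

static void get_obj(struct obj *o) { o->refs++; }
static void put_obj(struct obj *o) { assert(o->refs > 0); o->refs--; }

/* stand-in for add_to_page_cache_lru(): the "cache" pins the object */
static int cache_insert(struct obj *o) { get_obj(o); return 0; }

int main(void)
{
        struct obj page = { .refs = 1 };   /* reference held by the list */

        if (cache_insert(&page) == 0) {
                put_obj(&page);            /* drop the list reference ... */
                assert(page.refs == 1);    /* ... cache ref keeps it alive */
                printf("page still referenced by the cache: refs=%d\n",
                       page.refs);
        }
        return 0;
}
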
@@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
97 | struct list_head *pages, unsigned nr_pages) | 86 | struct list_head *pages, unsigned nr_pages) |
98 | { | 87 | { |
99 | unsigned page_idx; | 88 | unsigned page_idx; |
100 | struct pagevec lru_pvec; | ||
101 | int ret; | 89 | int ret; |
102 | 90 | ||
103 | if (mapping->a_ops->readpages) { | 91 | if (mapping->a_ops->readpages) { |
@@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
107 | goto out; | 95 | goto out; |
108 | } | 96 | } |
109 | 97 | ||
110 | pagevec_init(&lru_pvec, 0); | ||
111 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 98 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
112 | struct page *page = list_to_page(pages); | 99 | struct page *page = list_to_page(pages); |
113 | list_del(&page->lru); | 100 | list_del(&page->lru); |
114 | if (!add_to_page_cache(page, mapping, | 101 | if (!add_to_page_cache_lru(page, mapping, |
115 | page->index, GFP_KERNEL)) { | 102 | page->index, GFP_KERNEL)) { |
116 | mapping->a_ops->readpage(filp, page); | 103 | mapping->a_ops->readpage(filp, page); |
117 | if (!pagevec_add(&lru_pvec, page)) | 104 | } |
118 | __pagevec_lru_add(&lru_pvec); | 105 | page_cache_release(page); |
119 | } else | ||
120 | page_cache_release(page); | ||
121 | } | 106 | } |
122 | pagevec_lru_add(&lru_pvec); | ||
123 | ret = 0; | 107 | ret = 0; |
124 | out: | 108 | out: |
125 | return ret; | 109 | return ret; |
@@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
157 | /* | 141 | /* |
158 | * Preallocate as many pages as we will need. | 142 | * Preallocate as many pages as we will need. |
159 | */ | 143 | */ |
160 | read_lock_irq(&mapping->tree_lock); | ||
161 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { | 144 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { |
162 | pgoff_t page_offset = offset + page_idx; | 145 | pgoff_t page_offset = offset + page_idx; |
163 | 146 | ||
164 | if (page_offset > end_index) | 147 | if (page_offset > end_index) |
165 | break; | 148 | break; |
166 | 149 | ||
150 | rcu_read_lock(); | ||
167 | page = radix_tree_lookup(&mapping->page_tree, page_offset); | 151 | page = radix_tree_lookup(&mapping->page_tree, page_offset); |
152 | rcu_read_unlock(); | ||
168 | if (page) | 153 | if (page) |
169 | continue; | 154 | continue; |
170 | 155 | ||
171 | read_unlock_irq(&mapping->tree_lock); | ||
172 | page = page_cache_alloc_cold(mapping); | 156 | page = page_cache_alloc_cold(mapping); |
173 | read_lock_irq(&mapping->tree_lock); | ||
174 | if (!page) | 157 | if (!page) |
175 | break; | 158 | break; |
176 | page->index = page_offset; | 159 | page->index = page_offset; |
@@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
179 | SetPageReadahead(page); | 162 | SetPageReadahead(page); |
180 | ret++; | 163 | ret++; |
181 | } | 164 | } |
182 | read_unlock_irq(&mapping->tree_lock); | ||
183 | 165 | ||
184 | /* | 166 | /* |
185 | * Now start the IO. We ignore I/O errors - if the page is not | 167 | * Now start the IO. We ignore I/O errors - if the page is not |
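
The hunk above replaces the read_lock_irq(&mapping->tree_lock) held across the speculative radix_tree_lookup() with a short RCU read-side section, and moves page_cache_alloc_cold() fully outside any lock. A rough userspace sketch of that pattern follows, with C11 acquire loads standing in for rcu_dereference(); slot_table and alloc_page() are made-up names for illustration only.

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

#define NR_SLOTS 16

static _Atomic(void *) slot_table[NR_SLOTS];    /* stand-in for page_tree */

static void *alloc_page(void) { return malloc(64); }

static void *probe_or_alloc(unsigned long index)
{
        /* "RCU read side": a plain acquire load, no spinlock, no irq-off. */
        void *page = atomic_load_explicit(&slot_table[index],
                                          memory_order_acquire);
        if (page)
                return NULL;            /* already cached, nothing to read */

        /* Allocation happens with no lock held at all. */
        return alloc_page();
}

int main(void)
{
        void *p = probe_or_alloc(3);

        printf("allocated new page: %s\n", p ? "yes" : "no");
        free(p);
        return 0;
}
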
@@ -251,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
251 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | 233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); |
252 | } | 234 | } |
253 | 235 | ||
236 | static int __init readahead_init(void) | ||
237 | { | ||
238 | return bdi_init(&default_backing_dev_info); | ||
239 | } | ||
240 | subsys_initcall(readahead_init); | ||
241 | |||
254 | /* | 242 | /* |
255 | * Submit IO for the read-ahead request in file_ra_state. | 243 | * Submit IO for the read-ahead request in file_ra_state. |
256 | */ | 244 | */ |
@@ -327,7 +315,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
327 | * indicator. The flag won't be set on already cached pages, to avoid the | 315 | * indicator. The flag won't be set on already cached pages, to avoid the |
328 | * readahead-for-nothing fuss, saving pointless page cache lookups. | 316 | * readahead-for-nothing fuss, saving pointless page cache lookups. |
329 | * | 317 | * |
330 | * prev_index tracks the last visited page in the _previous_ read request. | 318 | * prev_pos tracks the last visited byte in the _previous_ read request. |
331 | * It should be maintained by the caller, and will be used for detecting | 319 | * It should be maintained by the caller, and will be used for detecting |
332 | * small random reads. Note that the readahead algorithm checks loosely | 320 | * small random reads. Note that the readahead algorithm checks loosely |
333 | * for sequential patterns. Hence interleaved reads might be served as | 321 | * for sequential patterns. Hence interleaved reads might be served as |
@@ -351,11 +339,9 @@ ondemand_readahead(struct address_space *mapping, | |||
351 | bool hit_readahead_marker, pgoff_t offset, | 339 | bool hit_readahead_marker, pgoff_t offset, |
352 | unsigned long req_size) | 340 | unsigned long req_size) |
353 | { | 341 | { |
354 | unsigned long max; /* max readahead pages */ | 342 | int max = ra->ra_pages; /* max readahead pages */ |
355 | int sequential; | 343 | pgoff_t prev_offset; |
356 | 344 | int sequential; | |
357 | max = ra->ra_pages; | ||
358 | sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); | ||
359 | 345 | ||
360 | /* | 346 | /* |
361 | * It's the expected callback offset, assume sequential access. | 347 | * It's the expected callback offset, assume sequential access. |
@@ -369,6 +355,9 @@ ondemand_readahead(struct address_space *mapping, | |||
369 | goto readit; | 355 | goto readit; |
370 | } | 356 | } |
371 | 357 | ||
358 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
359 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
360 | |||
372 | /* | 361 | /* |
373 | * Standalone, small read. | 362 | * Standalone, small read. |
374 | * Read as is, and do not pollute the readahead state. | 363 | * Read as is, and do not pollute the readahead state. |
@@ -379,6 +368,29 @@ ondemand_readahead(struct address_space *mapping, | |||
379 | } | 368 | } |
380 | 369 | ||
381 | /* | 370 | /* |
371 | * Hit a marked page without valid readahead state. | ||
372 | * E.g. interleaved reads. | ||
373 | * Query the pagecache for async_size, which normally equals the | ||
374 | * readahead size. Ramp it up and use it as the new readahead size. | ||
375 | */ | ||
376 | if (hit_readahead_marker) { | ||
377 | pgoff_t start; | ||
378 | |||
379 | read_lock_irq(&mapping->tree_lock); | ||
380 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | ||
381 | read_unlock_irq(&mapping->tree_lock); | ||
382 | |||
383 | if (!start || start - offset > max) | ||
384 | return 0; | ||
385 | |||
386 | ra->start = start; | ||
387 | ra->size = start - offset; /* old async_size */ | ||
388 | ra->size = get_next_ra_size(ra, max); | ||
389 | ra->async_size = ra->size; | ||
390 | goto readit; | ||
391 | } | ||
392 | |||
393 | /* | ||
382 | * It may be one of | 394 | * It may be one of |
383 | * - first read on start of file | 395 | * - first read on start of file |
384 | * - sequential cache miss | 396 | * - sequential cache miss |
@@ -389,16 +401,6 @@ ondemand_readahead(struct address_space *mapping, | |||
389 | ra->size = get_init_ra_size(req_size, max); | 401 | ra->size = get_init_ra_size(req_size, max); |
390 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 402 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
391 | 403 | ||
392 | /* | ||
393 | * Hit on a marked page without valid readahead state. | ||
394 | * E.g. interleaved reads. | ||
395 | * Not knowing its readahead pos/size, bet on the minimal possible one. | ||
396 | */ | ||
397 | if (hit_readahead_marker) { | ||
398 | ra->start++; | ||
399 | ra->size = get_next_ra_size(ra, max); | ||
400 | } | ||
401 | |||
402 | readit: | 404 | readit: |
403 | return ra_submit(ra, mapping, filp); | 405 | return ra_submit(ra, mapping, filp); |
404 | } | 406 | } |
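
The prev_index to prev_pos switch above changes the sequential-read heuristic: the previous byte position is rounded down to a page index, and an access counts as sequential if it lands on that page or the next one, or if the request itself is larger than the readahead window. A small self-contained sketch of that arithmetic follows; PAGE_SHIFT_SKETCH and the sample values are illustrative only.

#include <stdio.h>

#define PAGE_SHIFT_SKETCH 12

static int is_sequential(long long prev_pos, unsigned long offset,
                         unsigned long req_size, unsigned long max)
{
        unsigned long prev_offset =
                (unsigned long)(prev_pos >> PAGE_SHIFT_SKETCH);

        return offset - prev_offset <= 1UL || req_size > max;
}

int main(void)
{
        /* last read ended inside page 9, next request starts at page 10 */
        printf("%d\n", is_sequential(9 * 4096 + 100, 10, 4, 32));  /* 1 */
        /* a jump from page 9 to page 50 is treated as random */
        printf("%d\n", is_sequential(9 * 4096 + 100, 50, 4, 32));  /* 0 */
        return 0;
}
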
@@ -36,6 +36,7 @@ | |||
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
39 | * zone->lock (within radix tree node alloc) | ||
39 | */ | 40 | */ |
40 | 41 | ||
41 | #include <linux/mm.h> | 42 | #include <linux/mm.h> |
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
137 | anon_vma_free(anon_vma); | 138 | anon_vma_free(anon_vma); |
138 | } | 139 | } |
139 | 140 | ||
140 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, | 141 | static void anon_vma_ctor(struct kmem_cache *cachep, void *data) |
141 | unsigned long flags) | ||
142 | { | 142 | { |
143 | struct anon_vma *anon_vma = data; | 143 | struct anon_vma *anon_vma = data; |
144 | 144 | ||
@@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
436 | entry = pte_wrprotect(entry); | 436 | entry = pte_wrprotect(entry); |
437 | entry = pte_mkclean(entry); | 437 | entry = pte_mkclean(entry); |
438 | set_pte_at(mm, address, pte, entry); | 438 | set_pte_at(mm, address, pte, entry); |
439 | lazy_mmu_prot_update(entry); | ||
440 | ret = 1; | 439 | ret = 1; |
441 | } | 440 | } |
442 | 441 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index fcd19d323f9f..289dbb0a6fd6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -49,7 +49,6 @@ | |||
49 | #include <linux/ctype.h> | 49 | #include <linux/ctype.h> |
50 | #include <linux/migrate.h> | 50 | #include <linux/migrate.h> |
51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
52 | #include <linux/backing-dev.h> | ||
53 | 52 | ||
54 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
55 | #include <asm/div64.h> | 54 | #include <asm/div64.h> |
@@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | |||
96 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | 95 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: |
97 | * might be reconsidered if it ever diverges from PAGE_SIZE. | 96 | * might be reconsidered if it ever diverges from PAGE_SIZE. |
98 | * | 97 | * |
99 | * __GFP_MOVABLE is masked out as swap vectors cannot move | 98 | * Mobility flags are masked out as swap vectors cannot move |
100 | */ | 99 | */ |
101 | return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, | 100 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, |
102 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | 101 | PAGE_CACHE_SHIFT-PAGE_SHIFT); |
103 | } | 102 | } |
104 | 103 | ||
@@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
972 | *nodelist++ = '\0'; | 971 | *nodelist++ = '\0'; |
973 | if (nodelist_parse(nodelist, *policy_nodes)) | 972 | if (nodelist_parse(nodelist, *policy_nodes)) |
974 | goto out; | 973 | goto out; |
975 | if (!nodes_subset(*policy_nodes, node_online_map)) | 974 | if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) |
976 | goto out; | 975 | goto out; |
977 | } | 976 | } |
978 | if (!strcmp(value, "default")) { | 977 | if (!strcmp(value, "default")) { |
@@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
997 | err = 0; | 996 | err = 0; |
998 | } else if (!strcmp(value, "interleave")) { | 997 | } else if (!strcmp(value, "interleave")) { |
999 | *policy = MPOL_INTERLEAVE; | 998 | *policy = MPOL_INTERLEAVE; |
1000 | /* Default to nodes online if no nodelist */ | 999 | /* |
1000 | * Default to online nodes with memory if no nodelist | ||
1001 | */ | ||
1001 | if (!nodelist) | 1002 | if (!nodelist) |
1002 | *policy_nodes = node_online_map; | 1003 | *policy_nodes = node_states[N_HIGH_MEMORY]; |
1003 | err = 0; | 1004 | err = 0; |
1004 | } | 1005 | } |
1005 | out: | 1006 | out: |
@@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p, | |||
1025 | return page; | 1026 | return page; |
1026 | } | 1027 | } |
1027 | 1028 | ||
1028 | struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, | 1029 | static struct page *shmem_swapin(struct shmem_inode_info *info, |
1029 | unsigned long idx) | 1030 | swp_entry_t entry, unsigned long idx) |
1030 | { | 1031 | { |
1031 | struct shared_policy *p = &info->policy; | 1032 | struct shared_policy *p = &info->policy; |
1032 | int i, num; | 1033 | int i, num; |
@@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, | |||
1061 | return page; | 1062 | return page; |
1062 | } | 1063 | } |
1063 | #else | 1064 | #else |
1064 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 1065 | static inline int shmem_parse_mpol(char *value, int *policy, |
1066 | nodemask_t *policy_nodes) | ||
1065 | { | 1067 | { |
1066 | return 1; | 1068 | return 1; |
1067 | } | 1069 | } |
@@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
1109 | * Normally, filepage is NULL on entry, and either found | 1111 | * Normally, filepage is NULL on entry, and either found |
1110 | * uptodate immediately, or allocated and zeroed, or read | 1112 | * uptodate immediately, or allocated and zeroed, or read |
1111 | * in under swappage, which is then assigned to filepage. | 1113 | * in under swappage, which is then assigned to filepage. |
1112 | * But shmem_readpage and shmem_prepare_write pass in a locked | 1114 | * But shmem_readpage and shmem_write_begin pass in a locked |
1113 | * filepage, which may be found not uptodate by other callers | 1115 | * filepage, which may be found not uptodate by other callers |
1114 | * too, and may need to be copied from the swappage read in. | 1116 | * too, and may need to be copied from the swappage read in. |
1115 | */ | 1117 | */ |
@@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1327 | } | 1329 | } |
1328 | 1330 | ||
1329 | #ifdef CONFIG_NUMA | 1331 | #ifdef CONFIG_NUMA |
1330 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1332 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
1331 | { | 1333 | { |
1332 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1334 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1333 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1335 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
1334 | } | 1336 | } |
1335 | 1337 | ||
1336 | struct mempolicy * | 1338 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1337 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1339 | unsigned long addr) |
1338 | { | 1340 | { |
1339 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1341 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1340 | unsigned long idx; | 1342 | unsigned long idx; |
@@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations; | |||
1446 | static const struct inode_operations shmem_symlink_inline_operations; | 1448 | static const struct inode_operations shmem_symlink_inline_operations; |
1447 | 1449 | ||
1448 | /* | 1450 | /* |
1449 | * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; | 1451 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; |
1450 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | 1452 | * but providing them allows a tmpfs file to be used for splice, sendfile, and |
1451 | * below the loop driver, in the generic fashion that many filesystems support. | 1453 | * below the loop driver, in the generic fashion that many filesystems support. |
1452 | */ | 1454 | */ |
@@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page) | |||
1459 | } | 1461 | } |
1460 | 1462 | ||
1461 | static int | 1463 | static int |
1462 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) | 1464 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1465 | loff_t pos, unsigned len, unsigned flags, | ||
1466 | struct page **pagep, void **fsdata) | ||
1463 | { | 1467 | { |
1464 | struct inode *inode = page->mapping->host; | 1468 | struct inode *inode = mapping->host; |
1465 | return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); | 1469 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1470 | *pagep = NULL; | ||
1471 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | ||
1472 | } | ||
1473 | |||
1474 | static int | ||
1475 | shmem_write_end(struct file *file, struct address_space *mapping, | ||
1476 | loff_t pos, unsigned len, unsigned copied, | ||
1477 | struct page *page, void *fsdata) | ||
1478 | { | ||
1479 | struct inode *inode = mapping->host; | ||
1480 | |||
1481 | set_page_dirty(page); | ||
1482 | page_cache_release(page); | ||
1483 | |||
1484 | if (pos+copied > inode->i_size) | ||
1485 | i_size_write(inode, pos+copied); | ||
1486 | |||
1487 | return copied; | ||
1466 | } | 1488 | } |
1467 | 1489 | ||
1468 | static ssize_t | 1490 | static ssize_t |
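
The shmem hunk above moves from prepare_write/commit_write to the write_begin/write_end pair. The sketch below models only the caller-side contract in userspace: begin hands back a writable "page", the caller copies into it, end accounts the copy and grows i_size, mirroring the i_size_write() step in shmem_write_end(). All names here are toy stand-ins, not the kernel's address_space_operations.

#include <stdio.h>
#include <string.h>

struct toy_inode { long long i_size; char data[4096]; };

static int toy_write_begin(struct toy_inode *inode, long long pos,
                           unsigned len, char **pagep)
{
        (void)len;
        *pagep = inode->data + pos;        /* "locked page" stand-in */
        return 0;
}

static int toy_write_end(struct toy_inode *inode, long long pos,
                         unsigned copied)
{
        if (pos + copied > inode->i_size)  /* mirrors i_size_write() */
                inode->i_size = pos + copied;
        return copied;
}

int main(void)
{
        struct toy_inode inode = { 0 };
        char *page;

        if (toy_write_begin(&inode, 0, 5, &page) == 0) {
                memcpy(page, "hello", 5);
                toy_write_end(&inode, 0, 5);
        }
        printf("i_size = %lld\n", inode.i_size);
        return 0;
}
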
@@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb, | |||
2219 | unsigned long blocks = 0; | 2241 | unsigned long blocks = 0; |
2220 | unsigned long inodes = 0; | 2242 | unsigned long inodes = 0; |
2221 | int policy = MPOL_DEFAULT; | 2243 | int policy = MPOL_DEFAULT; |
2222 | nodemask_t policy_nodes = node_online_map; | 2244 | nodemask_t policy_nodes = node_states[N_HIGH_MEMORY]; |
2223 | 2245 | ||
2224 | #ifdef CONFIG_TMPFS | 2246 | #ifdef CONFIG_TMPFS |
2225 | /* | 2247 | /* |
@@ -2306,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2306 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2328 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2307 | } | 2329 | } |
2308 | 2330 | ||
2309 | static void init_once(void *foo, struct kmem_cache *cachep, | 2331 | static void init_once(struct kmem_cache *cachep, void *foo) |
2310 | unsigned long flags) | ||
2311 | { | 2332 | { |
2312 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2333 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2313 | 2334 | ||
@@ -2322,9 +2343,7 @@ static int init_inodecache(void) | |||
2322 | { | 2343 | { |
2323 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2344 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2324 | sizeof(struct shmem_inode_info), | 2345 | sizeof(struct shmem_inode_info), |
2325 | 0, 0, init_once); | 2346 | 0, SLAB_PANIC, init_once); |
2326 | if (shmem_inode_cachep == NULL) | ||
2327 | return -ENOMEM; | ||
2328 | return 0; | 2347 | return 0; |
2329 | } | 2348 | } |
2330 | 2349 | ||
@@ -2338,8 +2357,8 @@ static const struct address_space_operations shmem_aops = { | |||
2338 | .set_page_dirty = __set_page_dirty_no_writeback, | 2357 | .set_page_dirty = __set_page_dirty_no_writeback, |
2339 | #ifdef CONFIG_TMPFS | 2358 | #ifdef CONFIG_TMPFS |
2340 | .readpage = shmem_readpage, | 2359 | .readpage = shmem_readpage, |
2341 | .prepare_write = shmem_prepare_write, | 2360 | .write_begin = shmem_write_begin, |
2342 | .commit_write = simple_commit_write, | 2361 | .write_end = shmem_write_end, |
2343 | #endif | 2362 | #endif |
2344 | .migratepage = migrate_page, | 2363 | .migratepage = migrate_page, |
2345 | }; | 2364 | }; |
@@ -2442,6 +2461,10 @@ static int __init init_tmpfs(void) | |||
2442 | { | 2461 | { |
2443 | int error; | 2462 | int error; |
2444 | 2463 | ||
2464 | error = bdi_init(&shmem_backing_dev_info); | ||
2465 | if (error) | ||
2466 | goto out4; | ||
2467 | |||
2445 | error = init_inodecache(); | 2468 | error = init_inodecache(); |
2446 | if (error) | 2469 | if (error) |
2447 | goto out3; | 2470 | goto out3; |
@@ -2466,6 +2489,8 @@ out1: | |||
2466 | out2: | 2489 | out2: |
2467 | destroy_inodecache(); | 2490 | destroy_inodecache(); |
2468 | out3: | 2491 | out3: |
2492 | bdi_destroy(&shmem_backing_dev_info); | ||
2493 | out4: | ||
2469 | shm_mnt = ERR_PTR(error); | 2494 | shm_mnt = ERR_PTR(error); |
2470 | return error; | 2495 | return error; |
2471 | } | 2496 | } |
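
The init_tmpfs() change above adds bdi_init() as a new first stage with a matching out4: unwind label. Below is a generic userspace sketch of that staged-init/goto-unwind idiom; stage_a/stage_b and their undo call are invented names, not kernel functions.

#include <stdio.h>

static int stage_a(void)  { return 0; }
static void undo_a(void)  { }
static int stage_b(void)  { return -1; }   /* pretend this one fails */

static int bring_up(void)
{
        int err;

        err = stage_a();
        if (err)
                goto out2;
        err = stage_b();
        if (err)
                goto out1;           /* later failures unwind earlier stages */
        return 0;
out1:
        undo_a();
out2:
        return err;
}

int main(void)
{
        printf("bring_up() = %d\n", bring_up());
        return 0;
}
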
@@ -2518,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2518 | d_instantiate(dentry, inode); | 2543 | d_instantiate(dentry, inode); |
2519 | inode->i_size = size; | 2544 | inode->i_size = size; |
2520 | inode->i_nlink = 0; /* It is unlinked */ | 2545 | inode->i_nlink = 0; /* It is unlinked */ |
2521 | file->f_path.mnt = mntget(shm_mnt); | 2546 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, |
2522 | file->f_path.dentry = dentry; | 2547 | &shmem_file_operations); |
2523 | file->f_mapping = inode->i_mapping; | ||
2524 | file->f_op = &shmem_file_operations; | ||
2525 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
2526 | return file; | 2548 | return file; |
2527 | 2549 | ||
2528 | close_file: | 2550 | close_file: |
@@ -267,11 +267,10 @@ struct array_cache { | |||
267 | unsigned int batchcount; | 267 | unsigned int batchcount; |
268 | unsigned int touched; | 268 | unsigned int touched; |
269 | spinlock_t lock; | 269 | spinlock_t lock; |
270 | void *entry[0]; /* | 270 | void *entry[]; /* |
271 | * Must have this definition in here for the proper | 271 | * Must have this definition in here for the proper |
272 | * alignment of array_cache. Also simplifies accessing | 272 | * alignment of array_cache. Also simplifies accessing |
273 | * the entries. | 273 | * the entries. |
274 | * [0] is for gcc 2.95. It should really be []. | ||
275 | */ | 274 | */ |
276 | }; | 275 | }; |
277 | 276 | ||
@@ -408,7 +407,7 @@ struct kmem_cache { | |||
408 | unsigned int dflags; /* dynamic flags */ | 407 | unsigned int dflags; /* dynamic flags */ |
409 | 408 | ||
410 | /* constructor func */ | 409 | /* constructor func */ |
411 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 410 | void (*ctor)(struct kmem_cache *, void *); |
412 | 411 | ||
413 | /* 5) cache creation/removal */ | 412 | /* 5) cache creation/removal */ |
414 | const char *name; | 413 | const char *name; |
@@ -1568,7 +1567,7 @@ void __init kmem_cache_init(void) | |||
1568 | /* Replace the static kmem_list3 structures for the boot cpu */ | 1567 | /* Replace the static kmem_list3 structures for the boot cpu */ |
1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); | 1568 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); |
1570 | 1569 | ||
1571 | for_each_online_node(nid) { | 1570 | for_each_node_state(nid, N_NORMAL_MEMORY) { |
1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1571 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
1573 | &initkmem_list3[SIZE_AC + nid], nid); | 1572 | &initkmem_list3[SIZE_AC + nid], nid); |
1574 | 1573 | ||
@@ -1643,6 +1642,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1643 | #endif | 1642 | #endif |
1644 | 1643 | ||
1645 | flags |= cachep->gfpflags; | 1644 | flags |= cachep->gfpflags; |
1645 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
1646 | flags |= __GFP_RECLAIMABLE; | ||
1646 | 1647 | ||
1647 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1648 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
1648 | if (!page) | 1649 | if (!page) |
@@ -1944,7 +1945,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) | |||
1944 | { | 1945 | { |
1945 | int node; | 1946 | int node; |
1946 | 1947 | ||
1947 | for_each_online_node(node) { | 1948 | for_each_node_state(node, N_NORMAL_MEMORY) { |
1948 | cachep->nodelists[node] = &initkmem_list3[index + node]; | 1949 | cachep->nodelists[node] = &initkmem_list3[index + node]; |
1949 | cachep->nodelists[node]->next_reap = jiffies + | 1950 | cachep->nodelists[node]->next_reap = jiffies + |
1950 | REAPTIMEOUT_LIST3 + | 1951 | REAPTIMEOUT_LIST3 + |
@@ -2075,7 +2076,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2075 | g_cpucache_up = PARTIAL_L3; | 2076 | g_cpucache_up = PARTIAL_L3; |
2076 | } else { | 2077 | } else { |
2077 | int node; | 2078 | int node; |
2078 | for_each_online_node(node) { | 2079 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2079 | cachep->nodelists[node] = | 2080 | cachep->nodelists[node] = |
2080 | kmalloc_node(sizeof(struct kmem_list3), | 2081 | kmalloc_node(sizeof(struct kmem_list3), |
2081 | GFP_KERNEL, node); | 2082 | GFP_KERNEL, node); |
@@ -2127,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2127 | struct kmem_cache * | 2128 | struct kmem_cache * |
2128 | kmem_cache_create (const char *name, size_t size, size_t align, | 2129 | kmem_cache_create (const char *name, size_t size, size_t align, |
2129 | unsigned long flags, | 2130 | unsigned long flags, |
2130 | void (*ctor)(void*, struct kmem_cache *, unsigned long)) | 2131 | void (*ctor)(struct kmem_cache *, void *)) |
2131 | { | 2132 | { |
2132 | size_t left_over, slab_size, ralign; | 2133 | size_t left_over, slab_size, ralign; |
2133 | struct kmem_cache *cachep = NULL, *pc; | 2134 | struct kmem_cache *cachep = NULL, *pc; |
@@ -2634,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2634 | * They must also be threaded. | 2635 | * They must also be threaded. |
2635 | */ | 2636 | */ |
2636 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2637 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2637 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2638 | cachep->ctor(cachep, objp + obj_offset(cachep)); |
2638 | 0); | ||
2639 | 2639 | ||
2640 | if (cachep->flags & SLAB_RED_ZONE) { | 2640 | if (cachep->flags & SLAB_RED_ZONE) { |
2641 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2641 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2651,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2651 | cachep->buffer_size / PAGE_SIZE, 0); | 2651 | cachep->buffer_size / PAGE_SIZE, 0); |
2652 | #else | 2652 | #else |
2653 | if (cachep->ctor) | 2653 | if (cachep->ctor) |
2654 | cachep->ctor(objp, cachep, 0); | 2654 | cachep->ctor(cachep, objp); |
2655 | #endif | 2655 | #endif |
2656 | slab_bufctl(slabp)[i] = i + 1; | 2656 | slab_bufctl(slabp)[i] = i + 1; |
2657 | } | 2657 | } |
@@ -2746,9 +2746,9 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2746 | * Be lazy and only check for valid flags here, keeping it out of the | 2746 | * Be lazy and only check for valid flags here, keeping it out of the |
2747 | * critical path in kmem_cache_alloc(). | 2747 | * critical path in kmem_cache_alloc(). |
2748 | */ | 2748 | */ |
2749 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 2749 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
2750 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | ||
2750 | 2751 | ||
2751 | local_flags = (flags & GFP_LEVEL_MASK); | ||
2752 | /* Take the l3 list lock to change the colour_next on this node */ | 2752 | /* Take the l3 list lock to change the colour_next on this node */ |
2753 | check_irq_off(); | 2753 | check_irq_off(); |
2754 | l3 = cachep->nodelists[nodeid]; | 2754 | l3 = cachep->nodelists[nodeid]; |
@@ -2785,7 +2785,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2785 | 2785 | ||
2786 | /* Get slab management. */ | 2786 | /* Get slab management. */ |
2787 | slabp = alloc_slabmgmt(cachep, objp, offset, | 2787 | slabp = alloc_slabmgmt(cachep, objp, offset, |
2788 | local_flags & ~GFP_THISNODE, nodeid); | 2788 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); |
2789 | if (!slabp) | 2789 | if (!slabp) |
2790 | goto opps1; | 2790 | goto opps1; |
2791 | 2791 | ||
@@ -3076,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3076 | #endif | 3076 | #endif |
3077 | objp += obj_offset(cachep); | 3077 | objp += obj_offset(cachep); |
3078 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3078 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3079 | cachep->ctor(objp, cachep, 0); | 3079 | cachep->ctor(cachep, objp); |
3080 | #if ARCH_SLAB_MINALIGN | 3080 | #if ARCH_SLAB_MINALIGN |
3081 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3081 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3082 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3082 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
@@ -3225,7 +3225,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3225 | 3225 | ||
3226 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) | 3226 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) |
3227 | ->node_zonelists[gfp_zone(flags)]; | 3227 | ->node_zonelists[gfp_zone(flags)]; |
3228 | local_flags = (flags & GFP_LEVEL_MASK); | 3228 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3229 | 3229 | ||
3230 | retry: | 3230 | retry: |
3231 | /* | 3231 | /* |
@@ -3792,7 +3792,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3792 | struct array_cache *new_shared; | 3792 | struct array_cache *new_shared; |
3793 | struct array_cache **new_alien = NULL; | 3793 | struct array_cache **new_alien = NULL; |
3794 | 3794 | ||
3795 | for_each_online_node(node) { | 3795 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3796 | 3796 | ||
3797 | if (use_alien_caches) { | 3797 | if (use_alien_caches) { |
3798 | new_alien = alloc_alien_cache(node, cachep->limit); | 3798 | new_alien = alloc_alien_cache(node, cachep->limit); |
@@ -4446,7 +4446,8 @@ const struct seq_operations slabstats_op = { | |||
4446 | */ | 4446 | */ |
4447 | size_t ksize(const void *objp) | 4447 | size_t ksize(const void *objp) |
4448 | { | 4448 | { |
4449 | if (unlikely(ZERO_OR_NULL_PTR(objp))) | 4449 | BUG_ON(!objp); |
4450 | if (unlikely(objp == ZERO_SIZE_PTR)) | ||
4450 | return 0; | 4451 | return 0; |
4451 | 4452 | ||
4452 | return obj_size(virt_to_cache(objp)); | 4453 | return obj_size(virt_to_cache(objp)); |
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
360 | slobidx_t units; | 360 | slobidx_t units; |
361 | unsigned long flags; | 361 | unsigned long flags; |
362 | 362 | ||
363 | if (ZERO_OR_NULL_PTR(block)) | 363 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
364 | return; | 364 | return; |
365 | BUG_ON(!size); | 365 | BUG_ON(!size); |
366 | 366 | ||
@@ -466,7 +466,7 @@ void kfree(const void *block) | |||
466 | { | 466 | { |
467 | struct slob_page *sp; | 467 | struct slob_page *sp; |
468 | 468 | ||
469 | if (ZERO_OR_NULL_PTR(block)) | 469 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
470 | return; | 470 | return; |
471 | 471 | ||
472 | sp = (struct slob_page *)virt_to_page(block); | 472 | sp = (struct slob_page *)virt_to_page(block); |
@@ -484,7 +484,8 @@ size_t ksize(const void *block) | |||
484 | { | 484 | { |
485 | struct slob_page *sp; | 485 | struct slob_page *sp; |
486 | 486 | ||
487 | if (ZERO_OR_NULL_PTR(block)) | 487 | BUG_ON(!block); |
488 | if (unlikely(block == ZERO_SIZE_PTR)) | ||
488 | return 0; | 489 | return 0; |
489 | 490 | ||
490 | sp = (struct slob_page *)virt_to_page(block); | 491 | sp = (struct slob_page *)virt_to_page(block); |
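
Both ksize() hunks above stop accepting NULL and special-case only ZERO_SIZE_PTR, the non-NULL sentinel handed out for zero-byte allocations. A userspace sketch of that convention follows; the (void *)16 value and toy_ksize() are stand-ins for the kernel's definitions.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define TOY_ZERO_SIZE_PTR ((void *)16)

static size_t toy_ksize(const void *block, size_t real_size)
{
        assert(block != NULL);             /* mirrors the new BUG_ON(!block) */
        if (block == TOY_ZERO_SIZE_PTR)
                return 0;                  /* zero-byte allocation */
        return real_size;
}

int main(void)
{
        char obj[32];

        printf("%zu\n", toy_ksize(TOY_ZERO_SIZE_PTR, 0));  /* 0  */
        printf("%zu\n", toy_ksize(obj, sizeof(obj)));       /* 32 */
        return 0;
}
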
@@ -498,12 +499,12 @@ struct kmem_cache { | |||
498 | unsigned int size, align; | 499 | unsigned int size, align; |
499 | unsigned long flags; | 500 | unsigned long flags; |
500 | const char *name; | 501 | const char *name; |
501 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | 502 | void (*ctor)(struct kmem_cache *, void *); |
502 | }; | 503 | }; |
503 | 504 | ||
504 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 505 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
505 | size_t align, unsigned long flags, | 506 | size_t align, unsigned long flags, |
506 | void (*ctor)(void*, struct kmem_cache *, unsigned long)) | 507 | void (*ctor)(struct kmem_cache *, void *)) |
507 | { | 508 | { |
508 | struct kmem_cache *c; | 509 | struct kmem_cache *c; |
509 | 510 | ||
@@ -547,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
547 | b = slob_new_page(flags, get_order(c->size), node); | 548 | b = slob_new_page(flags, get_order(c->size), node); |
548 | 549 | ||
549 | if (c->ctor) | 550 | if (c->ctor) |
550 | c->ctor(b, c, 0); | 551 | c->ctor(c, b); |
551 | 552 | ||
552 | return b; | 553 | return b; |
553 | } | 554 | } |
@@ -90,7 +90,7 @@ | |||
90 | * One use of this flag is to mark slabs that are | 90 | * One use of this flag is to mark slabs that are |
91 | * used for allocations. Then such a slab becomes a cpu | 91 | * used for allocations. Then such a slab becomes a cpu |
92 | * slab. The cpu slab may be equipped with an additional | 92 | * slab. The cpu slab may be equipped with an additional |
93 | * lockless_freelist that allows lockless access to | 93 | * freelist that allows lockless access to |
94 | * free objects in addition to the regular freelist | 94 | * free objects in addition to the regular freelist |
95 | * that requires the slab lock. | 95 | * that requires the slab lock. |
96 | * | 96 | * |
@@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
140 | /* | 140 | /* |
141 | * Issues still to be resolved: | 141 | * Issues still to be resolved: |
142 | * | 142 | * |
143 | * - The per cpu array is updated for each new slab and and is a remote | ||
144 | * cacheline for most nodes. This could become a bouncing cacheline given | ||
145 | * enough frequent updates. There are 16 pointers in a cacheline, so at | ||
146 | * max 16 cpus could compete for the cacheline which may be okay. | ||
147 | * | ||
148 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 143 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
149 | * | 144 | * |
150 | * - Variable sizing of the per node arrays | 145 | * - Variable sizing of the per node arrays |
@@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
205 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) | 200 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) |
206 | #endif | 201 | #endif |
207 | 202 | ||
208 | /* | ||
209 | * The page->inuse field is 16 bit thus we have this limitation | ||
210 | */ | ||
211 | #define MAX_OBJECTS_PER_SLAB 65535 | ||
212 | |||
213 | /* Internal SLUB flags */ | 203 | /* Internal SLUB flags */ |
214 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 204 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
215 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ | 205 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ |
@@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
277 | #endif | 267 | #endif |
278 | } | 268 | } |
279 | 269 | ||
270 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
271 | { | ||
272 | #ifdef CONFIG_SMP | ||
273 | return s->cpu_slab[cpu]; | ||
274 | #else | ||
275 | return &s->cpu_slab; | ||
276 | #endif | ||
277 | } | ||
278 | |||
280 | static inline int check_valid_pointer(struct kmem_cache *s, | 279 | static inline int check_valid_pointer(struct kmem_cache *s, |
281 | struct page *page, const void *object) | 280 | struct page *page, const void *object) |
282 | { | 281 | { |
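
The get_cpu_slab() helper added above hides whether the per-cpu control structure is an array of pointers (SMP) or a single embedded struct (UP) behind one accessor. A userspace sketch of that accessor pattern is shown below with a TOY_SMP switch; the names are invented for illustration.

#include <stdio.h>

#define TOY_SMP 1
#define NR_CPUS 4

struct toy_cpu { int id; };

struct toy_cache {
#if TOY_SMP
        struct toy_cpu *cpu_slab[NR_CPUS];   /* one pointer per cpu */
#else
        struct toy_cpu cpu_slab;             /* embedded on UP */
#endif
};

static struct toy_cpu *toy_get_cpu_slab(struct toy_cache *s, int cpu)
{
#if TOY_SMP
        return s->cpu_slab[cpu];
#else
        (void)cpu;
        return &s->cpu_slab;
#endif
}

int main(void)
{
        static struct toy_cpu cpus[NR_CPUS] = { {0}, {1}, {2}, {3} };
        struct toy_cache cache;

        for (int i = 0; i < NR_CPUS; i++)
                cache.cpu_slab[i] = &cpus[i];
        printf("cpu 2 -> id %d\n", toy_get_cpu_slab(&cache, 2)->id);
        return 0;
}
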
@@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
729 | slab_err(s, page, "Not a valid slab page"); | 728 | slab_err(s, page, "Not a valid slab page"); |
730 | return 0; | 729 | return 0; |
731 | } | 730 | } |
732 | if (page->offset * sizeof(void *) != s->offset) { | ||
733 | slab_err(s, page, "Corrupted offset %lu", | ||
734 | (unsigned long)(page->offset * sizeof(void *))); | ||
735 | return 0; | ||
736 | } | ||
737 | if (page->inuse > s->objects) { | 731 | if (page->inuse > s->objects) { |
738 | slab_err(s, page, "inuse %u > max %u", | 732 | slab_err(s, page, "inuse %u > max %u", |
739 | s->name, page->inuse, s->objects); | 733 | s->name, page->inuse, s->objects); |
@@ -872,8 +866,6 @@ bad: | |||
872 | slab_fix(s, "Marking all objects used"); | 866 | slab_fix(s, "Marking all objects used"); |
873 | page->inuse = s->objects; | 867 | page->inuse = s->objects; |
874 | page->freelist = NULL; | 868 | page->freelist = NULL; |
875 | /* Fix up fields that may be corrupted */ | ||
876 | page->offset = s->offset / sizeof(void *); | ||
877 | } | 869 | } |
878 | return 0; | 870 | return 0; |
879 | } | 871 | } |
@@ -988,7 +980,7 @@ __setup("slub_debug", setup_slub_debug); | |||
988 | 980 | ||
989 | static unsigned long kmem_cache_flags(unsigned long objsize, | 981 | static unsigned long kmem_cache_flags(unsigned long objsize, |
990 | unsigned long flags, const char *name, | 982 | unsigned long flags, const char *name, |
991 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 983 | void (*ctor)(struct kmem_cache *, void *)) |
992 | { | 984 | { |
993 | /* | 985 | /* |
994 | * The page->offset field is only 16 bit wide. This is an offset | 986 | * The page->offset field is only 16 bit wide. This is an offset |
@@ -1035,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1035 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1027 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} |
1036 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1028 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1037 | unsigned long flags, const char *name, | 1029 | unsigned long flags, const char *name, |
1038 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 1030 | void (*ctor)(struct kmem_cache *, void *)) |
1039 | { | 1031 | { |
1040 | return flags; | 1032 | return flags; |
1041 | } | 1033 | } |
@@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1055 | if (s->flags & SLAB_CACHE_DMA) | 1047 | if (s->flags & SLAB_CACHE_DMA) |
1056 | flags |= SLUB_DMA; | 1048 | flags |= SLUB_DMA; |
1057 | 1049 | ||
1050 | if (s->flags & SLAB_RECLAIM_ACCOUNT) | ||
1051 | flags |= __GFP_RECLAIMABLE; | ||
1052 | |||
1058 | if (node == -1) | 1053 | if (node == -1) |
1059 | page = alloc_pages(flags, s->order); | 1054 | page = alloc_pages(flags, s->order); |
1060 | else | 1055 | else |
@@ -1076,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
1076 | { | 1071 | { |
1077 | setup_object_debug(s, page, object); | 1072 | setup_object_debug(s, page, object); |
1078 | if (unlikely(s->ctor)) | 1073 | if (unlikely(s->ctor)) |
1079 | s->ctor(object, s, 0); | 1074 | s->ctor(s, object); |
1080 | } | 1075 | } |
1081 | 1076 | ||
1082 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1077 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1088,19 +1083,16 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1088 | void *last; | 1083 | void *last; |
1089 | void *p; | 1084 | void *p; |
1090 | 1085 | ||
1091 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1092 | |||
1093 | if (flags & __GFP_WAIT) | ||
1094 | local_irq_enable(); | ||
1095 | 1087 | ||
1096 | page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); | 1088 | page = allocate_slab(s, |
1089 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
1097 | if (!page) | 1090 | if (!page) |
1098 | goto out; | 1091 | goto out; |
1099 | 1092 | ||
1100 | n = get_node(s, page_to_nid(page)); | 1093 | n = get_node(s, page_to_nid(page)); |
1101 | if (n) | 1094 | if (n) |
1102 | atomic_long_inc(&n->nr_slabs); | 1095 | atomic_long_inc(&n->nr_slabs); |
1103 | page->offset = s->offset / sizeof(void *); | ||
1104 | page->slab = s; | 1096 | page->slab = s; |
1105 | page->flags |= 1 << PG_slab; | 1097 | page->flags |= 1 << PG_slab; |
1106 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1098 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
@@ -1123,11 +1115,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1123 | set_freepointer(s, last, NULL); | 1115 | set_freepointer(s, last, NULL); |
1124 | 1116 | ||
1125 | page->freelist = start; | 1117 | page->freelist = start; |
1126 | page->lockless_freelist = NULL; | ||
1127 | page->inuse = 0; | 1118 | page->inuse = 0; |
1128 | out: | 1119 | out: |
1129 | if (flags & __GFP_WAIT) | ||
1130 | local_irq_disable(); | ||
1131 | return page; | 1120 | return page; |
1132 | } | 1121 | } |
1133 | 1122 | ||
@@ -1149,7 +1138,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1149 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1138 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1150 | - pages); | 1139 | - pages); |
1151 | 1140 | ||
1152 | page->mapping = NULL; | ||
1153 | __free_pages(page, s->order); | 1141 | __free_pages(page, s->order); |
1154 | } | 1142 | } |
1155 | 1143 | ||
@@ -1383,33 +1371,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
1383 | /* | 1371 | /* |
1384 | * Remove the cpu slab | 1372 | * Remove the cpu slab |
1385 | */ | 1373 | */ |
1386 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) | 1374 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1387 | { | 1375 | { |
1376 | struct page *page = c->page; | ||
1388 | /* | 1377 | /* |
1389 | * Merge cpu freelist into freelist. Typically we get here | 1378 | * Merge cpu freelist into freelist. Typically we get here |
1390 | * because both freelists are empty. So this is unlikely | 1379 | * because both freelists are empty. So this is unlikely |
1391 | * to occur. | 1380 | * to occur. |
1392 | */ | 1381 | */ |
1393 | while (unlikely(page->lockless_freelist)) { | 1382 | while (unlikely(c->freelist)) { |
1394 | void **object; | 1383 | void **object; |
1395 | 1384 | ||
1396 | /* Retrieve object from cpu_freelist */ | 1385 | /* Retrieve object from cpu_freelist */ |
1397 | object = page->lockless_freelist; | 1386 | object = c->freelist; |
1398 | page->lockless_freelist = page->lockless_freelist[page->offset]; | 1387 | c->freelist = c->freelist[c->offset]; |
1399 | 1388 | ||
1400 | /* And put onto the regular freelist */ | 1389 | /* And put onto the regular freelist */ |
1401 | object[page->offset] = page->freelist; | 1390 | object[c->offset] = page->freelist; |
1402 | page->freelist = object; | 1391 | page->freelist = object; |
1403 | page->inuse--; | 1392 | page->inuse--; |
1404 | } | 1393 | } |
1405 | s->cpu_slab[cpu] = NULL; | 1394 | c->page = NULL; |
1406 | unfreeze_slab(s, page); | 1395 | unfreeze_slab(s, page); |
1407 | } | 1396 | } |
1408 | 1397 | ||
1409 | static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | 1398 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1410 | { | 1399 | { |
1411 | slab_lock(page); | 1400 | slab_lock(c->page); |
1412 | deactivate_slab(s, page, cpu); | 1401 | deactivate_slab(s, c); |
1413 | } | 1402 | } |
1414 | 1403 | ||
1415 | /* | 1404 | /* |
@@ -1418,18 +1407,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | |||
1418 | */ | 1407 | */ |
1419 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1408 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
1420 | { | 1409 | { |
1421 | struct page *page = s->cpu_slab[cpu]; | 1410 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
1422 | 1411 | ||
1423 | if (likely(page)) | 1412 | if (likely(c && c->page)) |
1424 | flush_slab(s, page, cpu); | 1413 | flush_slab(s, c); |
1425 | } | 1414 | } |
1426 | 1415 | ||
1427 | static void flush_cpu_slab(void *d) | 1416 | static void flush_cpu_slab(void *d) |
1428 | { | 1417 | { |
1429 | struct kmem_cache *s = d; | 1418 | struct kmem_cache *s = d; |
1430 | int cpu = smp_processor_id(); | ||
1431 | 1419 | ||
1432 | __flush_cpu_slab(s, cpu); | 1420 | __flush_cpu_slab(s, smp_processor_id()); |
1433 | } | 1421 | } |
1434 | 1422 | ||
1435 | static void flush_all(struct kmem_cache *s) | 1423 | static void flush_all(struct kmem_cache *s) |
@@ -1446,6 +1434,19 @@ static void flush_all(struct kmem_cache *s) | |||
1446 | } | 1434 | } |
1447 | 1435 | ||
1448 | /* | 1436 | /* |
1437 | * Check if the objects in a per cpu structure fit numa | ||
1438 | * locality expectations. | ||
1439 | */ | ||
1440 | static inline int node_match(struct kmem_cache_cpu *c, int node) | ||
1441 | { | ||
1442 | #ifdef CONFIG_NUMA | ||
1443 | if (node != -1 && c->node != node) | ||
1444 | return 0; | ||
1445 | #endif | ||
1446 | return 1; | ||
1447 | } | ||
1448 | |||
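
node_match() above gates the fast path on NUMA locality: node == -1 means any node is acceptable, otherwise the per-cpu slab must live on the requested node. A minimal userspace model of that predicate follows; struct toy_cpu_slab is an invented stand-in for kmem_cache_cpu.

#include <stdio.h>

struct toy_cpu_slab { int node; };

static int toy_node_match(const struct toy_cpu_slab *c, int node)
{
        return node == -1 || c->node == node;
}

int main(void)
{
        struct toy_cpu_slab c = { .node = 1 };

        printf("%d\n", toy_node_match(&c, -1));  /* 1: caller doesn't care  */
        printf("%d\n", toy_node_match(&c, 1));   /* 1: same node            */
        printf("%d\n", toy_node_match(&c, 0));   /* 0: forces the slow path */
        return 0;
}
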
1449 | /* | ||
1449 | * Slow path. The lockless freelist is empty or we need to perform | 1450 | * Slow path. The lockless freelist is empty or we need to perform |
1450 | * debugging duties. | 1451 | * debugging duties. |
1451 | * | 1452 | * |
@@ -1463,45 +1464,53 @@ static void flush_all(struct kmem_cache *s) | |||
1463 | * we need to allocate a new slab. This is slowest path since we may sleep. | 1464 | * we need to allocate a new slab. This is slowest path since we may sleep. |
1464 | */ | 1465 | */ |
1465 | static void *__slab_alloc(struct kmem_cache *s, | 1466 | static void *__slab_alloc(struct kmem_cache *s, |
1466 | gfp_t gfpflags, int node, void *addr, struct page *page) | 1467 | gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) |
1467 | { | 1468 | { |
1468 | void **object; | 1469 | void **object; |
1469 | int cpu = smp_processor_id(); | 1470 | struct page *new; |
1470 | 1471 | ||
1471 | if (!page) | 1472 | if (!c->page) |
1472 | goto new_slab; | 1473 | goto new_slab; |
1473 | 1474 | ||
1474 | slab_lock(page); | 1475 | slab_lock(c->page); |
1475 | if (unlikely(node != -1 && page_to_nid(page) != node)) | 1476 | if (unlikely(!node_match(c, node))) |
1476 | goto another_slab; | 1477 | goto another_slab; |
1477 | load_freelist: | 1478 | load_freelist: |
1478 | object = page->freelist; | 1479 | object = c->page->freelist; |
1479 | if (unlikely(!object)) | 1480 | if (unlikely(!object)) |
1480 | goto another_slab; | 1481 | goto another_slab; |
1481 | if (unlikely(SlabDebug(page))) | 1482 | if (unlikely(SlabDebug(c->page))) |
1482 | goto debug; | 1483 | goto debug; |
1483 | 1484 | ||
1484 | object = page->freelist; | 1485 | object = c->page->freelist; |
1485 | page->lockless_freelist = object[page->offset]; | 1486 | c->freelist = object[c->offset]; |
1486 | page->inuse = s->objects; | 1487 | c->page->inuse = s->objects; |
1487 | page->freelist = NULL; | 1488 | c->page->freelist = NULL; |
1488 | slab_unlock(page); | 1489 | c->node = page_to_nid(c->page); |
1490 | slab_unlock(c->page); | ||
1489 | return object; | 1491 | return object; |
1490 | 1492 | ||
1491 | another_slab: | 1493 | another_slab: |
1492 | deactivate_slab(s, page, cpu); | 1494 | deactivate_slab(s, c); |
1493 | 1495 | ||
1494 | new_slab: | 1496 | new_slab: |
1495 | page = get_partial(s, gfpflags, node); | 1497 | new = get_partial(s, gfpflags, node); |
1496 | if (page) { | 1498 | if (new) { |
1497 | s->cpu_slab[cpu] = page; | 1499 | c->page = new; |
1498 | goto load_freelist; | 1500 | goto load_freelist; |
1499 | } | 1501 | } |
1500 | 1502 | ||
1501 | page = new_slab(s, gfpflags, node); | 1503 | if (gfpflags & __GFP_WAIT) |
1502 | if (page) { | 1504 | local_irq_enable(); |
1503 | cpu = smp_processor_id(); | 1505 | |
1504 | if (s->cpu_slab[cpu]) { | 1506 | new = new_slab(s, gfpflags, node); |
1507 | |||
1508 | if (gfpflags & __GFP_WAIT) | ||
1509 | local_irq_disable(); | ||
1510 | |||
1511 | if (new) { | ||
1512 | c = get_cpu_slab(s, smp_processor_id()); | ||
1513 | if (c->page) { | ||
1505 | /* | 1514 | /* |
1506 | * Someone else populated the cpu_slab while we | 1515 | * Someone else populated the cpu_slab while we |
1507 | * enabled interrupts, or we have gotten scheduled | 1516 | * enabled interrupts, or we have gotten scheduled |
@@ -1509,34 +1518,33 @@ new_slab: | |||
1509 | * requested node even if __GFP_THISNODE was | 1518 | * requested node even if __GFP_THISNODE was |
1510 | * specified. So we need to recheck. | 1519 | * specified. So we need to recheck. |
1511 | */ | 1520 | */ |
1512 | if (node == -1 || | 1521 | if (node_match(c, node)) { |
1513 | page_to_nid(s->cpu_slab[cpu]) == node) { | ||
1514 | /* | 1522 | /* |
1515 | * Current cpuslab is acceptable and we | 1523 | * Current cpuslab is acceptable and we |
1516 | * want the current one since its cache hot | 1524 | * want the current one since its cache hot |
1517 | */ | 1525 | */ |
1518 | discard_slab(s, page); | 1526 | discard_slab(s, new); |
1519 | page = s->cpu_slab[cpu]; | 1527 | slab_lock(c->page); |
1520 | slab_lock(page); | ||
1521 | goto load_freelist; | 1528 | goto load_freelist; |
1522 | } | 1529 | } |
1523 | /* New slab does not fit our expectations */ | 1530 | /* New slab does not fit our expectations */ |
1524 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1531 | flush_slab(s, c); |
1525 | } | 1532 | } |
1526 | slab_lock(page); | 1533 | slab_lock(new); |
1527 | SetSlabFrozen(page); | 1534 | SetSlabFrozen(new); |
1528 | s->cpu_slab[cpu] = page; | 1535 | c->page = new; |
1529 | goto load_freelist; | 1536 | goto load_freelist; |
1530 | } | 1537 | } |
1531 | return NULL; | 1538 | return NULL; |
1532 | debug: | 1539 | debug: |
1533 | object = page->freelist; | 1540 | object = c->page->freelist; |
1534 | if (!alloc_debug_processing(s, page, object, addr)) | 1541 | if (!alloc_debug_processing(s, c->page, object, addr)) |
1535 | goto another_slab; | 1542 | goto another_slab; |
1536 | 1543 | ||
1537 | page->inuse++; | 1544 | c->page->inuse++; |
1538 | page->freelist = object[page->offset]; | 1545 | c->page->freelist = object[c->offset]; |
1539 | slab_unlock(page); | 1546 | c->node = -1; |
1547 | slab_unlock(c->page); | ||
1540 | return object; | 1548 | return object; |
1541 | } | 1549 | } |
1542 | 1550 | ||
@@ -1553,25 +1561,24 @@ debug: | |||
1553 | static void __always_inline *slab_alloc(struct kmem_cache *s, | 1561 | static void __always_inline *slab_alloc(struct kmem_cache *s, |
1554 | gfp_t gfpflags, int node, void *addr) | 1562 | gfp_t gfpflags, int node, void *addr) |
1555 | { | 1563 | { |
1556 | struct page *page; | ||
1557 | void **object; | 1564 | void **object; |
1558 | unsigned long flags; | 1565 | unsigned long flags; |
1566 | struct kmem_cache_cpu *c; | ||
1559 | 1567 | ||
1560 | local_irq_save(flags); | 1568 | local_irq_save(flags); |
1561 | page = s->cpu_slab[smp_processor_id()]; | 1569 | c = get_cpu_slab(s, smp_processor_id()); |
1562 | if (unlikely(!page || !page->lockless_freelist || | 1570 | if (unlikely(!c->freelist || !node_match(c, node))) |
1563 | (node != -1 && page_to_nid(page) != node))) | ||
1564 | 1571 | ||
1565 | object = __slab_alloc(s, gfpflags, node, addr, page); | 1572 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1566 | 1573 | ||
1567 | else { | 1574 | else { |
1568 | object = page->lockless_freelist; | 1575 | object = c->freelist; |
1569 | page->lockless_freelist = object[page->offset]; | 1576 | c->freelist = object[c->offset]; |
1570 | } | 1577 | } |
1571 | local_irq_restore(flags); | 1578 | local_irq_restore(flags); |
1572 | 1579 | ||
1573 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1580 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
1574 | memset(object, 0, s->objsize); | 1581 | memset(object, 0, c->objsize); |
1575 | 1582 | ||
1576 | return object; | 1583 | return object; |
1577 | } | 1584 | } |
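
The slab_alloc() fast path above pops an object off the per-cpu freelist with the free pointer stored inside the object itself, c->offset pointer-words from its start. Here is a single-threaded userspace sketch of that linked-list encoding; the object layout and offset value are illustrative, not SLUB's.

#include <stdio.h>

#define OBJ_WORDS  4           /* object size in pointer-sized words */
#define FP_OFFSET  0           /* stand-in for kmem_cache_cpu.offset */

static void *freelist;         /* stand-in for c->freelist */

static void push(void *object)
{
        /* store the old list head inside the object, then publish it */
        ((void **)object)[FP_OFFSET] = freelist;
        freelist = object;
}

static void *pop(void)
{
        void **object = freelist;

        if (!object)
                return NULL;                 /* would take the slow path */
        freelist = object[FP_OFFSET];        /* c->freelist = object[c->offset] */
        return object;
}

int main(void)
{
        void *a[OBJ_WORDS];                  /* two fake "objects" */
        void *b[OBJ_WORDS];

        push(a);
        push(b);
        printf("pop order: %s, %s\n",
               pop() == (void *)b ? "b" : "?",
               pop() == (void *)a ? "a" : "?");
        return 0;
}
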
@@ -1599,7 +1606,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
1599 | * handling required then we can return immediately. | 1606 | * handling required then we can return immediately. |
1600 | */ | 1607 | */ |
1601 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1608 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1602 | void *x, void *addr) | 1609 | void *x, void *addr, unsigned int offset) |
1603 | { | 1610 | { |
1604 | void *prior; | 1611 | void *prior; |
1605 | void **object = (void *)x; | 1612 | void **object = (void *)x; |
@@ -1609,7 +1616,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
1609 | if (unlikely(SlabDebug(page))) | 1616 | if (unlikely(SlabDebug(page))) |
1610 | goto debug; | 1617 | goto debug; |
1611 | checks_ok: | 1618 | checks_ok: |
1612 | prior = object[page->offset] = page->freelist; | 1619 | prior = object[offset] = page->freelist; |
1613 | page->freelist = object; | 1620 | page->freelist = object; |
1614 | page->inuse--; | 1621 | page->inuse--; |
1615 | 1622 | ||
@@ -1664,15 +1671,16 @@ static void __always_inline slab_free(struct kmem_cache *s, | |||
1664 | { | 1671 | { |
1665 | void **object = (void *)x; | 1672 | void **object = (void *)x; |
1666 | unsigned long flags; | 1673 | unsigned long flags; |
1674 | struct kmem_cache_cpu *c; | ||
1667 | 1675 | ||
1668 | local_irq_save(flags); | 1676 | local_irq_save(flags); |
1669 | debug_check_no_locks_freed(object, s->objsize); | 1677 | debug_check_no_locks_freed(object, s->objsize); |
1670 | if (likely(page == s->cpu_slab[smp_processor_id()] && | 1678 | c = get_cpu_slab(s, smp_processor_id()); |
1671 | !SlabDebug(page))) { | 1679 | if (likely(page == c->page && c->node >= 0)) { |
1672 | object[page->offset] = page->lockless_freelist; | 1680 | object[c->offset] = c->freelist; |
1673 | page->lockless_freelist = object; | 1681 | c->freelist = object; |
1674 | } else | 1682 | } else |
1675 | __slab_free(s, page, x, addr); | 1683 | __slab_free(s, page, x, addr, c->offset); |
1676 | 1684 | ||
1677 | local_irq_restore(flags); | 1685 | local_irq_restore(flags); |
1678 | } | 1686 | } |
@@ -1759,14 +1767,6 @@ static inline int slab_order(int size, int min_objects, | |||
1759 | int rem; | 1767 | int rem; |
1760 | int min_order = slub_min_order; | 1768 | int min_order = slub_min_order; |
1761 | 1769 | ||
1762 | /* | ||
1763 | * If we would create too many object per slab then reduce | ||
1764 | * the slab order even if it goes below slub_min_order. | ||
1765 | */ | ||
1766 | while (min_order > 0 && | ||
1767 | (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) | ||
1768 | min_order--; | ||
1769 | |||
1770 | for (order = max(min_order, | 1770 | for (order = max(min_order, |
1771 | fls(min_objects * size - 1) - PAGE_SHIFT); | 1771 | fls(min_objects * size - 1) - PAGE_SHIFT); |
1772 | order <= max_order; order++) { | 1772 | order <= max_order; order++) { |
@@ -1781,9 +1781,6 @@ static inline int slab_order(int size, int min_objects, | |||
1781 | if (rem <= slab_size / fract_leftover) | 1781 | if (rem <= slab_size / fract_leftover) |
1782 | break; | 1782 | break; |
1783 | 1783 | ||
1784 | /* If the next size is too high then exit now */ | ||
1785 | if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) | ||
1786 | break; | ||
1787 | } | 1784 | } |
1788 | 1785 | ||
1789 | return order; | 1786 | return order; |
@@ -1858,6 +1855,16 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
1858 | return ALIGN(align, sizeof(void *)); | 1855 | return ALIGN(align, sizeof(void *)); |
1859 | } | 1856 | } |
1860 | 1857 | ||
1858 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
1859 | struct kmem_cache_cpu *c) | ||
1860 | { | ||
1861 | c->page = NULL; | ||
1862 | c->freelist = NULL; | ||
1863 | c->node = 0; | ||
1864 | c->offset = s->offset / sizeof(void *); | ||
1865 | c->objsize = s->objsize; | ||
1866 | } | ||
1867 | |||
1861 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1868 | static void init_kmem_cache_node(struct kmem_cache_node *n) |
1862 | { | 1869 | { |
1863 | n->nr_partial = 0; | 1870 | n->nr_partial = 0; |
@@ -1869,6 +1876,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
1869 | #endif | 1876 | #endif |
1870 | } | 1877 | } |
1871 | 1878 | ||
1879 | #ifdef CONFIG_SMP | ||
1880 | /* | ||
1881 | * Per cpu array for per cpu structures. | ||
1882 | * | ||
1883 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
1884 | * close together meaning that it becomes possible that multiple per cpu | ||
1885 | * structures are contained in one cacheline. This may be particularly | ||
1886 | * beneficial for the kmalloc caches. | ||
1887 | * | ||
1888 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
1889 | * likely able to get per cpu structures for all caches from the array defined | ||
1890 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
1891 | * | ||
1892 | * If the per cpu array is exhausted then fall back to kmalloc | ||
1893 | * of individual cachelines. No sharing is possible then. | ||
1894 | */ | ||
1895 | #define NR_KMEM_CACHE_CPU 100 | ||
1896 | |||
1897 | static DEFINE_PER_CPU(struct kmem_cache_cpu, | ||
1898 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; | ||
1899 | |||
1900 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
1901 | static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; | ||
1902 | |||
1903 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
1904 | int cpu, gfp_t flags) | ||
1905 | { | ||
1906 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
1907 | |||
1908 | if (c) | ||
1909 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
1910 | (void *)c->freelist; | ||
1911 | else { | ||
1912 | /* Table overflow: So allocate ourselves */ | ||
1913 | c = kmalloc_node( | ||
1914 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
1915 | flags, cpu_to_node(cpu)); | ||
1916 | if (!c) | ||
1917 | return NULL; | ||
1918 | } | ||
1919 | |||
1920 | init_kmem_cache_cpu(s, c); | ||
1921 | return c; | ||
1922 | } | ||
1923 | |||
1924 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
1925 | { | ||
1926 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
1927 | c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
1928 | kfree(c); | ||
1929 | return; | ||
1930 | } | ||
1931 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
1932 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
1933 | } | ||
1934 | |||
1935 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
1936 | { | ||
1937 | int cpu; | ||
1938 | |||
1939 | for_each_online_cpu(cpu) { | ||
1940 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
1941 | |||
1942 | if (c) { | ||
1943 | s->cpu_slab[cpu] = NULL; | ||
1944 | free_kmem_cache_cpu(c, cpu); | ||
1945 | } | ||
1946 | } | ||
1947 | } | ||
1948 | |||
1949 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
1950 | { | ||
1951 | int cpu; | ||
1952 | |||
1953 | for_each_online_cpu(cpu) { | ||
1954 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
1955 | |||
1956 | if (c) | ||
1957 | continue; | ||
1958 | |||
1959 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
1960 | if (!c) { | ||
1961 | free_kmem_cache_cpus(s); | ||
1962 | return 0; | ||
1963 | } | ||
1964 | s->cpu_slab[cpu] = c; | ||
1965 | } | ||
1966 | return 1; | ||
1967 | } | ||
1968 | |||
1969 | /* | ||
1970 | * Initialize the per cpu array. | ||
1971 | */ | ||
1972 | static void init_alloc_cpu_cpu(int cpu) | ||
1973 | { | ||
1974 | int i; | ||
1975 | |||
1976 | if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) | ||
1977 | return; | ||
1978 | |||
1979 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
1980 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
1981 | |||
1982 | cpu_set(cpu, kmem_cach_cpu_free_init_once); | ||
1983 | } | ||
1984 | |||
1985 | static void __init init_alloc_cpu(void) | ||
1986 | { | ||
1987 | int cpu; | ||
1988 | |||
1989 | for_each_online_cpu(cpu) | ||
1990 | init_alloc_cpu_cpu(cpu); | ||
1991 | } | ||
1992 | |||
1993 | #else | ||
1994 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | ||
1995 | static inline void init_alloc_cpu(void) {} | ||
1996 | |||
1997 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
1998 | { | ||
1999 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
2000 | return 1; | ||
2001 | } | ||
2002 | #endif | ||
2003 | |||
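
The CONFIG_SMP block above hands out kmem_cache_cpu structures from a small static per-cpu array first and only falls back to kmalloc when that pool runs dry; on free, a pointer comparison against the array bounds decides which way the structure goes back. A userspace sketch of the same pattern (POOL_SIZE, pcpu_alloc and friends are illustrative names):

#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 4	/* stand-in for NR_KMEM_CACHE_CPU */

/* Toy control structure; only the freelist link matters for the pool. */
struct pcpu {
	void *freelist;
};

static struct pcpu pool[POOL_SIZE];
static struct pcpu *pool_free;	/* head of the free chain inside the pool */

static void pool_init(void)
{
	int i;

	/* Thread every slot onto the free chain, as init_alloc_cpu_cpu() does. */
	for (i = POOL_SIZE - 1; i >= 0; i--) {
		pool[i].freelist = pool_free;
		pool_free = &pool[i];
	}
}

static struct pcpu *pcpu_alloc(void)
{
	struct pcpu *c = pool_free;

	if (c) {
		pool_free = c->freelist;
		return c;
	}
	/* Pool exhausted: fall back to the general allocator. */
	return calloc(1, sizeof(*c));
}

static void pcpu_free(struct pcpu *c)
{
	/* Same bounds test the kernel uses to spot fallback allocations. */
	if (c < pool || c >= pool + POOL_SIZE) {
		free(c);
		return;
	}
	c->freelist = pool_free;
	pool_free = c;
}

int main(void)
{
	struct pcpu *a, *b;

	pool_init();
	a = pcpu_alloc();
	b = pcpu_alloc();
	printf("a in pool: %d, b in pool: %d\n",
	       a >= pool && a < pool + POOL_SIZE,
	       b >= pool && b < pool + POOL_SIZE);
	pcpu_free(b);
	pcpu_free(a);
	return 0;
}

Packing the structures into one static array per CPU is what lets several caches share a cacheline, the property the comment above singles out as useful for the kmalloc caches.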
1872 | #ifdef CONFIG_NUMA | 2004 | #ifdef CONFIG_NUMA |
1873 | /* | 2005 | /* |
1874 | * No kmalloc_node yet so do it by hand. We know that this is the first | 2006 | * No kmalloc_node yet so do it by hand. We know that this is the first |
@@ -1876,10 +2008,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
1876 | * possible. | 2008 | * possible. |
1877 | * | 2009 | * |
1878 | * Note that this function only works on the kmalloc_node_cache | 2010 | * Note that this function only works on the kmalloc_node_cache |
1879 | * when allocating for the kmalloc_node_cache. | 2011 | * when allocating for the kmalloc_node_cache. This is used for bootstrapping |
2012 | * memory on a fresh node that has no slab structures yet. | ||
1880 | */ | 2013 | */ |
1881 | static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, | 2014 | static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, |
1882 | int node) | 2015 | int node) |
1883 | { | 2016 | { |
1884 | struct page *page; | 2017 | struct page *page; |
1885 | struct kmem_cache_node *n; | 2018 | struct kmem_cache_node *n; |
@@ -1908,12 +2041,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag | |||
1908 | init_kmem_cache_node(n); | 2041 | init_kmem_cache_node(n); |
1909 | atomic_long_inc(&n->nr_slabs); | 2042 | atomic_long_inc(&n->nr_slabs); |
1910 | add_partial(n, page); | 2043 | add_partial(n, page); |
1911 | |||
1912 | /* | ||
1913 | * new_slab() disables interupts. If we do not reenable interrupts here | ||
1914 | * then bootup would continue with interrupts disabled. | ||
1915 | */ | ||
1916 | local_irq_enable(); | ||
1917 | return n; | 2044 | return n; |
1918 | } | 2045 | } |
1919 | 2046 | ||
@@ -1921,7 +2048,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
1921 | { | 2048 | { |
1922 | int node; | 2049 | int node; |
1923 | 2050 | ||
1924 | for_each_online_node(node) { | 2051 | for_each_node_state(node, N_NORMAL_MEMORY) { |
1925 | struct kmem_cache_node *n = s->node[node]; | 2052 | struct kmem_cache_node *n = s->node[node]; |
1926 | if (n && n != &s->local_node) | 2053 | if (n && n != &s->local_node) |
1927 | kmem_cache_free(kmalloc_caches, n); | 2054 | kmem_cache_free(kmalloc_caches, n); |
@@ -1939,7 +2066,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
1939 | else | 2066 | else |
1940 | local_node = 0; | 2067 | local_node = 0; |
1941 | 2068 | ||
1942 | for_each_online_node(node) { | 2069 | for_each_node_state(node, N_NORMAL_MEMORY) { |
1943 | struct kmem_cache_node *n; | 2070 | struct kmem_cache_node *n; |
1944 | 2071 | ||
1945 | if (local_node == node) | 2072 | if (local_node == node) |
@@ -2077,21 +2204,14 @@ static int calculate_sizes(struct kmem_cache *s) | |||
2077 | */ | 2204 | */ |
2078 | s->objects = (PAGE_SIZE << s->order) / size; | 2205 | s->objects = (PAGE_SIZE << s->order) / size; |
2079 | 2206 | ||
2080 | /* | 2207 | return !!s->objects; |
2081 | * Verify that the number of objects is within permitted limits. | ||
2082 | * The page->inuse field is only 16 bit wide! So we cannot have | ||
2083 | * more than 64k objects per slab. | ||
2084 | */ | ||
2085 | if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) | ||
2086 | return 0; | ||
2087 | return 1; | ||
2088 | 2208 | ||
2089 | } | 2209 | } |
2090 | 2210 | ||
2091 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 2211 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
2092 | const char *name, size_t size, | 2212 | const char *name, size_t size, |
2093 | size_t align, unsigned long flags, | 2213 | size_t align, unsigned long flags, |
2094 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2214 | void (*ctor)(struct kmem_cache *, void *)) |
2095 | { | 2215 | { |
2096 | memset(s, 0, kmem_size); | 2216 | memset(s, 0, kmem_size); |
2097 | s->name = name; | 2217 | s->name = name; |
@@ -2107,9 +2227,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2107 | #ifdef CONFIG_NUMA | 2227 | #ifdef CONFIG_NUMA |
2108 | s->defrag_ratio = 100; | 2228 | s->defrag_ratio = 100; |
2109 | #endif | 2229 | #endif |
2230 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | ||
2231 | goto error; | ||
2110 | 2232 | ||
2111 | if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2233 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
2112 | return 1; | 2234 | return 1; |
2235 | free_kmem_cache_nodes(s); | ||
2113 | error: | 2236 | error: |
2114 | if (flags & SLAB_PANIC) | 2237 | if (flags & SLAB_PANIC) |
2115 | panic("Cannot create slab %s size=%lu realsize=%u " | 2238 | panic("Cannot create slab %s size=%lu realsize=%u " |
@@ -2192,7 +2315,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
2192 | flush_all(s); | 2315 | flush_all(s); |
2193 | 2316 | ||
2194 | /* Attempt to free all objects */ | 2317 | /* Attempt to free all objects */ |
2195 | for_each_online_node(node) { | 2318 | free_kmem_cache_cpus(s); |
2319 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
2196 | struct kmem_cache_node *n = get_node(s, node); | 2320 | struct kmem_cache_node *n = get_node(s, node); |
2197 | 2321 | ||
2198 | n->nr_partial -= free_list(s, n, &n->partial); | 2322 | n->nr_partial -= free_list(s, n, &n->partial); |
@@ -2227,11 +2351,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
2227 | * Kmalloc subsystem | 2351 | * Kmalloc subsystem |
2228 | *******************************************************************/ | 2352 | *******************************************************************/ |
2229 | 2353 | ||
2230 | struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; | 2354 | struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; |
2231 | EXPORT_SYMBOL(kmalloc_caches); | 2355 | EXPORT_SYMBOL(kmalloc_caches); |
2232 | 2356 | ||
2233 | #ifdef CONFIG_ZONE_DMA | 2357 | #ifdef CONFIG_ZONE_DMA |
2234 | static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; | 2358 | static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; |
2235 | #endif | 2359 | #endif |
2236 | 2360 | ||
2237 | static int __init setup_slub_min_order(char *str) | 2361 | static int __init setup_slub_min_order(char *str) |
@@ -2397,12 +2521,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
2397 | return ZERO_SIZE_PTR; | 2521 | return ZERO_SIZE_PTR; |
2398 | 2522 | ||
2399 | index = size_index[(size - 1) / 8]; | 2523 | index = size_index[(size - 1) / 8]; |
2400 | } else { | 2524 | } else |
2401 | if (size > KMALLOC_MAX_SIZE) | ||
2402 | return NULL; | ||
2403 | |||
2404 | index = fls(size - 1); | 2525 | index = fls(size - 1); |
2405 | } | ||
2406 | 2526 | ||
2407 | #ifdef CONFIG_ZONE_DMA | 2527 | #ifdef CONFIG_ZONE_DMA |
2408 | if (unlikely((flags & SLUB_DMA))) | 2528 | if (unlikely((flags & SLUB_DMA))) |
@@ -2414,9 +2534,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
2414 | 2534 | ||
2415 | void *__kmalloc(size_t size, gfp_t flags) | 2535 | void *__kmalloc(size_t size, gfp_t flags) |
2416 | { | 2536 | { |
2417 | struct kmem_cache *s = get_slab(size, flags); | 2537 | struct kmem_cache *s; |
2418 | 2538 | ||
2419 | if (ZERO_OR_NULL_PTR(s)) | 2539 | if (unlikely(size > PAGE_SIZE / 2)) |
2540 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
2541 | get_order(size)); | ||
2542 | |||
2543 | s = get_slab(size, flags); | ||
2544 | |||
2545 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
2420 | return s; | 2546 | return s; |
2421 | 2547 | ||
2422 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); | 2548 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); |
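
After this hunk, requests larger than half a page never touch a slab cache: they go straight to the page allocator with __GFP_COMP, and everything smaller is routed to a power-of-two cache selected via fls(). A simplified model of that dispatch (it ignores the size_index table used for small sizes):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* fls(x): position of the most significant set bit, 1-based; fls(0) == 0. */
static int fls_(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/*
 * Which kmalloc cache index would serve this request, or -1 when the
 * request is big enough to go straight to the page allocator?
 */
static int kmalloc_index(unsigned long size)
{
	if (size > PAGE_SIZE / 2)
		return -1;		/* __get_free_pages() path */
	return fls_(size - 1);		/* power-of-two cache index */
}

int main(void)
{
	unsigned long sizes[] = { 8, 100, 2048, 3000 };
	unsigned int i;

	/* 8 -> index 3 (kmalloc-8), 100 -> 7 (kmalloc-128),
	 * 2048 -> 11 (kmalloc-2048), 3000 -> page allocator. */
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("kmalloc(%lu) -> index %d\n",
		       sizes[i], kmalloc_index(sizes[i]));
	return 0;
}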
@@ -2426,9 +2552,15 @@ EXPORT_SYMBOL(__kmalloc); | |||
2426 | #ifdef CONFIG_NUMA | 2552 | #ifdef CONFIG_NUMA |
2427 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 2553 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
2428 | { | 2554 | { |
2429 | struct kmem_cache *s = get_slab(size, flags); | 2555 | struct kmem_cache *s; |
2430 | 2556 | ||
2431 | if (ZERO_OR_NULL_PTR(s)) | 2557 | if (unlikely(size > PAGE_SIZE / 2)) |
2558 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
2559 | get_order(size)); | ||
2560 | |||
2561 | s = get_slab(size, flags); | ||
2562 | |||
2563 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
2432 | return s; | 2564 | return s; |
2433 | 2565 | ||
2434 | return slab_alloc(s, flags, node, __builtin_return_address(0)); | 2566 | return slab_alloc(s, flags, node, __builtin_return_address(0)); |
@@ -2441,7 +2573,8 @@ size_t ksize(const void *object) | |||
2441 | struct page *page; | 2573 | struct page *page; |
2442 | struct kmem_cache *s; | 2574 | struct kmem_cache *s; |
2443 | 2575 | ||
2444 | if (ZERO_OR_NULL_PTR(object)) | 2576 | BUG_ON(!object); |
2577 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
2445 | return 0; | 2578 | return 0; |
2446 | 2579 | ||
2447 | page = get_object_page(object); | 2580 | page = get_object_page(object); |
@@ -2473,22 +2606,17 @@ EXPORT_SYMBOL(ksize); | |||
2473 | 2606 | ||
2474 | void kfree(const void *x) | 2607 | void kfree(const void *x) |
2475 | { | 2608 | { |
2476 | struct kmem_cache *s; | ||
2477 | struct page *page; | 2609 | struct page *page; |
2478 | 2610 | ||
2479 | /* | 2611 | if (unlikely(ZERO_OR_NULL_PTR(x))) |
2480 | * This has to be an unsigned comparison. According to Linus | ||
2481 | * some gcc version treat a pointer as a signed entity. Then | ||
2482 | * this comparison would be true for all "negative" pointers | ||
2483 | * (which would cover the whole upper half of the address space). | ||
2484 | */ | ||
2485 | if (ZERO_OR_NULL_PTR(x)) | ||
2486 | return; | 2612 | return; |
2487 | 2613 | ||
2488 | page = virt_to_head_page(x); | 2614 | page = virt_to_head_page(x); |
2489 | s = page->slab; | 2615 | if (unlikely(!PageSlab(page))) { |
2490 | 2616 | put_page(page); | |
2491 | slab_free(s, page, (void *)x, __builtin_return_address(0)); | 2617 | return; |
2618 | } | ||
2619 | slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); | ||
2492 | } | 2620 | } |
2493 | EXPORT_SYMBOL(kfree); | 2621 | EXPORT_SYMBOL(kfree); |
2494 | 2622 | ||
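
Since large allocations now come from the page allocator, kfree() has to inspect the page the pointer falls in: PageSlab pages are handed to slab_free(), anything else is a compound page whose reference is simply dropped with put_page(). A toy model of that two-way dispatch (the backing tag stands in for the PageSlab flag, which a userspace program cannot consult):

#include <stdio.h>
#include <stdlib.h>

enum backing { FROM_SLAB, FROM_PAGES };

/* Toy allocation header standing in for the struct page flag lookup. */
struct alloc {
	enum backing backing;
	/* payload follows */
};

static void *toy_kmalloc(size_t size, size_t threshold)
{
	struct alloc *a = malloc(sizeof(*a) + size);

	a->backing = size > threshold ? FROM_PAGES : FROM_SLAB;
	return a + 1;
}

static void toy_kfree(void *x)
{
	struct alloc *a = (struct alloc *)x - 1;

	if (a->backing == FROM_PAGES)
		printf("put_page() path\n");	/* drop the compound page */
	else
		printf("slab_free() path\n");	/* back onto a freelist */
	free(a);
}

int main(void)
{
	void *small = toy_kmalloc(64, 2048);
	void *big = toy_kmalloc(8192, 2048);

	toy_kfree(small);
	toy_kfree(big);
	return 0;
}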
@@ -2517,7 +2645,7 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2517 | return -ENOMEM; | 2645 | return -ENOMEM; |
2518 | 2646 | ||
2519 | flush_all(s); | 2647 | flush_all(s); |
2520 | for_each_online_node(node) { | 2648 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2521 | n = get_node(s, node); | 2649 | n = get_node(s, node); |
2522 | 2650 | ||
2523 | if (!n->nr_partial) | 2651 | if (!n->nr_partial) |
@@ -2575,6 +2703,8 @@ void __init kmem_cache_init(void) | |||
2575 | int i; | 2703 | int i; |
2576 | int caches = 0; | 2704 | int caches = 0; |
2577 | 2705 | ||
2706 | init_alloc_cpu(); | ||
2707 | |||
2578 | #ifdef CONFIG_NUMA | 2708 | #ifdef CONFIG_NUMA |
2579 | /* | 2709 | /* |
2580 | * Must first have the slab cache available for the allocations of the | 2710 | * Must first have the slab cache available for the allocations of the |
@@ -2602,7 +2732,7 @@ void __init kmem_cache_init(void) | |||
2602 | caches++; | 2732 | caches++; |
2603 | } | 2733 | } |
2604 | 2734 | ||
2605 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | 2735 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { |
2606 | create_kmalloc_cache(&kmalloc_caches[i], | 2736 | create_kmalloc_cache(&kmalloc_caches[i], |
2607 | "kmalloc", 1 << i, GFP_KERNEL); | 2737 | "kmalloc", 1 << i, GFP_KERNEL); |
2608 | caches++; | 2738 | caches++; |
@@ -2629,16 +2759,18 @@ void __init kmem_cache_init(void) | |||
2629 | slab_state = UP; | 2759 | slab_state = UP; |
2630 | 2760 | ||
2631 | /* Provide the correct kmalloc names now that the caches are up */ | 2761 | /* Provide the correct kmalloc names now that the caches are up */ |
2632 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) | 2762 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) |
2633 | kmalloc_caches[i]. name = | 2763 | kmalloc_caches[i]. name = |
2634 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 2764 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); |
2635 | 2765 | ||
2636 | #ifdef CONFIG_SMP | 2766 | #ifdef CONFIG_SMP |
2637 | register_cpu_notifier(&slab_notifier); | 2767 | register_cpu_notifier(&slab_notifier); |
2768 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
2769 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | ||
2770 | #else | ||
2771 | kmem_size = sizeof(struct kmem_cache); | ||
2638 | #endif | 2772 | #endif |
2639 | 2773 | ||
2640 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
2641 | nr_cpu_ids * sizeof(struct page *); | ||
2642 | 2774 | ||
2643 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2775 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
2644 | " CPUs=%d, Nodes=%d\n", | 2776 | " CPUs=%d, Nodes=%d\n", |
@@ -2669,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2669 | 2801 | ||
2670 | static struct kmem_cache *find_mergeable(size_t size, | 2802 | static struct kmem_cache *find_mergeable(size_t size, |
2671 | size_t align, unsigned long flags, const char *name, | 2803 | size_t align, unsigned long flags, const char *name, |
2672 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2804 | void (*ctor)(struct kmem_cache *, void *)) |
2673 | { | 2805 | { |
2674 | struct kmem_cache *s; | 2806 | struct kmem_cache *s; |
2675 | 2807 | ||
@@ -2710,19 +2842,28 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
2710 | 2842 | ||
2711 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 2843 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
2712 | size_t align, unsigned long flags, | 2844 | size_t align, unsigned long flags, |
2713 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2845 | void (*ctor)(struct kmem_cache *, void *)) |
2714 | { | 2846 | { |
2715 | struct kmem_cache *s; | 2847 | struct kmem_cache *s; |
2716 | 2848 | ||
2717 | down_write(&slub_lock); | 2849 | down_write(&slub_lock); |
2718 | s = find_mergeable(size, align, flags, name, ctor); | 2850 | s = find_mergeable(size, align, flags, name, ctor); |
2719 | if (s) { | 2851 | if (s) { |
2852 | int cpu; | ||
2853 | |||
2720 | s->refcount++; | 2854 | s->refcount++; |
2721 | /* | 2855 | /* |
2722 | * Adjust the object sizes so that we clear | 2856 | * Adjust the object sizes so that we clear |
2723 | * the complete object on kzalloc. | 2857 | * the complete object on kzalloc. |
2724 | */ | 2858 | */ |
2725 | s->objsize = max(s->objsize, (int)size); | 2859 | s->objsize = max(s->objsize, (int)size); |
2860 | |||
2861 | /* | ||
2862 | * And then we need to update the object size in the | ||
2863 | * per cpu structures | ||
2864 | */ | ||
2865 | for_each_online_cpu(cpu) | ||
2866 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
2726 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 2867 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
2727 | up_write(&slub_lock); | 2868 | up_write(&slub_lock); |
2728 | if (sysfs_slab_alias(s, name)) | 2869 | if (sysfs_slab_alias(s, name)) |
@@ -2765,15 +2906,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
2765 | unsigned long flags; | 2906 | unsigned long flags; |
2766 | 2907 | ||
2767 | switch (action) { | 2908 | switch (action) { |
2909 | case CPU_UP_PREPARE: | ||
2910 | case CPU_UP_PREPARE_FROZEN: | ||
2911 | init_alloc_cpu_cpu(cpu); | ||
2912 | down_read(&slub_lock); | ||
2913 | list_for_each_entry(s, &slab_caches, list) | ||
2914 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
2915 | GFP_KERNEL); | ||
2916 | up_read(&slub_lock); | ||
2917 | break; | ||
2918 | |||
2768 | case CPU_UP_CANCELED: | 2919 | case CPU_UP_CANCELED: |
2769 | case CPU_UP_CANCELED_FROZEN: | 2920 | case CPU_UP_CANCELED_FROZEN: |
2770 | case CPU_DEAD: | 2921 | case CPU_DEAD: |
2771 | case CPU_DEAD_FROZEN: | 2922 | case CPU_DEAD_FROZEN: |
2772 | down_read(&slub_lock); | 2923 | down_read(&slub_lock); |
2773 | list_for_each_entry(s, &slab_caches, list) { | 2924 | list_for_each_entry(s, &slab_caches, list) { |
2925 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2926 | |||
2774 | local_irq_save(flags); | 2927 | local_irq_save(flags); |
2775 | __flush_cpu_slab(s, cpu); | 2928 | __flush_cpu_slab(s, cpu); |
2776 | local_irq_restore(flags); | 2929 | local_irq_restore(flags); |
2930 | free_kmem_cache_cpu(c, cpu); | ||
2931 | s->cpu_slab[cpu] = NULL; | ||
2777 | } | 2932 | } |
2778 | up_read(&slub_lock); | 2933 | up_read(&slub_lock); |
2779 | break; | 2934 | break; |
@@ -2790,9 +2945,14 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
2790 | 2945 | ||
2791 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2946 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
2792 | { | 2947 | { |
2793 | struct kmem_cache *s = get_slab(size, gfpflags); | 2948 | struct kmem_cache *s; |
2949 | |||
2950 | if (unlikely(size > PAGE_SIZE / 2)) | ||
2951 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
2952 | get_order(size)); | ||
2953 | s = get_slab(size, gfpflags); | ||
2794 | 2954 | ||
2795 | if (ZERO_OR_NULL_PTR(s)) | 2955 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
2796 | return s; | 2956 | return s; |
2797 | 2957 | ||
2798 | return slab_alloc(s, gfpflags, -1, caller); | 2958 | return slab_alloc(s, gfpflags, -1, caller); |
@@ -2801,9 +2961,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | |||
2801 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | 2961 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, |
2802 | int node, void *caller) | 2962 | int node, void *caller) |
2803 | { | 2963 | { |
2804 | struct kmem_cache *s = get_slab(size, gfpflags); | 2964 | struct kmem_cache *s; |
2965 | |||
2966 | if (unlikely(size > PAGE_SIZE / 2)) | ||
2967 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
2968 | get_order(size)); | ||
2969 | s = get_slab(size, gfpflags); | ||
2805 | 2970 | ||
2806 | if (ZERO_OR_NULL_PTR(s)) | 2971 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
2807 | return s; | 2972 | return s; |
2808 | 2973 | ||
2809 | return slab_alloc(s, gfpflags, node, caller); | 2974 | return slab_alloc(s, gfpflags, node, caller); |
@@ -2902,7 +3067,7 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
2902 | return -ENOMEM; | 3067 | return -ENOMEM; |
2903 | 3068 | ||
2904 | flush_all(s); | 3069 | flush_all(s); |
2905 | for_each_online_node(node) { | 3070 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2906 | struct kmem_cache_node *n = get_node(s, node); | 3071 | struct kmem_cache_node *n = get_node(s, node); |
2907 | 3072 | ||
2908 | count += validate_slab_node(s, n, map); | 3073 | count += validate_slab_node(s, n, map); |
@@ -3116,13 +3281,13 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3116 | int node; | 3281 | int node; |
3117 | 3282 | ||
3118 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 3283 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
3119 | GFP_KERNEL)) | 3284 | GFP_TEMPORARY)) |
3120 | return sprintf(buf, "Out of memory\n"); | 3285 | return sprintf(buf, "Out of memory\n"); |
3121 | 3286 | ||
3122 | /* Push back cpu slabs */ | 3287 | /* Push back cpu slabs */ |
3123 | flush_all(s); | 3288 | flush_all(s); |
3124 | 3289 | ||
3125 | for_each_online_node(node) { | 3290 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3126 | struct kmem_cache_node *n = get_node(s, node); | 3291 | struct kmem_cache_node *n = get_node(s, node); |
3127 | unsigned long flags; | 3292 | unsigned long flags; |
3128 | struct page *page; | 3293 | struct page *page; |
@@ -3230,11 +3395,18 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3230 | per_cpu = nodes + nr_node_ids; | 3395 | per_cpu = nodes + nr_node_ids; |
3231 | 3396 | ||
3232 | for_each_possible_cpu(cpu) { | 3397 | for_each_possible_cpu(cpu) { |
3233 | struct page *page = s->cpu_slab[cpu]; | 3398 | struct page *page; |
3234 | int node; | 3399 | int node; |
3400 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3235 | 3401 | ||
3402 | if (!c) | ||
3403 | continue; | ||
3404 | |||
3405 | page = c->page; | ||
3406 | node = c->node; | ||
3407 | if (node < 0) | ||
3408 | continue; | ||
3236 | if (page) { | 3409 | if (page) { |
3237 | node = page_to_nid(page); | ||
3238 | if (flags & SO_CPU) { | 3410 | if (flags & SO_CPU) { |
3239 | int x = 0; | 3411 | int x = 0; |
3240 | 3412 | ||
@@ -3249,7 +3421,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3249 | } | 3421 | } |
3250 | } | 3422 | } |
3251 | 3423 | ||
3252 | for_each_online_node(node) { | 3424 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3253 | struct kmem_cache_node *n = get_node(s, node); | 3425 | struct kmem_cache_node *n = get_node(s, node); |
3254 | 3426 | ||
3255 | if (flags & SO_PARTIAL) { | 3427 | if (flags & SO_PARTIAL) { |
@@ -3277,7 +3449,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
3277 | 3449 | ||
3278 | x = sprintf(buf, "%lu", total); | 3450 | x = sprintf(buf, "%lu", total); |
3279 | #ifdef CONFIG_NUMA | 3451 | #ifdef CONFIG_NUMA |
3280 | for_each_online_node(node) | 3452 | for_each_node_state(node, N_NORMAL_MEMORY) |
3281 | if (nodes[node]) | 3453 | if (nodes[node]) |
3282 | x += sprintf(buf + x, " N%d=%lu", | 3454 | x += sprintf(buf + x, " N%d=%lu", |
3283 | node, nodes[node]); | 3455 | node, nodes[node]); |
@@ -3291,13 +3463,19 @@ static int any_slab_objects(struct kmem_cache *s) | |||
3291 | int node; | 3463 | int node; |
3292 | int cpu; | 3464 | int cpu; |
3293 | 3465 | ||
3294 | for_each_possible_cpu(cpu) | 3466 | for_each_possible_cpu(cpu) { |
3295 | if (s->cpu_slab[cpu]) | 3467 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
3468 | |||
3469 | if (c && c->page) | ||
3296 | return 1; | 3470 | return 1; |
3471 | } | ||
3297 | 3472 | ||
3298 | for_each_node(node) { | 3473 | for_each_online_node(node) { |
3299 | struct kmem_cache_node *n = get_node(s, node); | 3474 | struct kmem_cache_node *n = get_node(s, node); |
3300 | 3475 | ||
3476 | if (!n) | ||
3477 | continue; | ||
3478 | |||
3301 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) | 3479 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) |
3302 | return 1; | 3480 | return 1; |
3303 | } | 3481 | } |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 000000000000..d3b718b0c20a --- /dev/null +++ b/mm/sparse-vmemmap.c | |||
@@ -0,0 +1,148 @@ | |||
1 | /* | ||
2 | * Virtual Memory Map support | ||
3 | * | ||
4 | * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. | ||
5 | * | ||
6 | * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, | ||
7 | * virt_to_page, page_address() to be implemented as a base offset | ||
8 | * calculation without memory access. | ||
9 | * | ||
10 | * However, virtual mappings need a page table and TLBs. Many Linux | ||
11 | * architectures already map their physical space using 1-1 mappings | ||
12 | * via TLBs. For those arches the virtual memory map is essentially | ||
13 | * for free if we use the same page size as the 1-1 mappings. In that | ||
14 | * case the overhead consists of a few additional pages that are | ||
15 | * allocated to create a view of memory for vmemmap. | ||
16 | * | ||
17 | * The architecture is expected to provide a vmemmap_populate() function | ||
18 | * to instantiate the mapping. | ||
19 | */ | ||
20 | #include <linux/mm.h> | ||
21 | #include <linux/mmzone.h> | ||
22 | #include <linux/bootmem.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/spinlock.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <asm/dma.h> | ||
28 | #include <asm/pgalloc.h> | ||
29 | #include <asm/pgtable.h> | ||
30 | |||
31 | /* | ||
32 | * Allocate a block of memory to be used to back the virtual memory map | ||
33 | * or to back the page tables that are used to create the mapping. | ||
34 | * Uses the main allocators if they are available, else bootmem. | ||
35 | */ | ||
36 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | ||
37 | { | ||
38 | /* If the main allocator is up use that, fallback to bootmem. */ | ||
39 | if (slab_is_available()) { | ||
40 | struct page *page = alloc_pages_node(node, | ||
41 | GFP_KERNEL | __GFP_ZERO, get_order(size)); | ||
42 | if (page) | ||
43 | return page_address(page); | ||
44 | return NULL; | ||
45 | } else | ||
46 | return __alloc_bootmem_node(NODE_DATA(node), size, size, | ||
47 | __pa(MAX_DMA_ADDRESS)); | ||
48 | } | ||
49 | |||
50 | void __meminit vmemmap_verify(pte_t *pte, int node, | ||
51 | unsigned long start, unsigned long end) | ||
52 | { | ||
53 | unsigned long pfn = pte_pfn(*pte); | ||
54 | int actual_node = early_pfn_to_nid(pfn); | ||
55 | |||
56 | if (actual_node != node) | ||
57 | printk(KERN_WARNING "[%lx-%lx] potential offnode " | ||
58 | "page_structs\n", start, end - 1); | ||
59 | } | ||
60 | |||
61 | pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | ||
62 | { | ||
63 | pte_t *pte = pte_offset_kernel(pmd, addr); | ||
64 | if (pte_none(*pte)) { | ||
65 | pte_t entry; | ||
66 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
67 | if (!p) | ||
68 | return 0; | ||
69 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | ||
70 | set_pte_at(&init_mm, addr, pte, entry); | ||
71 | } | ||
72 | return pte; | ||
73 | } | ||
74 | |||
75 | pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) | ||
76 | { | ||
77 | pmd_t *pmd = pmd_offset(pud, addr); | ||
78 | if (pmd_none(*pmd)) { | ||
79 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
80 | if (!p) | ||
81 | return 0; | ||
82 | pmd_populate_kernel(&init_mm, pmd, p); | ||
83 | } | ||
84 | return pmd; | ||
85 | } | ||
86 | |||
87 | pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) | ||
88 | { | ||
89 | pud_t *pud = pud_offset(pgd, addr); | ||
90 | if (pud_none(*pud)) { | ||
91 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
92 | if (!p) | ||
93 | return 0; | ||
94 | pud_populate(&init_mm, pud, p); | ||
95 | } | ||
96 | return pud; | ||
97 | } | ||
98 | |||
99 | pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) | ||
100 | { | ||
101 | pgd_t *pgd = pgd_offset_k(addr); | ||
102 | if (pgd_none(*pgd)) { | ||
103 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
104 | if (!p) | ||
105 | return 0; | ||
106 | pgd_populate(&init_mm, pgd, p); | ||
107 | } | ||
108 | return pgd; | ||
109 | } | ||
110 | |||
111 | int __meminit vmemmap_populate_basepages(struct page *start_page, | ||
112 | unsigned long size, int node) | ||
113 | { | ||
114 | unsigned long addr = (unsigned long)start_page; | ||
115 | unsigned long end = (unsigned long)(start_page + size); | ||
116 | pgd_t *pgd; | ||
117 | pud_t *pud; | ||
118 | pmd_t *pmd; | ||
119 | pte_t *pte; | ||
120 | |||
121 | for (; addr < end; addr += PAGE_SIZE) { | ||
122 | pgd = vmemmap_pgd_populate(addr, node); | ||
123 | if (!pgd) | ||
124 | return -ENOMEM; | ||
125 | pud = vmemmap_pud_populate(pgd, addr, node); | ||
126 | if (!pud) | ||
127 | return -ENOMEM; | ||
128 | pmd = vmemmap_pmd_populate(pud, addr, node); | ||
129 | if (!pmd) | ||
130 | return -ENOMEM; | ||
131 | pte = vmemmap_pte_populate(pmd, addr, node); | ||
132 | if (!pte) | ||
133 | return -ENOMEM; | ||
134 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); | ||
135 | } | ||
136 | |||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | ||
141 | { | ||
142 | struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); | ||
143 | int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); | ||
144 | if (error) | ||
145 | return NULL; | ||
146 | |||
147 | return map; | ||
148 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 239f5a720d38..08fb14f5eea3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
10 | #include <linux/vmalloc.h> | 10 | #include <linux/vmalloc.h> |
11 | #include <asm/dma.h> | 11 | #include <asm/dma.h> |
12 | #include <asm/pgalloc.h> | ||
13 | #include <asm/pgtable.h> | ||
12 | 14 | ||
13 | /* | 15 | /* |
14 | * Permanent SPARSEMEM data: | 16 | * Permanent SPARSEMEM data: |
@@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
106 | 108 | ||
107 | /* | 109 | /* |
108 | * Although written for the SPARSEMEM_EXTREME case, this happens | 110 | * Although written for the SPARSEMEM_EXTREME case, this happens |
109 | * to also work for the flat array case becase | 111 | * to also work for the flat array case because |
110 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | 112 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. |
111 | */ | 113 | */ |
112 | int __section_nr(struct mem_section* ms) | 114 | int __section_nr(struct mem_section* ms) |
@@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
176 | if (nid != early_pfn_to_nid(pfn)) | 178 | if (nid != early_pfn_to_nid(pfn)) |
177 | continue; | 179 | continue; |
178 | 180 | ||
179 | if (pfn_valid(pfn)) | 181 | if (pfn_present(pfn)) |
180 | nr_pages += PAGES_PER_SECTION; | 182 | nr_pages += PAGES_PER_SECTION; |
181 | } | 183 | } |
182 | 184 | ||
@@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn | |||
204 | } | 206 | } |
205 | 207 | ||
206 | static int __meminit sparse_init_one_section(struct mem_section *ms, | 208 | static int __meminit sparse_init_one_section(struct mem_section *ms, |
207 | unsigned long pnum, struct page *mem_map) | 209 | unsigned long pnum, struct page *mem_map, |
210 | unsigned long *pageblock_bitmap) | ||
208 | { | 211 | { |
209 | if (!valid_section(ms)) | 212 | if (!present_section(ms)) |
210 | return -EINVAL; | 213 | return -EINVAL; |
211 | 214 | ||
212 | ms->section_mem_map &= ~SECTION_MAP_MASK; | 215 | ms->section_mem_map &= ~SECTION_MAP_MASK; |
213 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 216 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | |
217 | SECTION_HAS_MEM_MAP; | ||
218 | ms->pageblock_flags = pageblock_bitmap; | ||
214 | 219 | ||
215 | return 1; | 220 | return 1; |
216 | } | 221 | } |
@@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | |||
221 | return NULL; | 226 | return NULL; |
222 | } | 227 | } |
223 | 228 | ||
224 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 229 | static unsigned long usemap_size(void) |
225 | { | 230 | { |
226 | struct page *map; | 231 | unsigned long size_bytes; |
232 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; | ||
233 | size_bytes = roundup(size_bytes, sizeof(unsigned long)); | ||
234 | return size_bytes; | ||
235 | } | ||
236 | |||
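
usemap_size() simply rounds the per-section pageblock flag bits up to whole bytes and then to a whole number of longs. A small standalone version with illustrative numbers (64 pageblocks per section, 3 flag bits each; the real SECTION_BLOCKFLAGS_BITS is architecture dependent):

#include <stdio.h>

#define SECTION_BLOCKFLAGS_BITS (64 * 3)	/* illustrative value */

static unsigned long roundup_(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_(void)
{
	unsigned long size_bytes;

	size_bytes = roundup_(SECTION_BLOCKFLAGS_BITS, 8) / 8;	  /* bits -> bytes */
	size_bytes = roundup_(size_bytes, sizeof(unsigned long)); /* pad to longs */
	return size_bytes;
}

int main(void)
{
	/* 192 bits -> 24 bytes -> already a multiple of sizeof(long): 24. */
	printf("usemap per section: %lu bytes\n", usemap_size_());
	return 0;
}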
237 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
238 | static unsigned long *__kmalloc_section_usemap(void) | ||
239 | { | ||
240 | return kmalloc(usemap_size(), GFP_KERNEL); | ||
241 | } | ||
242 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
243 | |||
244 | static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) | ||
245 | { | ||
246 | unsigned long *usemap; | ||
227 | struct mem_section *ms = __nr_to_section(pnum); | 247 | struct mem_section *ms = __nr_to_section(pnum); |
228 | int nid = sparse_early_nid(ms); | 248 | int nid = sparse_early_nid(ms); |
229 | 249 | ||
250 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | ||
251 | if (usemap) | ||
252 | return usemap; | ||
253 | |||
254 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | ||
255 | nid = 0; | ||
256 | |||
257 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | ||
258 | return NULL; | ||
259 | } | ||
260 | |||
261 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
262 | struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | ||
263 | { | ||
264 | struct page *map; | ||
265 | |||
230 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 266 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
231 | if (map) | 267 | if (map) |
232 | return map; | 268 | return map; |
@@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
238 | 274 | ||
239 | map = alloc_bootmem_node(NODE_DATA(nid), | 275 | map = alloc_bootmem_node(NODE_DATA(nid), |
240 | sizeof(struct page) * PAGES_PER_SECTION); | 276 | sizeof(struct page) * PAGES_PER_SECTION); |
277 | return map; | ||
278 | } | ||
279 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
280 | |||
281 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | ||
282 | { | ||
283 | struct page *map; | ||
284 | struct mem_section *ms = __nr_to_section(pnum); | ||
285 | int nid = sparse_early_nid(ms); | ||
286 | |||
287 | map = sparse_mem_map_populate(pnum, nid); | ||
241 | if (map) | 288 | if (map) |
242 | return map; | 289 | return map; |
243 | 290 | ||
244 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | 291 | printk(KERN_ERR "%s: sparsemem memory map backing failed " |
292 | "some memory will not be available.\n", __FUNCTION__); | ||
245 | ms->section_mem_map = 0; | 293 | ms->section_mem_map = 0; |
246 | return NULL; | 294 | return NULL; |
247 | } | 295 | } |
@@ -254,19 +302,38 @@ void __init sparse_init(void) | |||
254 | { | 302 | { |
255 | unsigned long pnum; | 303 | unsigned long pnum; |
256 | struct page *map; | 304 | struct page *map; |
305 | unsigned long *usemap; | ||
257 | 306 | ||
258 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 307 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
259 | if (!valid_section_nr(pnum)) | 308 | if (!present_section_nr(pnum)) |
260 | continue; | 309 | continue; |
261 | 310 | ||
262 | map = sparse_early_mem_map_alloc(pnum); | 311 | map = sparse_early_mem_map_alloc(pnum); |
263 | if (!map) | 312 | if (!map) |
264 | continue; | 313 | continue; |
265 | sparse_init_one_section(__nr_to_section(pnum), pnum, map); | 314 | |
315 | usemap = sparse_early_usemap_alloc(pnum); | ||
316 | if (!usemap) | ||
317 | continue; | ||
318 | |||
319 | sparse_init_one_section(__nr_to_section(pnum), pnum, map, | ||
320 | usemap); | ||
266 | } | 321 | } |
267 | } | 322 | } |
268 | 323 | ||
269 | #ifdef CONFIG_MEMORY_HOTPLUG | 324 | #ifdef CONFIG_MEMORY_HOTPLUG |
325 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
326 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
327 | unsigned long nr_pages) | ||
328 | { | ||
329 | /* This will make the necessary allocations eventually. */ | ||
330 | return sparse_mem_map_populate(pnum, nid); | ||
331 | } | ||
332 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | ||
333 | { | ||
334 | return; /* XXX: Not implemented yet */ | ||
335 | } | ||
336 | #else | ||
270 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 337 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
271 | { | 338 | { |
272 | struct page *page, *ret; | 339 | struct page *page, *ret; |
@@ -289,6 +356,12 @@ got_map_ptr: | |||
289 | return ret; | 356 | return ret; |
290 | } | 357 | } |
291 | 358 | ||
359 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
360 | unsigned long nr_pages) | ||
361 | { | ||
362 | return __kmalloc_section_memmap(nr_pages); | ||
363 | } | ||
364 | |||
292 | static int vaddr_in_vmalloc_area(void *addr) | 365 | static int vaddr_in_vmalloc_area(void *addr) |
293 | { | 366 | { |
294 | if (addr >= (void *)VMALLOC_START && | 367 | if (addr >= (void *)VMALLOC_START && |
@@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
305 | free_pages((unsigned long)memmap, | 378 | free_pages((unsigned long)memmap, |
306 | get_order(sizeof(struct page) * nr_pages)); | 379 | get_order(sizeof(struct page) * nr_pages)); |
307 | } | 380 | } |
381 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
308 | 382 | ||
309 | /* | 383 | /* |
310 | * returns the number of sections whose mem_maps were properly | 384 | * returns the number of sections whose mem_maps were properly |
@@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
318 | struct pglist_data *pgdat = zone->zone_pgdat; | 392 | struct pglist_data *pgdat = zone->zone_pgdat; |
319 | struct mem_section *ms; | 393 | struct mem_section *ms; |
320 | struct page *memmap; | 394 | struct page *memmap; |
395 | unsigned long *usemap; | ||
321 | unsigned long flags; | 396 | unsigned long flags; |
322 | int ret; | 397 | int ret; |
323 | 398 | ||
@@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
326 | * plus, it does a kmalloc | 401 | * plus, it does a kmalloc |
327 | */ | 402 | */ |
328 | sparse_index_init(section_nr, pgdat->node_id); | 403 | sparse_index_init(section_nr, pgdat->node_id); |
329 | memmap = __kmalloc_section_memmap(nr_pages); | 404 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); |
405 | usemap = __kmalloc_section_usemap(); | ||
330 | 406 | ||
331 | pgdat_resize_lock(pgdat, &flags); | 407 | pgdat_resize_lock(pgdat, &flags); |
332 | 408 | ||
@@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
335 | ret = -EEXIST; | 411 | ret = -EEXIST; |
336 | goto out; | 412 | goto out; |
337 | } | 413 | } |
414 | |||
415 | if (!usemap) { | ||
416 | ret = -ENOMEM; | ||
417 | goto out; | ||
418 | } | ||
338 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 419 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
339 | 420 | ||
340 | ret = sparse_init_one_section(ms, section_nr, memmap); | 421 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
341 | 422 | ||
342 | out: | 423 | out: |
343 | pgdat_resize_unlock(pgdat, &flags); | 424 | pgdat_resize_unlock(pgdat, &flags); |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -24,16 +24,19 @@ | |||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ |
27 | #include <linux/module.h> | ||
28 | #include <linux/percpu_counter.h> | 27 | #include <linux/percpu_counter.h> |
29 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
30 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
31 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
32 | #include <linux/init.h> | 31 | #include <linux/backing-dev.h> |
33 | 32 | ||
34 | /* How many pages do we try to swap or page in/out together? */ | 33 | /* How many pages do we try to swap or page in/out together? */ |
35 | int page_cluster; | 34 | int page_cluster; |
36 | 35 | ||
36 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
37 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
38 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | ||
39 | |||
37 | /* | 40 | /* |
38 | * This path almost never happens for VM activity - pages are normally | 41 | * This path almost never happens for VM activity - pages are normally |
39 | * freed via pagevecs. But it gets used by networking. | 42 | * freed via pagevecs. But it gets used by networking. |
@@ -94,23 +97,47 @@ void put_pages_list(struct list_head *pages) | |||
94 | EXPORT_SYMBOL(put_pages_list); | 97 | EXPORT_SYMBOL(put_pages_list); |
95 | 98 | ||
96 | /* | 99 | /* |
100 | * pagevec_move_tail() must be called with IRQ disabled. | ||
101 | * Otherwise this may cause nasty races. | ||
102 | */ | ||
103 | static void pagevec_move_tail(struct pagevec *pvec) | ||
104 | { | ||
105 | int i; | ||
106 | int pgmoved = 0; | ||
107 | struct zone *zone = NULL; | ||
108 | |||
109 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
110 | struct page *page = pvec->pages[i]; | ||
111 | struct zone *pagezone = page_zone(page); | ||
112 | |||
113 | if (pagezone != zone) { | ||
114 | if (zone) | ||
115 | spin_unlock(&zone->lru_lock); | ||
116 | zone = pagezone; | ||
117 | spin_lock(&zone->lru_lock); | ||
118 | } | ||
119 | if (PageLRU(page) && !PageActive(page)) { | ||
120 | list_move_tail(&page->lru, &zone->inactive_list); | ||
121 | pgmoved++; | ||
122 | } | ||
123 | } | ||
124 | if (zone) | ||
125 | spin_unlock(&zone->lru_lock); | ||
126 | __count_vm_events(PGROTATED, pgmoved); | ||
127 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
128 | pagevec_reinit(pvec); | ||
129 | } | ||
130 | |||
131 | /* | ||
97 | * Writeback is about to end against a page which has been marked for immediate | 132 | * Writeback is about to end against a page which has been marked for immediate |
98 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 133 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
99 | * inactive list. The page still has PageWriteback set, which will pin it. | 134 | * inactive list. |
100 | * | ||
101 | * We don't expect many pages to come through here, so don't bother batching | ||
102 | * things up. | ||
103 | * | ||
104 | * To avoid placing the page at the tail of the LRU while PG_writeback is still | ||
105 | * set, this function will clear PG_writeback before performing the page | ||
106 | * motion. Do that inside the lru lock because once PG_writeback is cleared | ||
107 | * we may not touch the page. | ||
108 | * | 135 | * |
109 | * Returns zero if it cleared PG_writeback. | 136 | * Returns zero if it cleared PG_writeback. |
110 | */ | 137 | */ |
111 | int rotate_reclaimable_page(struct page *page) | 138 | int rotate_reclaimable_page(struct page *page) |
112 | { | 139 | { |
113 | struct zone *zone; | 140 | struct pagevec *pvec; |
114 | unsigned long flags; | 141 | unsigned long flags; |
115 | 142 | ||
116 | if (PageLocked(page)) | 143 | if (PageLocked(page)) |
@@ -122,15 +149,16 @@ int rotate_reclaimable_page(struct page *page) | |||
122 | if (!PageLRU(page)) | 149 | if (!PageLRU(page)) |
123 | return 1; | 150 | return 1; |
124 | 151 | ||
125 | zone = page_zone(page); | 152 | page_cache_get(page); |
126 | spin_lock_irqsave(&zone->lru_lock, flags); | 153 | local_irq_save(flags); |
127 | if (PageLRU(page) && !PageActive(page)) { | 154 | pvec = &__get_cpu_var(lru_rotate_pvecs); |
128 | list_move_tail(&page->lru, &zone->inactive_list); | 155 | if (!pagevec_add(pvec, page)) |
129 | __count_vm_event(PGROTATED); | 156 | pagevec_move_tail(pvec); |
130 | } | 157 | local_irq_restore(flags); |
158 | |||
131 | if (!test_clear_page_writeback(page)) | 159 | if (!test_clear_page_writeback(page)) |
132 | BUG(); | 160 | BUG(); |
133 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 161 | |
134 | return 0; | 162 | return 0; |
135 | } | 163 | } |
136 | 164 | ||
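
The rewrite above stops taking zone->lru_lock once per rotated page: rotate_reclaimable_page() now only disables interrupts, parks the page in a per-cpu pagevec, and the lock is taken once per batch when pagevec_move_tail() flushes it. A userspace sketch of why that batching pays off (BATCH and the counters are illustrative):

#include <stdio.h>

#define BATCH 14	/* same spirit as the kernel's pagevec capacity */

struct pagevec {
	int nr;
	int pages[BATCH];
};

static int lock_acquisitions;	/* how often the expensive lock is taken */

/*
 * Flush the batch: in the kernel this is where zone->lru_lock is taken
 * once and every page in the vector is moved to the inactive tail.
 */
static void pagevec_flush(struct pagevec *pv)
{
	lock_acquisitions++;
	pv->nr = 0;
}

/* Returns the space left in the vector; 0 means the caller must flush. */
static int pagevec_add(struct pagevec *pv, int page)
{
	pv->pages[pv->nr++] = page;
	return BATCH - pv->nr;
}

int main(void)
{
	struct pagevec pv = { .nr = 0 };
	int i;

	for (i = 0; i < 100; i++)
		if (!pagevec_add(&pv, i))
			pagevec_flush(&pv);

	/* 100 pages in batches of 14: 7 lock acquisitions instead of 100. */
	printf("lock taken %d times for 100 pages\n", lock_acquisitions);
	return 0;
}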
@@ -174,9 +202,6 @@ EXPORT_SYMBOL(mark_page_accessed); | |||
174 | * lru_cache_add: add a page to the page lists | 202 | * lru_cache_add: add a page to the page lists |
175 | * @page: the page to add | 203 | * @page: the page to add |
176 | */ | 204 | */ |
177 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
178 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
179 | |||
180 | void fastcall lru_cache_add(struct page *page) | 205 | void fastcall lru_cache_add(struct page *page) |
181 | { | 206 | { |
182 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 207 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); |
@@ -197,21 +222,37 @@ void fastcall lru_cache_add_active(struct page *page) | |||
197 | put_cpu_var(lru_add_active_pvecs); | 222 | put_cpu_var(lru_add_active_pvecs); |
198 | } | 223 | } |
199 | 224 | ||
200 | static void __lru_add_drain(int cpu) | 225 | /* |
226 | * Drain pages out of the cpu's pagevecs. | ||
227 | * Either "cpu" is the current CPU, and preemption has already been | ||
228 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | ||
229 | */ | ||
230 | static void drain_cpu_pagevecs(int cpu) | ||
201 | { | 231 | { |
202 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | 232 | struct pagevec *pvec; |
203 | 233 | ||
204 | /* CPU is dead, so no locking needed. */ | 234 | pvec = &per_cpu(lru_add_pvecs, cpu); |
205 | if (pagevec_count(pvec)) | 235 | if (pagevec_count(pvec)) |
206 | __pagevec_lru_add(pvec); | 236 | __pagevec_lru_add(pvec); |
237 | |||
207 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | 238 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
208 | if (pagevec_count(pvec)) | 239 | if (pagevec_count(pvec)) |
209 | __pagevec_lru_add_active(pvec); | 240 | __pagevec_lru_add_active(pvec); |
241 | |||
242 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | ||
243 | if (pagevec_count(pvec)) { | ||
244 | unsigned long flags; | ||
245 | |||
246 | /* No harm done if a racing interrupt already did this */ | ||
247 | local_irq_save(flags); | ||
248 | pagevec_move_tail(pvec); | ||
249 | local_irq_restore(flags); | ||
250 | } | ||
210 | } | 251 | } |
211 | 252 | ||
212 | void lru_add_drain(void) | 253 | void lru_add_drain(void) |
213 | { | 254 | { |
214 | __lru_add_drain(get_cpu()); | 255 | drain_cpu_pagevecs(get_cpu()); |
215 | put_cpu(); | 256 | put_cpu(); |
216 | } | 257 | } |
217 | 258 | ||
@@ -258,6 +299,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
258 | int i; | 299 | int i; |
259 | struct pagevec pages_to_free; | 300 | struct pagevec pages_to_free; |
260 | struct zone *zone = NULL; | 301 | struct zone *zone = NULL; |
302 | unsigned long uninitialized_var(flags); | ||
261 | 303 | ||
262 | pagevec_init(&pages_to_free, cold); | 304 | pagevec_init(&pages_to_free, cold); |
263 | for (i = 0; i < nr; i++) { | 305 | for (i = 0; i < nr; i++) { |
@@ -265,7 +307,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
265 | 307 | ||
266 | if (unlikely(PageCompound(page))) { | 308 | if (unlikely(PageCompound(page))) { |
267 | if (zone) { | 309 | if (zone) { |
268 | spin_unlock_irq(&zone->lru_lock); | 310 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
269 | zone = NULL; | 311 | zone = NULL; |
270 | } | 312 | } |
271 | put_compound_page(page); | 313 | put_compound_page(page); |
@@ -279,9 +321,10 @@ void release_pages(struct page **pages, int nr, int cold) | |||
279 | struct zone *pagezone = page_zone(page); | 321 | struct zone *pagezone = page_zone(page); |
280 | if (pagezone != zone) { | 322 | if (pagezone != zone) { |
281 | if (zone) | 323 | if (zone) |
282 | spin_unlock_irq(&zone->lru_lock); | 324 | spin_unlock_irqrestore(&zone->lru_lock, |
325 | flags); | ||
283 | zone = pagezone; | 326 | zone = pagezone; |
284 | spin_lock_irq(&zone->lru_lock); | 327 | spin_lock_irqsave(&zone->lru_lock, flags); |
285 | } | 328 | } |
286 | VM_BUG_ON(!PageLRU(page)); | 329 | VM_BUG_ON(!PageLRU(page)); |
287 | __ClearPageLRU(page); | 330 | __ClearPageLRU(page); |
@@ -290,7 +333,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
290 | 333 | ||
291 | if (!pagevec_add(&pages_to_free, page)) { | 334 | if (!pagevec_add(&pages_to_free, page)) { |
292 | if (zone) { | 335 | if (zone) { |
293 | spin_unlock_irq(&zone->lru_lock); | 336 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
294 | zone = NULL; | 337 | zone = NULL; |
295 | } | 338 | } |
296 | __pagevec_free(&pages_to_free); | 339 | __pagevec_free(&pages_to_free); |
@@ -298,7 +341,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
298 | } | 341 | } |
299 | } | 342 | } |
300 | if (zone) | 343 | if (zone) |
301 | spin_unlock_irq(&zone->lru_lock); | 344 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
302 | 345 | ||
303 | pagevec_free(&pages_to_free); | 346 | pagevec_free(&pages_to_free); |
304 | } | 347 | } |
@@ -491,7 +534,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
491 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 534 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
492 | atomic_add(*committed, &vm_committed_space); | 535 | atomic_add(*committed, &vm_committed_space); |
493 | *committed = 0; | 536 | *committed = 0; |
494 | __lru_add_drain((long)hcpu); | 537 | drain_cpu_pagevecs((long)hcpu); |
495 | } | 538 | } |
496 | return NOTIFY_OK; | 539 | return NOTIFY_OK; |
497 | } | 540 | } |
@@ -505,6 +548,10 @@ void __init swap_setup(void) | |||
505 | { | 548 | { |
506 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); | 549 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); |
507 | 550 | ||
551 | #ifdef CONFIG_SWAP | ||
552 | bdi_init(swapper_space.backing_dev_info); | ||
553 | #endif | ||
554 | |||
508 | /* Use a smaller cluster for small-memory machines */ | 555 | /* Use a smaller cluster for small-memory machines */ |
509 | if (megs < 16) | 556 | if (megs < 16) |
510 | page_cluster = 2; | 557 | page_cluster = 2; |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 67daecb6031a..b52635601dfe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
74 | { | 74 | { |
75 | int error; | 75 | int error; |
76 | 76 | ||
77 | BUG_ON(!PageLocked(page)); | ||
77 | BUG_ON(PageSwapCache(page)); | 78 | BUG_ON(PageSwapCache(page)); |
78 | BUG_ON(PagePrivate(page)); | 79 | BUG_ON(PagePrivate(page)); |
79 | error = radix_tree_preload(gfp_mask); | 80 | error = radix_tree_preload(gfp_mask); |
@@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
83 | entry.val, page); | 84 | entry.val, page); |
84 | if (!error) { | 85 | if (!error) { |
85 | page_cache_get(page); | 86 | page_cache_get(page); |
86 | SetPageLocked(page); | ||
87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); |
88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); |
89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
@@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
99 | { | 99 | { |
100 | int error; | 100 | int error; |
101 | 101 | ||
102 | BUG_ON(PageLocked(page)); | ||
102 | if (!swap_duplicate(entry)) { | 103 | if (!swap_duplicate(entry)) { |
103 | INC_CACHE_INFO(noent_race); | 104 | INC_CACHE_INFO(noent_race); |
104 | return -ENOENT; | 105 | return -ENOENT; |
105 | } | 106 | } |
107 | SetPageLocked(page); | ||
106 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); | 108 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); |
107 | /* | 109 | /* |
108 | * Anon pages are already on the LRU, we don't run lru_cache_add here. | 110 | * Anon pages are already on the LRU, we don't run lru_cache_add here. |
109 | */ | 111 | */ |
110 | if (error) { | 112 | if (error) { |
113 | ClearPageLocked(page); | ||
111 | swap_free(entry); | 114 | swap_free(entry); |
112 | if (error == -EEXIST) | 115 | if (error == -EEXIST) |
113 | INC_CACHE_INFO(exist_race); | 116 | INC_CACHE_INFO(exist_race); |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 8803471593fd..d436a9c82db7 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
66 | if (!dentry) | 66 | if (!dentry) |
67 | goto put_memory; | 67 | goto put_memory; |
68 | 68 | ||
69 | error = -ENFILE; | ||
70 | file = get_empty_filp(); | ||
71 | if (!file) | ||
72 | goto put_dentry; | ||
73 | |||
74 | error = -ENOSPC; | 69 | error = -ENOSPC; |
75 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | 70 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); |
76 | if (!inode) | 71 | if (!inode) |
77 | goto close_file; | 72 | goto put_dentry; |
78 | 73 | ||
79 | d_instantiate(dentry, inode); | 74 | d_instantiate(dentry, inode); |
80 | inode->i_nlink = 0; /* It is unlinked */ | 75 | error = -ENFILE; |
76 | file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
77 | &ramfs_file_operations); | ||
78 | if (!file) | ||
79 | goto put_dentry; | ||
81 | 80 | ||
82 | file->f_path.mnt = mntget(shm_mnt); | 81 | inode->i_nlink = 0; /* It is unlinked */ |
83 | file->f_path.dentry = dentry; | ||
84 | file->f_mapping = inode->i_mapping; | ||
85 | file->f_op = &ramfs_file_operations; | ||
86 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
87 | 82 | ||
88 | /* notify everyone as to the change of file size */ | 83 | /* notify everyone as to the change of file size */ |
89 | error = do_truncate(dentry, size, 0, file); | 84 | error = do_truncate(dentry, size, 0, file); |
diff --git a/mm/truncate.c b/mm/truncate.c index 5cdfbc1a59fd..cadc15653dde 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/backing-dev.h> | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) | |||
72 | struct address_space *mapping = page->mapping; | 73 | struct address_space *mapping = page->mapping; |
73 | if (mapping && mapping_cap_account_dirty(mapping)) { | 74 | if (mapping && mapping_cap_account_dirty(mapping)) { |
74 | dec_zone_page_state(page, NR_FILE_DIRTY); | 75 | dec_zone_page_state(page, NR_FILE_DIRTY); |
76 | dec_bdi_stat(mapping->backing_dev_info, | ||
77 | BDI_RECLAIMABLE); | ||
75 | if (account_size) | 78 | if (account_size) |
76 | task_io_account_cancelled_write(account_size); | 79 | task_io_account_cancelled_write(account_size); |
77 | } | 80 | } |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup); | |||
81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
82 | { | 82 | { |
83 | void *ret; | 83 | void *ret; |
84 | size_t ks; | 84 | size_t ks = 0; |
85 | 85 | ||
86 | if (unlikely(!new_size)) { | 86 | if (unlikely(!new_size)) { |
87 | kfree(p); | 87 | kfree(p); |
88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
89 | } | 89 | } |
90 | 90 | ||
91 | ks = ksize(p); | 91 | if (p) |
92 | ks = ksize(p); | ||
93 | |||
92 | if (ks >= new_size) | 94 | if (ks >= new_size) |
93 | return (void *)p; | 95 | return (void *)p; |
94 | 96 | ||
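The krealloc() hunk above makes a NULL old pointer safe: ks starts at 0 and ksize() is only called when p is non-NULL, so krealloc(NULL, size, flags) falls through to a fresh allocation instead of consulting the size of a nonexistent object. A standalone userspace analogue of that guard (the function name, the explicit old_size parameter and the malloc()-based backing are illustrative, not the kernel implementation):

    #include <stdlib.h>
    #include <string.h>

    /* Userspace sketch of the NULL-tolerant resize logic added above:
     * a NULL old pointer reports a current size of 0, so the code always
     * falls through to allocating a new buffer. old_size stands in for
     * what the kernel obtains from ksize(p). */
    static void *example_krealloc(void *p, size_t old_size, size_t new_size)
    {
        void *ret;
        size_t ks = 0;

        if (p)
            ks = old_size;          /* stands in for ksize(p) */
        if (ks >= new_size)
            return p;               /* existing buffer is already big enough */

        ret = malloc(new_size);
        if (ret && p) {
            memcpy(ret, p, ks);     /* preserve the old contents */
            free(p);
        }
        return ret;
    }

    int main(void)
    {
        char *buf = example_krealloc(NULL, 0, 16);  /* behaves like malloc(16) */
        buf = example_krealloc(buf, 16, 8);         /* big enough: returns buf unchanged */
        free(buf);
        return 0;
    }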
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3cee76a8c9f0..2e01af365848 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl | |||
190 | if (unlikely(!size)) | 190 | if (unlikely(!size)) |
191 | return NULL; | 191 | return NULL; |
192 | 192 | ||
193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); | 193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
194 | |||
194 | if (unlikely(!area)) | 195 | if (unlikely(!area)) |
195 | return NULL; | 196 | return NULL; |
196 | 197 | ||
@@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
439 | area->flags |= VM_VPAGES; | 440 | area->flags |= VM_VPAGES; |
440 | } else { | 441 | } else { |
441 | pages = kmalloc_node(array_size, | 442 | pages = kmalloc_node(array_size, |
442 | (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, | 443 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, |
443 | node); | 444 | node); |
444 | } | 445 | } |
445 | area->pages = pages; | 446 | area->pages = pages; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index a6e65d024995..e1471385d001 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
932 | long mapped_ratio; | 932 | long mapped_ratio; |
933 | long distress; | 933 | long distress; |
934 | long swap_tendency; | 934 | long swap_tendency; |
935 | long imbalance; | ||
935 | 936 | ||
936 | if (zone_is_near_oom(zone)) | 937 | if (zone_is_near_oom(zone)) |
937 | goto force_reclaim_mapped; | 938 | goto force_reclaim_mapped; |
@@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
967 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | 968 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
968 | 969 | ||
969 | /* | 970 | /* |
971 | * If there's a huge imbalance between active and inactive | ||
972 | * (think active 100 times larger than inactive) we should | ||
973 | * become more permissive, or the system will take too much | ||
974 | * cpu before it starts swapping during memory pressure. | ||
975 | * Distress is about avoiding early-oom; this is about | ||
976 | * making swappiness graceful despite setting it to low | ||
977 | * values. | ||
978 | * | ||
979 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
980 | * value is vm_total_pages. | ||
981 | */ | ||
982 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
983 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
984 | |||
985 | /* | ||
986 | * Reduce the effect of imbalance if swappiness is low; | ||
987 | * this means that for a very low swappiness, the imbalance | ||
988 | * must be much higher than 100 for this logic to make | ||
989 | * a difference. | ||
990 | * | ||
991 | * Max temporary value is vm_total_pages*100. | ||
992 | */ | ||
993 | imbalance *= (vm_swappiness + 1); | ||
994 | imbalance /= 100; | ||
995 | |||
996 | /* | ||
997 | * If not much of the ram is mapped, make the imbalance | ||
998 | * less relevant; refilling the inactive list with mapped | ||
999 | * pages is only a high priority when a high ratio of | ||
1000 | * pages is mapped. | ||
1001 | * | ||
1002 | * Max temporary value is vm_total_pages*100. | ||
1003 | */ | ||
1004 | imbalance *= mapped_ratio; | ||
1005 | imbalance /= 100; | ||
1006 | |||
1007 | /* apply imbalance feedback to swap_tendency */ | ||
1008 | swap_tendency += imbalance; | ||
1009 | |||
1010 | /* | ||
970 | * Now use this metric to decide whether to start moving mapped | 1011 | * Now use this metric to decide whether to start moving mapped |
971 | * memory onto the inactive list. | 1012 | * memory onto the inactive list. |
972 | */ | 1013 | */ |
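The block added above is pure integer arithmetic, so a worked example may be clearer than the comments alone. The standalone sketch below plugs in illustrative numbers (200,000 active vs 1,000 inactive pages, swappiness 10, mapped_ratio 50, distress 0; none of these values come from the patch) and repeats the three scaling steps in the same order:

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative zone state, not taken from the patch. */
        long nr_active = 200000, nr_inactive = 1000;
        long swappiness = 10, mapped_ratio = 50, distress = 0;

        /* Baseline tendency, as computed just before the new block. */
        long swap_tendency = mapped_ratio / 2 + distress + swappiness;

        /* The three scaling steps added by the hunk above. */
        long imbalance = nr_active / (nr_inactive + 1);  /* ~199 */
        imbalance = imbalance * (swappiness + 1) / 100;  /* damped by low swappiness */
        imbalance = imbalance * mapped_ratio / 100;      /* damped if little is mapped */

        swap_tendency += imbalance;
        printf("imbalance=%ld swap_tendency=%ld\n", imbalance, swap_tendency);
        return 0;
    }

With these inputs the raw active/inactive ratio of about 199 is damped down to an extra tendency of 10, which is the graceful low-swappiness behaviour the comments describe.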
@@ -1067,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1067 | unsigned long nr_to_scan; | 1108 | unsigned long nr_to_scan; |
1068 | unsigned long nr_reclaimed = 0; | 1109 | unsigned long nr_reclaimed = 0; |
1069 | 1110 | ||
1070 | atomic_inc(&zone->reclaim_in_progress); | ||
1071 | |||
1072 | /* | 1111 | /* |
1073 | * Add one to `nr_to_scan' just to make sure that the kernel will | 1112 | * Add one to `nr_to_scan' just to make sure that the kernel will |
1074 | * slowly sift through the active list. | 1113 | * slowly sift through the active list. |
@@ -1107,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1107 | } | 1146 | } |
1108 | 1147 | ||
1109 | throttle_vm_writeout(sc->gfp_mask); | 1148 | throttle_vm_writeout(sc->gfp_mask); |
1110 | |||
1111 | atomic_dec(&zone->reclaim_in_progress); | ||
1112 | return nr_reclaimed; | 1149 | return nr_reclaimed; |
1113 | } | 1150 | } |
1114 | 1151 | ||
@@ -1146,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1146 | 1183 | ||
1147 | note_zone_scanning_priority(zone, priority); | 1184 | note_zone_scanning_priority(zone, priority); |
1148 | 1185 | ||
1149 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1186 | if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) |
1150 | continue; /* Let kswapd poll it */ | 1187 | continue; /* Let kswapd poll it */ |
1151 | 1188 | ||
1152 | sc->all_unreclaimable = 0; | 1189 | sc->all_unreclaimable = 0; |
@@ -1327,7 +1364,8 @@ loop_again: | |||
1327 | if (!populated_zone(zone)) | 1364 | if (!populated_zone(zone)) |
1328 | continue; | 1365 | continue; |
1329 | 1366 | ||
1330 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1367 | if (zone_is_all_unreclaimable(zone) && |
1368 | priority != DEF_PRIORITY) | ||
1331 | continue; | 1369 | continue; |
1332 | 1370 | ||
1333 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1371 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
@@ -1362,7 +1400,8 @@ loop_again: | |||
1362 | if (!populated_zone(zone)) | 1400 | if (!populated_zone(zone)) |
1363 | continue; | 1401 | continue; |
1364 | 1402 | ||
1365 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1403 | if (zone_is_all_unreclaimable(zone) && |
1404 | priority != DEF_PRIORITY) | ||
1366 | continue; | 1405 | continue; |
1367 | 1406 | ||
1368 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1407 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
@@ -1371,18 +1410,25 @@ loop_again: | |||
1371 | temp_priority[i] = priority; | 1410 | temp_priority[i] = priority; |
1372 | sc.nr_scanned = 0; | 1411 | sc.nr_scanned = 0; |
1373 | note_zone_scanning_priority(zone, priority); | 1412 | note_zone_scanning_priority(zone, priority); |
1374 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1413 | /* |
1414 | * We put equal pressure on every zone, unless one | ||
1415 | * zone has way too many pages free already. | ||
1416 | */ | ||
1417 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | ||
1418 | end_zone, 0)) | ||
1419 | nr_reclaimed += shrink_zone(priority, zone, &sc); | ||
1375 | reclaim_state->reclaimed_slab = 0; | 1420 | reclaim_state->reclaimed_slab = 0; |
1376 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1421 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1377 | lru_pages); | 1422 | lru_pages); |
1378 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1423 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1379 | total_scanned += sc.nr_scanned; | 1424 | total_scanned += sc.nr_scanned; |
1380 | if (zone->all_unreclaimable) | 1425 | if (zone_is_all_unreclaimable(zone)) |
1381 | continue; | 1426 | continue; |
1382 | if (nr_slab == 0 && zone->pages_scanned >= | 1427 | if (nr_slab == 0 && zone->pages_scanned >= |
1383 | (zone_page_state(zone, NR_ACTIVE) | 1428 | (zone_page_state(zone, NR_ACTIVE) |
1384 | + zone_page_state(zone, NR_INACTIVE)) * 6) | 1429 | + zone_page_state(zone, NR_INACTIVE)) * 6) |
1385 | zone->all_unreclaimable = 1; | 1430 | zone_set_flag(zone, |
1431 | ZONE_ALL_UNRECLAIMABLE); | ||
1386 | /* | 1432 | /* |
1387 | * If we've done a decent amount of scanning and | 1433 | * If we've done a decent amount of scanning and |
1388 | * the reclaim ratio is low, start doing writepage | 1434 | * the reclaim ratio is low, start doing writepage |
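The new watermark test above implements the "way too many pages free" exception: a zone is only exempted from shrink_zone() when its free pages clear eight times its high watermark. A rough standalone model of that decision (illustrative only; the real zone_watermark_ok() also accounts for allocation order and lowmem reserves):

    #include <stdbool.h>
    #include <stdio.h>

    /* Rough stand-in for the new balance_pgdat() test: a zone keeps
     * receiving reclaim pressure unless its free pages comfortably
     * exceed 8x pages_high. */
    static bool gets_reclaim_pressure(long free_pages, long pages_high)
    {
        return free_pages < 8 * pages_high;
    }

    int main(void)
    {
        printf("%d %d\n",
               gets_reclaim_pressure(2000, 1000),    /* 1: still pressured */
               gets_reclaim_pressure(20000, 1000));  /* 0: left alone */
        return 0;
    }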
@@ -1548,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
1548 | if (!populated_zone(zone)) | 1594 | if (!populated_zone(zone)) |
1549 | continue; | 1595 | continue; |
1550 | 1596 | ||
1551 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | 1597 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) |
1552 | continue; | 1598 | continue; |
1553 | 1599 | ||
1554 | /* For pass = 0 we don't shrink the active list */ | 1600 | /* For pass = 0 we don't shrink the active list */ |
@@ -1688,9 +1734,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1688 | { | 1734 | { |
1689 | pg_data_t *pgdat; | 1735 | pg_data_t *pgdat; |
1690 | cpumask_t mask; | 1736 | cpumask_t mask; |
1737 | int nid; | ||
1691 | 1738 | ||
1692 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 1739 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
1693 | for_each_online_pgdat(pgdat) { | 1740 | for_each_node_state(nid, N_HIGH_MEMORY) { |
1741 | pgdat = NODE_DATA(nid); | ||
1694 | mask = node_to_cpumask(pgdat->node_id); | 1742 | mask = node_to_cpumask(pgdat->node_id); |
1695 | if (any_online_cpu(mask) != NR_CPUS) | 1743 | if (any_online_cpu(mask) != NR_CPUS) |
1696 | /* One of our CPUs online: restore mask */ | 1744 | /* One of our CPUs online: restore mask */ |
@@ -1727,7 +1775,7 @@ static int __init kswapd_init(void) | |||
1727 | int nid; | 1775 | int nid; |
1728 | 1776 | ||
1729 | swap_setup(); | 1777 | swap_setup(); |
1730 | for_each_online_node(nid) | 1778 | for_each_node_state(nid, N_HIGH_MEMORY) |
1731 | kswapd_run(nid); | 1779 | kswapd_run(nid); |
1732 | hotcpu_notifier(cpu_callback, 0); | 1780 | hotcpu_notifier(cpu_callback, 0); |
1733 | return 0; | 1781 | return 0; |
@@ -1847,8 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1847 | 1895 | ||
1848 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1896 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1849 | { | 1897 | { |
1850 | cpumask_t mask; | ||
1851 | int node_id; | 1898 | int node_id; |
1899 | int ret; | ||
1852 | 1900 | ||
1853 | /* | 1901 | /* |
1854 | * Zone reclaim reclaims unmapped file backed pages and | 1902 | * Zone reclaim reclaims unmapped file backed pages and |
@@ -1866,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1866 | <= zone->min_slab_pages) | 1914 | <= zone->min_slab_pages) |
1867 | return 0; | 1915 | return 0; |
1868 | 1916 | ||
1917 | if (zone_is_all_unreclaimable(zone)) | ||
1918 | return 0; | ||
1919 | |||
1869 | /* | 1920 | /* |
1870 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | 1921 | * Do not scan if the allocation should not be delayed. |
1871 | * not have reclaimable pages and if we should not delay the allocation | ||
1872 | * then do not scan. | ||
1873 | */ | 1922 | */ |
1874 | if (!(gfp_mask & __GFP_WAIT) || | 1923 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
1875 | zone->all_unreclaimable || | ||
1876 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1877 | (current->flags & PF_MEMALLOC)) | ||
1878 | return 0; | 1924 | return 0; |
1879 | 1925 | ||
1880 | /* | 1926 | /* |
@@ -1884,9 +1930,14 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1884 | * as wide as possible. | 1930 | * as wide as possible. |
1885 | */ | 1931 | */ |
1886 | node_id = zone_to_nid(zone); | 1932 | node_id = zone_to_nid(zone); |
1887 | mask = node_to_cpumask(node_id); | 1933 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
1888 | if (!cpus_empty(mask) && node_id != numa_node_id()) | 1934 | return 0; |
1935 | |||
1936 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | ||
1889 | return 0; | 1937 | return 0; |
1890 | return __zone_reclaim(zone, gfp_mask, order); | 1938 | ret = __zone_reclaim(zone, gfp_mask, order); |
1939 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | ||
1940 | |||
1941 | return ret; | ||
1891 | } | 1942 | } |
1892 | #endif | 1943 | #endif |
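The zone_reclaim() rework above replaces the reclaim_in_progress counter with a ZONE_RECLAIM_LOCKED flag used as a try-lock: whoever wins zone_test_and_set_flag() runs __zone_reclaim() and clears the flag afterwards, while everyone else returns 0 immediately instead of waiting. A standalone sketch of the same pattern with a C11 atomic flag (names are illustrative; the kernel uses the zone flag helpers shown in the hunk):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag zone_reclaim_locked = ATOMIC_FLAG_INIT;

    /* Illustrative try-lock: only one caller reclaims from the zone at a
     * time; everyone else bails out immediately instead of waiting. */
    static int example_zone_reclaim(void)
    {
        int ret;

        if (atomic_flag_test_and_set(&zone_reclaim_locked))
            return 0;                   /* someone else is already reclaiming */

        ret = 1;                        /* stands in for __zone_reclaim() */
        atomic_flag_clear(&zone_reclaim_locked);
        return ret;
    }

    int main(void)
    {
        printf("%d\n", example_zone_reclaim());
        return 0;
    }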
diff --git a/mm/vmstat.c b/mm/vmstat.c index c64d169537bf..4651bf153f35 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu) | |||
353 | } | 353 | } |
354 | } | 354 | } |
355 | 355 | ||
356 | static void __refresh_cpu_vm_stats(void *dummy) | ||
357 | { | ||
358 | refresh_cpu_vm_stats(smp_processor_id()); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Consolidate all counters. | ||
363 | * | ||
364 | * Note that the result is less inaccurate but still inaccurate | ||
365 | * if concurrent processes are allowed to run. | ||
366 | */ | ||
367 | void refresh_vm_stats(void) | ||
368 | { | ||
369 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
370 | } | ||
371 | EXPORT_SYMBOL(refresh_vm_stats); | ||
372 | |||
373 | #endif | 356 | #endif |
374 | 357 | ||
375 | #ifdef CONFIG_NUMA | 358 | #ifdef CONFIG_NUMA |
@@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
398 | 381 | ||
399 | #include <linux/seq_file.h> | 382 | #include <linux/seq_file.h> |
400 | 383 | ||
384 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
385 | "Unmovable", | ||
386 | "Reclaimable", | ||
387 | "Movable", | ||
388 | "Reserve", | ||
389 | }; | ||
390 | |||
401 | static void *frag_start(struct seq_file *m, loff_t *pos) | 391 | static void *frag_start(struct seq_file *m, loff_t *pos) |
402 | { | 392 | { |
403 | pg_data_t *pgdat; | 393 | pg_data_t *pgdat; |
@@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg) | |||
422 | { | 412 | { |
423 | } | 413 | } |
424 | 414 | ||
425 | /* | 415 | /* Walk all the zones in a node and print using a callback */ |
426 | * This walks the free areas for each zone. | 416 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, |
427 | */ | 417 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) |
428 | static int frag_show(struct seq_file *m, void *arg) | ||
429 | { | 418 | { |
430 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
431 | struct zone *zone; | 419 | struct zone *zone; |
432 | struct zone *node_zones = pgdat->node_zones; | 420 | struct zone *node_zones = pgdat->node_zones; |
433 | unsigned long flags; | 421 | unsigned long flags; |
434 | int order; | ||
435 | 422 | ||
436 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 423 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
437 | if (!populated_zone(zone)) | 424 | if (!populated_zone(zone)) |
438 | continue; | 425 | continue; |
439 | 426 | ||
440 | spin_lock_irqsave(&zone->lock, flags); | 427 | spin_lock_irqsave(&zone->lock, flags); |
441 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | 428 | print(m, pgdat, zone); |
442 | for (order = 0; order < MAX_ORDER; ++order) | ||
443 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
444 | spin_unlock_irqrestore(&zone->lock, flags); | 429 | spin_unlock_irqrestore(&zone->lock, flags); |
430 | } | ||
431 | } | ||
432 | |||
433 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | ||
434 | struct zone *zone) | ||
435 | { | ||
436 | int order; | ||
437 | |||
438 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
439 | for (order = 0; order < MAX_ORDER; ++order) | ||
440 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
441 | seq_putc(m, '\n'); | ||
442 | } | ||
443 | |||
444 | /* | ||
445 | * This walks the free areas for each zone. | ||
446 | */ | ||
447 | static int frag_show(struct seq_file *m, void *arg) | ||
448 | { | ||
449 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
450 | walk_zones_in_node(m, pgdat, frag_show_print); | ||
451 | return 0; | ||
452 | } | ||
453 | |||
454 | static void pagetypeinfo_showfree_print(struct seq_file *m, | ||
455 | pg_data_t *pgdat, struct zone *zone) | ||
456 | { | ||
457 | int order, mtype; | ||
458 | |||
459 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { | ||
460 | seq_printf(m, "Node %4d, zone %8s, type %12s ", | ||
461 | pgdat->node_id, | ||
462 | zone->name, | ||
463 | migratetype_names[mtype]); | ||
464 | for (order = 0; order < MAX_ORDER; ++order) { | ||
465 | unsigned long freecount = 0; | ||
466 | struct free_area *area; | ||
467 | struct list_head *curr; | ||
468 | |||
469 | area = &(zone->free_area[order]); | ||
470 | |||
471 | list_for_each(curr, &area->free_list[mtype]) | ||
472 | freecount++; | ||
473 | seq_printf(m, "%6lu ", freecount); | ||
474 | } | ||
445 | seq_putc(m, '\n'); | 475 | seq_putc(m, '\n'); |
446 | } | 476 | } |
477 | } | ||
478 | |||
479 | /* Print out the free pages at each order for each migratetype */ | ||
480 | static int pagetypeinfo_showfree(struct seq_file *m, void *arg) | ||
481 | { | ||
482 | int order; | ||
483 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
484 | |||
485 | /* Print header */ | ||
486 | seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); | ||
487 | for (order = 0; order < MAX_ORDER; ++order) | ||
488 | seq_printf(m, "%6d ", order); | ||
489 | seq_putc(m, '\n'); | ||
490 | |||
491 | walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); | ||
492 | |||
493 | return 0; | ||
494 | } | ||
495 | |||
496 | static void pagetypeinfo_showblockcount_print(struct seq_file *m, | ||
497 | pg_data_t *pgdat, struct zone *zone) | ||
498 | { | ||
499 | int mtype; | ||
500 | unsigned long pfn; | ||
501 | unsigned long start_pfn = zone->zone_start_pfn; | ||
502 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | ||
503 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
504 | |||
505 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
506 | struct page *page; | ||
507 | |||
508 | if (!pfn_valid(pfn)) | ||
509 | continue; | ||
510 | |||
511 | page = pfn_to_page(pfn); | ||
512 | mtype = get_pageblock_migratetype(page); | ||
513 | |||
514 | count[mtype]++; | ||
515 | } | ||
516 | |||
517 | /* Print counts */ | ||
518 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
519 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
520 | seq_printf(m, "%12lu ", count[mtype]); | ||
521 | seq_putc(m, '\n'); | ||
522 | } | ||
523 | |||
524 | /* Print out the number of pageblocks for each migratetype */ | ||
525 | static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | ||
526 | { | ||
527 | int mtype; | ||
528 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
529 | |||
530 | seq_printf(m, "\n%-23s", "Number of blocks type "); | ||
531 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
532 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
533 | seq_putc(m, '\n'); | ||
534 | walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); | ||
535 | |||
536 | return 0; | ||
537 | } | ||
538 | |||
539 | /* | ||
540 | * This prints out statistics in relation to grouping pages by mobility. | ||
541 | * It is expensive to collect so do not constantly read the file. | ||
542 | */ | ||
543 | static int pagetypeinfo_show(struct seq_file *m, void *arg) | ||
544 | { | ||
545 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
546 | |||
547 | seq_printf(m, "Page block order: %d\n", pageblock_order); | ||
548 | seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); | ||
549 | seq_putc(m, '\n'); | ||
550 | pagetypeinfo_showfree(m, pgdat); | ||
551 | pagetypeinfo_showblockcount(m, pgdat); | ||
552 | |||
447 | return 0; | 553 | return 0; |
448 | } | 554 | } |
449 | 555 | ||
@@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = { | |||
454 | .show = frag_show, | 560 | .show = frag_show, |
455 | }; | 561 | }; |
456 | 562 | ||
563 | const struct seq_operations pagetypeinfo_op = { | ||
564 | .start = frag_start, | ||
565 | .next = frag_next, | ||
566 | .stop = frag_stop, | ||
567 | .show = pagetypeinfo_show, | ||
568 | }; | ||
569 | |||
457 | #ifdef CONFIG_ZONE_DMA | 570 | #ifdef CONFIG_ZONE_DMA |
458 | #define TEXT_FOR_DMA(xx) xx "_dma", | 571 | #define TEXT_FOR_DMA(xx) xx "_dma", |
459 | #else | 572 | #else |
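pagetypeinfo_op reuses frag_start/frag_next/frag_stop and only swaps in pagetypeinfo_show, so the new output is produced by the same per-node seq_file walk as the existing fragmentation_op. Assuming the corresponding fs/proc change (not part of this mm/ diff) exposes it as /proc/pagetypeinfo, a trivial reader looks like the sketch below; as the comment in the previous hunk warns, the data is expensive to collect, so avoid polling it:

    #include <stdio.h>

    /* Dump the (assumed) /proc/pagetypeinfo file once.  The path is an
     * assumption based on this patch; the proc registration itself is
     * not part of the mm/ diff shown here. */
    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/pagetypeinfo", "r");

        if (!f) {
            perror("/proc/pagetypeinfo");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }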
@@ -532,84 +645,78 @@ static const char * const vmstat_text[] = { | |||
532 | #endif | 645 | #endif |
533 | }; | 646 | }; |
534 | 647 | ||
535 | /* | 648 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
536 | * Output information about zones in @pgdat. | 649 | struct zone *zone) |
537 | */ | ||
538 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
539 | { | 650 | { |
540 | pg_data_t *pgdat = arg; | 651 | int i; |
541 | struct zone *zone; | 652 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); |
542 | struct zone *node_zones = pgdat->node_zones; | 653 | seq_printf(m, |
543 | unsigned long flags; | 654 | "\n pages free %lu" |
544 | 655 | "\n min %lu" | |
545 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 656 | "\n low %lu" |
546 | int i; | 657 | "\n high %lu" |
547 | 658 | "\n scanned %lu (a: %lu i: %lu)" | |
548 | if (!populated_zone(zone)) | 659 | "\n spanned %lu" |
549 | continue; | 660 | "\n present %lu", |
550 | 661 | zone_page_state(zone, NR_FREE_PAGES), | |
551 | spin_lock_irqsave(&zone->lock, flags); | 662 | zone->pages_min, |
552 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 663 | zone->pages_low, |
553 | seq_printf(m, | 664 | zone->pages_high, |
554 | "\n pages free %lu" | 665 | zone->pages_scanned, |
555 | "\n min %lu" | 666 | zone->nr_scan_active, zone->nr_scan_inactive, |
556 | "\n low %lu" | 667 | zone->spanned_pages, |
557 | "\n high %lu" | 668 | zone->present_pages); |
558 | "\n scanned %lu (a: %lu i: %lu)" | ||
559 | "\n spanned %lu" | ||
560 | "\n present %lu", | ||
561 | zone_page_state(zone, NR_FREE_PAGES), | ||
562 | zone->pages_min, | ||
563 | zone->pages_low, | ||
564 | zone->pages_high, | ||
565 | zone->pages_scanned, | ||
566 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
567 | zone->spanned_pages, | ||
568 | zone->present_pages); | ||
569 | 669 | ||
570 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 670 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
571 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 671 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
572 | zone_page_state(zone, i)); | 672 | zone_page_state(zone, i)); |
573 | 673 | ||
574 | seq_printf(m, | 674 | seq_printf(m, |
575 | "\n protection: (%lu", | 675 | "\n protection: (%lu", |
576 | zone->lowmem_reserve[0]); | 676 | zone->lowmem_reserve[0]); |
577 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 677 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
578 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 678 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); |
579 | seq_printf(m, | 679 | seq_printf(m, |
580 | ")" | 680 | ")" |
581 | "\n pagesets"); | 681 | "\n pagesets"); |
582 | for_each_online_cpu(i) { | 682 | for_each_online_cpu(i) { |
583 | struct per_cpu_pageset *pageset; | 683 | struct per_cpu_pageset *pageset; |
584 | int j; | 684 | int j; |
585 | 685 | ||
586 | pageset = zone_pcp(zone, i); | 686 | pageset = zone_pcp(zone, i); |
587 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 687 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { |
588 | seq_printf(m, | 688 | seq_printf(m, |
589 | "\n cpu: %i pcp: %i" | 689 | "\n cpu: %i pcp: %i" |
590 | "\n count: %i" | 690 | "\n count: %i" |
591 | "\n high: %i" | 691 | "\n high: %i" |
592 | "\n batch: %i", | 692 | "\n batch: %i", |
593 | i, j, | 693 | i, j, |
594 | pageset->pcp[j].count, | 694 | pageset->pcp[j].count, |
595 | pageset->pcp[j].high, | 695 | pageset->pcp[j].high, |
596 | pageset->pcp[j].batch); | 696 | pageset->pcp[j].batch); |
597 | } | 697 | } |
598 | #ifdef CONFIG_SMP | 698 | #ifdef CONFIG_SMP |
599 | seq_printf(m, "\n vm stats threshold: %d", | 699 | seq_printf(m, "\n vm stats threshold: %d", |
600 | pageset->stat_threshold); | 700 | pageset->stat_threshold); |
601 | #endif | 701 | #endif |
602 | } | ||
603 | seq_printf(m, | ||
604 | "\n all_unreclaimable: %u" | ||
605 | "\n prev_priority: %i" | ||
606 | "\n start_pfn: %lu", | ||
607 | zone->all_unreclaimable, | ||
608 | zone->prev_priority, | ||
609 | zone->zone_start_pfn); | ||
610 | spin_unlock_irqrestore(&zone->lock, flags); | ||
611 | seq_putc(m, '\n'); | ||
612 | } | 702 | } |
703 | seq_printf(m, | ||
704 | "\n all_unreclaimable: %u" | ||
705 | "\n prev_priority: %i" | ||
706 | "\n start_pfn: %lu", | ||
707 | zone_is_all_unreclaimable(zone), | ||
708 | zone->prev_priority, | ||
709 | zone->zone_start_pfn); | ||
710 | seq_putc(m, '\n'); | ||
711 | } | ||
712 | |||
713 | /* | ||
714 | * Output information about zones in @pgdat. | ||
715 | */ | ||
716 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
717 | { | ||
718 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
719 | walk_zones_in_node(m, pgdat, zoneinfo_show_print); | ||
613 | return 0; | 720 | return 0; |
614 | } | 721 | } |
615 | 722 | ||
@@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
741 | static struct notifier_block __cpuinitdata vmstat_notifier = | 848 | static struct notifier_block __cpuinitdata vmstat_notifier = |
742 | { &vmstat_cpuup_callback, NULL, 0 }; | 849 | { &vmstat_cpuup_callback, NULL, 0 }; |
743 | 850 | ||
744 | int __init setup_vmstat(void) | 851 | static int __init setup_vmstat(void) |
745 | { | 852 | { |
746 | int cpu; | 853 | int cpu; |
747 | 854 | ||