Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            18
-rw-r--r--  mm/Makefile            3
-rw-r--r--  mm/backing-dev.c      47
-rw-r--r--  mm/bounce.c            6
-rw-r--r--  mm/filemap.c         781
-rw-r--r--  mm/filemap.h         103
-rw-r--r--  mm/filemap_xip.c      17
-rw-r--r--  mm/fremap.c           26
-rw-r--r--  mm/hugetlb.c         398
-rw-r--r--  mm/internal.h         10
-rw-r--r--  mm/memory.c          161
-rw-r--r--  mm/memory_hotplug.c  312
-rw-r--r--  mm/mempolicy.c        60
-rw-r--r--  mm/migrate.c           6
-rw-r--r--  mm/mmap.c              3
-rw-r--r--  mm/mprotect.c          1
-rw-r--r--  mm/nommu.c             1
-rw-r--r--  mm/oom_kill.c        116
-rw-r--r--  mm/page-writeback.c  310
-rw-r--r--  mm/page_alloc.c      754
-rw-r--r--  mm/page_isolation.c  138
-rw-r--r--  mm/readahead.c        94
-rw-r--r--  mm/rmap.c              5
-rw-r--r--  mm/shmem.c            82
-rw-r--r--  mm/slab.c             35
-rw-r--r--  mm/slob.c             13
-rw-r--r--  mm/slub.c            520
-rw-r--r--  mm/sparse-vmemmap.c  148
-rw-r--r--  mm/sparse.c          105
-rw-r--r--  mm/swap.c            111
-rw-r--r--  mm/swap_state.c        5
-rw-r--r--  mm/tiny-shmem.c       19
-rw-r--r--  mm/truncate.c          3
-rw-r--r--  mm/util.c              6
-rw-r--r--  mm/vmalloc.c           5
-rw-r--r--  mm/vmscan.c           99
-rw-r--r--  mm/vmstat.c          305
37 files changed, 3561 insertions, 1265 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348083c3..b1f03b0eb7f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME
112 def_bool y 112 def_bool y
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114 114
115#
116# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
117# and page_to_pfn. The most efficient option where kernel virtual space is
118# not under pressure.
119#
120config SPARSEMEM_VMEMMAP_ENABLE
121 def_bool n
122
123config SPARSEMEM_VMEMMAP
124 bool
125 depends on SPARSEMEM
126 default y if (SPARSEMEM_VMEMMAP_ENABLE)
127
115# eventually, we can have this option just 'select SPARSEMEM' 128# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG 129config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add" 130 bool "Allow for memory hot-add"
@@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE
126 def_bool y 139 def_bool y
127 depends on SPARSEMEM && MEMORY_HOTPLUG 140 depends on SPARSEMEM && MEMORY_HOTPLUG
128 141
142config MEMORY_HOTREMOVE
143 bool "Allow for memory hot remove"
144 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
145 depends on MIGRATION
146
129# Heavily threaded applications may benefit from splitting the mm-wide 147# Heavily threaded applications may benefit from splitting the mm-wide
130# page_table_lock, so that faults on different parts of the user address 148# page_table_lock, so that faults on different parts of the user address
131# space can be handled with less contention: split it at this NR_CPUS. 149# space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 $(mmu-y) 14 page_isolation.o $(mmu-y)
15 15
16obj-$(CONFIG_BOUNCE) += bounce.o 16obj-$(CONFIG_BOUNCE) += bounce.o
17obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 17obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
18obj-$(CONFIG_HUGETLBFS) += hugetlb.o 18obj-$(CONFIG_HUGETLBFS) += hugetlb.o
19obj-$(CONFIG_NUMA) += mempolicy.o 19obj-$(CONFIG_NUMA) += mempolicy.o
20obj-$(CONFIG_SPARSEMEM) += sparse.o 20obj-$(CONFIG_SPARSEMEM) += sparse.o
21obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
21obj-$(CONFIG_SHMEM) += shmem.o 22obj-$(CONFIG_SHMEM) += shmem.o
22obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 23obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
23obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 24obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/module.h> 6#include <linux/module.h>
7 7
8int bdi_init(struct backing_dev_info *bdi)
9{
10 int i, j;
11 int err;
12
13 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
14 err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
15 if (err)
16 goto err;
17 }
18
19 bdi->dirty_exceeded = 0;
20 err = prop_local_init_percpu(&bdi->completions);
21
22 if (err) {
23err:
24 for (j = 0; j < i; j++)
25 percpu_counter_destroy(&bdi->bdi_stat[i]);
26 }
27
28 return err;
29}
30EXPORT_SYMBOL(bdi_init);
31
32void bdi_destroy(struct backing_dev_info *bdi)
33{
34 int i;
35
36 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
37 percpu_counter_destroy(&bdi->bdi_stat[i]);
38
39 prop_local_destroy_percpu(&bdi->completions);
40}
41EXPORT_SYMBOL(bdi_destroy);
42
8static wait_queue_head_t congestion_wqh[2] = { 43static wait_queue_head_t congestion_wqh[2] = {
9 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 44 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
10 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 45 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout)
55} 90}
56EXPORT_SYMBOL(congestion_wait); 91EXPORT_SYMBOL(congestion_wait);
57 92
58/**
59 * congestion_end - wake up sleepers on a congested backing_dev_info
60 * @rw: READ or WRITE
61 */
62void congestion_end(int rw)
63{
64 wait_queue_head_t *wqh = &congestion_wqh[rw];
65
66 if (waitqueue_active(wqh))
67 wake_up(wqh);
68}
69EXPORT_SYMBOL(congestion_end);
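
As a rough illustration of the bdi_init()/bdi_destroy() pair added in the hunk above, the sketch below shows how a block driver might wrap them around its backing_dev_info. Only the two bdi helpers come from this patch; the mydev structure and function names are hypothetical.

#include <linux/backing-dev.h>

struct mydev {				/* hypothetical driver private data */
	struct backing_dev_info bdi;
};

static int mydev_setup(struct mydev *dev)
{
	int err;

	/* set up the per-cpu bdi_stat counters and completion state */
	err = bdi_init(&dev->bdi);
	if (err)
		return err;
	/* ... register queues, sysfs entries, etc. ... */
	return 0;
}

static void mydev_teardown(struct mydev *dev)
{
	/* tear down the counters once no more writeback stats can arrive */
	bdi_destroy(&dev->bdi);
}
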
diff --git a/mm/bounce.c b/mm/bounce.c
index 3b549bf31f7d..b6d2d0f1019b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -265,6 +265,12 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
265 mempool_t *pool; 265 mempool_t *pool;
266 266
267 /* 267 /*
268 * Data-less bio, nothing to bounce
269 */
270 if (bio_empty_barrier(*bio_orig))
271 return;
272
273 /*
268 * for non-isa bounce case, just check if the bounce pfn is equal 274 * for non-isa bounce case, just check if the bounce pfn is equal
269 * to or bigger than the highest pfn in the system -- in that case, 275 * to or bigger than the highest pfn in the system -- in that case,
270 * don't waste time iterating over bio segments 276 * don't waste time iterating over bio segments
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..79f24a969cb4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h> 32#include <linux/cpuset.h>
33#include "filemap.h" 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34#include "internal.h" 34#include "internal.h"
35 35
36/* 36/*
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 65 * ->mapping->tree_lock
66 * ->zone.lock
66 * 67 *
67 * ->i_mutex 68 * ->i_mutex
68 * ->i_mmap_lock (truncate->unmap_mapping_range) 69 * ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -593,7 +594,7 @@ void fastcall __lock_page_nosync(struct page *page)
593 * Is there a pagecache struct page at the given (mapping, offset) tuple? 594 * Is there a pagecache struct page at the given (mapping, offset) tuple?
594 * If yes, increment its refcount and return it; if no, return NULL. 595 * If yes, increment its refcount and return it; if no, return NULL.
595 */ 596 */
596struct page * find_get_page(struct address_space *mapping, unsigned long offset) 597struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
597{ 598{
598 struct page *page; 599 struct page *page;
599 600
@@ -617,30 +618,31 @@ EXPORT_SYMBOL(find_get_page);
617 * Returns zero if the page was not present. find_lock_page() may sleep. 618 * Returns zero if the page was not present. find_lock_page() may sleep.
618 */ 619 */
619struct page *find_lock_page(struct address_space *mapping, 620struct page *find_lock_page(struct address_space *mapping,
620 unsigned long offset) 621 pgoff_t offset)
621{ 622{
622 struct page *page; 623 struct page *page;
623 624
624 read_lock_irq(&mapping->tree_lock);
625repeat: 625repeat:
626 read_lock_irq(&mapping->tree_lock);
626 page = radix_tree_lookup(&mapping->page_tree, offset); 627 page = radix_tree_lookup(&mapping->page_tree, offset);
627 if (page) { 628 if (page) {
628 page_cache_get(page); 629 page_cache_get(page);
629 if (TestSetPageLocked(page)) { 630 if (TestSetPageLocked(page)) {
630 read_unlock_irq(&mapping->tree_lock); 631 read_unlock_irq(&mapping->tree_lock);
631 __lock_page(page); 632 __lock_page(page);
632 read_lock_irq(&mapping->tree_lock);
633 633
634 /* Has the page been truncated while we slept? */ 634 /* Has the page been truncated while we slept? */
635 if (unlikely(page->mapping != mapping || 635 if (unlikely(page->mapping != mapping)) {
636 page->index != offset)) {
637 unlock_page(page); 636 unlock_page(page);
638 page_cache_release(page); 637 page_cache_release(page);
639 goto repeat; 638 goto repeat;
640 } 639 }
640 VM_BUG_ON(page->index != offset);
641 goto out;
641 } 642 }
642 } 643 }
643 read_unlock_irq(&mapping->tree_lock); 644 read_unlock_irq(&mapping->tree_lock);
645out:
644 return page; 646 return page;
645} 647}
646EXPORT_SYMBOL(find_lock_page); 648EXPORT_SYMBOL(find_lock_page);
@@ -663,29 +665,24 @@ EXPORT_SYMBOL(find_lock_page);
663 * memory exhaustion. 665 * memory exhaustion.
664 */ 666 */
665struct page *find_or_create_page(struct address_space *mapping, 667struct page *find_or_create_page(struct address_space *mapping,
666 unsigned long index, gfp_t gfp_mask) 668 pgoff_t index, gfp_t gfp_mask)
667{ 669{
668 struct page *page, *cached_page = NULL; 670 struct page *page;
669 int err; 671 int err;
670repeat: 672repeat:
671 page = find_lock_page(mapping, index); 673 page = find_lock_page(mapping, index);
672 if (!page) { 674 if (!page) {
673 if (!cached_page) { 675 page = __page_cache_alloc(gfp_mask);
674 cached_page = 676 if (!page)
675 __page_cache_alloc(gfp_mask); 677 return NULL;
676 if (!cached_page) 678 err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
677 return NULL; 679 if (unlikely(err)) {
680 page_cache_release(page);
681 page = NULL;
682 if (err == -EEXIST)
683 goto repeat;
678 } 684 }
679 err = add_to_page_cache_lru(cached_page, mapping,
680 index, gfp_mask);
681 if (!err) {
682 page = cached_page;
683 cached_page = NULL;
684 } else if (err == -EEXIST)
685 goto repeat;
686 } 685 }
687 if (cached_page)
688 page_cache_release(cached_page);
689 return page; 686 return page;
690} 687}
691EXPORT_SYMBOL(find_or_create_page); 688EXPORT_SYMBOL(find_or_create_page);
@@ -797,7 +794,7 @@ EXPORT_SYMBOL(find_get_pages_tag);
797 * and deadlock against the caller's locked page. 794 * and deadlock against the caller's locked page.
798 */ 795 */
799struct page * 796struct page *
800grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 797grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
801{ 798{
802 struct page *page = find_get_page(mapping, index); 799 struct page *page = find_get_page(mapping, index);
803 800
@@ -859,34 +856,29 @@ static void shrink_readahead_size_eio(struct file *filp,
859 * It may be NULL. 856 * It may be NULL.
860 */ 857 */
861void do_generic_mapping_read(struct address_space *mapping, 858void do_generic_mapping_read(struct address_space *mapping,
862 struct file_ra_state *_ra, 859 struct file_ra_state *ra,
863 struct file *filp, 860 struct file *filp,
864 loff_t *ppos, 861 loff_t *ppos,
865 read_descriptor_t *desc, 862 read_descriptor_t *desc,
866 read_actor_t actor) 863 read_actor_t actor)
867{ 864{
868 struct inode *inode = mapping->host; 865 struct inode *inode = mapping->host;
869 unsigned long index; 866 pgoff_t index;
870 unsigned long offset; 867 pgoff_t last_index;
871 unsigned long last_index; 868 pgoff_t prev_index;
872 unsigned long next_index; 869 unsigned long offset; /* offset into pagecache page */
873 unsigned long prev_index;
874 unsigned int prev_offset; 870 unsigned int prev_offset;
875 struct page *cached_page;
876 int error; 871 int error;
877 struct file_ra_state ra = *_ra;
878 872
879 cached_page = NULL;
880 index = *ppos >> PAGE_CACHE_SHIFT; 873 index = *ppos >> PAGE_CACHE_SHIFT;
881 next_index = index; 874 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
882 prev_index = ra.prev_index; 875 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
883 prev_offset = ra.prev_offset;
884 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 876 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
885 offset = *ppos & ~PAGE_CACHE_MASK; 877 offset = *ppos & ~PAGE_CACHE_MASK;
886 878
887 for (;;) { 879 for (;;) {
888 struct page *page; 880 struct page *page;
889 unsigned long end_index; 881 pgoff_t end_index;
890 loff_t isize; 882 loff_t isize;
891 unsigned long nr, ret; 883 unsigned long nr, ret;
892 884
@@ -895,7 +887,7 @@ find_page:
895 page = find_get_page(mapping, index); 887 page = find_get_page(mapping, index);
896 if (!page) { 888 if (!page) {
897 page_cache_sync_readahead(mapping, 889 page_cache_sync_readahead(mapping,
898 &ra, filp, 890 ra, filp,
899 index, last_index - index); 891 index, last_index - index);
900 page = find_get_page(mapping, index); 892 page = find_get_page(mapping, index);
901 if (unlikely(page == NULL)) 893 if (unlikely(page == NULL))
@@ -903,7 +895,7 @@ find_page:
903 } 895 }
904 if (PageReadahead(page)) { 896 if (PageReadahead(page)) {
905 page_cache_async_readahead(mapping, 897 page_cache_async_readahead(mapping,
906 &ra, filp, page, 898 ra, filp, page,
907 index, last_index - index); 899 index, last_index - index);
908 } 900 }
909 if (!PageUptodate(page)) 901 if (!PageUptodate(page))
@@ -966,7 +958,6 @@ page_ok:
966 index += offset >> PAGE_CACHE_SHIFT; 958 index += offset >> PAGE_CACHE_SHIFT;
967 offset &= ~PAGE_CACHE_MASK; 959 offset &= ~PAGE_CACHE_MASK;
968 prev_offset = offset; 960 prev_offset = offset;
969 ra.prev_offset = offset;
970 961
971 page_cache_release(page); 962 page_cache_release(page);
972 if (ret == nr && desc->count) 963 if (ret == nr && desc->count)
@@ -1015,7 +1006,7 @@ readpage:
1015 } 1006 }
1016 unlock_page(page); 1007 unlock_page(page);
1017 error = -EIO; 1008 error = -EIO;
1018 shrink_readahead_size_eio(filp, &ra); 1009 shrink_readahead_size_eio(filp, ra);
1019 goto readpage_error; 1010 goto readpage_error;
1020 } 1011 }
1021 unlock_page(page); 1012 unlock_page(page);
@@ -1034,33 +1025,29 @@ no_cached_page:
1034 * Ok, it wasn't cached, so we need to create a new 1025 * Ok, it wasn't cached, so we need to create a new
1035 * page.. 1026 * page..
1036 */ 1027 */
1037 if (!cached_page) { 1028 page = page_cache_alloc_cold(mapping);
1038 cached_page = page_cache_alloc_cold(mapping); 1029 if (!page) {
1039 if (!cached_page) { 1030 desc->error = -ENOMEM;
1040 desc->error = -ENOMEM; 1031 goto out;
1041 goto out;
1042 }
1043 } 1032 }
1044 error = add_to_page_cache_lru(cached_page, mapping, 1033 error = add_to_page_cache_lru(page, mapping,
1045 index, GFP_KERNEL); 1034 index, GFP_KERNEL);
1046 if (error) { 1035 if (error) {
1036 page_cache_release(page);
1047 if (error == -EEXIST) 1037 if (error == -EEXIST)
1048 goto find_page; 1038 goto find_page;
1049 desc->error = error; 1039 desc->error = error;
1050 goto out; 1040 goto out;
1051 } 1041 }
1052 page = cached_page;
1053 cached_page = NULL;
1054 goto readpage; 1042 goto readpage;
1055 } 1043 }
1056 1044
1057out: 1045out:
1058 *_ra = ra; 1046 ra->prev_pos = prev_index;
1059 _ra->prev_index = prev_index; 1047 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1048 ra->prev_pos |= prev_offset;
1060 1049
1061 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1050 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1062 if (cached_page)
1063 page_cache_release(cached_page);
1064 if (filp) 1051 if (filp)
1065 file_accessed(filp); 1052 file_accessed(filp);
1066} 1053}
@@ -1220,7 +1207,7 @@ EXPORT_SYMBOL(generic_file_aio_read);
1220 1207
1221static ssize_t 1208static ssize_t
1222do_readahead(struct address_space *mapping, struct file *filp, 1209do_readahead(struct address_space *mapping, struct file *filp,
1223 unsigned long index, unsigned long nr) 1210 pgoff_t index, unsigned long nr)
1224{ 1211{
1225 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1212 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1226 return -EINVAL; 1213 return -EINVAL;
@@ -1240,8 +1227,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1240 if (file) { 1227 if (file) {
1241 if (file->f_mode & FMODE_READ) { 1228 if (file->f_mode & FMODE_READ) {
1242 struct address_space *mapping = file->f_mapping; 1229 struct address_space *mapping = file->f_mapping;
1243 unsigned long start = offset >> PAGE_CACHE_SHIFT; 1230 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1244 unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 1231 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1245 unsigned long len = end - start + 1; 1232 unsigned long len = end - start + 1;
1246 ret = do_readahead(mapping, file, start, len); 1233 ret = do_readahead(mapping, file, start, len);
1247 } 1234 }
@@ -1251,7 +1238,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1251} 1238}
1252 1239
1253#ifdef CONFIG_MMU 1240#ifdef CONFIG_MMU
1254static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1255/** 1241/**
1256 * page_cache_read - adds requested page to the page cache if not already there 1242 * page_cache_read - adds requested page to the page cache if not already there
1257 * @file: file to read 1243 * @file: file to read
@@ -1260,7 +1246,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1260 * This adds the requested page to the page cache if it isn't already there, 1246 * This adds the requested page to the page cache if it isn't already there,
1261 * and schedules an I/O to read in its contents from disk. 1247 * and schedules an I/O to read in its contents from disk.
1262 */ 1248 */
1263static int fastcall page_cache_read(struct file * file, unsigned long offset) 1249static int fastcall page_cache_read(struct file * file, pgoff_t offset)
1264{ 1250{
1265 struct address_space *mapping = file->f_mapping; 1251 struct address_space *mapping = file->f_mapping;
1266 struct page *page; 1252 struct page *page;
@@ -1349,7 +1335,7 @@ retry_find:
1349 * Do we miss much more than hit in this file? If so, 1335 * Do we miss much more than hit in this file? If so,
1350 * stop bothering with read-ahead. It will only hurt. 1336 * stop bothering with read-ahead. It will only hurt.
1351 */ 1337 */
1352 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) 1338 if (ra->mmap_miss > MMAP_LOTSAMISS)
1353 goto no_cached_page; 1339 goto no_cached_page;
1354 1340
1355 /* 1341 /*
@@ -1375,7 +1361,7 @@ retry_find:
1375 } 1361 }
1376 1362
1377 if (!did_readaround) 1363 if (!did_readaround)
1378 ra->mmap_hit++; 1364 ra->mmap_miss--;
1379 1365
1380 /* 1366 /*
1381 * We have a locked page in the page cache, now we need to check 1367 * We have a locked page in the page cache, now we need to check
@@ -1396,7 +1382,7 @@ retry_find:
1396 * Found the page and have a reference on it. 1382 * Found the page and have a reference on it.
1397 */ 1383 */
1398 mark_page_accessed(page); 1384 mark_page_accessed(page);
1399 ra->prev_index = page->index; 1385 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1400 vmf->page = page; 1386 vmf->page = page;
1401 return ret | VM_FAULT_LOCKED; 1387 return ret | VM_FAULT_LOCKED;
1402 1388
@@ -1501,39 +1487,32 @@ EXPORT_SYMBOL(generic_file_mmap);
1501EXPORT_SYMBOL(generic_file_readonly_mmap); 1487EXPORT_SYMBOL(generic_file_readonly_mmap);
1502 1488
1503static struct page *__read_cache_page(struct address_space *mapping, 1489static struct page *__read_cache_page(struct address_space *mapping,
1504 unsigned long index, 1490 pgoff_t index,
1505 int (*filler)(void *,struct page*), 1491 int (*filler)(void *,struct page*),
1506 void *data) 1492 void *data)
1507{ 1493{
1508 struct page *page, *cached_page = NULL; 1494 struct page *page;
1509 int err; 1495 int err;
1510repeat: 1496repeat:
1511 page = find_get_page(mapping, index); 1497 page = find_get_page(mapping, index);
1512 if (!page) { 1498 if (!page) {
1513 if (!cached_page) { 1499 page = page_cache_alloc_cold(mapping);
1514 cached_page = page_cache_alloc_cold(mapping); 1500 if (!page)
1515 if (!cached_page) 1501 return ERR_PTR(-ENOMEM);
1516 return ERR_PTR(-ENOMEM); 1502 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1517 } 1503 if (unlikely(err)) {
1518 err = add_to_page_cache_lru(cached_page, mapping, 1504 page_cache_release(page);
1519 index, GFP_KERNEL); 1505 if (err == -EEXIST)
1520 if (err == -EEXIST) 1506 goto repeat;
1521 goto repeat;
1522 if (err < 0) {
1523 /* Presumably ENOMEM for radix tree node */ 1507 /* Presumably ENOMEM for radix tree node */
1524 page_cache_release(cached_page);
1525 return ERR_PTR(err); 1508 return ERR_PTR(err);
1526 } 1509 }
1527 page = cached_page;
1528 cached_page = NULL;
1529 err = filler(data, page); 1510 err = filler(data, page);
1530 if (err < 0) { 1511 if (err < 0) {
1531 page_cache_release(page); 1512 page_cache_release(page);
1532 page = ERR_PTR(err); 1513 page = ERR_PTR(err);
1533 } 1514 }
1534 } 1515 }
1535 if (cached_page)
1536 page_cache_release(cached_page);
1537 return page; 1516 return page;
1538} 1517}
1539 1518
@@ -1542,7 +1521,7 @@ repeat:
1542 * after submitting it to the filler. 1521 * after submitting it to the filler.
1543 */ 1522 */
1544struct page *read_cache_page_async(struct address_space *mapping, 1523struct page *read_cache_page_async(struct address_space *mapping,
1545 unsigned long index, 1524 pgoff_t index,
1546 int (*filler)(void *,struct page*), 1525 int (*filler)(void *,struct page*),
1547 void *data) 1526 void *data)
1548{ 1527{
@@ -1590,7 +1569,7 @@ EXPORT_SYMBOL(read_cache_page_async);
1590 * If the page does not get brought uptodate, return -EIO. 1569 * If the page does not get brought uptodate, return -EIO.
1591 */ 1570 */
1592struct page *read_cache_page(struct address_space *mapping, 1571struct page *read_cache_page(struct address_space *mapping,
1593 unsigned long index, 1572 pgoff_t index,
1594 int (*filler)(void *,struct page*), 1573 int (*filler)(void *,struct page*),
1595 void *data) 1574 void *data)
1596{ 1575{
@@ -1610,40 +1589,6 @@ struct page *read_cache_page(struct address_space *mapping,
1610EXPORT_SYMBOL(read_cache_page); 1589EXPORT_SYMBOL(read_cache_page);
1611 1590
1612/* 1591/*
1613 * If the page was newly created, increment its refcount and add it to the
1614 * caller's lru-buffering pagevec. This function is specifically for
1615 * generic_file_write().
1616 */
1617static inline struct page *
1618__grab_cache_page(struct address_space *mapping, unsigned long index,
1619 struct page **cached_page, struct pagevec *lru_pvec)
1620{
1621 int err;
1622 struct page *page;
1623repeat:
1624 page = find_lock_page(mapping, index);
1625 if (!page) {
1626 if (!*cached_page) {
1627 *cached_page = page_cache_alloc(mapping);
1628 if (!*cached_page)
1629 return NULL;
1630 }
1631 err = add_to_page_cache(*cached_page, mapping,
1632 index, GFP_KERNEL);
1633 if (err == -EEXIST)
1634 goto repeat;
1635 if (err == 0) {
1636 page = *cached_page;
1637 page_cache_get(page);
1638 if (!pagevec_add(lru_pvec, page))
1639 __pagevec_lru_add(lru_pvec);
1640 *cached_page = NULL;
1641 }
1642 }
1643 return page;
1644}
1645
1646/*
1647 * The logic we want is 1592 * The logic we want is
1648 * 1593 *
1649 * if suid or (sgid and xgrp) 1594 * if suid or (sgid and xgrp)
@@ -1682,17 +1627,22 @@ int __remove_suid(struct dentry *dentry, int kill)
1682 1627
1683int remove_suid(struct dentry *dentry) 1628int remove_suid(struct dentry *dentry)
1684{ 1629{
1685 int kill = should_remove_suid(dentry); 1630 int killsuid = should_remove_suid(dentry);
1631 int killpriv = security_inode_need_killpriv(dentry);
1632 int error = 0;
1686 1633
1687 if (unlikely(kill)) 1634 if (killpriv < 0)
1688 return __remove_suid(dentry, kill); 1635 return killpriv;
1636 if (killpriv)
1637 error = security_inode_killpriv(dentry);
1638 if (!error && killsuid)
1639 error = __remove_suid(dentry, killsuid);
1689 1640
1690 return 0; 1641 return error;
1691} 1642}
1692EXPORT_SYMBOL(remove_suid); 1643EXPORT_SYMBOL(remove_suid);
1693 1644
1694size_t 1645static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1695__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1696 const struct iovec *iov, size_t base, size_t bytes) 1646 const struct iovec *iov, size_t base, size_t bytes)
1697{ 1647{
1698 size_t copied = 0, left = 0; 1648 size_t copied = 0, left = 0;
@@ -1715,6 +1665,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
1715} 1665}
1716 1666
1717/* 1667/*
1668 * Copy as much as we can into the page and return the number of bytes which
1669 * were sucessfully copied. If a fault is encountered then return the number of
1670 * bytes which were copied.
1671 */
1672size_t iov_iter_copy_from_user_atomic(struct page *page,
1673 struct iov_iter *i, unsigned long offset, size_t bytes)
1674{
1675 char *kaddr;
1676 size_t copied;
1677
1678 BUG_ON(!in_atomic());
1679 kaddr = kmap_atomic(page, KM_USER0);
1680 if (likely(i->nr_segs == 1)) {
1681 int left;
1682 char __user *buf = i->iov->iov_base + i->iov_offset;
1683 left = __copy_from_user_inatomic_nocache(kaddr + offset,
1684 buf, bytes);
1685 copied = bytes - left;
1686 } else {
1687 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1688 i->iov, i->iov_offset, bytes);
1689 }
1690 kunmap_atomic(kaddr, KM_USER0);
1691
1692 return copied;
1693}
1694EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1695
1696/*
1697 * This has the same sideeffects and return value as
1698 * iov_iter_copy_from_user_atomic().
1699 * The difference is that it attempts to resolve faults.
1700 * Page must not be locked.
1701 */
1702size_t iov_iter_copy_from_user(struct page *page,
1703 struct iov_iter *i, unsigned long offset, size_t bytes)
1704{
1705 char *kaddr;
1706 size_t copied;
1707
1708 kaddr = kmap(page);
1709 if (likely(i->nr_segs == 1)) {
1710 int left;
1711 char __user *buf = i->iov->iov_base + i->iov_offset;
1712 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1713 copied = bytes - left;
1714 } else {
1715 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1716 i->iov, i->iov_offset, bytes);
1717 }
1718 kunmap(page);
1719 return copied;
1720}
1721EXPORT_SYMBOL(iov_iter_copy_from_user);
1722
1723static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
1724{
1725 if (likely(i->nr_segs == 1)) {
1726 i->iov_offset += bytes;
1727 } else {
1728 const struct iovec *iov = i->iov;
1729 size_t base = i->iov_offset;
1730
1731 while (bytes) {
1732 int copy = min(bytes, iov->iov_len - base);
1733
1734 bytes -= copy;
1735 base += copy;
1736 if (iov->iov_len == base) {
1737 iov++;
1738 base = 0;
1739 }
1740 }
1741 i->iov = iov;
1742 i->iov_offset = base;
1743 }
1744}
1745
1746void iov_iter_advance(struct iov_iter *i, size_t bytes)
1747{
1748 BUG_ON(i->count < bytes);
1749
1750 __iov_iter_advance_iov(i, bytes);
1751 i->count -= bytes;
1752}
1753EXPORT_SYMBOL(iov_iter_advance);
1754
1755/*
1756 * Fault in the first iovec of the given iov_iter, to a maximum length
1757 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1758 * accessed (ie. because it is an invalid address).
1759 *
1760 * writev-intensive code may want this to prefault several iovecs -- that
1761 * would be possible (callers must not rely on the fact that _only_ the
1762 * first iovec will be faulted with the current implementation).
1763 */
1764int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1765{
1766 char __user *buf = i->iov->iov_base + i->iov_offset;
1767 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1768 return fault_in_pages_readable(buf, bytes);
1769}
1770EXPORT_SYMBOL(iov_iter_fault_in_readable);
1771
1772/*
1773 * Return the count of just the current iov_iter segment.
1774 */
1775size_t iov_iter_single_seg_count(struct iov_iter *i)
1776{
1777 const struct iovec *iov = i->iov;
1778 if (i->nr_segs == 1)
1779 return i->count;
1780 else
1781 return min(i->count, iov->iov_len - i->iov_offset);
1782}
1783EXPORT_SYMBOL(iov_iter_single_seg_count);
1784
1785/*
1718 * Performs necessary checks before doing a write 1786 * Performs necessary checks before doing a write
1719 * 1787 *
1720 * Can adjust writing position or amount of bytes to write. 1788 * Can adjust writing position or amount of bytes to write.
@@ -1796,6 +1864,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1796} 1864}
1797EXPORT_SYMBOL(generic_write_checks); 1865EXPORT_SYMBOL(generic_write_checks);
1798 1866
1867int pagecache_write_begin(struct file *file, struct address_space *mapping,
1868 loff_t pos, unsigned len, unsigned flags,
1869 struct page **pagep, void **fsdata)
1870{
1871 const struct address_space_operations *aops = mapping->a_ops;
1872
1873 if (aops->write_begin) {
1874 return aops->write_begin(file, mapping, pos, len, flags,
1875 pagep, fsdata);
1876 } else {
1877 int ret;
1878 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1879 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1880 struct inode *inode = mapping->host;
1881 struct page *page;
1882again:
1883 page = __grab_cache_page(mapping, index);
1884 *pagep = page;
1885 if (!page)
1886 return -ENOMEM;
1887
1888 if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
1889 /*
1890 * There is no way to resolve a short write situation
1891 * for a !Uptodate page (except by double copying in
1892 * the caller done by generic_perform_write_2copy).
1893 *
1894 * Instead, we have to bring it uptodate here.
1895 */
1896 ret = aops->readpage(file, page);
1897 page_cache_release(page);
1898 if (ret) {
1899 if (ret == AOP_TRUNCATED_PAGE)
1900 goto again;
1901 return ret;
1902 }
1903 goto again;
1904 }
1905
1906 ret = aops->prepare_write(file, page, offset, offset+len);
1907 if (ret) {
1908 unlock_page(page);
1909 page_cache_release(page);
1910 if (pos + len > inode->i_size)
1911 vmtruncate(inode, inode->i_size);
1912 }
1913 return ret;
1914 }
1915}
1916EXPORT_SYMBOL(pagecache_write_begin);
1917
1918int pagecache_write_end(struct file *file, struct address_space *mapping,
1919 loff_t pos, unsigned len, unsigned copied,
1920 struct page *page, void *fsdata)
1921{
1922 const struct address_space_operations *aops = mapping->a_ops;
1923 int ret;
1924
1925 if (aops->write_end) {
1926 mark_page_accessed(page);
1927 ret = aops->write_end(file, mapping, pos, len, copied,
1928 page, fsdata);
1929 } else {
1930 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1931 struct inode *inode = mapping->host;
1932
1933 flush_dcache_page(page);
1934 ret = aops->commit_write(file, page, offset, offset+len);
1935 unlock_page(page);
1936 mark_page_accessed(page);
1937 page_cache_release(page);
1938
1939 if (ret < 0) {
1940 if (pos + len > inode->i_size)
1941 vmtruncate(inode, inode->i_size);
1942 } else if (ret > 0)
1943 ret = min_t(size_t, copied, ret);
1944 else
1945 ret = copied;
1946 }
1947
1948 return ret;
1949}
1950EXPORT_SYMBOL(pagecache_write_end);
1951
1799ssize_t 1952ssize_t
1800generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 1953generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1801 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 1954 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1835,151 +1988,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1835} 1988}
1836EXPORT_SYMBOL(generic_file_direct_write); 1989EXPORT_SYMBOL(generic_file_direct_write);
1837 1990
1838ssize_t 1991/*
1839generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 1992 * Find or create a page at the given pagecache position. Return the locked
1840 unsigned long nr_segs, loff_t pos, loff_t *ppos, 1993 * page. This function is specifically for buffered writes.
1841 size_t count, ssize_t written) 1994 */
1995struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
1842{ 1996{
1843 struct file *file = iocb->ki_filp; 1997 int status;
1844 struct address_space * mapping = file->f_mapping; 1998 struct page *page;
1845 const struct address_space_operations *a_ops = mapping->a_ops; 1999repeat:
1846 struct inode *inode = mapping->host; 2000 page = find_lock_page(mapping, index);
1847 long status = 0; 2001 if (likely(page))
1848 struct page *page; 2002 return page;
1849 struct page *cached_page = NULL;
1850 size_t bytes;
1851 struct pagevec lru_pvec;
1852 const struct iovec *cur_iov = iov; /* current iovec */
1853 size_t iov_base = 0; /* offset in the current iovec */
1854 char __user *buf;
1855
1856 pagevec_init(&lru_pvec, 0);
1857 2003
1858 /* 2004 page = page_cache_alloc(mapping);
1859 * handle partial DIO write. Adjust cur_iov if needed. 2005 if (!page)
1860 */ 2006 return NULL;
1861 if (likely(nr_segs == 1)) 2007 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1862 buf = iov->iov_base + written; 2008 if (unlikely(status)) {
1863 else { 2009 page_cache_release(page);
1864 filemap_set_next_iovec(&cur_iov, &iov_base, written); 2010 if (status == -EEXIST)
1865 buf = cur_iov->iov_base + iov_base; 2011 goto repeat;
2012 return NULL;
1866 } 2013 }
2014 return page;
2015}
2016EXPORT_SYMBOL(__grab_cache_page);
2017
2018static ssize_t generic_perform_write_2copy(struct file *file,
2019 struct iov_iter *i, loff_t pos)
2020{
2021 struct address_space *mapping = file->f_mapping;
2022 const struct address_space_operations *a_ops = mapping->a_ops;
2023 struct inode *inode = mapping->host;
2024 long status = 0;
2025 ssize_t written = 0;
1867 2026
1868 do { 2027 do {
1869 unsigned long index; 2028 struct page *src_page;
1870 unsigned long offset; 2029 struct page *page;
1871 size_t copied; 2030 pgoff_t index; /* Pagecache index for current page */
2031 unsigned long offset; /* Offset into pagecache page */
2032 unsigned long bytes; /* Bytes to write to page */
2033 size_t copied; /* Bytes copied from user */
1872 2034
1873 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 2035 offset = (pos & (PAGE_CACHE_SIZE - 1));
1874 index = pos >> PAGE_CACHE_SHIFT; 2036 index = pos >> PAGE_CACHE_SHIFT;
1875 bytes = PAGE_CACHE_SIZE - offset; 2037 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2038 iov_iter_count(i));
1876 2039
1877 /* Limit the size of the copy to the caller's write size */ 2040 /*
1878 bytes = min(bytes, count); 2041 * a non-NULL src_page indicates that we're doing the
1879 2042 * copy via get_user_pages and kmap.
1880 /* We only need to worry about prefaulting when writes are from
1881 * user-space. NFSd uses vfs_writev with several non-aligned
1882 * segments in the vector, and limiting to one segment a time is
1883 * a noticeable performance for re-write
1884 */ 2043 */
1885 if (!segment_eq(get_fs(), KERNEL_DS)) { 2044 src_page = NULL;
1886 /*
1887 * Limit the size of the copy to that of the current
1888 * segment, because fault_in_pages_readable() doesn't
1889 * know how to walk segments.
1890 */
1891 bytes = min(bytes, cur_iov->iov_len - iov_base);
1892 2045
1893 /* 2046 /*
1894 * Bring in the user page that we will copy from 2047 * Bring in the user page that we will copy from _first_.
1895 * _first_. Otherwise there's a nasty deadlock on 2048 * Otherwise there's a nasty deadlock on copying from the
1896 * copying from the same page as we're writing to, 2049 * same page as we're writing to, without it being marked
1897 * without it being marked up-to-date. 2050 * up-to-date.
1898 */ 2051 *
1899 fault_in_pages_readable(buf, bytes); 2052 * Not only is this an optimisation, but it is also required
2053 * to check that the address is actually valid, when atomic
2054 * usercopies are used, below.
2055 */
2056 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2057 status = -EFAULT;
2058 break;
1900 } 2059 }
1901 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2060
2061 page = __grab_cache_page(mapping, index);
1902 if (!page) { 2062 if (!page) {
1903 status = -ENOMEM; 2063 status = -ENOMEM;
1904 break; 2064 break;
1905 } 2065 }
1906 2066
1907 if (unlikely(bytes == 0)) { 2067 /*
1908 status = 0; 2068 * non-uptodate pages cannot cope with short copies, and we
1909 copied = 0; 2069 * cannot take a pagefault with the destination page locked.
1910 goto zero_length_segment; 2070 * So pin the source page to copy it.
1911 } 2071 */
2072 if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2073 unlock_page(page);
1912 2074
1913 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2075 src_page = alloc_page(GFP_KERNEL);
1914 if (unlikely(status)) { 2076 if (!src_page) {
1915 loff_t isize = i_size_read(inode); 2077 page_cache_release(page);
2078 status = -ENOMEM;
2079 break;
2080 }
1916 2081
1917 if (status != AOP_TRUNCATED_PAGE) 2082 /*
2083 * Cannot get_user_pages with a page locked for the
2084 * same reason as we can't take a page fault with a
2085 * page locked (as explained below).
2086 */
2087 copied = iov_iter_copy_from_user(src_page, i,
2088 offset, bytes);
2089 if (unlikely(copied == 0)) {
2090 status = -EFAULT;
2091 page_cache_release(page);
2092 page_cache_release(src_page);
2093 break;
2094 }
2095 bytes = copied;
2096
2097 lock_page(page);
2098 /*
2099 * Can't handle the page going uptodate here, because
2100 * that means we would use non-atomic usercopies, which
2101 * zero out the tail of the page, which can cause
2102 * zeroes to become transiently visible. We could just
2103 * use a non-zeroing copy, but the APIs aren't too
2104 * consistent.
2105 */
2106 if (unlikely(!page->mapping || PageUptodate(page))) {
1918 unlock_page(page); 2107 unlock_page(page);
1919 page_cache_release(page); 2108 page_cache_release(page);
1920 if (status == AOP_TRUNCATED_PAGE) 2109 page_cache_release(src_page);
1921 continue; 2110 continue;
2111 }
2112 }
2113
2114 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2115 if (unlikely(status))
2116 goto fs_write_aop_error;
2117
2118 if (!src_page) {
1922 /* 2119 /*
1923 * prepare_write() may have instantiated a few blocks 2120 * Must not enter the pagefault handler here, because
1924 * outside i_size. Trim these off again. 2121 * we hold the page lock, so we might recursively
2122 * deadlock on the same lock, or get an ABBA deadlock
2123 * against a different lock, or against the mmap_sem
2124 * (which nests outside the page lock). So increment
2125 * preempt count, and use _atomic usercopies.
2126 *
2127 * The page is uptodate so we are OK to encounter a
2128 * short copy: if unmodified parts of the page are
2129 * marked dirty and written out to disk, it doesn't
2130 * really matter.
1925 */ 2131 */
1926 if (pos + bytes > isize) 2132 pagefault_disable();
1927 vmtruncate(inode, isize); 2133 copied = iov_iter_copy_from_user_atomic(page, i,
1928 break; 2134 offset, bytes);
2135 pagefault_enable();
2136 } else {
2137 void *src, *dst;
2138 src = kmap_atomic(src_page, KM_USER0);
2139 dst = kmap_atomic(page, KM_USER1);
2140 memcpy(dst + offset, src + offset, bytes);
2141 kunmap_atomic(dst, KM_USER1);
2142 kunmap_atomic(src, KM_USER0);
2143 copied = bytes;
1929 } 2144 }
1930 if (likely(nr_segs == 1))
1931 copied = filemap_copy_from_user(page, offset,
1932 buf, bytes);
1933 else
1934 copied = filemap_copy_from_user_iovec(page, offset,
1935 cur_iov, iov_base, bytes);
1936 flush_dcache_page(page); 2145 flush_dcache_page(page);
2146
1937 status = a_ops->commit_write(file, page, offset, offset+bytes); 2147 status = a_ops->commit_write(file, page, offset, offset+bytes);
1938 if (status == AOP_TRUNCATED_PAGE) { 2148 if (unlikely(status < 0))
1939 page_cache_release(page); 2149 goto fs_write_aop_error;
1940 continue; 2150 if (unlikely(status > 0)) /* filesystem did partial write */
1941 } 2151 copied = min_t(size_t, copied, status);
1942zero_length_segment: 2152
1943 if (likely(copied >= 0)) {
1944 if (!status)
1945 status = copied;
1946
1947 if (status >= 0) {
1948 written += status;
1949 count -= status;
1950 pos += status;
1951 buf += status;
1952 if (unlikely(nr_segs > 1)) {
1953 filemap_set_next_iovec(&cur_iov,
1954 &iov_base, status);
1955 if (count)
1956 buf = cur_iov->iov_base +
1957 iov_base;
1958 } else {
1959 iov_base += status;
1960 }
1961 }
1962 }
1963 if (unlikely(copied != bytes))
1964 if (status >= 0)
1965 status = -EFAULT;
1966 unlock_page(page); 2153 unlock_page(page);
1967 mark_page_accessed(page); 2154 mark_page_accessed(page);
1968 page_cache_release(page); 2155 page_cache_release(page);
1969 if (status < 0) 2156 if (src_page)
1970 break; 2157 page_cache_release(src_page);
2158
2159 iov_iter_advance(i, copied);
2160 pos += copied;
2161 written += copied;
2162
1971 balance_dirty_pages_ratelimited(mapping); 2163 balance_dirty_pages_ratelimited(mapping);
1972 cond_resched(); 2164 cond_resched();
1973 } while (count); 2165 continue;
1974 *ppos = pos;
1975 2166
1976 if (cached_page) 2167fs_write_aop_error:
1977 page_cache_release(cached_page); 2168 unlock_page(page);
2169 page_cache_release(page);
2170 if (src_page)
2171 page_cache_release(src_page);
2172
2173 /*
2174 * prepare_write() may have instantiated a few blocks
2175 * outside i_size. Trim these off again. Don't need
2176 * i_size_read because we hold i_mutex.
2177 */
2178 if (pos + bytes > inode->i_size)
2179 vmtruncate(inode, inode->i_size);
2180 break;
2181 } while (iov_iter_count(i));
2182
2183 return written ? written : status;
2184}
2185
2186static ssize_t generic_perform_write(struct file *file,
2187 struct iov_iter *i, loff_t pos)
2188{
2189 struct address_space *mapping = file->f_mapping;
2190 const struct address_space_operations *a_ops = mapping->a_ops;
2191 long status = 0;
2192 ssize_t written = 0;
2193 unsigned int flags = 0;
1978 2194
1979 /* 2195 /*
1980 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC 2196 * Copies from kernel address space cannot fail (NFSD is a big user).
1981 */ 2197 */
2198 if (segment_eq(get_fs(), KERNEL_DS))
2199 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2200
2201 do {
2202 struct page *page;
2203 pgoff_t index; /* Pagecache index for current page */
2204 unsigned long offset; /* Offset into pagecache page */
2205 unsigned long bytes; /* Bytes to write to page */
2206 size_t copied; /* Bytes copied from user */
2207 void *fsdata;
2208
2209 offset = (pos & (PAGE_CACHE_SIZE - 1));
2210 index = pos >> PAGE_CACHE_SHIFT;
2211 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2212 iov_iter_count(i));
2213
2214again:
2215
2216 /*
2217 * Bring in the user page that we will copy from _first_.
2218 * Otherwise there's a nasty deadlock on copying from the
2219 * same page as we're writing to, without it being marked
2220 * up-to-date.
2221 *
2222 * Not only is this an optimisation, but it is also required
2223 * to check that the address is actually valid, when atomic
2224 * usercopies are used, below.
2225 */
2226 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2227 status = -EFAULT;
2228 break;
2229 }
2230
2231 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2232 &page, &fsdata);
2233 if (unlikely(status))
2234 break;
2235
2236 pagefault_disable();
2237 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2238 pagefault_enable();
2239 flush_dcache_page(page);
2240
2241 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2242 page, fsdata);
2243 if (unlikely(status < 0))
2244 break;
2245 copied = status;
2246
2247 cond_resched();
2248
2249 if (unlikely(copied == 0)) {
2250 /*
2251 * If we were unable to copy any data at all, we must
2252 * fall back to a single segment length write.
2253 *
2254 * If we didn't fallback here, we could livelock
2255 * because not all segments in the iov can be copied at
2256 * once without a pagefault.
2257 */
2258 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2259 iov_iter_single_seg_count(i));
2260 goto again;
2261 }
2262 iov_iter_advance(i, copied);
2263 pos += copied;
2264 written += copied;
2265
2266 balance_dirty_pages_ratelimited(mapping);
2267
2268 } while (iov_iter_count(i));
2269
2270 return written ? written : status;
2271}
2272
2273ssize_t
2274generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2275 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2276 size_t count, ssize_t written)
2277{
2278 struct file *file = iocb->ki_filp;
2279 struct address_space *mapping = file->f_mapping;
2280 const struct address_space_operations *a_ops = mapping->a_ops;
2281 struct inode *inode = mapping->host;
2282 ssize_t status;
2283 struct iov_iter i;
2284
2285 iov_iter_init(&i, iov, nr_segs, count, written);
2286 if (a_ops->write_begin)
2287 status = generic_perform_write(file, &i, pos);
2288 else
2289 status = generic_perform_write_2copy(file, &i, pos);
2290
1982 if (likely(status >= 0)) { 2291 if (likely(status >= 0)) {
2292 written += status;
2293 *ppos = pos + status;
2294
2295 /*
2296 * For now, when the user asks for O_SYNC, we'll actually give
2297 * O_DSYNC
2298 */
1983 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2299 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1984 if (!a_ops->writepage || !is_sync_kiocb(iocb)) 2300 if (!a_ops->writepage || !is_sync_kiocb(iocb))
1985 status = generic_osync_inode(inode, mapping, 2301 status = generic_osync_inode(inode, mapping,
@@ -1995,7 +2311,6 @@ zero_length_segment:
1995 if (unlikely(file->f_flags & O_DIRECT) && written) 2311 if (unlikely(file->f_flags & O_DIRECT) && written)
1996 status = filemap_write_and_wait(mapping); 2312 status = filemap_write_and_wait(mapping);
1997 2313
1998 pagevec_lru_add(&lru_pvec);
1999 return written ? written : status; 2314 return written ? written : status;
2000} 2315}
2001EXPORT_SYMBOL(generic_file_buffered_write); 2316EXPORT_SYMBOL(generic_file_buffered_write);
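
To make the new buffered-write path above easier to follow, here is a minimal sketch of one iteration of generic_perform_write(): fault the source iovec in before any page lock is taken, copy with pagefaults disabled while the pagecache page is locked, and retry with a single-segment length if nothing could be copied. The iov_iter helpers and the write_begin/write_end calls are the ones introduced in this patch; write_one_chunk() itself is illustrative only and not part of the patch.

static ssize_t write_one_chunk(struct file *file, struct iov_iter *i,
			       loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);
	size_t bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
			     iov_iter_count(i));
	struct page *page;
	void *fsdata;
	size_t copied;
	int status;

	/* fault the user buffer in before any page lock is held */
	if (iov_iter_fault_in_readable(i, bytes))
		return -EFAULT;

	status = a_ops->write_begin(file, mapping, pos, bytes, 0,
				    &page, &fsdata);
	if (status)
		return status;

	/* the page is locked now, so no pagefaults may be taken */
	pagefault_disable();
	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
	pagefault_enable();
	flush_dcache_page(page);

	status = a_ops->write_end(file, mapping, pos, bytes, copied,
				  page, fsdata);
	if (status < 0)
		return status;

	/* status == 0 means the caller should retry with a shorter,
	 * single-segment length (iov_iter_single_seg_count()) to
	 * avoid livelocking on an unfaultable segment */
	iov_iter_advance(i, status);
	return status;
}
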
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@
1/*
2 * linux/mm/filemap.h
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7#ifndef __FILEMAP_H
8#define __FILEMAP_H
9
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/highmem.h>
14#include <linux/uio.h>
15#include <linux/uaccess.h>
16
17size_t
18__filemap_copy_from_user_iovec_inatomic(char *vaddr,
19 const struct iovec *iov,
20 size_t base,
21 size_t bytes);
22
23/*
24 * Copy as much as we can into the page and return the number of bytes which
25 * were sucessfully copied. If a fault is encountered then clear the page
26 * out to (offset+bytes) and return the number of bytes which were copied.
27 *
28 * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
29 * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
30 * and if the following non-atomic copy succeeds, then there is a small window
31 * where the target page contains neither the data before the write, nor the
32 * data after the write (it contains zero). A read at this time will see
33 * data that is inconsistent with any ordering of the read and the write.
34 * (This has been detected in practice).
35 */
36static inline size_t
37filemap_copy_from_user(struct page *page, unsigned long offset,
38 const char __user *buf, unsigned bytes)
39{
40 char *kaddr;
41 int left;
42
43 kaddr = kmap_atomic(page, KM_USER0);
44 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
45 kunmap_atomic(kaddr, KM_USER0);
46
47 if (left != 0) {
48 /* Do it the slow way */
49 kaddr = kmap(page);
50 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
51 kunmap(page);
52 }
53 return bytes - left;
54}
55
56/*
57 * This has the same sideeffects and return value as filemap_copy_from_user().
58 * The difference is that on a fault we need to memset the remainder of the
59 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
60 * single-segment behaviour.
61 */
62static inline size_t
63filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
64 const struct iovec *iov, size_t base, size_t bytes)
65{
66 char *kaddr;
67 size_t copied;
68
69 kaddr = kmap_atomic(page, KM_USER0);
70 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
71 base, bytes);
72 kunmap_atomic(kaddr, KM_USER0);
73 if (copied != bytes) {
74 kaddr = kmap(page);
75 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
76 base, bytes);
77 if (bytes - copied)
78 memset(kaddr + offset + copied, 0, bytes - copied);
79 kunmap(page);
80 }
81 return copied;
82}
83
84static inline void
85filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
86{
87 const struct iovec *iov = *iovp;
88 size_t base = *basep;
89
90 do {
91 int copy = min(bytes, iov->iov_len - base);
92
93 bytes -= copy;
94 base += copy;
95 if (iov->iov_len == base) {
96 iov++;
97 base = 0;
98 }
99 } while (bytes);
100 *iovp = iov;
101 *basep = base;
102}
103#endif
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 53ee6a299635..32132f3cd641 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,7 +15,6 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include "filemap.h"
19 18
20/* 19/*
21 * We do use our own empty page to avoid interference with other users 20 * We do use our own empty page to avoid interference with other users
@@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
288 unsigned long index; 287 unsigned long index;
289 unsigned long offset; 288 unsigned long offset;
290 size_t copied; 289 size_t copied;
290 char *kaddr;
291 291
292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
293 index = pos >> PAGE_CACHE_SHIFT; 293 index = pos >> PAGE_CACHE_SHIFT;
@@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf,
295 if (bytes > count) 295 if (bytes > count)
296 bytes = count; 296 bytes = count;
297 297
298 /*
299 * Bring in the user page that we will copy from _first_.
300 * Otherwise there's a nasty deadlock on copying from the
301 * same page as we're writing to, without it being marked
302 * up-to-date.
303 */
304 fault_in_pages_readable(buf, bytes);
305
306 page = a_ops->get_xip_page(mapping, 298 page = a_ops->get_xip_page(mapping,
307 index*(PAGE_SIZE/512), 0); 299 index*(PAGE_SIZE/512), 0);
308 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { 300 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
@@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf,
319 break; 311 break;
320 } 312 }
321 313
322 copied = filemap_copy_from_user(page, offset, buf, bytes); 314 fault_in_pages_readable(buf, bytes);
315 kaddr = kmap_atomic(page, KM_USER0);
316 copied = bytes -
317 __copy_from_user_inatomic_nocache(kaddr, buf, bytes);
318 kunmap_atomic(kaddr, KM_USER0);
323 flush_dcache_page(page); 319 flush_dcache_page(page);
320
324 if (likely(copied > 0)) { 321 if (likely(copied > 0)) {
325 status = copied; 322 status = copied;
326 323
diff --git a/mm/fremap.c b/mm/fremap.c
index 95bcb5641c72..14bd3bf7826e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003 6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */ 7 */
8 8#include <linux/backing-dev.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/file.h> 11#include <linux/file.h>
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
97 97
98} 98}
99 99
100/*** 100/**
101 * sys_remap_file_pages - remap arbitrary pages of a shared backing store 101 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
102 * file within an existing vma.
103 * @start: start of the remapped virtual memory range 102 * @start: start of the remapped virtual memory range
104 * @size: size of the remapped virtual memory range 103 * @size: size of the remapped virtual memory range
105 * @prot: new protection bits of the range 104 * @prot: new protection bits of the range (see NOTE)
106 * @pgoff: to be mapped page of the backing store file 105 * @pgoff: to-be-mapped page of the backing store file
107 * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. 106 * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
108 * 107 *
109 * this syscall works purely via pagetables, so it's the most efficient 108 * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
109 * (shared backing store file).
110 *
111 * This syscall works purely via pagetables, so it's the most efficient
110 * way to map the same (large) file into a given virtual window. Unlike 112 * way to map the same (large) file into a given virtual window. Unlike
111 * mmap()/mremap() it does not create any new vmas. The new mappings are 113 * mmap()/mremap() it does not create any new vmas. The new mappings are
112 * also safe across swapout. 114 * also safe across swapout.
113 * 115 *
114 * NOTE: the 'prot' parameter right now is ignored, and the vma's default 116 * NOTE: the 'prot' parameter right now is ignored (but must be zero),
115 * protection is used. Arbitrary protections might be implemented in the 117 * and the vma's default protection is used. Arbitrary protections
116 * future. 118 * might be implemented in the future.
117 */ 119 */
118asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, 120asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
119 unsigned long __prot, unsigned long pgoff, unsigned long flags) 121 unsigned long prot, unsigned long pgoff, unsigned long flags)
120{ 122{
121 struct mm_struct *mm = current->mm; 123 struct mm_struct *mm = current->mm;
122 struct address_space *mapping; 124 struct address_space *mapping;
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
125 int err = -EINVAL; 127 int err = -EINVAL;
126 int has_write_lock = 0; 128 int has_write_lock = 0;
127 129
128 if (__prot) 130 if (prot)
129 return err; 131 return err;
130 /* 132 /*
131 * Sanitize the syscall parameters: 133 * Sanitize the syscall parameters:
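
The kerneldoc rewrite above spells out the user-visible contract of sys_remap_file_pages(): it only rearranges pages inside an existing shared file mapping, and the prot argument must be zero. A hypothetical userspace illustration of that contract follows; the file name and sizes are invented, and the file is assumed to be large enough to contain the target page.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/datafile", O_RDWR);	/* hypothetical file */
	char *win;

	if (fd < 0)
		return 1;

	/* the window must already be a shared file mapping */
	win = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (win == MAP_FAILED)
		return 1;

	/* rewire the second page of the window to file page 10;
	 * prot must be 0, the mapping's own protection is used */
	if (remap_file_pages(win + pg, pg, 0, 10, 0) != 0)
		return 1;

	return 0;
}
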
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eab8c428cc93..ae2959bb59cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,12 +23,16 @@
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
26unsigned long max_huge_pages; 27unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 28static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 29static unsigned int nr_huge_pages_node[MAX_NUMNODES];
29static unsigned int free_huge_pages_node[MAX_NUMNODES]; 30static unsigned int free_huge_pages_node[MAX_NUMNODES];
31static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
30static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 32static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
31unsigned long hugepages_treat_as_movable; 33unsigned long hugepages_treat_as_movable;
34int hugetlb_dynamic_pool;
35static int hugetlb_next_nid;
32 36
33/* 37/*
34 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 38 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
85 list_del(&page->lru); 89 list_del(&page->lru);
86 free_huge_pages--; 90 free_huge_pages--;
87 free_huge_pages_node[nid]--; 91 free_huge_pages_node[nid]--;
92 if (vma && vma->vm_flags & VM_MAYSHARE)
93 resv_huge_pages--;
88 break; 94 break;
89 } 95 }
90 } 96 }
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
92 return page; 98 return page;
93} 99}
94 100
101static void update_and_free_page(struct page *page)
102{
103 int i;
104 nr_huge_pages--;
105 nr_huge_pages_node[page_to_nid(page)]--;
106 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
107 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
108 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
109 1 << PG_private | 1<< PG_writeback);
110 }
111 set_compound_page_dtor(page, NULL);
112 set_page_refcounted(page);
113 __free_pages(page, HUGETLB_PAGE_ORDER);
114}
115
95static void free_huge_page(struct page *page) 116static void free_huge_page(struct page *page)
96{ 117{
97 BUG_ON(page_count(page)); 118 int nid = page_to_nid(page);
98 119
120 BUG_ON(page_count(page));
99 INIT_LIST_HEAD(&page->lru); 121 INIT_LIST_HEAD(&page->lru);
100 122
101 spin_lock(&hugetlb_lock); 123 spin_lock(&hugetlb_lock);
102 enqueue_huge_page(page); 124 if (surplus_huge_pages_node[nid]) {
125 update_and_free_page(page);
126 surplus_huge_pages--;
127 surplus_huge_pages_node[nid]--;
128 } else {
129 enqueue_huge_page(page);
130 }
103 spin_unlock(&hugetlb_lock); 131 spin_unlock(&hugetlb_lock);
104} 132}
105 133
106static int alloc_fresh_huge_page(void) 134/*
135 * Increment or decrement surplus_huge_pages. Keep node-specific counters
136 * balanced by operating on them in a round-robin fashion.
137 * Returns 1 if an adjustment was made.
138 */
139static int adjust_pool_surplus(int delta)
107{ 140{
108 static int prev_nid; 141 static int prev_nid;
109 struct page *page; 142 int nid = prev_nid;
110 int nid; 143 int ret = 0;
144
145 VM_BUG_ON(delta != -1 && delta != 1);
146 do {
147 nid = next_node(nid, node_online_map);
148 if (nid == MAX_NUMNODES)
149 nid = first_node(node_online_map);
150
151 /* To shrink on this node, there must be a surplus page */
152 if (delta < 0 && !surplus_huge_pages_node[nid])
153 continue;
154 /* Surplus cannot exceed the total number of pages */
155 if (delta > 0 && surplus_huge_pages_node[nid] >=
156 nr_huge_pages_node[nid])
157 continue;
158
159 surplus_huge_pages += delta;
160 surplus_huge_pages_node[nid] += delta;
161 ret = 1;
162 break;
163 } while (nid != prev_nid);
111 164
112 /*
113 * Copy static prev_nid to local nid, work on that, then copy it
114 * back to prev_nid afterwards: otherwise there's a window in which
115 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
116 * But we don't need to use a spin_lock here: it really doesn't
117 * matter if occasionally a racer chooses the same nid as we do.
118 */
119 nid = next_node(prev_nid, node_online_map);
120 if (nid == MAX_NUMNODES)
121 nid = first_node(node_online_map);
122 prev_nid = nid; 165 prev_nid = nid;
166 return ret;
167}
168
169static struct page *alloc_fresh_huge_page_node(int nid)
170{
171 struct page *page;
123 172
124 page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, 173 page = alloc_pages_node(nid,
174 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
175 HUGETLB_PAGE_ORDER);
176 if (page) {
177 set_compound_page_dtor(page, free_huge_page);
178 spin_lock(&hugetlb_lock);
179 nr_huge_pages++;
180 nr_huge_pages_node[nid]++;
181 spin_unlock(&hugetlb_lock);
182 put_page(page); /* free it into the hugepage allocator */
183 }
184
185 return page;
186}
187
188static int alloc_fresh_huge_page(void)
189{
190 struct page *page;
191 int start_nid;
192 int next_nid;
193 int ret = 0;
194
195 start_nid = hugetlb_next_nid;
196
197 do {
198 page = alloc_fresh_huge_page_node(hugetlb_next_nid);
199 if (page)
200 ret = 1;
201 /*
202 * Use a helper variable to find the next node and then
203 * copy it back to hugetlb_next_nid afterwards:
204 * otherwise there's a window in which a racer might
205 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
206 * But we don't need to use a spin_lock here: it really
207 * doesn't matter if occasionally a racer chooses the
208 * same nid as we do. Move nid forward in the mask even
209 * if we just successfully allocated a hugepage so that
210 * the next caller gets hugepages on the next node.
211 */
212 next_nid = next_node(hugetlb_next_nid, node_online_map);
213 if (next_nid == MAX_NUMNODES)
214 next_nid = first_node(node_online_map);
215 hugetlb_next_nid = next_nid;
216 } while (!page && hugetlb_next_nid != start_nid);
217
218 return ret;
219}
220
221static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
222 unsigned long address)
223{
224 struct page *page;
225
226 /* Check if the dynamic pool is enabled */
227 if (!hugetlb_dynamic_pool)
228 return NULL;
229
230 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
125 HUGETLB_PAGE_ORDER); 231 HUGETLB_PAGE_ORDER);
126 if (page) { 232 if (page) {
127 set_compound_page_dtor(page, free_huge_page); 233 set_compound_page_dtor(page, free_huge_page);
128 spin_lock(&hugetlb_lock); 234 spin_lock(&hugetlb_lock);
129 nr_huge_pages++; 235 nr_huge_pages++;
130 nr_huge_pages_node[page_to_nid(page)]++; 236 nr_huge_pages_node[page_to_nid(page)]++;
237 surplus_huge_pages++;
238 surplus_huge_pages_node[page_to_nid(page)]++;
131 spin_unlock(&hugetlb_lock); 239 spin_unlock(&hugetlb_lock);
132 put_page(page); /* free it into the hugepage allocator */
133 return 1;
134 } 240 }
135 return 0; 241
242 return page;
243}
244
245/*
 246 * Increase the hugetlb pool such that it can accommodate a reservation
247 * of size 'delta'.
248 */
249static int gather_surplus_pages(int delta)
250{
251 struct list_head surplus_list;
252 struct page *page, *tmp;
253 int ret, i;
254 int needed, allocated;
255
256 needed = (resv_huge_pages + delta) - free_huge_pages;
257 if (needed <= 0)
258 return 0;
259
260 allocated = 0;
261 INIT_LIST_HEAD(&surplus_list);
262
263 ret = -ENOMEM;
264retry:
265 spin_unlock(&hugetlb_lock);
266 for (i = 0; i < needed; i++) {
267 page = alloc_buddy_huge_page(NULL, 0);
268 if (!page) {
269 /*
270 * We were not able to allocate enough pages to
271 * satisfy the entire reservation so we free what
272 * we've allocated so far.
273 */
274 spin_lock(&hugetlb_lock);
275 needed = 0;
276 goto free;
277 }
278
279 list_add(&page->lru, &surplus_list);
280 }
281 allocated += needed;
282
283 /*
284 * After retaking hugetlb_lock, we need to recalculate 'needed'
285 * because either resv_huge_pages or free_huge_pages may have changed.
286 */
287 spin_lock(&hugetlb_lock);
288 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
289 if (needed > 0)
290 goto retry;
291
292 /*
293 * The surplus_list now contains _at_least_ the number of extra pages
 294 * needed to accommodate the reservation. Add the appropriate number
295 * of pages to the hugetlb pool and free the extras back to the buddy
296 * allocator.
297 */
298 needed += allocated;
299 ret = 0;
300free:
301 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
302 list_del(&page->lru);
303 if ((--needed) >= 0)
304 enqueue_huge_page(page);
305 else {
306 /*
307 * Decrement the refcount and free the page using its
308 * destructor. This must be done with hugetlb_lock
309 * unlocked which is safe because free_huge_page takes
310 * hugetlb_lock before deciding how to free the page.
311 */
312 spin_unlock(&hugetlb_lock);
313 put_page(page);
314 spin_lock(&hugetlb_lock);
315 }
316 }
317
318 return ret;
319}
320
321/*
322 * When releasing a hugetlb pool reservation, any surplus pages that were
323 * allocated to satisfy the reservation must be explicitly freed if they were
324 * never used.
325 */
326void return_unused_surplus_pages(unsigned long unused_resv_pages)
327{
328 static int nid = -1;
329 struct page *page;
330 unsigned long nr_pages;
331
332 nr_pages = min(unused_resv_pages, surplus_huge_pages);
333
334 while (nr_pages) {
335 nid = next_node(nid, node_online_map);
336 if (nid == MAX_NUMNODES)
337 nid = first_node(node_online_map);
338
339 if (!surplus_huge_pages_node[nid])
340 continue;
341
342 if (!list_empty(&hugepage_freelists[nid])) {
343 page = list_entry(hugepage_freelists[nid].next,
344 struct page, lru);
345 list_del(&page->lru);
346 update_and_free_page(page);
347 free_huge_pages--;
348 free_huge_pages_node[nid]--;
349 surplus_huge_pages--;
350 surplus_huge_pages_node[nid]--;
351 nr_pages--;
352 }
353 }
136} 354}
137 355
138static struct page *alloc_huge_page(struct vm_area_struct *vma, 356static struct page *alloc_huge_page(struct vm_area_struct *vma,
139 unsigned long addr) 357 unsigned long addr)
140{ 358{
141 struct page *page; 359 struct page *page = NULL;
360 int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
142 361
143 spin_lock(&hugetlb_lock); 362 spin_lock(&hugetlb_lock);
144 if (vma->vm_flags & VM_MAYSHARE) 363 if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
145 resv_huge_pages--;
146 else if (free_huge_pages <= resv_huge_pages)
147 goto fail; 364 goto fail;
148 365
149 page = dequeue_huge_page(vma, addr); 366 page = dequeue_huge_page(vma, addr);
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
155 return page; 372 return page;
156 373
157fail: 374fail:
158 if (vma->vm_flags & VM_MAYSHARE)
159 resv_huge_pages++;
160 spin_unlock(&hugetlb_lock); 375 spin_unlock(&hugetlb_lock);
161 return NULL; 376
377 /*
378 * Private mappings do not use reserved huge pages so the allocation
379 * may have failed due to an undersized hugetlb pool. Try to grab a
380 * surplus huge page from the buddy allocator.
381 */
382 if (!use_reserved_page)
383 page = alloc_buddy_huge_page(vma, addr);
384
385 return page;
162} 386}
163 387
164static int __init hugetlb_init(void) 388static int __init hugetlb_init(void)
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void)
171 for (i = 0; i < MAX_NUMNODES; ++i) 395 for (i = 0; i < MAX_NUMNODES; ++i)
172 INIT_LIST_HEAD(&hugepage_freelists[i]); 396 INIT_LIST_HEAD(&hugepage_freelists[i]);
173 397
398 hugetlb_next_nid = first_node(node_online_map);
399
174 for (i = 0; i < max_huge_pages; ++i) { 400 for (i = 0; i < max_huge_pages; ++i) {
175 if (!alloc_fresh_huge_page()) 401 if (!alloc_fresh_huge_page())
176 break; 402 break;
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
201} 427}
202 428
203#ifdef CONFIG_SYSCTL 429#ifdef CONFIG_SYSCTL
204static void update_and_free_page(struct page *page)
205{
206 int i;
207 nr_huge_pages--;
208 nr_huge_pages_node[page_to_nid(page)]--;
209 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
210 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
211 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
212 1 << PG_private | 1<< PG_writeback);
213 }
214 set_compound_page_dtor(page, NULL);
215 set_page_refcounted(page);
216 __free_pages(page, HUGETLB_PAGE_ORDER);
217}
218
219#ifdef CONFIG_HIGHMEM 430#ifdef CONFIG_HIGHMEM
220static void try_to_free_low(unsigned long count) 431static void try_to_free_low(unsigned long count)
221{ 432{
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count)
224 for (i = 0; i < MAX_NUMNODES; ++i) { 435 for (i = 0; i < MAX_NUMNODES; ++i) {
225 struct page *page, *next; 436 struct page *page, *next;
226 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 437 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
438 if (count >= nr_huge_pages)
439 return;
227 if (PageHighMem(page)) 440 if (PageHighMem(page))
228 continue; 441 continue;
229 list_del(&page->lru); 442 list_del(&page->lru);
230 update_and_free_page(page); 443 update_and_free_page(page);
231 free_huge_pages--; 444 free_huge_pages--;
232 free_huge_pages_node[page_to_nid(page)]--; 445 free_huge_pages_node[page_to_nid(page)]--;
233 if (count >= nr_huge_pages)
234 return;
235 } 446 }
236 } 447 }
237} 448}
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count)
241} 452}
242#endif 453#endif
243 454
455#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
244static unsigned long set_max_huge_pages(unsigned long count) 456static unsigned long set_max_huge_pages(unsigned long count)
245{ 457{
246 while (count > nr_huge_pages) { 458 unsigned long min_count, ret;
247 if (!alloc_fresh_huge_page())
248 return nr_huge_pages;
249 }
250 if (count >= nr_huge_pages)
251 return nr_huge_pages;
252 459
460 /*
461 * Increase the pool size
462 * First take pages out of surplus state. Then make up the
463 * remaining difference by allocating fresh huge pages.
464 */
253 spin_lock(&hugetlb_lock); 465 spin_lock(&hugetlb_lock);
254 count = max(count, resv_huge_pages); 466 while (surplus_huge_pages && count > persistent_huge_pages) {
255 try_to_free_low(count); 467 if (!adjust_pool_surplus(-1))
256 while (count < nr_huge_pages) { 468 break;
469 }
470
471 while (count > persistent_huge_pages) {
472 int ret;
473 /*
474 * If this allocation races such that we no longer need the
475 * page, free_huge_page will handle it by freeing the page
476 * and reducing the surplus.
477 */
478 spin_unlock(&hugetlb_lock);
479 ret = alloc_fresh_huge_page();
480 spin_lock(&hugetlb_lock);
481 if (!ret)
482 goto out;
483
484 }
485
486 /*
487 * Decrease the pool size
488 * First return free pages to the buddy allocator (being careful
489 * to keep enough around to satisfy reservations). Then place
490 * pages into surplus state as needed so the pool will shrink
491 * to the desired size as pages become free.
492 */
493 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
494 min_count = max(count, min_count);
495 try_to_free_low(min_count);
496 while (min_count < persistent_huge_pages) {
257 struct page *page = dequeue_huge_page(NULL, 0); 497 struct page *page = dequeue_huge_page(NULL, 0);
258 if (!page) 498 if (!page)
259 break; 499 break;
260 update_and_free_page(page); 500 update_and_free_page(page);
261 } 501 }
502 while (count < persistent_huge_pages) {
503 if (!adjust_pool_surplus(1))
504 break;
505 }
506out:
507 ret = persistent_huge_pages;
262 spin_unlock(&hugetlb_lock); 508 spin_unlock(&hugetlb_lock);
263 return nr_huge_pages; 509 return ret;
264} 510}
265 511
266int hugetlb_sysctl_handler(struct ctl_table *table, int write, 512int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf)
292 "HugePages_Total: %5lu\n" 538 "HugePages_Total: %5lu\n"
293 "HugePages_Free: %5lu\n" 539 "HugePages_Free: %5lu\n"
294 "HugePages_Rsvd: %5lu\n" 540 "HugePages_Rsvd: %5lu\n"
541 "HugePages_Surp: %5lu\n"
295 "Hugepagesize: %5lu kB\n", 542 "Hugepagesize: %5lu kB\n",
296 nr_huge_pages, 543 nr_huge_pages,
297 free_huge_pages, 544 free_huge_pages,
298 resv_huge_pages, 545 resv_huge_pages,
546 surplus_huge_pages,
299 HPAGE_SIZE/1024); 547 HPAGE_SIZE/1024);
300} 548}
301 549
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
355 entry = pte_mkwrite(pte_mkdirty(*ptep)); 603 entry = pte_mkwrite(pte_mkdirty(*ptep));
356 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 604 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
357 update_mmu_cache(vma, address, entry); 605 update_mmu_cache(vma, address, entry);
358 lazy_mmu_prot_update(entry);
359 } 606 }
360} 607}
361 608
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
708 pte = huge_ptep_get_and_clear(mm, address, ptep); 955 pte = huge_ptep_get_and_clear(mm, address, ptep);
709 pte = pte_mkhuge(pte_modify(pte, newprot)); 956 pte = pte_mkhuge(pte_modify(pte, newprot));
710 set_huge_pte_at(mm, address, ptep, pte); 957 set_huge_pte_at(mm, address, ptep, pte);
711 lazy_mmu_prot_update(pte);
712 } 958 }
713 } 959 }
714 spin_unlock(&mm->page_table_lock); 960 spin_unlock(&mm->page_table_lock);
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta)
843 int ret = -ENOMEM; 1089 int ret = -ENOMEM;
844 1090
845 spin_lock(&hugetlb_lock); 1091 spin_lock(&hugetlb_lock);
846 if ((delta + resv_huge_pages) <= free_huge_pages) {
847 resv_huge_pages += delta;
848 ret = 0;
849 }
850 spin_unlock(&hugetlb_lock);
851 return ret;
852}
853
854int hugetlb_reserve_pages(struct inode *inode, long from, long to)
855{
856 long ret, chg;
857
858 chg = region_chg(&inode->i_mapping->private_list, from, to);
859 if (chg < 0)
860 return chg;
861 /* 1092 /*
862 * When cpuset is configured, it breaks the strict hugetlb page 1093 * When cpuset is configured, it breaks the strict hugetlb page
863 * reservation as the accounting is done on a global variable. Such 1094 * reservation as the accounting is done on a global variable. Such
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
875 * a best attempt and hopefully to minimize the impact of changing 1106 * a best attempt and hopefully to minimize the impact of changing
876 * semantics that cpuset has. 1107 * semantics that cpuset has.
877 */ 1108 */
878 if (chg > cpuset_mems_nr(free_huge_pages_node)) 1109 if (delta > 0) {
879 return -ENOMEM; 1110 if (gather_surplus_pages(delta) < 0)
1111 goto out;
1112
1113 if (delta > cpuset_mems_nr(free_huge_pages_node))
1114 goto out;
1115 }
1116
1117 ret = 0;
1118 resv_huge_pages += delta;
1119 if (delta < 0)
1120 return_unused_surplus_pages((unsigned long) -delta);
1121
1122out:
1123 spin_unlock(&hugetlb_lock);
1124 return ret;
1125}
1126
1127int hugetlb_reserve_pages(struct inode *inode, long from, long to)
1128{
1129 long ret, chg;
1130
1131 chg = region_chg(&inode->i_mapping->private_list, from, to);
1132 if (chg < 0)
1133 return chg;
880 1134
881 ret = hugetlb_acct_memory(chg); 1135 ret = hugetlb_acct_memory(chg);
882 if (ret < 0) 1136 if (ret < 0)
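As a side note on the set_max_huge_pages() rework above, the resize decisions reduce to two derived quantities: persistent_huge_pages (total minus surplus) and the floor below which the pool must not shrink while reservations are outstanding. A small standalone sketch of just that arithmetic, with invented counter values (the names mirror the kernel counters, but this is an illustration, not the kernel code):

#include <stdio.h>

/* Counters as in mm/hugetlb.c; the values are invented for the example. */
static unsigned long nr_huge_pages = 20;
static unsigned long surplus_huge_pages = 4;
static unsigned long free_huge_pages = 6;
static unsigned long resv_huge_pages = 3;

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)

int main(void)
{
	unsigned long count = 10;	/* requested persistent pool size */

	/*
	 * When shrinking, keep enough pages to cover everything in use
	 * (nr - free) plus the outstanding reservations.
	 */
	unsigned long min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;

	if (min_count < count)
		min_count = count;

	printf("persistent pages: %lu, shrink floor: %lu\n",
	       (unsigned long)persistent_huge_pages, min_count);
	return 0;
}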
diff --git a/mm/internal.h b/mm/internal.h
index a3110c02aea7..953f941ea867 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,4 +37,14 @@ static inline void __put_page(struct page *page)
37extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
38 unsigned int order); 38 unsigned int order);
39 39
40/*
41 * function for dealing with page's order in buddy system.
42 * zone->lock is already acquired when we use these.
43 * So, we don't need atomic page->flags operations here.
44 */
45static inline unsigned long page_order(struct page *page)
46{
47 VM_BUG_ON(!PageBuddy(page));
48 return page_private(page);
49}
40#endif 50#endif
diff --git a/mm/memory.c b/mm/memory.c
index f82b359b2745..bd16dcaeefb8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
966 * has touched so far, we don't want to allocate page tables. 966 * has touched so far, we don't want to allocate page tables.
967 */ 967 */
968 if (flags & FOLL_ANON) { 968 if (flags & FOLL_ANON) {
969 page = ZERO_PAGE(address); 969 page = ZERO_PAGE(0);
970 if (flags & FOLL_GET) 970 if (flags & FOLL_GET)
971 get_page(page); 971 get_page(page);
972 BUG_ON(flags & FOLL_WRITE); 972 BUG_ON(flags & FOLL_WRITE);
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1111} 1111}
1112EXPORT_SYMBOL(get_user_pages); 1112EXPORT_SYMBOL(get_user_pages);
1113 1113
1114static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1115 unsigned long addr, unsigned long end, pgprot_t prot)
1116{
1117 pte_t *pte;
1118 spinlock_t *ptl;
1119 int err = 0;
1120
1121 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1122 if (!pte)
1123 return -EAGAIN;
1124 arch_enter_lazy_mmu_mode();
1125 do {
1126 struct page *page = ZERO_PAGE(addr);
1127 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1128
1129 if (unlikely(!pte_none(*pte))) {
1130 err = -EEXIST;
1131 pte++;
1132 break;
1133 }
1134 page_cache_get(page);
1135 page_add_file_rmap(page);
1136 inc_mm_counter(mm, file_rss);
1137 set_pte_at(mm, addr, pte, zero_pte);
1138 } while (pte++, addr += PAGE_SIZE, addr != end);
1139 arch_leave_lazy_mmu_mode();
1140 pte_unmap_unlock(pte - 1, ptl);
1141 return err;
1142}
1143
1144static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1145 unsigned long addr, unsigned long end, pgprot_t prot)
1146{
1147 pmd_t *pmd;
1148 unsigned long next;
1149 int err;
1150
1151 pmd = pmd_alloc(mm, pud, addr);
1152 if (!pmd)
1153 return -EAGAIN;
1154 do {
1155 next = pmd_addr_end(addr, end);
1156 err = zeromap_pte_range(mm, pmd, addr, next, prot);
1157 if (err)
1158 break;
1159 } while (pmd++, addr = next, addr != end);
1160 return err;
1161}
1162
1163static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1164 unsigned long addr, unsigned long end, pgprot_t prot)
1165{
1166 pud_t *pud;
1167 unsigned long next;
1168 int err;
1169
1170 pud = pud_alloc(mm, pgd, addr);
1171 if (!pud)
1172 return -EAGAIN;
1173 do {
1174 next = pud_addr_end(addr, end);
1175 err = zeromap_pmd_range(mm, pud, addr, next, prot);
1176 if (err)
1177 break;
1178 } while (pud++, addr = next, addr != end);
1179 return err;
1180}
1181
1182int zeromap_page_range(struct vm_area_struct *vma,
1183 unsigned long addr, unsigned long size, pgprot_t prot)
1184{
1185 pgd_t *pgd;
1186 unsigned long next;
1187 unsigned long end = addr + size;
1188 struct mm_struct *mm = vma->vm_mm;
1189 int err;
1190
1191 BUG_ON(addr >= end);
1192 pgd = pgd_offset(mm, addr);
1193 flush_cache_range(vma, addr, end);
1194 do {
1195 next = pgd_addr_end(addr, end);
1196 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1197 if (err)
1198 break;
1199 } while (pgd++, addr = next, addr != end);
1200 return err;
1201}
1202
1203pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1114pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1204{ 1115{
1205 pgd_t * pgd = pgd_offset(mm, addr); 1116 pgd_t * pgd = pgd_offset(mm, addr);
@@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1700 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1611 flush_cache_page(vma, address, pte_pfn(orig_pte));
1701 entry = pte_mkyoung(orig_pte); 1612 entry = pte_mkyoung(orig_pte);
1702 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1613 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1703 if (ptep_set_access_flags(vma, address, page_table, entry,1)) { 1614 if (ptep_set_access_flags(vma, address, page_table, entry,1))
1704 update_mmu_cache(vma, address, entry); 1615 update_mmu_cache(vma, address, entry);
1705 lazy_mmu_prot_update(entry);
1706 }
1707 ret |= VM_FAULT_WRITE; 1616 ret |= VM_FAULT_WRITE;
1708 goto unlock; 1617 goto unlock;
1709 } 1618 }
@@ -1717,16 +1626,11 @@ gotten:
1717 1626
1718 if (unlikely(anon_vma_prepare(vma))) 1627 if (unlikely(anon_vma_prepare(vma)))
1719 goto oom; 1628 goto oom;
1720 if (old_page == ZERO_PAGE(address)) { 1629 VM_BUG_ON(old_page == ZERO_PAGE(0));
1721 new_page = alloc_zeroed_user_highpage_movable(vma, address); 1630 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1722 if (!new_page) 1631 if (!new_page)
1723 goto oom; 1632 goto oom;
1724 } else { 1633 cow_user_page(new_page, old_page, address, vma);
1725 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1726 if (!new_page)
1727 goto oom;
1728 cow_user_page(new_page, old_page, address, vma);
1729 }
1730 1634
1731 /* 1635 /*
1732 * Re-check the pte - we dropped the lock 1636 * Re-check the pte - we dropped the lock
@@ -1744,7 +1648,6 @@ gotten:
1744 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1648 flush_cache_page(vma, address, pte_pfn(orig_pte));
1745 entry = mk_pte(new_page, vma->vm_page_prot); 1649 entry = mk_pte(new_page, vma->vm_page_prot);
1746 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1650 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1747 lazy_mmu_prot_update(entry);
1748 /* 1651 /*
1749 * Clear the pte entry and flush it first, before updating the 1652 * Clear the pte entry and flush it first, before updating the
1750 * pte with the new entry. This will avoid a race condition 1653 * pte with the new entry. This will avoid a race condition
@@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2252 spinlock_t *ptl; 2155 spinlock_t *ptl;
2253 pte_t entry; 2156 pte_t entry;
2254 2157
2255 if (write_access) { 2158 /* Allocate our own private page. */
2256 /* Allocate our own private page. */ 2159 pte_unmap(page_table);
2257 pte_unmap(page_table);
2258
2259 if (unlikely(anon_vma_prepare(vma)))
2260 goto oom;
2261 page = alloc_zeroed_user_highpage_movable(vma, address);
2262 if (!page)
2263 goto oom;
2264
2265 entry = mk_pte(page, vma->vm_page_prot);
2266 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2267 2160
2268 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2161 if (unlikely(anon_vma_prepare(vma)))
2269 if (!pte_none(*page_table)) 2162 goto oom;
2270 goto release; 2163 page = alloc_zeroed_user_highpage_movable(vma, address);
2271 inc_mm_counter(mm, anon_rss); 2164 if (!page)
2272 lru_cache_add_active(page); 2165 goto oom;
2273 page_add_new_anon_rmap(page, vma, address);
2274 } else {
2275 /* Map the ZERO_PAGE - vm_page_prot is readonly */
2276 page = ZERO_PAGE(address);
2277 page_cache_get(page);
2278 entry = mk_pte(page, vma->vm_page_prot);
2279 2166
2280 ptl = pte_lockptr(mm, pmd); 2167 entry = mk_pte(page, vma->vm_page_prot);
2281 spin_lock(ptl); 2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2282 if (!pte_none(*page_table))
2283 goto release;
2284 inc_mm_counter(mm, file_rss);
2285 page_add_file_rmap(page);
2286 }
2287 2169
2170 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2171 if (!pte_none(*page_table))
2172 goto release;
2173 inc_mm_counter(mm, anon_rss);
2174 lru_cache_add_active(page);
2175 page_add_new_anon_rmap(page, vma, address);
2288 set_pte_at(mm, address, page_table, entry); 2176 set_pte_at(mm, address, page_table, entry);
2289 2177
2290 /* No need to invalidate - it was non-present before */ 2178 /* No need to invalidate - it was non-present before */
2291 update_mmu_cache(vma, address, entry); 2179 update_mmu_cache(vma, address, entry);
2292 lazy_mmu_prot_update(entry);
2293unlock: 2180unlock:
2294 pte_unmap_unlock(page_table, ptl); 2181 pte_unmap_unlock(page_table, ptl);
2295 return 0; 2182 return 0;
@@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2442 2329
2443 /* no need to invalidate: a not-present page won't be cached */ 2330 /* no need to invalidate: a not-present page won't be cached */
2444 update_mmu_cache(vma, address, entry); 2331 update_mmu_cache(vma, address, entry);
2445 lazy_mmu_prot_update(entry);
2446 } else { 2332 } else {
2447 if (anon) 2333 if (anon)
2448 page_cache_release(page); 2334 page_cache_release(page);
@@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2470 int write_access, pte_t orig_pte) 2356 int write_access, pte_t orig_pte)
2471{ 2357{
2472 pgoff_t pgoff = (((address & PAGE_MASK) 2358 pgoff_t pgoff = (((address & PAGE_MASK)
2473 - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; 2359 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2474 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); 2360 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2475 2361
2476 pte_unmap(page_table); 2362 pte_unmap(page_table);
@@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 entry = pte_mkyoung(entry); 2500 entry = pte_mkyoung(entry);
2615 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { 2501 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2616 update_mmu_cache(vma, address, entry); 2502 update_mmu_cache(vma, address, entry);
2617 lazy_mmu_prot_update(entry);
2618 } else { 2503 } else {
2619 /* 2504 /*
2620 * This is needed only for protection faults but the arch code 2505 * This is needed only for protection faults but the arch code
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index df9d554bea30..091b9c6c2529 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -23,6 +23,9 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/ioport.h> 24#include <linux/ioport.h>
25#include <linux/cpuset.h> 25#include <linux/cpuset.h>
26#include <linux/delay.h>
27#include <linux/migrate.h>
28#include <linux/page-isolation.h>
26 29
27#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
28 31
@@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
161 pgdat->node_start_pfn; 164 pgdat->node_start_pfn;
162} 165}
163 166
164int online_pages(unsigned long pfn, unsigned long nr_pages) 167static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
168 void *arg)
165{ 169{
166 unsigned long i; 170 unsigned long i;
171 unsigned long onlined_pages = *(unsigned long *)arg;
172 struct page *page;
173 if (PageReserved(pfn_to_page(start_pfn)))
174 for (i = 0; i < nr_pages; i++) {
175 page = pfn_to_page(start_pfn + i);
176 online_page(page);
177 onlined_pages++;
178 }
179 *(unsigned long *)arg = onlined_pages;
180 return 0;
181}
182
183
184int online_pages(unsigned long pfn, unsigned long nr_pages)
185{
167 unsigned long flags; 186 unsigned long flags;
168 unsigned long onlined_pages = 0; 187 unsigned long onlined_pages = 0;
169 struct resource res;
170 u64 section_end;
171 unsigned long start_pfn;
172 struct zone *zone; 188 struct zone *zone;
173 int need_zonelists_rebuild = 0; 189 int need_zonelists_rebuild = 0;
174 190
@@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
191 if (!populated_zone(zone)) 207 if (!populated_zone(zone))
192 need_zonelists_rebuild = 1; 208 need_zonelists_rebuild = 1;
193 209
194 res.start = (u64)pfn << PAGE_SHIFT; 210 walk_memory_resource(pfn, nr_pages, &onlined_pages,
195 res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; 211 online_pages_range);
196 res.flags = IORESOURCE_MEM; /* we just need system ram */
197 section_end = res.end;
198
199 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
200 start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
201 nr_pages = (unsigned long)
202 ((res.end + 1 - res.start) >> PAGE_SHIFT);
203
204 if (PageReserved(pfn_to_page(start_pfn))) {
205 /* this region's page is not onlined now */
206 for (i = 0; i < nr_pages; i++) {
207 struct page *page = pfn_to_page(start_pfn + i);
208 online_page(page);
209 onlined_pages++;
210 }
211 }
212
213 res.start = res.end + 1;
214 res.end = section_end;
215 }
216 zone->present_pages += onlined_pages; 212 zone->present_pages += onlined_pages;
217 zone->zone_pgdat->node_present_pages += onlined_pages; 213 zone->zone_pgdat->node_present_pages += onlined_pages;
218 214
219 setup_per_zone_pages_min(); 215 setup_per_zone_pages_min();
216 if (onlined_pages) {
217 kswapd_run(zone_to_nid(zone));
218 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
219 }
220 220
221 if (need_zonelists_rebuild) 221 if (need_zonelists_rebuild)
222 build_all_zonelists(); 222 build_all_zonelists();
@@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size)
271 if (!pgdat) 271 if (!pgdat)
272 return -ENOMEM; 272 return -ENOMEM;
273 new_pgdat = 1; 273 new_pgdat = 1;
274 ret = kswapd_run(nid);
275 if (ret)
276 goto error;
277 } 274 }
278 275
279 /* call arch's memory hotadd */ 276 /* call arch's memory hotadd */
@@ -308,3 +305,260 @@ error:
308 return ret; 305 return ret;
309} 306}
310EXPORT_SYMBOL_GPL(add_memory); 307EXPORT_SYMBOL_GPL(add_memory);
308
309#ifdef CONFIG_MEMORY_HOTREMOVE
310/*
 311 * Confirm that all pages in the range [start, end) belong to the same zone.
312 */
313static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
314{
315 unsigned long pfn;
316 struct zone *zone = NULL;
317 struct page *page;
318 int i;
319 for (pfn = start_pfn;
320 pfn < end_pfn;
321 pfn += MAX_ORDER_NR_PAGES) {
322 i = 0;
323 /* This is just a CONFIG_HOLES_IN_ZONE check.*/
324 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
325 i++;
326 if (i == MAX_ORDER_NR_PAGES)
327 continue;
328 page = pfn_to_page(pfn + i);
329 if (zone && page_zone(page) != zone)
330 return 0;
331 zone = page_zone(page);
332 }
333 return 1;
334}
335
336/*
337 * Scanning pfn is much easier than scanning lru list.
 338 * Scan pfns from start to end and find the first LRU page.
339 */
340int scan_lru_pages(unsigned long start, unsigned long end)
341{
342 unsigned long pfn;
343 struct page *page;
344 for (pfn = start; pfn < end; pfn++) {
345 if (pfn_valid(pfn)) {
346 page = pfn_to_page(pfn);
347 if (PageLRU(page))
348 return pfn;
349 }
350 }
351 return 0;
352}
353
354static struct page *
355hotremove_migrate_alloc(struct page *page,
356 unsigned long private,
357 int **x)
358{
359 /* This should be improoooooved!! */
360 return alloc_page(GFP_HIGHUSER_PAGECACHE);
361}
362
363
364#define NR_OFFLINE_AT_ONCE_PAGES (256)
365static int
366do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
367{
368 unsigned long pfn;
369 struct page *page;
370 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
371 int not_managed = 0;
372 int ret = 0;
373 LIST_HEAD(source);
374
375 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
376 if (!pfn_valid(pfn))
377 continue;
378 page = pfn_to_page(pfn);
379 if (!page_count(page))
380 continue;
381 /*
382 * We can skip free pages. And we can only deal with pages on
383 * LRU.
384 */
385 ret = isolate_lru_page(page, &source);
386 if (!ret) { /* Success */
387 move_pages--;
388 } else {
 389 /* Because we don't have the big zone->lock, we should
390 check this again here. */
391 if (page_count(page))
392 not_managed++;
393#ifdef CONFIG_DEBUG_VM
394 printk(KERN_INFO "removing from LRU failed"
395 " %lx/%d/%lx\n",
396 pfn, page_count(page), page->flags);
397#endif
398 }
399 }
400 ret = -EBUSY;
401 if (not_managed) {
402 if (!list_empty(&source))
403 putback_lru_pages(&source);
404 goto out;
405 }
406 ret = 0;
407 if (list_empty(&source))
408 goto out;
409 /* this function returns # of failed pages */
410 ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
411
412out:
413 return ret;
414}
415
416/*
417 * remove from free_area[] and mark all as Reserved.
418 */
419static int
420offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
421 void *data)
422{
423 __offline_isolated_pages(start, start + nr_pages);
424 return 0;
425}
426
427static void
428offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
429{
430 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
431 offline_isolated_pages_cb);
432}
433
434/*
 435 * Check that all pages in the range, recorded as a memory resource, are isolated.
436 */
437static int
438check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
439 void *data)
440{
441 int ret;
442 long offlined = *(long *)data;
443 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
444 offlined = nr_pages;
445 if (!ret)
446 *(long *)data += offlined;
447 return ret;
448}
449
450static long
451check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
452{
453 long offlined = 0;
454 int ret;
455
456 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
457 check_pages_isolated_cb);
458 if (ret < 0)
459 offlined = (long)ret;
460 return offlined;
461}
462
463extern void drain_all_local_pages(void);
464
465int offline_pages(unsigned long start_pfn,
466 unsigned long end_pfn, unsigned long timeout)
467{
468 unsigned long pfn, nr_pages, expire;
469 long offlined_pages;
470 int ret, drain, retry_max;
471 struct zone *zone;
472
473 BUG_ON(start_pfn >= end_pfn);
474 /* at least, alignment against pageblock is necessary */
475 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
476 return -EINVAL;
477 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
478 return -EINVAL;
479 /* This makes hotplug much easier...and readable.
 480 we assume this for now. */
481 if (!test_pages_in_a_zone(start_pfn, end_pfn))
482 return -EINVAL;
483 /* set above range as isolated */
484 ret = start_isolate_page_range(start_pfn, end_pfn);
485 if (ret)
486 return ret;
487 nr_pages = end_pfn - start_pfn;
488 pfn = start_pfn;
489 expire = jiffies + timeout;
490 drain = 0;
491 retry_max = 5;
492repeat:
493 /* start memory hot removal */
494 ret = -EAGAIN;
495 if (time_after(jiffies, expire))
496 goto failed_removal;
497 ret = -EINTR;
498 if (signal_pending(current))
499 goto failed_removal;
500 ret = 0;
501 if (drain) {
502 lru_add_drain_all();
503 flush_scheduled_work();
504 cond_resched();
505 drain_all_local_pages();
506 }
507
508 pfn = scan_lru_pages(start_pfn, end_pfn);
509 if (pfn) { /* We have page on LRU */
510 ret = do_migrate_range(pfn, end_pfn);
511 if (!ret) {
512 drain = 1;
513 goto repeat;
514 } else {
515 if (ret < 0)
516 if (--retry_max == 0)
517 goto failed_removal;
518 yield();
519 drain = 1;
520 goto repeat;
521 }
522 }
 523 /* drain all zones' lru pagevecs; this is asynchronous... */
524 lru_add_drain_all();
525 flush_scheduled_work();
526 yield();
 527 /* drain pcp pages; this is synchronous. */
528 drain_all_local_pages();
529 /* check again */
530 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
531 if (offlined_pages < 0) {
532 ret = -EBUSY;
533 goto failed_removal;
534 }
535 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
 536 /* Ok, all of our target is isolated.
537 We cannot do rollback at this point. */
538 offline_isolated_pages(start_pfn, end_pfn);
539 /* reset pagetype flags */
540 start_isolate_page_range(start_pfn, end_pfn);
541 /* removal success */
542 zone = page_zone(pfn_to_page(start_pfn));
543 zone->present_pages -= offlined_pages;
544 zone->zone_pgdat->node_present_pages -= offlined_pages;
545 totalram_pages -= offlined_pages;
546 num_physpages -= offlined_pages;
547 vm_total_pages = nr_free_pagecache_pages();
548 writeback_set_ratelimit();
549 return 0;
550
551failed_removal:
552 printk(KERN_INFO "memory offlining %lx to %lx failed\n",
553 start_pfn, end_pfn);
554 /* pushback to free area */
555 undo_isolate_page_range(start_pfn, end_pfn);
556 return ret;
557}
558#else
559int remove_memory(u64 start, u64 size)
560{
561 return -EINVAL;
562}
563EXPORT_SYMBOL_GPL(remove_memory);
564#endif /* CONFIG_MEMORY_HOTREMOVE */
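Stepping back from the hunks above, offline_pages() boils down to: isolate the pfn range, keep migrating whatever is still on the LRU (draining pagevecs and per-cpu pages between passes), then verify the range is fully isolated before pulling it out of the allocator. A stripped-down sketch of that control flow with the kernel primitives replaced by stand-in stubs (the helpers here are hypothetical, not the real API):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for isolate/migrate/drain; a toy counter plays the LRU. */
static int lru_pages_left = 3;

static int  isolate_range(void)        { return 0; }
static void undo_isolation(void)       { }
static bool lru_empty(void)            { return lru_pages_left == 0; }
static int  migrate_one_pass(void)     { if (lru_pages_left) lru_pages_left--; return 0; }
static void drain_lru_and_pcp(void)    { /* lru_add_drain_all() + drain_all_local_pages() */ }
static bool range_fully_isolated(void) { return lru_empty(); }

int main(void)
{
	int retry_max = 5;

	if (isolate_range())
		return 1;

	/* Migrate until nothing movable remains in the range. */
	while (!lru_empty()) {
		if (migrate_one_pass() && --retry_max == 0) {
			undo_isolation();
			puts("offline failed, range pushed back to free area");
			return 1;
		}
		drain_lru_and_pcp();
	}

	if (!range_fully_isolated()) {
		undo_isolation();
		return 1;
	}

	/* Safe to remove: adjust zone->present_pages, totalram_pages, ... */
	puts("range offlined");
	return 0;
}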
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3d6ac9505d07..568152ae6caf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -72,7 +72,6 @@
72#include <linux/hugetlb.h> 72#include <linux/hugetlb.h>
73#include <linux/kernel.h> 73#include <linux/kernel.h>
74#include <linux/sched.h> 74#include <linux/sched.h>
75#include <linux/mm.h>
76#include <linux/nodemask.h> 75#include <linux/nodemask.h>
77#include <linux/cpuset.h> 76#include <linux/cpuset.h>
78#include <linux/gfp.h> 77#include <linux/gfp.h>
@@ -82,13 +81,13 @@
82#include <linux/interrupt.h> 81#include <linux/interrupt.h>
83#include <linux/init.h> 82#include <linux/init.h>
84#include <linux/compat.h> 83#include <linux/compat.h>
85#include <linux/mempolicy.h>
86#include <linux/swap.h> 84#include <linux/swap.h>
87#include <linux/seq_file.h> 85#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 86#include <linux/proc_fs.h>
89#include <linux/migrate.h> 87#include <linux/migrate.h>
90#include <linux/rmap.h> 88#include <linux/rmap.h>
91#include <linux/security.h> 89#include <linux/security.h>
90#include <linux/syscalls.h>
92 91
93#include <asm/tlbflush.h> 92#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 93#include <asm/uaccess.h>
@@ -110,6 +109,9 @@ struct mempolicy default_policy = {
110 .policy = MPOL_DEFAULT, 109 .policy = MPOL_DEFAULT,
111}; 110};
112 111
112static void mpol_rebind_policy(struct mempolicy *pol,
113 const nodemask_t *newmask);
114
113/* Do sanity checking on a policy */ 115/* Do sanity checking on a policy */
114static int mpol_check_policy(int mode, nodemask_t *nodes) 116static int mpol_check_policy(int mode, nodemask_t *nodes)
115{ 117{
@@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
128 return -EINVAL; 130 return -EINVAL;
129 break; 131 break;
130 } 132 }
131 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; 133 return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
132} 134}
133 135
134/* Generate a custom zonelist for the BIND policy. */ 136/* Generate a custom zonelist for the BIND policy. */
@@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
185 switch (mode) { 187 switch (mode) {
186 case MPOL_INTERLEAVE: 188 case MPOL_INTERLEAVE:
187 policy->v.nodes = *nodes; 189 policy->v.nodes = *nodes;
188 if (nodes_weight(*nodes) == 0) { 190 nodes_and(policy->v.nodes, policy->v.nodes,
191 node_states[N_HIGH_MEMORY]);
192 if (nodes_weight(policy->v.nodes) == 0) {
189 kmem_cache_free(policy_cache, policy); 193 kmem_cache_free(policy_cache, policy);
190 return ERR_PTR(-EINVAL); 194 return ERR_PTR(-EINVAL);
191 } 195 }
@@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void)
459} 463}
460 464
461/* Set the process memory policy */ 465/* Set the process memory policy */
462long do_set_mempolicy(int mode, nodemask_t *nodes) 466static long do_set_mempolicy(int mode, nodemask_t *nodes)
463{ 467{
464 struct mempolicy *new; 468 struct mempolicy *new;
465 469
@@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
494 *nodes = p->v.nodes; 498 *nodes = p->v.nodes;
495 break; 499 break;
496 case MPOL_PREFERRED: 500 case MPOL_PREFERRED:
497 /* or use current node instead of online map? */ 501 /* or use current node instead of memory_map? */
498 if (p->v.preferred_node < 0) 502 if (p->v.preferred_node < 0)
499 *nodes = node_online_map; 503 *nodes = node_states[N_HIGH_MEMORY];
500 else 504 else
501 node_set(p->v.preferred_node, *nodes); 505 node_set(p->v.preferred_node, *nodes);
502 break; 506 break;
@@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
519} 523}
520 524
521/* Retrieve NUMA policy */ 525/* Retrieve NUMA policy */
522long do_get_mempolicy(int *policy, nodemask_t *nmask, 526static long do_get_mempolicy(int *policy, nodemask_t *nmask,
523 unsigned long addr, unsigned long flags) 527 unsigned long addr, unsigned long flags)
524{ 528{
525 int err; 529 int err;
526 struct mm_struct *mm = current->mm; 530 struct mm_struct *mm = current->mm;
@@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
528 struct mempolicy *pol = current->mempolicy; 532 struct mempolicy *pol = current->mempolicy;
529 533
530 cpuset_update_task_memory_state(); 534 cpuset_update_task_memory_state();
531 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 535 if (flags &
536 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
532 return -EINVAL; 537 return -EINVAL;
538
539 if (flags & MPOL_F_MEMS_ALLOWED) {
540 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
541 return -EINVAL;
542 *policy = 0; /* just so it's initialized */
543 *nmask = cpuset_current_mems_allowed;
544 return 0;
545 }
546
533 if (flags & MPOL_F_ADDR) { 547 if (flags & MPOL_F_ADDR) {
534 down_read(&mm->mmap_sem); 548 down_read(&mm->mmap_sem);
535 vma = find_vma_intersection(mm, addr, addr+1); 549 vma = find_vma_intersection(mm, addr, addr+1);
@@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
601 * Migrate pages from one node to a target node. 615 * Migrate pages from one node to a target node.
602 * Returns error or the number of pages not migrated. 616 * Returns error or the number of pages not migrated.
603 */ 617 */
604int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) 618static int migrate_to_node(struct mm_struct *mm, int source, int dest,
619 int flags)
605{ 620{
606 nodemask_t nmask; 621 nodemask_t nmask;
607 LIST_HEAD(pagelist); 622 LIST_HEAD(pagelist);
@@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
732} 747}
733#endif 748#endif
734 749
735long do_mbind(unsigned long start, unsigned long len, 750static long do_mbind(unsigned long start, unsigned long len,
736 unsigned long mode, nodemask_t *nmask, unsigned long flags) 751 unsigned long mode, nodemask_t *nmask,
752 unsigned long flags)
737{ 753{
738 struct vm_area_struct *vma; 754 struct vm_area_struct *vma;
739 struct mm_struct *mm = current->mm; 755 struct mm_struct *mm = current->mm;
@@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
955 goto out; 971 goto out;
956 } 972 }
957 973
958 if (!nodes_subset(new, node_online_map)) { 974 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
959 err = -EINVAL; 975 err = -EINVAL;
960 goto out; 976 goto out;
961 } 977 }
@@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
978 unsigned long maxnode, 994 unsigned long maxnode,
979 unsigned long addr, unsigned long flags) 995 unsigned long addr, unsigned long flags)
980{ 996{
981 int err, pval; 997 int err;
998 int uninitialized_var(pval);
982 nodemask_t nodes; 999 nodemask_t nodes;
983 1000
984 if (nmask != NULL && maxnode < MAX_NUMNODES) 1001 if (nmask != NULL && maxnode < MAX_NUMNODES)
@@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1527 kmem_cache_free(sn_cache, n); 1544 kmem_cache_free(sn_cache, n);
1528} 1545}
1529 1546
1530struct sp_node * 1547static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1531sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) 1548 struct mempolicy *pol)
1532{ 1549{
1533 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 1550 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1534 1551
@@ -1677,7 +1694,7 @@ void __init numa_policy_init(void)
1677 * fall back to the largest node if they're all smaller. 1694 * fall back to the largest node if they're all smaller.
1678 */ 1695 */
1679 nodes_clear(interleave_nodes); 1696 nodes_clear(interleave_nodes);
1680 for_each_online_node(nid) { 1697 for_each_node_state(nid, N_HIGH_MEMORY) {
1681 unsigned long total_pages = node_present_pages(nid); 1698 unsigned long total_pages = node_present_pages(nid);
1682 1699
1683 /* Preserve the largest node */ 1700 /* Preserve the largest node */
@@ -1706,7 +1723,8 @@ void numa_default_policy(void)
1706} 1723}
1707 1724
1708/* Migrate a policy to a different set of nodes */ 1725/* Migrate a policy to a different set of nodes */
1709void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 1726static void mpol_rebind_policy(struct mempolicy *pol,
1727 const nodemask_t *newmask)
1710{ 1728{
1711 nodemask_t *mpolmask; 1729 nodemask_t *mpolmask;
1712 nodemask_t tmp; 1730 nodemask_t tmp;
@@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v)
1963 seq_printf(m, " huge"); 1981 seq_printf(m, " huge");
1964 } else { 1982 } else {
1965 check_pgd_range(vma, vma->vm_start, vma->vm_end, 1983 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1966 &node_online_map, MPOL_MF_STATS, md); 1984 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
1967 } 1985 }
1968 1986
1969 if (!md->pages) 1987 if (!md->pages)
@@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v)
1990 if (md->writeback) 2008 if (md->writeback)
1991 seq_printf(m," writeback=%lu", md->writeback); 2009 seq_printf(m," writeback=%lu", md->writeback);
1992 2010
1993 for_each_online_node(n) 2011 for_each_node_state(n, N_HIGH_MEMORY)
1994 if (md->node[n]) 2012 if (md->node[n])
1995 seq_printf(m, " N%d=%lu", n, md->node[n]); 2013 seq_printf(m, " N%d=%lu", n, md->node[n]);
1996out: 2014out:
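One userspace-visible consequence of the mempolicy hunks above is the new MPOL_F_MEMS_ALLOWED query mode for get_mempolicy(), which reports the caller's allowed node mask instead of its policy and must not be combined with MPOL_F_NODE or MPOL_F_ADDR. A hedged usage sketch (assumes a numaif.h recent enough to define the flag; link against libnuma):

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodemask[16] = { 0 };
	unsigned long maxnode = sizeof(nodemask) * 8;	/* mask size in bits */
	int mode = 0;

	/* Ask for the allowed-node mask rather than the current policy. */
	if (get_mempolicy(&mode, nodemask, maxnode, NULL, MPOL_F_MEMS_ALLOWED))
		return 1;

	printf("first word of allowed-node mask: 0x%lx\n", nodemask[0]);
	return 0;
}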
diff --git a/mm/migrate.c b/mm/migrate.c
index e2fdbce1874b..06d0877a66ef 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma,
171 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 171 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
172 if (is_write_migration_entry(entry)) 172 if (is_write_migration_entry(entry))
173 pte = pte_mkwrite(pte); 173 pte = pte_mkwrite(pte);
174 flush_cache_page(vma, addr, pte_pfn(pte));
174 set_pte_at(mm, addr, ptep, pte); 175 set_pte_at(mm, addr, ptep, pte);
175 176
176 if (PageAnon(new)) 177 if (PageAnon(new))
@@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
180 181
181 /* No need to invalidate - it was non-present before */ 182 /* No need to invalidate - it was non-present before */
182 update_mmu_cache(vma, addr, pte); 183 update_mmu_cache(vma, addr, pte);
183 lazy_mmu_prot_update(pte);
184 184
185out: 185out:
186 pte_unmap_unlock(ptep, ptl); 186 pte_unmap_unlock(ptep, ptl);
@@ -972,7 +972,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
972 * array. Return various errors if the user did something wrong. 972 * array. Return various errors if the user did something wrong.
973 */ 973 */
974 for (i = 0; i < nr_pages; i++) { 974 for (i = 0; i < nr_pages; i++) {
975 const void *p; 975 const void __user *p;
976 976
977 err = -EFAULT; 977 err = -EFAULT;
978 if (get_user(p, pages + i)) 978 if (get_user(p, pages + i))
@@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
986 goto out; 986 goto out;
987 987
988 err = -ENODEV; 988 err = -ENODEV;
989 if (!node_online(node)) 989 if (!node_state(node, N_HIGH_MEMORY))
990 goto out; 990 goto out;
991 991
992 err = -EACCES; 992 err = -EACCES;
diff --git a/mm/mmap.c b/mm/mmap.c
index 0d40e66c841b..4275e81e25ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/backing-dev.h>
10#include <linux/mm.h> 11#include <linux/mm.h>
11#include <linux/shm.h> 12#include <linux/shm.h>
12#include <linux/mman.h> 13#include <linux/mman.h>
@@ -180,8 +181,6 @@ error:
180 return -ENOMEM; 181 return -ENOMEM;
181} 182}
182 183
183EXPORT_SYMBOL(__vm_enough_memory);
184
185/* 184/*
186 * Requires inode->i_mapping->i_mmap_lock 185 * Requires inode->i_mapping->i_mmap_lock
187 */ 186 */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8346c30abec..1d4d69790e59 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
53 if (dirty_accountable && pte_dirty(ptent)) 53 if (dirty_accountable && pte_dirty(ptent))
54 ptent = pte_mkwrite(ptent); 54 ptent = pte_mkwrite(ptent);
55 set_pte_at(mm, addr, pte, ptent); 55 set_pte_at(mm, addr, pte, ptent);
56 lazy_mmu_prot_update(ptent);
57#ifdef CONFIG_MIGRATION 56#ifdef CONFIG_MIGRATION
58 } else if (!pte_file(oldpte)) { 57 } else if (!pte_file(oldpte)) {
59 swp_entry_t entry = pte_to_swp_entry(oldpte); 58 swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/nommu.c b/mm/nommu.c
index 8ed0cb43118a..42fb84e9e815 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
44int heap_stack_gap = 0; 44int heap_stack_gap = 0;
45 45
46EXPORT_SYMBOL(mem_map); 46EXPORT_SYMBOL(mem_map);
47EXPORT_SYMBOL(__vm_enough_memory);
48EXPORT_SYMBOL(num_physpages); 47EXPORT_SYMBOL(num_physpages);
49 48
50/* list of shareable VMAs */ 49/* list of shareable VMAs */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9b82ad5047f..a64decb5b13f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,8 @@
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28 28
29int sysctl_panic_on_oom; 29int sysctl_panic_on_oom;
30int sysctl_oom_kill_allocating_task;
31static DEFINE_SPINLOCK(zone_scan_mutex);
30/* #define DEBUG */ 32/* #define DEBUG */
31 33
32/** 34/**
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
141 * because p may have allocated or otherwise mapped memory on 143 * because p may have allocated or otherwise mapped memory on
142 * this node before. However it will be less likely. 144 * this node before. However it will be less likely.
143 */ 145 */
144 if (!cpuset_excl_nodes_overlap(p)) 146 if (!cpuset_mems_allowed_intersects(current, p))
145 points /= 8; 147 points /= 8;
146 148
147 /* 149 /*
@@ -164,27 +166,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
164} 166}
165 167
166/* 168/*
167 * Types of limitations to the nodes from which allocations may occur
168 */
169#define CONSTRAINT_NONE 1
170#define CONSTRAINT_MEMORY_POLICY 2
171#define CONSTRAINT_CPUSET 3
172
173/*
174 * Determine the type of allocation constraint. 169 * Determine the type of allocation constraint.
175 */ 170 */
176static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) 171static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
172 gfp_t gfp_mask)
177{ 173{
178#ifdef CONFIG_NUMA 174#ifdef CONFIG_NUMA
179 struct zone **z; 175 struct zone **z;
180 nodemask_t nodes; 176 nodemask_t nodes = node_states[N_HIGH_MEMORY];
181 int node;
182
183 nodes_clear(nodes);
184 /* node has memory ? */
185 for_each_online_node(node)
186 if (NODE_DATA(node)->node_present_pages)
187 node_set(node, nodes);
188 177
189 for (z = zonelist->zones; *z; z++) 178 for (z = zonelist->zones; *z; z++)
190 if (cpuset_zone_allowed_softwall(*z, gfp_mask)) 179 if (cpuset_zone_allowed_softwall(*z, gfp_mask))
@@ -344,12 +333,20 @@ static int oom_kill_task(struct task_struct *p)
344 return 0; 333 return 0;
345} 334}
346 335
347static int oom_kill_process(struct task_struct *p, unsigned long points, 336static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
348 const char *message) 337 unsigned long points, const char *message)
349{ 338{
350 struct task_struct *c; 339 struct task_struct *c;
351 struct list_head *tsk; 340 struct list_head *tsk;
352 341
342 if (printk_ratelimit()) {
343 printk(KERN_WARNING "%s invoked oom-killer: "
344 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
345 current->comm, gfp_mask, order, current->oomkilladj);
346 dump_stack();
347 show_mem();
348 }
349
353 /* 350 /*
354 * If the task is already exiting, don't alarm the sysadmin or kill 351 * If the task is already exiting, don't alarm the sysadmin or kill
355 * its children or threads, just set TIF_MEMDIE so it can die quickly 352 * its children or threads, just set TIF_MEMDIE so it can die quickly
@@ -387,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb)
387} 384}
388EXPORT_SYMBOL_GPL(unregister_oom_notifier); 385EXPORT_SYMBOL_GPL(unregister_oom_notifier);
389 386
387/*
388 * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
389 * if a parallel OOM killing is already taking place that includes a zone in
390 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
391 */
392int try_set_zone_oom(struct zonelist *zonelist)
393{
394 struct zone **z;
395 int ret = 1;
396
397 z = zonelist->zones;
398
399 spin_lock(&zone_scan_mutex);
400 do {
401 if (zone_is_oom_locked(*z)) {
402 ret = 0;
403 goto out;
404 }
405 } while (*(++z) != NULL);
406
407 /*
408 * Lock each zone in the zonelist under zone_scan_mutex so a parallel
409 * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
410 */
411 z = zonelist->zones;
412 do {
413 zone_set_flag(*z, ZONE_OOM_LOCKED);
414 } while (*(++z) != NULL);
415out:
416 spin_unlock(&zone_scan_mutex);
417 return ret;
418}
419
420/*
421 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
422 * allocation attempts with zonelists containing them may now recall the OOM
423 * killer, if necessary.
424 */
425void clear_zonelist_oom(struct zonelist *zonelist)
426{
427 struct zone **z;
428
429 z = zonelist->zones;
430
431 spin_lock(&zone_scan_mutex);
432 do {
433 zone_clear_flag(*z, ZONE_OOM_LOCKED);
434 } while (*(++z) != NULL);
435 spin_unlock(&zone_scan_mutex);
436}
437
390/** 438/**
391 * out_of_memory - kill the "best" process when we run out of memory 439 * out_of_memory - kill the "best" process when we run out of memory
392 * 440 *
@@ -400,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
400 struct task_struct *p; 448 struct task_struct *p;
401 unsigned long points = 0; 449 unsigned long points = 0;
402 unsigned long freed = 0; 450 unsigned long freed = 0;
403 int constraint; 451 enum oom_constraint constraint;
404 452
405 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 453 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
406 if (freed > 0) 454 if (freed > 0)
407 /* Got some memory back in the last second. */ 455 /* Got some memory back in the last second. */
408 return; 456 return;
409 457
410 if (printk_ratelimit()) {
411 printk(KERN_WARNING "%s invoked oom-killer: "
412 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
413 current->comm, gfp_mask, order, current->oomkilladj);
414 dump_stack();
415 show_mem();
416 }
417
418 if (sysctl_panic_on_oom == 2) 458 if (sysctl_panic_on_oom == 2)
419 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 459 panic("out of memory. Compulsory panic_on_oom is selected.\n");
420 460
@@ -423,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
423 * NUMA) that may require different handling. 463 * NUMA) that may require different handling.
424 */ 464 */
425 constraint = constrained_alloc(zonelist, gfp_mask); 465 constraint = constrained_alloc(zonelist, gfp_mask);
426 cpuset_lock();
427 read_lock(&tasklist_lock); 466 read_lock(&tasklist_lock);
428 467
429 switch (constraint) { 468 switch (constraint) {
430 case CONSTRAINT_MEMORY_POLICY: 469 case CONSTRAINT_MEMORY_POLICY:
431 oom_kill_process(current, points, 470 oom_kill_process(current, gfp_mask, order, points,
432 "No available memory (MPOL_BIND)"); 471 "No available memory (MPOL_BIND)");
433 break; 472 break;
434 473
435 case CONSTRAINT_CPUSET:
436 oom_kill_process(current, points,
437 "No available memory in cpuset");
438 break;
439
440 case CONSTRAINT_NONE: 474 case CONSTRAINT_NONE:
441 if (sysctl_panic_on_oom) 475 if (sysctl_panic_on_oom)
442 panic("out of memory. panic_on_oom is selected\n"); 476 panic("out of memory. panic_on_oom is selected\n");
477 /* Fall-through */
478 case CONSTRAINT_CPUSET:
479 if (sysctl_oom_kill_allocating_task) {
480 oom_kill_process(current, gfp_mask, order, points,
481 "Out of memory (oom_kill_allocating_task)");
482 break;
483 }
443retry: 484retry:
444 /* 485 /*
445 * Rambo mode: Shoot down a process and hope it solves whatever 486 * Rambo mode: Shoot down a process and hope it solves whatever
@@ -453,11 +494,11 @@ retry:
453 /* Found nothing?!?! Either we hang forever, or we panic. */ 494 /* Found nothing?!?! Either we hang forever, or we panic. */
454 if (!p) { 495 if (!p) {
455 read_unlock(&tasklist_lock); 496 read_unlock(&tasklist_lock);
456 cpuset_unlock();
457 panic("Out of memory and no killable processes...\n"); 497 panic("Out of memory and no killable processes...\n");
458 } 498 }
459 499
 460 if (oom_kill_process(p, points, "Out of memory")) 500 if (oom_kill_process(p, gfp_mask, order, points,
 501 "Out of memory"))
461 goto retry; 502 goto retry;
462 503
463 break; 504 break;
@@ -465,7 +506,6 @@ retry:
465 506
466out: 507out:
467 read_unlock(&tasklist_lock); 508 read_unlock(&tasklist_lock);
468 cpuset_unlock();
469 509
470 /* 510 /*
471 * Give "p" a good chance of killing itself before we 511 * Give "p" a good chance of killing itself before we
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 44720363374c..7845462064f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
2 * mm/page-writeback.c 2 * mm/page-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
7 * address_space level. 8 * address_space level.
@@ -36,7 +37,7 @@
36 37
37/* 38/*
38 * The maximum number of pages to writeout in a single bdflush/kupdate 39 * The maximum number of pages to writeout in a single bdflush/kupdate
39 * operation. We do this so we don't hold I_LOCK against an inode for 40 * operation. We do this so we don't hold I_SYNC against an inode for
40 * enormous amounts of time, which would block a userspace task which has 41 * enormous amounts of time, which would block a userspace task which has
41 * been forced to throttle against that inode. Also, the code reevaluates 42 * been forced to throttle against that inode. Also, the code reevaluates
42 * the dirty each time it has written this many pages. 43 * the dirty each time it has written this many pages.
@@ -49,8 +50,6 @@
49 */ 50 */
50static long ratelimit_pages = 32; 51static long ratelimit_pages = 32;
51 52
52static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
53
54/* 53/*
55 * When balance_dirty_pages decides that the caller needs to perform some 54 * When balance_dirty_pages decides that the caller needs to perform some
56 * non-background writeback, this is how many pages it will attempt to write. 55 * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
103static void background_writeout(unsigned long _min_pages); 102static void background_writeout(unsigned long _min_pages);
104 103
105/* 104/*
105 * Scale the writeback cache size proportional to the relative writeout speeds.
106 *
107 * We do this by keeping a floating proportion between BDIs, based on page
108 * writeback completions [end_page_writeback()]. Those devices that write out
109 * pages fastest will get the larger share, while the slower will get a smaller
110 * share.
111 *
112 * We use page writeout completions because we are interested in getting rid of
113 * dirty pages. Having them written out is the primary goal.
114 *
115 * We introduce a concept of time, a period over which we measure these events,
116 * because demand can/will vary over time. The length of this period itself is
117 * measured in page writeback completions.
118 *
119 */
120static struct prop_descriptor vm_completions;
121static struct prop_descriptor vm_dirties;
122
123static unsigned long determine_dirtyable_memory(void);
124
125/*
126 * couple the period to the dirty_ratio:
127 *
128 * period/2 ~ roundup_pow_of_two(dirty limit)
129 */
130static int calc_period_shift(void)
131{
132 unsigned long dirty_total;
133
134 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
135 return 2 + ilog2(dirty_total - 1);
136}
137
138/*
139 * update the period when the dirty ratio changes.
140 */
141int dirty_ratio_handler(struct ctl_table *table, int write,
142 struct file *filp, void __user *buffer, size_t *lenp,
143 loff_t *ppos)
144{
145 int old_ratio = vm_dirty_ratio;
146 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
147 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
148 int shift = calc_period_shift();
149 prop_change_shift(&vm_completions, shift);
150 prop_change_shift(&vm_dirties, shift);
151 }
152 return ret;
153}
154
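calc_period_shift() above couples the length of the floating-proportion period to the dirty limit: the period is 2^(2 + ilog2(dirty_total - 1)), so half a period is roughly the dirty limit rounded up to a power of two. A small standalone check of that arithmetic (ilog2 written out by hand; the page count is an arbitrary example):

#include <stdio.h>

/* floor(log2(x)) for x > 0, like the kernel's ilog2() on a runtime value */
static int ilog2_sketch(unsigned long x)
{
        int n = -1;
        while (x) {
                x >>= 1;
                n++;
        }
        return n;
}

int main(void)
{
        unsigned long dirty_total = 100000;     /* example dirty limit, in pages */
        int shift = 2 + ilog2_sketch(dirty_total - 1);
        unsigned long period = 1UL << shift;

        /* period/2 ~ roundup_pow_of_two(dirty_total) */
        printf("dirty_total=%lu shift=%d period=%lu period/2=%lu\n",
               dirty_total, shift, period, period / 2);
        return 0;
}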
155/*
156 * Increment the BDI's writeout completion count and the global writeout
157 * completion count. Called from test_clear_page_writeback().
158 */
159static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
160{
161 __prop_inc_percpu(&vm_completions, &bdi->completions);
162}
163
164static inline void task_dirty_inc(struct task_struct *tsk)
165{
166 prop_inc_single(&vm_dirties, &tsk->dirties);
167}
168
169/*
170 * Obtain an accurate fraction of the BDI's portion.
171 */
172static void bdi_writeout_fraction(struct backing_dev_info *bdi,
173 long *numerator, long *denominator)
174{
175 if (bdi_cap_writeback_dirty(bdi)) {
176 prop_fraction_percpu(&vm_completions, &bdi->completions,
177 numerator, denominator);
178 } else {
179 *numerator = 0;
180 *denominator = 1;
181 }
182}
183
184/*
185 * Clip the earned share of dirty pages to that which is actually available.
186 * This avoids exceeding the total dirty_limit when the floating averages
187 * fluctuate too quickly.
188 */
189static void
190clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
191{
192 long avail_dirty;
193
194 avail_dirty = dirty -
195 (global_page_state(NR_FILE_DIRTY) +
196 global_page_state(NR_WRITEBACK) +
197 global_page_state(NR_UNSTABLE_NFS));
198
199 if (avail_dirty < 0)
200 avail_dirty = 0;
201
202 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
203 bdi_stat(bdi, BDI_WRITEBACK);
204
205 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
206}
207
208static inline void task_dirties_fraction(struct task_struct *tsk,
209 long *numerator, long *denominator)
210{
211 prop_fraction_single(&vm_dirties, &tsk->dirties,
212 numerator, denominator);
213}
214
215/*
216 * scale the dirty limit
217 *
218 * task specific dirty limit:
219 *
220 * dirty -= (dirty/8) * p_{t}
221 */
222void task_dirty_limit(struct task_struct *tsk, long *pdirty)
223{
224 long numerator, denominator;
225 long dirty = *pdirty;
226 u64 inv = dirty >> 3;
227
228 task_dirties_fraction(tsk, &numerator, &denominator);
229 inv *= numerator;
230 do_div(inv, denominator);
231
232 dirty -= inv;
233 if (dirty < *pdirty/2)
234 dirty = *pdirty/2;
235
236 *pdirty = dirty;
237}
238
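task_dirty_limit() above shaves up to dirty/8 off the per-device limit in proportion to how much of the recent dirtying this task did (p_t = numerator/denominator), and never lets the result drop below half the original limit. A minimal arithmetic sketch with made-up numbers:

#include <stdio.h>

/* dirty -= (dirty/8) * numerator/denominator, clamped to at least dirty/2 */
static long task_dirty_limit_sketch(long dirty, long numerator, long denominator)
{
        long limit = dirty;
        long inv = dirty >> 3;          /* dirty/8 */

        inv = inv * numerator / denominator;
        limit -= inv;
        if (limit < dirty / 2)
                limit = dirty / 2;
        return limit;
}

int main(void)
{
        /* a task responsible for half of the recent dirtying */
        printf("%ld\n", task_dirty_limit_sketch(8000, 1, 2));  /* 7500 */
        /* a task responsible for all of it */
        printf("%ld\n", task_dirty_limit_sketch(8000, 1, 1));  /* 7000 */
        return 0;
}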
239/*
106 * Work out the current dirty-memory clamping and background writeout 240 * Work out the current dirty-memory clamping and background writeout
107 * thresholds. 241 * thresholds.
108 * 242 *
@@ -126,7 +260,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
126 int node; 260 int node;
127 unsigned long x = 0; 261 unsigned long x = 0;
128 262
129 for_each_online_node(node) { 263 for_each_node_state(node, N_HIGH_MEMORY) {
130 struct zone *z = 264 struct zone *z =
131 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 265 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
132 266
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void)
158} 292}
159 293
160static void 294static void
161get_dirty_limits(long *pbackground, long *pdirty, 295get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
162 struct address_space *mapping) 296 struct backing_dev_info *bdi)
163{ 297{
164 int background_ratio; /* Percentages */ 298 int background_ratio; /* Percentages */
165 int dirty_ratio; 299 int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty,
193 } 327 }
194 *pbackground = background; 328 *pbackground = background;
195 *pdirty = dirty; 329 *pdirty = dirty;
330
331 if (bdi) {
332 u64 bdi_dirty = dirty;
333 long numerator, denominator;
334
335 /*
336 * Calculate this BDI's share of the dirty ratio.
337 */
338 bdi_writeout_fraction(bdi, &numerator, &denominator);
339
340 bdi_dirty *= numerator;
341 do_div(bdi_dirty, denominator);
342
343 *pbdi_dirty = bdi_dirty;
344 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
345 task_dirty_limit(current, pbdi_dirty);
346 }
196} 347}
197 348
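With the hunk above, get_dirty_limits() also hands back a per-device threshold: the global dirty limit is multiplied by this BDI's share of recent writeout completions (the numerator/denominator pair from the floating proportion), then clipped and scaled per task. A rough sketch of just the share computation, with invented numbers standing in for the prop_fraction_percpu() result:

#include <stdio.h>

/* bdi_thresh = global dirty threshold * (this BDI's completions / all completions) */
static unsigned long bdi_share_sketch(unsigned long dirty_thresh,
                                      long numerator, long denominator)
{
        unsigned long long bdi_dirty = dirty_thresh;

        bdi_dirty *= numerator;
        bdi_dirty /= denominator;       /* do_div() in the kernel */
        return (unsigned long)bdi_dirty;
}

int main(void)
{
        unsigned long dirty_thresh = 10000;     /* pages, example global limit */

        /* a fast disk doing ~70% of recent writeback vs a slow USB stick */
        printf("fast bdi: %lu pages\n", bdi_share_sketch(dirty_thresh, 7, 10));
        printf("slow bdi: %lu pages\n", bdi_share_sketch(dirty_thresh, 3, 10));
        return 0;
}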
198/* 349/*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
204 */ 355 */
205static void balance_dirty_pages(struct address_space *mapping) 356static void balance_dirty_pages(struct address_space *mapping)
206{ 357{
207 long nr_reclaimable; 358 long bdi_nr_reclaimable;
359 long bdi_nr_writeback;
208 long background_thresh; 360 long background_thresh;
209 long dirty_thresh; 361 long dirty_thresh;
362 long bdi_thresh;
210 unsigned long pages_written = 0; 363 unsigned long pages_written = 0;
211 unsigned long write_chunk = sync_writeback_pages(); 364 unsigned long write_chunk = sync_writeback_pages();
212 365
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping)
221 .range_cyclic = 1, 374 .range_cyclic = 1,
222 }; 375 };
223 376
224 get_dirty_limits(&background_thresh, &dirty_thresh, mapping); 377 get_dirty_limits(&background_thresh, &dirty_thresh,
225 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 378 &bdi_thresh, bdi);
226 global_page_state(NR_UNSTABLE_NFS); 379 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
227 if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= 380 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
228 dirty_thresh) 381 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
229 break; 382 break;
230 383
231 if (!dirty_exceeded) 384 if (!bdi->dirty_exceeded)
232 dirty_exceeded = 1; 385 bdi->dirty_exceeded = 1;
233 386
234 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 387 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
235 * Unstable writes are a feature of certain networked 388 * Unstable writes are a feature of certain networked
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping)
237 * written to the server's write cache, but has not yet 390 * written to the server's write cache, but has not yet
238 * been flushed to permanent storage. 391 * been flushed to permanent storage.
239 */ 392 */
240 if (nr_reclaimable) { 393 if (bdi_nr_reclaimable) {
241 writeback_inodes(&wbc); 394 writeback_inodes(&wbc);
242 get_dirty_limits(&background_thresh,
243 &dirty_thresh, mapping);
244 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
245 global_page_state(NR_UNSTABLE_NFS);
246 if (nr_reclaimable +
247 global_page_state(NR_WRITEBACK)
248 <= dirty_thresh)
249 break;
250 pages_written += write_chunk - wbc.nr_to_write; 395 pages_written += write_chunk - wbc.nr_to_write;
251 if (pages_written >= write_chunk) 396 get_dirty_limits(&background_thresh, &dirty_thresh,
252 break; /* We've done our duty */ 397 &bdi_thresh, bdi);
253 } 398 }
399
400 /*
401 * In order to avoid the stacked BDI deadlock we need
402 * to ensure we accurately count the 'dirty' pages when
403 * the threshold is low.
404 *
405 * Otherwise it would be possible to get thresh+n pages
406 * reported dirty, even though there are thresh-m pages
407 * actually dirty; with m+n sitting in the percpu
408 * deltas.
409 */
410 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
411 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
412 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
413 } else if (bdi_nr_reclaimable) {
414 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
415 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
416 }
417
418 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
419 break;
420 if (pages_written >= write_chunk)
421 break; /* We've done our duty */
422
254 congestion_wait(WRITE, HZ/10); 423 congestion_wait(WRITE, HZ/10);
255 } 424 }
256 425
257 if (nr_reclaimable + global_page_state(NR_WRITEBACK) 426 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
258 <= dirty_thresh && dirty_exceeded) 427 bdi->dirty_exceeded)
259 dirty_exceeded = 0; 428 bdi->dirty_exceeded = 0;
260 429
261 if (writeback_in_progress(bdi)) 430 if (writeback_in_progress(bdi))
262 return; /* pdflush is already working this queue */ 431 return; /* pdflush is already working this queue */
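The bdi_stat() counters are per-cpu: each CPU batches small deltas locally and only folds them into the global value occasionally, so a cheap read can be off by roughly (number of CPUs x batch size), which the kernel calls bdi_stat_error(). The hunk above therefore switches to the expensive exact sum (bdi_stat_sum()) whenever bdi_thresh is within twice that error, so a small threshold cannot be met or missed purely because of stale per-cpu deltas. A toy model of the idea (arrays stand in for the per-cpu counters; names and sizes are illustrative):

#include <stdio.h>

#define NR_CPUS 4
#define BATCH   8       /* max delta a CPU keeps locally before folding it in */

struct pcpu_counter {
        long global;            /* cheap, possibly stale, value */
        long delta[NR_CPUS];    /* per-cpu contributions not yet folded in */
};

static long read_fast(struct pcpu_counter *c)   /* like bdi_stat() */
{
        return c->global;
}

static long read_exact(struct pcpu_counter *c)  /* like bdi_stat_sum() */
{
        long sum = c->global;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += c->delta[cpu];
        return sum;
}

int main(void)
{
        struct pcpu_counter dirty = { .global = 10, .delta = { 7, 7, 6, 5 } };
        long error = NR_CPUS * BATCH;   /* worst-case drift of the fast read */
        long thresh = 40;

        long nr = (thresh < 2 * error) ? read_exact(&dirty) : read_fast(&dirty);
        printf("thresh=%ld error=%ld counted=%ld over=%s\n",
               thresh, error, nr, nr > thresh ? "yes" : "no");
        return 0;
}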
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping)
270 * background_thresh, to keep the amount of dirty memory low. 439 * background_thresh, to keep the amount of dirty memory low.
271 */ 440 */
272 if ((laptop_mode && pages_written) || 441 if ((laptop_mode && pages_written) ||
273 (!laptop_mode && (nr_reclaimable > background_thresh))) 442 (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
443 + global_page_state(NR_UNSTABLE_NFS)
444 > background_thresh)))
274 pdflush_operation(background_writeout, 0); 445 pdflush_operation(background_writeout, 0);
275} 446}
276 447
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
306 unsigned long *p; 477 unsigned long *p;
307 478
308 ratelimit = ratelimit_pages; 479 ratelimit = ratelimit_pages;
309 if (dirty_exceeded) 480 if (mapping->backing_dev_info->dirty_exceeded)
310 ratelimit = 8; 481 ratelimit = 8;
311 482
312 /* 483 /*
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
331 long background_thresh; 502 long background_thresh;
332 long dirty_thresh; 503 long dirty_thresh;
333 504
334 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
335 /*
336 * The caller might hold locks which can prevent IO completion
337 * or progress in the filesystem. So we cannot just sit here
338 * waiting for IO to complete.
339 */
340 congestion_wait(WRITE, HZ/10);
341 return;
342 }
343
344 for ( ; ; ) { 505 for ( ; ; ) {
345 get_dirty_limits(&background_thresh, &dirty_thresh, NULL); 506 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
346 507
347 /* 508 /*
348 * Boost the allowable dirty threshold a bit for page 509 * Boost the allowable dirty threshold a bit for page
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
354 global_page_state(NR_WRITEBACK) <= dirty_thresh) 515 global_page_state(NR_WRITEBACK) <= dirty_thresh)
355 break; 516 break;
356 congestion_wait(WRITE, HZ/10); 517 congestion_wait(WRITE, HZ/10);
518
519 /*
520 * The caller might hold locks which can prevent IO completion
521 * or progress in the filesystem. So we cannot just sit here
522 * waiting for IO to complete.
523 */
524 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
525 break;
357 } 526 }
358} 527}
359 528
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages)
377 long background_thresh; 546 long background_thresh;
378 long dirty_thresh; 547 long dirty_thresh;
379 548
380 get_dirty_limits(&background_thresh, &dirty_thresh, NULL); 549 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
381 if (global_page_state(NR_FILE_DIRTY) + 550 if (global_page_state(NR_FILE_DIRTY) +
382 global_page_state(NR_UNSTABLE_NFS) < background_thresh 551 global_page_state(NR_UNSTABLE_NFS) < background_thresh
383 && min_pages <= 0) 552 && min_pages <= 0)
384 break; 553 break;
554 wbc.more_io = 0;
385 wbc.encountered_congestion = 0; 555 wbc.encountered_congestion = 0;
386 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 556 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
387 wbc.pages_skipped = 0; 557 wbc.pages_skipped = 0;
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages)
389 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 559 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
390 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 560 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
391 /* Wrote less than expected */ 561 /* Wrote less than expected */
392 congestion_wait(WRITE, HZ/10); 562 if (wbc.encountered_congestion || wbc.more_io)
393 if (!wbc.encountered_congestion) 563 congestion_wait(WRITE, HZ/10);
564 else
394 break; 565 break;
395 } 566 }
396 } 567 }
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg)
455 global_page_state(NR_UNSTABLE_NFS) + 626 global_page_state(NR_UNSTABLE_NFS) +
456 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 627 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
457 while (nr_to_write > 0) { 628 while (nr_to_write > 0) {
629 wbc.more_io = 0;
458 wbc.encountered_congestion = 0; 630 wbc.encountered_congestion = 0;
459 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 631 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
460 writeback_inodes(&wbc); 632 writeback_inodes(&wbc);
461 if (wbc.nr_to_write > 0) { 633 if (wbc.nr_to_write > 0) {
462 if (wbc.encountered_congestion) 634 if (wbc.encountered_congestion || wbc.more_io)
463 congestion_wait(WRITE, HZ/10); 635 congestion_wait(WRITE, HZ/10);
464 else 636 else
465 break; /* All the old data is written */ 637 break; /* All the old data is written */
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
580 */ 752 */
581void __init page_writeback_init(void) 753void __init page_writeback_init(void)
582{ 754{
755 int shift;
756
583 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 757 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
584 writeback_set_ratelimit(); 758 writeback_set_ratelimit();
585 register_cpu_notifier(&ratelimit_nb); 759 register_cpu_notifier(&ratelimit_nb);
760
761 shift = calc_period_shift();
762 prop_descriptor_init(&vm_completions, shift);
763 prop_descriptor_init(&vm_dirties, shift);
586} 764}
587 765
588/** 766/**
@@ -672,8 +850,10 @@ retry:
672 850
673 ret = (*writepage)(page, wbc, data); 851 ret = (*writepage)(page, wbc, data);
674 852
675 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) 853 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
676 unlock_page(page); 854 unlock_page(page);
855 ret = 0;
856 }
677 if (ret || (--(wbc->nr_to_write) <= 0)) 857 if (ret || (--(wbc->nr_to_write) <= 0))
678 done = 1; 858 done = 1;
679 if (wbc->nonblocking && bdi_write_congested(bdi)) { 859 if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page)
827 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1007 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
828 if (mapping_cap_account_dirty(mapping)) { 1008 if (mapping_cap_account_dirty(mapping)) {
829 __inc_zone_page_state(page, NR_FILE_DIRTY); 1009 __inc_zone_page_state(page, NR_FILE_DIRTY);
1010 __inc_bdi_stat(mapping->backing_dev_info,
1011 BDI_RECLAIMABLE);
830 task_io_account_write(PAGE_CACHE_SIZE); 1012 task_io_account_write(PAGE_CACHE_SIZE);
831 } 1013 }
832 radix_tree_tag_set(&mapping->page_tree, 1014 radix_tree_tag_set(&mapping->page_tree,
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
859 * If the mapping doesn't provide a set_page_dirty a_op, then 1041 * If the mapping doesn't provide a set_page_dirty a_op, then
860 * just fall through and assume that it wants buffer_heads. 1042 * just fall through and assume that it wants buffer_heads.
861 */ 1043 */
862int fastcall set_page_dirty(struct page *page) 1044static int __set_page_dirty(struct page *page)
863{ 1045{
864 struct address_space *mapping = page_mapping(page); 1046 struct address_space *mapping = page_mapping(page);
865 1047
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page)
877 } 1059 }
878 return 0; 1060 return 0;
879} 1061}
1062
1063int fastcall set_page_dirty(struct page *page)
1064{
1065 int ret = __set_page_dirty(page);
1066 if (ret)
1067 task_dirty_inc(current);
1068 return ret;
1069}
880EXPORT_SYMBOL(set_page_dirty); 1070EXPORT_SYMBOL(set_page_dirty);
881 1071
882/* 1072/*
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page)
961 */ 1151 */
962 if (TestClearPageDirty(page)) { 1152 if (TestClearPageDirty(page)) {
963 dec_zone_page_state(page, NR_FILE_DIRTY); 1153 dec_zone_page_state(page, NR_FILE_DIRTY);
1154 dec_bdi_stat(mapping->backing_dev_info,
1155 BDI_RECLAIMABLE);
964 return 1; 1156 return 1;
965 } 1157 }
966 return 0; 1158 return 0;
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page)
975 int ret; 1167 int ret;
976 1168
977 if (mapping) { 1169 if (mapping) {
1170 struct backing_dev_info *bdi = mapping->backing_dev_info;
978 unsigned long flags; 1171 unsigned long flags;
979 1172
980 write_lock_irqsave(&mapping->tree_lock, flags); 1173 write_lock_irqsave(&mapping->tree_lock, flags);
981 ret = TestClearPageWriteback(page); 1174 ret = TestClearPageWriteback(page);
982 if (ret) 1175 if (ret) {
983 radix_tree_tag_clear(&mapping->page_tree, 1176 radix_tree_tag_clear(&mapping->page_tree,
984 page_index(page), 1177 page_index(page),
985 PAGECACHE_TAG_WRITEBACK); 1178 PAGECACHE_TAG_WRITEBACK);
1179 if (bdi_cap_writeback_dirty(bdi)) {
1180 __dec_bdi_stat(bdi, BDI_WRITEBACK);
1181 __bdi_writeout_inc(bdi);
1182 }
1183 }
986 write_unlock_irqrestore(&mapping->tree_lock, flags); 1184 write_unlock_irqrestore(&mapping->tree_lock, flags);
987 } else { 1185 } else {
988 ret = TestClearPageWriteback(page); 1186 ret = TestClearPageWriteback(page);
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page)
998 int ret; 1196 int ret;
999 1197
1000 if (mapping) { 1198 if (mapping) {
1199 struct backing_dev_info *bdi = mapping->backing_dev_info;
1001 unsigned long flags; 1200 unsigned long flags;
1002 1201
1003 write_lock_irqsave(&mapping->tree_lock, flags); 1202 write_lock_irqsave(&mapping->tree_lock, flags);
1004 ret = TestSetPageWriteback(page); 1203 ret = TestSetPageWriteback(page);
1005 if (!ret) 1204 if (!ret) {
1006 radix_tree_tag_set(&mapping->page_tree, 1205 radix_tree_tag_set(&mapping->page_tree,
1007 page_index(page), 1206 page_index(page),
1008 PAGECACHE_TAG_WRITEBACK); 1207 PAGECACHE_TAG_WRITEBACK);
1208 if (bdi_cap_writeback_dirty(bdi))
1209 __inc_bdi_stat(bdi, BDI_WRITEBACK);
1210 }
1009 if (!PageDirty(page)) 1211 if (!PageDirty(page))
1010 radix_tree_tag_clear(&mapping->page_tree, 1212 radix_tree_tag_clear(&mapping->page_tree,
1011 page_index(page), 1213 page_index(page),
@@ -1022,17 +1224,15 @@ int test_set_page_writeback(struct page *page)
1022EXPORT_SYMBOL(test_set_page_writeback); 1224EXPORT_SYMBOL(test_set_page_writeback);
1023 1225
1024/* 1226/*
1025 * Return true if any of the pages in the mapping are marged with the 1227 * Return true if any of the pages in the mapping are marked with the
1026 * passed tag. 1228 * passed tag.
1027 */ 1229 */
1028int mapping_tagged(struct address_space *mapping, int tag) 1230int mapping_tagged(struct address_space *mapping, int tag)
1029{ 1231{
1030 unsigned long flags;
1031 int ret; 1232 int ret;
1032 1233 rcu_read_lock();
1033 read_lock_irqsave(&mapping->tree_lock, flags);
1034 ret = radix_tree_tagged(&mapping->page_tree, tag); 1234 ret = radix_tree_tagged(&mapping->page_tree, tag);
1035 read_unlock_irqrestore(&mapping->tree_lock, flags); 1235 rcu_read_unlock();
1036 return ret; 1236 return ret;
1037} 1237}
1038EXPORT_SYMBOL(mapping_tagged); 1238EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a8c59571cb7..43f757fcf30f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@
27#include <linux/pagevec.h> 27#include <linux/pagevec.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/oom.h>
30#include <linux/notifier.h> 31#include <linux/notifier.h>
31#include <linux/topology.h> 32#include <linux/topology.h>
32#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -41,24 +42,37 @@
41#include <linux/pfn.h> 42#include <linux/pfn.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
43#include <linux/fault-inject.h> 44#include <linux/fault-inject.h>
45#include <linux/page-isolation.h>
44 46
45#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
46#include <asm/div64.h> 48#include <asm/div64.h>
47#include "internal.h" 49#include "internal.h"
48 50
49/* 51/*
50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 52 * Array of node states.
51 * initializer cleaner
52 */ 53 */
53nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 54nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
54EXPORT_SYMBOL(node_online_map); 55 [N_POSSIBLE] = NODE_MASK_ALL,
55nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 56 [N_ONLINE] = { { [0] = 1UL } },
56EXPORT_SYMBOL(node_possible_map); 57#ifndef CONFIG_NUMA
58 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
59#ifdef CONFIG_HIGHMEM
60 [N_HIGH_MEMORY] = { { [0] = 1UL } },
61#endif
62 [N_CPU] = { { [0] = 1UL } },
63#endif /* NUMA */
64};
65EXPORT_SYMBOL(node_states);
66
57unsigned long totalram_pages __read_mostly; 67unsigned long totalram_pages __read_mostly;
58unsigned long totalreserve_pages __read_mostly; 68unsigned long totalreserve_pages __read_mostly;
59long nr_swap_pages; 69long nr_swap_pages;
60int percpu_pagelist_fraction; 70int percpu_pagelist_fraction;
61 71
72#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
73int pageblock_order __read_mostly;
74#endif
75
62static void __free_pages_ok(struct page *page, unsigned int order); 76static void __free_pages_ok(struct page *page, unsigned int order);
63 77
64/* 78/*
@@ -137,7 +151,7 @@ static unsigned long __meminitdata dma_reserve;
137 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 151 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
138#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 152#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
139 unsigned long __initdata required_kernelcore; 153 unsigned long __initdata required_kernelcore;
140 unsigned long __initdata required_movablecore; 154 static unsigned long __initdata required_movablecore;
141 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 155 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
142 156
143 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 157 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -150,6 +164,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES;
150EXPORT_SYMBOL(nr_node_ids); 164EXPORT_SYMBOL(nr_node_ids);
151#endif 165#endif
152 166
167int page_group_by_mobility_disabled __read_mostly;
168
169static void set_pageblock_migratetype(struct page *page, int migratetype)
170{
171 set_pageblock_flags_group(page, (unsigned long)migratetype,
172 PB_migrate, PB_migrate_end);
173}
174
153#ifdef CONFIG_DEBUG_VM 175#ifdef CONFIG_DEBUG_VM
154static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 176static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
155{ 177{
@@ -293,16 +315,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
293 clear_highpage(page + i); 315 clear_highpage(page + i);
294} 316}
295 317
296/*
297 * function for dealing with page's order in buddy system.
298 * zone->lock is already acquired when we use these.
299 * So, we don't need atomic page->flags operations here.
300 */
301static inline unsigned long page_order(struct page *page)
302{
303 return page_private(page);
304}
305
306static inline void set_page_order(struct page *page, int order) 318static inline void set_page_order(struct page *page, int order)
307{ 319{
308 set_page_private(page, order); 320 set_page_private(page, order);
@@ -404,6 +416,7 @@ static inline void __free_one_page(struct page *page,
404{ 416{
405 unsigned long page_idx; 417 unsigned long page_idx;
406 int order_size = 1 << order; 418 int order_size = 1 << order;
419 int migratetype = get_pageblock_migratetype(page);
407 420
408 if (unlikely(PageCompound(page))) 421 if (unlikely(PageCompound(page)))
409 destroy_compound_page(page, order); 422 destroy_compound_page(page, order);
@@ -416,7 +429,6 @@ static inline void __free_one_page(struct page *page,
416 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 429 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
417 while (order < MAX_ORDER-1) { 430 while (order < MAX_ORDER-1) {
418 unsigned long combined_idx; 431 unsigned long combined_idx;
419 struct free_area *area;
420 struct page *buddy; 432 struct page *buddy;
421 433
422 buddy = __page_find_buddy(page, page_idx, order); 434 buddy = __page_find_buddy(page, page_idx, order);
@@ -424,8 +436,7 @@ static inline void __free_one_page(struct page *page,
424 break; /* Move the buddy up one level. */ 436 break; /* Move the buddy up one level. */
425 437
426 list_del(&buddy->lru); 438 list_del(&buddy->lru);
427 area = zone->free_area + order; 439 zone->free_area[order].nr_free--;
428 area->nr_free--;
429 rmv_page_order(buddy); 440 rmv_page_order(buddy);
430 combined_idx = __find_combined_index(page_idx, order); 441 combined_idx = __find_combined_index(page_idx, order);
431 page = page + (combined_idx - page_idx); 442 page = page + (combined_idx - page_idx);
@@ -433,7 +444,8 @@ static inline void __free_one_page(struct page *page,
433 order++; 444 order++;
434 } 445 }
435 set_page_order(page, order); 446 set_page_order(page, order);
436 list_add(&page->lru, &zone->free_area[order].free_list); 447 list_add(&page->lru,
448 &zone->free_area[order].free_list[migratetype]);
437 zone->free_area[order].nr_free++; 449 zone->free_area[order].nr_free++;
438} 450}
439 451
@@ -478,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count,
478 struct list_head *list, int order) 490 struct list_head *list, int order)
479{ 491{
480 spin_lock(&zone->lock); 492 spin_lock(&zone->lock);
481 zone->all_unreclaimable = 0; 493 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
482 zone->pages_scanned = 0; 494 zone->pages_scanned = 0;
483 while (count--) { 495 while (count--) {
484 struct page *page; 496 struct page *page;
@@ -495,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count,
495static void free_one_page(struct zone *zone, struct page *page, int order) 507static void free_one_page(struct zone *zone, struct page *page, int order)
496{ 508{
497 spin_lock(&zone->lock); 509 spin_lock(&zone->lock);
498 zone->all_unreclaimable = 0; 510 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
499 zone->pages_scanned = 0; 511 zone->pages_scanned = 0;
500 __free_one_page(page, zone, order); 512 __free_one_page(page, zone, order);
501 spin_unlock(&zone->lock); 513 spin_unlock(&zone->lock);
@@ -567,7 +579,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
567 * -- wli 579 * -- wli
568 */ 580 */
569static inline void expand(struct zone *zone, struct page *page, 581static inline void expand(struct zone *zone, struct page *page,
570 int low, int high, struct free_area *area) 582 int low, int high, struct free_area *area,
583 int migratetype)
571{ 584{
572 unsigned long size = 1 << high; 585 unsigned long size = 1 << high;
573 586
@@ -576,7 +589,7 @@ static inline void expand(struct zone *zone, struct page *page,
576 high--; 589 high--;
577 size >>= 1; 590 size >>= 1;
578 VM_BUG_ON(bad_range(zone, &page[size])); 591 VM_BUG_ON(bad_range(zone, &page[size]));
579 list_add(&page[size].lru, &area->free_list); 592 list_add(&page[size].lru, &area->free_list[migratetype]);
580 area->nr_free++; 593 area->nr_free++;
581 set_page_order(&page[size], high); 594 set_page_order(&page[size], high);
582 } 595 }
@@ -628,49 +641,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
628 return 0; 641 return 0;
629} 642}
630 643
631/* 644/*
632 * Do the hard work of removing an element from the buddy allocator. 645 * Go through the free lists for the given migratetype and remove
633 * Call me with the zone->lock already held. 646 * the smallest available page from the freelists
634 */ 647 */
635static struct page *__rmqueue(struct zone *zone, unsigned int order) 648static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
649 int migratetype)
636{ 650{
637 struct free_area * area;
638 unsigned int current_order; 651 unsigned int current_order;
652 struct free_area * area;
639 struct page *page; 653 struct page *page;
640 654
655 /* Find a page of the appropriate size in the preferred list */
641 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 656 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
642 area = zone->free_area + current_order; 657 area = &(zone->free_area[current_order]);
643 if (list_empty(&area->free_list)) 658 if (list_empty(&area->free_list[migratetype]))
644 continue; 659 continue;
645 660
646 page = list_entry(area->free_list.next, struct page, lru); 661 page = list_entry(area->free_list[migratetype].next,
662 struct page, lru);
647 list_del(&page->lru); 663 list_del(&page->lru);
648 rmv_page_order(page); 664 rmv_page_order(page);
649 area->nr_free--; 665 area->nr_free--;
650 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 666 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
651 expand(zone, page, order, current_order, area); 667 expand(zone, page, order, current_order, area, migratetype);
652 return page; 668 return page;
653 } 669 }
654 670
655 return NULL; 671 return NULL;
656} 672}
657 673
674
675/*
676 * This array describes the order lists are fallen back to when
677 * the free lists for the desirable migrate type are depleted
678 */
679static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
680 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
681 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
682 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
683 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
684};
685
686/*
687 * Move the free pages in a range to the free lists of the requested type.
688 * Note that start_page and end_pages are not aligned on a pageblock
689 * boundary. If alignment is required, use move_freepages_block()
690 */
691int move_freepages(struct zone *zone,
692 struct page *start_page, struct page *end_page,
693 int migratetype)
694{
695 struct page *page;
696 unsigned long order;
697 int pages_moved = 0;
698
699#ifndef CONFIG_HOLES_IN_ZONE
700 /*
701 * page_zone is not safe to call in this context when
702 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
703 * anyway as we check zone boundaries in move_freepages_block().
704 * Remove at a later date when no bug reports exist related to
705 * grouping pages by mobility
706 */
707 BUG_ON(page_zone(start_page) != page_zone(end_page));
708#endif
709
710 for (page = start_page; page <= end_page;) {
711 if (!pfn_valid_within(page_to_pfn(page))) {
712 page++;
713 continue;
714 }
715
716 if (!PageBuddy(page)) {
717 page++;
718 continue;
719 }
720
721 order = page_order(page);
722 list_del(&page->lru);
723 list_add(&page->lru,
724 &zone->free_area[order].free_list[migratetype]);
725 page += 1 << order;
726 pages_moved += 1 << order;
727 }
728
729 return pages_moved;
730}
731
732int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
733{
734 unsigned long start_pfn, end_pfn;
735 struct page *start_page, *end_page;
736
737 start_pfn = page_to_pfn(page);
738 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
739 start_page = pfn_to_page(start_pfn);
740 end_page = start_page + pageblock_nr_pages - 1;
741 end_pfn = start_pfn + pageblock_nr_pages - 1;
742
743 /* Do not cross zone boundaries */
744 if (start_pfn < zone->zone_start_pfn)
745 start_page = page;
746 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
747 return 0;
748
749 return move_freepages(zone, start_page, end_page, migratetype);
750}
751
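move_freepages_block() above rounds an arbitrary page down to the start of its pageblock and clamps the block to the zone before handing the range to move_freepages(). The alignment arithmetic is just masking the pfn; a short sketch (pageblock_order 10 is only an example value):

#include <stdio.h>

#define PAGEBLOCK_ORDER    10
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

int main(void)
{
        unsigned long pfn = 1234567;    /* any page frame number */
        unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        /* all pages in [start_pfn, end_pfn] share one migratetype */
        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}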
752/* Return the page with the lowest PFN in the list */
753static struct page *min_page(struct list_head *list)
754{
755 unsigned long min_pfn = -1UL;
 756 struct page *min_page = NULL, *page;
757
758 list_for_each_entry(page, list, lru) {
759 unsigned long pfn = page_to_pfn(page);
760 if (pfn < min_pfn) {
761 min_pfn = pfn;
762 min_page = page;
763 }
764 }
765
766 return min_page;
767}
768
769/* Remove an element from the buddy allocator from the fallback list */
770static struct page *__rmqueue_fallback(struct zone *zone, int order,
771 int start_migratetype)
772{
773 struct free_area * area;
774 int current_order;
775 struct page *page;
776 int migratetype, i;
777
778 /* Find the largest possible block of pages in the other list */
779 for (current_order = MAX_ORDER-1; current_order >= order;
780 --current_order) {
781 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
782 migratetype = fallbacks[start_migratetype][i];
783
784 /* MIGRATE_RESERVE handled later if necessary */
785 if (migratetype == MIGRATE_RESERVE)
786 continue;
787
788 area = &(zone->free_area[current_order]);
789 if (list_empty(&area->free_list[migratetype]))
790 continue;
791
792 /* Bias kernel allocations towards low pfns */
793 page = list_entry(area->free_list[migratetype].next,
794 struct page, lru);
795 if (unlikely(start_migratetype != MIGRATE_MOVABLE))
796 page = min_page(&area->free_list[migratetype]);
797 area->nr_free--;
798
799 /*
800 * If breaking a large block of pages, move all free
801 * pages to the preferred allocation list. If falling
802 * back for a reclaimable kernel allocation, be more
 803 * aggressive about taking ownership of free pages
804 */
805 if (unlikely(current_order >= (pageblock_order >> 1)) ||
806 start_migratetype == MIGRATE_RECLAIMABLE) {
807 unsigned long pages;
808 pages = move_freepages_block(zone, page,
809 start_migratetype);
810
811 /* Claim the whole block if over half of it is free */
812 if (pages >= (1 << (pageblock_order-1)))
813 set_pageblock_migratetype(page,
814 start_migratetype);
815
816 migratetype = start_migratetype;
817 }
818
819 /* Remove the page from the freelists */
820 list_del(&page->lru);
821 rmv_page_order(page);
822 __mod_zone_page_state(zone, NR_FREE_PAGES,
823 -(1UL << order));
824
825 if (current_order == pageblock_order)
826 set_pageblock_migratetype(page,
827 start_migratetype);
828
829 expand(zone, page, order, current_order, area, migratetype);
830 return page;
831 }
832 }
833
834 /* Use MIGRATE_RESERVE rather than fail an allocation */
835 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
836}
837
838/*
839 * Do the hard work of removing an element from the buddy allocator.
840 * Call me with the zone->lock already held.
841 */
842static struct page *__rmqueue(struct zone *zone, unsigned int order,
843 int migratetype)
844{
845 struct page *page;
846
847 page = __rmqueue_smallest(zone, order, migratetype);
848
849 if (unlikely(!page))
850 page = __rmqueue_fallback(zone, order, migratetype);
851
852 return page;
853}
854
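__rmqueue_fallback() above walks the fallbacks[] table when the preferred migratetype's free lists are empty, starting from the largest order so that stealing one big block (and possibly claiming the whole pageblock) keeps the types from mixing at fine granularity. A reduced sketch of that search over a table of free counts (the real code walks free lists and moves pages; the sizes below are made up):

#include <stdio.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };
#define MAX_ORDER 11

/* same shape as the kernel's fallbacks[][] table */
static const int fallbacks[NR_TYPES][NR_TYPES - 1] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
        [RECLAIMABLE] = { UNMOVABLE,   MOVABLE, RESERVE },
        [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
        [RESERVE]     = { RESERVE, RESERVE, RESERVE },
};

/* free_count[order][type] stands in for list_empty() checks on free_list[type] */
static int find_fallback(int free_count[MAX_ORDER][NR_TYPES],
                         int order, int start_type, int *out_order)
{
        for (int cur = MAX_ORDER - 1; cur >= order; cur--) {
                for (int i = 0; i < NR_TYPES - 1; i++) {
                        int type = fallbacks[start_type][i];
                        if (type == RESERVE)
                                continue;       /* MIGRATE_RESERVE handled last */
                        if (free_count[cur][type] > 0) {
                                *out_order = cur;
                                return type;
                        }
                }
        }
        return -1;      /* caller would fall back to MIGRATE_RESERVE */
}

int main(void)
{
        int free_count[MAX_ORDER][NR_TYPES] = { 0 };
        int found_order;

        free_count[9][MOVABLE] = 3;     /* only some large movable blocks left */
        int type = find_fallback(free_count, 2, UNMOVABLE, &found_order);
        printf("steal type %d at order %d\n", type, found_order);
        return 0;
}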
658/* 855/*
659 * Obtain a specified number of elements from the buddy allocator, all under 856 * Obtain a specified number of elements from the buddy allocator, all under
660 * a single hold of the lock, for efficiency. Add them to the supplied list. 857 * a single hold of the lock, for efficiency. Add them to the supplied list.
661 * Returns the number of new pages which were placed at *list. 858 * Returns the number of new pages which were placed at *list.
662 */ 859 */
663static int rmqueue_bulk(struct zone *zone, unsigned int order, 860static int rmqueue_bulk(struct zone *zone, unsigned int order,
664 unsigned long count, struct list_head *list) 861 unsigned long count, struct list_head *list,
862 int migratetype)
665{ 863{
666 int i; 864 int i;
667 865
668 spin_lock(&zone->lock); 866 spin_lock(&zone->lock);
669 for (i = 0; i < count; ++i) { 867 for (i = 0; i < count; ++i) {
670 struct page *page = __rmqueue(zone, order); 868 struct page *page = __rmqueue(zone, order, migratetype);
671 if (unlikely(page == NULL)) 869 if (unlikely(page == NULL))
672 break; 870 break;
673 list_add_tail(&page->lru, list); 871 list_add(&page->lru, list);
872 set_page_private(page, migratetype);
674 } 873 }
675 spin_unlock(&zone->lock); 874 spin_unlock(&zone->lock);
676 return i; 875 return i;
@@ -732,7 +931,7 @@ void mark_free_pages(struct zone *zone)
732{ 931{
733 unsigned long pfn, max_zone_pfn; 932 unsigned long pfn, max_zone_pfn;
734 unsigned long flags; 933 unsigned long flags;
735 int order; 934 int order, t;
736 struct list_head *curr; 935 struct list_head *curr;
737 936
738 if (!zone->spanned_pages) 937 if (!zone->spanned_pages)
@@ -749,17 +948,18 @@ void mark_free_pages(struct zone *zone)
749 swsusp_unset_page_free(page); 948 swsusp_unset_page_free(page);
750 } 949 }
751 950
752 for (order = MAX_ORDER - 1; order >= 0; --order) 951 for_each_migratetype_order(order, t) {
753 list_for_each(curr, &zone->free_area[order].free_list) { 952 list_for_each(curr, &zone->free_area[order].free_list[t]) {
754 unsigned long i; 953 unsigned long i;
755 954
756 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 955 pfn = page_to_pfn(list_entry(curr, struct page, lru));
757 for (i = 0; i < (1UL << order); i++) 956 for (i = 0; i < (1UL << order); i++)
758 swsusp_set_page_free(pfn_to_page(pfn + i)); 957 swsusp_set_page_free(pfn_to_page(pfn + i));
759 } 958 }
760 959 }
761 spin_unlock_irqrestore(&zone->lock, flags); 960 spin_unlock_irqrestore(&zone->lock, flags);
762} 961}
962#endif /* CONFIG_PM */
763 963
764/* 964/*
765 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 965 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
@@ -772,7 +972,25 @@ void drain_local_pages(void)
772 __drain_pages(smp_processor_id()); 972 __drain_pages(smp_processor_id());
773 local_irq_restore(flags); 973 local_irq_restore(flags);
774} 974}
775#endif /* CONFIG_HIBERNATION */ 975
976void smp_drain_local_pages(void *arg)
977{
978 drain_local_pages();
979}
980
981/*
982 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
983 */
984void drain_all_local_pages(void)
985{
986 unsigned long flags;
987
988 local_irq_save(flags);
989 __drain_pages(smp_processor_id());
990 local_irq_restore(flags);
991
992 smp_call_function(smp_drain_local_pages, NULL, 0, 1);
993}
776 994
777/* 995/*
778 * Free a 0-order page 996 * Free a 0-order page
@@ -797,6 +1015,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
797 local_irq_save(flags); 1015 local_irq_save(flags);
798 __count_vm_event(PGFREE); 1016 __count_vm_event(PGFREE);
799 list_add(&page->lru, &pcp->list); 1017 list_add(&page->lru, &pcp->list);
1018 set_page_private(page, get_pageblock_migratetype(page));
800 pcp->count++; 1019 pcp->count++;
801 if (pcp->count >= pcp->high) { 1020 if (pcp->count >= pcp->high) {
802 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1021 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -846,6 +1065,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist,
846 struct page *page; 1065 struct page *page;
847 int cold = !!(gfp_flags & __GFP_COLD); 1066 int cold = !!(gfp_flags & __GFP_COLD);
848 int cpu; 1067 int cpu;
1068 int migratetype = allocflags_to_migratetype(gfp_flags);
849 1069
850again: 1070again:
851 cpu = get_cpu(); 1071 cpu = get_cpu();
@@ -856,16 +1076,28 @@ again:
856 local_irq_save(flags); 1076 local_irq_save(flags);
857 if (!pcp->count) { 1077 if (!pcp->count) {
858 pcp->count = rmqueue_bulk(zone, 0, 1078 pcp->count = rmqueue_bulk(zone, 0,
859 pcp->batch, &pcp->list); 1079 pcp->batch, &pcp->list, migratetype);
860 if (unlikely(!pcp->count)) 1080 if (unlikely(!pcp->count))
861 goto failed; 1081 goto failed;
862 } 1082 }
863 page = list_entry(pcp->list.next, struct page, lru); 1083
1084 /* Find a page of the appropriate migrate type */
1085 list_for_each_entry(page, &pcp->list, lru)
1086 if (page_private(page) == migratetype)
1087 break;
1088
1089 /* Allocate more to the pcp list if necessary */
1090 if (unlikely(&page->lru == &pcp->list)) {
1091 pcp->count += rmqueue_bulk(zone, 0,
1092 pcp->batch, &pcp->list, migratetype);
1093 page = list_entry(pcp->list.next, struct page, lru);
1094 }
1095
864 list_del(&page->lru); 1096 list_del(&page->lru);
865 pcp->count--; 1097 pcp->count--;
866 } else { 1098 } else {
867 spin_lock_irqsave(&zone->lock, flags); 1099 spin_lock_irqsave(&zone->lock, flags);
868 page = __rmqueue(zone, order); 1100 page = __rmqueue(zone, order, migratetype);
869 spin_unlock(&zone->lock); 1101 spin_unlock(&zone->lock);
870 if (!page) 1102 if (!page)
871 goto failed; 1103 goto failed;
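buffered_rmqueue() now tags each page on the per-cpu list with its migratetype (stashed in page->private by rmqueue_bulk()) and searches the list for a match before refilling it. A reduced sketch of that scan over a plain singly linked list (no kernel list_head here; the structure and names are purely illustrative):

#include <stdio.h>
#include <stddef.h>

struct fake_page {
        int migratetype;        /* what the kernel keeps in page_private() */
        struct fake_page *next;
};

/* Return the first page of the wanted type, or NULL to signal "refill the pcp list". */
static struct fake_page *pick_page(struct fake_page *pcp_list, int migratetype)
{
        for (struct fake_page *p = pcp_list; p != NULL; p = p->next)
                if (p->migratetype == migratetype)
                        return p;
        return NULL;
}

int main(void)
{
        struct fake_page c = { 2, NULL }, b = { 0, &c }, a = { 1, &b };

        struct fake_page *p = pick_page(&a, 2);
        printf("found type %d\n", p ? p->migratetype : -1);
        printf("type 3: %s\n", pick_page(&a, 3) ? "hit" : "miss, refill pcp list");
        return 0;
}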
@@ -1032,7 +1264,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1032 * 1264 *
1033 * If the zonelist cache is present in the passed in zonelist, then 1265 * If the zonelist cache is present in the passed in zonelist, then
1034 * returns a pointer to the allowed node mask (either the current 1266 * returns a pointer to the allowed node mask (either the current
1035 * tasks mems_allowed, or node_online_map.) 1267 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1036 * 1268 *
1037 * If the zonelist cache is not available for this zonelist, does 1269 * If the zonelist cache is not available for this zonelist, does
1038 * nothing and returns NULL. 1270 * nothing and returns NULL.
@@ -1061,7 +1293,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1061 1293
1062 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1294 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1063 &cpuset_current_mems_allowed : 1295 &cpuset_current_mems_allowed :
1064 &node_online_map; 1296 &node_states[N_HIGH_MEMORY];
1065 return allowednodes; 1297 return allowednodes;
1066} 1298}
1067 1299
@@ -1183,9 +1415,6 @@ zonelist_scan:
1183 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1415 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1184 continue; 1416 continue;
1185 zone = *z; 1417 zone = *z;
1186 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1187 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1188 break;
1189 if ((alloc_flags & ALLOC_CPUSET) && 1418 if ((alloc_flags & ALLOC_CPUSET) &&
1190 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1419 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1191 goto try_next_zone; 1420 goto try_next_zone;
@@ -1254,7 +1483,10 @@ restart:
1254 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1483 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1255 1484
1256 if (unlikely(*z == NULL)) { 1485 if (unlikely(*z == NULL)) {
1257 /* Should this ever happen?? */ 1486 /*
1487 * Happens if we have an empty zonelist as a result of
1488 * GFP_THISNODE being used on a memoryless node
1489 */
1258 return NULL; 1490 return NULL;
1259 } 1491 }
1260 1492
@@ -1346,12 +1578,20 @@ nofail_alloc:
1346 1578
1347 cond_resched(); 1579 cond_resched();
1348 1580
1581 if (order != 0)
1582 drain_all_local_pages();
1583
1349 if (likely(did_some_progress)) { 1584 if (likely(did_some_progress)) {
1350 page = get_page_from_freelist(gfp_mask, order, 1585 page = get_page_from_freelist(gfp_mask, order,
1351 zonelist, alloc_flags); 1586 zonelist, alloc_flags);
1352 if (page) 1587 if (page)
1353 goto got_pg; 1588 goto got_pg;
1354 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1589 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1590 if (!try_set_zone_oom(zonelist)) {
1591 schedule_timeout_uninterruptible(1);
1592 goto restart;
1593 }
1594
1355 /* 1595 /*
1356 * Go through the zonelist yet one more time, keep 1596 * Go through the zonelist yet one more time, keep
1357 * very high watermark here, this is only to catch 1597 * very high watermark here, this is only to catch
@@ -1360,14 +1600,19 @@ nofail_alloc:
1360 */ 1600 */
1361 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1601 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1362 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1602 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1363 if (page) 1603 if (page) {
1604 clear_zonelist_oom(zonelist);
1364 goto got_pg; 1605 goto got_pg;
1606 }
1365 1607
1366 /* The OOM killer will not help higher order allocs so fail */ 1608 /* The OOM killer will not help higher order allocs so fail */
1367 if (order > PAGE_ALLOC_COSTLY_ORDER) 1609 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1610 clear_zonelist_oom(zonelist);
1368 goto nopage; 1611 goto nopage;
1612 }
1369 1613
1370 out_of_memory(zonelist, gfp_mask, order); 1614 out_of_memory(zonelist, gfp_mask, order);
1615 clear_zonelist_oom(zonelist);
1371 goto restart; 1616 goto restart;
1372 } 1617 }
1373 1618
@@ -1616,7 +1861,7 @@ void show_free_areas(void)
1616 K(zone_page_state(zone, NR_INACTIVE)), 1861 K(zone_page_state(zone, NR_INACTIVE)),
1617 K(zone->present_pages), 1862 K(zone->present_pages),
1618 zone->pages_scanned, 1863 zone->pages_scanned,
1619 (zone->all_unreclaimable ? "yes" : "no") 1864 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
1620 ); 1865 );
1621 printk("lowmem_reserve[]:"); 1866 printk("lowmem_reserve[]:");
1622 for (i = 0; i < MAX_NR_ZONES; i++) 1867 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -1794,7 +2039,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
1794 return node; 2039 return node;
1795 } 2040 }
1796 2041
1797 for_each_online_node(n) { 2042 for_each_node_state(n, N_HIGH_MEMORY) {
1798 cpumask_t tmp; 2043 cpumask_t tmp;
1799 2044
1800 /* Don't want a node to appear more than once */ 2045 /* Don't want a node to appear more than once */
@@ -1850,6 +2095,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1850} 2095}
1851 2096
1852/* 2097/*
2098 * Build gfp_thisnode zonelists
2099 */
2100static void build_thisnode_zonelists(pg_data_t *pgdat)
2101{
2102 enum zone_type i;
2103 int j;
2104 struct zonelist *zonelist;
2105
2106 for (i = 0; i < MAX_NR_ZONES; i++) {
2107 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
2108 j = build_zonelists_node(pgdat, zonelist, 0, i);
2109 zonelist->zones[j] = NULL;
2110 }
2111}
2112
2113/*
1853 * Build zonelists ordered by zone and nodes within zones. 2114 * Build zonelists ordered by zone and nodes within zones.
1854 * This results in conserving DMA zone[s] until all Normal memory is 2115 * This results in conserving DMA zone[s] until all Normal memory is
1855 * exhausted, but results in overflowing to remote node while memory 2116 * exhausted, but results in overflowing to remote node while memory
@@ -1915,7 +2176,8 @@ static int default_zonelist_order(void)
1915 * If there is a node whose DMA/DMA32 memory is very big area on 2176 * If there is a node whose DMA/DMA32 memory is very big area on
1916 * local memory, NODE_ORDER may be suitable. 2177 * local memory, NODE_ORDER may be suitable.
1917 */ 2178 */
1918 average_size = total_size / (num_online_nodes() + 1); 2179 average_size = total_size /
2180 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
1919 for_each_online_node(nid) { 2181 for_each_online_node(nid) {
1920 low_kmem_size = 0; 2182 low_kmem_size = 0;
1921 total_size = 0; 2183 total_size = 0;
@@ -1953,7 +2215,7 @@ static void build_zonelists(pg_data_t *pgdat)
1953 int order = current_zonelist_order; 2215 int order = current_zonelist_order;
1954 2216
1955 /* initialize zonelists */ 2217 /* initialize zonelists */
1956 for (i = 0; i < MAX_NR_ZONES; i++) { 2218 for (i = 0; i < MAX_ZONELISTS; i++) {
1957 zonelist = pgdat->node_zonelists + i; 2219 zonelist = pgdat->node_zonelists + i;
1958 zonelist->zones[0] = NULL; 2220 zonelist->zones[0] = NULL;
1959 } 2221 }
@@ -1998,6 +2260,8 @@ static void build_zonelists(pg_data_t *pgdat)
1998 /* calculate node order -- i.e., DMA last! */ 2260 /* calculate node order -- i.e., DMA last! */
1999 build_zonelists_in_zone_order(pgdat, j); 2261 build_zonelists_in_zone_order(pgdat, j);
2000 } 2262 }
2263
2264 build_thisnode_zonelists(pgdat);
2001} 2265}
2002 2266
2003/* Construct the zonelist performance cache - see further mmzone.h */ 2267/* Construct the zonelist performance cache - see further mmzone.h */
@@ -2078,8 +2342,10 @@ static int __build_all_zonelists(void *dummy)
2078 int nid; 2342 int nid;
2079 2343
2080 for_each_online_node(nid) { 2344 for_each_online_node(nid) {
2081 build_zonelists(NODE_DATA(nid)); 2345 pg_data_t *pgdat = NODE_DATA(nid);
2082 build_zonelist_cache(NODE_DATA(nid)); 2346
2347 build_zonelists(pgdat);
2348 build_zonelist_cache(pgdat);
2083 } 2349 }
2084 return 0; 2350 return 0;
2085} 2351}
@@ -2098,9 +2364,23 @@ void build_all_zonelists(void)
2098 /* cpuset refresh routine should be here */ 2364 /* cpuset refresh routine should be here */
2099 } 2365 }
2100 vm_total_pages = nr_free_pagecache_pages(); 2366 vm_total_pages = nr_free_pagecache_pages();
2101 printk("Built %i zonelists in %s order. Total pages: %ld\n", 2367 /*
2368 * Disable grouping by mobility if the number of pages in the
2369 * system is too low to allow the mechanism to work. It would be
2370 * more accurate, but expensive to check per-zone. This check is
2371 * made on memory-hotadd so a system can start with mobility
2372 * disabled and enable it later
2373 */
2374 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2375 page_group_by_mobility_disabled = 1;
2376 else
2377 page_group_by_mobility_disabled = 0;
2378
2379 printk("Built %i zonelists in %s order, mobility grouping %s. "
2380 "Total pages: %ld\n",
2102 num_online_nodes(), 2381 num_online_nodes(),
2103 zonelist_order_name[current_zonelist_order], 2382 zonelist_order_name[current_zonelist_order],
2383 page_group_by_mobility_disabled ? "off" : "on",
2104 vm_total_pages); 2384 vm_total_pages);
2105#ifdef CONFIG_NUMA 2385#ifdef CONFIG_NUMA
2106 printk("Policy zone: %s\n", zone_names[policy_zone]); 2386 printk("Policy zone: %s\n", zone_names[policy_zone]);
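The new check in build_all_zonelists() turns page grouping by mobility off when there is less than one pageblock's worth of memory per migratetype, since the mechanism cannot usefully segregate anything smaller. Rough numbers, assuming a 512-page pageblock (2MB with 4K pages) and five migratetypes; both values depend on the configuration:

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* e.g. hugepage-sized blocks on x86 */
        unsigned long migrate_types = 5;        /* unmovable, reclaimable, movable,
                                                   reserve, isolate (assumed count) */
        unsigned long cutoff = pageblock_nr_pages * migrate_types;

        /* systems with fewer managed pages than this run with grouping disabled */
        printf("cutoff = %lu pages (%lu KB with 4K pages)\n", cutoff, cutoff * 4);
        return 0;
}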
@@ -2176,6 +2456,61 @@ static inline unsigned long wait_table_bits(unsigned long size)
2176#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2456#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2177 2457
2178/* 2458/*
2459 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2460 * of blocks reserved is based on zone->pages_min. The memory within the
2461 * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2462 * higher will lead to a bigger reserve which will get freed as contiguous
2463 * blocks as reclaim kicks in
2464 */
2465static void setup_zone_migrate_reserve(struct zone *zone)
2466{
2467 unsigned long start_pfn, pfn, end_pfn;
2468 struct page *page;
2469 unsigned long reserve, block_migratetype;
2470
2471 /* Get the start pfn, end pfn and the number of blocks to reserve */
2472 start_pfn = zone->zone_start_pfn;
2473 end_pfn = start_pfn + zone->spanned_pages;
2474 reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
2475 pageblock_order;
2476
2477 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2478 if (!pfn_valid(pfn))
2479 continue;
2480 page = pfn_to_page(pfn);
2481
2482 /* Blocks with reserved pages will never free, skip them. */
2483 if (PageReserved(page))
2484 continue;
2485
2486 block_migratetype = get_pageblock_migratetype(page);
2487
2488 /* If this block is reserved, account for it */
2489 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2490 reserve--;
2491 continue;
2492 }
2493
2494 /* Suitable for reserving if this block is movable */
2495 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2496 set_pageblock_migratetype(page, MIGRATE_RESERVE);
2497 move_freepages_block(zone, page, MIGRATE_RESERVE);
2498 reserve--;
2499 continue;
2500 }
2501
2502 /*
 2503 * If the reserve is met and this is a previously reserved block,
2504 * take it back
2505 */
2506 if (block_migratetype == MIGRATE_RESERVE) {
2507 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2508 move_freepages_block(zone, page, MIGRATE_MOVABLE);
2509 }
2510 }
2511}
2512
2513/*
2179 * Initially all pages are reserved - free ones are freed 2514 * Initially all pages are reserved - free ones are freed
2180 * up by free_all_bootmem() once the early boot process is 2515 * up by free_all_bootmem() once the early boot process is
2181 * done. Non-atomic initialization, single-pass. 2516 * done. Non-atomic initialization, single-pass.
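setup_zone_migrate_reserve() above sizes the MIGRATE_RESERVE area from zone->pages_min: the watermark is rounded up to whole pageblocks and converted to a block count, so raising min_free_kbytes directly grows the number of reserved blocks. A quick arithmetic sketch (the watermark and block size are example values):

#include <stdio.h>

#define PAGEBLOCK_ORDER    10
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

/* round x up to a multiple of y, like the kernel's roundup() */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long pages_min = 1440; /* example zone watermark, in pages */
        unsigned long reserve = ROUNDUP(pages_min, PAGEBLOCK_NR_PAGES)
                                        >> PAGEBLOCK_ORDER;

        printf("pages_min=%lu -> %lu reserved pageblock(s)\n", pages_min, reserve);
        return 0;
}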
@@ -2204,6 +2539,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2204 init_page_count(page); 2539 init_page_count(page);
2205 reset_page_mapcount(page); 2540 reset_page_mapcount(page);
2206 SetPageReserved(page); 2541 SetPageReserved(page);
2542
2543 /*
2544 * Mark the block movable so that blocks are reserved for
2545 * movable at startup. This will force kernel allocations
2546 * to reserve their blocks rather than leaking throughout
2547 * the address space during boot when many long-lived
2548 * kernel allocations are made. Later some blocks near
2549 * the start are marked MIGRATE_RESERVE by
2550 * setup_zone_migrate_reserve()
2551 */
2552 if ((pfn & (pageblock_nr_pages-1)))
2553 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2554
2207 INIT_LIST_HEAD(&page->lru); 2555 INIT_LIST_HEAD(&page->lru);
2208#ifdef WANT_PAGE_VIRTUAL 2556#ifdef WANT_PAGE_VIRTUAL
2209 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 2557 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
@@ -2216,9 +2564,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2216static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2564static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
2217 struct zone *zone, unsigned long size) 2565 struct zone *zone, unsigned long size)
2218{ 2566{
2219 int order; 2567 int order, t;
2220 for (order = 0; order < MAX_ORDER ; order++) { 2568 for_each_migratetype_order(order, t) {
2221 INIT_LIST_HEAD(&zone->free_area[order].free_list); 2569 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
2222 zone->free_area[order].nr_free = 0; 2570 zone->free_area[order].nr_free = 0;
2223 } 2571 }
2224} 2572}
@@ -2324,6 +2672,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS];
2324static int __cpuinit process_zones(int cpu) 2672static int __cpuinit process_zones(int cpu)
2325{ 2673{
2326 struct zone *zone, *dzone; 2674 struct zone *zone, *dzone;
2675 int node = cpu_to_node(cpu);
2676
2677 node_set_state(node, N_CPU); /* this node has a cpu */
2327 2678
2328 for_each_zone(zone) { 2679 for_each_zone(zone) {
2329 2680
@@ -2331,7 +2682,7 @@ static int __cpuinit process_zones(int cpu)
2331 continue; 2682 continue;
2332 2683
2333 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2684 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2334 GFP_KERNEL, cpu_to_node(cpu)); 2685 GFP_KERNEL, node);
2335 if (!zone_pcp(zone, cpu)) 2686 if (!zone_pcp(zone, cpu))
2336 goto bad; 2687 goto bad;
2337 2688
@@ -2444,7 +2795,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2444 * To use this new node's memory, further consideration will be 2795 * To use this new node's memory, further consideration will be
2445 * necessary. 2796 * necessary.
2446 */ 2797 */
2447 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2798 zone->wait_table = vmalloc(alloc_size);
2448 } 2799 }
2449 if (!zone->wait_table) 2800 if (!zone->wait_table)
2450 return -ENOMEM; 2801 return -ENOMEM;
@@ -2680,10 +3031,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
2680 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3031 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2681 } 3032 }
2682 3033
2683 if (*start_pfn == -1UL) { 3034 if (*start_pfn == -1UL)
2684 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2685 *start_pfn = 0; 3035 *start_pfn = 0;
2686 }
2687 3036
2688 /* Push the node boundaries out if requested */ 3037 /* Push the node boundaries out if requested */
2689 account_node_boundary(nid, start_pfn, end_pfn); 3038 account_node_boundary(nid, start_pfn, end_pfn);
@@ -2901,6 +3250,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
2901 realtotalpages); 3250 realtotalpages);
2902} 3251}
2903 3252
3253#ifndef CONFIG_SPARSEMEM
3254/*
3255 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to an
3256 * unsigned long. Start by making sure zonesize is a multiple of
3257 * pageblock_nr_pages by rounding up, then use one NR_PAGEBLOCK_BITS worth of
3258 * bits per pageblock, round what is now in bits up to the nearest long in
3259 * bits, and finally return the result in bytes.
3260 */
3261static unsigned long __init usemap_size(unsigned long zonesize)
3262{
3263 unsigned long usemapsize;
3264
3265 usemapsize = roundup(zonesize, pageblock_nr_pages);
3266 usemapsize = usemapsize >> pageblock_order;
3267 usemapsize *= NR_PAGEBLOCK_BITS;
3268 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3269
3270 return usemapsize / 8;
3271}
3272
3273static void __init setup_usemap(struct pglist_data *pgdat,
3274 struct zone *zone, unsigned long zonesize)
3275{
3276 unsigned long usemapsize = usemap_size(zonesize);
3277 zone->pageblock_flags = NULL;
3278 if (usemapsize) {
3279 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3280 memset(zone->pageblock_flags, 0, usemapsize);
3281 }
3282}
3283#else
3284static void inline setup_usemap(struct pglist_data *pgdat,
3285 struct zone *zone, unsigned long zonesize) {}
3286#endif /* CONFIG_SPARSEMEM */
3287
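usemap_size() boils down to a handful of integer operations. A runnable userspace sketch of the same steps, assuming pageblock_order 9 and three bits per pageblock; the real bit count comes from pageblock-flags.h, so treat these constants as assumptions.

#include <stdio.h>

#define PAGEBLOCK_ORDER    9UL			/* assumed */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS  3UL			/* assumed: enough for the migrate types */

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

/* Mirrors usemap_size() above: pageblocks -> bits -> whole longs -> bytes. */
static unsigned long usemap_size(unsigned long zonesize)
{
	unsigned long usemapsize;

	usemapsize = roundup(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;			/* number of pageblocks */
	usemapsize *= NR_PAGEBLOCK_BITS;		/* bits needed */
	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
	return usemapsize / 8;				/* bytes */
}

int main(void)
{
	unsigned long zonesize = 262144;		/* a 1 GiB zone of 4 KiB pages */

	printf("zone of %lu pages -> %lu byte usemap\n",
	       zonesize, usemap_size(zonesize));
	return 0;
}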
3288#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3289/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
3290static inline void __init set_pageblock_order(unsigned int order)
3291{
3292 /* Check that pageblock_nr_pages has not already been setup */
3293 if (pageblock_order)
3294 return;
3295
3296 /*
3297 * Assume the largest contiguous order of interest is a huge page.
3298 * This value may be variable depending on boot parameters on IA64
3299 */
3300 pageblock_order = order;
3301}
3302#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3303
3304/* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */
3305#define set_pageblock_order(x) do {} while (0)
3306
3307#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3308
2904/* 3309/*
2905 * Set up the zone data structures: 3310 * Set up the zone data structures:
2906 * - mark all pages reserved 3311 * - mark all pages reserved
@@ -2977,10 +3382,12 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2977 zone->nr_scan_active = 0; 3382 zone->nr_scan_active = 0;
2978 zone->nr_scan_inactive = 0; 3383 zone->nr_scan_inactive = 0;
2979 zap_zone_vm_stats(zone); 3384 zap_zone_vm_stats(zone);
2980 atomic_set(&zone->reclaim_in_progress, 0); 3385 zone->flags = 0;
2981 if (!size) 3386 if (!size)
2982 continue; 3387 continue;
2983 3388
3389 set_pageblock_order(HUGETLB_PAGE_ORDER);
3390 setup_usemap(pgdat, zone, size);
2984 ret = init_currently_empty_zone(zone, zone_start_pfn, 3391 ret = init_currently_empty_zone(zone, zone_start_pfn,
2985 size, MEMMAP_EARLY); 3392 size, MEMMAP_EARLY);
2986 BUG_ON(ret); 3393 BUG_ON(ret);
@@ -3234,16 +3641,24 @@ unsigned long __init find_max_pfn_with_active_regions(void)
3234 return max_pfn; 3641 return max_pfn;
3235} 3642}
3236 3643
3237unsigned long __init early_calculate_totalpages(void) 3644/*
3645 * early_calculate_totalpages()
3646 * Sum pages in active regions for movable zone.
3647 * Populate N_HIGH_MEMORY for calculating usable_nodes.
3648 */
3649static unsigned long __init early_calculate_totalpages(void)
3238{ 3650{
3239 int i; 3651 int i;
3240 unsigned long totalpages = 0; 3652 unsigned long totalpages = 0;
3241 3653
3242 for (i = 0; i < nr_nodemap_entries; i++) 3654 for (i = 0; i < nr_nodemap_entries; i++) {
3243 totalpages += early_node_map[i].end_pfn - 3655 unsigned long pages = early_node_map[i].end_pfn -
3244 early_node_map[i].start_pfn; 3656 early_node_map[i].start_pfn;
3245 3657 totalpages += pages;
3246 return totalpages; 3658 if (pages)
3659 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
3660 }
3661 return totalpages;
3247} 3662}
3248 3663
3249/* 3664/*
@@ -3257,7 +3672,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3257 int i, nid; 3672 int i, nid;
3258 unsigned long usable_startpfn; 3673 unsigned long usable_startpfn;
3259 unsigned long kernelcore_node, kernelcore_remaining; 3674 unsigned long kernelcore_node, kernelcore_remaining;
3260 int usable_nodes = num_online_nodes(); 3675 unsigned long totalpages = early_calculate_totalpages();
3676 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3261 3677
3262 /* 3678 /*
3263 * If movablecore was specified, calculate what size of 3679 * If movablecore was specified, calculate what size of
@@ -3268,7 +3684,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3268 * what movablecore would have allowed. 3684 * what movablecore would have allowed.
3269 */ 3685 */
3270 if (required_movablecore) { 3686 if (required_movablecore) {
3271 unsigned long totalpages = early_calculate_totalpages();
3272 unsigned long corepages; 3687 unsigned long corepages;
3273 3688
3274 /* 3689 /*
@@ -3293,7 +3708,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3293restart: 3708restart:
3294 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3709 /* Spread kernelcore memory as evenly as possible throughout nodes */
3295 kernelcore_node = required_kernelcore / usable_nodes; 3710 kernelcore_node = required_kernelcore / usable_nodes;
3296 for_each_online_node(nid) { 3711 for_each_node_state(nid, N_HIGH_MEMORY) {
3297 /* 3712 /*
3298 * Recalculate kernelcore_node if the division per node 3713 * Recalculate kernelcore_node if the division per node
3299 * now exceeds what is necessary to satisfy the requested 3714 * now exceeds what is necessary to satisfy the requested
@@ -3385,6 +3800,20 @@ restart:
3385 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3800 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
3386} 3801}
3387 3802
3803/* Any regular memory on that node ? */
3804static void check_for_regular_memory(pg_data_t *pgdat)
3805{
3806#ifdef CONFIG_HIGHMEM
3807 enum zone_type zone_type;
3808
3809 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
3810 struct zone *zone = &pgdat->node_zones[zone_type];
3811 if (zone->present_pages)
3812 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
3813 }
3814#endif
3815}
3816
3388/** 3817/**
3389 * free_area_init_nodes - Initialise all pg_data_t and zone data 3818 * free_area_init_nodes - Initialise all pg_data_t and zone data
3390 * @max_zone_pfn: an array of max PFNs for each zone 3819 * @max_zone_pfn: an array of max PFNs for each zone
@@ -3459,6 +3888,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3459 pg_data_t *pgdat = NODE_DATA(nid); 3888 pg_data_t *pgdat = NODE_DATA(nid);
3460 free_area_init_node(nid, pgdat, NULL, 3889 free_area_init_node(nid, pgdat, NULL,
3461 find_min_pfn_for_node(nid), NULL); 3890 find_min_pfn_for_node(nid), NULL);
3891
3892 /* Any memory on that node */
3893 if (pgdat->node_present_pages)
3894 node_set_state(nid, N_HIGH_MEMORY);
3895 check_for_regular_memory(pgdat);
3462 } 3896 }
3463} 3897}
3464 3898
@@ -3673,6 +4107,7 @@ void setup_per_zone_pages_min(void)
3673 4107
3674 zone->pages_low = zone->pages_min + (tmp >> 2); 4108 zone->pages_low = zone->pages_min + (tmp >> 2);
3675 zone->pages_high = zone->pages_min + (tmp >> 1); 4109 zone->pages_high = zone->pages_min + (tmp >> 1);
4110 setup_zone_migrate_reserve(zone);
3676 spin_unlock_irqrestore(&zone->lru_lock, flags); 4111 spin_unlock_irqrestore(&zone->lru_lock, flags);
3677 } 4112 }
3678 4113
@@ -3934,4 +4369,169 @@ EXPORT_SYMBOL(pfn_to_page);
3934EXPORT_SYMBOL(page_to_pfn); 4369EXPORT_SYMBOL(page_to_pfn);
3935#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4370#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3936 4371
4372/* Return a pointer to the bitmap storing bits affecting a block of pages */
4373static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4374 unsigned long pfn)
4375{
4376#ifdef CONFIG_SPARSEMEM
4377 return __pfn_to_section(pfn)->pageblock_flags;
4378#else
4379 return zone->pageblock_flags;
4380#endif /* CONFIG_SPARSEMEM */
4381}
4382
4383static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4384{
4385#ifdef CONFIG_SPARSEMEM
4386 pfn &= (PAGES_PER_SECTION-1);
4387 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4388#else
4389 pfn = pfn - zone->zone_start_pfn;
4390 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4391#endif /* CONFIG_SPARSEMEM */
4392}
4393
4394/**
4395 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
4396 * @page: The page within the block of interest
4397 * @start_bitidx: The first bit of interest to retrieve
4398 * @end_bitidx: The last bit of interest
4399 * returns pageblock_bits flags
4400 */
4401unsigned long get_pageblock_flags_group(struct page *page,
4402 int start_bitidx, int end_bitidx)
4403{
4404 struct zone *zone;
4405 unsigned long *bitmap;
4406 unsigned long pfn, bitidx;
4407 unsigned long flags = 0;
4408 unsigned long value = 1;
4409
4410 zone = page_zone(page);
4411 pfn = page_to_pfn(page);
4412 bitmap = get_pageblock_bitmap(zone, pfn);
4413 bitidx = pfn_to_bitidx(zone, pfn);
4414
4415 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4416 if (test_bit(bitidx + start_bitidx, bitmap))
4417 flags |= value;
4418
4419 return flags;
4420}
4421
4422/**
4423 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
4424 * @page: The page within the block of interest
4425 * @start_bitidx: The first bit of interest
4426 * @end_bitidx: The last bit of interest
4427 * @flags: The flags to set
4428 */
4429void set_pageblock_flags_group(struct page *page, unsigned long flags,
4430 int start_bitidx, int end_bitidx)
4431{
4432 struct zone *zone;
4433 unsigned long *bitmap;
4434 unsigned long pfn, bitidx;
4435 unsigned long value = 1;
4436
4437 zone = page_zone(page);
4438 pfn = page_to_pfn(page);
4439 bitmap = get_pageblock_bitmap(zone, pfn);
4440 bitidx = pfn_to_bitidx(zone, pfn);
4441
4442 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4443 if (flags & value)
4444 __set_bit(bitidx + start_bitidx, bitmap);
4445 else
4446 __clear_bit(bitidx + start_bitidx, bitmap);
4447}
4448
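Both helpers above treat the usemap as NR_PAGEBLOCK_BITS consecutive bits per pageblock, indexed from the start of the zone (or section). A userspace sketch of the same indexing and bit walk, using a local array in place of zone->pageblock_flags; pageblock_order and the bit count are assumptions.

#include <stdio.h>

#define PAGEBLOCK_ORDER    9UL
#define NR_PAGEBLOCK_BITS  3
#define BITS_PER_LONG      ((int)(8 * sizeof(unsigned long)))

static unsigned long pageblock_flags[64];	/* stand-in for zone->pageblock_flags */
static unsigned long zone_start_pfn;		/* stand-in for zone->zone_start_pfn */

/* Mirrors the non-SPARSEMEM pfn_to_bitidx(): each block owns
 * NR_PAGEBLOCK_BITS consecutive bits, counted from the zone start. */
static unsigned long pfn_to_bitidx(unsigned long pfn)
{
	pfn -= zone_start_pfn;
	return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

/* Same walk as set_pageblock_flags_group(): one bit per loop iteration. */
static void set_flags(unsigned long pfn, unsigned long flags, int start, int end)
{
	unsigned long bitidx = pfn_to_bitidx(pfn);
	unsigned long value = 1;

	for (; start <= end; start++, value <<= 1) {
		unsigned long bit = bitidx + start;

		if (flags & value)
			pageblock_flags[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
		else
			pageblock_flags[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
	}
}

/* Same walk as get_pageblock_flags_group(). */
static unsigned long get_flags(unsigned long pfn, int start, int end)
{
	unsigned long bitidx = pfn_to_bitidx(pfn);
	unsigned long flags = 0, value = 1;

	for (; start <= end; start++, value <<= 1) {
		unsigned long bit = bitidx + start;

		if (pageblock_flags[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)))
			flags |= value;
	}
	return flags;
}

int main(void)
{
	unsigned long pfn = 3 * (1UL << PAGEBLOCK_ORDER) + 17;	/* any pfn in pageblock 3 */

	set_flags(pfn, 2, 0, NR_PAGEBLOCK_BITS - 1);	/* store an arbitrary 3-bit value */
	printf("value read back for pageblock 3: %lu\n",
	       get_flags(pfn, 0, NR_PAGEBLOCK_BITS - 1));
	return 0;
}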
4449/*
4450 * This is designed as a helper function; please see page_isolation.c too.
4451 * It sets/clears a pageblock's type to MIGRATE_ISOLATE.
4452 * The page allocator never allocates memory from an ISOLATE block.
4453 */
4454
4455int set_migratetype_isolate(struct page *page)
4456{
4457 struct zone *zone;
4458 unsigned long flags;
4459 int ret = -EBUSY;
4460
4461 zone = page_zone(page);
4462 spin_lock_irqsave(&zone->lock, flags);
4463 /*
4464 * In future, more migrate types will be able to be isolation target.
4465 */
4466 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
4467 goto out;
4468 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4469 move_freepages_block(zone, page, MIGRATE_ISOLATE);
4470 ret = 0;
4471out:
4472 spin_unlock_irqrestore(&zone->lock, flags);
4473 if (!ret)
4474 drain_all_local_pages();
4475 return ret;
4476}
3937 4477
4478void unset_migratetype_isolate(struct page *page)
4479{
4480 struct zone *zone;
4481 unsigned long flags;
4482 zone = page_zone(page);
4483 spin_lock_irqsave(&zone->lock, flags);
4484 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
4485 goto out;
4486 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4487 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4488out:
4489 spin_unlock_irqrestore(&zone->lock, flags);
4490}
4491
4492#ifdef CONFIG_MEMORY_HOTREMOVE
4493/*
4494 * All pages in the range must be isolated before calling this.
4495 */
4496void
4497__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4498{
4499 struct page *page;
4500 struct zone *zone;
4501 int order, i;
4502 unsigned long pfn;
4503 unsigned long flags;
4504 /* find the first valid pfn */
4505 for (pfn = start_pfn; pfn < end_pfn; pfn++)
4506 if (pfn_valid(pfn))
4507 break;
4508 if (pfn == end_pfn)
4509 return;
4510 zone = page_zone(pfn_to_page(pfn));
4511 spin_lock_irqsave(&zone->lock, flags);
4512 pfn = start_pfn;
4513 while (pfn < end_pfn) {
4514 if (!pfn_valid(pfn)) {
4515 pfn++;
4516 continue;
4517 }
4518 page = pfn_to_page(pfn);
4519 BUG_ON(page_count(page));
4520 BUG_ON(!PageBuddy(page));
4521 order = page_order(page);
4522#ifdef CONFIG_DEBUG_VM
4523 printk(KERN_INFO "remove from free list %lx %d %lx\n",
4524 pfn, 1 << order, end_pfn);
4525#endif
4526 list_del(&page->lru);
4527 rmv_page_order(page);
4528 zone->free_area[order].nr_free--;
4529 __mod_zone_page_state(zone, NR_FREE_PAGES,
4530 - (1UL << order));
4531 for (i = 0; i < (1 << order); i++)
4532 SetPageReserved((page+i));
4533 pfn += (1 << order);
4534 }
4535 spin_unlock_irqrestore(&zone->lock, flags);
4536}
4537#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 000000000000..8f92a29695cc
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,138 @@
1/*
2 * linux/mm/page_isolation.c
3 */
4
5#include <stddef.h>
6#include <linux/mm.h>
7#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h>
9#include "internal.h"
10
11static inline struct page *
12__first_valid_page(unsigned long pfn, unsigned long nr_pages)
13{
14 int i;
15 for (i = 0; i < nr_pages; i++)
16 if (pfn_valid_within(pfn + i))
17 break;
18 if (unlikely(i == nr_pages))
19 return NULL;
20 return pfn_to_page(pfn + i);
21}
22
23/*
24 * start_isolate_page_range() -- make the page-allocation-type of a range of
25 * pages MIGRATE_ISOLATE.
26 * @start_pfn: The lower PFN of the range to be isolated.
27 * @end_pfn: The upper PFN of the range to be isolated.
28 *
29 * Making the page-allocation-type MIGRATE_ISOLATE means that free pages in
30 * the range will never be allocated. Any free pages, and any pages freed
31 * there in the future, will not be handed out again.
32 *
33 * start_pfn/end_pfn must be aligned to pageblock_order.
34 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
35 */
36int
37start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
38{
39 unsigned long pfn;
40 unsigned long undo_pfn;
41 struct page *page;
42
43 BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
44 BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
45
46 for (pfn = start_pfn;
47 pfn < end_pfn;
48 pfn += pageblock_nr_pages) {
49 page = __first_valid_page(pfn, pageblock_nr_pages);
50 if (page && set_migratetype_isolate(page)) {
51 undo_pfn = pfn;
52 goto undo;
53 }
54 }
55 return 0;
56undo:
57 for (pfn = start_pfn;
58 pfn <= undo_pfn;
59 pfn += pageblock_nr_pages)
60 unset_migratetype_isolate(pfn_to_page(pfn));
61
62 return -EBUSY;
63}
64
65/*
66 * Make isolated pages available again.
67 */
68int
69undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
70{
71 unsigned long pfn;
72 struct page *page;
73 BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
74 BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
75 for (pfn = start_pfn;
76 pfn < end_pfn;
77 pfn += pageblock_nr_pages) {
78 page = __first_valid_page(pfn, pageblock_nr_pages);
79 if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE)
80 continue;
81 unset_migratetype_isolate(page);
82 }
83 return 0;
84}
85/*
86 * Test whether all pages in the range are free (that is, isolated).
87 * All pages in [start_pfn...end_pfn) must be in the same zone.
88 * zone->lock must be held before calling this.
89 *
90 * Returns 1 if all pages in the range are isolated, 0 otherwise.
91 */
92static int
93__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
94{
95 struct page *page;
96
97 while (pfn < end_pfn) {
98 if (!pfn_valid_within(pfn)) {
99 pfn++;
100 continue;
101 }
102 page = pfn_to_page(pfn);
103 if (PageBuddy(page))
104 pfn += 1 << page_order(page);
105 else if (page_count(page) == 0 &&
106 page_private(page) == MIGRATE_ISOLATE)
107 pfn += 1;
108 else
109 break;
110 }
111 if (pfn < end_pfn)
112 return 0;
113 return 1;
114}
115
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{
118 unsigned long pfn;
119 struct page *page;
120
121 pfn = start_pfn;
122 /*
123 * Note: pageblock_nr_pages may differ from MAX_ORDER_NR_PAGES, so a chunk
124 * of free pages is not necessarily aligned to pageblock_nr_pages.
125 * Because of that, check the pageblock migratetype first.
126 */
127 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
128 page = __first_valid_page(pfn, pageblock_nr_pages);
129 if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE)
130 break;
131 }
132 if (pfn < end_pfn)
133 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
136 return 0;
137 return -EBUSY;
138}
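These helpers, together with set_migratetype_isolate()/unset_migratetype_isolate() and __offline_isolated_pages() added to page_alloc.c above, are meant to be driven in a fixed order by the memory hot-remove path. A hedged sketch of that expected calling sequence follows; it is not code from this patch, the migration of in-use pages is elided, and the header choices are assumptions.

#include <linux/page-isolation.h>
#include <linux/memory_hotplug.h>	/* assumed home of __offline_isolated_pages() */

static int offline_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	/* 1. Stop the allocator from handing out pages in the range. */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		return ret;

	/* 2. ... migrate any in-use pages out of [start_pfn, end_pfn) ... */

	/* 3. Verify every page is now free and marked MIGRATE_ISOLATE. */
	ret = test_pages_isolated(start_pfn, end_pfn);
	if (ret) {
		undo_isolate_page_range(start_pfn, end_pfn);
		return ret;
	}

	/* 4. Pull the now-free pages off the buddy lists for good. */
	__offline_isolated_pages(start_pfn, end_pfn);
	return 0;
}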
diff --git a/mm/readahead.c b/mm/readahead.c
index be20c9d699d3..c9c50ca1ec38 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
22} 22}
23EXPORT_SYMBOL(default_unplug_io_fn); 23EXPORT_SYMBOL(default_unplug_io_fn);
24 24
25/*
26 * Convienent macros for min/max read-ahead pages.
27 * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
28 * The latter is necessary for systems with large page size(i.e. 64k).
29 */
30#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
31#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
32
33struct backing_dev_info default_backing_dev_info = { 25struct backing_dev_info default_backing_dev_info = {
34 .ra_pages = MAX_RA_PAGES, 26 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
35 .state = 0, 27 .state = 0,
36 .capabilities = BDI_CAP_MAP_COPY, 28 .capabilities = BDI_CAP_MAP_COPY,
37 .unplug_io_fn = default_unplug_io_fn, 29 .unplug_io_fn = default_unplug_io_fn,
@@ -46,7 +38,7 @@ void
46file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) 38file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
47{ 39{
48 ra->ra_pages = mapping->backing_dev_info->ra_pages; 40 ra->ra_pages = mapping->backing_dev_info->ra_pages;
49 ra->prev_index = -1; 41 ra->prev_pos = -1;
50} 42}
51EXPORT_SYMBOL_GPL(file_ra_state_init); 43EXPORT_SYMBOL_GPL(file_ra_state_init);
52 44
@@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
66 int (*filler)(void *, struct page *), void *data) 58 int (*filler)(void *, struct page *), void *data)
67{ 59{
68 struct page *page; 60 struct page *page;
69 struct pagevec lru_pvec;
70 int ret = 0; 61 int ret = 0;
71 62
72 pagevec_init(&lru_pvec, 0);
73
74 while (!list_empty(pages)) { 63 while (!list_empty(pages)) {
75 page = list_to_page(pages); 64 page = list_to_page(pages);
76 list_del(&page->lru); 65 list_del(&page->lru);
77 if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { 66 if (add_to_page_cache_lru(page, mapping,
67 page->index, GFP_KERNEL)) {
78 page_cache_release(page); 68 page_cache_release(page);
79 continue; 69 continue;
80 } 70 }
71 page_cache_release(page);
72
81 ret = filler(data, page); 73 ret = filler(data, page);
82 if (!pagevec_add(&lru_pvec, page)) 74 if (unlikely(ret)) {
83 __pagevec_lru_add(&lru_pvec);
84 if (ret) {
85 put_pages_list(pages); 75 put_pages_list(pages);
86 break; 76 break;
87 } 77 }
88 task_io_account_read(PAGE_CACHE_SIZE); 78 task_io_account_read(PAGE_CACHE_SIZE);
89 } 79 }
90 pagevec_lru_add(&lru_pvec);
91 return ret; 80 return ret;
92} 81}
93 82
@@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
97 struct list_head *pages, unsigned nr_pages) 86 struct list_head *pages, unsigned nr_pages)
98{ 87{
99 unsigned page_idx; 88 unsigned page_idx;
100 struct pagevec lru_pvec;
101 int ret; 89 int ret;
102 90
103 if (mapping->a_ops->readpages) { 91 if (mapping->a_ops->readpages) {
@@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp,
107 goto out; 95 goto out;
108 } 96 }
109 97
110 pagevec_init(&lru_pvec, 0);
111 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 98 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
112 struct page *page = list_to_page(pages); 99 struct page *page = list_to_page(pages);
113 list_del(&page->lru); 100 list_del(&page->lru);
114 if (!add_to_page_cache(page, mapping, 101 if (!add_to_page_cache_lru(page, mapping,
115 page->index, GFP_KERNEL)) { 102 page->index, GFP_KERNEL)) {
116 mapping->a_ops->readpage(filp, page); 103 mapping->a_ops->readpage(filp, page);
117 if (!pagevec_add(&lru_pvec, page)) 104 }
118 __pagevec_lru_add(&lru_pvec); 105 page_cache_release(page);
119 } else
120 page_cache_release(page);
121 } 106 }
122 pagevec_lru_add(&lru_pvec);
123 ret = 0; 107 ret = 0;
124out: 108out:
125 return ret; 109 return ret;
@@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
157 /* 141 /*
158 * Preallocate as many pages as we will need. 142 * Preallocate as many pages as we will need.
159 */ 143 */
160 read_lock_irq(&mapping->tree_lock);
161 for (page_idx = 0; page_idx < nr_to_read; page_idx++) { 144 for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
162 pgoff_t page_offset = offset + page_idx; 145 pgoff_t page_offset = offset + page_idx;
163 146
164 if (page_offset > end_index) 147 if (page_offset > end_index)
165 break; 148 break;
166 149
150 rcu_read_lock();
167 page = radix_tree_lookup(&mapping->page_tree, page_offset); 151 page = radix_tree_lookup(&mapping->page_tree, page_offset);
152 rcu_read_unlock();
168 if (page) 153 if (page)
169 continue; 154 continue;
170 155
171 read_unlock_irq(&mapping->tree_lock);
172 page = page_cache_alloc_cold(mapping); 156 page = page_cache_alloc_cold(mapping);
173 read_lock_irq(&mapping->tree_lock);
174 if (!page) 157 if (!page)
175 break; 158 break;
176 page->index = page_offset; 159 page->index = page_offset;
@@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
179 SetPageReadahead(page); 162 SetPageReadahead(page);
180 ret++; 163 ret++;
181 } 164 }
182 read_unlock_irq(&mapping->tree_lock);
183 165
184 /* 166 /*
185 * Now start the IO. We ignore I/O errors - if the page is not 167 * Now start the IO. We ignore I/O errors - if the page is not
@@ -251,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr)
251 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
252} 234}
253 235
236static int __init readahead_init(void)
237{
238 return bdi_init(&default_backing_dev_info);
239}
240subsys_initcall(readahead_init);
241
254/* 242/*
255 * Submit IO for the read-ahead request in file_ra_state. 243 * Submit IO for the read-ahead request in file_ra_state.
256 */ 244 */
@@ -327,7 +315,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
327 * indicator. The flag won't be set on already cached pages, to avoid the 315 * indicator. The flag won't be set on already cached pages, to avoid the
328 * readahead-for-nothing fuss, saving pointless page cache lookups. 316 * readahead-for-nothing fuss, saving pointless page cache lookups.
329 * 317 *
330 * prev_index tracks the last visited page in the _previous_ read request. 318 * prev_pos tracks the last visited byte in the _previous_ read request.
331 * It should be maintained by the caller, and will be used for detecting 319 * It should be maintained by the caller, and will be used for detecting
332 * small random reads. Note that the readahead algorithm checks loosely 320 * small random reads. Note that the readahead algorithm checks loosely
333 * for sequential patterns. Hence interleaved reads might be served as 321 * for sequential patterns. Hence interleaved reads might be served as
@@ -351,11 +339,9 @@ ondemand_readahead(struct address_space *mapping,
351 bool hit_readahead_marker, pgoff_t offset, 339 bool hit_readahead_marker, pgoff_t offset,
352 unsigned long req_size) 340 unsigned long req_size)
353{ 341{
354 unsigned long max; /* max readahead pages */ 342 int max = ra->ra_pages; /* max readahead pages */
355 int sequential; 343 pgoff_t prev_offset;
356 344 int sequential;
357 max = ra->ra_pages;
358 sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
359 345
360 /* 346 /*
361 * It's the expected callback offset, assume sequential access. 347 * It's the expected callback offset, assume sequential access.
@@ -369,6 +355,9 @@ ondemand_readahead(struct address_space *mapping,
369 goto readit; 355 goto readit;
370 } 356 }
371 357
358 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
359 sequential = offset - prev_offset <= 1UL || req_size > max;
360
372 /* 361 /*
373 * Standalone, small read. 362 * Standalone, small read.
374 * Read as is, and do not pollute the readahead state. 363 * Read as is, and do not pollute the readahead state.
@@ -379,6 +368,29 @@ ondemand_readahead(struct address_space *mapping,
379 } 368 }
380 369
381 /* 370 /*
371 * Hit a marked page without valid readahead state.
372 * E.g. interleaved reads.
373 * Query the pagecache for the async_size, which normally equals the
374 * readahead size. Ramp it up and use it as the new readahead size.
375 */
376 if (hit_readahead_marker) {
377 pgoff_t start;
378
379 read_lock_irq(&mapping->tree_lock);
380 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
381 read_unlock_irq(&mapping->tree_lock);
382
383 if (!start || start - offset > max)
384 return 0;
385
386 ra->start = start;
387 ra->size = start - offset; /* old async_size */
388 ra->size = get_next_ra_size(ra, max);
389 ra->async_size = ra->size;
390 goto readit;
391 }
392
393 /*
382 * It may be one of 394 * It may be one of
383 * - first read on start of file 395 * - first read on start of file
384 * - sequential cache miss 396 * - sequential cache miss
@@ -389,16 +401,6 @@ ondemand_readahead(struct address_space *mapping,
389 ra->size = get_init_ra_size(req_size, max); 401 ra->size = get_init_ra_size(req_size, max);
390 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 402 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
391 403
392 /*
393 * Hit on a marked page without valid readahead state.
394 * E.g. interleaved reads.
395 * Not knowing its readahead pos/size, bet on the minimal possible one.
396 */
397 if (hit_readahead_marker) {
398 ra->start++;
399 ra->size = get_next_ra_size(ra, max);
400 }
401
402readit: 404readit:
403 return ra_submit(ra, mapping, filp); 405 return ra_submit(ra, mapping, filp);
404} 406}
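The new interleaved-read branch sizes the window from the distance to the next pagecache hole and then ramps it. A userspace sketch of that arithmetic; the doubling ramp is a stand-in for get_next_ra_size(), whose exact growth policy is not shown in this hunk, and the !start early return is omitted.

#include <stdio.h>

struct file_ra_state {			/* only the fields used here */
	unsigned long start;
	unsigned long size;
	unsigned long async_size;
};

/* Stand-in for get_next_ra_size(): double the window, capped at max. */
static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = 2 * cur;

	return newsize < max ? newsize : max;
}

int main(void)
{
	struct file_ra_state ra = { 0, 0, 0 };
	unsigned long max = 32;		/* ra->ra_pages */
	unsigned long offset = 100;	/* page index that hit the PG_readahead marker */
	unsigned long start = 108;	/* first hole found after offset */

	if (start - offset > max) {
		puts("hole too far away: leave readahead state untouched");
		return 0;
	}

	ra.start = start;
	ra.size = start - offset;		/* the old async_size */
	ra.size = next_ra_size(ra.size, max);	/* ramp it up */
	ra.async_size = ra.size;

	printf("new window: start=%lu size=%lu async_size=%lu\n",
	       ra.start, ra.size, ra.async_size);
	return 0;
}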
diff --git a/mm/rmap.c b/mm/rmap.c
index 41ac39749ef4..8990f909492f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,7 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 * zone->lock (within radix tree node alloc)
39 */ 40 */
40 41
41#include <linux/mm.h> 42#include <linux/mm.h>
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
137 anon_vma_free(anon_vma); 138 anon_vma_free(anon_vma);
138} 139}
139 140
140static void anon_vma_ctor(void *data, struct kmem_cache *cachep, 141static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
141 unsigned long flags)
142{ 142{
143 struct anon_vma *anon_vma = data; 143 struct anon_vma *anon_vma = data;
144 144
@@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
436 entry = pte_wrprotect(entry); 436 entry = pte_wrprotect(entry);
437 entry = pte_mkclean(entry); 437 entry = pte_mkclean(entry);
438 set_pte_at(mm, address, pte, entry); 438 set_pte_at(mm, address, pte, entry);
439 lazy_mmu_prot_update(entry);
440 ret = 1; 439 ret = 1;
441 } 440 }
442 441
diff --git a/mm/shmem.c b/mm/shmem.c
index fcd19d323f9f..289dbb0a6fd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@
49#include <linux/ctype.h> 49#include <linux/ctype.h>
50#include <linux/migrate.h> 50#include <linux/migrate.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/backing-dev.h>
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
55#include <asm/div64.h> 54#include <asm/div64.h>
@@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
96 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: 95 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
97 * might be reconsidered if it ever diverges from PAGE_SIZE. 96 * might be reconsidered if it ever diverges from PAGE_SIZE.
98 * 97 *
99 * __GFP_MOVABLE is masked out as swap vectors cannot move 98 * Mobility flags are masked out as swap vectors cannot move
100 */ 99 */
101 return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, 100 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
102 PAGE_CACHE_SHIFT-PAGE_SHIFT); 101 PAGE_CACHE_SHIFT-PAGE_SHIFT);
103} 102}
104 103
@@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
972 *nodelist++ = '\0'; 971 *nodelist++ = '\0';
973 if (nodelist_parse(nodelist, *policy_nodes)) 972 if (nodelist_parse(nodelist, *policy_nodes))
974 goto out; 973 goto out;
975 if (!nodes_subset(*policy_nodes, node_online_map)) 974 if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
976 goto out; 975 goto out;
977 } 976 }
978 if (!strcmp(value, "default")) { 977 if (!strcmp(value, "default")) {
@@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
997 err = 0; 996 err = 0;
998 } else if (!strcmp(value, "interleave")) { 997 } else if (!strcmp(value, "interleave")) {
999 *policy = MPOL_INTERLEAVE; 998 *policy = MPOL_INTERLEAVE;
1000 /* Default to nodes online if no nodelist */ 999 /*
1000 * Default to online nodes with memory if no nodelist
1001 */
1001 if (!nodelist) 1002 if (!nodelist)
1002 *policy_nodes = node_online_map; 1003 *policy_nodes = node_states[N_HIGH_MEMORY];
1003 err = 0; 1004 err = 0;
1004 } 1005 }
1005out: 1006out:
@@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p,
1025 return page; 1026 return page;
1026} 1027}
1027 1028
1028struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, 1029static struct page *shmem_swapin(struct shmem_inode_info *info,
1029 unsigned long idx) 1030 swp_entry_t entry, unsigned long idx)
1030{ 1031{
1031 struct shared_policy *p = &info->policy; 1032 struct shared_policy *p = &info->policy;
1032 int i, num; 1033 int i, num;
@@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1061 return page; 1062 return page;
1062} 1063}
1063#else 1064#else
1064static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 1065static inline int shmem_parse_mpol(char *value, int *policy,
1066 nodemask_t *policy_nodes)
1065{ 1067{
1066 return 1; 1068 return 1;
1067} 1069}
@@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1109 * Normally, filepage is NULL on entry, and either found 1111 * Normally, filepage is NULL on entry, and either found
1110 * uptodate immediately, or allocated and zeroed, or read 1112 * uptodate immediately, or allocated and zeroed, or read
1111 * in under swappage, which is then assigned to filepage. 1113 * in under swappage, which is then assigned to filepage.
1112 * But shmem_readpage and shmem_prepare_write pass in a locked 1114 * But shmem_readpage and shmem_write_begin pass in a locked
1113 * filepage, which may be found not uptodate by other callers 1115 * filepage, which may be found not uptodate by other callers
1114 * too, and may need to be copied from the swappage read in. 1116 * too, and may need to be copied from the swappage read in.
1115 */ 1117 */
@@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1327} 1329}
1328 1330
1329#ifdef CONFIG_NUMA 1331#ifdef CONFIG_NUMA
1330int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1332static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1331{ 1333{
1332 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1334 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1333 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1335 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1334} 1336}
1335 1337
1336struct mempolicy * 1338static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1337shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1339 unsigned long addr)
1338{ 1340{
1339 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1341 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1340 unsigned long idx; 1342 unsigned long idx;
@@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations;
1446static const struct inode_operations shmem_symlink_inline_operations; 1448static const struct inode_operations shmem_symlink_inline_operations;
1447 1449
1448/* 1450/*
1449 * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; 1451 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1450 * but providing them allows a tmpfs file to be used for splice, sendfile, and 1452 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1451 * below the loop driver, in the generic fashion that many filesystems support. 1453 * below the loop driver, in the generic fashion that many filesystems support.
1452 */ 1454 */
@@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page)
1459} 1461}
1460 1462
1461static int 1463static int
1462shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) 1464shmem_write_begin(struct file *file, struct address_space *mapping,
1465 loff_t pos, unsigned len, unsigned flags,
1466 struct page **pagep, void **fsdata)
1463{ 1467{
1464 struct inode *inode = page->mapping->host; 1468 struct inode *inode = mapping->host;
1465 return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); 1469 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1470 *pagep = NULL;
1471 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1472}
1473
1474static int
1475shmem_write_end(struct file *file, struct address_space *mapping,
1476 loff_t pos, unsigned len, unsigned copied,
1477 struct page *page, void *fsdata)
1478{
1479 struct inode *inode = mapping->host;
1480
1481 set_page_dirty(page);
1482 page_cache_release(page);
1483
1484 if (pos+copied > inode->i_size)
1485 i_size_write(inode, pos+copied);
1486
1487 return copied;
1466} 1488}
1467 1489
1468static ssize_t 1490static ssize_t
@@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb,
2219 unsigned long blocks = 0; 2241 unsigned long blocks = 0;
2220 unsigned long inodes = 0; 2242 unsigned long inodes = 0;
2221 int policy = MPOL_DEFAULT; 2243 int policy = MPOL_DEFAULT;
2222 nodemask_t policy_nodes = node_online_map; 2244 nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
2223 2245
2224#ifdef CONFIG_TMPFS 2246#ifdef CONFIG_TMPFS
2225 /* 2247 /*
@@ -2306,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode)
2306 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2328 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2307} 2329}
2308 2330
2309static void init_once(void *foo, struct kmem_cache *cachep, 2331static void init_once(struct kmem_cache *cachep, void *foo)
2310 unsigned long flags)
2311{ 2332{
2312 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2333 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2313 2334
@@ -2322,9 +2343,7 @@ static int init_inodecache(void)
2322{ 2343{
2323 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2344 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2324 sizeof(struct shmem_inode_info), 2345 sizeof(struct shmem_inode_info),
2325 0, 0, init_once); 2346 0, SLAB_PANIC, init_once);
2326 if (shmem_inode_cachep == NULL)
2327 return -ENOMEM;
2328 return 0; 2347 return 0;
2329} 2348}
2330 2349
@@ -2338,8 +2357,8 @@ static const struct address_space_operations shmem_aops = {
2338 .set_page_dirty = __set_page_dirty_no_writeback, 2357 .set_page_dirty = __set_page_dirty_no_writeback,
2339#ifdef CONFIG_TMPFS 2358#ifdef CONFIG_TMPFS
2340 .readpage = shmem_readpage, 2359 .readpage = shmem_readpage,
2341 .prepare_write = shmem_prepare_write, 2360 .write_begin = shmem_write_begin,
2342 .commit_write = simple_commit_write, 2361 .write_end = shmem_write_end,
2343#endif 2362#endif
2344 .migratepage = migrate_page, 2363 .migratepage = migrate_page,
2345}; 2364};
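For context, the ->write_begin/->write_end pair that replaces prepare_write/commit_write above is driven by the generic write path roughly as sketched below. This is illustrative only, not code from the patch: write_one_chunk() is a made-up name, a plain memcpy from a kernel buffer stands in for the user-space copy, and short-copy handling is omitted.

#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/string.h>

/* Returns bytes written on success or a negative errno. */
static int write_one_chunk(struct file *file, struct address_space *mapping,
			   loff_t pos, const void *buf, unsigned len)
{
	struct page *page;
	void *fsdata;
	void *kaddr;
	int status;

	status = mapping->a_ops->write_begin(file, mapping, pos, len,
					     0, &page, &fsdata);
	if (status)
		return status;

	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), buf, len);
	kunmap_atomic(kaddr, KM_USER0);

	/* shmem_write_end() dirties the page, drops it and updates i_size. */
	return mapping->a_ops->write_end(file, mapping, pos, len, len,
					 page, fsdata);
}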
@@ -2442,6 +2461,10 @@ static int __init init_tmpfs(void)
2442{ 2461{
2443 int error; 2462 int error;
2444 2463
2464 error = bdi_init(&shmem_backing_dev_info);
2465 if (error)
2466 goto out4;
2467
2445 error = init_inodecache(); 2468 error = init_inodecache();
2446 if (error) 2469 if (error)
2447 goto out3; 2470 goto out3;
@@ -2466,6 +2489,8 @@ out1:
2466out2: 2489out2:
2467 destroy_inodecache(); 2490 destroy_inodecache();
2468out3: 2491out3:
2492 bdi_destroy(&shmem_backing_dev_info);
2493out4:
2469 shm_mnt = ERR_PTR(error); 2494 shm_mnt = ERR_PTR(error);
2470 return error; 2495 return error;
2471} 2496}
@@ -2518,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2518 d_instantiate(dentry, inode); 2543 d_instantiate(dentry, inode);
2519 inode->i_size = size; 2544 inode->i_size = size;
2520 inode->i_nlink = 0; /* It is unlinked */ 2545 inode->i_nlink = 0; /* It is unlinked */
2521 file->f_path.mnt = mntget(shm_mnt); 2546 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2522 file->f_path.dentry = dentry; 2547 &shmem_file_operations);
2523 file->f_mapping = inode->i_mapping;
2524 file->f_op = &shmem_file_operations;
2525 file->f_mode = FMODE_WRITE | FMODE_READ;
2526 return file; 2548 return file;
2527 2549
2528close_file: 2550close_file:
diff --git a/mm/slab.c b/mm/slab.c
index 6f6abef83a1a..3ce9bc024d67 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -267,11 +267,10 @@ struct array_cache {
267 unsigned int batchcount; 267 unsigned int batchcount;
268 unsigned int touched; 268 unsigned int touched;
269 spinlock_t lock; 269 spinlock_t lock;
270 void *entry[0]; /* 270 void *entry[]; /*
271 * Must have this definition in here for the proper 271 * Must have this definition in here for the proper
272 * alignment of array_cache. Also simplifies accessing 272 * alignment of array_cache. Also simplifies accessing
273 * the entries. 273 * the entries.
274 * [0] is for gcc 2.95. It should really be [].
275 */ 274 */
276}; 275};
277 276
@@ -408,7 +407,7 @@ struct kmem_cache {
408 unsigned int dflags; /* dynamic flags */ 407 unsigned int dflags; /* dynamic flags */
409 408
410 /* constructor func */ 409 /* constructor func */
411 void (*ctor) (void *, struct kmem_cache *, unsigned long); 410 void (*ctor)(struct kmem_cache *, void *);
412 411
413/* 5) cache creation/removal */ 412/* 5) cache creation/removal */
414 const char *name; 413 const char *name;
@@ -1568,7 +1567,7 @@ void __init kmem_cache_init(void)
1568 /* Replace the static kmem_list3 structures for the boot cpu */ 1567 /* Replace the static kmem_list3 structures for the boot cpu */
1569 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); 1568 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1570 1569
1571 for_each_online_node(nid) { 1570 for_each_node_state(nid, N_NORMAL_MEMORY) {
1572 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1571 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1573 &initkmem_list3[SIZE_AC + nid], nid); 1572 &initkmem_list3[SIZE_AC + nid], nid);
1574 1573
@@ -1643,6 +1642,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1643#endif 1642#endif
1644 1643
1645 flags |= cachep->gfpflags; 1644 flags |= cachep->gfpflags;
1645 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1646 flags |= __GFP_RECLAIMABLE;
1646 1647
1647 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1648 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1648 if (!page) 1649 if (!page)
@@ -1944,7 +1945,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1944{ 1945{
1945 int node; 1946 int node;
1946 1947
1947 for_each_online_node(node) { 1948 for_each_node_state(node, N_NORMAL_MEMORY) {
1948 cachep->nodelists[node] = &initkmem_list3[index + node]; 1949 cachep->nodelists[node] = &initkmem_list3[index + node];
1949 cachep->nodelists[node]->next_reap = jiffies + 1950 cachep->nodelists[node]->next_reap = jiffies +
1950 REAPTIMEOUT_LIST3 + 1951 REAPTIMEOUT_LIST3 +
@@ -2075,7 +2076,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2075 g_cpucache_up = PARTIAL_L3; 2076 g_cpucache_up = PARTIAL_L3;
2076 } else { 2077 } else {
2077 int node; 2078 int node;
2078 for_each_online_node(node) { 2079 for_each_node_state(node, N_NORMAL_MEMORY) {
2079 cachep->nodelists[node] = 2080 cachep->nodelists[node] =
2080 kmalloc_node(sizeof(struct kmem_list3), 2081 kmalloc_node(sizeof(struct kmem_list3),
2081 GFP_KERNEL, node); 2082 GFP_KERNEL, node);
@@ -2127,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2127struct kmem_cache * 2128struct kmem_cache *
2128kmem_cache_create (const char *name, size_t size, size_t align, 2129kmem_cache_create (const char *name, size_t size, size_t align,
2129 unsigned long flags, 2130 unsigned long flags,
2130 void (*ctor)(void*, struct kmem_cache *, unsigned long)) 2131 void (*ctor)(struct kmem_cache *, void *))
2131{ 2132{
2132 size_t left_over, slab_size, ralign; 2133 size_t left_over, slab_size, ralign;
2133 struct kmem_cache *cachep = NULL, *pc; 2134 struct kmem_cache *cachep = NULL, *pc;
@@ -2634,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2634 * They must also be threaded. 2635 * They must also be threaded.
2635 */ 2636 */
2636 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2637 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2637 cachep->ctor(objp + obj_offset(cachep), cachep, 2638 cachep->ctor(cachep, objp + obj_offset(cachep));
2638 0);
2639 2639
2640 if (cachep->flags & SLAB_RED_ZONE) { 2640 if (cachep->flags & SLAB_RED_ZONE) {
2641 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2641 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2651,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2651 cachep->buffer_size / PAGE_SIZE, 0); 2651 cachep->buffer_size / PAGE_SIZE, 0);
2652#else 2652#else
2653 if (cachep->ctor) 2653 if (cachep->ctor)
2654 cachep->ctor(objp, cachep, 0); 2654 cachep->ctor(cachep, objp);
2655#endif 2655#endif
2656 slab_bufctl(slabp)[i] = i + 1; 2656 slab_bufctl(slabp)[i] = i + 1;
2657 } 2657 }
@@ -2746,9 +2746,9 @@ static int cache_grow(struct kmem_cache *cachep,
2746 * Be lazy and only check for valid flags here, keeping it out of the 2746 * Be lazy and only check for valid flags here, keeping it out of the
2747 * critical path in kmem_cache_alloc(). 2747 * critical path in kmem_cache_alloc().
2748 */ 2748 */
2749 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 2749 BUG_ON(flags & GFP_SLAB_BUG_MASK);
2750 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2750 2751
2751 local_flags = (flags & GFP_LEVEL_MASK);
2752 /* Take the l3 list lock to change the colour_next on this node */ 2752 /* Take the l3 list lock to change the colour_next on this node */
2753 check_irq_off(); 2753 check_irq_off();
2754 l3 = cachep->nodelists[nodeid]; 2754 l3 = cachep->nodelists[nodeid];
@@ -2785,7 +2785,7 @@ static int cache_grow(struct kmem_cache *cachep,
2785 2785
2786 /* Get slab management. */ 2786 /* Get slab management. */
2787 slabp = alloc_slabmgmt(cachep, objp, offset, 2787 slabp = alloc_slabmgmt(cachep, objp, offset,
2788 local_flags & ~GFP_THISNODE, nodeid); 2788 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2789 if (!slabp) 2789 if (!slabp)
2790 goto opps1; 2790 goto opps1;
2791 2791
@@ -3076,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3076#endif 3076#endif
3077 objp += obj_offset(cachep); 3077 objp += obj_offset(cachep);
3078 if (cachep->ctor && cachep->flags & SLAB_POISON) 3078 if (cachep->ctor && cachep->flags & SLAB_POISON)
3079 cachep->ctor(objp, cachep, 0); 3079 cachep->ctor(cachep, objp);
3080#if ARCH_SLAB_MINALIGN 3080#if ARCH_SLAB_MINALIGN
3081 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3081 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3082 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3082 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -3225,7 +3225,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3225 3225
3226 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3226 zonelist = &NODE_DATA(slab_node(current->mempolicy))
3227 ->node_zonelists[gfp_zone(flags)]; 3227 ->node_zonelists[gfp_zone(flags)];
3228 local_flags = (flags & GFP_LEVEL_MASK); 3228 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3229 3229
3230retry: 3230retry:
3231 /* 3231 /*
@@ -3792,7 +3792,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3792 struct array_cache *new_shared; 3792 struct array_cache *new_shared;
3793 struct array_cache **new_alien = NULL; 3793 struct array_cache **new_alien = NULL;
3794 3794
3795 for_each_online_node(node) { 3795 for_each_node_state(node, N_NORMAL_MEMORY) {
3796 3796
3797 if (use_alien_caches) { 3797 if (use_alien_caches) {
3798 new_alien = alloc_alien_cache(node, cachep->limit); 3798 new_alien = alloc_alien_cache(node, cachep->limit);
@@ -4446,7 +4446,8 @@ const struct seq_operations slabstats_op = {
4446 */ 4446 */
4447size_t ksize(const void *objp) 4447size_t ksize(const void *objp)
4448{ 4448{
4449 if (unlikely(ZERO_OR_NULL_PTR(objp))) 4449 BUG_ON(!objp);
4450 if (unlikely(objp == ZERO_SIZE_PTR))
4450 return 0; 4451 return 0;
4451 4452
4452 return obj_size(virt_to_cache(objp)); 4453 return obj_size(virt_to_cache(objp));
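ksize() now treats a NULL argument as a bug and only maps the zero-size cookie to 0. A userspace sketch of that convention; the definitions of ZERO_SIZE_PTR and ZERO_OR_NULL_PTR below follow the usual <linux/slab.h> ones of the era and should be read as assumptions.

#include <stdio.h>

/* Assumed definitions mirroring <linux/slab.h>: kmalloc(0) returns a small
 * non-NULL cookie so callers can tell a zero-size allocation from failure. */
#define ZERO_SIZE_PTR ((void *)16)
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= (unsigned long)ZERO_SIZE_PTR)

int main(void)
{
	void *zero = ZERO_SIZE_PTR;	/* what kmalloc(0) hands back */
	void *fail = NULL;		/* what a failed allocation hands back */

	printf("ZERO_OR_NULL_PTR(NULL)          = %d\n", ZERO_OR_NULL_PTR(fail));
	printf("ZERO_OR_NULL_PTR(ZERO_SIZE_PTR) = %d\n", ZERO_OR_NULL_PTR(zero));

	/* After this patch: ksize(NULL) is a BUG, ksize(ZERO_SIZE_PTR) is 0. */
	return 0;
}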
diff --git a/mm/slob.c b/mm/slob.c
index ec33fcdc852e..5bc2ceb692ec 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 slobidx_t units; 360 slobidx_t units;
361 unsigned long flags; 361 unsigned long flags;
362 362
363 if (ZERO_OR_NULL_PTR(block)) 363 if (unlikely(ZERO_OR_NULL_PTR(block)))
364 return; 364 return;
365 BUG_ON(!size); 365 BUG_ON(!size);
366 366
@@ -466,7 +466,7 @@ void kfree(const void *block)
466{ 466{
467 struct slob_page *sp; 467 struct slob_page *sp;
468 468
469 if (ZERO_OR_NULL_PTR(block)) 469 if (unlikely(ZERO_OR_NULL_PTR(block)))
470 return; 470 return;
471 471
472 sp = (struct slob_page *)virt_to_page(block); 472 sp = (struct slob_page *)virt_to_page(block);
@@ -484,7 +484,8 @@ size_t ksize(const void *block)
484{ 484{
485 struct slob_page *sp; 485 struct slob_page *sp;
486 486
487 if (ZERO_OR_NULL_PTR(block)) 487 BUG_ON(!block);
488 if (unlikely(block == ZERO_SIZE_PTR))
488 return 0; 489 return 0;
489 490
490 sp = (struct slob_page *)virt_to_page(block); 491 sp = (struct slob_page *)virt_to_page(block);
@@ -498,12 +499,12 @@ struct kmem_cache {
498 unsigned int size, align; 499 unsigned int size, align;
499 unsigned long flags; 500 unsigned long flags;
500 const char *name; 501 const char *name;
501 void (*ctor)(void *, struct kmem_cache *, unsigned long); 502 void (*ctor)(struct kmem_cache *, void *);
502}; 503};
503 504
504struct kmem_cache *kmem_cache_create(const char *name, size_t size, 505struct kmem_cache *kmem_cache_create(const char *name, size_t size,
505 size_t align, unsigned long flags, 506 size_t align, unsigned long flags,
506 void (*ctor)(void*, struct kmem_cache *, unsigned long)) 507 void (*ctor)(struct kmem_cache *, void *))
507{ 508{
508 struct kmem_cache *c; 509 struct kmem_cache *c;
509 510
@@ -547,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
547 b = slob_new_page(flags, get_order(c->size), node); 548 b = slob_new_page(flags, get_order(c->size), node);
548 549
549 if (c->ctor) 550 if (c->ctor)
550 c->ctor(b, c, 0); 551 c->ctor(c, b);
551 552
552 return b; 553 return b;
553} 554}
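Across slab, slob and slub the constructor callback now receives the cache first and the object second, and the old flags argument is gone. A minimal kernel-style sketch of a cache registered against the new prototype; the cache and structure names are illustrative.

#include <linux/init.h>
#include <linux/slab.h>

struct foo {
	int refcount;
	void *buf;
};

/* New-style constructor: (cache, object) rather than (object, cache, flags). */
static void foo_ctor(struct kmem_cache *cachep, void *obj)
{
	struct foo *f = obj;

	f->refcount = 0;
	f->buf = NULL;
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_PANIC, foo_ctor);
	return 0;
}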
diff --git a/mm/slub.c b/mm/slub.c
index addb20a6d67d..e29a42988c78 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -90,7 +90,7 @@
90 * One use of this flag is to mark slabs that are 90 * One use of this flag is to mark slabs that are
91 * used for allocations. Then such a slab becomes a cpu 91 * used for allocations. Then such a slab becomes a cpu
92 * slab. The cpu slab may be equipped with an additional 92 * slab. The cpu slab may be equipped with an additional
93 * lockless_freelist that allows lockless access to 93 * freelist that allows lockless access to
94 * free objects in addition to the regular freelist 94 * free objects in addition to the regular freelist
95 * that requires the slab lock. 95 * that requires the slab lock.
96 * 96 *
@@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page)
140/* 140/*
141 * Issues still to be resolved: 141 * Issues still to be resolved:
142 * 142 *
143 * - The per cpu array is updated for each new slab and and is a remote
144 * cacheline for most nodes. This could become a bouncing cacheline given
145 * enough frequent updates. There are 16 pointers in a cacheline, so at
146 * max 16 cpus could compete for the cacheline which may be okay.
147 *
148 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 143 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
149 * 144 *
150 * - Variable sizing of the per node arrays 145 * - Variable sizing of the per node arrays
@@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page)
205#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 200#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
206#endif 201#endif
207 202
208/*
209 * The page->inuse field is 16 bit thus we have this limitation
210 */
211#define MAX_OBJECTS_PER_SLAB 65535
212
213/* Internal SLUB flags */ 203/* Internal SLUB flags */
214#define __OBJECT_POISON 0x80000000 /* Poison object */ 204#define __OBJECT_POISON 0x80000000 /* Poison object */
215#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 205#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
@@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
277#endif 267#endif
278} 268}
279 269
270static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
271{
272#ifdef CONFIG_SMP
273 return s->cpu_slab[cpu];
274#else
275 return &s->cpu_slab;
276#endif
277}
278
280static inline int check_valid_pointer(struct kmem_cache *s, 279static inline int check_valid_pointer(struct kmem_cache *s,
281 struct page *page, const void *object) 280 struct page *page, const void *object)
282{ 281{
@@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page)
729 slab_err(s, page, "Not a valid slab page"); 728 slab_err(s, page, "Not a valid slab page");
730 return 0; 729 return 0;
731 } 730 }
732 if (page->offset * sizeof(void *) != s->offset) {
733 slab_err(s, page, "Corrupted offset %lu",
734 (unsigned long)(page->offset * sizeof(void *)));
735 return 0;
736 }
737 if (page->inuse > s->objects) { 731 if (page->inuse > s->objects) {
738 slab_err(s, page, "inuse %u > max %u", 732 slab_err(s, page, "inuse %u > max %u",
739 s->name, page->inuse, s->objects); 733 s->name, page->inuse, s->objects);
@@ -872,8 +866,6 @@ bad:
872 slab_fix(s, "Marking all objects used"); 866 slab_fix(s, "Marking all objects used");
873 page->inuse = s->objects; 867 page->inuse = s->objects;
874 page->freelist = NULL; 868 page->freelist = NULL;
875 /* Fix up fields that may be corrupted */
876 page->offset = s->offset / sizeof(void *);
877 } 869 }
878 return 0; 870 return 0;
879} 871}
@@ -988,7 +980,7 @@ __setup("slub_debug", setup_slub_debug);
988 980
989static unsigned long kmem_cache_flags(unsigned long objsize, 981static unsigned long kmem_cache_flags(unsigned long objsize,
990 unsigned long flags, const char *name, 982 unsigned long flags, const char *name,
991 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 983 void (*ctor)(struct kmem_cache *, void *))
992{ 984{
993 /* 985 /*
994 * The page->offset field is only 16 bit wide. This is an offset 986 * The page->offset field is only 16 bit wide. This is an offset
@@ -1035,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1035static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1027static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1036static inline unsigned long kmem_cache_flags(unsigned long objsize, 1028static inline unsigned long kmem_cache_flags(unsigned long objsize,
1037 unsigned long flags, const char *name, 1029 unsigned long flags, const char *name,
1038 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 1030 void (*ctor)(struct kmem_cache *, void *))
1039{ 1031{
1040 return flags; 1032 return flags;
1041} 1033}
@@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1055 if (s->flags & SLAB_CACHE_DMA) 1047 if (s->flags & SLAB_CACHE_DMA)
1056 flags |= SLUB_DMA; 1048 flags |= SLUB_DMA;
1057 1049
1050 if (s->flags & SLAB_RECLAIM_ACCOUNT)
1051 flags |= __GFP_RECLAIMABLE;
1052
1058 if (node == -1) 1053 if (node == -1)
1059 page = alloc_pages(flags, s->order); 1054 page = alloc_pages(flags, s->order);
1060 else 1055 else
@@ -1076,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1076{ 1071{
1077 setup_object_debug(s, page, object); 1072 setup_object_debug(s, page, object);
1078 if (unlikely(s->ctor)) 1073 if (unlikely(s->ctor))
1079 s->ctor(object, s, 0); 1074 s->ctor(s, object);
1080} 1075}
1081 1076
1082static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1077static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1088,19 +1083,16 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1088 void *last; 1083 void *last;
1089 void *p; 1084 void *p;
1090 1085
1091 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 1086 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1092
1093 if (flags & __GFP_WAIT)
1094 local_irq_enable();
1095 1087
1096 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 1088 page = allocate_slab(s,
1089 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1097 if (!page) 1090 if (!page)
1098 goto out; 1091 goto out;
1099 1092
1100 n = get_node(s, page_to_nid(page)); 1093 n = get_node(s, page_to_nid(page));
1101 if (n) 1094 if (n)
1102 atomic_long_inc(&n->nr_slabs); 1095 atomic_long_inc(&n->nr_slabs);
1103 page->offset = s->offset / sizeof(void *);
1104 page->slab = s; 1096 page->slab = s;
1105 page->flags |= 1 << PG_slab; 1097 page->flags |= 1 << PG_slab;
1106 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1098 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1123,11 +1115,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1123 set_freepointer(s, last, NULL); 1115 set_freepointer(s, last, NULL);
1124 1116
1125 page->freelist = start; 1117 page->freelist = start;
1126 page->lockless_freelist = NULL;
1127 page->inuse = 0; 1118 page->inuse = 0;
1128out: 1119out:
1129 if (flags & __GFP_WAIT)
1130 local_irq_disable();
1131 return page; 1120 return page;
1132} 1121}
1133 1122
@@ -1149,7 +1138,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1149 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1138 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1150 - pages); 1139 - pages);
1151 1140
1152 page->mapping = NULL;
1153 __free_pages(page, s->order); 1141 __free_pages(page, s->order);
1154} 1142}
1155 1143
@@ -1383,33 +1371,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1383/* 1371/*
1384 * Remove the cpu slab 1372 * Remove the cpu slab
1385 */ 1373 */
1386static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1374static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1387{ 1375{
1376 struct page *page = c->page;
1388 /* 1377 /*
1389 * Merge cpu freelist into freelist. Typically we get here 1378 * Merge cpu freelist into freelist. Typically we get here
1390 * because both freelists are empty. So this is unlikely 1379 * because both freelists are empty. So this is unlikely
1391 * to occur. 1380 * to occur.
1392 */ 1381 */
1393 while (unlikely(page->lockless_freelist)) { 1382 while (unlikely(c->freelist)) {
1394 void **object; 1383 void **object;
1395 1384
1396 /* Retrieve object from cpu_freelist */ 1385 /* Retrieve object from cpu_freelist */
1397 object = page->lockless_freelist; 1386 object = c->freelist;
1398 page->lockless_freelist = page->lockless_freelist[page->offset]; 1387 c->freelist = c->freelist[c->offset];
1399 1388
1400 /* And put onto the regular freelist */ 1389 /* And put onto the regular freelist */
1401 object[page->offset] = page->freelist; 1390 object[c->offset] = page->freelist;
1402 page->freelist = object; 1391 page->freelist = object;
1403 page->inuse--; 1392 page->inuse--;
1404 } 1393 }
1405 s->cpu_slab[cpu] = NULL; 1394 c->page = NULL;
1406 unfreeze_slab(s, page); 1395 unfreeze_slab(s, page);
1407} 1396}
1408 1397
1409static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1398static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1410{ 1399{
1411 slab_lock(page); 1400 slab_lock(c->page);
1412 deactivate_slab(s, page, cpu); 1401 deactivate_slab(s, c);
1413} 1402}
1414 1403
1415/* 1404/*
@@ -1418,18 +1407,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
1418 */ 1407 */
1419static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1408static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1420{ 1409{
1421 struct page *page = s->cpu_slab[cpu]; 1410 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1422 1411
1423 if (likely(page)) 1412 if (likely(c && c->page))
1424 flush_slab(s, page, cpu); 1413 flush_slab(s, c);
1425} 1414}
1426 1415
1427static void flush_cpu_slab(void *d) 1416static void flush_cpu_slab(void *d)
1428{ 1417{
1429 struct kmem_cache *s = d; 1418 struct kmem_cache *s = d;
1430 int cpu = smp_processor_id();
1431 1419
1432 __flush_cpu_slab(s, cpu); 1420 __flush_cpu_slab(s, smp_processor_id());
1433} 1421}
1434 1422
1435static void flush_all(struct kmem_cache *s) 1423static void flush_all(struct kmem_cache *s)
@@ -1446,6 +1434,19 @@ static void flush_all(struct kmem_cache *s)
1446} 1434}
1447 1435
1448/* 1436/*
1437 * Check if the objects in a per cpu structure fit numa
1438 * locality expectations.
1439 */
1440static inline int node_match(struct kmem_cache_cpu *c, int node)
1441{
1442#ifdef CONFIG_NUMA
1443 if (node != -1 && c->node != node)
1444 return 0;
1445#endif
1446 return 1;
1447}
1448
1449/*
1449 * Slow path. The lockless freelist is empty or we need to perform 1450 * Slow path. The lockless freelist is empty or we need to perform
1450 * debugging duties. 1451 * debugging duties.
1451 * 1452 *
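
node_match() treats node == -1 as "any node will do"; only an explicit node request can force the allocator to give up its cache-hot cpu slab. A small standalone illustration of that rule (not kernel code):

/* Standalone illustration of the node_match() rule. */
#include <stdio.h>

static int node_match(int cpu_slab_node, int requested_node)
{
	if (requested_node != -1 && cpu_slab_node != requested_node)
		return 0;	/* wrong node: deactivate, refill from elsewhere */
	return 1;		/* acceptable: keep using the hot cpu slab */
}

int main(void)
{
	printf("%d\n", node_match(0, -1));	/* 1: caller does not care */
	printf("%d\n", node_match(0,  0));	/* 1: already local */
	printf("%d\n", node_match(0,  1));	/* 0: must refill from node 1 */
	return 0;
}
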
@@ -1463,45 +1464,53 @@ static void flush_all(struct kmem_cache *s)
1463 * we need to allocate a new slab. This is slowest path since we may sleep. 1464 * we need to allocate a new slab. This is slowest path since we may sleep.
1464 */ 1465 */
1465static void *__slab_alloc(struct kmem_cache *s, 1466static void *__slab_alloc(struct kmem_cache *s,
1466 gfp_t gfpflags, int node, void *addr, struct page *page) 1467 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
1467{ 1468{
1468 void **object; 1469 void **object;
1469 int cpu = smp_processor_id(); 1470 struct page *new;
1470 1471
1471 if (!page) 1472 if (!c->page)
1472 goto new_slab; 1473 goto new_slab;
1473 1474
1474 slab_lock(page); 1475 slab_lock(c->page);
1475 if (unlikely(node != -1 && page_to_nid(page) != node)) 1476 if (unlikely(!node_match(c, node)))
1476 goto another_slab; 1477 goto another_slab;
1477load_freelist: 1478load_freelist:
1478 object = page->freelist; 1479 object = c->page->freelist;
1479 if (unlikely(!object)) 1480 if (unlikely(!object))
1480 goto another_slab; 1481 goto another_slab;
1481 if (unlikely(SlabDebug(page))) 1482 if (unlikely(SlabDebug(c->page)))
1482 goto debug; 1483 goto debug;
1483 1484
1484 object = page->freelist; 1485 object = c->page->freelist;
1485 page->lockless_freelist = object[page->offset]; 1486 c->freelist = object[c->offset];
1486 page->inuse = s->objects; 1487 c->page->inuse = s->objects;
1487 page->freelist = NULL; 1488 c->page->freelist = NULL;
1488 slab_unlock(page); 1489 c->node = page_to_nid(c->page);
1490 slab_unlock(c->page);
1489 return object; 1491 return object;
1490 1492
1491another_slab: 1493another_slab:
1492 deactivate_slab(s, page, cpu); 1494 deactivate_slab(s, c);
1493 1495
1494new_slab: 1496new_slab:
1495 page = get_partial(s, gfpflags, node); 1497 new = get_partial(s, gfpflags, node);
1496 if (page) { 1498 if (new) {
1497 s->cpu_slab[cpu] = page; 1499 c->page = new;
1498 goto load_freelist; 1500 goto load_freelist;
1499 } 1501 }
1500 1502
1501 page = new_slab(s, gfpflags, node); 1503 if (gfpflags & __GFP_WAIT)
1502 if (page) { 1504 local_irq_enable();
1503 cpu = smp_processor_id(); 1505
1504 if (s->cpu_slab[cpu]) { 1506 new = new_slab(s, gfpflags, node);
1507
1508 if (gfpflags & __GFP_WAIT)
1509 local_irq_disable();
1510
1511 if (new) {
1512 c = get_cpu_slab(s, smp_processor_id());
1513 if (c->page) {
1505 /* 1514 /*
1506 * Someone else populated the cpu_slab while we 1515 * Someone else populated the cpu_slab while we
1507 * enabled interrupts, or we have gotten scheduled 1516 * enabled interrupts, or we have gotten scheduled
@@ -1509,34 +1518,33 @@ new_slab:
1509 * requested node even if __GFP_THISNODE was 1518 * requested node even if __GFP_THISNODE was
1510 * specified. So we need to recheck. 1519 * specified. So we need to recheck.
1511 */ 1520 */
1512 if (node == -1 || 1521 if (node_match(c, node)) {
1513 page_to_nid(s->cpu_slab[cpu]) == node) {
1514 /* 1522 /*
1515 * Current cpuslab is acceptable and we 1523 * Current cpuslab is acceptable and we
1516 * want the current one since it's cache hot 1524
1516 * want the current one since it's cache hot 1524
1517 */ 1525 */
1518 discard_slab(s, page); 1526 discard_slab(s, new);
1519 page = s->cpu_slab[cpu]; 1527 slab_lock(c->page);
1520 slab_lock(page);
1521 goto load_freelist; 1528 goto load_freelist;
1522 } 1529 }
1523 /* New slab does not fit our expectations */ 1530 /* New slab does not fit our expectations */
1524 flush_slab(s, s->cpu_slab[cpu], cpu); 1531 flush_slab(s, c);
1525 } 1532 }
1526 slab_lock(page); 1533 slab_lock(new);
1527 SetSlabFrozen(page); 1534 SetSlabFrozen(new);
1528 s->cpu_slab[cpu] = page; 1535 c->page = new;
1529 goto load_freelist; 1536 goto load_freelist;
1530 } 1537 }
1531 return NULL; 1538 return NULL;
1532debug: 1539debug:
1533 object = page->freelist; 1540 object = c->page->freelist;
1534 if (!alloc_debug_processing(s, page, object, addr)) 1541 if (!alloc_debug_processing(s, c->page, object, addr))
1535 goto another_slab; 1542 goto another_slab;
1536 1543
1537 page->inuse++; 1544 c->page->inuse++;
1538 page->freelist = object[page->offset]; 1545 c->page->freelist = object[c->offset];
1539 slab_unlock(page); 1546 c->node = -1;
1547 slab_unlock(c->page);
1540 return object; 1548 return object;
1541} 1549}
1542 1550
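
The load_freelist step above is the heart of the refill: the whole remaining freelist of the page is handed to the per-cpu structure in one go, the page is marked fully in use, and the slab lock is only held for that hand-over. A standalone model of the hand-over, using an invented two-word object layout and free-pointer offset:

/* Standalone model of the load_freelist hand-over; illustrative only. */
#include <stdio.h>
#include <stddef.h>

#define OBJECTS 4

struct page_state {
	void *freelist;      /* shared list, normally guarded by the slab lock */
	int inuse;
};

struct cpu_state {
	void *freelist;      /* lockless per-CPU list */
	unsigned int offset; /* word index of the free pointer in an object */
};

int main(void)
{
	void *objs[OBJECTS][2];         /* word 0: payload, word 1: free pointer */
	struct page_state page = { NULL, 0 };
	struct cpu_state c = { NULL, 1 };

	/* Build the page freelist: obj0 -> obj1 -> obj2 -> obj3 -> NULL. */
	for (int i = 0; i < OBJECTS; i++)
		objs[i][c.offset] = (i + 1 < OBJECTS) ? objs[i + 1] : NULL;
	page.freelist = objs[0];

	/* load_freelist: the first object goes to the caller, the rest of the
	 * chain becomes the per-CPU freelist, and the page looks full. */
	void **object = page.freelist;
	c.freelist = object[c.offset];
	page.inuse = OBJECTS;
	page.freelist = NULL;

	printf("allocated %p, next lockless object %p, page inuse %d\n",
	       (void *)object, c.freelist, page.inuse);
	return 0;
}
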
@@ -1553,25 +1561,24 @@ debug:
1553static void __always_inline *slab_alloc(struct kmem_cache *s, 1561static void __always_inline *slab_alloc(struct kmem_cache *s,
1554 gfp_t gfpflags, int node, void *addr) 1562 gfp_t gfpflags, int node, void *addr)
1555{ 1563{
1556 struct page *page;
1557 void **object; 1564 void **object;
1558 unsigned long flags; 1565 unsigned long flags;
1566 struct kmem_cache_cpu *c;
1559 1567
1560 local_irq_save(flags); 1568 local_irq_save(flags);
1561 page = s->cpu_slab[smp_processor_id()]; 1569 c = get_cpu_slab(s, smp_processor_id());
1562 if (unlikely(!page || !page->lockless_freelist || 1570 if (unlikely(!c->freelist || !node_match(c, node)))
1563 (node != -1 && page_to_nid(page) != node)))
1564 1571
1565 object = __slab_alloc(s, gfpflags, node, addr, page); 1572 object = __slab_alloc(s, gfpflags, node, addr, c);
1566 1573
1567 else { 1574 else {
1568 object = page->lockless_freelist; 1575 object = c->freelist;
1569 page->lockless_freelist = object[page->offset]; 1576 c->freelist = object[c->offset];
1570 } 1577 }
1571 local_irq_restore(flags); 1578 local_irq_restore(flags);
1572 1579
1573 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1580 if (unlikely((gfpflags & __GFP_ZERO) && object))
1574 memset(object, 0, s->objsize); 1581 memset(object, 0, c->objsize);
1575 1582
1576 return object; 1583 return object;
1577} 1584}
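
With the refill handled in __slab_alloc(), the fast path above is just a pop from the per-cpu freelist under local_irq_save(): follow the free pointer stored inside the object at word index c->offset. A standalone sketch of that pop (layout and offset are illustrative, not kernel code):

/* Standalone sketch of the lockless fast-path pop. */
#include <stdio.h>

static void *fastpath_pop(void **freelist, unsigned int offset)
{
	void **object = *freelist;          /* head of the per-CPU list */
	if (!object)
		return NULL;                /* empty: caller takes the slow path */
	*freelist = object[offset];         /* new head = stored free pointer */
	return object;
}

int main(void)
{
	void *a[2], *b[2];
	a[0] = b;  b[0] = NULL;             /* chain: a -> b -> NULL */
	void *list = a;

	printf("%p\n", fastpath_pop(&list, 0));   /* returns a */
	printf("%p\n", fastpath_pop(&list, 0));   /* returns b */
	printf("%p\n", fastpath_pop(&list, 0));   /* NULL: slow path needed */
	return 0;
}
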
@@ -1599,7 +1606,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
1599 * handling required then we can return immediately. 1606 * handling required then we can return immediately.
1600 */ 1607 */
1601static void __slab_free(struct kmem_cache *s, struct page *page, 1608static void __slab_free(struct kmem_cache *s, struct page *page,
1602 void *x, void *addr) 1609 void *x, void *addr, unsigned int offset)
1603{ 1610{
1604 void *prior; 1611 void *prior;
1605 void **object = (void *)x; 1612 void **object = (void *)x;
@@ -1609,7 +1616,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1609 if (unlikely(SlabDebug(page))) 1616 if (unlikely(SlabDebug(page)))
1610 goto debug; 1617 goto debug;
1611checks_ok: 1618checks_ok:
1612 prior = object[page->offset] = page->freelist; 1619 prior = object[offset] = page->freelist;
1613 page->freelist = object; 1620 page->freelist = object;
1614 page->inuse--; 1621 page->inuse--;
1615 1622
@@ -1664,15 +1671,16 @@ static void __always_inline slab_free(struct kmem_cache *s,
1664{ 1671{
1665 void **object = (void *)x; 1672 void **object = (void *)x;
1666 unsigned long flags; 1673 unsigned long flags;
1674 struct kmem_cache_cpu *c;
1667 1675
1668 local_irq_save(flags); 1676 local_irq_save(flags);
1669 debug_check_no_locks_freed(object, s->objsize); 1677 debug_check_no_locks_freed(object, s->objsize);
1670 if (likely(page == s->cpu_slab[smp_processor_id()] && 1678 c = get_cpu_slab(s, smp_processor_id());
1671 !SlabDebug(page))) { 1679 if (likely(page == c->page && c->node >= 0)) {
1672 object[page->offset] = page->lockless_freelist; 1680 object[c->offset] = c->freelist;
1673 page->lockless_freelist = object; 1681 c->freelist = object;
1674 } else 1682 } else
1675 __slab_free(s, page, x, addr); 1683 __slab_free(s, page, x, addr, c->offset);
1676 1684
1677 local_irq_restore(flags); 1685 local_irq_restore(flags);
1678} 1686}
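
The free fast path is the mirror image: when the object belongs to the current cpu slab it is pushed onto c->freelist without touching the page, and the c->node >= 0 test doubles as the "no debugging" check because the debug path in __slab_alloc() sets c->node to -1. A standalone sketch of the push (illustrative only):

/* Standalone sketch of the fast-path free: push onto the per-CPU list. */
#include <stdio.h>

static void fastpath_push(void **freelist, void *obj, unsigned int offset)
{
	void **object = obj;
	object[offset] = *freelist;   /* object now points at the old head */
	*freelist = object;           /* and becomes the new head */
}

int main(void)
{
	void *a[1], *b[1];
	void *list = NULL;

	fastpath_push(&list, a, 0);
	fastpath_push(&list, b, 0);   /* list is now b -> a -> NULL */
	printf("head %p, a at %p, b at %p\n", list, (void *)a, (void *)b);
	return 0;
}
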
@@ -1759,14 +1767,6 @@ static inline int slab_order(int size, int min_objects,
1759 int rem; 1767 int rem;
1760 int min_order = slub_min_order; 1768 int min_order = slub_min_order;
1761 1769
1762 /*
1763 * If we would create too many object per slab then reduce
1764 * the slab order even if it goes below slub_min_order.
1765 */
1766 while (min_order > 0 &&
1767 (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
1768 min_order--;
1769
1770 for (order = max(min_order, 1770 for (order = max(min_order,
1771 fls(min_objects * size - 1) - PAGE_SHIFT); 1771 fls(min_objects * size - 1) - PAGE_SHIFT);
1772 order <= max_order; order++) { 1772 order <= max_order; order++) {
@@ -1781,9 +1781,6 @@ static inline int slab_order(int size, int min_objects,
1781 if (rem <= slab_size / fract_leftover) 1781 if (rem <= slab_size / fract_leftover)
1782 break; 1782 break;
1783 1783
1784 /* If the next size is too high then exit now */
1785 if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
1786 break;
1787 } 1784 }
1788 1785
1789 return order; 1786 return order;
@@ -1858,6 +1855,16 @@ static unsigned long calculate_alignment(unsigned long flags,
1858 return ALIGN(align, sizeof(void *)); 1855 return ALIGN(align, sizeof(void *));
1859} 1856}
1860 1857
1858static void init_kmem_cache_cpu(struct kmem_cache *s,
1859 struct kmem_cache_cpu *c)
1860{
1861 c->page = NULL;
1862 c->freelist = NULL;
1863 c->node = 0;
1864 c->offset = s->offset / sizeof(void *);
1865 c->objsize = s->objsize;
1866}
1867
1861static void init_kmem_cache_node(struct kmem_cache_node *n) 1868static void init_kmem_cache_node(struct kmem_cache_node *n)
1862{ 1869{
1863 n->nr_partial = 0; 1870 n->nr_partial = 0;
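
init_kmem_cache_cpu() caches two values so the fast paths never have to dereference struct kmem_cache: the object size used for the __GFP_ZERO memset, and the free-pointer location converted from a byte offset (s->offset) into an index into pointer-sized words. A worked example of that conversion, using an assumed byte offset:

/* Worked example of the offset conversion in init_kmem_cache_cpu(). */
#include <stdio.h>

int main(void)
{
	unsigned long byte_offset = 16;                 /* assumed s->offset */
	unsigned int word_index = byte_offset / sizeof(void *);

	/* On a 64-bit build this prints 2: object[2] holds the free pointer. */
	printf("free pointer lives at word index %u\n", word_index);
	return 0;
}
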
@@ -1869,6 +1876,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
1869#endif 1876#endif
1870} 1877}
1871 1878
1879#ifdef CONFIG_SMP
1880/*
1881 * Per cpu array for per cpu structures.
1882 *
1883 * The per cpu array places all kmem_cache_cpu structures from one processor
1884 * close together meaning that it becomes possible that multiple per cpu
1885 * structures are contained in one cacheline. This may be particularly
1886 * beneficial for the kmalloc caches.
1887 *
1888 * A desktop system typically has around 60-80 slabs. With 100 here we are
1889 * likely able to get per cpu structures for all caches from the array defined
1890 * here. We must be able to cover all kmalloc caches during bootstrap.
1891 *
1892 * If the per cpu array is exhausted then fall back to kmalloc
1893 * of individual cachelines. No sharing is possible then.
1894 */
1895#define NR_KMEM_CACHE_CPU 100
1896
1897static DEFINE_PER_CPU(struct kmem_cache_cpu,
1898 kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1899
1900static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1901static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
1902
1903static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1904 int cpu, gfp_t flags)
1905{
1906 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
1907
1908 if (c)
1909 per_cpu(kmem_cache_cpu_free, cpu) =
1910 (void *)c->freelist;
1911 else {
1912 /* Table overflow: So allocate ourselves */
1913 c = kmalloc_node(
1914 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
1915 flags, cpu_to_node(cpu));
1916 if (!c)
1917 return NULL;
1918 }
1919
1920 init_kmem_cache_cpu(s, c);
1921 return c;
1922}
1923
1924static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1925{
1926 if (c < per_cpu(kmem_cache_cpu, cpu) ||
1927 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1928 kfree(c);
1929 return;
1930 }
1931 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
1932 per_cpu(kmem_cache_cpu_free, cpu) = c;
1933}
1934
1935static void free_kmem_cache_cpus(struct kmem_cache *s)
1936{
1937 int cpu;
1938
1939 for_each_online_cpu(cpu) {
1940 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1941
1942 if (c) {
1943 s->cpu_slab[cpu] = NULL;
1944 free_kmem_cache_cpu(c, cpu);
1945 }
1946 }
1947}
1948
1949static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1950{
1951 int cpu;
1952
1953 for_each_online_cpu(cpu) {
1954 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1955
1956 if (c)
1957 continue;
1958
1959 c = alloc_kmem_cache_cpu(s, cpu, flags);
1960 if (!c) {
1961 free_kmem_cache_cpus(s);
1962 return 0;
1963 }
1964 s->cpu_slab[cpu] = c;
1965 }
1966 return 1;
1967}
1968
1969/*
1970 * Initialize the per cpu array.
1971 */
1972static void init_alloc_cpu_cpu(int cpu)
1973{
1974 int i;
1975
1976 if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
1977 return;
1978
1979 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
1980 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
1981
1982 cpu_set(cpu, kmem_cach_cpu_free_init_once);
1983}
1984
1985static void __init init_alloc_cpu(void)
1986{
1987 int cpu;
1988
1989 for_each_online_cpu(cpu)
1990 init_alloc_cpu_cpu(cpu);
1991 }
1992
1993#else
1994static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
1995static inline void init_alloc_cpu(void) {}
1996
1997static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1998{
1999 init_kmem_cache_cpu(s, &s->cpu_slab);
2000 return 1;
2001}
2002#endif
2003
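
The bootstrap pool above threads its spare kmem_cache_cpu entries into a per-cpu free list by reusing their freelist field, falls back to kmalloc_node() once the reserved entries run out, and tells the two kinds apart on free with an address-range check. A standalone model of that reservation scheme; the pool size and field names are illustrative, not the kernel's:

/* Standalone model of a static pool threaded through one of its own fields,
 * with malloc() as the overflow fallback. */
#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 4                       /* NR_KMEM_CACHE_CPU is 100 */

struct entry { void *freelist; int data; };

static struct entry pool[POOL_SIZE];
static struct entry *pool_free;           /* head of the threaded free list */

static void pool_init(void)
{
	for (int i = POOL_SIZE - 1; i >= 0; i--) {
		pool[i].freelist = pool_free; /* reuse the field as a link */
		pool_free = &pool[i];
	}
}

static struct entry *entry_alloc(void)
{
	struct entry *e = pool_free;
	if (e)
		pool_free = e->freelist;      /* pop from the reserved pool */
	else
		e = malloc(sizeof(*e));       /* pool exhausted: fall back */
	return e;
}

static void entry_free(struct entry *e)
{
	if (e < pool || e >= pool + POOL_SIZE) {
		free(e);                      /* came from the heap */
		return;
	}
	e->freelist = pool_free;              /* push back onto the pool */
	pool_free = e;
}

int main(void)
{
	pool_init();
	struct entry *a = entry_alloc();
	printf("from pool: %d\n", a >= pool && a < pool + POOL_SIZE);
	entry_free(a);
	return 0;
}
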
1872#ifdef CONFIG_NUMA 2004#ifdef CONFIG_NUMA
1873/* 2005/*
1874 * No kmalloc_node yet so do it by hand. We know that this is the first 2006 * No kmalloc_node yet so do it by hand. We know that this is the first
@@ -1876,10 +2008,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
1876 * possible. 2008 * possible.
1877 * 2009 *
1878 * Note that this function only works on the kmalloc_node_cache 2010 * Note that this function only works on the kmalloc_node_cache
1879 * when allocating for the kmalloc_node_cache. 2011 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2012 * memory on a fresh node that has no slab structures yet.
1880 */ 2013 */
1881static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, 2014static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
1882 int node) 2015 int node)
1883{ 2016{
1884 struct page *page; 2017 struct page *page;
1885 struct kmem_cache_node *n; 2018 struct kmem_cache_node *n;
@@ -1908,12 +2041,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1908 init_kmem_cache_node(n); 2041 init_kmem_cache_node(n);
1909 atomic_long_inc(&n->nr_slabs); 2042 atomic_long_inc(&n->nr_slabs);
1910 add_partial(n, page); 2043 add_partial(n, page);
1911
1912 /*
1913 * new_slab() disables interupts. If we do not reenable interrupts here
1914 * then bootup would continue with interrupts disabled.
1915 */
1916 local_irq_enable();
1917 return n; 2044 return n;
1918} 2045}
1919 2046
@@ -1921,7 +2048,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
1921{ 2048{
1922 int node; 2049 int node;
1923 2050
1924 for_each_online_node(node) { 2051 for_each_node_state(node, N_NORMAL_MEMORY) {
1925 struct kmem_cache_node *n = s->node[node]; 2052 struct kmem_cache_node *n = s->node[node];
1926 if (n && n != &s->local_node) 2053 if (n && n != &s->local_node)
1927 kmem_cache_free(kmalloc_caches, n); 2054 kmem_cache_free(kmalloc_caches, n);
@@ -1939,7 +2066,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1939 else 2066 else
1940 local_node = 0; 2067 local_node = 0;
1941 2068
1942 for_each_online_node(node) { 2069 for_each_node_state(node, N_NORMAL_MEMORY) {
1943 struct kmem_cache_node *n; 2070 struct kmem_cache_node *n;
1944 2071
1945 if (local_node == node) 2072 if (local_node == node)
@@ -2077,21 +2204,14 @@ static int calculate_sizes(struct kmem_cache *s)
2077 */ 2204 */
2078 s->objects = (PAGE_SIZE << s->order) / size; 2205 s->objects = (PAGE_SIZE << s->order) / size;
2079 2206
2080 /* 2207 return !!s->objects;
2081 * Verify that the number of objects is within permitted limits.
2082 * The page->inuse field is only 16 bit wide! So we cannot have
2083 * more than 64k objects per slab.
2084 */
2085 if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB)
2086 return 0;
2087 return 1;
2088 2208
2089} 2209}
2090 2210
2091static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2211static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2092 const char *name, size_t size, 2212 const char *name, size_t size,
2093 size_t align, unsigned long flags, 2213 size_t align, unsigned long flags,
2094 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2214 void (*ctor)(struct kmem_cache *, void *))
2095{ 2215{
2096 memset(s, 0, kmem_size); 2216 memset(s, 0, kmem_size);
2097 s->name = name; 2217 s->name = name;
@@ -2107,9 +2227,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2107#ifdef CONFIG_NUMA 2227#ifdef CONFIG_NUMA
2108 s->defrag_ratio = 100; 2228 s->defrag_ratio = 100;
2109#endif 2229#endif
2230 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2231 goto error;
2110 2232
2111 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2233 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2112 return 1; 2234 return 1;
2235 free_kmem_cache_nodes(s);
2113error: 2236error:
2114 if (flags & SLAB_PANIC) 2237 if (flags & SLAB_PANIC)
2115 panic("Cannot create slab %s size=%lu realsize=%u " 2238 panic("Cannot create slab %s size=%lu realsize=%u "
@@ -2192,7 +2315,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2192 flush_all(s); 2315 flush_all(s);
2193 2316
2194 /* Attempt to free all objects */ 2317 /* Attempt to free all objects */
2195 for_each_online_node(node) { 2318 free_kmem_cache_cpus(s);
2319 for_each_node_state(node, N_NORMAL_MEMORY) {
2196 struct kmem_cache_node *n = get_node(s, node); 2320 struct kmem_cache_node *n = get_node(s, node);
2197 2321
2198 n->nr_partial -= free_list(s, n, &n->partial); 2322 n->nr_partial -= free_list(s, n, &n->partial);
@@ -2227,11 +2351,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2227 * Kmalloc subsystem 2351 * Kmalloc subsystem
2228 *******************************************************************/ 2352 *******************************************************************/
2229 2353
2230struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 2354struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
2231EXPORT_SYMBOL(kmalloc_caches); 2355EXPORT_SYMBOL(kmalloc_caches);
2232 2356
2233#ifdef CONFIG_ZONE_DMA 2357#ifdef CONFIG_ZONE_DMA
2234static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 2358static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
2235#endif 2359#endif
2236 2360
2237static int __init setup_slub_min_order(char *str) 2361static int __init setup_slub_min_order(char *str)
@@ -2397,12 +2521,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2397 return ZERO_SIZE_PTR; 2521 return ZERO_SIZE_PTR;
2398 2522
2399 index = size_index[(size - 1) / 8]; 2523 index = size_index[(size - 1) / 8];
2400 } else { 2524 } else
2401 if (size > KMALLOC_MAX_SIZE)
2402 return NULL;
2403
2404 index = fls(size - 1); 2525 index = fls(size - 1);
2405 }
2406 2526
2407#ifdef CONFIG_ZONE_DMA 2527#ifdef CONFIG_ZONE_DMA
2408 if (unlikely((flags & SLUB_DMA))) 2528 if (unlikely((flags & SLUB_DMA)))
@@ -2414,9 +2534,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2414 2534
2415void *__kmalloc(size_t size, gfp_t flags) 2535void *__kmalloc(size_t size, gfp_t flags)
2416{ 2536{
2417 struct kmem_cache *s = get_slab(size, flags); 2537 struct kmem_cache *s;
2418 2538
2419 if (ZERO_OR_NULL_PTR(s)) 2539 if (unlikely(size > PAGE_SIZE / 2))
2540 return (void *)__get_free_pages(flags | __GFP_COMP,
2541 get_order(size));
2542
2543 s = get_slab(size, flags);
2544
2545 if (unlikely(ZERO_OR_NULL_PTR(s)))
2420 return s; 2546 return s;
2421 2547
2422 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2548 return slab_alloc(s, flags, -1, __builtin_return_address(0));
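
Two related changes meet here: get_slab() no longer caps sizes itself because kmalloc_caches[] now only covers power-of-two buckets below PAGE_SIZE, and __kmalloc() routes anything above PAGE_SIZE/2 straight to the page allocator as a compound page (kfree() detects that case via PageSlab(), see below). A worked example of the bucket selection used for the larger sizes that go through fls(size - 1); smaller sizes use the size_index table kept above. The fls helper below is a plain C stand-in:

/* Worked example of the fls(size - 1) kmalloc bucket selection. */
#include <stdio.h>

static int fls_long(unsigned long x)   /* index of the highest set bit, 1-based */
{
	int r = 0;
	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long sizes[] = { 256, 300, 1024, 2048, 2049 };
	for (int i = 0; i < 5; i++)
		printf("size %4lu -> kmalloc-%lu (index %d)\n", sizes[i],
		       1UL << fls_long(sizes[i] - 1), fls_long(sizes[i] - 1));
	return 0;
}
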
@@ -2426,9 +2552,15 @@ EXPORT_SYMBOL(__kmalloc);
2426#ifdef CONFIG_NUMA 2552#ifdef CONFIG_NUMA
2427void *__kmalloc_node(size_t size, gfp_t flags, int node) 2553void *__kmalloc_node(size_t size, gfp_t flags, int node)
2428{ 2554{
2429 struct kmem_cache *s = get_slab(size, flags); 2555 struct kmem_cache *s;
2430 2556
2431 if (ZERO_OR_NULL_PTR(s)) 2557 if (unlikely(size > PAGE_SIZE / 2))
2558 return (void *)__get_free_pages(flags | __GFP_COMP,
2559 get_order(size));
2560
2561 s = get_slab(size, flags);
2562
2563 if (unlikely(ZERO_OR_NULL_PTR(s)))
2432 return s; 2564 return s;
2433 2565
2434 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2566 return slab_alloc(s, flags, node, __builtin_return_address(0));
@@ -2441,7 +2573,8 @@ size_t ksize(const void *object)
2441 struct page *page; 2573 struct page *page;
2442 struct kmem_cache *s; 2574 struct kmem_cache *s;
2443 2575
2444 if (ZERO_OR_NULL_PTR(object)) 2576 BUG_ON(!object);
2577 if (unlikely(object == ZERO_SIZE_PTR))
2445 return 0; 2578 return 0;
2446 2579
2447 page = get_object_page(object); 2580 page = get_object_page(object);
@@ -2473,22 +2606,17 @@ EXPORT_SYMBOL(ksize);
2473 2606
2474void kfree(const void *x) 2607void kfree(const void *x)
2475{ 2608{
2476 struct kmem_cache *s;
2477 struct page *page; 2609 struct page *page;
2478 2610
2479 /* 2611 if (unlikely(ZERO_OR_NULL_PTR(x)))
2480 * This has to be an unsigned comparison. According to Linus
2481 * some gcc version treat a pointer as a signed entity. Then
2482 * this comparison would be true for all "negative" pointers
2483 * (which would cover the whole upper half of the address space).
2484 */
2485 if (ZERO_OR_NULL_PTR(x))
2486 return; 2612 return;
2487 2613
2488 page = virt_to_head_page(x); 2614 page = virt_to_head_page(x);
2489 s = page->slab; 2615 if (unlikely(!PageSlab(page))) {
2490 2616 put_page(page);
2491 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2617 return;
2618 }
2619 slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
2492} 2620}
2493EXPORT_SYMBOL(kfree); 2621EXPORT_SYMBOL(kfree);
2494 2622
@@ -2517,7 +2645,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2517 return -ENOMEM; 2645 return -ENOMEM;
2518 2646
2519 flush_all(s); 2647 flush_all(s);
2520 for_each_online_node(node) { 2648 for_each_node_state(node, N_NORMAL_MEMORY) {
2521 n = get_node(s, node); 2649 n = get_node(s, node);
2522 2650
2523 if (!n->nr_partial) 2651 if (!n->nr_partial)
@@ -2575,6 +2703,8 @@ void __init kmem_cache_init(void)
2575 int i; 2703 int i;
2576 int caches = 0; 2704 int caches = 0;
2577 2705
2706 init_alloc_cpu();
2707
2578#ifdef CONFIG_NUMA 2708#ifdef CONFIG_NUMA
2579 /* 2709 /*
2580 * Must first have the slab cache available for the allocations of the 2710 * Must first have the slab cache available for the allocations of the
@@ -2602,7 +2732,7 @@ void __init kmem_cache_init(void)
2602 caches++; 2732 caches++;
2603 } 2733 }
2604 2734
2605 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 2735 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
2606 create_kmalloc_cache(&kmalloc_caches[i], 2736 create_kmalloc_cache(&kmalloc_caches[i],
2607 "kmalloc", 1 << i, GFP_KERNEL); 2737 "kmalloc", 1 << i, GFP_KERNEL);
2608 caches++; 2738 caches++;
@@ -2629,16 +2759,18 @@ void __init kmem_cache_init(void)
2629 slab_state = UP; 2759 slab_state = UP;
2630 2760
2631 /* Provide the correct kmalloc names now that the caches are up */ 2761 /* Provide the correct kmalloc names now that the caches are up */
2632 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2762 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
2633 kmalloc_caches[i]. name = 2763 kmalloc_caches[i]. name =
2634 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2764 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2635 2765
2636#ifdef CONFIG_SMP 2766#ifdef CONFIG_SMP
2637 register_cpu_notifier(&slab_notifier); 2767 register_cpu_notifier(&slab_notifier);
2768 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2769 nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
2770#else
2771 kmem_size = sizeof(struct kmem_cache);
2638#endif 2772#endif
2639 2773
2640 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2641 nr_cpu_ids * sizeof(struct page *);
2642 2774
2643 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2775 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2644 " CPUs=%d, Nodes=%d\n", 2776 " CPUs=%d, Nodes=%d\n",
@@ -2669,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s)
2669 2801
2670static struct kmem_cache *find_mergeable(size_t size, 2802static struct kmem_cache *find_mergeable(size_t size,
2671 size_t align, unsigned long flags, const char *name, 2803 size_t align, unsigned long flags, const char *name,
2672 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2804 void (*ctor)(struct kmem_cache *, void *))
2673{ 2805{
2674 struct kmem_cache *s; 2806 struct kmem_cache *s;
2675 2807
@@ -2710,19 +2842,28 @@ static struct kmem_cache *find_mergeable(size_t size,
2710 2842
2711struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2843struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2712 size_t align, unsigned long flags, 2844 size_t align, unsigned long flags,
2713 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2845 void (*ctor)(struct kmem_cache *, void *))
2714{ 2846{
2715 struct kmem_cache *s; 2847 struct kmem_cache *s;
2716 2848
2717 down_write(&slub_lock); 2849 down_write(&slub_lock);
2718 s = find_mergeable(size, align, flags, name, ctor); 2850 s = find_mergeable(size, align, flags, name, ctor);
2719 if (s) { 2851 if (s) {
2852 int cpu;
2853
2720 s->refcount++; 2854 s->refcount++;
2721 /* 2855 /*
2722 * Adjust the object sizes so that we clear 2856 * Adjust the object sizes so that we clear
2723 * the complete object on kzalloc. 2857 * the complete object on kzalloc.
2724 */ 2858 */
2725 s->objsize = max(s->objsize, (int)size); 2859 s->objsize = max(s->objsize, (int)size);
2860
2861 /*
2862 * And then we need to update the object size in the
2863 * per cpu structures
2864 */
2865 for_each_online_cpu(cpu)
2866 get_cpu_slab(s, cpu)->objsize = s->objsize;
2726 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2867 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2727 up_write(&slub_lock); 2868 up_write(&slub_lock);
2728 if (sysfs_slab_alias(s, name)) 2869 if (sysfs_slab_alias(s, name))
@@ -2765,15 +2906,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2765 unsigned long flags; 2906 unsigned long flags;
2766 2907
2767 switch (action) { 2908 switch (action) {
2909 case CPU_UP_PREPARE:
2910 case CPU_UP_PREPARE_FROZEN:
2911 init_alloc_cpu_cpu(cpu);
2912 down_read(&slub_lock);
2913 list_for_each_entry(s, &slab_caches, list)
2914 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
2915 GFP_KERNEL);
2916 up_read(&slub_lock);
2917 break;
2918
2768 case CPU_UP_CANCELED: 2919 case CPU_UP_CANCELED:
2769 case CPU_UP_CANCELED_FROZEN: 2920 case CPU_UP_CANCELED_FROZEN:
2770 case CPU_DEAD: 2921 case CPU_DEAD:
2771 case CPU_DEAD_FROZEN: 2922 case CPU_DEAD_FROZEN:
2772 down_read(&slub_lock); 2923 down_read(&slub_lock);
2773 list_for_each_entry(s, &slab_caches, list) { 2924 list_for_each_entry(s, &slab_caches, list) {
2925 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2926
2774 local_irq_save(flags); 2927 local_irq_save(flags);
2775 __flush_cpu_slab(s, cpu); 2928 __flush_cpu_slab(s, cpu);
2776 local_irq_restore(flags); 2929 local_irq_restore(flags);
2930 free_kmem_cache_cpu(c, cpu);
2931 s->cpu_slab[cpu] = NULL;
2777 } 2932 }
2778 up_read(&slub_lock); 2933 up_read(&slub_lock);
2779 break; 2934 break;
@@ -2790,9 +2945,14 @@ static struct notifier_block __cpuinitdata slab_notifier =
2790 2945
2791void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2946void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2792{ 2947{
2793 struct kmem_cache *s = get_slab(size, gfpflags); 2948 struct kmem_cache *s;
2949
2950 if (unlikely(size > PAGE_SIZE / 2))
2951 return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2952 get_order(size));
2953 s = get_slab(size, gfpflags);
2794 2954
2795 if (ZERO_OR_NULL_PTR(s)) 2955 if (unlikely(ZERO_OR_NULL_PTR(s)))
2796 return s; 2956 return s;
2797 2957
2798 return slab_alloc(s, gfpflags, -1, caller); 2958 return slab_alloc(s, gfpflags, -1, caller);
@@ -2801,9 +2961,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2801void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2961void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2802 int node, void *caller) 2962 int node, void *caller)
2803{ 2963{
2804 struct kmem_cache *s = get_slab(size, gfpflags); 2964 struct kmem_cache *s;
2965
2966 if (unlikely(size > PAGE_SIZE / 2))
2967 return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2968 get_order(size));
2969 s = get_slab(size, gfpflags);
2805 2970
2806 if (ZERO_OR_NULL_PTR(s)) 2971 if (unlikely(ZERO_OR_NULL_PTR(s)))
2807 return s; 2972 return s;
2808 2973
2809 return slab_alloc(s, gfpflags, node, caller); 2974 return slab_alloc(s, gfpflags, node, caller);
@@ -2902,7 +3067,7 @@ static long validate_slab_cache(struct kmem_cache *s)
2902 return -ENOMEM; 3067 return -ENOMEM;
2903 3068
2904 flush_all(s); 3069 flush_all(s);
2905 for_each_online_node(node) { 3070 for_each_node_state(node, N_NORMAL_MEMORY) {
2906 struct kmem_cache_node *n = get_node(s, node); 3071 struct kmem_cache_node *n = get_node(s, node);
2907 3072
2908 count += validate_slab_node(s, n, map); 3073 count += validate_slab_node(s, n, map);
@@ -3116,13 +3281,13 @@ static int list_locations(struct kmem_cache *s, char *buf,
3116 int node; 3281 int node;
3117 3282
3118 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3283 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3119 GFP_KERNEL)) 3284 GFP_TEMPORARY))
3120 return sprintf(buf, "Out of memory\n"); 3285 return sprintf(buf, "Out of memory\n");
3121 3286
3122 /* Push back cpu slabs */ 3287 /* Push back cpu slabs */
3123 flush_all(s); 3288 flush_all(s);
3124 3289
3125 for_each_online_node(node) { 3290 for_each_node_state(node, N_NORMAL_MEMORY) {
3126 struct kmem_cache_node *n = get_node(s, node); 3291 struct kmem_cache_node *n = get_node(s, node);
3127 unsigned long flags; 3292 unsigned long flags;
3128 struct page *page; 3293 struct page *page;
@@ -3230,11 +3395,18 @@ static unsigned long slab_objects(struct kmem_cache *s,
3230 per_cpu = nodes + nr_node_ids; 3395 per_cpu = nodes + nr_node_ids;
3231 3396
3232 for_each_possible_cpu(cpu) { 3397 for_each_possible_cpu(cpu) {
3233 struct page *page = s->cpu_slab[cpu]; 3398 struct page *page;
3234 int node; 3399 int node;
3400 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3235 3401
3402 if (!c)
3403 continue;
3404
3405 page = c->page;
3406 node = c->node;
3407 if (node < 0)
3408 continue;
3236 if (page) { 3409 if (page) {
3237 node = page_to_nid(page);
3238 if (flags & SO_CPU) { 3410 if (flags & SO_CPU) {
3239 int x = 0; 3411 int x = 0;
3240 3412
@@ -3249,7 +3421,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
3249 } 3421 }
3250 } 3422 }
3251 3423
3252 for_each_online_node(node) { 3424 for_each_node_state(node, N_NORMAL_MEMORY) {
3253 struct kmem_cache_node *n = get_node(s, node); 3425 struct kmem_cache_node *n = get_node(s, node);
3254 3426
3255 if (flags & SO_PARTIAL) { 3427 if (flags & SO_PARTIAL) {
@@ -3277,7 +3449,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
3277 3449
3278 x = sprintf(buf, "%lu", total); 3450 x = sprintf(buf, "%lu", total);
3279#ifdef CONFIG_NUMA 3451#ifdef CONFIG_NUMA
3280 for_each_online_node(node) 3452 for_each_node_state(node, N_NORMAL_MEMORY)
3281 if (nodes[node]) 3453 if (nodes[node])
3282 x += sprintf(buf + x, " N%d=%lu", 3454 x += sprintf(buf + x, " N%d=%lu",
3283 node, nodes[node]); 3455 node, nodes[node]);
@@ -3291,13 +3463,19 @@ static int any_slab_objects(struct kmem_cache *s)
3291 int node; 3463 int node;
3292 int cpu; 3464 int cpu;
3293 3465
3294 for_each_possible_cpu(cpu) 3466 for_each_possible_cpu(cpu) {
3295 if (s->cpu_slab[cpu]) 3467 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3468
3469 if (c && c->page)
3296 return 1; 3470 return 1;
3471 }
3297 3472
3298 for_each_node(node) { 3473 for_each_online_node(node) {
3299 struct kmem_cache_node *n = get_node(s, node); 3474 struct kmem_cache_node *n = get_node(s, node);
3300 3475
3476 if (!n)
3477 continue;
3478
3301 if (n->nr_partial || atomic_long_read(&n->nr_slabs)) 3479 if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3302 return 1; 3480 return 1;
3303 } 3481 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 000000000000..d3b718b0c20a
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,148 @@
1/*
2 * Virtual Memory Map support
3 *
4 * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
5 *
6 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
7 * virt_to_page, page_address() to be implemented as a base offset
8 * calculation without memory access.
9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memory map is essentially
12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap.
16 *
17 * The architecture is expected to provide a vmemmap_populate() function
18 * to instantiate the mapping.
19 */
20#include <linux/mm.h>
21#include <linux/mmzone.h>
22#include <linux/bootmem.h>
23#include <linux/highmem.h>
24#include <linux/module.h>
25#include <linux/spinlock.h>
26#include <linux/vmalloc.h>
27#include <asm/dma.h>
28#include <asm/pgalloc.h>
29#include <asm/pgtable.h>
30
31/*
32 * Allocate a block of memory to be used to back the virtual memory map
33 * or to back the page tables that are used to create the mapping.
34 * Uses the main allocators if they are available, else bootmem.
35 */
36void * __meminit vmemmap_alloc_block(unsigned long size, int node)
37{
38 /* If the main allocator is up use that, fallback to bootmem. */
39 if (slab_is_available()) {
40 struct page *page = alloc_pages_node(node,
41 GFP_KERNEL | __GFP_ZERO, get_order(size));
42 if (page)
43 return page_address(page);
44 return NULL;
45 } else
46 return __alloc_bootmem_node(NODE_DATA(node), size, size,
47 __pa(MAX_DMA_ADDRESS));
48}
49
50void __meminit vmemmap_verify(pte_t *pte, int node,
51 unsigned long start, unsigned long end)
52{
53 unsigned long pfn = pte_pfn(*pte);
54 int actual_node = early_pfn_to_nid(pfn);
55
56 if (actual_node != node)
57 printk(KERN_WARNING "[%lx-%lx] potential offnode "
58 "page_structs\n", start, end - 1);
59}
60
61pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
62{
63 pte_t *pte = pte_offset_kernel(pmd, addr);
64 if (pte_none(*pte)) {
65 pte_t entry;
66 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
67 if (!p)
68 return 0;
69 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
70 set_pte_at(&init_mm, addr, pte, entry);
71 }
72 return pte;
73}
74
75pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
76{
77 pmd_t *pmd = pmd_offset(pud, addr);
78 if (pmd_none(*pmd)) {
79 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
80 if (!p)
81 return 0;
82 pmd_populate_kernel(&init_mm, pmd, p);
83 }
84 return pmd;
85}
86
87pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
88{
89 pud_t *pud = pud_offset(pgd, addr);
90 if (pud_none(*pud)) {
91 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
92 if (!p)
93 return 0;
94 pud_populate(&init_mm, pud, p);
95 }
96 return pud;
97}
98
99pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
100{
101 pgd_t *pgd = pgd_offset_k(addr);
102 if (pgd_none(*pgd)) {
103 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
104 if (!p)
105 return 0;
106 pgd_populate(&init_mm, pgd, p);
107 }
108 return pgd;
109}
110
111int __meminit vmemmap_populate_basepages(struct page *start_page,
112 unsigned long size, int node)
113{
114 unsigned long addr = (unsigned long)start_page;
115 unsigned long end = (unsigned long)(start_page + size);
116 pgd_t *pgd;
117 pud_t *pud;
118 pmd_t *pmd;
119 pte_t *pte;
120
121 for (; addr < end; addr += PAGE_SIZE) {
122 pgd = vmemmap_pgd_populate(addr, node);
123 if (!pgd)
124 return -ENOMEM;
125 pud = vmemmap_pud_populate(pgd, addr, node);
126 if (!pud)
127 return -ENOMEM;
128 pmd = vmemmap_pmd_populate(pud, addr, node);
129 if (!pmd)
130 return -ENOMEM;
131 pte = vmemmap_pte_populate(pmd, addr, node);
132 if (!pte)
133 return -ENOMEM;
134 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
135 }
136
137 return 0;
138}
139
140struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
141{
142 struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
143 int error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
144 if (error)
145 return NULL;
146
147 return map;
148}
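
The payoff of this new file is that, once every present section's struct page array sits at a fixed virtual offset, pfn_to_page() and page_to_pfn() degenerate into pointer arithmetic, and the pgd/pud/pmd/pte walk above only spends pages on the parts of the map that are actually populated. A minimal userspace sketch of the base-plus-offset idea; the base symbol and struct layout are made up, real kernels use an architecture-defined region filled in by vmemmap_populate():

/* Sketch: a virtually mapped mem_map turns pfn<->page into arithmetic. */
#include <stdio.h>

struct page { unsigned long flags; /* ... */ };

/* Hypothetical base of the virtual memory map (arch-specific in reality). */
static struct page *vmemmap_base;

static struct page *pfn_to_page(unsigned long pfn)
{
	return vmemmap_base + pfn;               /* base + offset, no lookup */
}

static unsigned long page_to_pfn(struct page *page)
{
	return (unsigned long)(page - vmemmap_base);
}

int main(void)
{
	static struct page fake_map[8];          /* stands in for the mapping */
	vmemmap_base = fake_map;

	struct page *p = pfn_to_page(5);
	printf("pfn 5 -> page %p -> pfn %lu\n", (void *)p, page_to_pfn(p));
	return 0;
}
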
diff --git a/mm/sparse.c b/mm/sparse.c
index 239f5a720d38..08fb14f5eea3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -9,6 +9,8 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
11#include <asm/dma.h> 11#include <asm/dma.h>
12#include <asm/pgalloc.h>
13#include <asm/pgtable.h>
12 14
13/* 15/*
14 * Permanent SPARSEMEM data: 16 * Permanent SPARSEMEM data:
@@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
106 108
107/* 109/*
108 * Although written for the SPARSEMEM_EXTREME case, this happens 110 * Although written for the SPARSEMEM_EXTREME case, this happens
109 * to also work for the flat array case becase 111 * to also work for the flat array case because
110 * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 112 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
111 */ 113 */
112int __section_nr(struct mem_section* ms) 114int __section_nr(struct mem_section* ms)
@@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
176 if (nid != early_pfn_to_nid(pfn)) 178 if (nid != early_pfn_to_nid(pfn))
177 continue; 179 continue;
178 180
179 if (pfn_valid(pfn)) 181 if (pfn_present(pfn))
180 nr_pages += PAGES_PER_SECTION; 182 nr_pages += PAGES_PER_SECTION;
181 } 183 }
182 184
@@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
204} 206}
205 207
206static int __meminit sparse_init_one_section(struct mem_section *ms, 208static int __meminit sparse_init_one_section(struct mem_section *ms,
207 unsigned long pnum, struct page *mem_map) 209 unsigned long pnum, struct page *mem_map,
210 unsigned long *pageblock_bitmap)
208{ 211{
209 if (!valid_section(ms)) 212 if (!present_section(ms))
210 return -EINVAL; 213 return -EINVAL;
211 214
212 ms->section_mem_map &= ~SECTION_MAP_MASK; 215 ms->section_mem_map &= ~SECTION_MAP_MASK;
213 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); 216 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
217 SECTION_HAS_MEM_MAP;
218 ms->pageblock_flags = pageblock_bitmap;
214 219
215 return 1; 220 return 1;
216} 221}
@@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
221 return NULL; 226 return NULL;
222} 227}
223 228
224static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 229static unsigned long usemap_size(void)
225{ 230{
226 struct page *map; 231 unsigned long size_bytes;
232 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
233 size_bytes = roundup(size_bytes, sizeof(unsigned long));
234 return size_bytes;
235}
236
237#ifdef CONFIG_MEMORY_HOTPLUG
238static unsigned long *__kmalloc_section_usemap(void)
239{
240 return kmalloc(usemap_size(), GFP_KERNEL);
241}
242#endif /* CONFIG_MEMORY_HOTPLUG */
243
244static unsigned long *sparse_early_usemap_alloc(unsigned long pnum)
245{
246 unsigned long *usemap;
227 struct mem_section *ms = __nr_to_section(pnum); 247 struct mem_section *ms = __nr_to_section(pnum);
228 int nid = sparse_early_nid(ms); 248 int nid = sparse_early_nid(ms);
229 249
250 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
251 if (usemap)
252 return usemap;
253
254 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
255 nid = 0;
256
257 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
258 return NULL;
259}
260
261#ifndef CONFIG_SPARSEMEM_VMEMMAP
262struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
263{
264 struct page *map;
265
230 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 266 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
231 if (map) 267 if (map)
232 return map; 268 return map;
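
Alongside the mem_map, each present section now carries a usemap holding its pageblock flags; usemap_size() rounds the flag bits up to whole bytes and then to whole unsigned longs so the bitmap can be handled word at a time. A worked example of that sizing, with an assumed bit count since the real SECTION_BLOCKFLAGS_BITS depends on the architecture's section and pageblock sizes:

/* Worked example of the two-step rounding in usemap_size(). */
#include <stdio.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long bits = 100;                       /* assumed example */
	unsigned long bytes = ROUNDUP(bits, 8) / 8;     /* 13 bytes */
	unsigned long size = ROUNDUP(bytes, sizeof(unsigned long));

	/* Prints "100 bits -> 13 bytes -> 16 bytes allocated" on 64-bit. */
	printf("%lu bits -> %lu bytes -> %lu bytes allocated\n",
	       bits, bytes, size);
	return 0;
}
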
@@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
238 274
239 map = alloc_bootmem_node(NODE_DATA(nid), 275 map = alloc_bootmem_node(NODE_DATA(nid),
240 sizeof(struct page) * PAGES_PER_SECTION); 276 sizeof(struct page) * PAGES_PER_SECTION);
277 return map;
278}
279#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
280
281struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
282{
283 struct page *map;
284 struct mem_section *ms = __nr_to_section(pnum);
285 int nid = sparse_early_nid(ms);
286
287 map = sparse_mem_map_populate(pnum, nid);
241 if (map) 288 if (map)
242 return map; 289 return map;
243 290
244 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); 291 printk(KERN_ERR "%s: sparsemem memory map backing failed "
292 "some memory will not be available.\n", __FUNCTION__);
245 ms->section_mem_map = 0; 293 ms->section_mem_map = 0;
246 return NULL; 294 return NULL;
247} 295}
@@ -254,19 +302,38 @@ void __init sparse_init(void)
254{ 302{
255 unsigned long pnum; 303 unsigned long pnum;
256 struct page *map; 304 struct page *map;
305 unsigned long *usemap;
257 306
258 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 307 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
259 if (!valid_section_nr(pnum)) 308 if (!present_section_nr(pnum))
260 continue; 309 continue;
261 310
262 map = sparse_early_mem_map_alloc(pnum); 311 map = sparse_early_mem_map_alloc(pnum);
263 if (!map) 312 if (!map)
264 continue; 313 continue;
265 sparse_init_one_section(__nr_to_section(pnum), pnum, map); 314
315 usemap = sparse_early_usemap_alloc(pnum);
316 if (!usemap)
317 continue;
318
319 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
320 usemap);
266 } 321 }
267} 322}
268 323
269#ifdef CONFIG_MEMORY_HOTPLUG 324#ifdef CONFIG_MEMORY_HOTPLUG
325#ifdef CONFIG_SPARSEMEM_VMEMMAP
326static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
327 unsigned long nr_pages)
328{
329 /* This will make the necessary allocations eventually. */
330 return sparse_mem_map_populate(pnum, nid);
331}
332static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
333{
334 return; /* XXX: Not implemented yet */
335}
336#else
270static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 337static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
271{ 338{
272 struct page *page, *ret; 339 struct page *page, *ret;
@@ -289,6 +356,12 @@ got_map_ptr:
289 return ret; 356 return ret;
290} 357}
291 358
359static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
360 unsigned long nr_pages)
361{
362 return __kmalloc_section_memmap(nr_pages);
363}
364
292static int vaddr_in_vmalloc_area(void *addr) 365static int vaddr_in_vmalloc_area(void *addr)
293{ 366{
294 if (addr >= (void *)VMALLOC_START && 367 if (addr >= (void *)VMALLOC_START &&
@@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
305 free_pages((unsigned long)memmap, 378 free_pages((unsigned long)memmap,
306 get_order(sizeof(struct page) * nr_pages)); 379 get_order(sizeof(struct page) * nr_pages));
307} 380}
381#endif /* CONFIG_SPARSEMEM_VMEMMAP */
308 382
309/* 383/*
310 * returns the number of sections whose mem_maps were properly 384 * returns the number of sections whose mem_maps were properly
@@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
318 struct pglist_data *pgdat = zone->zone_pgdat; 392 struct pglist_data *pgdat = zone->zone_pgdat;
319 struct mem_section *ms; 393 struct mem_section *ms;
320 struct page *memmap; 394 struct page *memmap;
395 unsigned long *usemap;
321 unsigned long flags; 396 unsigned long flags;
322 int ret; 397 int ret;
323 398
@@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
326 * plus, it does a kmalloc 401 * plus, it does a kmalloc
327 */ 402 */
328 sparse_index_init(section_nr, pgdat->node_id); 403 sparse_index_init(section_nr, pgdat->node_id);
329 memmap = __kmalloc_section_memmap(nr_pages); 404 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
405 usemap = __kmalloc_section_usemap();
330 406
331 pgdat_resize_lock(pgdat, &flags); 407 pgdat_resize_lock(pgdat, &flags);
332 408
@@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
335 ret = -EEXIST; 411 ret = -EEXIST;
336 goto out; 412 goto out;
337 } 413 }
414
415 if (!usemap) {
416 ret = -ENOMEM;
417 goto out;
418 }
338 ms->section_mem_map |= SECTION_MARKED_PRESENT; 419 ms->section_mem_map |= SECTION_MARKED_PRESENT;
339 420
340 ret = sparse_init_one_section(ms, section_nr, memmap); 421 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
341 422
342out: 423out:
343 pgdat_resize_unlock(pgdat, &flags); 424 pgdat_resize_unlock(pgdat, &flags);
diff --git a/mm/swap.c b/mm/swap.c
index d3cb966fe992..a65eff8a517a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,16 +24,19 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */ 26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/module.h>
28#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
29#include <linux/percpu.h> 28#include <linux/percpu.h>
30#include <linux/cpu.h> 29#include <linux/cpu.h>
31#include <linux/notifier.h> 30#include <linux/notifier.h>
32#include <linux/init.h> 31#include <linux/backing-dev.h>
33 32
34/* How many pages do we try to swap or page in/out together? */ 33/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 34int page_cluster;
36 35
36static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
37static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
38static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
39
37/* 40/*
38 * This path almost never happens for VM activity - pages are normally 41 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking. 42 * freed via pagevecs. But it gets used by networking.
@@ -94,23 +97,47 @@ void put_pages_list(struct list_head *pages)
94EXPORT_SYMBOL(put_pages_list); 97EXPORT_SYMBOL(put_pages_list);
95 98
96/* 99/*
100 * pagevec_move_tail() must be called with IRQ disabled.
101 * Otherwise this may cause nasty races.
102 */
103static void pagevec_move_tail(struct pagevec *pvec)
104{
105 int i;
106 int pgmoved = 0;
107 struct zone *zone = NULL;
108
109 for (i = 0; i < pagevec_count(pvec); i++) {
110 struct page *page = pvec->pages[i];
111 struct zone *pagezone = page_zone(page);
112
113 if (pagezone != zone) {
114 if (zone)
115 spin_unlock(&zone->lru_lock);
116 zone = pagezone;
117 spin_lock(&zone->lru_lock);
118 }
119 if (PageLRU(page) && !PageActive(page)) {
120 list_move_tail(&page->lru, &zone->inactive_list);
121 pgmoved++;
122 }
123 }
124 if (zone)
125 spin_unlock(&zone->lru_lock);
126 __count_vm_events(PGROTATED, pgmoved);
127 release_pages(pvec->pages, pvec->nr, pvec->cold);
128 pagevec_reinit(pvec);
129}
130
131/*
97 * Writeback is about to end against a page which has been marked for immediate 132 * Writeback is about to end against a page which has been marked for immediate
98 * reclaim. If it still appears to be reclaimable, move it to the tail of the 133 * reclaim. If it still appears to be reclaimable, move it to the tail of the
99 * inactive list. The page still has PageWriteback set, which will pin it. 134 * inactive list.
100 *
101 * We don't expect many pages to come through here, so don't bother batching
102 * things up.
103 *
104 * To avoid placing the page at the tail of the LRU while PG_writeback is still
105 * set, this function will clear PG_writeback before performing the page
106 * motion. Do that inside the lru lock because once PG_writeback is cleared
107 * we may not touch the page.
108 * 135 *
109 * Returns zero if it cleared PG_writeback. 136 * Returns zero if it cleared PG_writeback.
110 */ 137 */
111int rotate_reclaimable_page(struct page *page) 138int rotate_reclaimable_page(struct page *page)
112{ 139{
113 struct zone *zone; 140 struct pagevec *pvec;
114 unsigned long flags; 141 unsigned long flags;
115 142
116 if (PageLocked(page)) 143 if (PageLocked(page))
@@ -122,15 +149,16 @@ int rotate_reclaimable_page(struct page *page)
122 if (!PageLRU(page)) 149 if (!PageLRU(page))
123 return 1; 150 return 1;
124 151
125 zone = page_zone(page); 152 page_cache_get(page);
126 spin_lock_irqsave(&zone->lru_lock, flags); 153 local_irq_save(flags);
127 if (PageLRU(page) && !PageActive(page)) { 154 pvec = &__get_cpu_var(lru_rotate_pvecs);
128 list_move_tail(&page->lru, &zone->inactive_list); 155 if (!pagevec_add(pvec, page))
129 __count_vm_event(PGROTATED); 156 pagevec_move_tail(pvec);
130 } 157 local_irq_restore(flags);
158
131 if (!test_clear_page_writeback(page)) 159 if (!test_clear_page_writeback(page))
132 BUG(); 160 BUG();
133 spin_unlock_irqrestore(&zone->lru_lock, flags); 161
134 return 0; 162 return 0;
135} 163}
136 164
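
rotate_reclaimable_page() used to take zone->lru_lock for every single page; it now parks the page in a per-cpu pagevec, and pagevec_move_tail() takes the lock once per full vector (or once per drain), which is why the caller grabs an extra page reference first. A standalone model of that batch-and-flush pattern, with a toy counter standing in for lru_lock acquisitions and an illustrative vector size:

/* Standalone model of per-CPU pagevec batching. */
#include <stdio.h>

#define PAGEVEC_SIZE 14            /* illustrative, sized like a pagevec */

struct pagevec { int nr; int pages[PAGEVEC_SIZE]; };

static int lock_acquisitions;

static void pagevec_move_tail(struct pagevec *pvec)
{
	lock_acquisitions++;           /* one lock round trip per batch */
	/* ...move pvec->pages to the tail of the inactive list here... */
	pvec->nr = 0;
}

/* Returns the space left, i.e. 0 when the vector just became full. */
static int pagevec_add(struct pagevec *pvec, int page)
{
	pvec->pages[pvec->nr++] = page;
	return PAGEVEC_SIZE - pvec->nr;
}

int main(void)
{
	struct pagevec pvec = { 0 };

	for (int page = 0; page < 100; page++)
		if (!pagevec_add(&pvec, page))
			pagevec_move_tail(&pvec);

	if (pvec.nr)                   /* drain_cpu_pagevecs() equivalent */
		pagevec_move_tail(&pvec);

	printf("100 pages, %d lock acquisitions\n", lock_acquisitions);
	return 0;
}
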
@@ -174,9 +202,6 @@ EXPORT_SYMBOL(mark_page_accessed);
174 * lru_cache_add: add a page to the page lists 202 * lru_cache_add: add a page to the page lists
175 * @page: the page to add 203 * @page: the page to add
176 */ 204 */
177static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
178static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
179
180void fastcall lru_cache_add(struct page *page) 205void fastcall lru_cache_add(struct page *page)
181{ 206{
182 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 207 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
@@ -197,21 +222,37 @@ void fastcall lru_cache_add_active(struct page *page)
197 put_cpu_var(lru_add_active_pvecs); 222 put_cpu_var(lru_add_active_pvecs);
198} 223}
199 224
200static void __lru_add_drain(int cpu) 225/*
226 * Drain pages out of the cpu's pagevecs.
227 * Either "cpu" is the current CPU, and preemption has already been
228 * disabled; or "cpu" is being hot-unplugged, and is already dead.
229 */
230static void drain_cpu_pagevecs(int cpu)
201{ 231{
202 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); 232 struct pagevec *pvec;
203 233
204 /* CPU is dead, so no locking needed. */ 234 pvec = &per_cpu(lru_add_pvecs, cpu);
205 if (pagevec_count(pvec)) 235 if (pagevec_count(pvec))
206 __pagevec_lru_add(pvec); 236 __pagevec_lru_add(pvec);
237
207 pvec = &per_cpu(lru_add_active_pvecs, cpu); 238 pvec = &per_cpu(lru_add_active_pvecs, cpu);
208 if (pagevec_count(pvec)) 239 if (pagevec_count(pvec))
209 __pagevec_lru_add_active(pvec); 240 __pagevec_lru_add_active(pvec);
241
242 pvec = &per_cpu(lru_rotate_pvecs, cpu);
243 if (pagevec_count(pvec)) {
244 unsigned long flags;
245
246 /* No harm done if a racing interrupt already did this */
247 local_irq_save(flags);
248 pagevec_move_tail(pvec);
249 local_irq_restore(flags);
250 }
210} 251}
211 252
212void lru_add_drain(void) 253void lru_add_drain(void)
213{ 254{
214 __lru_add_drain(get_cpu()); 255 drain_cpu_pagevecs(get_cpu());
215 put_cpu(); 256 put_cpu();
216} 257}
217 258
@@ -258,6 +299,7 @@ void release_pages(struct page **pages, int nr, int cold)
258 int i; 299 int i;
259 struct pagevec pages_to_free; 300 struct pagevec pages_to_free;
260 struct zone *zone = NULL; 301 struct zone *zone = NULL;
302 unsigned long uninitialized_var(flags);
261 303
262 pagevec_init(&pages_to_free, cold); 304 pagevec_init(&pages_to_free, cold);
263 for (i = 0; i < nr; i++) { 305 for (i = 0; i < nr; i++) {
@@ -265,7 +307,7 @@ void release_pages(struct page **pages, int nr, int cold)
265 307
266 if (unlikely(PageCompound(page))) { 308 if (unlikely(PageCompound(page))) {
267 if (zone) { 309 if (zone) {
268 spin_unlock_irq(&zone->lru_lock); 310 spin_unlock_irqrestore(&zone->lru_lock, flags);
269 zone = NULL; 311 zone = NULL;
270 } 312 }
271 put_compound_page(page); 313 put_compound_page(page);
@@ -279,9 +321,10 @@ void release_pages(struct page **pages, int nr, int cold)
279 struct zone *pagezone = page_zone(page); 321 struct zone *pagezone = page_zone(page);
280 if (pagezone != zone) { 322 if (pagezone != zone) {
281 if (zone) 323 if (zone)
282 spin_unlock_irq(&zone->lru_lock); 324 spin_unlock_irqrestore(&zone->lru_lock,
325 flags);
283 zone = pagezone; 326 zone = pagezone;
284 spin_lock_irq(&zone->lru_lock); 327 spin_lock_irqsave(&zone->lru_lock, flags);
285 } 328 }
286 VM_BUG_ON(!PageLRU(page)); 329 VM_BUG_ON(!PageLRU(page));
287 __ClearPageLRU(page); 330 __ClearPageLRU(page);
@@ -290,7 +333,7 @@ void release_pages(struct page **pages, int nr, int cold)
290 333
291 if (!pagevec_add(&pages_to_free, page)) { 334 if (!pagevec_add(&pages_to_free, page)) {
292 if (zone) { 335 if (zone) {
293 spin_unlock_irq(&zone->lru_lock); 336 spin_unlock_irqrestore(&zone->lru_lock, flags);
294 zone = NULL; 337 zone = NULL;
295 } 338 }
296 __pagevec_free(&pages_to_free); 339 __pagevec_free(&pages_to_free);
@@ -298,7 +341,7 @@ void release_pages(struct page **pages, int nr, int cold)
298 } 341 }
299 } 342 }
300 if (zone) 343 if (zone)
301 spin_unlock_irq(&zone->lru_lock); 344 spin_unlock_irqrestore(&zone->lru_lock, flags);
302 345
303 pagevec_free(&pages_to_free); 346 pagevec_free(&pages_to_free);
304} 347}
@@ -491,7 +534,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
491 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 534 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
492 atomic_add(*committed, &vm_committed_space); 535 atomic_add(*committed, &vm_committed_space);
493 *committed = 0; 536 *committed = 0;
494 __lru_add_drain((long)hcpu); 537 drain_cpu_pagevecs((long)hcpu);
495 } 538 }
496 return NOTIFY_OK; 539 return NOTIFY_OK;
497} 540}
@@ -505,6 +548,10 @@ void __init swap_setup(void)
505{ 548{
506 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 549 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
507 550
551#ifdef CONFIG_SWAP
552 bdi_init(swapper_space.backing_dev_info);
553#endif
554
508 /* Use a smaller cluster for small-memory machines */ 555 /* Use a smaller cluster for small-memory machines */
509 if (megs < 16) 556 if (megs < 16)
510 page_cluster = 2; 557 page_cluster = 2;
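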
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 67daecb6031a..b52635601dfe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
74{ 74{
75 int error; 75 int error;
76 76
77 BUG_ON(!PageLocked(page));
77 BUG_ON(PageSwapCache(page)); 78 BUG_ON(PageSwapCache(page));
78 BUG_ON(PagePrivate(page)); 79 BUG_ON(PagePrivate(page));
79 error = radix_tree_preload(gfp_mask); 80 error = radix_tree_preload(gfp_mask);
@@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
83 entry.val, page); 84 entry.val, page);
84 if (!error) { 85 if (!error) {
85 page_cache_get(page); 86 page_cache_get(page);
86 SetPageLocked(page);
87 SetPageSwapCache(page); 87 SetPageSwapCache(page);
88 set_page_private(page, entry.val); 88 set_page_private(page, entry.val);
89 total_swapcache_pages++; 89 total_swapcache_pages++;
@@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry)
99{ 99{
100 int error; 100 int error;
101 101
102 BUG_ON(PageLocked(page));
102 if (!swap_duplicate(entry)) { 103 if (!swap_duplicate(entry)) {
103 INC_CACHE_INFO(noent_race); 104 INC_CACHE_INFO(noent_race);
104 return -ENOENT; 105 return -ENOENT;
105 } 106 }
107 SetPageLocked(page);
106 error = __add_to_swap_cache(page, entry, GFP_KERNEL); 108 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
107 /* 109 /*
108 * Anon pages are already on the LRU, we don't run lru_cache_add here. 110 * Anon pages are already on the LRU, we don't run lru_cache_add here.
109 */ 111 */
110 if (error) { 112 if (error) {
113 ClearPageLocked(page);
111 swap_free(entry); 114 swap_free(entry);
112 if (error == -EEXIST) 115 if (error == -EEXIST)
113 INC_CACHE_INFO(exist_race); 116 INC_CACHE_INFO(exist_race);
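The swap-cache hunks move the page-lock handling up one level: __add_to_swap_cache() now insists on an already-locked page, while add_to_swap_cache() locks the (previously unlocked) page itself and unlocks it again if the insertion fails. A hedged summary of the resulting convention, with a made-up caller for illustration:

	/*
	 * __add_to_swap_cache(page, entry, gfp)  caller must already hold PG_locked
	 * add_to_swap_cache(page, entry)         page must be unlocked; locked on
	 *                                        success, unlocked again on failure
	 */
	static int example_add_fresh_page(swp_entry_t entry)
	{
		struct page *page = alloc_page(GFP_HIGHUSER);
		int err;

		if (!page)
			return -ENOMEM;
		err = add_to_swap_cache(page, entry);
		if (err) {			/* -ENOENT, -EEXIST or -ENOMEM */
			page_cache_release(page);
			return err;
		}
		/* page is now locked and sits in the swap cache */
		return 0;
	}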
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 8803471593fd..d436a9c82db7 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
66 if (!dentry) 66 if (!dentry)
67 goto put_memory; 67 goto put_memory;
68 68
69 error = -ENFILE;
70 file = get_empty_filp();
71 if (!file)
72 goto put_dentry;
73
74 error = -ENOSPC; 69 error = -ENOSPC;
75 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 70 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
76 if (!inode) 71 if (!inode)
77 goto close_file; 72 goto put_dentry;
78 73
79 d_instantiate(dentry, inode); 74 d_instantiate(dentry, inode);
80 inode->i_nlink = 0; /* It is unlinked */ 75 error = -ENFILE;
76 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
77 &ramfs_file_operations);
78 if (!file)
79 goto put_dentry;
81 80
82 file->f_path.mnt = mntget(shm_mnt); 81 inode->i_nlink = 0; /* It is unlinked */
83 file->f_path.dentry = dentry;
84 file->f_mapping = inode->i_mapping;
85 file->f_op = &ramfs_file_operations;
86 file->f_mode = FMODE_WRITE | FMODE_READ;
87 82
88 /* notify everyone as to the change of file size */ 83 /* notify everyone as to the change of file size */
89 error = do_truncate(dentry, size, 0, file); 84 error = do_truncate(dentry, size, 0, file);
diff --git a/mm/truncate.c b/mm/truncate.c
index 5cdfbc1a59fd..cadc15653dde 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/backing-dev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13#include <linux/module.h> 14#include <linux/module.h>
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
72 struct address_space *mapping = page->mapping; 73 struct address_space *mapping = page->mapping;
73 if (mapping && mapping_cap_account_dirty(mapping)) { 74 if (mapping && mapping_cap_account_dirty(mapping)) {
74 dec_zone_page_state(page, NR_FILE_DIRTY); 75 dec_zone_page_state(page, NR_FILE_DIRTY);
76 dec_bdi_stat(mapping->backing_dev_info,
77 BDI_RECLAIMABLE);
75 if (account_size) 78 if (account_size)
76 task_io_account_cancelled_write(account_size); 79 task_io_account_cancelled_write(account_size);
77 } 80 }
diff --git a/mm/util.c b/mm/util.c
index bf340d806868..5f64026cbb4d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup);
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 81void *krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 82{
83 void *ret; 83 void *ret;
84 size_t ks; 84 size_t ks = 0;
85 85
86 if (unlikely(!new_size)) { 86 if (unlikely(!new_size)) {
87 kfree(p); 87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 } 89 }
90 90
91 ks = ksize(p); 91 if (p)
92 ks = ksize(p);
93
92 if (ks >= new_size) 94 if (ks >= new_size)
93 return (void *)p; 95 return (void *)p;
94 96
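krealloc() used to call ksize() unconditionally, which is not safe for a NULL pointer; with ks initialised to 0 and ksize() only consulted for a non-NULL p, a NULL argument simply falls through to a fresh allocation. A short usage sketch of the resulting behaviour (illustrative; the remainder of krealloc(), not shown in this hunk, is assumed to allocate new_size bytes and copy the old contents):

	void *buf;

	buf = krealloc(NULL, 64, GFP_KERNEL);	/* ks == 0: behaves like kmalloc(64) */
	buf = krealloc(buf, 128, GFP_KERNEL);	/* grows, old contents preserved     */
	buf = krealloc(buf, 0, GFP_KERNEL);	/* frees, returns ZERO_SIZE_PTR      */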
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3cee76a8c9f0..2e01af365848 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
190 if (unlikely(!size)) 190 if (unlikely(!size))
191 return NULL; 191 return NULL;
192 192
193 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); 193 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
194
194 if (unlikely(!area)) 195 if (unlikely(!area))
195 return NULL; 196 return NULL;
196 197
@@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
439 area->flags |= VM_VPAGES; 440 area->flags |= VM_VPAGES;
440 } else { 441 } else {
441 pages = kmalloc_node(array_size, 442 pages = kmalloc_node(array_size,
442 (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, 443 (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
443 node); 444 node);
444 } 445 }
445 area->pages = pages; 446 area->pages = pages;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a6e65d024995..e1471385d001 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
932 long mapped_ratio; 932 long mapped_ratio;
933 long distress; 933 long distress;
934 long swap_tendency; 934 long swap_tendency;
935 long imbalance;
935 936
936 if (zone_is_near_oom(zone)) 937 if (zone_is_near_oom(zone))
937 goto force_reclaim_mapped; 938 goto force_reclaim_mapped;
@@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
967 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 968 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
968 969
969 /* 970 /*
 971 * If there's a huge imbalance between active and inactive
 972 * (think active 100 times larger than inactive) we should
 973 * become more permissive, or the system will take too much
 974 * CPU before it starts swapping under memory pressure.
 975 * Distress is about avoiding early OOM; this is about
 976 * keeping swappiness graceful despite it being set to low
 977 * values.
 978 *
 979 * Avoid division by zero with nr_inactive+1; the maximum
 980 * resulting value is vm_total_pages.
981 */
982 imbalance = zone_page_state(zone, NR_ACTIVE);
983 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
984
985 /*
986 * Reduce the effect of imbalance if swappiness is low,
 987 * this means that for a very low swappiness the imbalance
 988 * must be much higher than 100 for this logic to make
 989 * a difference.
990 *
991 * Max temporary value is vm_total_pages*100.
992 */
993 imbalance *= (vm_swappiness + 1);
994 imbalance /= 100;
995
996 /*
 997 * If not much of the RAM is mapped, make the imbalance
 998 * less relevant; refilling the inactive list with mapped
 999 * pages is only a high priority in the presence of a high
 1000 * ratio of mapped pages.
1001 *
1002 * Max temporary value is vm_total_pages*100.
1003 */
1004 imbalance *= mapped_ratio;
1005 imbalance /= 100;
1006
1007 /* apply imbalance feedback to swap_tendency */
1008 swap_tendency += imbalance;
1009
1010 /*
970 * Now use this metric to decide whether to start moving mapped 1011 * Now use this metric to decide whether to start moving mapped
971 * memory onto the inactive list. 1012 * memory onto the inactive list.
972 */ 1013 */
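To make the effect of the new term concrete, here is the arithmetic for one made-up set of numbers, using the integer divisions exactly as written above:

	/*
	 * Worked example (illustrative figures only):
	 *   NR_ACTIVE = 1,000,000   NR_INACTIVE = 9,999
	 *   vm_swappiness = 60      mapped_ratio = 50
	 *
	 *   imbalance = 1000000 / (9999 + 1)  = 100
	 *   imbalance = 100 * (60 + 1) / 100  = 61
	 *   imbalance = 61 * 50 / 100         = 30
	 *   swap_tendency += 30
	 *
	 * so a 100:1 active/inactive imbalance on a half-mapped machine adds
	 * 30 points towards the swap_tendency cutoff that is applied just
	 * below this hunk to decide whether mapped pages get reclaimed.
	 */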
@@ -1067,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1067 unsigned long nr_to_scan; 1108 unsigned long nr_to_scan;
1068 unsigned long nr_reclaimed = 0; 1109 unsigned long nr_reclaimed = 0;
1069 1110
1070 atomic_inc(&zone->reclaim_in_progress);
1071
1072 /* 1111 /*
1073 * Add one to `nr_to_scan' just to make sure that the kernel will 1112 * Add one to `nr_to_scan' just to make sure that the kernel will
1074 * slowly sift through the active list. 1113 * slowly sift through the active list.
@@ -1107,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1107 } 1146 }
1108 1147
1109 throttle_vm_writeout(sc->gfp_mask); 1148 throttle_vm_writeout(sc->gfp_mask);
1110
1111 atomic_dec(&zone->reclaim_in_progress);
1112 return nr_reclaimed; 1149 return nr_reclaimed;
1113} 1150}
1114 1151
@@ -1146,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1146 1183
1147 note_zone_scanning_priority(zone, priority); 1184 note_zone_scanning_priority(zone, priority);
1148 1185
1149 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1186 if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
1150 continue; /* Let kswapd poll it */ 1187 continue; /* Let kswapd poll it */
1151 1188
1152 sc->all_unreclaimable = 0; 1189 sc->all_unreclaimable = 0;
@@ -1327,7 +1364,8 @@ loop_again:
1327 if (!populated_zone(zone)) 1364 if (!populated_zone(zone))
1328 continue; 1365 continue;
1329 1366
1330 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1367 if (zone_is_all_unreclaimable(zone) &&
1368 priority != DEF_PRIORITY)
1331 continue; 1369 continue;
1332 1370
1333 if (!zone_watermark_ok(zone, order, zone->pages_high, 1371 if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1362,7 +1400,8 @@ loop_again:
1362 if (!populated_zone(zone)) 1400 if (!populated_zone(zone))
1363 continue; 1401 continue;
1364 1402
1365 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1403 if (zone_is_all_unreclaimable(zone) &&
1404 priority != DEF_PRIORITY)
1366 continue; 1405 continue;
1367 1406
1368 if (!zone_watermark_ok(zone, order, zone->pages_high, 1407 if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1371,18 +1410,25 @@ loop_again:
1371 temp_priority[i] = priority; 1410 temp_priority[i] = priority;
1372 sc.nr_scanned = 0; 1411 sc.nr_scanned = 0;
1373 note_zone_scanning_priority(zone, priority); 1412 note_zone_scanning_priority(zone, priority);
1374 nr_reclaimed += shrink_zone(priority, zone, &sc); 1413 /*
1414 * We put equal pressure on every zone, unless one
1415 * zone has way too many pages free already.
1416 */
1417 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1418 end_zone, 0))
1419 nr_reclaimed += shrink_zone(priority, zone, &sc);
1375 reclaim_state->reclaimed_slab = 0; 1420 reclaim_state->reclaimed_slab = 0;
1376 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1421 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1377 lru_pages); 1422 lru_pages);
1378 nr_reclaimed += reclaim_state->reclaimed_slab; 1423 nr_reclaimed += reclaim_state->reclaimed_slab;
1379 total_scanned += sc.nr_scanned; 1424 total_scanned += sc.nr_scanned;
1380 if (zone->all_unreclaimable) 1425 if (zone_is_all_unreclaimable(zone))
1381 continue; 1426 continue;
1382 if (nr_slab == 0 && zone->pages_scanned >= 1427 if (nr_slab == 0 && zone->pages_scanned >=
1383 (zone_page_state(zone, NR_ACTIVE) 1428 (zone_page_state(zone, NR_ACTIVE)
1384 + zone_page_state(zone, NR_INACTIVE)) * 6) 1429 + zone_page_state(zone, NR_INACTIVE)) * 6)
1385 zone->all_unreclaimable = 1; 1430 zone_set_flag(zone,
1431 ZONE_ALL_UNRECLAIMABLE);
1386 /* 1432 /*
1387 * If we've done a decent amount of scanning and 1433 * If we've done a decent amount of scanning and
1388 * the reclaim ratio is low, start doing writepage 1434 * the reclaim ratio is low, start doing writepage
@@ -1548,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1548 if (!populated_zone(zone)) 1594 if (!populated_zone(zone))
1549 continue; 1595 continue;
1550 1596
1551 if (zone->all_unreclaimable && prio != DEF_PRIORITY) 1597 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1552 continue; 1598 continue;
1553 1599
1554 /* For pass = 0 we don't shrink the active list */ 1600 /* For pass = 0 we don't shrink the active list */
@@ -1688,9 +1734,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1688{ 1734{
1689 pg_data_t *pgdat; 1735 pg_data_t *pgdat;
1690 cpumask_t mask; 1736 cpumask_t mask;
1737 int nid;
1691 1738
1692 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 1739 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1693 for_each_online_pgdat(pgdat) { 1740 for_each_node_state(nid, N_HIGH_MEMORY) {
1741 pgdat = NODE_DATA(nid);
1694 mask = node_to_cpumask(pgdat->node_id); 1742 mask = node_to_cpumask(pgdat->node_id);
1695 if (any_online_cpu(mask) != NR_CPUS) 1743 if (any_online_cpu(mask) != NR_CPUS)
1696 /* One of our CPUs online: restore mask */ 1744 /* One of our CPUs online: restore mask */
@@ -1727,7 +1775,7 @@ static int __init kswapd_init(void)
1727 int nid; 1775 int nid;
1728 1776
1729 swap_setup(); 1777 swap_setup();
1730 for_each_online_node(nid) 1778 for_each_node_state(nid, N_HIGH_MEMORY)
1731 kswapd_run(nid); 1779 kswapd_run(nid);
1732 hotcpu_notifier(cpu_callback, 0); 1780 hotcpu_notifier(cpu_callback, 0);
1733 return 0; 1781 return 0;
@@ -1847,8 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1847 1895
1848int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1896int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1849{ 1897{
1850 cpumask_t mask;
1851 int node_id; 1898 int node_id;
1899 int ret;
1852 1900
1853 /* 1901 /*
1854 * Zone reclaim reclaims unmapped file backed pages and 1902 * Zone reclaim reclaims unmapped file backed pages and
@@ -1866,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1866 <= zone->min_slab_pages) 1914 <= zone->min_slab_pages)
1867 return 0; 1915 return 0;
1868 1916
1917 if (zone_is_all_unreclaimable(zone))
1918 return 0;
1919
1869 /* 1920 /*
1870 * Avoid concurrent zone reclaims, do not reclaim in a zone that does 1921 * Do not scan if the allocation should not be delayed.
1871 * not have reclaimable pages and if we should not delay the allocation
1872 * then do not scan.
1873 */ 1922 */
1874 if (!(gfp_mask & __GFP_WAIT) || 1923 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
1875 zone->all_unreclaimable ||
1876 atomic_read(&zone->reclaim_in_progress) > 0 ||
1877 (current->flags & PF_MEMALLOC))
1878 return 0; 1924 return 0;
1879 1925
1880 /* 1926 /*
@@ -1884,9 +1930,14 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1884 * as wide as possible. 1930 * as wide as possible.
1885 */ 1931 */
1886 node_id = zone_to_nid(zone); 1932 node_id = zone_to_nid(zone);
1887 mask = node_to_cpumask(node_id); 1933 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
1888 if (!cpus_empty(mask) && node_id != numa_node_id()) 1934 return 0;
1935
1936 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
1889 return 0; 1937 return 0;
1890 return __zone_reclaim(zone, gfp_mask, order); 1938 ret = __zone_reclaim(zone, gfp_mask, order);
1939 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
1940
1941 return ret;
1891} 1942}
1892#endif 1943#endif
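zone_reclaim() now serialises concurrent reclaimers through a zone flag rather than the removed reclaim_in_progress atomic, and the all_unreclaimable state has likewise become a flag. The helpers (zone_set_flag(), zone_test_and_set_flag(), zone_clear_flag(), zone_is_all_unreclaimable()) are defined outside mm/ and so do not appear in this diffstat; a plausible sketch, assuming they are thin wrappers around the generic bitops on a new zone->flags word:

	typedef enum {
		ZONE_ALL_UNRECLAIMABLE,		/* was zone->all_unreclaimable   */
		ZONE_RECLAIM_LOCKED,		/* was zone->reclaim_in_progress */
	} zone_flags_t;

	static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
	{
		set_bit(flag, &zone->flags);
	}

	static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag)
	{
		return test_and_set_bit(flag, &zone->flags);
	}

	static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
	{
		clear_bit(flag, &zone->flags);
	}

	static inline int zone_is_all_unreclaimable(struct zone *zone)
	{
		return test_bit(ZONE_ALL_UNRECLAIMABLE, &zone->flags);
	}

The test-and-set form makes the exclusion explicit in zone_reclaim() itself, which is why zone_clear_flag(ZONE_RECLAIM_LOCKED) appears only after __zone_reclaim() has returned.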
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c64d169537bf..4651bf153f35 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu)
353 } 353 }
354} 354}
355 355
356static void __refresh_cpu_vm_stats(void *dummy)
357{
358 refresh_cpu_vm_stats(smp_processor_id());
359}
360
361/*
362 * Consolidate all counters.
363 *
364 * Note that the result is less inaccurate but still inaccurate
365 * if concurrent processes are allowed to run.
366 */
367void refresh_vm_stats(void)
368{
369 on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
370}
371EXPORT_SYMBOL(refresh_vm_stats);
372
373#endif 356#endif
374 357
375#ifdef CONFIG_NUMA 358#ifdef CONFIG_NUMA
@@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
398 381
399#include <linux/seq_file.h> 382#include <linux/seq_file.h>
400 383
384static char * const migratetype_names[MIGRATE_TYPES] = {
385 "Unmovable",
386 "Reclaimable",
387 "Movable",
388 "Reserve",
389};
390
401static void *frag_start(struct seq_file *m, loff_t *pos) 391static void *frag_start(struct seq_file *m, loff_t *pos)
402{ 392{
403 pg_data_t *pgdat; 393 pg_data_t *pgdat;
@@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg)
422{ 412{
423} 413}
424 414
425/* 415/* Walk all the zones in a node and print using a callback */
426 * This walks the free areas for each zone. 416static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
427 */ 417 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
428static int frag_show(struct seq_file *m, void *arg)
429{ 418{
430 pg_data_t *pgdat = (pg_data_t *)arg;
431 struct zone *zone; 419 struct zone *zone;
432 struct zone *node_zones = pgdat->node_zones; 420 struct zone *node_zones = pgdat->node_zones;
433 unsigned long flags; 421 unsigned long flags;
434 int order;
435 422
436 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 423 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
437 if (!populated_zone(zone)) 424 if (!populated_zone(zone))
438 continue; 425 continue;
439 426
440 spin_lock_irqsave(&zone->lock, flags); 427 spin_lock_irqsave(&zone->lock, flags);
441 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 428 print(m, pgdat, zone);
442 for (order = 0; order < MAX_ORDER; ++order)
443 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
444 spin_unlock_irqrestore(&zone->lock, flags); 429 spin_unlock_irqrestore(&zone->lock, flags);
430 }
431}
432
433static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
434 struct zone *zone)
435{
436 int order;
437
438 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
439 for (order = 0; order < MAX_ORDER; ++order)
440 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
441 seq_putc(m, '\n');
442}
443
444/*
445 * This walks the free areas for each zone.
446 */
447static int frag_show(struct seq_file *m, void *arg)
448{
449 pg_data_t *pgdat = (pg_data_t *)arg;
450 walk_zones_in_node(m, pgdat, frag_show_print);
451 return 0;
452}
453
454static void pagetypeinfo_showfree_print(struct seq_file *m,
455 pg_data_t *pgdat, struct zone *zone)
456{
457 int order, mtype;
458
459 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
460 seq_printf(m, "Node %4d, zone %8s, type %12s ",
461 pgdat->node_id,
462 zone->name,
463 migratetype_names[mtype]);
464 for (order = 0; order < MAX_ORDER; ++order) {
465 unsigned long freecount = 0;
466 struct free_area *area;
467 struct list_head *curr;
468
469 area = &(zone->free_area[order]);
470
471 list_for_each(curr, &area->free_list[mtype])
472 freecount++;
473 seq_printf(m, "%6lu ", freecount);
474 }
445 seq_putc(m, '\n'); 475 seq_putc(m, '\n');
446 } 476 }
477}
478
 479/* Print out the free pages at each order for each migratetype */
480static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
481{
482 int order;
483 pg_data_t *pgdat = (pg_data_t *)arg;
484
485 /* Print header */
486 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
487 for (order = 0; order < MAX_ORDER; ++order)
488 seq_printf(m, "%6d ", order);
489 seq_putc(m, '\n');
490
491 walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
492
493 return 0;
494}
495
496static void pagetypeinfo_showblockcount_print(struct seq_file *m,
497 pg_data_t *pgdat, struct zone *zone)
498{
499 int mtype;
500 unsigned long pfn;
501 unsigned long start_pfn = zone->zone_start_pfn;
502 unsigned long end_pfn = start_pfn + zone->spanned_pages;
503 unsigned long count[MIGRATE_TYPES] = { 0, };
504
505 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
506 struct page *page;
507
508 if (!pfn_valid(pfn))
509 continue;
510
511 page = pfn_to_page(pfn);
512 mtype = get_pageblock_migratetype(page);
513
514 count[mtype]++;
515 }
516
517 /* Print counts */
518 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
519 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
520 seq_printf(m, "%12lu ", count[mtype]);
521 seq_putc(m, '\n');
522}
523
 524/* Print out the number of pageblocks for each migratetype */
525static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
526{
527 int mtype;
528 pg_data_t *pgdat = (pg_data_t *)arg;
529
530 seq_printf(m, "\n%-23s", "Number of blocks type ");
531 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
532 seq_printf(m, "%12s ", migratetype_names[mtype]);
533 seq_putc(m, '\n');
534 walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
535
536 return 0;
537}
538
539/*
540 * This prints out statistics in relation to grouping pages by mobility.
541 * It is expensive to collect so do not constantly read the file.
542 */
543static int pagetypeinfo_show(struct seq_file *m, void *arg)
544{
545 pg_data_t *pgdat = (pg_data_t *)arg;
546
547 seq_printf(m, "Page block order: %d\n", pageblock_order);
548 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
549 seq_putc(m, '\n');
550 pagetypeinfo_showfree(m, pgdat);
551 pagetypeinfo_showblockcount(m, pgdat);
552
447 return 0; 553 return 0;
448} 554}
449 555
@@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = {
454 .show = frag_show, 560 .show = frag_show,
455}; 561};
456 562
563const struct seq_operations pagetypeinfo_op = {
564 .start = frag_start,
565 .next = frag_next,
566 .stop = frag_stop,
567 .show = pagetypeinfo_show,
568};
569
457#ifdef CONFIG_ZONE_DMA 570#ifdef CONFIG_ZONE_DMA
458#define TEXT_FOR_DMA(xx) xx "_dma", 571#define TEXT_FOR_DMA(xx) xx "_dma",
459#else 572#else
@@ -532,84 +645,78 @@ static const char * const vmstat_text[] = {
532#endif 645#endif
533}; 646};
534 647
535/* 648static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
536 * Output information about zones in @pgdat. 649 struct zone *zone)
537 */
538static int zoneinfo_show(struct seq_file *m, void *arg)
539{ 650{
540 pg_data_t *pgdat = arg; 651 int i;
541 struct zone *zone; 652 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
542 struct zone *node_zones = pgdat->node_zones; 653 seq_printf(m,
543 unsigned long flags; 654 "\n pages free %lu"
544 655 "\n min %lu"
545 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 656 "\n low %lu"
546 int i; 657 "\n high %lu"
547 658 "\n scanned %lu (a: %lu i: %lu)"
548 if (!populated_zone(zone)) 659 "\n spanned %lu"
549 continue; 660 "\n present %lu",
550 661 zone_page_state(zone, NR_FREE_PAGES),
551 spin_lock_irqsave(&zone->lock, flags); 662 zone->pages_min,
552 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 663 zone->pages_low,
553 seq_printf(m, 664 zone->pages_high,
554 "\n pages free %lu" 665 zone->pages_scanned,
555 "\n min %lu" 666 zone->nr_scan_active, zone->nr_scan_inactive,
556 "\n low %lu" 667 zone->spanned_pages,
557 "\n high %lu" 668 zone->present_pages);
558 "\n scanned %lu (a: %lu i: %lu)"
559 "\n spanned %lu"
560 "\n present %lu",
561 zone_page_state(zone, NR_FREE_PAGES),
562 zone->pages_min,
563 zone->pages_low,
564 zone->pages_high,
565 zone->pages_scanned,
566 zone->nr_scan_active, zone->nr_scan_inactive,
567 zone->spanned_pages,
568 zone->present_pages);
569 669
570 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 670 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
571 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 671 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
572 zone_page_state(zone, i)); 672 zone_page_state(zone, i));
573 673
574 seq_printf(m, 674 seq_printf(m,
575 "\n protection: (%lu", 675 "\n protection: (%lu",
576 zone->lowmem_reserve[0]); 676 zone->lowmem_reserve[0]);
577 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 677 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
578 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 678 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
579 seq_printf(m, 679 seq_printf(m,
580 ")" 680 ")"
581 "\n pagesets"); 681 "\n pagesets");
582 for_each_online_cpu(i) { 682 for_each_online_cpu(i) {
583 struct per_cpu_pageset *pageset; 683 struct per_cpu_pageset *pageset;
584 int j; 684 int j;
585 685
586 pageset = zone_pcp(zone, i); 686 pageset = zone_pcp(zone, i);
587 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 687 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
588 seq_printf(m, 688 seq_printf(m,
589 "\n cpu: %i pcp: %i" 689 "\n cpu: %i pcp: %i"
590 "\n count: %i" 690 "\n count: %i"
591 "\n high: %i" 691 "\n high: %i"
592 "\n batch: %i", 692 "\n batch: %i",
593 i, j, 693 i, j,
594 pageset->pcp[j].count, 694 pageset->pcp[j].count,
595 pageset->pcp[j].high, 695 pageset->pcp[j].high,
596 pageset->pcp[j].batch); 696 pageset->pcp[j].batch);
597 } 697 }
598#ifdef CONFIG_SMP 698#ifdef CONFIG_SMP
599 seq_printf(m, "\n vm stats threshold: %d", 699 seq_printf(m, "\n vm stats threshold: %d",
600 pageset->stat_threshold); 700 pageset->stat_threshold);
601#endif 701#endif
602 }
603 seq_printf(m,
604 "\n all_unreclaimable: %u"
605 "\n prev_priority: %i"
606 "\n start_pfn: %lu",
607 zone->all_unreclaimable,
608 zone->prev_priority,
609 zone->zone_start_pfn);
610 spin_unlock_irqrestore(&zone->lock, flags);
611 seq_putc(m, '\n');
612 } 702 }
703 seq_printf(m,
704 "\n all_unreclaimable: %u"
705 "\n prev_priority: %i"
706 "\n start_pfn: %lu",
707 zone_is_all_unreclaimable(zone),
708 zone->prev_priority,
709 zone->zone_start_pfn);
710 seq_putc(m, '\n');
711}
712
713/*
714 * Output information about zones in @pgdat.
715 */
716static int zoneinfo_show(struct seq_file *m, void *arg)
717{
718 pg_data_t *pgdat = (pg_data_t *)arg;
719 walk_zones_in_node(m, pgdat, zoneinfo_show_print);
613 return 0; 720 return 0;
614} 721}
615 722
@@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
741static struct notifier_block __cpuinitdata vmstat_notifier = 848static struct notifier_block __cpuinitdata vmstat_notifier =
742 { &vmstat_cpuup_callback, NULL, 0 }; 849 { &vmstat_cpuup_callback, NULL, 0 };
743 850
744int __init setup_vmstat(void) 851static int __init setup_vmstat(void)
745{ 852{
746 int cpu; 853 int cpu;
747 854