Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             18
-rw-r--r--  mm/Makefile             3
-rw-r--r--  mm/filemap.c          766
-rw-r--r--  mm/filemap.h          103
-rw-r--r--  mm/filemap_xip.c       17
-rw-r--r--  mm/hugetlb.c          398
-rw-r--r--  mm/internal.h          10
-rw-r--r--  mm/memory.c           161
-rw-r--r--  mm/memory_hotplug.c   312
-rw-r--r--  mm/mempolicy.c         60
-rw-r--r--  mm/migrate.c            4
-rw-r--r--  mm/mprotect.c           1
-rw-r--r--  mm/oom_kill.c           9
-rw-r--r--  mm/page-writeback.c    10
-rw-r--r--  mm/page_alloc.c       731
-rw-r--r--  mm/page_isolation.c   138
-rw-r--r--  mm/readahead.c         88
-rw-r--r--  mm/rmap.c               1
-rw-r--r--  mm/shmem.c             62
-rw-r--r--  mm/slab.c              21
-rw-r--r--  mm/slob.c               7
-rw-r--r--  mm/slub.c             490
-rw-r--r--  mm/sparse-vmemmap.c   148
-rw-r--r--  mm/sparse.c           105
-rw-r--r--  mm/swap.c             106
-rw-r--r--  mm/swap_state.c         5
-rw-r--r--  mm/util.c               6
-rw-r--r--  mm/vmalloc.c            5
-rw-r--r--  mm/vmscan.c            59
-rw-r--r--  mm/vmstat.c           305
30 files changed, 3071 insertions, 1078 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..1cc6cada2bbf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME
112 def_bool y 112 def_bool y
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114 114
115#
116# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
117# and page_to_pfn. The most efficient option where kernel virtual space is
118# not under pressure.
119#
120config SPARSEMEM_VMEMMAP_ENABLE
121 def_bool n
122
123config SPARSEMEM_VMEMMAP
124 bool
125 depends on SPARSEMEM
126 default y if (SPARSEMEM_VMEMMAP_ENABLE)
127
115# eventually, we can have this option just 'select SPARSEMEM' 128# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG 129config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add" 130 bool "Allow for memory hot-add"
@@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE
126 def_bool y 139 def_bool y
127 depends on SPARSEMEM && MEMORY_HOTPLUG 140 depends on SPARSEMEM && MEMORY_HOTPLUG
128 141
142config MEMORY_HOTREMOVE
143 bool "Allow for memory hot remove"
144 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
145 depends on MIGRATION
146
129# Heavily threaded applications may benefit from splitting the mm-wide 147# Heavily threaded applications may benefit from splitting the mm-wide
130# page_table_lock, so that faults on different parts of the user address 148# page_table_lock, so that faults on different parts of the user address
131# space can be handled with less contention: split it at this NR_CPUS. 149# space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 $(mmu-y) 14 page_isolation.o $(mmu-y)
15 15
16obj-$(CONFIG_BOUNCE) += bounce.o 16obj-$(CONFIG_BOUNCE) += bounce.o
17obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 17obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
18obj-$(CONFIG_HUGETLBFS) += hugetlb.o 18obj-$(CONFIG_HUGETLBFS) += hugetlb.o
19obj-$(CONFIG_NUMA) += mempolicy.o 19obj-$(CONFIG_NUMA) += mempolicy.o
20obj-$(CONFIG_SPARSEMEM) += sparse.o 20obj-$(CONFIG_SPARSEMEM) += sparse.o
21obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
21obj-$(CONFIG_SHMEM) += shmem.o 22obj-$(CONFIG_SHMEM) += shmem.o
22obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 23obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
23obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 24obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..c6049e947cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h> 32#include <linux/cpuset.h>
33#include "filemap.h" 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34#include "internal.h" 34#include "internal.h"
35 35
36/* 36/*
@@ -593,7 +593,7 @@ void fastcall __lock_page_nosync(struct page *page)
593 * Is there a pagecache struct page at the given (mapping, offset) tuple? 593 * Is there a pagecache struct page at the given (mapping, offset) tuple?
594 * If yes, increment its refcount and return it; if no, return NULL. 594 * If yes, increment its refcount and return it; if no, return NULL.
595 */ 595 */
596struct page * find_get_page(struct address_space *mapping, unsigned long offset) 596struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
597{ 597{
598 struct page *page; 598 struct page *page;
599 599
@@ -617,30 +617,31 @@ EXPORT_SYMBOL(find_get_page);
617 * Returns zero if the page was not present. find_lock_page() may sleep. 617 * Returns zero if the page was not present. find_lock_page() may sleep.
618 */ 618 */
619struct page *find_lock_page(struct address_space *mapping, 619struct page *find_lock_page(struct address_space *mapping,
620 unsigned long offset) 620 pgoff_t offset)
621{ 621{
622 struct page *page; 622 struct page *page;
623 623
624 read_lock_irq(&mapping->tree_lock);
625repeat: 624repeat:
625 read_lock_irq(&mapping->tree_lock);
626 page = radix_tree_lookup(&mapping->page_tree, offset); 626 page = radix_tree_lookup(&mapping->page_tree, offset);
627 if (page) { 627 if (page) {
628 page_cache_get(page); 628 page_cache_get(page);
629 if (TestSetPageLocked(page)) { 629 if (TestSetPageLocked(page)) {
630 read_unlock_irq(&mapping->tree_lock); 630 read_unlock_irq(&mapping->tree_lock);
631 __lock_page(page); 631 __lock_page(page);
632 read_lock_irq(&mapping->tree_lock);
633 632
634 /* Has the page been truncated while we slept? */ 633 /* Has the page been truncated while we slept? */
635 if (unlikely(page->mapping != mapping || 634 if (unlikely(page->mapping != mapping)) {
636 page->index != offset)) {
637 unlock_page(page); 635 unlock_page(page);
638 page_cache_release(page); 636 page_cache_release(page);
639 goto repeat; 637 goto repeat;
640 } 638 }
639 VM_BUG_ON(page->index != offset);
640 goto out;
641 } 641 }
642 } 642 }
643 read_unlock_irq(&mapping->tree_lock); 643 read_unlock_irq(&mapping->tree_lock);
644out:
644 return page; 645 return page;
645} 646}
646EXPORT_SYMBOL(find_lock_page); 647EXPORT_SYMBOL(find_lock_page);
@@ -663,29 +664,24 @@ EXPORT_SYMBOL(find_lock_page);
663 * memory exhaustion. 664 * memory exhaustion.
664 */ 665 */
665struct page *find_or_create_page(struct address_space *mapping, 666struct page *find_or_create_page(struct address_space *mapping,
666 unsigned long index, gfp_t gfp_mask) 667 pgoff_t index, gfp_t gfp_mask)
667{ 668{
668 struct page *page, *cached_page = NULL; 669 struct page *page;
669 int err; 670 int err;
670repeat: 671repeat:
671 page = find_lock_page(mapping, index); 672 page = find_lock_page(mapping, index);
672 if (!page) { 673 if (!page) {
673 if (!cached_page) { 674 page = __page_cache_alloc(gfp_mask);
674 cached_page = 675 if (!page)
675 __page_cache_alloc(gfp_mask); 676 return NULL;
676 if (!cached_page) 677 err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
677 return NULL; 678 if (unlikely(err)) {
679 page_cache_release(page);
680 page = NULL;
681 if (err == -EEXIST)
682 goto repeat;
678 } 683 }
679 err = add_to_page_cache_lru(cached_page, mapping,
680 index, gfp_mask);
681 if (!err) {
682 page = cached_page;
683 cached_page = NULL;
684 } else if (err == -EEXIST)
685 goto repeat;
686 } 684 }
687 if (cached_page)
688 page_cache_release(cached_page);
689 return page; 685 return page;
690} 686}
691EXPORT_SYMBOL(find_or_create_page); 687EXPORT_SYMBOL(find_or_create_page);
@@ -797,7 +793,7 @@ EXPORT_SYMBOL(find_get_pages_tag);
797 * and deadlock against the caller's locked page. 793 * and deadlock against the caller's locked page.
798 */ 794 */
799struct page * 795struct page *
800grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 796grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
801{ 797{
802 struct page *page = find_get_page(mapping, index); 798 struct page *page = find_get_page(mapping, index);
803 799
@@ -859,34 +855,29 @@ static void shrink_readahead_size_eio(struct file *filp,
859 * It may be NULL. 855 * It may be NULL.
860 */ 856 */
861void do_generic_mapping_read(struct address_space *mapping, 857void do_generic_mapping_read(struct address_space *mapping,
862 struct file_ra_state *_ra, 858 struct file_ra_state *ra,
863 struct file *filp, 859 struct file *filp,
864 loff_t *ppos, 860 loff_t *ppos,
865 read_descriptor_t *desc, 861 read_descriptor_t *desc,
866 read_actor_t actor) 862 read_actor_t actor)
867{ 863{
868 struct inode *inode = mapping->host; 864 struct inode *inode = mapping->host;
869 unsigned long index; 865 pgoff_t index;
870 unsigned long offset; 866 pgoff_t last_index;
871 unsigned long last_index; 867 pgoff_t prev_index;
872 unsigned long next_index; 868 unsigned long offset; /* offset into pagecache page */
873 unsigned long prev_index;
874 unsigned int prev_offset; 869 unsigned int prev_offset;
875 struct page *cached_page;
876 int error; 870 int error;
877 struct file_ra_state ra = *_ra;
878 871
879 cached_page = NULL;
880 index = *ppos >> PAGE_CACHE_SHIFT; 872 index = *ppos >> PAGE_CACHE_SHIFT;
881 next_index = index; 873 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
882 prev_index = ra.prev_index; 874 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
883 prev_offset = ra.prev_offset;
884 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 875 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
885 offset = *ppos & ~PAGE_CACHE_MASK; 876 offset = *ppos & ~PAGE_CACHE_MASK;
886 877
887 for (;;) { 878 for (;;) {
888 struct page *page; 879 struct page *page;
889 unsigned long end_index; 880 pgoff_t end_index;
890 loff_t isize; 881 loff_t isize;
891 unsigned long nr, ret; 882 unsigned long nr, ret;
892 883
@@ -895,7 +886,7 @@ find_page:
895 page = find_get_page(mapping, index); 886 page = find_get_page(mapping, index);
896 if (!page) { 887 if (!page) {
897 page_cache_sync_readahead(mapping, 888 page_cache_sync_readahead(mapping,
898 &ra, filp, 889 ra, filp,
899 index, last_index - index); 890 index, last_index - index);
900 page = find_get_page(mapping, index); 891 page = find_get_page(mapping, index);
901 if (unlikely(page == NULL)) 892 if (unlikely(page == NULL))
@@ -903,7 +894,7 @@ find_page:
903 } 894 }
904 if (PageReadahead(page)) { 895 if (PageReadahead(page)) {
905 page_cache_async_readahead(mapping, 896 page_cache_async_readahead(mapping,
906 &ra, filp, page, 897 ra, filp, page,
907 index, last_index - index); 898 index, last_index - index);
908 } 899 }
909 if (!PageUptodate(page)) 900 if (!PageUptodate(page))
@@ -966,7 +957,6 @@ page_ok:
966 index += offset >> PAGE_CACHE_SHIFT; 957 index += offset >> PAGE_CACHE_SHIFT;
967 offset &= ~PAGE_CACHE_MASK; 958 offset &= ~PAGE_CACHE_MASK;
968 prev_offset = offset; 959 prev_offset = offset;
969 ra.prev_offset = offset;
970 960
971 page_cache_release(page); 961 page_cache_release(page);
972 if (ret == nr && desc->count) 962 if (ret == nr && desc->count)
@@ -1015,7 +1005,7 @@ readpage:
1015 } 1005 }
1016 unlock_page(page); 1006 unlock_page(page);
1017 error = -EIO; 1007 error = -EIO;
1018 shrink_readahead_size_eio(filp, &ra); 1008 shrink_readahead_size_eio(filp, ra);
1019 goto readpage_error; 1009 goto readpage_error;
1020 } 1010 }
1021 unlock_page(page); 1011 unlock_page(page);
@@ -1034,33 +1024,29 @@ no_cached_page:
1034 * Ok, it wasn't cached, so we need to create a new 1024 * Ok, it wasn't cached, so we need to create a new
1035 * page.. 1025 * page..
1036 */ 1026 */
1037 if (!cached_page) { 1027 page = page_cache_alloc_cold(mapping);
1038 cached_page = page_cache_alloc_cold(mapping); 1028 if (!page) {
1039 if (!cached_page) { 1029 desc->error = -ENOMEM;
1040 desc->error = -ENOMEM; 1030 goto out;
1041 goto out;
1042 }
1043 } 1031 }
1044 error = add_to_page_cache_lru(cached_page, mapping, 1032 error = add_to_page_cache_lru(page, mapping,
1045 index, GFP_KERNEL); 1033 index, GFP_KERNEL);
1046 if (error) { 1034 if (error) {
1035 page_cache_release(page);
1047 if (error == -EEXIST) 1036 if (error == -EEXIST)
1048 goto find_page; 1037 goto find_page;
1049 desc->error = error; 1038 desc->error = error;
1050 goto out; 1039 goto out;
1051 } 1040 }
1052 page = cached_page;
1053 cached_page = NULL;
1054 goto readpage; 1041 goto readpage;
1055 } 1042 }
1056 1043
1057out: 1044out:
1058 *_ra = ra; 1045 ra->prev_pos = prev_index;
1059 _ra->prev_index = prev_index; 1046 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1047 ra->prev_pos |= prev_offset;
1060 1048
1061 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1049 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1062 if (cached_page)
1063 page_cache_release(cached_page);
1064 if (filp) 1050 if (filp)
1065 file_accessed(filp); 1051 file_accessed(filp);
1066} 1052}
@@ -1220,7 +1206,7 @@ EXPORT_SYMBOL(generic_file_aio_read);
1220 1206
1221static ssize_t 1207static ssize_t
1222do_readahead(struct address_space *mapping, struct file *filp, 1208do_readahead(struct address_space *mapping, struct file *filp,
1223 unsigned long index, unsigned long nr) 1209 pgoff_t index, unsigned long nr)
1224{ 1210{
1225 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1211 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1226 return -EINVAL; 1212 return -EINVAL;
@@ -1240,8 +1226,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1240 if (file) { 1226 if (file) {
1241 if (file->f_mode & FMODE_READ) { 1227 if (file->f_mode & FMODE_READ) {
1242 struct address_space *mapping = file->f_mapping; 1228 struct address_space *mapping = file->f_mapping;
1243 unsigned long start = offset >> PAGE_CACHE_SHIFT; 1229 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1244 unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 1230 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1245 unsigned long len = end - start + 1; 1231 unsigned long len = end - start + 1;
1246 ret = do_readahead(mapping, file, start, len); 1232 ret = do_readahead(mapping, file, start, len);
1247 } 1233 }
@@ -1251,7 +1237,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1251} 1237}
1252 1238
1253#ifdef CONFIG_MMU 1239#ifdef CONFIG_MMU
1254static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1255/** 1240/**
1256 * page_cache_read - adds requested page to the page cache if not already there 1241 * page_cache_read - adds requested page to the page cache if not already there
1257 * @file: file to read 1242 * @file: file to read
@@ -1260,7 +1245,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1260 * This adds the requested page to the page cache if it isn't already there, 1245 * This adds the requested page to the page cache if it isn't already there,
1261 * and schedules an I/O to read in its contents from disk. 1246 * and schedules an I/O to read in its contents from disk.
1262 */ 1247 */
1263static int fastcall page_cache_read(struct file * file, unsigned long offset) 1248static int fastcall page_cache_read(struct file * file, pgoff_t offset)
1264{ 1249{
1265 struct address_space *mapping = file->f_mapping; 1250 struct address_space *mapping = file->f_mapping;
1266 struct page *page; 1251 struct page *page;
@@ -1349,7 +1334,7 @@ retry_find:
1349 * Do we miss much more than hit in this file? If so, 1334 * Do we miss much more than hit in this file? If so,
1350 * stop bothering with read-ahead. It will only hurt. 1335 * stop bothering with read-ahead. It will only hurt.
1351 */ 1336 */
1352 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) 1337 if (ra->mmap_miss > MMAP_LOTSAMISS)
1353 goto no_cached_page; 1338 goto no_cached_page;
1354 1339
1355 /* 1340 /*
@@ -1375,7 +1360,7 @@ retry_find:
1375 } 1360 }
1376 1361
1377 if (!did_readaround) 1362 if (!did_readaround)
1378 ra->mmap_hit++; 1363 ra->mmap_miss--;
1379 1364
1380 /* 1365 /*
1381 * We have a locked page in the page cache, now we need to check 1366 * We have a locked page in the page cache, now we need to check
@@ -1396,7 +1381,7 @@ retry_find:
1396 * Found the page and have a reference on it. 1381 * Found the page and have a reference on it.
1397 */ 1382 */
1398 mark_page_accessed(page); 1383 mark_page_accessed(page);
1399 ra->prev_index = page->index; 1384 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1400 vmf->page = page; 1385 vmf->page = page;
1401 return ret | VM_FAULT_LOCKED; 1386 return ret | VM_FAULT_LOCKED;
1402 1387
@@ -1501,39 +1486,32 @@ EXPORT_SYMBOL(generic_file_mmap);
1501EXPORT_SYMBOL(generic_file_readonly_mmap); 1486EXPORT_SYMBOL(generic_file_readonly_mmap);
1502 1487
1503static struct page *__read_cache_page(struct address_space *mapping, 1488static struct page *__read_cache_page(struct address_space *mapping,
1504 unsigned long index, 1489 pgoff_t index,
1505 int (*filler)(void *,struct page*), 1490 int (*filler)(void *,struct page*),
1506 void *data) 1491 void *data)
1507{ 1492{
1508 struct page *page, *cached_page = NULL; 1493 struct page *page;
1509 int err; 1494 int err;
1510repeat: 1495repeat:
1511 page = find_get_page(mapping, index); 1496 page = find_get_page(mapping, index);
1512 if (!page) { 1497 if (!page) {
1513 if (!cached_page) { 1498 page = page_cache_alloc_cold(mapping);
1514 cached_page = page_cache_alloc_cold(mapping); 1499 if (!page)
1515 if (!cached_page) 1500 return ERR_PTR(-ENOMEM);
1516 return ERR_PTR(-ENOMEM); 1501 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1517 } 1502 if (unlikely(err)) {
1518 err = add_to_page_cache_lru(cached_page, mapping, 1503 page_cache_release(page);
1519 index, GFP_KERNEL); 1504 if (err == -EEXIST)
1520 if (err == -EEXIST) 1505 goto repeat;
1521 goto repeat;
1522 if (err < 0) {
1523 /* Presumably ENOMEM for radix tree node */ 1506 /* Presumably ENOMEM for radix tree node */
1524 page_cache_release(cached_page);
1525 return ERR_PTR(err); 1507 return ERR_PTR(err);
1526 } 1508 }
1527 page = cached_page;
1528 cached_page = NULL;
1529 err = filler(data, page); 1509 err = filler(data, page);
1530 if (err < 0) { 1510 if (err < 0) {
1531 page_cache_release(page); 1511 page_cache_release(page);
1532 page = ERR_PTR(err); 1512 page = ERR_PTR(err);
1533 } 1513 }
1534 } 1514 }
1535 if (cached_page)
1536 page_cache_release(cached_page);
1537 return page; 1515 return page;
1538} 1516}
1539 1517
@@ -1542,7 +1520,7 @@ repeat:
1542 * after submitting it to the filler. 1520 * after submitting it to the filler.
1543 */ 1521 */
1544struct page *read_cache_page_async(struct address_space *mapping, 1522struct page *read_cache_page_async(struct address_space *mapping,
1545 unsigned long index, 1523 pgoff_t index,
1546 int (*filler)(void *,struct page*), 1524 int (*filler)(void *,struct page*),
1547 void *data) 1525 void *data)
1548{ 1526{
@@ -1590,7 +1568,7 @@ EXPORT_SYMBOL(read_cache_page_async);
1590 * If the page does not get brought uptodate, return -EIO. 1568 * If the page does not get brought uptodate, return -EIO.
1591 */ 1569 */
1592struct page *read_cache_page(struct address_space *mapping, 1570struct page *read_cache_page(struct address_space *mapping,
1593 unsigned long index, 1571 pgoff_t index,
1594 int (*filler)(void *,struct page*), 1572 int (*filler)(void *,struct page*),
1595 void *data) 1573 void *data)
1596{ 1574{
@@ -1610,40 +1588,6 @@ struct page *read_cache_page(struct address_space *mapping,
1610EXPORT_SYMBOL(read_cache_page); 1588EXPORT_SYMBOL(read_cache_page);
1611 1589
1612/* 1590/*
1613 * If the page was newly created, increment its refcount and add it to the
1614 * caller's lru-buffering pagevec. This function is specifically for
1615 * generic_file_write().
1616 */
1617static inline struct page *
1618__grab_cache_page(struct address_space *mapping, unsigned long index,
1619 struct page **cached_page, struct pagevec *lru_pvec)
1620{
1621 int err;
1622 struct page *page;
1623repeat:
1624 page = find_lock_page(mapping, index);
1625 if (!page) {
1626 if (!*cached_page) {
1627 *cached_page = page_cache_alloc(mapping);
1628 if (!*cached_page)
1629 return NULL;
1630 }
1631 err = add_to_page_cache(*cached_page, mapping,
1632 index, GFP_KERNEL);
1633 if (err == -EEXIST)
1634 goto repeat;
1635 if (err == 0) {
1636 page = *cached_page;
1637 page_cache_get(page);
1638 if (!pagevec_add(lru_pvec, page))
1639 __pagevec_lru_add(lru_pvec);
1640 *cached_page = NULL;
1641 }
1642 }
1643 return page;
1644}
1645
1646/*
1647 * The logic we want is 1591 * The logic we want is
1648 * 1592 *
1649 * if suid or (sgid and xgrp) 1593 * if suid or (sgid and xgrp)
@@ -1691,8 +1635,7 @@ int remove_suid(struct dentry *dentry)
1691} 1635}
1692EXPORT_SYMBOL(remove_suid); 1636EXPORT_SYMBOL(remove_suid);
1693 1637
1694size_t 1638static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1695__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1696 const struct iovec *iov, size_t base, size_t bytes) 1639 const struct iovec *iov, size_t base, size_t bytes)
1697{ 1640{
1698 size_t copied = 0, left = 0; 1641 size_t copied = 0, left = 0;
@@ -1715,6 +1658,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
1715} 1658}
1716 1659
1717/* 1660/*
1661 * Copy as much as we can into the page and return the number of bytes which
1662 * were sucessfully copied. If a fault is encountered then return the number of
1663 * bytes which were copied.
1664 */
1665size_t iov_iter_copy_from_user_atomic(struct page *page,
1666 struct iov_iter *i, unsigned long offset, size_t bytes)
1667{
1668 char *kaddr;
1669 size_t copied;
1670
1671 BUG_ON(!in_atomic());
1672 kaddr = kmap_atomic(page, KM_USER0);
1673 if (likely(i->nr_segs == 1)) {
1674 int left;
1675 char __user *buf = i->iov->iov_base + i->iov_offset;
1676 left = __copy_from_user_inatomic_nocache(kaddr + offset,
1677 buf, bytes);
1678 copied = bytes - left;
1679 } else {
1680 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1681 i->iov, i->iov_offset, bytes);
1682 }
1683 kunmap_atomic(kaddr, KM_USER0);
1684
1685 return copied;
1686}
1687EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1688
1689/*
1690 * This has the same sideeffects and return value as
1691 * iov_iter_copy_from_user_atomic().
1692 * The difference is that it attempts to resolve faults.
1693 * Page must not be locked.
1694 */
1695size_t iov_iter_copy_from_user(struct page *page,
1696 struct iov_iter *i, unsigned long offset, size_t bytes)
1697{
1698 char *kaddr;
1699 size_t copied;
1700
1701 kaddr = kmap(page);
1702 if (likely(i->nr_segs == 1)) {
1703 int left;
1704 char __user *buf = i->iov->iov_base + i->iov_offset;
1705 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1706 copied = bytes - left;
1707 } else {
1708 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1709 i->iov, i->iov_offset, bytes);
1710 }
1711 kunmap(page);
1712 return copied;
1713}
1714EXPORT_SYMBOL(iov_iter_copy_from_user);
1715
1716static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
1717{
1718 if (likely(i->nr_segs == 1)) {
1719 i->iov_offset += bytes;
1720 } else {
1721 const struct iovec *iov = i->iov;
1722 size_t base = i->iov_offset;
1723
1724 while (bytes) {
1725 int copy = min(bytes, iov->iov_len - base);
1726
1727 bytes -= copy;
1728 base += copy;
1729 if (iov->iov_len == base) {
1730 iov++;
1731 base = 0;
1732 }
1733 }
1734 i->iov = iov;
1735 i->iov_offset = base;
1736 }
1737}
1738
1739void iov_iter_advance(struct iov_iter *i, size_t bytes)
1740{
1741 BUG_ON(i->count < bytes);
1742
1743 __iov_iter_advance_iov(i, bytes);
1744 i->count -= bytes;
1745}
1746EXPORT_SYMBOL(iov_iter_advance);
1747
1748/*
1749 * Fault in the first iovec of the given iov_iter, to a maximum length
1750 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1751 * accessed (ie. because it is an invalid address).
1752 *
1753 * writev-intensive code may want this to prefault several iovecs -- that
1754 * would be possible (callers must not rely on the fact that _only_ the
1755 * first iovec will be faulted with the current implementation).
1756 */
1757int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1758{
1759 char __user *buf = i->iov->iov_base + i->iov_offset;
1760 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1761 return fault_in_pages_readable(buf, bytes);
1762}
1763EXPORT_SYMBOL(iov_iter_fault_in_readable);
1764
1765/*
1766 * Return the count of just the current iov_iter segment.
1767 */
1768size_t iov_iter_single_seg_count(struct iov_iter *i)
1769{
1770 const struct iovec *iov = i->iov;
1771 if (i->nr_segs == 1)
1772 return i->count;
1773 else
1774 return min(i->count, iov->iov_len - i->iov_offset);
1775}
1776EXPORT_SYMBOL(iov_iter_single_seg_count);
1777
1778/*
1718 * Performs necessary checks before doing a write 1779 * Performs necessary checks before doing a write
1719 * 1780 *
1720 * Can adjust writing position or amount of bytes to write. 1781 * Can adjust writing position or amount of bytes to write.
@@ -1796,6 +1857,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1796} 1857}
1797EXPORT_SYMBOL(generic_write_checks); 1858EXPORT_SYMBOL(generic_write_checks);
1798 1859
1860int pagecache_write_begin(struct file *file, struct address_space *mapping,
1861 loff_t pos, unsigned len, unsigned flags,
1862 struct page **pagep, void **fsdata)
1863{
1864 const struct address_space_operations *aops = mapping->a_ops;
1865
1866 if (aops->write_begin) {
1867 return aops->write_begin(file, mapping, pos, len, flags,
1868 pagep, fsdata);
1869 } else {
1870 int ret;
1871 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1872 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1873 struct inode *inode = mapping->host;
1874 struct page *page;
1875again:
1876 page = __grab_cache_page(mapping, index);
1877 *pagep = page;
1878 if (!page)
1879 return -ENOMEM;
1880
1881 if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
1882 /*
1883 * There is no way to resolve a short write situation
1884 * for a !Uptodate page (except by double copying in
1885 * the caller done by generic_perform_write_2copy).
1886 *
1887 * Instead, we have to bring it uptodate here.
1888 */
1889 ret = aops->readpage(file, page);
1890 page_cache_release(page);
1891 if (ret) {
1892 if (ret == AOP_TRUNCATED_PAGE)
1893 goto again;
1894 return ret;
1895 }
1896 goto again;
1897 }
1898
1899 ret = aops->prepare_write(file, page, offset, offset+len);
1900 if (ret) {
1901 unlock_page(page);
1902 page_cache_release(page);
1903 if (pos + len > inode->i_size)
1904 vmtruncate(inode, inode->i_size);
1905 }
1906 return ret;
1907 }
1908}
1909EXPORT_SYMBOL(pagecache_write_begin);
1910
1911int pagecache_write_end(struct file *file, struct address_space *mapping,
1912 loff_t pos, unsigned len, unsigned copied,
1913 struct page *page, void *fsdata)
1914{
1915 const struct address_space_operations *aops = mapping->a_ops;
1916 int ret;
1917
1918 if (aops->write_end) {
1919 mark_page_accessed(page);
1920 ret = aops->write_end(file, mapping, pos, len, copied,
1921 page, fsdata);
1922 } else {
1923 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1924 struct inode *inode = mapping->host;
1925
1926 flush_dcache_page(page);
1927 ret = aops->commit_write(file, page, offset, offset+len);
1928 unlock_page(page);
1929 mark_page_accessed(page);
1930 page_cache_release(page);
1931
1932 if (ret < 0) {
1933 if (pos + len > inode->i_size)
1934 vmtruncate(inode, inode->i_size);
1935 } else if (ret > 0)
1936 ret = min_t(size_t, copied, ret);
1937 else
1938 ret = copied;
1939 }
1940
1941 return ret;
1942}
1943EXPORT_SYMBOL(pagecache_write_end);
1944
1799ssize_t 1945ssize_t
1800generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 1946generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1801 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 1947 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1835,151 +1981,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1835} 1981}
1836EXPORT_SYMBOL(generic_file_direct_write); 1982EXPORT_SYMBOL(generic_file_direct_write);
1837 1983
1838ssize_t 1984/*
1839generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 1985 * Find or create a page at the given pagecache position. Return the locked
1840 unsigned long nr_segs, loff_t pos, loff_t *ppos, 1986 * page. This function is specifically for buffered writes.
1841 size_t count, ssize_t written) 1987 */
1988struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
1842{ 1989{
1843 struct file *file = iocb->ki_filp; 1990 int status;
1844 struct address_space * mapping = file->f_mapping; 1991 struct page *page;
1845 const struct address_space_operations *a_ops = mapping->a_ops; 1992repeat:
1846 struct inode *inode = mapping->host; 1993 page = find_lock_page(mapping, index);
1847 long status = 0; 1994 if (likely(page))
1848 struct page *page; 1995 return page;
1849 struct page *cached_page = NULL;
1850 size_t bytes;
1851 struct pagevec lru_pvec;
1852 const struct iovec *cur_iov = iov; /* current iovec */
1853 size_t iov_base = 0; /* offset in the current iovec */
1854 char __user *buf;
1855
1856 pagevec_init(&lru_pvec, 0);
1857 1996
1858 /* 1997 page = page_cache_alloc(mapping);
1859 * handle partial DIO write. Adjust cur_iov if needed. 1998 if (!page)
1860 */ 1999 return NULL;
1861 if (likely(nr_segs == 1)) 2000 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1862 buf = iov->iov_base + written; 2001 if (unlikely(status)) {
1863 else { 2002 page_cache_release(page);
1864 filemap_set_next_iovec(&cur_iov, &iov_base, written); 2003 if (status == -EEXIST)
1865 buf = cur_iov->iov_base + iov_base; 2004 goto repeat;
2005 return NULL;
1866 } 2006 }
2007 return page;
2008}
2009EXPORT_SYMBOL(__grab_cache_page);
2010
2011static ssize_t generic_perform_write_2copy(struct file *file,
2012 struct iov_iter *i, loff_t pos)
2013{
2014 struct address_space *mapping = file->f_mapping;
2015 const struct address_space_operations *a_ops = mapping->a_ops;
2016 struct inode *inode = mapping->host;
2017 long status = 0;
2018 ssize_t written = 0;
1867 2019
1868 do { 2020 do {
1869 unsigned long index; 2021 struct page *src_page;
1870 unsigned long offset; 2022 struct page *page;
1871 size_t copied; 2023 pgoff_t index; /* Pagecache index for current page */
2024 unsigned long offset; /* Offset into pagecache page */
2025 unsigned long bytes; /* Bytes to write to page */
2026 size_t copied; /* Bytes copied from user */
1872 2027
1873 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 2028 offset = (pos & (PAGE_CACHE_SIZE - 1));
1874 index = pos >> PAGE_CACHE_SHIFT; 2029 index = pos >> PAGE_CACHE_SHIFT;
1875 bytes = PAGE_CACHE_SIZE - offset; 2030 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
1876 2031 iov_iter_count(i));
1877 /* Limit the size of the copy to the caller's write size */
1878 bytes = min(bytes, count);
1879 2032
1880 /* We only need to worry about prefaulting when writes are from 2033 /*
1881 * user-space. NFSd uses vfs_writev with several non-aligned 2034 * a non-NULL src_page indicates that we're doing the
1882 * segments in the vector, and limiting to one segment a time is 2035 * copy via get_user_pages and kmap.
1883 * a noticeable performance for re-write
1884 */ 2036 */
1885 if (!segment_eq(get_fs(), KERNEL_DS)) { 2037 src_page = NULL;
1886 /*
1887 * Limit the size of the copy to that of the current
1888 * segment, because fault_in_pages_readable() doesn't
1889 * know how to walk segments.
1890 */
1891 bytes = min(bytes, cur_iov->iov_len - iov_base);
1892 2038
1893 /* 2039 /*
1894 * Bring in the user page that we will copy from 2040 * Bring in the user page that we will copy from _first_.
1895 * _first_. Otherwise there's a nasty deadlock on 2041 * Otherwise there's a nasty deadlock on copying from the
1896 * copying from the same page as we're writing to, 2042 * same page as we're writing to, without it being marked
1897 * without it being marked up-to-date. 2043 * up-to-date.
1898 */ 2044 *
1899 fault_in_pages_readable(buf, bytes); 2045 * Not only is this an optimisation, but it is also required
2046 * to check that the address is actually valid, when atomic
2047 * usercopies are used, below.
2048 */
2049 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2050 status = -EFAULT;
2051 break;
1900 } 2052 }
1901 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2053
2054 page = __grab_cache_page(mapping, index);
1902 if (!page) { 2055 if (!page) {
1903 status = -ENOMEM; 2056 status = -ENOMEM;
1904 break; 2057 break;
1905 } 2058 }
1906 2059
1907 if (unlikely(bytes == 0)) { 2060 /*
1908 status = 0; 2061 * non-uptodate pages cannot cope with short copies, and we
1909 copied = 0; 2062 * cannot take a pagefault with the destination page locked.
1910 goto zero_length_segment; 2063 * So pin the source page to copy it.
1911 } 2064 */
2065 if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2066 unlock_page(page);
1912 2067
1913 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2068 src_page = alloc_page(GFP_KERNEL);
1914 if (unlikely(status)) { 2069 if (!src_page) {
1915 loff_t isize = i_size_read(inode); 2070 page_cache_release(page);
2071 status = -ENOMEM;
2072 break;
2073 }
2074
2075 /*
2076 * Cannot get_user_pages with a page locked for the
2077 * same reason as we can't take a page fault with a
2078 * page locked (as explained below).
2079 */
2080 copied = iov_iter_copy_from_user(src_page, i,
2081 offset, bytes);
2082 if (unlikely(copied == 0)) {
2083 status = -EFAULT;
2084 page_cache_release(page);
2085 page_cache_release(src_page);
2086 break;
2087 }
2088 bytes = copied;
1916 2089
1917 if (status != AOP_TRUNCATED_PAGE) 2090 lock_page(page);
2091 /*
2092 * Can't handle the page going uptodate here, because
2093 * that means we would use non-atomic usercopies, which
2094 * zero out the tail of the page, which can cause
2095 * zeroes to become transiently visible. We could just
2096 * use a non-zeroing copy, but the APIs aren't too
2097 * consistent.
2098 */
2099 if (unlikely(!page->mapping || PageUptodate(page))) {
1918 unlock_page(page); 2100 unlock_page(page);
1919 page_cache_release(page); 2101 page_cache_release(page);
1920 if (status == AOP_TRUNCATED_PAGE) 2102 page_cache_release(src_page);
1921 continue; 2103 continue;
2104 }
2105 }
2106
2107 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2108 if (unlikely(status))
2109 goto fs_write_aop_error;
2110
2111 if (!src_page) {
1922 /* 2112 /*
1923 * prepare_write() may have instantiated a few blocks 2113 * Must not enter the pagefault handler here, because
1924 * outside i_size. Trim these off again. 2114 * we hold the page lock, so we might recursively
2115 * deadlock on the same lock, or get an ABBA deadlock
2116 * against a different lock, or against the mmap_sem
2117 * (which nests outside the page lock). So increment
2118 * preempt count, and use _atomic usercopies.
2119 *
2120 * The page is uptodate so we are OK to encounter a
2121 * short copy: if unmodified parts of the page are
2122 * marked dirty and written out to disk, it doesn't
2123 * really matter.
1925 */ 2124 */
1926 if (pos + bytes > isize) 2125 pagefault_disable();
1927 vmtruncate(inode, isize); 2126 copied = iov_iter_copy_from_user_atomic(page, i,
1928 break; 2127 offset, bytes);
2128 pagefault_enable();
2129 } else {
2130 void *src, *dst;
2131 src = kmap_atomic(src_page, KM_USER0);
2132 dst = kmap_atomic(page, KM_USER1);
2133 memcpy(dst + offset, src + offset, bytes);
2134 kunmap_atomic(dst, KM_USER1);
2135 kunmap_atomic(src, KM_USER0);
2136 copied = bytes;
1929 } 2137 }
1930 if (likely(nr_segs == 1))
1931 copied = filemap_copy_from_user(page, offset,
1932 buf, bytes);
1933 else
1934 copied = filemap_copy_from_user_iovec(page, offset,
1935 cur_iov, iov_base, bytes);
1936 flush_dcache_page(page); 2138 flush_dcache_page(page);
2139
1937 status = a_ops->commit_write(file, page, offset, offset+bytes); 2140 status = a_ops->commit_write(file, page, offset, offset+bytes);
1938 if (status == AOP_TRUNCATED_PAGE) { 2141 if (unlikely(status < 0))
1939 page_cache_release(page); 2142 goto fs_write_aop_error;
1940 continue; 2143 if (unlikely(status > 0)) /* filesystem did partial write */
1941 } 2144 copied = min_t(size_t, copied, status);
1942zero_length_segment: 2145
1943 if (likely(copied >= 0)) {
1944 if (!status)
1945 status = copied;
1946
1947 if (status >= 0) {
1948 written += status;
1949 count -= status;
1950 pos += status;
1951 buf += status;
1952 if (unlikely(nr_segs > 1)) {
1953 filemap_set_next_iovec(&cur_iov,
1954 &iov_base, status);
1955 if (count)
1956 buf = cur_iov->iov_base +
1957 iov_base;
1958 } else {
1959 iov_base += status;
1960 }
1961 }
1962 }
1963 if (unlikely(copied != bytes))
1964 if (status >= 0)
1965 status = -EFAULT;
1966 unlock_page(page); 2146 unlock_page(page);
1967 mark_page_accessed(page); 2147 mark_page_accessed(page);
1968 page_cache_release(page); 2148 page_cache_release(page);
1969 if (status < 0) 2149 if (src_page)
1970 break; 2150 page_cache_release(src_page);
2151
2152 iov_iter_advance(i, copied);
2153 pos += copied;
2154 written += copied;
2155
1971 balance_dirty_pages_ratelimited(mapping); 2156 balance_dirty_pages_ratelimited(mapping);
1972 cond_resched(); 2157 cond_resched();
1973 } while (count); 2158 continue;
1974 *ppos = pos;
1975 2159
1976 if (cached_page) 2160fs_write_aop_error:
1977 page_cache_release(cached_page); 2161 unlock_page(page);
2162 page_cache_release(page);
2163 if (src_page)
2164 page_cache_release(src_page);
2165
2166 /*
2167 * prepare_write() may have instantiated a few blocks
2168 * outside i_size. Trim these off again. Don't need
2169 * i_size_read because we hold i_mutex.
2170 */
2171 if (pos + bytes > inode->i_size)
2172 vmtruncate(inode, inode->i_size);
2173 break;
2174 } while (iov_iter_count(i));
2175
2176 return written ? written : status;
2177}
2178
2179static ssize_t generic_perform_write(struct file *file,
2180 struct iov_iter *i, loff_t pos)
2181{
2182 struct address_space *mapping = file->f_mapping;
2183 const struct address_space_operations *a_ops = mapping->a_ops;
2184 long status = 0;
2185 ssize_t written = 0;
2186 unsigned int flags = 0;
1978 2187
1979 /* 2188 /*
1980 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC 2189 * Copies from kernel address space cannot fail (NFSD is a big user).
1981 */ 2190 */
2191 if (segment_eq(get_fs(), KERNEL_DS))
2192 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2193
2194 do {
2195 struct page *page;
2196 pgoff_t index; /* Pagecache index for current page */
2197 unsigned long offset; /* Offset into pagecache page */
2198 unsigned long bytes; /* Bytes to write to page */
2199 size_t copied; /* Bytes copied from user */
2200 void *fsdata;
2201
2202 offset = (pos & (PAGE_CACHE_SIZE - 1));
2203 index = pos >> PAGE_CACHE_SHIFT;
2204 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2205 iov_iter_count(i));
2206
2207again:
2208
2209 /*
2210 * Bring in the user page that we will copy from _first_.
2211 * Otherwise there's a nasty deadlock on copying from the
2212 * same page as we're writing to, without it being marked
2213 * up-to-date.
2214 *
2215 * Not only is this an optimisation, but it is also required
2216 * to check that the address is actually valid, when atomic
2217 * usercopies are used, below.
2218 */
2219 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2220 status = -EFAULT;
2221 break;
2222 }
2223
2224 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2225 &page, &fsdata);
2226 if (unlikely(status))
2227 break;
2228
2229 pagefault_disable();
2230 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2231 pagefault_enable();
2232 flush_dcache_page(page);
2233
2234 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2235 page, fsdata);
2236 if (unlikely(status < 0))
2237 break;
2238 copied = status;
2239
2240 cond_resched();
2241
2242 if (unlikely(copied == 0)) {
2243 /*
2244 * If we were unable to copy any data at all, we must
2245 * fall back to a single segment length write.
2246 *
2247 * If we didn't fallback here, we could livelock
2248 * because not all segments in the iov can be copied at
2249 * once without a pagefault.
2250 */
2251 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2252 iov_iter_single_seg_count(i));
2253 goto again;
2254 }
2255 iov_iter_advance(i, copied);
2256 pos += copied;
2257 written += copied;
2258
2259 balance_dirty_pages_ratelimited(mapping);
2260
2261 } while (iov_iter_count(i));
2262
2263 return written ? written : status;
2264}
2265
2266ssize_t
2267generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2268 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2269 size_t count, ssize_t written)
2270{
2271 struct file *file = iocb->ki_filp;
2272 struct address_space *mapping = file->f_mapping;
2273 const struct address_space_operations *a_ops = mapping->a_ops;
2274 struct inode *inode = mapping->host;
2275 ssize_t status;
2276 struct iov_iter i;
2277
2278 iov_iter_init(&i, iov, nr_segs, count, written);
2279 if (a_ops->write_begin)
2280 status = generic_perform_write(file, &i, pos);
2281 else
2282 status = generic_perform_write_2copy(file, &i, pos);
2283
1982 if (likely(status >= 0)) { 2284 if (likely(status >= 0)) {
2285 written += status;
2286 *ppos = pos + status;
2287
2288 /*
2289 * For now, when the user asks for O_SYNC, we'll actually give
2290 * O_DSYNC
2291 */
1983 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2292 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1984 if (!a_ops->writepage || !is_sync_kiocb(iocb)) 2293 if (!a_ops->writepage || !is_sync_kiocb(iocb))
1985 status = generic_osync_inode(inode, mapping, 2294 status = generic_osync_inode(inode, mapping,
@@ -1995,7 +2304,6 @@ zero_length_segment:
1995 if (unlikely(file->f_flags & O_DIRECT) && written) 2304 if (unlikely(file->f_flags & O_DIRECT) && written)
1996 status = filemap_write_and_wait(mapping); 2305 status = filemap_write_and_wait(mapping);
1997 2306
1998 pagevec_lru_add(&lru_pvec);
1999 return written ? written : status; 2307 return written ? written : status;
2000} 2308}
2001EXPORT_SYMBOL(generic_file_buffered_write); 2309EXPORT_SYMBOL(generic_file_buffered_write);
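
As an illustration of the interface added above (this helper is not part of the patch), a hypothetical caller that already holds i_mutex and has validated pos/count could push a kernel buffer into the pagecache through the new begin/end hooks. The loop mirrors generic_perform_write(), but uses a plain memcpy() because the source is kernel memory and cannot fault; example_write_kernel_buf() and its calling convention are assumptions made for the sketch, everything it calls comes from this patch or the existing pagecache API:

static int example_write_kernel_buf(struct file *file, loff_t pos,
				    const char *buf, size_t count)
{
	struct address_space *mapping = file->f_mapping;
	long status = 0;

	while (count) {
		unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);
		unsigned long bytes = min_t(unsigned long,
					PAGE_CACHE_SIZE - offset, count);
		struct page *page;
		void *fsdata;
		char *kaddr;

		/* Kernel source cannot fault, so short copies need no handling. */
		status = pagecache_write_begin(file, mapping, pos, bytes,
					       AOP_FLAG_UNINTERRUPTIBLE,
					       &page, &fsdata);
		if (status)
			break;

		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr + offset, buf, bytes);
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(page);

		/* Unlocks and releases the page; returns bytes committed or -errno. */
		status = pagecache_write_end(file, mapping, pos, bytes, bytes,
					     page, fsdata);
		if (status < 0)
			break;

		pos += status;
		buf += status;
		count -= status;
		balance_dirty_pages_ratelimited(mapping);
	}
	return status < 0 ? status : 0;
}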
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@
1/*
2 * linux/mm/filemap.h
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7#ifndef __FILEMAP_H
8#define __FILEMAP_H
9
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/highmem.h>
14#include <linux/uio.h>
15#include <linux/uaccess.h>
16
17size_t
18__filemap_copy_from_user_iovec_inatomic(char *vaddr,
19 const struct iovec *iov,
20 size_t base,
21 size_t bytes);
22
23/*
24 * Copy as much as we can into the page and return the number of bytes which
25 * were sucessfully copied. If a fault is encountered then clear the page
26 * out to (offset+bytes) and return the number of bytes which were copied.
27 *
28 * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
29 * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
30 * and if the following non-atomic copy succeeds, then there is a small window
31 * where the target page contains neither the data before the write, nor the
32 * data after the write (it contains zero). A read at this time will see
33 * data that is inconsistent with any ordering of the read and the write.
34 * (This has been detected in practice).
35 */
36static inline size_t
37filemap_copy_from_user(struct page *page, unsigned long offset,
38 const char __user *buf, unsigned bytes)
39{
40 char *kaddr;
41 int left;
42
43 kaddr = kmap_atomic(page, KM_USER0);
44 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
45 kunmap_atomic(kaddr, KM_USER0);
46
47 if (left != 0) {
48 /* Do it the slow way */
49 kaddr = kmap(page);
50 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
51 kunmap(page);
52 }
53 return bytes - left;
54}
55
56/*
57 * This has the same sideeffects and return value as filemap_copy_from_user().
58 * The difference is that on a fault we need to memset the remainder of the
59 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
60 * single-segment behaviour.
61 */
62static inline size_t
63filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
64 const struct iovec *iov, size_t base, size_t bytes)
65{
66 char *kaddr;
67 size_t copied;
68
69 kaddr = kmap_atomic(page, KM_USER0);
70 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
71 base, bytes);
72 kunmap_atomic(kaddr, KM_USER0);
73 if (copied != bytes) {
74 kaddr = kmap(page);
75 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
76 base, bytes);
77 if (bytes - copied)
78 memset(kaddr + offset + copied, 0, bytes - copied);
79 kunmap(page);
80 }
81 return copied;
82}
83
84static inline void
85filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
86{
87 const struct iovec *iov = *iovp;
88 size_t base = *basep;
89
90 do {
91 int copy = min(bytes, iov->iov_len - base);
92
93 bytes -= copy;
94 base += copy;
95 if (iov->iov_len == base) {
96 iov++;
97 base = 0;
98 }
99 } while (bytes);
100 *iovp = iov;
101 *basep = base;
102}
103#endif
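
The helpers deleted here lose their last users with the filemap.c conversion above and the filemap_xip.c conversion below, so the private header can go away; filesystems instead supply pagecache setup and commit themselves via ->write_begin/->write_end. Purely as a sketch under stated assumptions (a hypothetical filesystem whose pagecache pages are always kept uptodate, no block allocation, error handling reduced to the essentials), such a pair might look like the following, reusing the __grab_cache_page() helper exported by this patch:

static int example_write_begin(struct file *file,
			       struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	page = __grab_cache_page(mapping, index);	/* returned locked */
	if (!page)
		return -ENOMEM;

	if (!PageUptodate(page)) {
		/* Fresh page: zero it so a short copy cannot expose stale data. */
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	*pagep = page;
	return 0;
}

static int example_write_end(struct file *file,
			     struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned copied,
			     struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	set_page_dirty(page);
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	unlock_page(page);
	page_cache_release(page);
	return copied;
}

Because pages here are always uptodate, a short atomic usercopy in generic_perform_write() is harmless and simply retried; the generic_perform_write_2copy() path above exists for filesystems that cannot give that guarantee.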
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 53ee6a299635..32132f3cd641 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,7 +15,6 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include "filemap.h"
19 18
20/* 19/*
21 * We do use our own empty page to avoid interference with other users 20 * We do use our own empty page to avoid interference with other users
@@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
288 unsigned long index; 287 unsigned long index;
289 unsigned long offset; 288 unsigned long offset;
290 size_t copied; 289 size_t copied;
290 char *kaddr;
291 291
292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
293 index = pos >> PAGE_CACHE_SHIFT; 293 index = pos >> PAGE_CACHE_SHIFT;
@@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf,
295 if (bytes > count) 295 if (bytes > count)
296 bytes = count; 296 bytes = count;
297 297
298 /*
299 * Bring in the user page that we will copy from _first_.
300 * Otherwise there's a nasty deadlock on copying from the
301 * same page as we're writing to, without it being marked
302 * up-to-date.
303 */
304 fault_in_pages_readable(buf, bytes);
305
306 page = a_ops->get_xip_page(mapping, 298 page = a_ops->get_xip_page(mapping,
307 index*(PAGE_SIZE/512), 0); 299 index*(PAGE_SIZE/512), 0);
308 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { 300 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
@@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf,
319 break; 311 break;
320 } 312 }
321 313
322 copied = filemap_copy_from_user(page, offset, buf, bytes); 314 fault_in_pages_readable(buf, bytes);
315 kaddr = kmap_atomic(page, KM_USER0);
316 copied = bytes -
317 __copy_from_user_inatomic_nocache(kaddr, buf, bytes);
318 kunmap_atomic(kaddr, KM_USER0);
323 flush_dcache_page(page); 319 flush_dcache_page(page);
320
324 if (likely(copied > 0)) { 321 if (likely(copied > 0)) {
325 status = copied; 322 status = copied;
326 323
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eab8c428cc93..ae2959bb59cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,12 +23,16 @@
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
26unsigned long max_huge_pages; 27unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 28static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 29static unsigned int nr_huge_pages_node[MAX_NUMNODES];
29static unsigned int free_huge_pages_node[MAX_NUMNODES]; 30static unsigned int free_huge_pages_node[MAX_NUMNODES];
31static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
30static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 32static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
31unsigned long hugepages_treat_as_movable; 33unsigned long hugepages_treat_as_movable;
34int hugetlb_dynamic_pool;
35static int hugetlb_next_nid;
32 36
33/* 37/*
34 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 38 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
85 list_del(&page->lru); 89 list_del(&page->lru);
86 free_huge_pages--; 90 free_huge_pages--;
87 free_huge_pages_node[nid]--; 91 free_huge_pages_node[nid]--;
92 if (vma && vma->vm_flags & VM_MAYSHARE)
93 resv_huge_pages--;
88 break; 94 break;
89 } 95 }
90 } 96 }
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
92 return page; 98 return page;
93} 99}
94 100
101static void update_and_free_page(struct page *page)
102{
103 int i;
104 nr_huge_pages--;
105 nr_huge_pages_node[page_to_nid(page)]--;
106 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
107 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
108 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
109 1 << PG_private | 1<< PG_writeback);
110 }
111 set_compound_page_dtor(page, NULL);
112 set_page_refcounted(page);
113 __free_pages(page, HUGETLB_PAGE_ORDER);
114}
115
95static void free_huge_page(struct page *page) 116static void free_huge_page(struct page *page)
96{ 117{
97 BUG_ON(page_count(page)); 118 int nid = page_to_nid(page);
98 119
120 BUG_ON(page_count(page));
99 INIT_LIST_HEAD(&page->lru); 121 INIT_LIST_HEAD(&page->lru);
100 122
101 spin_lock(&hugetlb_lock); 123 spin_lock(&hugetlb_lock);
102 enqueue_huge_page(page); 124 if (surplus_huge_pages_node[nid]) {
125 update_and_free_page(page);
126 surplus_huge_pages--;
127 surplus_huge_pages_node[nid]--;
128 } else {
129 enqueue_huge_page(page);
130 }
103 spin_unlock(&hugetlb_lock); 131 spin_unlock(&hugetlb_lock);
104} 132}
105 133
106static int alloc_fresh_huge_page(void) 134/*
135 * Increment or decrement surplus_huge_pages. Keep node-specific counters
136 * balanced by operating on them in a round-robin fashion.
137 * Returns 1 if an adjustment was made.
138 */
139static int adjust_pool_surplus(int delta)
107{ 140{
108 static int prev_nid; 141 static int prev_nid;
109 struct page *page; 142 int nid = prev_nid;
110 int nid; 143 int ret = 0;
144
145 VM_BUG_ON(delta != -1 && delta != 1);
146 do {
147 nid = next_node(nid, node_online_map);
148 if (nid == MAX_NUMNODES)
149 nid = first_node(node_online_map);
150
151 /* To shrink on this node, there must be a surplus page */
152 if (delta < 0 && !surplus_huge_pages_node[nid])
153 continue;
154 /* Surplus cannot exceed the total number of pages */
155 if (delta > 0 && surplus_huge_pages_node[nid] >=
156 nr_huge_pages_node[nid])
157 continue;
158
159 surplus_huge_pages += delta;
160 surplus_huge_pages_node[nid] += delta;
161 ret = 1;
162 break;
163 } while (nid != prev_nid);
111 164
112 /*
113 * Copy static prev_nid to local nid, work on that, then copy it
114 * back to prev_nid afterwards: otherwise there's a window in which
115 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
116 * But we don't need to use a spin_lock here: it really doesn't
117 * matter if occasionally a racer chooses the same nid as we do.
118 */
119 nid = next_node(prev_nid, node_online_map);
120 if (nid == MAX_NUMNODES)
121 nid = first_node(node_online_map);
122 prev_nid = nid; 165 prev_nid = nid;
166 return ret;
167}
168
169static struct page *alloc_fresh_huge_page_node(int nid)
170{
171 struct page *page;
123 172
124 page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, 173 page = alloc_pages_node(nid,
174 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
175 HUGETLB_PAGE_ORDER);
176 if (page) {
177 set_compound_page_dtor(page, free_huge_page);
178 spin_lock(&hugetlb_lock);
179 nr_huge_pages++;
180 nr_huge_pages_node[nid]++;
181 spin_unlock(&hugetlb_lock);
182 put_page(page); /* free it into the hugepage allocator */
183 }
184
185 return page;
186}
187
188static int alloc_fresh_huge_page(void)
189{
190 struct page *page;
191 int start_nid;
192 int next_nid;
193 int ret = 0;
194
195 start_nid = hugetlb_next_nid;
196
197 do {
198 page = alloc_fresh_huge_page_node(hugetlb_next_nid);
199 if (page)
200 ret = 1;
201 /*
202 * Use a helper variable to find the next node and then
203 * copy it back to hugetlb_next_nid afterwards:
204 * otherwise there's a window in which a racer might
205 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
206 * But we don't need to use a spin_lock here: it really
207 * doesn't matter if occasionally a racer chooses the
208 * same nid as we do. Move nid forward in the mask even
209 * if we just successfully allocated a hugepage so that
210 * the next caller gets hugepages on the next node.
211 */
212 next_nid = next_node(hugetlb_next_nid, node_online_map);
213 if (next_nid == MAX_NUMNODES)
214 next_nid = first_node(node_online_map);
215 hugetlb_next_nid = next_nid;
216 } while (!page && hugetlb_next_nid != start_nid);
217
218 return ret;
219}
220
221static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
222 unsigned long address)
223{
224 struct page *page;
225
226 /* Check if the dynamic pool is enabled */
227 if (!hugetlb_dynamic_pool)
228 return NULL;
229
230 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
125 HUGETLB_PAGE_ORDER); 231 HUGETLB_PAGE_ORDER);
126 if (page) { 232 if (page) {
127 set_compound_page_dtor(page, free_huge_page); 233 set_compound_page_dtor(page, free_huge_page);
128 spin_lock(&hugetlb_lock); 234 spin_lock(&hugetlb_lock);
129 nr_huge_pages++; 235 nr_huge_pages++;
130 nr_huge_pages_node[page_to_nid(page)]++; 236 nr_huge_pages_node[page_to_nid(page)]++;
237 surplus_huge_pages++;
238 surplus_huge_pages_node[page_to_nid(page)]++;
131 spin_unlock(&hugetlb_lock); 239 spin_unlock(&hugetlb_lock);
132 put_page(page); /* free it into the hugepage allocator */
133 return 1;
134 } 240 }
135 return 0; 241
242 return page;
243}
244
245/*
246 * Increase the hugetlb pool such that it can accomodate a reservation
247 * of size 'delta'.
248 */
249static int gather_surplus_pages(int delta)
250{
251 struct list_head surplus_list;
252 struct page *page, *tmp;
253 int ret, i;
254 int needed, allocated;
255
256 needed = (resv_huge_pages + delta) - free_huge_pages;
257 if (needed <= 0)
258 return 0;
259
260 allocated = 0;
261 INIT_LIST_HEAD(&surplus_list);
262
263 ret = -ENOMEM;
264retry:
265 spin_unlock(&hugetlb_lock);
266 for (i = 0; i < needed; i++) {
267 page = alloc_buddy_huge_page(NULL, 0);
268 if (!page) {
269 /*
270 * We were not able to allocate enough pages to
271 * satisfy the entire reservation so we free what
272 * we've allocated so far.
273 */
274 spin_lock(&hugetlb_lock);
275 needed = 0;
276 goto free;
277 }
278
279 list_add(&page->lru, &surplus_list);
280 }
281 allocated += needed;
282
283 /*
284 * After retaking hugetlb_lock, we need to recalculate 'needed'
285 * because either resv_huge_pages or free_huge_pages may have changed.
286 */
287 spin_lock(&hugetlb_lock);
288 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
289 if (needed > 0)
290 goto retry;
291
292 /*
293 * The surplus_list now contains _at_least_ the number of extra pages
294 * needed to accomodate the reservation. Add the appropriate number
295 * of pages to the hugetlb pool and free the extras back to the buddy
296 * allocator.
297 */
298 needed += allocated;
299 ret = 0;
300free:
301 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
302 list_del(&page->lru);
303 if ((--needed) >= 0)
304 enqueue_huge_page(page);
305 else {
306 /*
307 * Decrement the refcount and free the page using its
308 * destructor. This must be done with hugetlb_lock
309 * unlocked which is safe because free_huge_page takes
310 * hugetlb_lock before deciding how to free the page.
311 */
312 spin_unlock(&hugetlb_lock);
313 put_page(page);
314 spin_lock(&hugetlb_lock);
315 }
316 }
317
318 return ret;
319}
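
A short worked example of the accounting above, with illustrative numbers only: if resv_huge_pages = 10, free_huge_pages = 12 and the requested delta is 4, then needed = (10 + 4) - 12 = 2, so two surplus pages are allocated with hugetlb_lock dropped. If another CPU consumed a free page in the meantime, the recalculation after retaking the lock gives needed = 14 - (11 + 2) = 1 and the loop retries once more; when the recalculated value is no longer positive, needed += allocated (here 3) and exactly that many pages from surplus_list are enqueued into the pool, with any extras handed back to the buddy allocator.
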
320
321/*
322 * When releasing a hugetlb pool reservation, any surplus pages that were
323 * allocated to satisfy the reservation must be explicitly freed if they were
324 * never used.
325 */
326void return_unused_surplus_pages(unsigned long unused_resv_pages)
327{
328 static int nid = -1;
329 struct page *page;
330 unsigned long nr_pages;
331
332 nr_pages = min(unused_resv_pages, surplus_huge_pages);
333
334 while (nr_pages) {
335 nid = next_node(nid, node_online_map);
336 if (nid == MAX_NUMNODES)
337 nid = first_node(node_online_map);
338
339 if (!surplus_huge_pages_node[nid])
340 continue;
341
342 if (!list_empty(&hugepage_freelists[nid])) {
343 page = list_entry(hugepage_freelists[nid].next,
344 struct page, lru);
345 list_del(&page->lru);
346 update_and_free_page(page);
347 free_huge_pages--;
348 free_huge_pages_node[nid]--;
349 surplus_huge_pages--;
350 surplus_huge_pages_node[nid]--;
351 nr_pages--;
352 }
353 }
136} 354}
137 355
138static struct page *alloc_huge_page(struct vm_area_struct *vma, 356static struct page *alloc_huge_page(struct vm_area_struct *vma,
139 unsigned long addr) 357 unsigned long addr)
140{ 358{
141 struct page *page; 359 struct page *page = NULL;
360 int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
142 361
143 spin_lock(&hugetlb_lock); 362 spin_lock(&hugetlb_lock);
144 if (vma->vm_flags & VM_MAYSHARE) 363 if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
145 resv_huge_pages--;
146 else if (free_huge_pages <= resv_huge_pages)
147 goto fail; 364 goto fail;
148 365
149 page = dequeue_huge_page(vma, addr); 366 page = dequeue_huge_page(vma, addr);
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
155 return page; 372 return page;
156 373
157fail: 374fail:
158 if (vma->vm_flags & VM_MAYSHARE)
159 resv_huge_pages++;
160 spin_unlock(&hugetlb_lock); 375 spin_unlock(&hugetlb_lock);
161 return NULL; 376
377 /*
378 * Private mappings do not use reserved huge pages so the allocation
379 * may have failed due to an undersized hugetlb pool. Try to grab a
380 * surplus huge page from the buddy allocator.
381 */
382 if (!use_reserved_page)
383 page = alloc_buddy_huge_page(vma, addr);
384
385 return page;
162} 386}
163 387
164static int __init hugetlb_init(void) 388static int __init hugetlb_init(void)
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void)
171 for (i = 0; i < MAX_NUMNODES; ++i) 395 for (i = 0; i < MAX_NUMNODES; ++i)
172 INIT_LIST_HEAD(&hugepage_freelists[i]); 396 INIT_LIST_HEAD(&hugepage_freelists[i]);
173 397
398 hugetlb_next_nid = first_node(node_online_map);
399
174 for (i = 0; i < max_huge_pages; ++i) { 400 for (i = 0; i < max_huge_pages; ++i) {
175 if (!alloc_fresh_huge_page()) 401 if (!alloc_fresh_huge_page())
176 break; 402 break;
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
201} 427}
202 428
203#ifdef CONFIG_SYSCTL 429#ifdef CONFIG_SYSCTL
204static void update_and_free_page(struct page *page)
205{
206 int i;
207 nr_huge_pages--;
208 nr_huge_pages_node[page_to_nid(page)]--;
209 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
210 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
211 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
212 1 << PG_private | 1<< PG_writeback);
213 }
214 set_compound_page_dtor(page, NULL);
215 set_page_refcounted(page);
216 __free_pages(page, HUGETLB_PAGE_ORDER);
217}
218
219#ifdef CONFIG_HIGHMEM 430#ifdef CONFIG_HIGHMEM
220static void try_to_free_low(unsigned long count) 431static void try_to_free_low(unsigned long count)
221{ 432{
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count)
224 for (i = 0; i < MAX_NUMNODES; ++i) { 435 for (i = 0; i < MAX_NUMNODES; ++i) {
225 struct page *page, *next; 436 struct page *page, *next;
226 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 437 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
438 if (count >= nr_huge_pages)
439 return;
227 if (PageHighMem(page)) 440 if (PageHighMem(page))
228 continue; 441 continue;
229 list_del(&page->lru); 442 list_del(&page->lru);
230 update_and_free_page(page); 443 update_and_free_page(page);
231 free_huge_pages--; 444 free_huge_pages--;
232 free_huge_pages_node[page_to_nid(page)]--; 445 free_huge_pages_node[page_to_nid(page)]--;
233 if (count >= nr_huge_pages)
234 return;
235 } 446 }
236 } 447 }
237} 448}
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count)
241} 452}
242#endif 453#endif
243 454
455#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
244static unsigned long set_max_huge_pages(unsigned long count) 456static unsigned long set_max_huge_pages(unsigned long count)
245{ 457{
246 while (count > nr_huge_pages) { 458 unsigned long min_count, ret;
247 if (!alloc_fresh_huge_page())
248 return nr_huge_pages;
249 }
250 if (count >= nr_huge_pages)
251 return nr_huge_pages;
252 459
460 /*
461 * Increase the pool size
462 * First take pages out of surplus state. Then make up the
463 * remaining difference by allocating fresh huge pages.
464 */
253 spin_lock(&hugetlb_lock); 465 spin_lock(&hugetlb_lock);
254 count = max(count, resv_huge_pages); 466 while (surplus_huge_pages && count > persistent_huge_pages) {
255 try_to_free_low(count); 467 if (!adjust_pool_surplus(-1))
256 while (count < nr_huge_pages) { 468 break;
469 }
470
471 while (count > persistent_huge_pages) {
472 int ret;
473 /*
474 * If this allocation races such that we no longer need the
475 * page, free_huge_page will handle it by freeing the page
476 * and reducing the surplus.
477 */
478 spin_unlock(&hugetlb_lock);
479 ret = alloc_fresh_huge_page();
480 spin_lock(&hugetlb_lock);
481 if (!ret)
482 goto out;
483
484 }
485
486 /*
487 * Decrease the pool size
488 * First return free pages to the buddy allocator (being careful
489 * to keep enough around to satisfy reservations). Then place
490 * pages into surplus state as needed so the pool will shrink
491 * to the desired size as pages become free.
492 */
493 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
494 min_count = max(count, min_count);
495 try_to_free_low(min_count);
496 while (min_count < persistent_huge_pages) {
257 struct page *page = dequeue_huge_page(NULL, 0); 497 struct page *page = dequeue_huge_page(NULL, 0);
258 if (!page) 498 if (!page)
259 break; 499 break;
260 update_and_free_page(page); 500 update_and_free_page(page);
261 } 501 }
502 while (count < persistent_huge_pages) {
503 if (!adjust_pool_surplus(1))
504 break;
505 }
506out:
507 ret = persistent_huge_pages;
262 spin_unlock(&hugetlb_lock); 508 spin_unlock(&hugetlb_lock);
263 return nr_huge_pages; 509 return ret;
264} 510}
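
An illustrative walk-through of the resize logic above (numbers are made up): with nr_huge_pages = 10 and surplus_huge_pages = 3, persistent_huge_pages is 7. Growing the pool to count = 12 first converts the 3 surplus pages into persistent ones via adjust_pool_surplus(-1), then allocates 2 fresh huge pages. Shrinking to count = 5 computes min_count from the reservation and in-use pages, frees whole free huge pages down to that floor, and marks any remaining excess as surplus so those pages drift back to the buddy allocator as they are freed.
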
265 511
266int hugetlb_sysctl_handler(struct ctl_table *table, int write, 512int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf)
292 "HugePages_Total: %5lu\n" 538 "HugePages_Total: %5lu\n"
293 "HugePages_Free: %5lu\n" 539 "HugePages_Free: %5lu\n"
294 "HugePages_Rsvd: %5lu\n" 540 "HugePages_Rsvd: %5lu\n"
541 "HugePages_Surp: %5lu\n"
295 "Hugepagesize: %5lu kB\n", 542 "Hugepagesize: %5lu kB\n",
296 nr_huge_pages, 543 nr_huge_pages,
297 free_huge_pages, 544 free_huge_pages,
298 resv_huge_pages, 545 resv_huge_pages,
546 surplus_huge_pages,
299 HPAGE_SIZE/1024); 547 HPAGE_SIZE/1024);
300} 548}
301 549
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
355 entry = pte_mkwrite(pte_mkdirty(*ptep)); 603 entry = pte_mkwrite(pte_mkdirty(*ptep));
356 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 604 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
357 update_mmu_cache(vma, address, entry); 605 update_mmu_cache(vma, address, entry);
358 lazy_mmu_prot_update(entry);
359 } 606 }
360} 607}
361 608
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
708 pte = huge_ptep_get_and_clear(mm, address, ptep); 955 pte = huge_ptep_get_and_clear(mm, address, ptep);
709 pte = pte_mkhuge(pte_modify(pte, newprot)); 956 pte = pte_mkhuge(pte_modify(pte, newprot));
710 set_huge_pte_at(mm, address, ptep, pte); 957 set_huge_pte_at(mm, address, ptep, pte);
711 lazy_mmu_prot_update(pte);
712 } 958 }
713 } 959 }
714 spin_unlock(&mm->page_table_lock); 960 spin_unlock(&mm->page_table_lock);
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta)
843 int ret = -ENOMEM; 1089 int ret = -ENOMEM;
844 1090
845 spin_lock(&hugetlb_lock); 1091 spin_lock(&hugetlb_lock);
846 if ((delta + resv_huge_pages) <= free_huge_pages) {
847 resv_huge_pages += delta;
848 ret = 0;
849 }
850 spin_unlock(&hugetlb_lock);
851 return ret;
852}
853
854int hugetlb_reserve_pages(struct inode *inode, long from, long to)
855{
856 long ret, chg;
857
858 chg = region_chg(&inode->i_mapping->private_list, from, to);
859 if (chg < 0)
860 return chg;
861 /* 1092 /*
862 * When cpuset is configured, it breaks the strict hugetlb page 1093 * When cpuset is configured, it breaks the strict hugetlb page
863 * reservation as the accounting is done on a global variable. Such 1094 * reservation as the accounting is done on a global variable. Such
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
875 * a best attempt and hopefully to minimize the impact of changing 1106 * a best attempt and hopefully to minimize the impact of changing
876 * semantics that cpuset has. 1107 * semantics that cpuset has.
877 */ 1108 */
878 if (chg > cpuset_mems_nr(free_huge_pages_node)) 1109 if (delta > 0) {
879 return -ENOMEM; 1110 if (gather_surplus_pages(delta) < 0)
1111 goto out;
1112
1113 if (delta > cpuset_mems_nr(free_huge_pages_node))
1114 goto out;
1115 }
1116
1117 ret = 0;
1118 resv_huge_pages += delta;
1119 if (delta < 0)
1120 return_unused_surplus_pages((unsigned long) -delta);
1121
1122out:
1123 spin_unlock(&hugetlb_lock);
1124 return ret;
1125}
1126
1127int hugetlb_reserve_pages(struct inode *inode, long from, long to)
1128{
1129 long ret, chg;
1130
1131 chg = region_chg(&inode->i_mapping->private_list, from, to);
1132 if (chg < 0)
1133 return chg;
880 1134
881 ret = hugetlb_acct_memory(chg); 1135 ret = hugetlb_acct_memory(chg);
882 if (ret < 0) 1136 if (ret < 0)
diff --git a/mm/internal.h b/mm/internal.h
index a3110c02aea7..953f941ea867 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,4 +37,14 @@ static inline void __put_page(struct page *page)
37extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
38 unsigned int order); 38 unsigned int order);
39 39
40/*
41 * function for dealing with page's order in buddy system.
42 * zone->lock is already acquired when we use these.
43 * So, we don't need atomic page->flags operations here.
44 */
45static inline unsigned long page_order(struct page *page)
46{
47 VM_BUG_ON(!PageBuddy(page));
48 return page_private(page);
49}
40#endif 50#endif
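
A toy userspace sketch of the idea behind the page_order() helper added above: the buddy order is stashed in a per-page private field and may only be read back while the page is marked as a buddy. struct fake_page and its fields are assumptions for illustration, not the real struct page:

#include <assert.h>
#include <stdio.h>

struct fake_page {
	int is_buddy;                   /* stands in for PageBuddy() */
	unsigned long private;          /* stands in for page_private() */
};

static unsigned long page_order(const struct fake_page *page)
{
	assert(page->is_buddy);         /* mirrors the VM_BUG_ON() above */
	return page->private;
}

int main(void)
{
	struct fake_page p = { .is_buddy = 1, .private = 3 };

	printf("page heads a free block of 2^%lu pages\n", page_order(&p));
	return 0;
}
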
diff --git a/mm/memory.c b/mm/memory.c
index f82b359b2745..bd16dcaeefb8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
966 * has touched so far, we don't want to allocate page tables. 966 * has touched so far, we don't want to allocate page tables.
967 */ 967 */
968 if (flags & FOLL_ANON) { 968 if (flags & FOLL_ANON) {
969 page = ZERO_PAGE(address); 969 page = ZERO_PAGE(0);
970 if (flags & FOLL_GET) 970 if (flags & FOLL_GET)
971 get_page(page); 971 get_page(page);
972 BUG_ON(flags & FOLL_WRITE); 972 BUG_ON(flags & FOLL_WRITE);
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1111} 1111}
1112EXPORT_SYMBOL(get_user_pages); 1112EXPORT_SYMBOL(get_user_pages);
1113 1113
1114static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1115 unsigned long addr, unsigned long end, pgprot_t prot)
1116{
1117 pte_t *pte;
1118 spinlock_t *ptl;
1119 int err = 0;
1120
1121 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1122 if (!pte)
1123 return -EAGAIN;
1124 arch_enter_lazy_mmu_mode();
1125 do {
1126 struct page *page = ZERO_PAGE(addr);
1127 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1128
1129 if (unlikely(!pte_none(*pte))) {
1130 err = -EEXIST;
1131 pte++;
1132 break;
1133 }
1134 page_cache_get(page);
1135 page_add_file_rmap(page);
1136 inc_mm_counter(mm, file_rss);
1137 set_pte_at(mm, addr, pte, zero_pte);
1138 } while (pte++, addr += PAGE_SIZE, addr != end);
1139 arch_leave_lazy_mmu_mode();
1140 pte_unmap_unlock(pte - 1, ptl);
1141 return err;
1142}
1143
1144static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1145 unsigned long addr, unsigned long end, pgprot_t prot)
1146{
1147 pmd_t *pmd;
1148 unsigned long next;
1149 int err;
1150
1151 pmd = pmd_alloc(mm, pud, addr);
1152 if (!pmd)
1153 return -EAGAIN;
1154 do {
1155 next = pmd_addr_end(addr, end);
1156 err = zeromap_pte_range(mm, pmd, addr, next, prot);
1157 if (err)
1158 break;
1159 } while (pmd++, addr = next, addr != end);
1160 return err;
1161}
1162
1163static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1164 unsigned long addr, unsigned long end, pgprot_t prot)
1165{
1166 pud_t *pud;
1167 unsigned long next;
1168 int err;
1169
1170 pud = pud_alloc(mm, pgd, addr);
1171 if (!pud)
1172 return -EAGAIN;
1173 do {
1174 next = pud_addr_end(addr, end);
1175 err = zeromap_pmd_range(mm, pud, addr, next, prot);
1176 if (err)
1177 break;
1178 } while (pud++, addr = next, addr != end);
1179 return err;
1180}
1181
1182int zeromap_page_range(struct vm_area_struct *vma,
1183 unsigned long addr, unsigned long size, pgprot_t prot)
1184{
1185 pgd_t *pgd;
1186 unsigned long next;
1187 unsigned long end = addr + size;
1188 struct mm_struct *mm = vma->vm_mm;
1189 int err;
1190
1191 BUG_ON(addr >= end);
1192 pgd = pgd_offset(mm, addr);
1193 flush_cache_range(vma, addr, end);
1194 do {
1195 next = pgd_addr_end(addr, end);
1196 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1197 if (err)
1198 break;
1199 } while (pgd++, addr = next, addr != end);
1200 return err;
1201}
1202
1203pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1114pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1204{ 1115{
1205 pgd_t * pgd = pgd_offset(mm, addr); 1116 pgd_t * pgd = pgd_offset(mm, addr);
@@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1700 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1611 flush_cache_page(vma, address, pte_pfn(orig_pte));
1701 entry = pte_mkyoung(orig_pte); 1612 entry = pte_mkyoung(orig_pte);
1702 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1613 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1703 if (ptep_set_access_flags(vma, address, page_table, entry,1)) { 1614 if (ptep_set_access_flags(vma, address, page_table, entry,1))
1704 update_mmu_cache(vma, address, entry); 1615 update_mmu_cache(vma, address, entry);
1705 lazy_mmu_prot_update(entry);
1706 }
1707 ret |= VM_FAULT_WRITE; 1616 ret |= VM_FAULT_WRITE;
1708 goto unlock; 1617 goto unlock;
1709 } 1618 }
@@ -1717,16 +1626,11 @@ gotten:
1717 1626
1718 if (unlikely(anon_vma_prepare(vma))) 1627 if (unlikely(anon_vma_prepare(vma)))
1719 goto oom; 1628 goto oom;
1720 if (old_page == ZERO_PAGE(address)) { 1629 VM_BUG_ON(old_page == ZERO_PAGE(0));
1721 new_page = alloc_zeroed_user_highpage_movable(vma, address); 1630 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1722 if (!new_page) 1631 if (!new_page)
1723 goto oom; 1632 goto oom;
1724 } else { 1633 cow_user_page(new_page, old_page, address, vma);
1725 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1726 if (!new_page)
1727 goto oom;
1728 cow_user_page(new_page, old_page, address, vma);
1729 }
1730 1634
1731 /* 1635 /*
1732 * Re-check the pte - we dropped the lock 1636 * Re-check the pte - we dropped the lock
@@ -1744,7 +1648,6 @@ gotten:
1744 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1648 flush_cache_page(vma, address, pte_pfn(orig_pte));
1745 entry = mk_pte(new_page, vma->vm_page_prot); 1649 entry = mk_pte(new_page, vma->vm_page_prot);
1746 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1650 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1747 lazy_mmu_prot_update(entry);
1748 /* 1651 /*
1749 * Clear the pte entry and flush it first, before updating the 1652 * Clear the pte entry and flush it first, before updating the
1750 * pte with the new entry. This will avoid a race condition 1653 * pte with the new entry. This will avoid a race condition
@@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2252 spinlock_t *ptl; 2155 spinlock_t *ptl;
2253 pte_t entry; 2156 pte_t entry;
2254 2157
2255 if (write_access) { 2158 /* Allocate our own private page. */
2256 /* Allocate our own private page. */ 2159 pte_unmap(page_table);
2257 pte_unmap(page_table);
2258
2259 if (unlikely(anon_vma_prepare(vma)))
2260 goto oom;
2261 page = alloc_zeroed_user_highpage_movable(vma, address);
2262 if (!page)
2263 goto oom;
2264
2265 entry = mk_pte(page, vma->vm_page_prot);
2266 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2267 2160
2268 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2161 if (unlikely(anon_vma_prepare(vma)))
2269 if (!pte_none(*page_table)) 2162 goto oom;
2270 goto release; 2163 page = alloc_zeroed_user_highpage_movable(vma, address);
2271 inc_mm_counter(mm, anon_rss); 2164 if (!page)
2272 lru_cache_add_active(page); 2165 goto oom;
2273 page_add_new_anon_rmap(page, vma, address);
2274 } else {
2275 /* Map the ZERO_PAGE - vm_page_prot is readonly */
2276 page = ZERO_PAGE(address);
2277 page_cache_get(page);
2278 entry = mk_pte(page, vma->vm_page_prot);
2279 2166
2280 ptl = pte_lockptr(mm, pmd); 2167 entry = mk_pte(page, vma->vm_page_prot);
2281 spin_lock(ptl); 2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2282 if (!pte_none(*page_table))
2283 goto release;
2284 inc_mm_counter(mm, file_rss);
2285 page_add_file_rmap(page);
2286 }
2287 2169
2170 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2171 if (!pte_none(*page_table))
2172 goto release;
2173 inc_mm_counter(mm, anon_rss);
2174 lru_cache_add_active(page);
2175 page_add_new_anon_rmap(page, vma, address);
2288 set_pte_at(mm, address, page_table, entry); 2176 set_pte_at(mm, address, page_table, entry);
2289 2177
2290 /* No need to invalidate - it was non-present before */ 2178 /* No need to invalidate - it was non-present before */
2291 update_mmu_cache(vma, address, entry); 2179 update_mmu_cache(vma, address, entry);
2292 lazy_mmu_prot_update(entry);
2293unlock: 2180unlock:
2294 pte_unmap_unlock(page_table, ptl); 2181 pte_unmap_unlock(page_table, ptl);
2295 return 0; 2182 return 0;
@@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2442 2329
2443 /* no need to invalidate: a not-present page won't be cached */ 2330 /* no need to invalidate: a not-present page won't be cached */
2444 update_mmu_cache(vma, address, entry); 2331 update_mmu_cache(vma, address, entry);
2445 lazy_mmu_prot_update(entry);
2446 } else { 2332 } else {
2447 if (anon) 2333 if (anon)
2448 page_cache_release(page); 2334 page_cache_release(page);
@@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2470 int write_access, pte_t orig_pte) 2356 int write_access, pte_t orig_pte)
2471{ 2357{
2472 pgoff_t pgoff = (((address & PAGE_MASK) 2358 pgoff_t pgoff = (((address & PAGE_MASK)
2473 - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; 2359 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2474 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); 2360 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2475 2361
2476 pte_unmap(page_table); 2362 pte_unmap(page_table);
@@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 entry = pte_mkyoung(entry); 2500 entry = pte_mkyoung(entry);
2615 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { 2501 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2616 update_mmu_cache(vma, address, entry); 2502 update_mmu_cache(vma, address, entry);
2617 lazy_mmu_prot_update(entry);
2618 } else { 2503 } else {
2619 /* 2504 /*
2620 * This is needed only for protection faults but the arch code 2505 * This is needed only for protection faults but the arch code
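
A small standalone sketch of the do_linear_fault() offset calculation shown above (now using PAGE_SHIFT rather than PAGE_CACHE_SHIFT); the mapping start, file offset and faulting address are made-up numbers used only to show the arithmetic:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

int main(void)
{
	uint64_t vm_start = 0x700000000000ULL;  /* assumed vma->vm_start */
	uint64_t vm_pgoff = 16;                 /* assumed vma->vm_pgoff */
	uint64_t address  = 0x700000003abcULL;  /* assumed faulting address */

	uint64_t pgoff = (((address & PAGE_MASK) - vm_start) >> PAGE_SHIFT)
			 + vm_pgoff;

	printf("pgoff = %llu\n", (unsigned long long)pgoff);    /* 3 + 16 = 19 */
	return 0;
}
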
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index df9d554bea30..091b9c6c2529 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -23,6 +23,9 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/ioport.h> 24#include <linux/ioport.h>
25#include <linux/cpuset.h> 25#include <linux/cpuset.h>
26#include <linux/delay.h>
27#include <linux/migrate.h>
28#include <linux/page-isolation.h>
26 29
27#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
28 31
@@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
161 pgdat->node_start_pfn; 164 pgdat->node_start_pfn;
162} 165}
163 166
164int online_pages(unsigned long pfn, unsigned long nr_pages) 167static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
168 void *arg)
165{ 169{
166 unsigned long i; 170 unsigned long i;
171 unsigned long onlined_pages = *(unsigned long *)arg;
172 struct page *page;
173 if (PageReserved(pfn_to_page(start_pfn)))
174 for (i = 0; i < nr_pages; i++) {
175 page = pfn_to_page(start_pfn + i);
176 online_page(page);
177 onlined_pages++;
178 }
179 *(unsigned long *)arg = onlined_pages;
180 return 0;
181}
182
183
184int online_pages(unsigned long pfn, unsigned long nr_pages)
185{
167 unsigned long flags; 186 unsigned long flags;
168 unsigned long onlined_pages = 0; 187 unsigned long onlined_pages = 0;
169 struct resource res;
170 u64 section_end;
171 unsigned long start_pfn;
172 struct zone *zone; 188 struct zone *zone;
173 int need_zonelists_rebuild = 0; 189 int need_zonelists_rebuild = 0;
174 190
@@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
191 if (!populated_zone(zone)) 207 if (!populated_zone(zone))
192 need_zonelists_rebuild = 1; 208 need_zonelists_rebuild = 1;
193 209
194 res.start = (u64)pfn << PAGE_SHIFT; 210 walk_memory_resource(pfn, nr_pages, &onlined_pages,
195 res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; 211 online_pages_range);
196 res.flags = IORESOURCE_MEM; /* we just need system ram */
197 section_end = res.end;
198
199 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
200 start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
201 nr_pages = (unsigned long)
202 ((res.end + 1 - res.start) >> PAGE_SHIFT);
203
204 if (PageReserved(pfn_to_page(start_pfn))) {
205 /* this region's page is not onlined now */
206 for (i = 0; i < nr_pages; i++) {
207 struct page *page = pfn_to_page(start_pfn + i);
208 online_page(page);
209 onlined_pages++;
210 }
211 }
212
213 res.start = res.end + 1;
214 res.end = section_end;
215 }
216 zone->present_pages += onlined_pages; 212 zone->present_pages += onlined_pages;
217 zone->zone_pgdat->node_present_pages += onlined_pages; 213 zone->zone_pgdat->node_present_pages += onlined_pages;
218 214
219 setup_per_zone_pages_min(); 215 setup_per_zone_pages_min();
216 if (onlined_pages) {
217 kswapd_run(zone_to_nid(zone));
218 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
219 }
220 220
221 if (need_zonelists_rebuild) 221 if (need_zonelists_rebuild)
222 build_all_zonelists(); 222 build_all_zonelists();
@@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size)
271 if (!pgdat) 271 if (!pgdat)
272 return -ENOMEM; 272 return -ENOMEM;
273 new_pgdat = 1; 273 new_pgdat = 1;
274 ret = kswapd_run(nid);
275 if (ret)
276 goto error;
277 } 274 }
278 275
279 /* call arch's memory hotadd */ 276 /* call arch's memory hotadd */
@@ -308,3 +305,260 @@ error:
308 return ret; 305 return ret;
309} 306}
310EXPORT_SYMBOL_GPL(add_memory); 307EXPORT_SYMBOL_GPL(add_memory);
308
309#ifdef CONFIG_MEMORY_HOTREMOVE
310/*
311 * Confirm that all pages in the range [start, end) belong to the same zone.
312 */
313static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
314{
315 unsigned long pfn;
316 struct zone *zone = NULL;
317 struct page *page;
318 int i;
319 for (pfn = start_pfn;
320 pfn < end_pfn;
321 pfn += MAX_ORDER_NR_PAGES) {
322 i = 0;
323 /* This is just a CONFIG_HOLES_IN_ZONE check.*/
324 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
325 i++;
326 if (i == MAX_ORDER_NR_PAGES)
327 continue;
328 page = pfn_to_page(pfn + i);
329 if (zone && page_zone(page) != zone)
330 return 0;
331 zone = page_zone(page);
332 }
333 return 1;
334}
335
336/*
337 * Scanning by pfn is much easier than scanning the lru list.
338 * Scan pfns from start to end and return the first LRU page found.
339 */
340int scan_lru_pages(unsigned long start, unsigned long end)
341{
342 unsigned long pfn;
343 struct page *page;
344 for (pfn = start; pfn < end; pfn++) {
345 if (pfn_valid(pfn)) {
346 page = pfn_to_page(pfn);
347 if (PageLRU(page))
348 return pfn;
349 }
350 }
351 return 0;
352}
353
354static struct page *
355hotremove_migrate_alloc(struct page *page,
356 unsigned long private,
357 int **x)
358{
359 /* This should be improved!! */
360 return alloc_page(GFP_HIGHUSER_PAGECACHE);
361}
362
363
364#define NR_OFFLINE_AT_ONCE_PAGES (256)
365static int
366do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
367{
368 unsigned long pfn;
369 struct page *page;
370 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
371 int not_managed = 0;
372 int ret = 0;
373 LIST_HEAD(source);
374
375 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
376 if (!pfn_valid(pfn))
377 continue;
378 page = pfn_to_page(pfn);
379 if (!page_count(page))
380 continue;
381 /*
382 * We can skip free pages. And we can only deal with pages on
383 * LRU.
384 */
385 ret = isolate_lru_page(page, &source);
386 if (!ret) { /* Success */
387 move_pages--;
388 } else {
389 /* Because we don't hold the big zone->lock, we should
390 check this again here. */
391 if (page_count(page))
392 not_managed++;
393#ifdef CONFIG_DEBUG_VM
394 printk(KERN_INFO "removing from LRU failed"
395 " %lx/%d/%lx\n",
396 pfn, page_count(page), page->flags);
397#endif
398 }
399 }
400 ret = -EBUSY;
401 if (not_managed) {
402 if (!list_empty(&source))
403 putback_lru_pages(&source);
404 goto out;
405 }
406 ret = 0;
407 if (list_empty(&source))
408 goto out;
409 /* this function returns # of failed pages */
410 ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
411
412out:
413 return ret;
414}
415
416/*
417 * remove from free_area[] and mark all as Reserved.
418 */
419static int
420offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
421 void *data)
422{
423 __offline_isolated_pages(start, start + nr_pages);
424 return 0;
425}
426
427static void
428offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
429{
430 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
431 offline_isolated_pages_cb);
432}
433
434/*
435 * Check that all pages in the range, recorded as a memory resource, are isolated.
436 */
437static int
438check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
439 void *data)
440{
441 int ret;
442 long offlined = *(long *)data;
443 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
444 offlined = nr_pages;
445 if (!ret)
446 *(long *)data += offlined;
447 return ret;
448}
449
450static long
451check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
452{
453 long offlined = 0;
454 int ret;
455
456 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
457 check_pages_isolated_cb);
458 if (ret < 0)
459 offlined = (long)ret;
460 return offlined;
461}
462
463extern void drain_all_local_pages(void);
464
465int offline_pages(unsigned long start_pfn,
466 unsigned long end_pfn, unsigned long timeout)
467{
468 unsigned long pfn, nr_pages, expire;
469 long offlined_pages;
470 int ret, drain, retry_max;
471 struct zone *zone;
472
473 BUG_ON(start_pfn >= end_pfn);
474 /* at least, alignment against pageblock is necessary */
475 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
476 return -EINVAL;
477 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
478 return -EINVAL;
479 /* This makes hotplug much easier... and readable.
480 We assume this for now. */
481 if (!test_pages_in_a_zone(start_pfn, end_pfn))
482 return -EINVAL;
483 /* set above range as isolated */
484 ret = start_isolate_page_range(start_pfn, end_pfn);
485 if (ret)
486 return ret;
487 nr_pages = end_pfn - start_pfn;
488 pfn = start_pfn;
489 expire = jiffies + timeout;
490 drain = 0;
491 retry_max = 5;
492repeat:
493 /* start memory hot removal */
494 ret = -EAGAIN;
495 if (time_after(jiffies, expire))
496 goto failed_removal;
497 ret = -EINTR;
498 if (signal_pending(current))
499 goto failed_removal;
500 ret = 0;
501 if (drain) {
502 lru_add_drain_all();
503 flush_scheduled_work();
504 cond_resched();
505 drain_all_local_pages();
506 }
507
508 pfn = scan_lru_pages(start_pfn, end_pfn);
509 if (pfn) { /* We have page on LRU */
510 ret = do_migrate_range(pfn, end_pfn);
511 if (!ret) {
512 drain = 1;
513 goto repeat;
514 } else {
515 if (ret < 0)
516 if (--retry_max == 0)
517 goto failed_removal;
518 yield();
519 drain = 1;
520 goto repeat;
521 }
522 }
523 /* drain every zone's lru pagevecs; this is asynchronous... */
524 lru_add_drain_all();
525 flush_scheduled_work();
526 yield();
527 /* drain pcp pages; this is synchronous. */
528 drain_all_local_pages();
529 /* check again */
530 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
531 if (offlined_pages < 0) {
532 ret = -EBUSY;
533 goto failed_removal;
534 }
535 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
536 /* OK, all of our target pages are isolated.
537 We cannot do rollback at this point. */
538 offline_isolated_pages(start_pfn, end_pfn);
539 /* reset pagetype flags */
540 start_isolate_page_range(start_pfn, end_pfn);
541 /* removal success */
542 zone = page_zone(pfn_to_page(start_pfn));
543 zone->present_pages -= offlined_pages;
544 zone->zone_pgdat->node_present_pages -= offlined_pages;
545 totalram_pages -= offlined_pages;
546 num_physpages -= offlined_pages;
547 vm_total_pages = nr_free_pagecache_pages();
548 writeback_set_ratelimit();
549 return 0;
550
551failed_removal:
552 printk(KERN_INFO "memory offlining %lx to %lx failed\n",
553 start_pfn, end_pfn);
554 /* pushback to free area */
555 undo_isolate_page_range(start_pfn, end_pfn);
556 return ret;
557}
558#else
559int remove_memory(u64 start, u64 size)
560{
561 return -EINVAL;
562}
563EXPORT_SYMBOL_GPL(remove_memory);
564#endif /* CONFIG_MEMORY_HOTREMOVE */
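
A minimal check of the pageblock alignment rule that offline_pages() enforces above; PAGEBLOCK_NR_PAGES is an assumed constant here (the real pageblock_nr_pages depends on the architecture and config), and the pfn values are arbitrary:

#include <stdio.h>
#include <stdbool.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* assumption for illustration */
#define IS_ALIGNED(x, a)   (((x) & ((a) - 1)) == 0)

int main(void)
{
	unsigned long start_pfn = 0x40000, end_pfn = 0x40200;

	bool ok = IS_ALIGNED(start_pfn, PAGEBLOCK_NR_PAGES) &&
		  IS_ALIGNED(end_pfn, PAGEBLOCK_NR_PAGES);

	printf("range [%lx, %lx) %s\n", start_pfn, end_pfn,
	       ok ? "is pageblock aligned" : "would be rejected with -EINVAL");
	return 0;
}
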
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3d6ac9505d07..568152ae6caf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -72,7 +72,6 @@
72#include <linux/hugetlb.h> 72#include <linux/hugetlb.h>
73#include <linux/kernel.h> 73#include <linux/kernel.h>
74#include <linux/sched.h> 74#include <linux/sched.h>
75#include <linux/mm.h>
76#include <linux/nodemask.h> 75#include <linux/nodemask.h>
77#include <linux/cpuset.h> 76#include <linux/cpuset.h>
78#include <linux/gfp.h> 77#include <linux/gfp.h>
@@ -82,13 +81,13 @@
82#include <linux/interrupt.h> 81#include <linux/interrupt.h>
83#include <linux/init.h> 82#include <linux/init.h>
84#include <linux/compat.h> 83#include <linux/compat.h>
85#include <linux/mempolicy.h>
86#include <linux/swap.h> 84#include <linux/swap.h>
87#include <linux/seq_file.h> 85#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 86#include <linux/proc_fs.h>
89#include <linux/migrate.h> 87#include <linux/migrate.h>
90#include <linux/rmap.h> 88#include <linux/rmap.h>
91#include <linux/security.h> 89#include <linux/security.h>
90#include <linux/syscalls.h>
92 91
93#include <asm/tlbflush.h> 92#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 93#include <asm/uaccess.h>
@@ -110,6 +109,9 @@ struct mempolicy default_policy = {
110 .policy = MPOL_DEFAULT, 109 .policy = MPOL_DEFAULT,
111}; 110};
112 111
112static void mpol_rebind_policy(struct mempolicy *pol,
113 const nodemask_t *newmask);
114
113/* Do sanity checking on a policy */ 115/* Do sanity checking on a policy */
114static int mpol_check_policy(int mode, nodemask_t *nodes) 116static int mpol_check_policy(int mode, nodemask_t *nodes)
115{ 117{
@@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
128 return -EINVAL; 130 return -EINVAL;
129 break; 131 break;
130 } 132 }
131 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; 133 return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
132} 134}
133 135
134/* Generate a custom zonelist for the BIND policy. */ 136/* Generate a custom zonelist for the BIND policy. */
@@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
185 switch (mode) { 187 switch (mode) {
186 case MPOL_INTERLEAVE: 188 case MPOL_INTERLEAVE:
187 policy->v.nodes = *nodes; 189 policy->v.nodes = *nodes;
188 if (nodes_weight(*nodes) == 0) { 190 nodes_and(policy->v.nodes, policy->v.nodes,
191 node_states[N_HIGH_MEMORY]);
192 if (nodes_weight(policy->v.nodes) == 0) {
189 kmem_cache_free(policy_cache, policy); 193 kmem_cache_free(policy_cache, policy);
190 return ERR_PTR(-EINVAL); 194 return ERR_PTR(-EINVAL);
191 } 195 }
@@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void)
459} 463}
460 464
461/* Set the process memory policy */ 465/* Set the process memory policy */
462long do_set_mempolicy(int mode, nodemask_t *nodes) 466static long do_set_mempolicy(int mode, nodemask_t *nodes)
463{ 467{
464 struct mempolicy *new; 468 struct mempolicy *new;
465 469
@@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
494 *nodes = p->v.nodes; 498 *nodes = p->v.nodes;
495 break; 499 break;
496 case MPOL_PREFERRED: 500 case MPOL_PREFERRED:
497 /* or use current node instead of online map? */ 501 /* or use current node instead of memory_map? */
498 if (p->v.preferred_node < 0) 502 if (p->v.preferred_node < 0)
499 *nodes = node_online_map; 503 *nodes = node_states[N_HIGH_MEMORY];
500 else 504 else
501 node_set(p->v.preferred_node, *nodes); 505 node_set(p->v.preferred_node, *nodes);
502 break; 506 break;
@@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
519} 523}
520 524
521/* Retrieve NUMA policy */ 525/* Retrieve NUMA policy */
522long do_get_mempolicy(int *policy, nodemask_t *nmask, 526static long do_get_mempolicy(int *policy, nodemask_t *nmask,
523 unsigned long addr, unsigned long flags) 527 unsigned long addr, unsigned long flags)
524{ 528{
525 int err; 529 int err;
526 struct mm_struct *mm = current->mm; 530 struct mm_struct *mm = current->mm;
@@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
528 struct mempolicy *pol = current->mempolicy; 532 struct mempolicy *pol = current->mempolicy;
529 533
530 cpuset_update_task_memory_state(); 534 cpuset_update_task_memory_state();
531 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 535 if (flags &
536 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
532 return -EINVAL; 537 return -EINVAL;
538
539 if (flags & MPOL_F_MEMS_ALLOWED) {
540 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
541 return -EINVAL;
542 *policy = 0; /* just so it's initialized */
543 *nmask = cpuset_current_mems_allowed;
544 return 0;
545 }
546
533 if (flags & MPOL_F_ADDR) { 547 if (flags & MPOL_F_ADDR) {
534 down_read(&mm->mmap_sem); 548 down_read(&mm->mmap_sem);
535 vma = find_vma_intersection(mm, addr, addr+1); 549 vma = find_vma_intersection(mm, addr, addr+1);
@@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
601 * Migrate pages from one node to a target node. 615 * Migrate pages from one node to a target node.
602 * Returns error or the number of pages not migrated. 616 * Returns error or the number of pages not migrated.
603 */ 617 */
604int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) 618static int migrate_to_node(struct mm_struct *mm, int source, int dest,
619 int flags)
605{ 620{
606 nodemask_t nmask; 621 nodemask_t nmask;
607 LIST_HEAD(pagelist); 622 LIST_HEAD(pagelist);
@@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
732} 747}
733#endif 748#endif
734 749
735long do_mbind(unsigned long start, unsigned long len, 750static long do_mbind(unsigned long start, unsigned long len,
736 unsigned long mode, nodemask_t *nmask, unsigned long flags) 751 unsigned long mode, nodemask_t *nmask,
752 unsigned long flags)
737{ 753{
738 struct vm_area_struct *vma; 754 struct vm_area_struct *vma;
739 struct mm_struct *mm = current->mm; 755 struct mm_struct *mm = current->mm;
@@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
955 goto out; 971 goto out;
956 } 972 }
957 973
958 if (!nodes_subset(new, node_online_map)) { 974 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
959 err = -EINVAL; 975 err = -EINVAL;
960 goto out; 976 goto out;
961 } 977 }
@@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
978 unsigned long maxnode, 994 unsigned long maxnode,
979 unsigned long addr, unsigned long flags) 995 unsigned long addr, unsigned long flags)
980{ 996{
981 int err, pval; 997 int err;
998 int uninitialized_var(pval);
982 nodemask_t nodes; 999 nodemask_t nodes;
983 1000
984 if (nmask != NULL && maxnode < MAX_NUMNODES) 1001 if (nmask != NULL && maxnode < MAX_NUMNODES)
@@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1527 kmem_cache_free(sn_cache, n); 1544 kmem_cache_free(sn_cache, n);
1528} 1545}
1529 1546
1530struct sp_node * 1547static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1531sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) 1548 struct mempolicy *pol)
1532{ 1549{
1533 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 1550 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1534 1551
@@ -1677,7 +1694,7 @@ void __init numa_policy_init(void)
1677 * fall back to the largest node if they're all smaller. 1694 * fall back to the largest node if they're all smaller.
1678 */ 1695 */
1679 nodes_clear(interleave_nodes); 1696 nodes_clear(interleave_nodes);
1680 for_each_online_node(nid) { 1697 for_each_node_state(nid, N_HIGH_MEMORY) {
1681 unsigned long total_pages = node_present_pages(nid); 1698 unsigned long total_pages = node_present_pages(nid);
1682 1699
1683 /* Preserve the largest node */ 1700 /* Preserve the largest node */
@@ -1706,7 +1723,8 @@ void numa_default_policy(void)
1706} 1723}
1707 1724
1708/* Migrate a policy to a different set of nodes */ 1725/* Migrate a policy to a different set of nodes */
1709void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 1726static void mpol_rebind_policy(struct mempolicy *pol,
1727 const nodemask_t *newmask)
1710{ 1728{
1711 nodemask_t *mpolmask; 1729 nodemask_t *mpolmask;
1712 nodemask_t tmp; 1730 nodemask_t tmp;
@@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v)
1963 seq_printf(m, " huge"); 1981 seq_printf(m, " huge");
1964 } else { 1982 } else {
1965 check_pgd_range(vma, vma->vm_start, vma->vm_end, 1983 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1966 &node_online_map, MPOL_MF_STATS, md); 1984 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
1967 } 1985 }
1968 1986
1969 if (!md->pages) 1987 if (!md->pages)
@@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v)
1990 if (md->writeback) 2008 if (md->writeback)
1991 seq_printf(m," writeback=%lu", md->writeback); 2009 seq_printf(m," writeback=%lu", md->writeback);
1992 2010
1993 for_each_online_node(n) 2011 for_each_node_state(n, N_HIGH_MEMORY)
1994 if (md->node[n]) 2012 if (md->node[n])
1995 seq_printf(m, " N%d=%lu", n, md->node[n]); 2013 seq_printf(m, " N%d=%lu", n, md->node[n]);
1996out: 2014out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 07f22d4a431f..06d0877a66ef 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma,
171 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 171 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
172 if (is_write_migration_entry(entry)) 172 if (is_write_migration_entry(entry))
173 pte = pte_mkwrite(pte); 173 pte = pte_mkwrite(pte);
174 flush_cache_page(vma, addr, pte_pfn(pte));
174 set_pte_at(mm, addr, ptep, pte); 175 set_pte_at(mm, addr, ptep, pte);
175 176
176 if (PageAnon(new)) 177 if (PageAnon(new))
@@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
180 181
181 /* No need to invalidate - it was non-present before */ 182 /* No need to invalidate - it was non-present before */
182 update_mmu_cache(vma, addr, pte); 183 update_mmu_cache(vma, addr, pte);
183 lazy_mmu_prot_update(pte);
184 184
185out: 185out:
186 pte_unmap_unlock(ptep, ptl); 186 pte_unmap_unlock(ptep, ptl);
@@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
986 goto out; 986 goto out;
987 987
988 err = -ENODEV; 988 err = -ENODEV;
989 if (!node_online(node)) 989 if (!node_state(node, N_HIGH_MEMORY))
990 goto out; 990 goto out;
991 991
992 err = -EACCES; 992 err = -EACCES;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8346c30abec..1d4d69790e59 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
53 if (dirty_accountable && pte_dirty(ptent)) 53 if (dirty_accountable && pte_dirty(ptent))
54 ptent = pte_mkwrite(ptent); 54 ptent = pte_mkwrite(ptent);
55 set_pte_at(mm, addr, pte, ptent); 55 set_pte_at(mm, addr, pte, ptent);
56 lazy_mmu_prot_update(ptent);
57#ifdef CONFIG_MIGRATION 56#ifdef CONFIG_MIGRATION
58 } else if (!pte_file(oldpte)) { 57 } else if (!pte_file(oldpte)) {
59 swp_entry_t entry = pte_to_swp_entry(oldpte); 58 swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9b82ad5047f..41b4e362221d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -177,14 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
177{ 177{
178#ifdef CONFIG_NUMA 178#ifdef CONFIG_NUMA
179 struct zone **z; 179 struct zone **z;
180 nodemask_t nodes; 180 nodemask_t nodes = node_states[N_HIGH_MEMORY];
181 int node;
182
183 nodes_clear(nodes);
184 /* node has memory ? */
185 for_each_online_node(node)
186 if (NODE_DATA(node)->node_present_pages)
187 node_set(node, nodes);
188 181
189 for (z = zonelist->zones; *z; z++) 182 for (z = zonelist->zones; *z; z++)
190 if (cpuset_zone_allowed_softwall(*z, gfp_mask)) 183 if (cpuset_zone_allowed_softwall(*z, gfp_mask))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 44720363374c..d821321326e3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -126,7 +126,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
126 int node; 126 int node;
127 unsigned long x = 0; 127 unsigned long x = 0;
128 128
129 for_each_online_node(node) { 129 for_each_node_state(node, N_HIGH_MEMORY) {
130 struct zone *z = 130 struct zone *z =
131 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 131 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
132 132
@@ -1022,17 +1022,15 @@ int test_set_page_writeback(struct page *page)
1022EXPORT_SYMBOL(test_set_page_writeback); 1022EXPORT_SYMBOL(test_set_page_writeback);
1023 1023
1024/* 1024/*
1025 * Return true if any of the pages in the mapping are marged with the 1025 * Return true if any of the pages in the mapping are marked with the
1026 * passed tag. 1026 * passed tag.
1027 */ 1027 */
1028int mapping_tagged(struct address_space *mapping, int tag) 1028int mapping_tagged(struct address_space *mapping, int tag)
1029{ 1029{
1030 unsigned long flags;
1031 int ret; 1030 int ret;
1032 1031 rcu_read_lock();
1033 read_lock_irqsave(&mapping->tree_lock, flags);
1034 ret = radix_tree_tagged(&mapping->page_tree, tag); 1032 ret = radix_tree_tagged(&mapping->page_tree, tag);
1035 read_unlock_irqrestore(&mapping->tree_lock, flags); 1033 rcu_read_unlock();
1036 return ret; 1034 return ret;
1037} 1035}
1038EXPORT_SYMBOL(mapping_tagged); 1036EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a8c59571cb7..d315e1127dc9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -41,24 +41,37 @@
41#include <linux/pfn.h> 41#include <linux/pfn.h>
42#include <linux/backing-dev.h> 42#include <linux/backing-dev.h>
43#include <linux/fault-inject.h> 43#include <linux/fault-inject.h>
44#include <linux/page-isolation.h>
44 45
45#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
46#include <asm/div64.h> 47#include <asm/div64.h>
47#include "internal.h" 48#include "internal.h"
48 49
49/* 50/*
50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 51 * Array of node states.
51 * initializer cleaner
52 */ 52 */
53nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 53nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
54EXPORT_SYMBOL(node_online_map); 54 [N_POSSIBLE] = NODE_MASK_ALL,
55nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 55 [N_ONLINE] = { { [0] = 1UL } },
56EXPORT_SYMBOL(node_possible_map); 56#ifndef CONFIG_NUMA
57 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
58#ifdef CONFIG_HIGHMEM
59 [N_HIGH_MEMORY] = { { [0] = 1UL } },
60#endif
61 [N_CPU] = { { [0] = 1UL } },
62#endif /* NUMA */
63};
64EXPORT_SYMBOL(node_states);
65
57unsigned long totalram_pages __read_mostly; 66unsigned long totalram_pages __read_mostly;
58unsigned long totalreserve_pages __read_mostly; 67unsigned long totalreserve_pages __read_mostly;
59long nr_swap_pages; 68long nr_swap_pages;
60int percpu_pagelist_fraction; 69int percpu_pagelist_fraction;
61 70
71#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
72int pageblock_order __read_mostly;
73#endif
74
62static void __free_pages_ok(struct page *page, unsigned int order); 75static void __free_pages_ok(struct page *page, unsigned int order);
63 76
64/* 77/*
@@ -137,7 +150,7 @@ static unsigned long __meminitdata dma_reserve;
137 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 150 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
138#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 151#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
139 unsigned long __initdata required_kernelcore; 152 unsigned long __initdata required_kernelcore;
140 unsigned long __initdata required_movablecore; 153 static unsigned long __initdata required_movablecore;
141 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 154 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
142 155
143 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 156 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -150,6 +163,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES;
150EXPORT_SYMBOL(nr_node_ids); 163EXPORT_SYMBOL(nr_node_ids);
151#endif 164#endif
152 165
166int page_group_by_mobility_disabled __read_mostly;
167
168static void set_pageblock_migratetype(struct page *page, int migratetype)
169{
170 set_pageblock_flags_group(page, (unsigned long)migratetype,
171 PB_migrate, PB_migrate_end);
172}
173
153#ifdef CONFIG_DEBUG_VM 174#ifdef CONFIG_DEBUG_VM
154static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 175static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
155{ 176{
@@ -293,16 +314,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
293 clear_highpage(page + i); 314 clear_highpage(page + i);
294} 315}
295 316
296/*
297 * function for dealing with page's order in buddy system.
298 * zone->lock is already acquired when we use these.
299 * So, we don't need atomic page->flags operations here.
300 */
301static inline unsigned long page_order(struct page *page)
302{
303 return page_private(page);
304}
305
306static inline void set_page_order(struct page *page, int order) 317static inline void set_page_order(struct page *page, int order)
307{ 318{
308 set_page_private(page, order); 319 set_page_private(page, order);
@@ -404,6 +415,7 @@ static inline void __free_one_page(struct page *page,
404{ 415{
405 unsigned long page_idx; 416 unsigned long page_idx;
406 int order_size = 1 << order; 417 int order_size = 1 << order;
418 int migratetype = get_pageblock_migratetype(page);
407 419
408 if (unlikely(PageCompound(page))) 420 if (unlikely(PageCompound(page)))
409 destroy_compound_page(page, order); 421 destroy_compound_page(page, order);
@@ -416,7 +428,6 @@ static inline void __free_one_page(struct page *page,
416 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 428 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
417 while (order < MAX_ORDER-1) { 429 while (order < MAX_ORDER-1) {
418 unsigned long combined_idx; 430 unsigned long combined_idx;
419 struct free_area *area;
420 struct page *buddy; 431 struct page *buddy;
421 432
422 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
@@ -424,8 +435,7 @@ static inline void __free_one_page(struct page *page,
424 break; /* Move the buddy up one level. */ 435 break; /* Move the buddy up one level. */
425 436
426 list_del(&buddy->lru); 437 list_del(&buddy->lru);
427 area = zone->free_area + order; 438 zone->free_area[order].nr_free--;
428 area->nr_free--;
429 rmv_page_order(buddy); 439 rmv_page_order(buddy);
430 combined_idx = __find_combined_index(page_idx, order); 440 combined_idx = __find_combined_index(page_idx, order);
431 page = page + (combined_idx - page_idx); 441 page = page + (combined_idx - page_idx);
@@ -433,7 +443,8 @@ static inline void __free_one_page(struct page *page,
433 order++; 443 order++;
434 } 444 }
435 set_page_order(page, order); 445 set_page_order(page, order);
436 list_add(&page->lru, &zone->free_area[order].free_list); 446 list_add(&page->lru,
447 &zone->free_area[order].free_list[migratetype]);
437 zone->free_area[order].nr_free++; 448 zone->free_area[order].nr_free++;
438} 449}
439 450
@@ -567,7 +578,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
567 * -- wli 578 * -- wli
568 */ 579 */
569static inline void expand(struct zone *zone, struct page *page, 580static inline void expand(struct zone *zone, struct page *page,
570 int low, int high, struct free_area *area) 581 int low, int high, struct free_area *area,
582 int migratetype)
571{ 583{
572 unsigned long size = 1 << high; 584 unsigned long size = 1 << high;
573 585
@@ -576,7 +588,7 @@ static inline void expand(struct zone *zone, struct page *page,
576 high--; 588 high--;
577 size >>= 1; 589 size >>= 1;
578 VM_BUG_ON(bad_range(zone, &page[size])); 590 VM_BUG_ON(bad_range(zone, &page[size]));
579 list_add(&page[size].lru, &area->free_list); 591 list_add(&page[size].lru, &area->free_list[migratetype]);
580 area->nr_free++; 592 area->nr_free++;
581 set_page_order(&page[size], high); 593 set_page_order(&page[size], high);
582 } 594 }
@@ -628,49 +640,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
628 return 0; 640 return 0;
629} 641}
630 642
631/* 643/*
632 * Do the hard work of removing an element from the buddy allocator. 644 * Go through the free lists for the given migratetype and remove
633 * Call me with the zone->lock already held. 645 * the smallest available page from the freelists
634 */ 646 */
635static struct page *__rmqueue(struct zone *zone, unsigned int order) 647static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
648 int migratetype)
636{ 649{
637 struct free_area * area;
638 unsigned int current_order; 650 unsigned int current_order;
651 struct free_area * area;
639 struct page *page; 652 struct page *page;
640 653
654 /* Find a page of the appropriate size in the preferred list */
641 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 655 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
642 area = zone->free_area + current_order; 656 area = &(zone->free_area[current_order]);
643 if (list_empty(&area->free_list)) 657 if (list_empty(&area->free_list[migratetype]))
644 continue; 658 continue;
645 659
646 page = list_entry(area->free_list.next, struct page, lru); 660 page = list_entry(area->free_list[migratetype].next,
661 struct page, lru);
647 list_del(&page->lru); 662 list_del(&page->lru);
648 rmv_page_order(page); 663 rmv_page_order(page);
649 area->nr_free--; 664 area->nr_free--;
650 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 665 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
651 expand(zone, page, order, current_order, area); 666 expand(zone, page, order, current_order, area, migratetype);
652 return page; 667 return page;
653 } 668 }
654 669
655 return NULL; 670 return NULL;
656} 671}
657 672
673
674/*
675 * This array describes the order lists are fallen back to when
676 * the free lists for the desirable migrate type are depleted
677 */
678static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
679 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
680 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
681 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
682 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
683};
684
685/*
686 * Move the free pages in a range to the free lists of the requested type.
687 * Note that start_page and end_pages are not aligned on a pageblock
688 * boundary. If alignment is required, use move_freepages_block()
689 */
690int move_freepages(struct zone *zone,
691 struct page *start_page, struct page *end_page,
692 int migratetype)
693{
694 struct page *page;
695 unsigned long order;
696 int pages_moved = 0;
697
698#ifndef CONFIG_HOLES_IN_ZONE
699 /*
700 * page_zone is not safe to call in this context when
701 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
702 * anyway as we check zone boundaries in move_freepages_block().
703 * Remove at a later date when no bug reports exist related to
704 * grouping pages by mobility
705 */
706 BUG_ON(page_zone(start_page) != page_zone(end_page));
707#endif
708
709 for (page = start_page; page <= end_page;) {
710 if (!pfn_valid_within(page_to_pfn(page))) {
711 page++;
712 continue;
713 }
714
715 if (!PageBuddy(page)) {
716 page++;
717 continue;
718 }
719
720 order = page_order(page);
721 list_del(&page->lru);
722 list_add(&page->lru,
723 &zone->free_area[order].free_list[migratetype]);
724 page += 1 << order;
725 pages_moved += 1 << order;
726 }
727
728 return pages_moved;
729}
730
731int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
732{
733 unsigned long start_pfn, end_pfn;
734 struct page *start_page, *end_page;
735
736 start_pfn = page_to_pfn(page);
737 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
738 start_page = pfn_to_page(start_pfn);
739 end_page = start_page + pageblock_nr_pages - 1;
740 end_pfn = start_pfn + pageblock_nr_pages - 1;
741
742 /* Do not cross zone boundaries */
743 if (start_pfn < zone->zone_start_pfn)
744 start_page = page;
745 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
746 return 0;
747
748 return move_freepages(zone, start_page, end_page, migratetype);
749}
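
A one-page sketch of the pageblock rounding done in move_freepages_block() above: the pfn is masked down to the start of its pageblock and the block's last pfn follows. PAGEBLOCK_NR_PAGES is again an assumed stand-in for pageblock_nr_pages:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* assumption for illustration */

int main(void)
{
	unsigned long pfn   = 0x401a7;
	unsigned long start = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long end   = start + PAGEBLOCK_NR_PAGES - 1;

	printf("pfn %lx lies in pageblock [%lx, %lx]\n", pfn, start, end);
	return 0;
}
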
750
751/* Return the page with the lowest PFN in the list */
752static struct page *min_page(struct list_head *list)
753{
754 unsigned long min_pfn = -1UL;
755 struct page *min_page = NULL, *page;
756
757 list_for_each_entry(page, list, lru) {
758 unsigned long pfn = page_to_pfn(page);
759 if (pfn < min_pfn) {
760 min_pfn = pfn;
761 min_page = page;
762 }
763 }
764
765 return min_page;
766}
767
768/* Remove an element from the buddy allocator from the fallback list */
769static struct page *__rmqueue_fallback(struct zone *zone, int order,
770 int start_migratetype)
771{
772 struct free_area * area;
773 int current_order;
774 struct page *page;
775 int migratetype, i;
776
777 /* Find the largest possible block of pages in the other list */
778 for (current_order = MAX_ORDER-1; current_order >= order;
779 --current_order) {
780 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
781 migratetype = fallbacks[start_migratetype][i];
782
783 /* MIGRATE_RESERVE handled later if necessary */
784 if (migratetype == MIGRATE_RESERVE)
785 continue;
786
787 area = &(zone->free_area[current_order]);
788 if (list_empty(&area->free_list[migratetype]))
789 continue;
790
791 /* Bias kernel allocations towards low pfns */
792 page = list_entry(area->free_list[migratetype].next,
793 struct page, lru);
794 if (unlikely(start_migratetype != MIGRATE_MOVABLE))
795 page = min_page(&area->free_list[migratetype]);
796 area->nr_free--;
797
798 /*
799 * If breaking a large block of pages, move all free
800 * pages to the preferred allocation list. If falling
801 * back for a reclaimable kernel allocation, be more
802 * aggressive about taking ownership of free pages
803 */
804 if (unlikely(current_order >= (pageblock_order >> 1)) ||
805 start_migratetype == MIGRATE_RECLAIMABLE) {
806 unsigned long pages;
807 pages = move_freepages_block(zone, page,
808 start_migratetype);
809
810 /* Claim the whole block if over half of it is free */
811 if (pages >= (1 << (pageblock_order-1)))
812 set_pageblock_migratetype(page,
813 start_migratetype);
814
815 migratetype = start_migratetype;
816 }
817
818 /* Remove the page from the freelists */
819 list_del(&page->lru);
820 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823
824 if (current_order == pageblock_order)
825 set_pageblock_migratetype(page,
826 start_migratetype);
827
828 expand(zone, page, order, current_order, area, migratetype);
829 return page;
830 }
831 }
832
833 /* Use MIGRATE_RESERVE rather than fail an allocation */
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835}
836
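The fallbacks[] table near the top of this hunk encodes, for each requested migrate type, the order in which __rmqueue_fallback() raids the other free lists when the preferred list is empty. The sketch below is a small userspace model of just that table walk; the MIGRATE_UNMOVABLE row is assumed from context (it is not visible in this excerpt), the enum is simplified, and the block-stealing and reservation logic is deliberately left out.

#include <stdio.h>

/* Simplified stand-in for the kernel's migrate types; the real enum also
 * contains MIGRATE_ISOLATE and the names carry a MIGRATE_ prefix. */
enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };

/* Fallback order modelled on the table above; the UNMOVABLE row is assumed. */
static const int fallbacks[NR_TYPES][3] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE,   RESERVE },
	[RECLAIMABLE] = { UNMOVABLE,   MOVABLE,   RESERVE },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
	[RESERVE]     = { RESERVE,     RESERVE,   RESERVE },	/* never used */
};

static const char *names[NR_TYPES] = {
	"unmovable", "reclaimable", "movable", "reserve"
};

/* Return the first fallback type whose free list still holds pages. */
static int pick_fallback(int start, const int list_nonempty[NR_TYPES])
{
	int i;

	for (i = 0; i < 3; i++) {
		int mt = fallbacks[start][i];

		if (mt == RESERVE)
			continue;	/* the reserve is only a last resort */
		if (list_nonempty[mt])
			return mt;
	}
	return RESERVE;
}

int main(void)
{
	/* Example: only the movable free list still has pages. */
	int list_nonempty[NR_TYPES] = { [MOVABLE] = 1 };

	printf("an unmovable request falls back to the %s list\n",
	       names[pick_fallback(UNMOVABLE, list_nonempty)]);
	return 0;
}
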
837/*
838 * Do the hard work of removing an element from the buddy allocator.
839 * Call me with the zone->lock already held.
840 */
841static struct page *__rmqueue(struct zone *zone, unsigned int order,
842 int migratetype)
843{
844 struct page *page;
845
846 page = __rmqueue_smallest(zone, order, migratetype);
847
848 if (unlikely(!page))
849 page = __rmqueue_fallback(zone, order, migratetype);
850
851 return page;
852}
853
658/* 854/*
659 * Obtain a specified number of elements from the buddy allocator, all under 855 * Obtain a specified number of elements from the buddy allocator, all under
660 * a single hold of the lock, for efficiency. Add them to the supplied list. 856 * a single hold of the lock, for efficiency. Add them to the supplied list.
661 * Returns the number of new pages which were placed at *list. 857 * Returns the number of new pages which were placed at *list.
662 */ 858 */
663static int rmqueue_bulk(struct zone *zone, unsigned int order, 859static int rmqueue_bulk(struct zone *zone, unsigned int order,
664 unsigned long count, struct list_head *list) 860 unsigned long count, struct list_head *list,
861 int migratetype)
665{ 862{
666 int i; 863 int i;
667 864
668 spin_lock(&zone->lock); 865 spin_lock(&zone->lock);
669 for (i = 0; i < count; ++i) { 866 for (i = 0; i < count; ++i) {
670 struct page *page = __rmqueue(zone, order); 867 struct page *page = __rmqueue(zone, order, migratetype);
671 if (unlikely(page == NULL)) 868 if (unlikely(page == NULL))
672 break; 869 break;
673 list_add_tail(&page->lru, list); 870 list_add(&page->lru, list);
871 set_page_private(page, migratetype);
674 } 872 }
675 spin_unlock(&zone->lock); 873 spin_unlock(&zone->lock);
676 return i; 874 return i;
@@ -732,7 +930,7 @@ void mark_free_pages(struct zone *zone)
732{ 930{
733 unsigned long pfn, max_zone_pfn; 931 unsigned long pfn, max_zone_pfn;
734 unsigned long flags; 932 unsigned long flags;
735 int order; 933 int order, t;
736 struct list_head *curr; 934 struct list_head *curr;
737 935
738 if (!zone->spanned_pages) 936 if (!zone->spanned_pages)
@@ -749,17 +947,18 @@ void mark_free_pages(struct zone *zone)
749 swsusp_unset_page_free(page); 947 swsusp_unset_page_free(page);
750 } 948 }
751 949
752 for (order = MAX_ORDER - 1; order >= 0; --order) 950 for_each_migratetype_order(order, t) {
753 list_for_each(curr, &zone->free_area[order].free_list) { 951 list_for_each(curr, &zone->free_area[order].free_list[t]) {
754 unsigned long i; 952 unsigned long i;
755 953
756 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 954 pfn = page_to_pfn(list_entry(curr, struct page, lru));
757 for (i = 0; i < (1UL << order); i++) 955 for (i = 0; i < (1UL << order); i++)
758 swsusp_set_page_free(pfn_to_page(pfn + i)); 956 swsusp_set_page_free(pfn_to_page(pfn + i));
759 } 957 }
760 958 }
761 spin_unlock_irqrestore(&zone->lock, flags); 959 spin_unlock_irqrestore(&zone->lock, flags);
762} 960}
961#endif /* CONFIG_PM */
763 962
764/* 963/*
765 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 964 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
@@ -772,7 +971,25 @@ void drain_local_pages(void)
772 __drain_pages(smp_processor_id()); 971 __drain_pages(smp_processor_id());
773 local_irq_restore(flags); 972 local_irq_restore(flags);
774} 973}
775#endif /* CONFIG_HIBERNATION */ 974
975void smp_drain_local_pages(void *arg)
976{
977 drain_local_pages();
978}
979
980/*
981 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
982 */
983void drain_all_local_pages(void)
984{
985 unsigned long flags;
986
987 local_irq_save(flags);
988 __drain_pages(smp_processor_id());
989 local_irq_restore(flags);
990
991 smp_call_function(smp_drain_local_pages, NULL, 0, 1);
992}
776 993
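drain_all_local_pages() matters because order-0 pages parked in per-cpu (pcp) lists are invisible to the buddy allocator and so cannot merge with their buddies into larger contiguous blocks; both the high-order allocation retry path later in this patch and set_migratetype_isolate() drain them for that reason. The following toy userspace model (a hypothetical free-page bitmask standing in for buddy state) only illustrates why a page stuck in a pcp list blocks an order-1 merge; it is not kernel code.

#include <stdio.h>

/* Toy model: bit i set means page i sits on a buddy free list. */
static int can_form_order1_block(unsigned int free_mask, unsigned int pfn)
{
	unsigned int buddy = pfn ^ 1;	/* order-0 buddy of pfn */

	return ((free_mask >> pfn) & 1) && ((free_mask >> buddy) & 1);
}

int main(void)
{
	/* Page 3 is free, but its buddy (page 2) is parked in a pcp list. */
	unsigned int free_mask = 1u << 3;

	printf("before drain: order-1 block at pfn 2? %s\n",
	       can_form_order1_block(free_mask, 2) ? "yes" : "no");

	/* Draining the pcp list returns page 2 to the buddy free lists. */
	free_mask |= 1u << 2;
	printf("after drain:  order-1 block at pfn 2? %s\n",
	       can_form_order1_block(free_mask, 2) ? "yes" : "no");
	return 0;
}
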
777/* 994/*
778 * Free a 0-order page 995 * Free a 0-order page
@@ -797,6 +1014,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
797 local_irq_save(flags); 1014 local_irq_save(flags);
798 __count_vm_event(PGFREE); 1015 __count_vm_event(PGFREE);
799 list_add(&page->lru, &pcp->list); 1016 list_add(&page->lru, &pcp->list);
1017 set_page_private(page, get_pageblock_migratetype(page));
800 pcp->count++; 1018 pcp->count++;
801 if (pcp->count >= pcp->high) { 1019 if (pcp->count >= pcp->high) {
802 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1020 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -846,6 +1064,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist,
846 struct page *page; 1064 struct page *page;
847 int cold = !!(gfp_flags & __GFP_COLD); 1065 int cold = !!(gfp_flags & __GFP_COLD);
848 int cpu; 1066 int cpu;
1067 int migratetype = allocflags_to_migratetype(gfp_flags);
849 1068
850again: 1069again:
851 cpu = get_cpu(); 1070 cpu = get_cpu();
@@ -856,16 +1075,28 @@ again:
856 local_irq_save(flags); 1075 local_irq_save(flags);
857 if (!pcp->count) { 1076 if (!pcp->count) {
858 pcp->count = rmqueue_bulk(zone, 0, 1077 pcp->count = rmqueue_bulk(zone, 0,
859 pcp->batch, &pcp->list); 1078 pcp->batch, &pcp->list, migratetype);
860 if (unlikely(!pcp->count)) 1079 if (unlikely(!pcp->count))
861 goto failed; 1080 goto failed;
862 } 1081 }
863 page = list_entry(pcp->list.next, struct page, lru); 1082
1083 /* Find a page of the appropriate migrate type */
1084 list_for_each_entry(page, &pcp->list, lru)
1085 if (page_private(page) == migratetype)
1086 break;
1087
1088 /* Allocate more to the pcp list if necessary */
1089 if (unlikely(&page->lru == &pcp->list)) {
1090 pcp->count += rmqueue_bulk(zone, 0,
1091 pcp->batch, &pcp->list, migratetype);
1092 page = list_entry(pcp->list.next, struct page, lru);
1093 }
1094
864 list_del(&page->lru); 1095 list_del(&page->lru);
865 pcp->count--; 1096 pcp->count--;
866 } else { 1097 } else {
867 spin_lock_irqsave(&zone->lock, flags); 1098 spin_lock_irqsave(&zone->lock, flags);
868 page = __rmqueue(zone, order); 1099 page = __rmqueue(zone, order, migratetype);
869 spin_unlock(&zone->lock); 1100 spin_unlock(&zone->lock);
870 if (!page) 1101 if (!page)
871 goto failed; 1102 goto failed;
@@ -1032,7 +1263,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1032 * 1263 *
1033 * If the zonelist cache is present in the passed in zonelist, then 1264 * If the zonelist cache is present in the passed in zonelist, then
1034 * returns a pointer to the allowed node mask (either the current 1265 * returns a pointer to the allowed node mask (either the current
1035 * tasks mems_allowed, or node_online_map.) 1266 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1036 * 1267 *
1037 * If the zonelist cache is not available for this zonelist, does 1268 * If the zonelist cache is not available for this zonelist, does
1038 * nothing and returns NULL. 1269 * nothing and returns NULL.
@@ -1061,7 +1292,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1061 1292
1062 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1293 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1063 &cpuset_current_mems_allowed : 1294 &cpuset_current_mems_allowed :
1064 &node_online_map; 1295 &node_states[N_HIGH_MEMORY];
1065 return allowednodes; 1296 return allowednodes;
1066} 1297}
1067 1298
@@ -1183,9 +1414,6 @@ zonelist_scan:
1183 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1414 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1184 continue; 1415 continue;
1185 zone = *z; 1416 zone = *z;
1186 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1187 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1188 break;
1189 if ((alloc_flags & ALLOC_CPUSET) && 1417 if ((alloc_flags & ALLOC_CPUSET) &&
1190 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1418 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1191 goto try_next_zone; 1419 goto try_next_zone;
@@ -1254,7 +1482,10 @@ restart:
1254 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1482 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1255 1483
1256 if (unlikely(*z == NULL)) { 1484 if (unlikely(*z == NULL)) {
1257 /* Should this ever happen?? */ 1485 /*
1486 * Happens if we have an empty zonelist as a result of
1487 * GFP_THISNODE being used on a memoryless node
1488 */
1258 return NULL; 1489 return NULL;
1259 } 1490 }
1260 1491
@@ -1346,6 +1577,9 @@ nofail_alloc:
1346 1577
1347 cond_resched(); 1578 cond_resched();
1348 1579
1580 if (order != 0)
1581 drain_all_local_pages();
1582
1349 if (likely(did_some_progress)) { 1583 if (likely(did_some_progress)) {
1350 page = get_page_from_freelist(gfp_mask, order, 1584 page = get_page_from_freelist(gfp_mask, order,
1351 zonelist, alloc_flags); 1585 zonelist, alloc_flags);
@@ -1794,7 +2028,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
1794 return node; 2028 return node;
1795 } 2029 }
1796 2030
1797 for_each_online_node(n) { 2031 for_each_node_state(n, N_HIGH_MEMORY) {
1798 cpumask_t tmp; 2032 cpumask_t tmp;
1799 2033
1800 /* Don't want a node to appear more than once */ 2034 /* Don't want a node to appear more than once */
@@ -1850,6 +2084,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1850} 2084}
1851 2085
1852/* 2086/*
2087 * Build gfp_thisnode zonelists
2088 */
2089static void build_thisnode_zonelists(pg_data_t *pgdat)
2090{
2091 enum zone_type i;
2092 int j;
2093 struct zonelist *zonelist;
2094
2095 for (i = 0; i < MAX_NR_ZONES; i++) {
2096 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
2097 j = build_zonelists_node(pgdat, zonelist, 0, i);
2098 zonelist->zones[j] = NULL;
2099 }
2100}
2101
2102/*
1853 * Build zonelists ordered by zone and nodes within zones. 2103 * Build zonelists ordered by zone and nodes within zones.
1854 * This results in conserving DMA zone[s] until all Normal memory is 2104 * This results in conserving DMA zone[s] until all Normal memory is
1855 * exhausted, but results in overflowing to remote node while memory 2105 * exhausted, but results in overflowing to remote node while memory
@@ -1915,7 +2165,8 @@ static int default_zonelist_order(void)
1915 * If there is a node whose DMA/DMA32 memory is very big area on 2165 * If there is a node whose DMA/DMA32 memory is very big area on
1916 * local memory, NODE_ORDER may be suitable. 2166 * local memory, NODE_ORDER may be suitable.
1917 */ 2167 */
1918 average_size = total_size / (num_online_nodes() + 1); 2168 average_size = total_size /
2169 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
1919 for_each_online_node(nid) { 2170 for_each_online_node(nid) {
1920 low_kmem_size = 0; 2171 low_kmem_size = 0;
1921 total_size = 0; 2172 total_size = 0;
@@ -1953,7 +2204,7 @@ static void build_zonelists(pg_data_t *pgdat)
1953 int order = current_zonelist_order; 2204 int order = current_zonelist_order;
1954 2205
1955 /* initialize zonelists */ 2206 /* initialize zonelists */
1956 for (i = 0; i < MAX_NR_ZONES; i++) { 2207 for (i = 0; i < MAX_ZONELISTS; i++) {
1957 zonelist = pgdat->node_zonelists + i; 2208 zonelist = pgdat->node_zonelists + i;
1958 zonelist->zones[0] = NULL; 2209 zonelist->zones[0] = NULL;
1959 } 2210 }
@@ -1998,6 +2249,8 @@ static void build_zonelists(pg_data_t *pgdat)
1998 /* calculate node order -- i.e., DMA last! */ 2249 /* calculate node order -- i.e., DMA last! */
1999 build_zonelists_in_zone_order(pgdat, j); 2250 build_zonelists_in_zone_order(pgdat, j);
2000 } 2251 }
2252
2253 build_thisnode_zonelists(pgdat);
2001} 2254}
2002 2255
2003/* Construct the zonelist performance cache - see further mmzone.h */ 2256/* Construct the zonelist performance cache - see further mmzone.h */
@@ -2078,8 +2331,10 @@ static int __build_all_zonelists(void *dummy)
2078 int nid; 2331 int nid;
2079 2332
2080 for_each_online_node(nid) { 2333 for_each_online_node(nid) {
2081 build_zonelists(NODE_DATA(nid)); 2334 pg_data_t *pgdat = NODE_DATA(nid);
2082 build_zonelist_cache(NODE_DATA(nid)); 2335
2336 build_zonelists(pgdat);
2337 build_zonelist_cache(pgdat);
2083 } 2338 }
2084 return 0; 2339 return 0;
2085} 2340}
@@ -2098,9 +2353,23 @@ void build_all_zonelists(void)
2098 /* cpuset refresh routine should be here */ 2353 /* cpuset refresh routine should be here */
2099 } 2354 }
2100 vm_total_pages = nr_free_pagecache_pages(); 2355 vm_total_pages = nr_free_pagecache_pages();
2101 printk("Built %i zonelists in %s order. Total pages: %ld\n", 2356 /*
2357 * Disable grouping by mobility if the number of pages in the
2358 * system is too low to allow the mechanism to work. It would be
2359 * more accurate, but more expensive, to check per-zone. This check is
2360 * also made on memory hot-add, so a system can start with mobility
2361 * grouping disabled and enable it later.
2362 */
2363 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2364 page_group_by_mobility_disabled = 1;
2365 else
2366 page_group_by_mobility_disabled = 0;
2367
2368 printk("Built %i zonelists in %s order, mobility grouping %s. "
2369 "Total pages: %ld\n",
2102 num_online_nodes(), 2370 num_online_nodes(),
2103 zonelist_order_name[current_zonelist_order], 2371 zonelist_order_name[current_zonelist_order],
2372 page_group_by_mobility_disabled ? "off" : "on",
2104 vm_total_pages); 2373 vm_total_pages);
2105#ifdef CONFIG_NUMA 2374#ifdef CONFIG_NUMA
2106 printk("Policy zone: %s\n", zone_names[policy_zone]); 2375 printk("Policy zone: %s\n", zone_names[policy_zone]);
@@ -2176,6 +2445,61 @@ static inline unsigned long wait_table_bits(unsigned long size)
2176#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2445#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2177 2446
2178/* 2447/*
2448 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2449 * of blocks reserved is based on zone->pages_min. The memory within the
2450 * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2451 * higher will lead to a bigger reserve which will get freed as contiguous
2452 * blocks as reclaim kicks in.
2453 */
2454static void setup_zone_migrate_reserve(struct zone *zone)
2455{
2456 unsigned long start_pfn, pfn, end_pfn;
2457 struct page *page;
2458 unsigned long reserve, block_migratetype;
2459
2460 /* Get the start pfn, end pfn and the number of blocks to reserve */
2461 start_pfn = zone->zone_start_pfn;
2462 end_pfn = start_pfn + zone->spanned_pages;
2463 reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
2464 pageblock_order;
2465
2466 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2467 if (!pfn_valid(pfn))
2468 continue;
2469 page = pfn_to_page(pfn);
2470
2471 /* Blocks with reserved pages will never be freed, so skip them. */
2472 if (PageReserved(page))
2473 continue;
2474
2475 block_migratetype = get_pageblock_migratetype(page);
2476
2477 /* If this block is reserved, account for it */
2478 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2479 reserve--;
2480 continue;
2481 }
2482
2483 /* Suitable for reserving if this block is movable */
2484 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2485 set_pageblock_migratetype(page, MIGRATE_RESERVE);
2486 move_freepages_block(zone, page, MIGRATE_RESERVE);
2487 reserve--;
2488 continue;
2489 }
2490
2491 /*
2492 * If the reserve is met and this is a previously reserved block,
2493 * take it back
2494 */
2495 if (block_migratetype == MIGRATE_RESERVE) {
2496 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2497 move_freepages_block(zone, page, MIGRATE_MOVABLE);
2498 }
2499 }
2500}
2501
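The reserve size above is pure integer arithmetic on zone->pages_min: round the watermark up to whole pageblocks, then shift by pageblock_order to get a block count. A standalone userspace rendering of that calculation with invented numbers (pages_min = 1379 pages, order-9 pageblocks) follows, purely for illustration.

#include <stdio.h>

#define PAGEBLOCK_ORDER    9				/* hypothetical */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

/* Same rounding the kernel's roundup() macro performs for these operands. */
static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	unsigned long pages_min = 1379;		/* hypothetical zone->pages_min */
	unsigned long reserve;

	reserve = roundup_ul(pages_min, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	printf("pages_min=%lu -> %lu reserved pageblock(s)\n", pages_min, reserve);
	return 0;
}
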
2502/*
2179 * Initially all pages are reserved - free ones are freed 2503 * Initially all pages are reserved - free ones are freed
2180 * up by free_all_bootmem() once the early boot process is 2504 * up by free_all_bootmem() once the early boot process is
2181 * done. Non-atomic initialization, single-pass. 2505 * done. Non-atomic initialization, single-pass.
@@ -2204,6 +2528,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2204 init_page_count(page); 2528 init_page_count(page);
2205 reset_page_mapcount(page); 2529 reset_page_mapcount(page);
2206 SetPageReserved(page); 2530 SetPageReserved(page);
2531
2532 /*
2533 * Mark the block movable so that blocks are reserved for
2534 * movable allocations at startup. This will force kernel allocations
2535 * to reserve their blocks rather than leaking throughout
2536 * the address space during boot when many long-lived
2537 * kernel allocations are made. Later some blocks near
2538 * the start are marked MIGRATE_RESERVE by
2539 * setup_zone_migrate_reserve()
2540 */
2541 if ((pfn & (pageblock_nr_pages-1)))
2542 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2543
2207 INIT_LIST_HEAD(&page->lru); 2544 INIT_LIST_HEAD(&page->lru);
2208#ifdef WANT_PAGE_VIRTUAL 2545#ifdef WANT_PAGE_VIRTUAL
2209 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 2546 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
@@ -2216,9 +2553,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2216static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2553static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
2217 struct zone *zone, unsigned long size) 2554 struct zone *zone, unsigned long size)
2218{ 2555{
2219 int order; 2556 int order, t;
2220 for (order = 0; order < MAX_ORDER ; order++) { 2557 for_each_migratetype_order(order, t) {
2221 INIT_LIST_HEAD(&zone->free_area[order].free_list); 2558 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
2222 zone->free_area[order].nr_free = 0; 2559 zone->free_area[order].nr_free = 0;
2223 } 2560 }
2224} 2561}
@@ -2324,6 +2661,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS];
2324static int __cpuinit process_zones(int cpu) 2661static int __cpuinit process_zones(int cpu)
2325{ 2662{
2326 struct zone *zone, *dzone; 2663 struct zone *zone, *dzone;
2664 int node = cpu_to_node(cpu);
2665
2666 node_set_state(node, N_CPU); /* this node has a cpu */
2327 2667
2328 for_each_zone(zone) { 2668 for_each_zone(zone) {
2329 2669
@@ -2331,7 +2671,7 @@ static int __cpuinit process_zones(int cpu)
2331 continue; 2671 continue;
2332 2672
2333 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2673 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2334 GFP_KERNEL, cpu_to_node(cpu)); 2674 GFP_KERNEL, node);
2335 if (!zone_pcp(zone, cpu)) 2675 if (!zone_pcp(zone, cpu))
2336 goto bad; 2676 goto bad;
2337 2677
@@ -2444,7 +2784,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2444 * To use this new node's memory, further consideration will be 2784 * To use this new node's memory, further consideration will be
2445 * necessary. 2785 * necessary.
2446 */ 2786 */
2447 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2787 zone->wait_table = vmalloc(alloc_size);
2448 } 2788 }
2449 if (!zone->wait_table) 2789 if (!zone->wait_table)
2450 return -ENOMEM; 2790 return -ENOMEM;
@@ -2680,10 +3020,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
2680 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3020 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2681 } 3021 }
2682 3022
2683 if (*start_pfn == -1UL) { 3023 if (*start_pfn == -1UL)
2684 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2685 *start_pfn = 0; 3024 *start_pfn = 0;
2686 }
2687 3025
2688 /* Push the node boundaries out if requested */ 3026 /* Push the node boundaries out if requested */
2689 account_node_boundary(nid, start_pfn, end_pfn); 3027 account_node_boundary(nid, start_pfn, end_pfn);
@@ -2901,6 +3239,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
2901 realtotalpages); 3239 realtotalpages);
2902} 3240}
2903 3241
3242#ifndef CONFIG_SPARSEMEM
3243/*
3244 * Calculate the size of zone->pageblock_flags rounded up to an unsigned long
3245 * Start by making sure zonesize is a multiple of pageblock_nr_pages by
3246 * rounding up. Then allot NR_PAGEBLOCK_BITS worth of bits per pageblock;
3247 * finally, round the bit count up to the nearest long and return the size in
3248 * bytes.
3249 */
3250static unsigned long __init usemap_size(unsigned long zonesize)
3251{
3252 unsigned long usemapsize;
3253
3254 usemapsize = roundup(zonesize, pageblock_nr_pages);
3255 usemapsize = usemapsize >> pageblock_order;
3256 usemapsize *= NR_PAGEBLOCK_BITS;
3257 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3258
3259 return usemapsize / 8;
3260}
3261
3262static void __init setup_usemap(struct pglist_data *pgdat,
3263 struct zone *zone, unsigned long zonesize)
3264{
3265 unsigned long usemapsize = usemap_size(zonesize);
3266 zone->pageblock_flags = NULL;
3267 if (usemapsize) {
3268 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3269 memset(zone->pageblock_flags, 0, usemapsize);
3270 }
3271}
3272#else
3273static void inline setup_usemap(struct pglist_data *pgdat,
3274 struct zone *zone, unsigned long zonesize) {}
3275#endif /* CONFIG_SPARSEMEM */
3276
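usemap_size() does all of its work in bit units before the final division by 8. The sketch below reproduces those rounding steps in userspace for invented values (a 1,000,000-page zone, order-9 pageblocks, NR_PAGEBLOCK_BITS = 4, 64-bit longs); the numbers are illustrative only.

#include <stdio.h>

#define PAGEBLOCK_ORDER     9			/* hypothetical */
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS   4UL			/* hypothetical flag bits per pageblock */
#define BITS_PER_LONG_HYP   64UL		/* hypothetical host word size */

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	unsigned long zonesize = 1000000;	/* pages spanned by the zone */
	unsigned long usemapsize;

	usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);	/* whole pageblocks */
	usemapsize >>= PAGEBLOCK_ORDER;				/* number of blocks */
	usemapsize *= NR_PAGEBLOCK_BITS;			/* bits required    */
	usemapsize = roundup_ul(usemapsize, BITS_PER_LONG_HYP);	/* whole longs      */

	printf("a zone of %lu pages needs a %lu-byte usemap\n",
	       zonesize, usemapsize / 8);
	return 0;
}
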
3277#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3278/* Initialise pageblock_order, i.e. the number of pages a set of NR_PAGEBLOCK_BITS covers */
3279static inline void __init set_pageblock_order(unsigned int order)
3280{
3281 /* Check that pageblock_order has not already been set up */
3282 if (pageblock_order)
3283 return;
3284
3285 /*
3286 * Assume the largest contiguous order of interest is a huge page.
3287 * This value may be variable depending on boot parameters on IA64
3288 */
3289 pageblock_order = order;
3290}
3291#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3292
3293/* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */
3294#define set_pageblock_order(x) do {} while (0)
3295
3296#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3297
2904/* 3298/*
2905 * Set up the zone data structures: 3299 * Set up the zone data structures:
2906 * - mark all pages reserved 3300 * - mark all pages reserved
@@ -2981,6 +3375,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2981 if (!size) 3375 if (!size)
2982 continue; 3376 continue;
2983 3377
3378 set_pageblock_order(HUGETLB_PAGE_ORDER);
3379 setup_usemap(pgdat, zone, size);
2984 ret = init_currently_empty_zone(zone, zone_start_pfn, 3380 ret = init_currently_empty_zone(zone, zone_start_pfn,
2985 size, MEMMAP_EARLY); 3381 size, MEMMAP_EARLY);
2986 BUG_ON(ret); 3382 BUG_ON(ret);
@@ -3234,16 +3630,24 @@ unsigned long __init find_max_pfn_with_active_regions(void)
3234 return max_pfn; 3630 return max_pfn;
3235} 3631}
3236 3632
3237unsigned long __init early_calculate_totalpages(void) 3633/*
3634 * early_calculate_totalpages()
3635 * Sum pages in active regions for movable zone.
3636 * Populate N_HIGH_MEMORY for calculating usable_nodes.
3637 */
3638static unsigned long __init early_calculate_totalpages(void)
3238{ 3639{
3239 int i; 3640 int i;
3240 unsigned long totalpages = 0; 3641 unsigned long totalpages = 0;
3241 3642
3242 for (i = 0; i < nr_nodemap_entries; i++) 3643 for (i = 0; i < nr_nodemap_entries; i++) {
3243 totalpages += early_node_map[i].end_pfn - 3644 unsigned long pages = early_node_map[i].end_pfn -
3244 early_node_map[i].start_pfn; 3645 early_node_map[i].start_pfn;
3245 3646 totalpages += pages;
3246 return totalpages; 3647 if (pages)
3648 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
3649 }
3650 return totalpages;
3247} 3651}
3248 3652
3249/* 3653/*
@@ -3257,7 +3661,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3257 int i, nid; 3661 int i, nid;
3258 unsigned long usable_startpfn; 3662 unsigned long usable_startpfn;
3259 unsigned long kernelcore_node, kernelcore_remaining; 3663 unsigned long kernelcore_node, kernelcore_remaining;
3260 int usable_nodes = num_online_nodes(); 3664 unsigned long totalpages = early_calculate_totalpages();
3665 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3261 3666
3262 /* 3667 /*
3263 * If movablecore was specified, calculate what size of 3668 * If movablecore was specified, calculate what size of
@@ -3268,7 +3673,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3268 * what movablecore would have allowed. 3673 * what movablecore would have allowed.
3269 */ 3674 */
3270 if (required_movablecore) { 3675 if (required_movablecore) {
3271 unsigned long totalpages = early_calculate_totalpages();
3272 unsigned long corepages; 3676 unsigned long corepages;
3273 3677
3274 /* 3678 /*
@@ -3293,7 +3697,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3293restart: 3697restart:
3294 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3698 /* Spread kernelcore memory as evenly as possible throughout nodes */
3295 kernelcore_node = required_kernelcore / usable_nodes; 3699 kernelcore_node = required_kernelcore / usable_nodes;
3296 for_each_online_node(nid) { 3700 for_each_node_state(nid, N_HIGH_MEMORY) {
3297 /* 3701 /*
3298 * Recalculate kernelcore_node if the division per node 3702 * Recalculate kernelcore_node if the division per node
3299 * now exceeds what is necessary to satisfy the requested 3703 * now exceeds what is necessary to satisfy the requested
@@ -3385,6 +3789,20 @@ restart:
3385 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3789 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
3386} 3790}
3387 3791
3792/* Any regular memory on that node? */
3793static void check_for_regular_memory(pg_data_t *pgdat)
3794{
3795#ifdef CONFIG_HIGHMEM
3796 enum zone_type zone_type;
3797
3798 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
3799 struct zone *zone = &pgdat->node_zones[zone_type];
3800 if (zone->present_pages)
3801 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
3802 }
3803#endif
3804}
3805
3388/** 3806/**
3389 * free_area_init_nodes - Initialise all pg_data_t and zone data 3807 * free_area_init_nodes - Initialise all pg_data_t and zone data
3390 * @max_zone_pfn: an array of max PFNs for each zone 3808 * @max_zone_pfn: an array of max PFNs for each zone
@@ -3459,6 +3877,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3459 pg_data_t *pgdat = NODE_DATA(nid); 3877 pg_data_t *pgdat = NODE_DATA(nid);
3460 free_area_init_node(nid, pgdat, NULL, 3878 free_area_init_node(nid, pgdat, NULL,
3461 find_min_pfn_for_node(nid), NULL); 3879 find_min_pfn_for_node(nid), NULL);
3880
3881 /* Any memory on that node */
3882 if (pgdat->node_present_pages)
3883 node_set_state(nid, N_HIGH_MEMORY);
3884 check_for_regular_memory(pgdat);
3462 } 3885 }
3463} 3886}
3464 3887
@@ -3673,6 +4096,7 @@ void setup_per_zone_pages_min(void)
3673 4096
3674 zone->pages_low = zone->pages_min + (tmp >> 2); 4097 zone->pages_low = zone->pages_min + (tmp >> 2);
3675 zone->pages_high = zone->pages_min + (tmp >> 1); 4098 zone->pages_high = zone->pages_min + (tmp >> 1);
4099 setup_zone_migrate_reserve(zone);
3676 spin_unlock_irqrestore(&zone->lru_lock, flags); 4100 spin_unlock_irqrestore(&zone->lru_lock, flags);
3677 } 4101 }
3678 4102
@@ -3934,4 +4358,169 @@ EXPORT_SYMBOL(pfn_to_page);
3934EXPORT_SYMBOL(page_to_pfn); 4358EXPORT_SYMBOL(page_to_pfn);
3935#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4359#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3936 4360
4361/* Return a pointer to the bitmap storing bits affecting a block of pages */
4362static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4363 unsigned long pfn)
4364{
4365#ifdef CONFIG_SPARSEMEM
4366 return __pfn_to_section(pfn)->pageblock_flags;
4367#else
4368 return zone->pageblock_flags;
4369#endif /* CONFIG_SPARSEMEM */
4370}
4371
4372static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4373{
4374#ifdef CONFIG_SPARSEMEM
4375 pfn &= (PAGES_PER_SECTION-1);
4376 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4377#else
4378 pfn = pfn - zone->zone_start_pfn;
4379 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4380#endif /* CONFIG_SPARSEMEM */
4381}
4382
4383/**
4384 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
4385 * @page: The page within the block of interest
4386 * @start_bitidx: The first bit of interest to retrieve
4387 * @end_bitidx: The last bit of interest
4388 * returns pageblock_bits flags
4389 */
4390unsigned long get_pageblock_flags_group(struct page *page,
4391 int start_bitidx, int end_bitidx)
4392{
4393 struct zone *zone;
4394 unsigned long *bitmap;
4395 unsigned long pfn, bitidx;
4396 unsigned long flags = 0;
4397 unsigned long value = 1;
4398
4399 zone = page_zone(page);
4400 pfn = page_to_pfn(page);
4401 bitmap = get_pageblock_bitmap(zone, pfn);
4402 bitidx = pfn_to_bitidx(zone, pfn);
4403
4404 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4405 if (test_bit(bitidx + start_bitidx, bitmap))
4406 flags |= value;
4407
4408 return flags;
4409}
3937 4410
4411/**
4412 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
4413 * @page: The page within the block of interest
4414 * @start_bitidx: The first bit of interest
4415 * @end_bitidx: The last bit of interest
4416 * @flags: The flags to set
4417 */
4418void set_pageblock_flags_group(struct page *page, unsigned long flags,
4419 int start_bitidx, int end_bitidx)
4420{
4421 struct zone *zone;
4422 unsigned long *bitmap;
4423 unsigned long pfn, bitidx;
4424 unsigned long value = 1;
4425
4426 zone = page_zone(page);
4427 pfn = page_to_pfn(page);
4428 bitmap = get_pageblock_bitmap(zone, pfn);
4429 bitidx = pfn_to_bitidx(zone, pfn);
4430
4431 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4432 if (flags & value)
4433 __set_bit(bitidx + start_bitidx, bitmap);
4434 else
4435 __clear_bit(bitidx + start_bitidx, bitmap);
4436}
4437
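get_pageblock_flags_group() and set_pageblock_flags_group() treat the usemap as one flat bit array with NR_PAGEBLOCK_BITS bits per pageblock, and pfn_to_bitidx() supplies the starting bit (after offsetting the pfn by the section or zone start, which the model below ignores). The following userspace sketch mimics that layout with a hypothetical 4-bit field per block to show how a value round-trips; it is a model of the indexing only, not the kernel accessors.

#include <stdio.h>

#define PAGEBLOCK_ORDER   9	/* hypothetical */
#define NR_PAGEBLOCK_BITS 4	/* hypothetical flag bits kept per pageblock */
#define BITS_PER_LONG     (8 * (int)sizeof(unsigned long))

static unsigned long bitmap[16];	/* toy usemap, zero-initialised */

static unsigned long pfn_to_bitidx(unsigned long pfn)
{
	return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

static void set_block_flags(unsigned long pfn, unsigned long flags)
{
	unsigned long bitidx = pfn_to_bitidx(pfn);
	int i;

	for (i = 0; i < NR_PAGEBLOCK_BITS; i++, flags >>= 1) {
		unsigned long bit = bitidx + i;

		if (flags & 1)
			bitmap[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
		else
			bitmap[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
	}
}

static unsigned long get_block_flags(unsigned long pfn)
{
	unsigned long bitidx = pfn_to_bitidx(pfn);
	unsigned long flags = 0;
	int i;

	for (i = 0; i < NR_PAGEBLOCK_BITS; i++) {
		unsigned long bit = bitidx + i;

		if (bitmap[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)))
			flags |= 1UL << i;
	}
	return flags;
}

int main(void)
{
	unsigned long pfn = 5 * 512;	/* a pfn inside the 6th pageblock */

	set_block_flags(pfn, 2);	/* store an example migratetype value */
	printf("flags stored for pfn %lu: %lu\n", pfn, get_block_flags(pfn));
	return 0;
}
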
4438/*
4439 * These are designed as helper functions; please see page_isolation.c as well.
4440 * They set/clear a pageblock's type to/from MIGRATE_ISOLATE.
4441 * The page allocator never allocates memory from an ISOLATE block.
4442 */
4443
4444int set_migratetype_isolate(struct page *page)
4445{
4446 struct zone *zone;
4447 unsigned long flags;
4448 int ret = -EBUSY;
4449
4450 zone = page_zone(page);
4451 spin_lock_irqsave(&zone->lock, flags);
4452 /*
4453 * In the future, more migrate types will be able to be isolation targets.
4454 */
4455 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
4456 goto out;
4457 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4458 move_freepages_block(zone, page, MIGRATE_ISOLATE);
4459 ret = 0;
4460out:
4461 spin_unlock_irqrestore(&zone->lock, flags);
4462 if (!ret)
4463 drain_all_local_pages();
4464 return ret;
4465}
4466
4467void unset_migratetype_isolate(struct page *page)
4468{
4469 struct zone *zone;
4470 unsigned long flags;
4471 zone = page_zone(page);
4472 spin_lock_irqsave(&zone->lock, flags);
4473 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
4474 goto out;
4475 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4476 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4477out:
4478 spin_unlock_irqrestore(&zone->lock, flags);
4479}
4480
4481#ifdef CONFIG_MEMORY_HOTREMOVE
4482/*
4483 * All pages in the range must be isolated before calling this.
4484 */
4485void
4486__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4487{
4488 struct page *page;
4489 struct zone *zone;
4490 int order, i;
4491 unsigned long pfn;
4492 unsigned long flags;
4493 /* find the first valid pfn */
4494 for (pfn = start_pfn; pfn < end_pfn; pfn++)
4495 if (pfn_valid(pfn))
4496 break;
4497 if (pfn == end_pfn)
4498 return;
4499 zone = page_zone(pfn_to_page(pfn));
4500 spin_lock_irqsave(&zone->lock, flags);
4501 pfn = start_pfn;
4502 while (pfn < end_pfn) {
4503 if (!pfn_valid(pfn)) {
4504 pfn++;
4505 continue;
4506 }
4507 page = pfn_to_page(pfn);
4508 BUG_ON(page_count(page));
4509 BUG_ON(!PageBuddy(page));
4510 order = page_order(page);
4511#ifdef CONFIG_DEBUG_VM
4512 printk(KERN_INFO "remove from free list %lx %d %lx\n",
4513 pfn, 1 << order, end_pfn);
4514#endif
4515 list_del(&page->lru);
4516 rmv_page_order(page);
4517 zone->free_area[order].nr_free--;
4518 __mod_zone_page_state(zone, NR_FREE_PAGES,
4519 - (1UL << order));
4520 for (i = 0; i < (1 << order); i++)
4521 SetPageReserved((page+i));
4522 pfn += (1 << order);
4523 }
4524 spin_unlock_irqrestore(&zone->lock, flags);
4525}
4526#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 000000000000..8f92a29695cc
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,138 @@
1/*
2 * linux/mm/page_isolation.c
3 */
4
5#include <stddef.h>
6#include <linux/mm.h>
7#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h>
9#include "internal.h"
10
11static inline struct page *
12__first_valid_page(unsigned long pfn, unsigned long nr_pages)
13{
14 int i;
15 for (i = 0; i < nr_pages; i++)
16 if (pfn_valid_within(pfn + i))
17 break;
18 if (unlikely(i == nr_pages))
19 return NULL;
20 return pfn_to_page(pfn + i);
21}
22
23/*
24 * start_isolate_page_range() -- make the page-allocation-type of a range of
25 * pages MIGRATE_ISOLATE.
26 * @start_pfn: The lower PFN of the range to be isolated.
27 * @end_pfn: The upper PFN of the range to be isolated.
28 *
29 * Making the page-allocation-type MIGRATE_ISOLATE means that free pages in
30 * the range will never be allocated, and any pages freed there in the
31 * future will not be handed out again.
32 *
33 * start_pfn/end_pfn must be aligned to a pageblock boundary.
34 * Returns 0 on success and -EBUSY if any part of the range cannot be isolated.
35 */
36int
37start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
38{
39 unsigned long pfn;
40 unsigned long undo_pfn;
41 struct page *page;
42
43 BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
44 BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
45
46 for (pfn = start_pfn;
47 pfn < end_pfn;
48 pfn += pageblock_nr_pages) {
49 page = __first_valid_page(pfn, pageblock_nr_pages);
50 if (page && set_migratetype_isolate(page)) {
51 undo_pfn = pfn;
52 goto undo;
53 }
54 }
55 return 0;
56undo:
57 for (pfn = start_pfn;
58 pfn <= undo_pfn;
59 pfn += pageblock_nr_pages)
60 unset_migratetype_isolate(pfn_to_page(pfn));
61
62 return -EBUSY;
63}
64
65/*
66 * Make isolated pages available again.
67 */
68int
69undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
70{
71 unsigned long pfn;
72 struct page *page;
73 BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
74 BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
75 for (pfn = start_pfn;
76 pfn < end_pfn;
77 pfn += pageblock_nr_pages) {
78 page = __first_valid_page(pfn, pageblock_nr_pages);
79 if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE)
80 continue;
81 unset_migratetype_isolate(page);
82 }
83 return 0;
84}
85/*
86 * Test whether all pages in the range are free (i.e. isolated) or not.
87 * All pages in [start_pfn...end_pfn) must be in the same zone.
88 * zone->lock must be held before calling this.
89 *
90 * Returns 1 if all pages in the range are isolated, 0 otherwise.
91 */
92static int
93__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
94{
95 struct page *page;
96
97 while (pfn < end_pfn) {
98 if (!pfn_valid_within(pfn)) {
99 pfn++;
100 continue;
101 }
102 page = pfn_to_page(pfn);
103 if (PageBuddy(page))
104 pfn += 1 << page_order(page);
105 else if (page_count(page) == 0 &&
106 page_private(page) == MIGRATE_ISOLATE)
107 pfn += 1;
108 else
109 break;
110 }
111 if (pfn < end_pfn)
112 return 0;
113 return 1;
114}
115
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{
118 unsigned long pfn;
119 struct page *page;
120
121 pfn = start_pfn;
122 /*
123 * Note: pageblock_nr_pages may differ from MAX_ORDER_NR_PAGES, so chunks of
124 * free pages are not necessarily aligned to pageblock_nr_pages.
125 * We therefore check the pageblock type first.
126 */
127 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
128 page = __first_valid_page(pfn, pageblock_nr_pages);
129 if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE)
130 break;
131 }
132 if (pfn < end_pfn)
133 return -EBUSY;
134 /* Check all pages are free or marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
136 return 0;
137 return -EBUSY;
138}
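
For orientation: __test_page_isolated_in_pageblock() above accepts a range only if every page it visits is either the head of a free buddy chunk (in which case it skips 1 << order pages) or a zero-refcount page whose private field says MIGRATE_ISOLATE. The toy userspace model below imitates that scan over an array of invented page descriptors; the struct, its fields and the sample layout are all hypothetical and exist only to show when a range counts as fully isolated.

#include <stdio.h>

/* Invented stand-in for the fields the kernel scan inspects. */
struct fake_page {
	int buddy_order;	/* >= 0: head of a free buddy chunk of that order */
	int count;		/* reference count */
	int isolated;		/* page_private() == MIGRATE_ISOLATE */
};

static int range_is_isolated(const struct fake_page *pages,
			     unsigned long start, unsigned long end)
{
	unsigned long pfn = start;

	while (pfn < end) {
		const struct fake_page *p = &pages[pfn];

		if (p->buddy_order >= 0)
			pfn += 1UL << p->buddy_order;	/* skip the free chunk */
		else if (p->count == 0 && p->isolated)
			pfn += 1;			/* isolated but not yet freed */
		else
			return 0;			/* still in use */
	}
	return 1;
}

int main(void)
{
	struct fake_page pages[8] = {
		{  2, 0, 0 },	/* pfn 0: head of a free order-2 chunk (0-3) */
		{ -1, 0, 0 },	/* pfn 1-3: tail pages of that chunk */
		{ -1, 0, 0 },
		{ -1, 0, 0 },
		{ -1, 0, 1 },	/* pfn 4: isolated, refcount 0 */
		{ -1, 0, 1 },	/* pfn 5: isolated, refcount 0 */
		{  1, 0, 0 },	/* pfn 6: head of a free order-1 chunk (6-7) */
		{ -1, 0, 0 },	/* pfn 7: tail page */
	};

	printf("range [0,8) isolated? %s\n",
	       range_is_isolated(pages, 0, 8) ? "yes" : "no");
	return 0;
}
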
diff --git a/mm/readahead.c b/mm/readahead.c
index be20c9d699d3..229788884010 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
22} 22}
23EXPORT_SYMBOL(default_unplug_io_fn); 23EXPORT_SYMBOL(default_unplug_io_fn);
24 24
25/*
26 * Convienent macros for min/max read-ahead pages.
27 * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
28 * The latter is necessary for systems with large page size(i.e. 64k).
29 */
30#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
31#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
32
33struct backing_dev_info default_backing_dev_info = { 25struct backing_dev_info default_backing_dev_info = {
34 .ra_pages = MAX_RA_PAGES, 26 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
35 .state = 0, 27 .state = 0,
36 .capabilities = BDI_CAP_MAP_COPY, 28 .capabilities = BDI_CAP_MAP_COPY,
37 .unplug_io_fn = default_unplug_io_fn, 29 .unplug_io_fn = default_unplug_io_fn,
@@ -46,7 +38,7 @@ void
46file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) 38file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
47{ 39{
48 ra->ra_pages = mapping->backing_dev_info->ra_pages; 40 ra->ra_pages = mapping->backing_dev_info->ra_pages;
49 ra->prev_index = -1; 41 ra->prev_pos = -1;
50} 42}
51EXPORT_SYMBOL_GPL(file_ra_state_init); 43EXPORT_SYMBOL_GPL(file_ra_state_init);
52 44
@@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
66 int (*filler)(void *, struct page *), void *data) 58 int (*filler)(void *, struct page *), void *data)
67{ 59{
68 struct page *page; 60 struct page *page;
69 struct pagevec lru_pvec;
70 int ret = 0; 61 int ret = 0;
71 62
72 pagevec_init(&lru_pvec, 0);
73
74 while (!list_empty(pages)) { 63 while (!list_empty(pages)) {
75 page = list_to_page(pages); 64 page = list_to_page(pages);
76 list_del(&page->lru); 65 list_del(&page->lru);
77 if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { 66 if (add_to_page_cache_lru(page, mapping,
67 page->index, GFP_KERNEL)) {
78 page_cache_release(page); 68 page_cache_release(page);
79 continue; 69 continue;
80 } 70 }
71 page_cache_release(page);
72
81 ret = filler(data, page); 73 ret = filler(data, page);
82 if (!pagevec_add(&lru_pvec, page)) 74 if (unlikely(ret)) {
83 __pagevec_lru_add(&lru_pvec);
84 if (ret) {
85 put_pages_list(pages); 75 put_pages_list(pages);
86 break; 76 break;
87 } 77 }
88 task_io_account_read(PAGE_CACHE_SIZE); 78 task_io_account_read(PAGE_CACHE_SIZE);
89 } 79 }
90 pagevec_lru_add(&lru_pvec);
91 return ret; 80 return ret;
92} 81}
93 82
@@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
97 struct list_head *pages, unsigned nr_pages) 86 struct list_head *pages, unsigned nr_pages)
98{ 87{
99 unsigned page_idx; 88 unsigned page_idx;
100 struct pagevec lru_pvec;
101 int ret; 89 int ret;
102 90
103 if (mapping->a_ops->readpages) { 91 if (mapping->a_ops->readpages) {
@@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp,
107 goto out; 95 goto out;
108 } 96 }
109 97
110 pagevec_init(&lru_pvec, 0);
111 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 98 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
112 struct page *page = list_to_page(pages); 99 struct page *page = list_to_page(pages);
113 list_del(&page->lru); 100 list_del(&page->lru);
114 if (!add_to_page_cache(page, mapping, 101 if (!add_to_page_cache_lru(page, mapping,
115 page->index, GFP_KERNEL)) { 102 page->index, GFP_KERNEL)) {
116 mapping->a_ops->readpage(filp, page); 103 mapping->a_ops->readpage(filp, page);
117 if (!pagevec_add(&lru_pvec, page)) 104 }
118 __pagevec_lru_add(&lru_pvec); 105 page_cache_release(page);
119 } else
120 page_cache_release(page);
121 } 106 }
122 pagevec_lru_add(&lru_pvec);
123 ret = 0; 107 ret = 0;
124out: 108out:
125 return ret; 109 return ret;
@@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
157 /* 141 /*
158 * Preallocate as many pages as we will need. 142 * Preallocate as many pages as we will need.
159 */ 143 */
160 read_lock_irq(&mapping->tree_lock);
161 for (page_idx = 0; page_idx < nr_to_read; page_idx++) { 144 for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
162 pgoff_t page_offset = offset + page_idx; 145 pgoff_t page_offset = offset + page_idx;
163 146
164 if (page_offset > end_index) 147 if (page_offset > end_index)
165 break; 148 break;
166 149
150 rcu_read_lock();
167 page = radix_tree_lookup(&mapping->page_tree, page_offset); 151 page = radix_tree_lookup(&mapping->page_tree, page_offset);
152 rcu_read_unlock();
168 if (page) 153 if (page)
169 continue; 154 continue;
170 155
171 read_unlock_irq(&mapping->tree_lock);
172 page = page_cache_alloc_cold(mapping); 156 page = page_cache_alloc_cold(mapping);
173 read_lock_irq(&mapping->tree_lock);
174 if (!page) 157 if (!page)
175 break; 158 break;
176 page->index = page_offset; 159 page->index = page_offset;
@@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
179 SetPageReadahead(page); 162 SetPageReadahead(page);
180 ret++; 163 ret++;
181 } 164 }
182 read_unlock_irq(&mapping->tree_lock);
183 165
184 /* 166 /*
185 * Now start the IO. We ignore I/O errors - if the page is not 167 * Now start the IO. We ignore I/O errors - if the page is not
@@ -327,7 +309,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
327 * indicator. The flag won't be set on already cached pages, to avoid the 309 * indicator. The flag won't be set on already cached pages, to avoid the
328 * readahead-for-nothing fuss, saving pointless page cache lookups. 310 * readahead-for-nothing fuss, saving pointless page cache lookups.
329 * 311 *
330 * prev_index tracks the last visited page in the _previous_ read request. 312 * prev_pos tracks the last visited byte in the _previous_ read request.
331 * It should be maintained by the caller, and will be used for detecting 313 * It should be maintained by the caller, and will be used for detecting
332 * small random reads. Note that the readahead algorithm checks loosely 314 * small random reads. Note that the readahead algorithm checks loosely
333 * for sequential patterns. Hence interleaved reads might be served as 315 * for sequential patterns. Hence interleaved reads might be served as
@@ -351,11 +333,9 @@ ondemand_readahead(struct address_space *mapping,
351 bool hit_readahead_marker, pgoff_t offset, 333 bool hit_readahead_marker, pgoff_t offset,
352 unsigned long req_size) 334 unsigned long req_size)
353{ 335{
354 unsigned long max; /* max readahead pages */ 336 int max = ra->ra_pages; /* max readahead pages */
355 int sequential; 337 pgoff_t prev_offset;
356 338 int sequential;
357 max = ra->ra_pages;
358 sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
359 339
360 /* 340 /*
361 * It's the expected callback offset, assume sequential access. 341 * It's the expected callback offset, assume sequential access.
@@ -369,6 +349,9 @@ ondemand_readahead(struct address_space *mapping,
369 goto readit; 349 goto readit;
370 } 350 }
371 351
352 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
353 sequential = offset - prev_offset <= 1UL || req_size > max;
354
372 /* 355 /*
373 * Standalone, small read. 356 * Standalone, small read.
374 * Read as is, and do not pollute the readahead state. 357 * Read as is, and do not pollute the readahead state.
@@ -379,6 +362,29 @@ ondemand_readahead(struct address_space *mapping,
379 } 362 }
380 363
381 /* 364 /*
365 * Hit a marked page without valid readahead state.
366 * E.g. interleaved reads.
367 * Query the pagecache for async_size, which normally equals the
368 * readahead size. Ramp it up and use it as the new readahead size.
369 */
370 if (hit_readahead_marker) {
371 pgoff_t start;
372
373 read_lock_irq(&mapping->tree_lock);
374 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
375 read_unlock_irq(&mapping->tree_lock);
376
377 if (!start || start - offset > max)
378 return 0;
379
380 ra->start = start;
381 ra->size = start - offset; /* old async_size */
382 ra->size = get_next_ra_size(ra, max);
383 ra->async_size = ra->size;
384 goto readit;
385 }
386
387 /*
382 * It may be one of 388 * It may be one of
383 * - first read on start of file 389 * - first read on start of file
384 * - sequential cache miss 390 * - sequential cache miss
@@ -389,16 +395,6 @@ ondemand_readahead(struct address_space *mapping,
389 ra->size = get_init_ra_size(req_size, max); 395 ra->size = get_init_ra_size(req_size, max);
390 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 396 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
391 397
392 /*
393 * Hit on a marked page without valid readahead state.
394 * E.g. interleaved reads.
395 * Not knowing its readahead pos/size, bet on the minimal possible one.
396 */
397 if (hit_readahead_marker) {
398 ra->start++;
399 ra->size = get_next_ra_size(ra, max);
400 }
401
402readit: 398readit:
403 return ra_submit(ra, mapping, filp); 399 return ra_submit(ra, mapping, filp);
404} 400}
diff --git a/mm/rmap.c b/mm/rmap.c
index 41ac39749ef4..2b9f413c9c00 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
436 entry = pte_wrprotect(entry); 436 entry = pte_wrprotect(entry);
437 entry = pte_mkclean(entry); 437 entry = pte_mkclean(entry);
438 set_pte_at(mm, address, pte, entry); 438 set_pte_at(mm, address, pte, entry);
439 lazy_mmu_prot_update(entry);
440 ret = 1; 439 ret = 1;
441 } 440 }
442 441
diff --git a/mm/shmem.c b/mm/shmem.c
index fcd19d323f9f..8a82342a8595 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@
49#include <linux/ctype.h> 49#include <linux/ctype.h>
50#include <linux/migrate.h> 50#include <linux/migrate.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/backing-dev.h>
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
55#include <asm/div64.h> 54#include <asm/div64.h>
@@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
96 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: 95 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
97 * might be reconsidered if it ever diverges from PAGE_SIZE. 96 * might be reconsidered if it ever diverges from PAGE_SIZE.
98 * 97 *
99 * __GFP_MOVABLE is masked out as swap vectors cannot move 98 * Mobility flags are masked out as swap vectors cannot move
100 */ 99 */
101 return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, 100 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
102 PAGE_CACHE_SHIFT-PAGE_SHIFT); 101 PAGE_CACHE_SHIFT-PAGE_SHIFT);
103} 102}
104 103
@@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
972 *nodelist++ = '\0'; 971 *nodelist++ = '\0';
973 if (nodelist_parse(nodelist, *policy_nodes)) 972 if (nodelist_parse(nodelist, *policy_nodes))
974 goto out; 973 goto out;
975 if (!nodes_subset(*policy_nodes, node_online_map)) 974 if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
976 goto out; 975 goto out;
977 } 976 }
978 if (!strcmp(value, "default")) { 977 if (!strcmp(value, "default")) {
@@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
997 err = 0; 996 err = 0;
998 } else if (!strcmp(value, "interleave")) { 997 } else if (!strcmp(value, "interleave")) {
999 *policy = MPOL_INTERLEAVE; 998 *policy = MPOL_INTERLEAVE;
1000 /* Default to nodes online if no nodelist */ 999 /*
1000 * Default to online nodes with memory if no nodelist
1001 */
1001 if (!nodelist) 1002 if (!nodelist)
1002 *policy_nodes = node_online_map; 1003 *policy_nodes = node_states[N_HIGH_MEMORY];
1003 err = 0; 1004 err = 0;
1004 } 1005 }
1005out: 1006out:
@@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p,
1025 return page; 1026 return page;
1026} 1027}
1027 1028
1028struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, 1029static struct page *shmem_swapin(struct shmem_inode_info *info,
1029 unsigned long idx) 1030 swp_entry_t entry, unsigned long idx)
1030{ 1031{
1031 struct shared_policy *p = &info->policy; 1032 struct shared_policy *p = &info->policy;
1032 int i, num; 1033 int i, num;
@@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1061 return page; 1062 return page;
1062} 1063}
1063#else 1064#else
1064static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 1065static inline int shmem_parse_mpol(char *value, int *policy,
1066 nodemask_t *policy_nodes)
1065{ 1067{
1066 return 1; 1068 return 1;
1067} 1069}
@@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1109 * Normally, filepage is NULL on entry, and either found 1111 * Normally, filepage is NULL on entry, and either found
1110 * uptodate immediately, or allocated and zeroed, or read 1112 * uptodate immediately, or allocated and zeroed, or read
1111 * in under swappage, which is then assigned to filepage. 1113 * in under swappage, which is then assigned to filepage.
1112 * But shmem_readpage and shmem_prepare_write pass in a locked 1114 * But shmem_readpage and shmem_write_begin pass in a locked
1113 * filepage, which may be found not uptodate by other callers 1115 * filepage, which may be found not uptodate by other callers
1114 * too, and may need to be copied from the swappage read in. 1116 * too, and may need to be copied from the swappage read in.
1115 */ 1117 */
@@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1327} 1329}
1328 1330
1329#ifdef CONFIG_NUMA 1331#ifdef CONFIG_NUMA
1330int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1332static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1331{ 1333{
1332 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1334 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1333 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1335 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1334} 1336}
1335 1337
1336struct mempolicy * 1338static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1337shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1339 unsigned long addr)
1338{ 1340{
1339 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1341 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1340 unsigned long idx; 1342 unsigned long idx;
@@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations;
1446static const struct inode_operations shmem_symlink_inline_operations; 1448static const struct inode_operations shmem_symlink_inline_operations;
1447 1449
1448/* 1450/*
1449 * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; 1451 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1450 * but providing them allows a tmpfs file to be used for splice, sendfile, and 1452 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1451 * below the loop driver, in the generic fashion that many filesystems support. 1453 * below the loop driver, in the generic fashion that many filesystems support.
1452 */ 1454 */
@@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page)
1459} 1461}
1460 1462
1461static int 1463static int
1462shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) 1464shmem_write_begin(struct file *file, struct address_space *mapping,
1465 loff_t pos, unsigned len, unsigned flags,
1466 struct page **pagep, void **fsdata)
1463{ 1467{
1464 struct inode *inode = page->mapping->host; 1468 struct inode *inode = mapping->host;
1465 return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); 1469 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1470 *pagep = NULL;
1471 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1472}
1473
1474static int
1475shmem_write_end(struct file *file, struct address_space *mapping,
1476 loff_t pos, unsigned len, unsigned copied,
1477 struct page *page, void *fsdata)
1478{
1479 struct inode *inode = mapping->host;
1480
1481 set_page_dirty(page);
1482 page_cache_release(page);
1483
1484 if (pos+copied > inode->i_size)
1485 i_size_write(inode, pos+copied);
1486
1487 return copied;
1466} 1488}
1467 1489
1468static ssize_t 1490static ssize_t
@@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb,
2219 unsigned long blocks = 0; 2241 unsigned long blocks = 0;
2220 unsigned long inodes = 0; 2242 unsigned long inodes = 0;
2221 int policy = MPOL_DEFAULT; 2243 int policy = MPOL_DEFAULT;
2222 nodemask_t policy_nodes = node_online_map; 2244 nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
2223 2245
2224#ifdef CONFIG_TMPFS 2246#ifdef CONFIG_TMPFS
2225 /* 2247 /*
@@ -2338,8 +2360,8 @@ static const struct address_space_operations shmem_aops = {
2338 .set_page_dirty = __set_page_dirty_no_writeback, 2360 .set_page_dirty = __set_page_dirty_no_writeback,
2339#ifdef CONFIG_TMPFS 2361#ifdef CONFIG_TMPFS
2340 .readpage = shmem_readpage, 2362 .readpage = shmem_readpage,
2341 .prepare_write = shmem_prepare_write, 2363 .write_begin = shmem_write_begin,
2342 .commit_write = simple_commit_write, 2364 .write_end = shmem_write_end,
2343#endif 2365#endif
2344 .migratepage = migrate_page, 2366 .migratepage = migrate_page,
2345}; 2367};
diff --git a/mm/slab.c b/mm/slab.c
index 6f6abef83a1a..e34bcb87a6ee 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1568,7 +1568,7 @@ void __init kmem_cache_init(void)
1568 /* Replace the static kmem_list3 structures for the boot cpu */ 1568 /* Replace the static kmem_list3 structures for the boot cpu */
1569 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); 1569 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1570 1570
1571 for_each_online_node(nid) { 1571 for_each_node_state(nid, N_NORMAL_MEMORY) {
1572 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1572 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1573 &initkmem_list3[SIZE_AC + nid], nid); 1573 &initkmem_list3[SIZE_AC + nid], nid);
1574 1574
@@ -1643,6 +1643,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1643#endif 1643#endif
1644 1644
1645 flags |= cachep->gfpflags; 1645 flags |= cachep->gfpflags;
1646 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1647 flags |= __GFP_RECLAIMABLE;
1646 1648
1647 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1649 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1648 if (!page) 1650 if (!page)
@@ -1944,7 +1946,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1944{ 1946{
1945 int node; 1947 int node;
1946 1948
1947 for_each_online_node(node) { 1949 for_each_node_state(node, N_NORMAL_MEMORY) {
1948 cachep->nodelists[node] = &initkmem_list3[index + node]; 1950 cachep->nodelists[node] = &initkmem_list3[index + node];
1949 cachep->nodelists[node]->next_reap = jiffies + 1951 cachep->nodelists[node]->next_reap = jiffies +
1950 REAPTIMEOUT_LIST3 + 1952 REAPTIMEOUT_LIST3 +
@@ -2075,7 +2077,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2075 g_cpucache_up = PARTIAL_L3; 2077 g_cpucache_up = PARTIAL_L3;
2076 } else { 2078 } else {
2077 int node; 2079 int node;
2078 for_each_online_node(node) { 2080 for_each_node_state(node, N_NORMAL_MEMORY) {
2079 cachep->nodelists[node] = 2081 cachep->nodelists[node] =
2080 kmalloc_node(sizeof(struct kmem_list3), 2082 kmalloc_node(sizeof(struct kmem_list3),
2081 GFP_KERNEL, node); 2083 GFP_KERNEL, node);
@@ -2746,9 +2748,9 @@ static int cache_grow(struct kmem_cache *cachep,
2746 * Be lazy and only check for valid flags here, keeping it out of the 2748 * Be lazy and only check for valid flags here, keeping it out of the
2747 * critical path in kmem_cache_alloc(). 2749 * critical path in kmem_cache_alloc().
2748 */ 2750 */
2749 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 2751 BUG_ON(flags & GFP_SLAB_BUG_MASK);
2752 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2750 2753
2751 local_flags = (flags & GFP_LEVEL_MASK);
2752 /* Take the l3 list lock to change the colour_next on this node */ 2754 /* Take the l3 list lock to change the colour_next on this node */
2753 check_irq_off(); 2755 check_irq_off();
2754 l3 = cachep->nodelists[nodeid]; 2756 l3 = cachep->nodelists[nodeid];
@@ -2785,7 +2787,7 @@ static int cache_grow(struct kmem_cache *cachep,
2785 2787
2786 /* Get slab management. */ 2788 /* Get slab management. */
2787 slabp = alloc_slabmgmt(cachep, objp, offset, 2789 slabp = alloc_slabmgmt(cachep, objp, offset,
2788 local_flags & ~GFP_THISNODE, nodeid); 2790 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2789 if (!slabp) 2791 if (!slabp)
2790 goto opps1; 2792 goto opps1;
2791 2793
@@ -3225,7 +3227,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3225 3227
3226 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3228 zonelist = &NODE_DATA(slab_node(current->mempolicy))
3227 ->node_zonelists[gfp_zone(flags)]; 3229 ->node_zonelists[gfp_zone(flags)];
3228 local_flags = (flags & GFP_LEVEL_MASK); 3230 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3229 3231
3230retry: 3232retry:
3231 /* 3233 /*
@@ -3792,7 +3794,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3792 struct array_cache *new_shared; 3794 struct array_cache *new_shared;
3793 struct array_cache **new_alien = NULL; 3795 struct array_cache **new_alien = NULL;
3794 3796
3795 for_each_online_node(node) { 3797 for_each_node_state(node, N_NORMAL_MEMORY) {
3796 3798
3797 if (use_alien_caches) { 3799 if (use_alien_caches) {
3798 new_alien = alloc_alien_cache(node, cachep->limit); 3800 new_alien = alloc_alien_cache(node, cachep->limit);
@@ -4446,7 +4448,8 @@ const struct seq_operations slabstats_op = {
4446 */ 4448 */
4447size_t ksize(const void *objp) 4449size_t ksize(const void *objp)
4448{ 4450{
4449 if (unlikely(ZERO_OR_NULL_PTR(objp))) 4451 BUG_ON(!objp);
4452 if (unlikely(objp == ZERO_SIZE_PTR))
4450 return 0; 4453 return 0;
4451 4454
4452 return obj_size(virt_to_cache(objp)); 4455 return obj_size(virt_to_cache(objp));
diff --git a/mm/slob.c b/mm/slob.c
index ec33fcdc852e..de5d5563a46c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 slobidx_t units; 360 slobidx_t units;
361 unsigned long flags; 361 unsigned long flags;
362 362
363 if (ZERO_OR_NULL_PTR(block)) 363 if (unlikely(ZERO_OR_NULL_PTR(block)))
364 return; 364 return;
365 BUG_ON(!size); 365 BUG_ON(!size);
366 366
@@ -466,7 +466,7 @@ void kfree(const void *block)
466{ 466{
467 struct slob_page *sp; 467 struct slob_page *sp;
468 468
469 if (ZERO_OR_NULL_PTR(block)) 469 if (unlikely(ZERO_OR_NULL_PTR(block)))
470 return; 470 return;
471 471
472 sp = (struct slob_page *)virt_to_page(block); 472 sp = (struct slob_page *)virt_to_page(block);
@@ -484,7 +484,8 @@ size_t ksize(const void *block)
484{ 484{
485 struct slob_page *sp; 485 struct slob_page *sp;
486 486
487 if (ZERO_OR_NULL_PTR(block)) 487 BUG_ON(!block);
488 if (unlikely(block == ZERO_SIZE_PTR))
488 return 0; 489 return 0;
489 490
490 sp = (struct slob_page *)virt_to_page(block); 491 sp = (struct slob_page *)virt_to_page(block);
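
The slab and slob ksize() hunks above change the convention for degenerate pointers: passing NULL is now a bug, while ZERO_SIZE_PTR (the result of kmalloc(0)) reports a size of 0. The runnable userspace model below mirrors the two macros from include/linux/slab.h and the new ksize() behaviour; toy_ksize() and its return value are illustrative stand-ins for obj_size().

/* Userspace model of the ZERO_SIZE_PTR convention used in the hunks above. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define ZERO_SIZE_PTR ((void *)16)
#define ZERO_OR_NULL_PTR(x) \
	((unsigned long)(x) <= (unsigned long)ZERO_SIZE_PTR)

static size_t toy_ksize(const void *objp)
{
	assert(objp != NULL);		/* kernel: BUG_ON(!objp) */
	if (objp == ZERO_SIZE_PTR)	/* kmalloc(0) result */
		return 0;
	return 42;			/* stand-in for obj_size(virt_to_cache(objp)) */
}

int main(void)
{
	printf("%zu\n", toy_ksize(ZERO_SIZE_PTR));	/* prints 0 */
	printf("%d\n", ZERO_OR_NULL_PTR(NULL));		/* prints 1: kfree() still short-circuits */
	return 0;
}
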
diff --git a/mm/slub.c b/mm/slub.c
index addb20a6d67d..f426f9bc644b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -90,7 +90,7 @@
90 * One use of this flag is to mark slabs that are 90 * One use of this flag is to mark slabs that are
91 * used for allocations. Then such a slab becomes a cpu 91 * used for allocations. Then such a slab becomes a cpu
92 * slab. The cpu slab may be equipped with an additional 92 * slab. The cpu slab may be equipped with an additional
93 * lockless_freelist that allows lockless access to 93 * freelist that allows lockless access to
94 * free objects in addition to the regular freelist 94 * free objects in addition to the regular freelist
95 * that requires the slab lock. 95 * that requires the slab lock.
96 * 96 *
@@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page)
140/* 140/*
141 * Issues still to be resolved: 141 * Issues still to be resolved:
142 * 142 *
143 * - The per cpu array is updated for each new slab and and is a remote
144 * cacheline for most nodes. This could become a bouncing cacheline given
145 * enough frequent updates. There are 16 pointers in a cacheline, so at
146 * max 16 cpus could compete for the cacheline which may be okay.
147 *
148 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 143 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
149 * 144 *
150 * - Variable sizing of the per node arrays 145 * - Variable sizing of the per node arrays
@@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page)
205#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 200#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
206#endif 201#endif
207 202
208/*
209 * The page->inuse field is 16 bit thus we have this limitation
210 */
211#define MAX_OBJECTS_PER_SLAB 65535
212
213/* Internal SLUB flags */ 203/* Internal SLUB flags */
214#define __OBJECT_POISON 0x80000000 /* Poison object */ 204#define __OBJECT_POISON 0x80000000 /* Poison object */
215#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 205#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
@@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
277#endif 267#endif
278} 268}
279 269
270static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
271{
272#ifdef CONFIG_SMP
273 return s->cpu_slab[cpu];
274#else
275 return &s->cpu_slab;
276#endif
277}
278
280static inline int check_valid_pointer(struct kmem_cache *s, 279static inline int check_valid_pointer(struct kmem_cache *s,
281 struct page *page, const void *object) 280 struct page *page, const void *object)
282{ 281{
@@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page)
729 slab_err(s, page, "Not a valid slab page"); 728 slab_err(s, page, "Not a valid slab page");
730 return 0; 729 return 0;
731 } 730 }
732 if (page->offset * sizeof(void *) != s->offset) {
733 slab_err(s, page, "Corrupted offset %lu",
734 (unsigned long)(page->offset * sizeof(void *)));
735 return 0;
736 }
737 if (page->inuse > s->objects) { 731 if (page->inuse > s->objects) {
738 slab_err(s, page, "inuse %u > max %u", 732 slab_err(s, page, "inuse %u > max %u",
739 s->name, page->inuse, s->objects); 733 s->name, page->inuse, s->objects);
@@ -872,8 +866,6 @@ bad:
872 slab_fix(s, "Marking all objects used"); 866 slab_fix(s, "Marking all objects used");
873 page->inuse = s->objects; 867 page->inuse = s->objects;
874 page->freelist = NULL; 868 page->freelist = NULL;
875 /* Fix up fields that may be corrupted */
876 page->offset = s->offset / sizeof(void *);
877 } 869 }
878 return 0; 870 return 0;
879} 871}
@@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1055 if (s->flags & SLAB_CACHE_DMA) 1047 if (s->flags & SLAB_CACHE_DMA)
1056 flags |= SLUB_DMA; 1048 flags |= SLUB_DMA;
1057 1049
1050 if (s->flags & SLAB_RECLAIM_ACCOUNT)
1051 flags |= __GFP_RECLAIMABLE;
1052
1058 if (node == -1) 1053 if (node == -1)
1059 page = alloc_pages(flags, s->order); 1054 page = alloc_pages(flags, s->order);
1060 else 1055 else
@@ -1088,19 +1083,19 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1088 void *last; 1083 void *last;
1089 void *p; 1084 void *p;
1090 1085
1091 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 1086 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1092 1087
1093 if (flags & __GFP_WAIT) 1088 if (flags & __GFP_WAIT)
1094 local_irq_enable(); 1089 local_irq_enable();
1095 1090
1096 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 1091 page = allocate_slab(s,
1092 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1097 if (!page) 1093 if (!page)
1098 goto out; 1094 goto out;
1099 1095
1100 n = get_node(s, page_to_nid(page)); 1096 n = get_node(s, page_to_nid(page));
1101 if (n) 1097 if (n)
1102 atomic_long_inc(&n->nr_slabs); 1098 atomic_long_inc(&n->nr_slabs);
1103 page->offset = s->offset / sizeof(void *);
1104 page->slab = s; 1099 page->slab = s;
1105 page->flags |= 1 << PG_slab; 1100 page->flags |= 1 << PG_slab;
1106 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1101 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1123,7 +1118,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1123 set_freepointer(s, last, NULL); 1118 set_freepointer(s, last, NULL);
1124 1119
1125 page->freelist = start; 1120 page->freelist = start;
1126 page->lockless_freelist = NULL;
1127 page->inuse = 0; 1121 page->inuse = 0;
1128out: 1122out:
1129 if (flags & __GFP_WAIT) 1123 if (flags & __GFP_WAIT)
@@ -1149,7 +1143,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1149 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1143 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1150 - pages); 1144 - pages);
1151 1145
1152 page->mapping = NULL;
1153 __free_pages(page, s->order); 1146 __free_pages(page, s->order);
1154} 1147}
1155 1148
@@ -1383,33 +1376,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1383/* 1376/*
1384 * Remove the cpu slab 1377 * Remove the cpu slab
1385 */ 1378 */
1386static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1379static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1387{ 1380{
1381 struct page *page = c->page;
1388 /* 1382 /*
1389 * Merge cpu freelist into freelist. Typically we get here 1383 * Merge cpu freelist into freelist. Typically we get here
1390 * because both freelists are empty. So this is unlikely 1384 * because both freelists are empty. So this is unlikely
1391 * to occur. 1385 * to occur.
1392 */ 1386 */
1393 while (unlikely(page->lockless_freelist)) { 1387 while (unlikely(c->freelist)) {
1394 void **object; 1388 void **object;
1395 1389
1396 /* Retrieve object from cpu_freelist */ 1390 /* Retrieve object from cpu_freelist */
1397 object = page->lockless_freelist; 1391 object = c->freelist;
1398 page->lockless_freelist = page->lockless_freelist[page->offset]; 1392 c->freelist = c->freelist[c->offset];
1399 1393
1400 /* And put onto the regular freelist */ 1394 /* And put onto the regular freelist */
1401 object[page->offset] = page->freelist; 1395 object[c->offset] = page->freelist;
1402 page->freelist = object; 1396 page->freelist = object;
1403 page->inuse--; 1397 page->inuse--;
1404 } 1398 }
1405 s->cpu_slab[cpu] = NULL; 1399 c->page = NULL;
1406 unfreeze_slab(s, page); 1400 unfreeze_slab(s, page);
1407} 1401}
1408 1402
1409static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1403static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1410{ 1404{
1411 slab_lock(page); 1405 slab_lock(c->page);
1412 deactivate_slab(s, page, cpu); 1406 deactivate_slab(s, c);
1413} 1407}
1414 1408
1415/* 1409/*
@@ -1418,18 +1412,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
1418 */ 1412 */
1419static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1413static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1420{ 1414{
1421 struct page *page = s->cpu_slab[cpu]; 1415 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1422 1416
1423 if (likely(page)) 1417 if (likely(c && c->page))
1424 flush_slab(s, page, cpu); 1418 flush_slab(s, c);
1425} 1419}
1426 1420
1427static void flush_cpu_slab(void *d) 1421static void flush_cpu_slab(void *d)
1428{ 1422{
1429 struct kmem_cache *s = d; 1423 struct kmem_cache *s = d;
1430 int cpu = smp_processor_id();
1431 1424
1432 __flush_cpu_slab(s, cpu); 1425 __flush_cpu_slab(s, smp_processor_id());
1433} 1426}
1434 1427
1435static void flush_all(struct kmem_cache *s) 1428static void flush_all(struct kmem_cache *s)
@@ -1446,6 +1439,19 @@ static void flush_all(struct kmem_cache *s)
1446} 1439}
1447 1440
1448/* 1441/*
1442 * Check if the objects in a per cpu structure fit numa
1443 * locality expectations.
1444 */
1445static inline int node_match(struct kmem_cache_cpu *c, int node)
1446{
1447#ifdef CONFIG_NUMA
1448 if (node != -1 && c->node != node)
1449 return 0;
1450#endif
1451 return 1;
1452}
1453
1454/*
1449 * Slow path. The lockless freelist is empty or we need to perform 1455 * Slow path. The lockless freelist is empty or we need to perform
1450 * debugging duties. 1456 * debugging duties.
1451 * 1457 *
@@ -1463,45 +1469,46 @@ static void flush_all(struct kmem_cache *s)
1463 * we need to allocate a new slab. This is slowest path since we may sleep. 1469 * we need to allocate a new slab. This is slowest path since we may sleep.
1464 */ 1470 */
1465static void *__slab_alloc(struct kmem_cache *s, 1471static void *__slab_alloc(struct kmem_cache *s,
1466 gfp_t gfpflags, int node, void *addr, struct page *page) 1472 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
1467{ 1473{
1468 void **object; 1474 void **object;
1469 int cpu = smp_processor_id(); 1475 struct page *new;
1470 1476
1471 if (!page) 1477 if (!c->page)
1472 goto new_slab; 1478 goto new_slab;
1473 1479
1474 slab_lock(page); 1480 slab_lock(c->page);
1475 if (unlikely(node != -1 && page_to_nid(page) != node)) 1481 if (unlikely(!node_match(c, node)))
1476 goto another_slab; 1482 goto another_slab;
1477load_freelist: 1483load_freelist:
1478 object = page->freelist; 1484 object = c->page->freelist;
1479 if (unlikely(!object)) 1485 if (unlikely(!object))
1480 goto another_slab; 1486 goto another_slab;
1481 if (unlikely(SlabDebug(page))) 1487 if (unlikely(SlabDebug(c->page)))
1482 goto debug; 1488 goto debug;
1483 1489
1484 object = page->freelist; 1490 object = c->page->freelist;
1485 page->lockless_freelist = object[page->offset]; 1491 c->freelist = object[c->offset];
1486 page->inuse = s->objects; 1492 c->page->inuse = s->objects;
1487 page->freelist = NULL; 1493 c->page->freelist = NULL;
1488 slab_unlock(page); 1494 c->node = page_to_nid(c->page);
1495 slab_unlock(c->page);
1489 return object; 1496 return object;
1490 1497
1491another_slab: 1498another_slab:
1492 deactivate_slab(s, page, cpu); 1499 deactivate_slab(s, c);
1493 1500
1494new_slab: 1501new_slab:
1495 page = get_partial(s, gfpflags, node); 1502 new = get_partial(s, gfpflags, node);
1496 if (page) { 1503 if (new) {
1497 s->cpu_slab[cpu] = page; 1504 c->page = new;
1498 goto load_freelist; 1505 goto load_freelist;
1499 } 1506 }
1500 1507
1501 page = new_slab(s, gfpflags, node); 1508 new = new_slab(s, gfpflags, node);
1502 if (page) { 1509 if (new) {
1503 cpu = smp_processor_id(); 1510 c = get_cpu_slab(s, smp_processor_id());
1504 if (s->cpu_slab[cpu]) { 1511 if (c->page) {
1505 /* 1512 /*
1506 * Someone else populated the cpu_slab while we 1513 * Someone else populated the cpu_slab while we
1507 * enabled interrupts, or we have gotten scheduled 1514 * enabled interrupts, or we have gotten scheduled
@@ -1509,34 +1516,33 @@ new_slab:
1509 * requested node even if __GFP_THISNODE was 1516 * requested node even if __GFP_THISNODE was
1510 * specified. So we need to recheck. 1517 * specified. So we need to recheck.
1511 */ 1518 */
1512 if (node == -1 || 1519 if (node_match(c, node)) {
1513 page_to_nid(s->cpu_slab[cpu]) == node) {
1514 /* 1520 /*
1515 * Current cpuslab is acceptable and we 1521 * Current cpuslab is acceptable and we
1516 * want the current one since its cache hot 1522 * want the current one since its cache hot
1517 */ 1523 */
1518 discard_slab(s, page); 1524 discard_slab(s, new);
1519 page = s->cpu_slab[cpu]; 1525 slab_lock(c->page);
1520 slab_lock(page);
1521 goto load_freelist; 1526 goto load_freelist;
1522 } 1527 }
1523 /* New slab does not fit our expectations */ 1528 /* New slab does not fit our expectations */
1524 flush_slab(s, s->cpu_slab[cpu], cpu); 1529 flush_slab(s, c);
1525 } 1530 }
1526 slab_lock(page); 1531 slab_lock(new);
1527 SetSlabFrozen(page); 1532 SetSlabFrozen(new);
1528 s->cpu_slab[cpu] = page; 1533 c->page = new;
1529 goto load_freelist; 1534 goto load_freelist;
1530 } 1535 }
1531 return NULL; 1536 return NULL;
1532debug: 1537debug:
1533 object = page->freelist; 1538 object = c->page->freelist;
1534 if (!alloc_debug_processing(s, page, object, addr)) 1539 if (!alloc_debug_processing(s, c->page, object, addr))
1535 goto another_slab; 1540 goto another_slab;
1536 1541
1537 page->inuse++; 1542 c->page->inuse++;
1538 page->freelist = object[page->offset]; 1543 c->page->freelist = object[c->offset];
1539 slab_unlock(page); 1544 c->node = -1;
1545 slab_unlock(c->page);
1540 return object; 1546 return object;
1541} 1547}
1542 1548
@@ -1553,25 +1559,24 @@ debug:
1553static void __always_inline *slab_alloc(struct kmem_cache *s, 1559static void __always_inline *slab_alloc(struct kmem_cache *s,
1554 gfp_t gfpflags, int node, void *addr) 1560 gfp_t gfpflags, int node, void *addr)
1555{ 1561{
1556 struct page *page;
1557 void **object; 1562 void **object;
1558 unsigned long flags; 1563 unsigned long flags;
1564 struct kmem_cache_cpu *c;
1559 1565
1560 local_irq_save(flags); 1566 local_irq_save(flags);
1561 page = s->cpu_slab[smp_processor_id()]; 1567 c = get_cpu_slab(s, smp_processor_id());
1562 if (unlikely(!page || !page->lockless_freelist || 1568 if (unlikely(!c->freelist || !node_match(c, node)))
1563 (node != -1 && page_to_nid(page) != node)))
1564 1569
1565 object = __slab_alloc(s, gfpflags, node, addr, page); 1570 object = __slab_alloc(s, gfpflags, node, addr, c);
1566 1571
1567 else { 1572 else {
1568 object = page->lockless_freelist; 1573 object = c->freelist;
1569 page->lockless_freelist = object[page->offset]; 1574 c->freelist = object[c->offset];
1570 } 1575 }
1571 local_irq_restore(flags); 1576 local_irq_restore(flags);
1572 1577
1573 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1578 if (unlikely((gfpflags & __GFP_ZERO) && object))
1574 memset(object, 0, s->objsize); 1579 memset(object, 0, c->objsize);
1575 1580
1576 return object; 1581 return object;
1577} 1582}
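
The fast-path hunks above replace page->lockless_freelist with the per-cpu kmem_cache_cpu fields c->freelist and c->offset; the key trick is that the "next free" pointer lives at a word offset inside each free object, so popping the head is just object = freelist; freelist = object[offset]. The runnable userspace model below demonstrates that indexing scheme; the object count, size and offset are hypothetical values, not SLUB's.

/* Userspace model of an offset-embedded freelist as walked in the fast path above. */
#include <stdio.h>
#include <stdlib.h>

#define NOBJ  4
#define WORDS 8		/* object size in pointers; hypothetical */

int main(void)
{
	void **slab = calloc(NOBJ * WORDS, sizeof(void *));
	unsigned int offset = 0;	/* like kmem_cache_cpu->offset */
	void **freelist = NULL;
	int i;

	/* Build the freelist: each free object stores a pointer to the next one. */
	for (i = NOBJ - 1; i >= 0; i--) {
		void **object = slab + i * WORDS;
		object[offset] = freelist;
		freelist = object;
	}

	/* Fast-path allocation: pop the head, follow the link embedded in the object. */
	while (freelist) {
		void **object = freelist;
		freelist = object[offset];
		printf("allocated object at %p\n", (void *)object);
	}
	free(slab);
	return 0;
}
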
@@ -1599,7 +1604,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
1599 * handling required then we can return immediately. 1604 * handling required then we can return immediately.
1600 */ 1605 */
1601static void __slab_free(struct kmem_cache *s, struct page *page, 1606static void __slab_free(struct kmem_cache *s, struct page *page,
1602 void *x, void *addr) 1607 void *x, void *addr, unsigned int offset)
1603{ 1608{
1604 void *prior; 1609 void *prior;
1605 void **object = (void *)x; 1610 void **object = (void *)x;
@@ -1609,7 +1614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1609 if (unlikely(SlabDebug(page))) 1614 if (unlikely(SlabDebug(page)))
1610 goto debug; 1615 goto debug;
1611checks_ok: 1616checks_ok:
1612 prior = object[page->offset] = page->freelist; 1617 prior = object[offset] = page->freelist;
1613 page->freelist = object; 1618 page->freelist = object;
1614 page->inuse--; 1619 page->inuse--;
1615 1620
@@ -1664,15 +1669,16 @@ static void __always_inline slab_free(struct kmem_cache *s,
1664{ 1669{
1665 void **object = (void *)x; 1670 void **object = (void *)x;
1666 unsigned long flags; 1671 unsigned long flags;
1672 struct kmem_cache_cpu *c;
1667 1673
1668 local_irq_save(flags); 1674 local_irq_save(flags);
1669 debug_check_no_locks_freed(object, s->objsize); 1675 debug_check_no_locks_freed(object, s->objsize);
1670 if (likely(page == s->cpu_slab[smp_processor_id()] && 1676 c = get_cpu_slab(s, smp_processor_id());
1671 !SlabDebug(page))) { 1677 if (likely(page == c->page && c->node >= 0)) {
1672 object[page->offset] = page->lockless_freelist; 1678 object[c->offset] = c->freelist;
1673 page->lockless_freelist = object; 1679 c->freelist = object;
1674 } else 1680 } else
1675 __slab_free(s, page, x, addr); 1681 __slab_free(s, page, x, addr, c->offset);
1676 1682
1677 local_irq_restore(flags); 1683 local_irq_restore(flags);
1678} 1684}
@@ -1759,14 +1765,6 @@ static inline int slab_order(int size, int min_objects,
1759 int rem; 1765 int rem;
1760 int min_order = slub_min_order; 1766 int min_order = slub_min_order;
1761 1767
1762 /*
1763 * If we would create too many object per slab then reduce
1764 * the slab order even if it goes below slub_min_order.
1765 */
1766 while (min_order > 0 &&
1767 (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
1768 min_order--;
1769
1770 for (order = max(min_order, 1768 for (order = max(min_order,
1771 fls(min_objects * size - 1) - PAGE_SHIFT); 1769 fls(min_objects * size - 1) - PAGE_SHIFT);
1772 order <= max_order; order++) { 1770 order <= max_order; order++) {
@@ -1781,9 +1779,6 @@ static inline int slab_order(int size, int min_objects,
1781 if (rem <= slab_size / fract_leftover) 1779 if (rem <= slab_size / fract_leftover)
1782 break; 1780 break;
1783 1781
1784 /* If the next size is too high then exit now */
1785 if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
1786 break;
1787 } 1782 }
1788 1783
1789 return order; 1784 return order;
@@ -1858,6 +1853,16 @@ static unsigned long calculate_alignment(unsigned long flags,
1858 return ALIGN(align, sizeof(void *)); 1853 return ALIGN(align, sizeof(void *));
1859} 1854}
1860 1855
1856static void init_kmem_cache_cpu(struct kmem_cache *s,
1857 struct kmem_cache_cpu *c)
1858{
1859 c->page = NULL;
1860 c->freelist = NULL;
1861 c->node = 0;
1862 c->offset = s->offset / sizeof(void *);
1863 c->objsize = s->objsize;
1864}
1865
1861static void init_kmem_cache_node(struct kmem_cache_node *n) 1866static void init_kmem_cache_node(struct kmem_cache_node *n)
1862{ 1867{
1863 n->nr_partial = 0; 1868 n->nr_partial = 0;
@@ -1869,6 +1874,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
1869#endif 1874#endif
1870} 1875}
1871 1876
1877#ifdef CONFIG_SMP
1878/*
1879 * Per cpu array for per cpu structures.
1880 *
1881 * The per cpu array places all kmem_cache_cpu structures from one processor
1882 * close together meaning that it becomes possible that multiple per cpu
1883 * structures are contained in one cacheline. This may be particularly
1884 * beneficial for the kmalloc caches.
1885 *
1886 * A desktop system typically has around 60-80 slabs. With 100 here we are
1887 * likely able to get per cpu structures for all caches from the array defined
1888 * here. We must be able to cover all kmalloc caches during bootstrap.
1889 *
1890 * If the per cpu array is exhausted then fall back to kmalloc
1891 * of individual cachelines. No sharing is possible then.
1892 */
1893#define NR_KMEM_CACHE_CPU 100
1894
1895static DEFINE_PER_CPU(struct kmem_cache_cpu,
1896 kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1897
1898static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1899static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
1900
1901static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1902 int cpu, gfp_t flags)
1903{
1904 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
1905
1906 if (c)
1907 per_cpu(kmem_cache_cpu_free, cpu) =
1908 (void *)c->freelist;
1909 else {
1910 /* Table overflow: So allocate ourselves */
1911 c = kmalloc_node(
1912 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
1913 flags, cpu_to_node(cpu));
1914 if (!c)
1915 return NULL;
1916 }
1917
1918 init_kmem_cache_cpu(s, c);
1919 return c;
1920}
1921
1922static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1923{
1924 if (c < per_cpu(kmem_cache_cpu, cpu) ||
1925 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1926 kfree(c);
1927 return;
1928 }
1929 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
1930 per_cpu(kmem_cache_cpu_free, cpu) = c;
1931}
1932
1933static void free_kmem_cache_cpus(struct kmem_cache *s)
1934{
1935 int cpu;
1936
1937 for_each_online_cpu(cpu) {
1938 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1939
1940 if (c) {
1941 s->cpu_slab[cpu] = NULL;
1942 free_kmem_cache_cpu(c, cpu);
1943 }
1944 }
1945}
1946
1947static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1948{
1949 int cpu;
1950
1951 for_each_online_cpu(cpu) {
1952 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1953
1954 if (c)
1955 continue;
1956
1957 c = alloc_kmem_cache_cpu(s, cpu, flags);
1958 if (!c) {
1959 free_kmem_cache_cpus(s);
1960 return 0;
1961 }
1962 s->cpu_slab[cpu] = c;
1963 }
1964 return 1;
1965}
1966
1967/*
1968 * Initialize the per cpu array.
1969 */
1970static void init_alloc_cpu_cpu(int cpu)
1971{
1972 int i;
1973
1974 if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
1975 return;
1976
1977 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
1978 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
1979
1980 cpu_set(cpu, kmem_cach_cpu_free_init_once);
1981}
1982
1983static void __init init_alloc_cpu(void)
1984{
1985 int cpu;
1986
1987 for_each_online_cpu(cpu)
1988 init_alloc_cpu_cpu(cpu);
1989 }
1990
1991#else
1992static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
1993static inline void init_alloc_cpu(void) {}
1994
1995static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1996{
1997 init_kmem_cache_cpu(s, &s->cpu_slab);
1998 return 1;
1999}
2000#endif
2001
1872#ifdef CONFIG_NUMA 2002#ifdef CONFIG_NUMA
1873/* 2003/*
1874 * No kmalloc_node yet so do it by hand. We know that this is the first 2004 * No kmalloc_node yet so do it by hand. We know that this is the first
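
The block above adds a bounded, statically allocated pool of kmem_cache_cpu structures (NR_KMEM_CACHE_CPU per cpu) with a kmalloc fallback once the pool is exhausted, and free_kmem_cache_cpu() uses an address-range check to tell the two cases apart. The runnable userspace sketch below models that pattern; the pool size, struct layout and function names are illustrative, not the kernel's.

/* Userspace model of a bounded static pool with a general-allocator fallback. */
#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 4	/* the kernel uses NR_KMEM_CACHE_CPU = 100 */

struct pooled { void *freelist; /* doubles as the next pointer while free */ };

static struct pooled pool[POOL_SIZE];
static struct pooled *pool_free;

static void pool_init(void)
{
	int i;
	for (i = POOL_SIZE - 1; i >= 0; i--) {
		pool[i].freelist = pool_free;
		pool_free = &pool[i];
	}
}

static struct pooled *pool_alloc(void)
{
	struct pooled *c = pool_free;
	if (c)
		pool_free = c->freelist;
	else	/* table overflow: allocate an individual structure instead */
		c = malloc(sizeof(*c));
	return c;
}

static void pool_put(struct pooled *c)
{
	if (c < pool || c >= pool + POOL_SIZE) {
		free(c);		/* came from the fallback path */
		return;
	}
	c->freelist = pool_free;	/* push back onto the static pool */
	pool_free = c;
}

int main(void)
{
	struct pooled *a, *b;
	pool_init();
	a = pool_alloc();	/* served from the static array */
	b = pool_alloc();
	pool_put(a);
	pool_put(b);
	printf("pool head back at %p\n", (void *)pool_free);
	return 0;
}
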
@@ -1876,10 +2006,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
1876 * possible. 2006 * possible.
1877 * 2007 *
1878 * Note that this function only works on the kmalloc_node_cache 2008 * Note that this function only works on the kmalloc_node_cache
1879 * when allocating for the kmalloc_node_cache. 2009 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2010 * memory on a fresh node that has no slab structures yet.
1880 */ 2011 */
1881static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, 2012static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
1882 int node) 2013 int node)
1883{ 2014{
1884 struct page *page; 2015 struct page *page;
1885 struct kmem_cache_node *n; 2016 struct kmem_cache_node *n;
@@ -1921,7 +2052,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
1921{ 2052{
1922 int node; 2053 int node;
1923 2054
1924 for_each_online_node(node) { 2055 for_each_node_state(node, N_NORMAL_MEMORY) {
1925 struct kmem_cache_node *n = s->node[node]; 2056 struct kmem_cache_node *n = s->node[node];
1926 if (n && n != &s->local_node) 2057 if (n && n != &s->local_node)
1927 kmem_cache_free(kmalloc_caches, n); 2058 kmem_cache_free(kmalloc_caches, n);
@@ -1939,7 +2070,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1939 else 2070 else
1940 local_node = 0; 2071 local_node = 0;
1941 2072
1942 for_each_online_node(node) { 2073 for_each_node_state(node, N_NORMAL_MEMORY) {
1943 struct kmem_cache_node *n; 2074 struct kmem_cache_node *n;
1944 2075
1945 if (local_node == node) 2076 if (local_node == node)
@@ -2077,14 +2208,7 @@ static int calculate_sizes(struct kmem_cache *s)
2077 */ 2208 */
2078 s->objects = (PAGE_SIZE << s->order) / size; 2209 s->objects = (PAGE_SIZE << s->order) / size;
2079 2210
2080 /* 2211 return !!s->objects;
2081 * Verify that the number of objects is within permitted limits.
2082 * The page->inuse field is only 16 bit wide! So we cannot have
2083 * more than 64k objects per slab.
2084 */
2085 if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB)
2086 return 0;
2087 return 1;
2088 2212
2089} 2213}
2090 2214
@@ -2107,9 +2231,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2107#ifdef CONFIG_NUMA 2231#ifdef CONFIG_NUMA
2108 s->defrag_ratio = 100; 2232 s->defrag_ratio = 100;
2109#endif 2233#endif
2234 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2235 goto error;
2110 2236
2111 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2237 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2112 return 1; 2238 return 1;
2239 free_kmem_cache_nodes(s);
2113error: 2240error:
2114 if (flags & SLAB_PANIC) 2241 if (flags & SLAB_PANIC)
2115 panic("Cannot create slab %s size=%lu realsize=%u " 2242 panic("Cannot create slab %s size=%lu realsize=%u "
@@ -2192,7 +2319,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2192 flush_all(s); 2319 flush_all(s);
2193 2320
2194 /* Attempt to free all objects */ 2321 /* Attempt to free all objects */
2195 for_each_online_node(node) { 2322 free_kmem_cache_cpus(s);
2323 for_each_node_state(node, N_NORMAL_MEMORY) {
2196 struct kmem_cache_node *n = get_node(s, node); 2324 struct kmem_cache_node *n = get_node(s, node);
2197 2325
2198 n->nr_partial -= free_list(s, n, &n->partial); 2326 n->nr_partial -= free_list(s, n, &n->partial);
@@ -2227,11 +2355,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2227 * Kmalloc subsystem 2355 * Kmalloc subsystem
2228 *******************************************************************/ 2356 *******************************************************************/
2229 2357
2230struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 2358struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
2231EXPORT_SYMBOL(kmalloc_caches); 2359EXPORT_SYMBOL(kmalloc_caches);
2232 2360
2233#ifdef CONFIG_ZONE_DMA 2361#ifdef CONFIG_ZONE_DMA
2234static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 2362static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
2235#endif 2363#endif
2236 2364
2237static int __init setup_slub_min_order(char *str) 2365static int __init setup_slub_min_order(char *str)
@@ -2397,12 +2525,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2397 return ZERO_SIZE_PTR; 2525 return ZERO_SIZE_PTR;
2398 2526
2399 index = size_index[(size - 1) / 8]; 2527 index = size_index[(size - 1) / 8];
2400 } else { 2528 } else
2401 if (size > KMALLOC_MAX_SIZE)
2402 return NULL;
2403
2404 index = fls(size - 1); 2529 index = fls(size - 1);
2405 }
2406 2530
2407#ifdef CONFIG_ZONE_DMA 2531#ifdef CONFIG_ZONE_DMA
2408 if (unlikely((flags & SLUB_DMA))) 2532 if (unlikely((flags & SLUB_DMA)))
@@ -2414,9 +2538,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2414 2538
2415void *__kmalloc(size_t size, gfp_t flags) 2539void *__kmalloc(size_t size, gfp_t flags)
2416{ 2540{
2417 struct kmem_cache *s = get_slab(size, flags); 2541 struct kmem_cache *s;
2542
2543 if (unlikely(size > PAGE_SIZE / 2))
2544 return (void *)__get_free_pages(flags | __GFP_COMP,
2545 get_order(size));
2418 2546
2419 if (ZERO_OR_NULL_PTR(s)) 2547 s = get_slab(size, flags);
2548
2549 if (unlikely(ZERO_OR_NULL_PTR(s)))
2420 return s; 2550 return s;
2421 2551
2422 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2552 return slab_alloc(s, flags, -1, __builtin_return_address(0));
@@ -2426,9 +2556,15 @@ EXPORT_SYMBOL(__kmalloc);
2426#ifdef CONFIG_NUMA 2556#ifdef CONFIG_NUMA
2427void *__kmalloc_node(size_t size, gfp_t flags, int node) 2557void *__kmalloc_node(size_t size, gfp_t flags, int node)
2428{ 2558{
2429 struct kmem_cache *s = get_slab(size, flags); 2559 struct kmem_cache *s;
2430 2560
2431 if (ZERO_OR_NULL_PTR(s)) 2561 if (unlikely(size > PAGE_SIZE / 2))
2562 return (void *)__get_free_pages(flags | __GFP_COMP,
2563 get_order(size));
2564
2565 s = get_slab(size, flags);
2566
2567 if (unlikely(ZERO_OR_NULL_PTR(s)))
2432 return s; 2568 return s;
2433 2569
2434 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2570 return slab_alloc(s, flags, node, __builtin_return_address(0));
@@ -2441,7 +2577,8 @@ size_t ksize(const void *object)
2441 struct page *page; 2577 struct page *page;
2442 struct kmem_cache *s; 2578 struct kmem_cache *s;
2443 2579
2444 if (ZERO_OR_NULL_PTR(object)) 2580 BUG_ON(!object);
2581 if (unlikely(object == ZERO_SIZE_PTR))
2445 return 0; 2582 return 0;
2446 2583
2447 page = get_object_page(object); 2584 page = get_object_page(object);
@@ -2473,22 +2610,17 @@ EXPORT_SYMBOL(ksize);
2473 2610
2474void kfree(const void *x) 2611void kfree(const void *x)
2475{ 2612{
2476 struct kmem_cache *s;
2477 struct page *page; 2613 struct page *page;
2478 2614
2479 /* 2615 if (unlikely(ZERO_OR_NULL_PTR(x)))
2480 * This has to be an unsigned comparison. According to Linus
2481 * some gcc version treat a pointer as a signed entity. Then
2482 * this comparison would be true for all "negative" pointers
2483 * (which would cover the whole upper half of the address space).
2484 */
2485 if (ZERO_OR_NULL_PTR(x))
2486 return; 2616 return;
2487 2617
2488 page = virt_to_head_page(x); 2618 page = virt_to_head_page(x);
2489 s = page->slab; 2619 if (unlikely(!PageSlab(page))) {
2490 2620 put_page(page);
2491 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2621 return;
2622 }
2623 slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
2492} 2624}
2493EXPORT_SYMBOL(kfree); 2625EXPORT_SYMBOL(kfree);
2494 2626
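
The __kmalloc hunks above route any request larger than PAGE_SIZE/2 straight to the page allocator (with __GFP_COMP), and the kfree hunk uses PageSlab() to tell such compound pages apart from slab objects, dropping them with put_page(). The cutoff makes sense because anything above half a page already consumes a whole page, so the page allocator costs nothing extra and the kmalloc caches above that size can be dropped. The runnable sketch below models get_order() for a 4 KiB page to show how many pages such requests take; the page size and helper are illustrative, not the kernel's arch-specific implementation.

/* Userspace model of the size-to-pages rounding behind the PAGE_SIZE/2 cutoff. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int get_order(unsigned long size)
{
	int order = 0;
	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long sizes[] = { PAGE_SIZE / 2 + 1, PAGE_SIZE, PAGE_SIZE + 1, 4 * PAGE_SIZE };
	int i;

	for (i = 0; i < 4; i++)
		printf("kmalloc(%lu) -> %d page(s) from the page allocator\n",
		       sizes[i], 1 << get_order(sizes[i]));
	return 0;
}
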
@@ -2517,7 +2649,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2517 return -ENOMEM; 2649 return -ENOMEM;
2518 2650
2519 flush_all(s); 2651 flush_all(s);
2520 for_each_online_node(node) { 2652 for_each_node_state(node, N_NORMAL_MEMORY) {
2521 n = get_node(s, node); 2653 n = get_node(s, node);
2522 2654
2523 if (!n->nr_partial) 2655 if (!n->nr_partial)
@@ -2575,6 +2707,8 @@ void __init kmem_cache_init(void)
2575 int i; 2707 int i;
2576 int caches = 0; 2708 int caches = 0;
2577 2709
2710 init_alloc_cpu();
2711
2578#ifdef CONFIG_NUMA 2712#ifdef CONFIG_NUMA
2579 /* 2713 /*
2580 * Must first have the slab cache available for the allocations of the 2714 * Must first have the slab cache available for the allocations of the
@@ -2602,7 +2736,7 @@ void __init kmem_cache_init(void)
2602 caches++; 2736 caches++;
2603 } 2737 }
2604 2738
2605 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 2739 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
2606 create_kmalloc_cache(&kmalloc_caches[i], 2740 create_kmalloc_cache(&kmalloc_caches[i],
2607 "kmalloc", 1 << i, GFP_KERNEL); 2741 "kmalloc", 1 << i, GFP_KERNEL);
2608 caches++; 2742 caches++;
@@ -2629,16 +2763,18 @@ void __init kmem_cache_init(void)
2629 slab_state = UP; 2763 slab_state = UP;
2630 2764
2631 /* Provide the correct kmalloc names now that the caches are up */ 2765 /* Provide the correct kmalloc names now that the caches are up */
2632 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2766 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
2633 kmalloc_caches[i]. name = 2767 kmalloc_caches[i]. name =
2634 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2768 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2635 2769
2636#ifdef CONFIG_SMP 2770#ifdef CONFIG_SMP
2637 register_cpu_notifier(&slab_notifier); 2771 register_cpu_notifier(&slab_notifier);
2772 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2773 nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
2774#else
2775 kmem_size = sizeof(struct kmem_cache);
2638#endif 2776#endif
2639 2777
2640 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2641 nr_cpu_ids * sizeof(struct page *);
2642 2778
2643 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2779 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2644 " CPUs=%d, Nodes=%d\n", 2780 " CPUs=%d, Nodes=%d\n",
@@ -2717,12 +2853,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2717 down_write(&slub_lock); 2853 down_write(&slub_lock);
2718 s = find_mergeable(size, align, flags, name, ctor); 2854 s = find_mergeable(size, align, flags, name, ctor);
2719 if (s) { 2855 if (s) {
2856 int cpu;
2857
2720 s->refcount++; 2858 s->refcount++;
2721 /* 2859 /*
2722 * Adjust the object sizes so that we clear 2860 * Adjust the object sizes so that we clear
2723 * the complete object on kzalloc. 2861 * the complete object on kzalloc.
2724 */ 2862 */
2725 s->objsize = max(s->objsize, (int)size); 2863 s->objsize = max(s->objsize, (int)size);
2864
2865 /*
2866 * And then we need to update the object size in the
2867 * per cpu structures
2868 */
2869 for_each_online_cpu(cpu)
2870 get_cpu_slab(s, cpu)->objsize = s->objsize;
2726 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2871 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2727 up_write(&slub_lock); 2872 up_write(&slub_lock);
2728 if (sysfs_slab_alias(s, name)) 2873 if (sysfs_slab_alias(s, name))
@@ -2765,15 +2910,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2765 unsigned long flags; 2910 unsigned long flags;
2766 2911
2767 switch (action) { 2912 switch (action) {
2913 case CPU_UP_PREPARE:
2914 case CPU_UP_PREPARE_FROZEN:
2915 init_alloc_cpu_cpu(cpu);
2916 down_read(&slub_lock);
2917 list_for_each_entry(s, &slab_caches, list)
2918 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
2919 GFP_KERNEL);
2920 up_read(&slub_lock);
2921 break;
2922
2768 case CPU_UP_CANCELED: 2923 case CPU_UP_CANCELED:
2769 case CPU_UP_CANCELED_FROZEN: 2924 case CPU_UP_CANCELED_FROZEN:
2770 case CPU_DEAD: 2925 case CPU_DEAD:
2771 case CPU_DEAD_FROZEN: 2926 case CPU_DEAD_FROZEN:
2772 down_read(&slub_lock); 2927 down_read(&slub_lock);
2773 list_for_each_entry(s, &slab_caches, list) { 2928 list_for_each_entry(s, &slab_caches, list) {
2929 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2930
2774 local_irq_save(flags); 2931 local_irq_save(flags);
2775 __flush_cpu_slab(s, cpu); 2932 __flush_cpu_slab(s, cpu);
2776 local_irq_restore(flags); 2933 local_irq_restore(flags);
2934 free_kmem_cache_cpu(c, cpu);
2935 s->cpu_slab[cpu] = NULL;
2777 } 2936 }
2778 up_read(&slub_lock); 2937 up_read(&slub_lock);
2779 break; 2938 break;
@@ -2790,9 +2949,14 @@ static struct notifier_block __cpuinitdata slab_notifier =
2790 2949
2791void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2950void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2792{ 2951{
2793 struct kmem_cache *s = get_slab(size, gfpflags); 2952 struct kmem_cache *s;
2953
2954 if (unlikely(size > PAGE_SIZE / 2))
2955 return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2956 get_order(size));
2957 s = get_slab(size, gfpflags);
2794 2958
2795 if (ZERO_OR_NULL_PTR(s)) 2959 if (unlikely(ZERO_OR_NULL_PTR(s)))
2796 return s; 2960 return s;
2797 2961
2798 return slab_alloc(s, gfpflags, -1, caller); 2962 return slab_alloc(s, gfpflags, -1, caller);
@@ -2801,9 +2965,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2801void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2965void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2802 int node, void *caller) 2966 int node, void *caller)
2803{ 2967{
2804 struct kmem_cache *s = get_slab(size, gfpflags); 2968 struct kmem_cache *s;
2969
2970 if (unlikely(size > PAGE_SIZE / 2))
2971 return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2972 get_order(size));
2973 s = get_slab(size, gfpflags);
2805 2974
2806 if (ZERO_OR_NULL_PTR(s)) 2975 if (unlikely(ZERO_OR_NULL_PTR(s)))
2807 return s; 2976 return s;
2808 2977
2809 return slab_alloc(s, gfpflags, node, caller); 2978 return slab_alloc(s, gfpflags, node, caller);
@@ -2902,7 +3071,7 @@ static long validate_slab_cache(struct kmem_cache *s)
2902 return -ENOMEM; 3071 return -ENOMEM;
2903 3072
2904 flush_all(s); 3073 flush_all(s);
2905 for_each_online_node(node) { 3074 for_each_node_state(node, N_NORMAL_MEMORY) {
2906 struct kmem_cache_node *n = get_node(s, node); 3075 struct kmem_cache_node *n = get_node(s, node);
2907 3076
2908 count += validate_slab_node(s, n, map); 3077 count += validate_slab_node(s, n, map);
@@ -3116,13 +3285,13 @@ static int list_locations(struct kmem_cache *s, char *buf,
3116 int node; 3285 int node;
3117 3286
3118 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3287 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3119 GFP_KERNEL)) 3288 GFP_TEMPORARY))
3120 return sprintf(buf, "Out of memory\n"); 3289 return sprintf(buf, "Out of memory\n");
3121 3290
3122 /* Push back cpu slabs */ 3291 /* Push back cpu slabs */
3123 flush_all(s); 3292 flush_all(s);
3124 3293
3125 for_each_online_node(node) { 3294 for_each_node_state(node, N_NORMAL_MEMORY) {
3126 struct kmem_cache_node *n = get_node(s, node); 3295 struct kmem_cache_node *n = get_node(s, node);
3127 unsigned long flags; 3296 unsigned long flags;
3128 struct page *page; 3297 struct page *page;
@@ -3230,11 +3399,18 @@ static unsigned long slab_objects(struct kmem_cache *s,
3230 per_cpu = nodes + nr_node_ids; 3399 per_cpu = nodes + nr_node_ids;
3231 3400
3232 for_each_possible_cpu(cpu) { 3401 for_each_possible_cpu(cpu) {
3233 struct page *page = s->cpu_slab[cpu]; 3402 struct page *page;
3234 int node; 3403 int node;
3404 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3235 3405
3406 if (!c)
3407 continue;
3408
3409 page = c->page;
3410 node = c->node;
3411 if (node < 0)
3412 continue;
3236 if (page) { 3413 if (page) {
3237 node = page_to_nid(page);
3238 if (flags & SO_CPU) { 3414 if (flags & SO_CPU) {
3239 int x = 0; 3415 int x = 0;
3240 3416
@@ -3249,7 +3425,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
3249 } 3425 }
3250 } 3426 }
3251 3427
3252 for_each_online_node(node) { 3428 for_each_node_state(node, N_NORMAL_MEMORY) {
3253 struct kmem_cache_node *n = get_node(s, node); 3429 struct kmem_cache_node *n = get_node(s, node);
3254 3430
3255 if (flags & SO_PARTIAL) { 3431 if (flags & SO_PARTIAL) {
@@ -3277,7 +3453,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
3277 3453
3278 x = sprintf(buf, "%lu", total); 3454 x = sprintf(buf, "%lu", total);
3279#ifdef CONFIG_NUMA 3455#ifdef CONFIG_NUMA
3280 for_each_online_node(node) 3456 for_each_node_state(node, N_NORMAL_MEMORY)
3281 if (nodes[node]) 3457 if (nodes[node])
3282 x += sprintf(buf + x, " N%d=%lu", 3458 x += sprintf(buf + x, " N%d=%lu",
3283 node, nodes[node]); 3459 node, nodes[node]);
@@ -3291,13 +3467,19 @@ static int any_slab_objects(struct kmem_cache *s)
3291 int node; 3467 int node;
3292 int cpu; 3468 int cpu;
3293 3469
3294 for_each_possible_cpu(cpu) 3470 for_each_possible_cpu(cpu) {
3295 if (s->cpu_slab[cpu]) 3471 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3472
3473 if (c && c->page)
3296 return 1; 3474 return 1;
3475 }
3297 3476
3298 for_each_node(node) { 3477 for_each_online_node(node) {
3299 struct kmem_cache_node *n = get_node(s, node); 3478 struct kmem_cache_node *n = get_node(s, node);
3300 3479
3480 if (!n)
3481 continue;
3482
3301 if (n->nr_partial || atomic_long_read(&n->nr_slabs)) 3483 if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3302 return 1; 3484 return 1;
3303 } 3485 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 000000000000..d3b718b0c20a
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,148 @@
1/*
2 * Virtual Memory Map support
3 *
4 * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
5 *
6 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
7 * virt_to_page, page_address() to be implemented as a base offset
8 * calculation without memory access.
9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap.
16 *
17 * The architecture is expected to provide a vmemmap_populate() function
18 * to instantiate the mapping.
19 */
20#include <linux/mm.h>
21#include <linux/mmzone.h>
22#include <linux/bootmem.h>
23#include <linux/highmem.h>
24#include <linux/module.h>
25#include <linux/spinlock.h>
26#include <linux/vmalloc.h>
27#include <asm/dma.h>
28#include <asm/pgalloc.h>
29#include <asm/pgtable.h>
30
31/*
32 * Allocate a block of memory to be used to back the virtual memory map
33 * or to back the page tables that are used to create the mapping.
34 * Uses the main allocators if they are available, else bootmem.
35 */
36void * __meminit vmemmap_alloc_block(unsigned long size, int node)
37{
38 /* If the main allocator is up use that, fallback to bootmem. */
39 if (slab_is_available()) {
40 struct page *page = alloc_pages_node(node,
41 GFP_KERNEL | __GFP_ZERO, get_order(size));
42 if (page)
43 return page_address(page);
44 return NULL;
45 } else
46 return __alloc_bootmem_node(NODE_DATA(node), size, size,
47 __pa(MAX_DMA_ADDRESS));
48}
49
50void __meminit vmemmap_verify(pte_t *pte, int node,
51 unsigned long start, unsigned long end)
52{
53 unsigned long pfn = pte_pfn(*pte);
54 int actual_node = early_pfn_to_nid(pfn);
55
56 if (actual_node != node)
57 printk(KERN_WARNING "[%lx-%lx] potential offnode "
58 "page_structs\n", start, end - 1);
59}
60
61pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
62{
63 pte_t *pte = pte_offset_kernel(pmd, addr);
64 if (pte_none(*pte)) {
65 pte_t entry;
66 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
67 if (!p)
68 return 0;
69 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
70 set_pte_at(&init_mm, addr, pte, entry);
71 }
72 return pte;
73}
74
75pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
76{
77 pmd_t *pmd = pmd_offset(pud, addr);
78 if (pmd_none(*pmd)) {
79 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
80 if (!p)
81 return 0;
82 pmd_populate_kernel(&init_mm, pmd, p);
83 }
84 return pmd;
85}
86
87pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
88{
89 pud_t *pud = pud_offset(pgd, addr);
90 if (pud_none(*pud)) {
91 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
92 if (!p)
93 return 0;
94 pud_populate(&init_mm, pud, p);
95 }
96 return pud;
97}
98
99pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
100{
101 pgd_t *pgd = pgd_offset_k(addr);
102 if (pgd_none(*pgd)) {
103 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
104 if (!p)
105 return 0;
106 pgd_populate(&init_mm, pgd, p);
107 }
108 return pgd;
109}
110
111int __meminit vmemmap_populate_basepages(struct page *start_page,
112 unsigned long size, int node)
113{
114 unsigned long addr = (unsigned long)start_page;
115 unsigned long end = (unsigned long)(start_page + size);
116 pgd_t *pgd;
117 pud_t *pud;
118 pmd_t *pmd;
119 pte_t *pte;
120
121 for (; addr < end; addr += PAGE_SIZE) {
122 pgd = vmemmap_pgd_populate(addr, node);
123 if (!pgd)
124 return -ENOMEM;
125 pud = vmemmap_pud_populate(pgd, addr, node);
126 if (!pud)
127 return -ENOMEM;
128 pmd = vmemmap_pmd_populate(pud, addr, node);
129 if (!pmd)
130 return -ENOMEM;
131 pte = vmemmap_pte_populate(pmd, addr, node);
132 if (!pte)
133 return -ENOMEM;
134 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
135 }
136
137 return 0;
138}
139
140struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
141{
142 struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
143 int error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
144 if (error)
145 return NULL;
146
147 return map;
148}
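
The header comment of the new file explains the point of a virtually mapped mem_map: pfn_to_page and page_to_pfn become pure base-offset arithmetic. The runnable userspace model below shows that property with a fake struct page array standing in for the vmemmap region; the array, its base and the struct layout are illustrative only.

/* Userspace model of pfn_to_page/page_to_pfn over a virtually mapped mem_map. */
#include <stdio.h>

struct page { unsigned long flags; };		/* stand-in for the real struct page */

static struct page fake_memmap[1024];		/* pretend this sits at the vmemmap base */
#define vmemmap (fake_memmap)

#define pfn_to_page(pfn)  (vmemmap + (pfn))
#define page_to_pfn(page) ((unsigned long)((page) - vmemmap))

int main(void)
{
	struct page *p = pfn_to_page(123);	/* one add, no section lookup */
	printf("pfn %lu -> page %p -> pfn %lu\n",
	       123UL, (void *)p, page_to_pfn(p));
	return 0;
}
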
diff --git a/mm/sparse.c b/mm/sparse.c
index 239f5a720d38..08fb14f5eea3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -9,6 +9,8 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
11#include <asm/dma.h> 11#include <asm/dma.h>
12#include <asm/pgalloc.h>
13#include <asm/pgtable.h>
12 14
13/* 15/*
14 * Permanent SPARSEMEM data: 16 * Permanent SPARSEMEM data:
@@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
106 108
107/* 109/*
108 * Although written for the SPARSEMEM_EXTREME case, this happens 110 * Although written for the SPARSEMEM_EXTREME case, this happens
109 * to also work for the flat array case becase 111 * to also work for the flat array case because
110 * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 112 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
111 */ 113 */
112int __section_nr(struct mem_section* ms) 114int __section_nr(struct mem_section* ms)
@@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
176 if (nid != early_pfn_to_nid(pfn)) 178 if (nid != early_pfn_to_nid(pfn))
177 continue; 179 continue;
178 180
179 if (pfn_valid(pfn)) 181 if (pfn_present(pfn))
180 nr_pages += PAGES_PER_SECTION; 182 nr_pages += PAGES_PER_SECTION;
181 } 183 }
182 184
@@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
204} 206}
205 207
206static int __meminit sparse_init_one_section(struct mem_section *ms, 208static int __meminit sparse_init_one_section(struct mem_section *ms,
207 unsigned long pnum, struct page *mem_map) 209 unsigned long pnum, struct page *mem_map,
210 unsigned long *pageblock_bitmap)
208{ 211{
209 if (!valid_section(ms)) 212 if (!present_section(ms))
210 return -EINVAL; 213 return -EINVAL;
211 214
212 ms->section_mem_map &= ~SECTION_MAP_MASK; 215 ms->section_mem_map &= ~SECTION_MAP_MASK;
213 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); 216 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
217 SECTION_HAS_MEM_MAP;
218 ms->pageblock_flags = pageblock_bitmap;
214 219
215 return 1; 220 return 1;
216} 221}
@@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
221 return NULL; 226 return NULL;
222} 227}
223 228
224static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 229static unsigned long usemap_size(void)
225{ 230{
226 struct page *map; 231 unsigned long size_bytes;
232 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
233 size_bytes = roundup(size_bytes, sizeof(unsigned long));
234 return size_bytes;
235}
236
237#ifdef CONFIG_MEMORY_HOTPLUG
238static unsigned long *__kmalloc_section_usemap(void)
239{
240 return kmalloc(usemap_size(), GFP_KERNEL);
241}
242#endif /* CONFIG_MEMORY_HOTPLUG */
243
244static unsigned long *sparse_early_usemap_alloc(unsigned long pnum)
245{
246 unsigned long *usemap;
227 struct mem_section *ms = __nr_to_section(pnum); 247 struct mem_section *ms = __nr_to_section(pnum);
228 int nid = sparse_early_nid(ms); 248 int nid = sparse_early_nid(ms);
229 249
250 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
251 if (usemap)
252 return usemap;
253
254 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
255 nid = 0;
256
257 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
258 return NULL;
259}
260
261#ifndef CONFIG_SPARSEMEM_VMEMMAP
262struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
263{
264 struct page *map;
265
230 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 266 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
231 if (map) 267 if (map)
232 return map; 268 return map;
@@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
238 274
239 map = alloc_bootmem_node(NODE_DATA(nid), 275 map = alloc_bootmem_node(NODE_DATA(nid),
240 sizeof(struct page) * PAGES_PER_SECTION); 276 sizeof(struct page) * PAGES_PER_SECTION);
277 return map;
278}
279#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
280
281struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
282{
283 struct page *map;
284 struct mem_section *ms = __nr_to_section(pnum);
285 int nid = sparse_early_nid(ms);
286
287 map = sparse_mem_map_populate(pnum, nid);
241 if (map) 288 if (map)
242 return map; 289 return map;
243 290
244 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); 291 printk(KERN_ERR "%s: sparsemem memory map backing failed "
292 "some memory will not be available.\n", __FUNCTION__);
245 ms->section_mem_map = 0; 293 ms->section_mem_map = 0;
246 return NULL; 294 return NULL;
247} 295}
@@ -254,19 +302,38 @@ void __init sparse_init(void)
254{ 302{
255 unsigned long pnum; 303 unsigned long pnum;
256 struct page *map; 304 struct page *map;
305 unsigned long *usemap;
257 306
258 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 307 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
259 if (!valid_section_nr(pnum)) 308 if (!present_section_nr(pnum))
260 continue; 309 continue;
261 310
262 map = sparse_early_mem_map_alloc(pnum); 311 map = sparse_early_mem_map_alloc(pnum);
263 if (!map) 312 if (!map)
264 continue; 313 continue;
265 sparse_init_one_section(__nr_to_section(pnum), pnum, map); 314
315 usemap = sparse_early_usemap_alloc(pnum);
316 if (!usemap)
317 continue;
318
319 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
320 usemap);
266 } 321 }
267} 322}
268 323
269#ifdef CONFIG_MEMORY_HOTPLUG 324#ifdef CONFIG_MEMORY_HOTPLUG
325#ifdef CONFIG_SPARSEMEM_VMEMMAP
326static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
327 unsigned long nr_pages)
328{
329 /* This will make the necessary allocations eventually. */
330 return sparse_mem_map_populate(pnum, nid);
331}
332static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
333{
334 return; /* XXX: Not implemented yet */
335}
336#else
270static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 337static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
271{ 338{
272 struct page *page, *ret; 339 struct page *page, *ret;
@@ -289,6 +356,12 @@ got_map_ptr:
289 return ret; 356 return ret;
290} 357}
291 358
359static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
360 unsigned long nr_pages)
361{
362 return __kmalloc_section_memmap(nr_pages);
363}
364
292static int vaddr_in_vmalloc_area(void *addr) 365static int vaddr_in_vmalloc_area(void *addr)
293{ 366{
294 if (addr >= (void *)VMALLOC_START && 367 if (addr >= (void *)VMALLOC_START &&
@@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
305 free_pages((unsigned long)memmap, 378 free_pages((unsigned long)memmap,
306 get_order(sizeof(struct page) * nr_pages)); 379 get_order(sizeof(struct page) * nr_pages));
307} 380}
381#endif /* CONFIG_SPARSEMEM_VMEMMAP */
308 382
309/* 383/*
310 * returns the number of sections whose mem_maps were properly 384 * returns the number of sections whose mem_maps were properly
@@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
318 struct pglist_data *pgdat = zone->zone_pgdat; 392 struct pglist_data *pgdat = zone->zone_pgdat;
319 struct mem_section *ms; 393 struct mem_section *ms;
320 struct page *memmap; 394 struct page *memmap;
395 unsigned long *usemap;
321 unsigned long flags; 396 unsigned long flags;
322 int ret; 397 int ret;
323 398
@@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
326 * plus, it does a kmalloc 401 * plus, it does a kmalloc
327 */ 402 */
328 sparse_index_init(section_nr, pgdat->node_id); 403 sparse_index_init(section_nr, pgdat->node_id);
329 memmap = __kmalloc_section_memmap(nr_pages); 404 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
405 usemap = __kmalloc_section_usemap();
330 406
331 pgdat_resize_lock(pgdat, &flags); 407 pgdat_resize_lock(pgdat, &flags);
332 408
@@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
335 ret = -EEXIST; 411 ret = -EEXIST;
336 goto out; 412 goto out;
337 } 413 }
414
415 if (!usemap) {
416 ret = -ENOMEM;
417 goto out;
418 }
338 ms->section_mem_map |= SECTION_MARKED_PRESENT; 419 ms->section_mem_map |= SECTION_MARKED_PRESENT;
339 420
340 ret = sparse_init_one_section(ms, section_nr, memmap); 421 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
341 422
342out: 423out:
343 pgdat_resize_unlock(pgdat, &flags); 424 pgdat_resize_unlock(pgdat, &flags);
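
The sparse.c hunks above attach a per-section pageblock bitmap ("usemap") to each mem_section, sized by usemap_size(): SECTION_BLOCKFLAGS_BITS bits rounded up to whole bytes and then to whole unsigned longs. The runnable check below reproduces that arithmetic; the bit count used is a hypothetical example, not any architecture's real value.

/* Userspace check of the usemap_size() rounding used above. */
#include <stdio.h>

#define SECTION_BLOCKFLAGS_BITS 256	/* illustrative value */

static unsigned long roundup_to(unsigned long x, unsigned long m)
{
	return ((x + m - 1) / m) * m;
}

static unsigned long usemap_size(void)
{
	unsigned long size_bytes;
	size_bytes = roundup_to(SECTION_BLOCKFLAGS_BITS, 8) / 8;	/* bits -> bytes */
	size_bytes = roundup_to(size_bytes, sizeof(unsigned long));	/* -> whole longs */
	return size_bytes;
}

int main(void)
{
	printf("usemap is %lu bytes per memory section\n", usemap_size());
	return 0;
}
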
diff --git a/mm/swap.c b/mm/swap.c
index d3cb966fe992..d034b2128d2b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,16 +24,18 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */ 26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/module.h>
28#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
29#include <linux/percpu.h> 28#include <linux/percpu.h>
30#include <linux/cpu.h> 29#include <linux/cpu.h>
31#include <linux/notifier.h> 30#include <linux/notifier.h>
32#include <linux/init.h>
33 31
34/* How many pages do we try to swap or page in/out together? */ 32/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 33int page_cluster;
36 34
35static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
36static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
37static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
38
37/* 39/*
38 * This path almost never happens for VM activity - pages are normally 40 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking. 41 * freed via pagevecs. But it gets used by networking.
@@ -94,23 +96,47 @@ void put_pages_list(struct list_head *pages)
94EXPORT_SYMBOL(put_pages_list); 96EXPORT_SYMBOL(put_pages_list);
95 97
96/* 98/*
99 * pagevec_move_tail() must be called with IRQ disabled.
100 * Otherwise this may cause nasty races.
101 */
102static void pagevec_move_tail(struct pagevec *pvec)
103{
104 int i;
105 int pgmoved = 0;
106 struct zone *zone = NULL;
107
108 for (i = 0; i < pagevec_count(pvec); i++) {
109 struct page *page = pvec->pages[i];
110 struct zone *pagezone = page_zone(page);
111
112 if (pagezone != zone) {
113 if (zone)
114 spin_unlock(&zone->lru_lock);
115 zone = pagezone;
116 spin_lock(&zone->lru_lock);
117 }
118 if (PageLRU(page) && !PageActive(page)) {
119 list_move_tail(&page->lru, &zone->inactive_list);
120 pgmoved++;
121 }
122 }
123 if (zone)
124 spin_unlock(&zone->lru_lock);
125 __count_vm_events(PGROTATED, pgmoved);
126 release_pages(pvec->pages, pvec->nr, pvec->cold);
127 pagevec_reinit(pvec);
128}
129
130/*
97 * Writeback is about to end against a page which has been marked for immediate 131 * Writeback is about to end against a page which has been marked for immediate
98 * reclaim. If it still appears to be reclaimable, move it to the tail of the 132 * reclaim. If it still appears to be reclaimable, move it to the tail of the
99 * inactive list. The page still has PageWriteback set, which will pin it. 133 * inactive list.
100 *
101 * We don't expect many pages to come through here, so don't bother batching
102 * things up.
103 *
104 * To avoid placing the page at the tail of the LRU while PG_writeback is still
105 * set, this function will clear PG_writeback before performing the page
106 * motion. Do that inside the lru lock because once PG_writeback is cleared
107 * we may not touch the page.
108 * 134 *
109 * Returns zero if it cleared PG_writeback. 135 * Returns zero if it cleared PG_writeback.
110 */ 136 */
111int rotate_reclaimable_page(struct page *page) 137int rotate_reclaimable_page(struct page *page)
112{ 138{
113 struct zone *zone; 139 struct pagevec *pvec;
114 unsigned long flags; 140 unsigned long flags;
115 141
116 if (PageLocked(page)) 142 if (PageLocked(page))
@@ -122,15 +148,16 @@ int rotate_reclaimable_page(struct page *page)
122 if (!PageLRU(page)) 148 if (!PageLRU(page))
123 return 1; 149 return 1;
124 150
125 zone = page_zone(page); 151 page_cache_get(page);
126 spin_lock_irqsave(&zone->lru_lock, flags); 152 local_irq_save(flags);
127 if (PageLRU(page) && !PageActive(page)) { 153 pvec = &__get_cpu_var(lru_rotate_pvecs);
128 list_move_tail(&page->lru, &zone->inactive_list); 154 if (!pagevec_add(pvec, page))
129 __count_vm_event(PGROTATED); 155 pagevec_move_tail(pvec);
130 } 156 local_irq_restore(flags);
157
131 if (!test_clear_page_writeback(page)) 158 if (!test_clear_page_writeback(page))
132 BUG(); 159 BUG();
133 spin_unlock_irqrestore(&zone->lru_lock, flags); 160
134 return 0; 161 return 0;
135} 162}
136 163
@@ -174,9 +201,6 @@ EXPORT_SYMBOL(mark_page_accessed);
174 * lru_cache_add: add a page to the page lists 201 * lru_cache_add: add a page to the page lists
175 * @page: the page to add 202 * @page: the page to add
176 */ 203 */
177static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
178static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
179
180void fastcall lru_cache_add(struct page *page) 204void fastcall lru_cache_add(struct page *page)
181{ 205{
182 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 206 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
@@ -197,21 +221,37 @@ void fastcall lru_cache_add_active(struct page *page)
197 put_cpu_var(lru_add_active_pvecs); 221 put_cpu_var(lru_add_active_pvecs);
198} 222}
199 223
200static void __lru_add_drain(int cpu) 224/*
225 * Drain pages out of the cpu's pagevecs.
226 * Either "cpu" is the current CPU, and preemption has already been
227 * disabled; or "cpu" is being hot-unplugged, and is already dead.
228 */
229static void drain_cpu_pagevecs(int cpu)
201{ 230{
202 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); 231 struct pagevec *pvec;
203 232
204 /* CPU is dead, so no locking needed. */ 233 pvec = &per_cpu(lru_add_pvecs, cpu);
205 if (pagevec_count(pvec)) 234 if (pagevec_count(pvec))
206 __pagevec_lru_add(pvec); 235 __pagevec_lru_add(pvec);
236
207 pvec = &per_cpu(lru_add_active_pvecs, cpu); 237 pvec = &per_cpu(lru_add_active_pvecs, cpu);
208 if (pagevec_count(pvec)) 238 if (pagevec_count(pvec))
209 __pagevec_lru_add_active(pvec); 239 __pagevec_lru_add_active(pvec);
240
241 pvec = &per_cpu(lru_rotate_pvecs, cpu);
242 if (pagevec_count(pvec)) {
243 unsigned long flags;
244
245 /* No harm done if a racing interrupt already did this */
246 local_irq_save(flags);
247 pagevec_move_tail(pvec);
248 local_irq_restore(flags);
249 }
210} 250}
211 251
212void lru_add_drain(void) 252void lru_add_drain(void)
213{ 253{
214 __lru_add_drain(get_cpu()); 254 drain_cpu_pagevecs(get_cpu());
215 put_cpu(); 255 put_cpu();
216} 256}
217 257
@@ -258,6 +298,7 @@ void release_pages(struct page **pages, int nr, int cold)
258 int i; 298 int i;
259 struct pagevec pages_to_free; 299 struct pagevec pages_to_free;
260 struct zone *zone = NULL; 300 struct zone *zone = NULL;
301 unsigned long uninitialized_var(flags);
261 302
262 pagevec_init(&pages_to_free, cold); 303 pagevec_init(&pages_to_free, cold);
263 for (i = 0; i < nr; i++) { 304 for (i = 0; i < nr; i++) {
@@ -265,7 +306,7 @@ void release_pages(struct page **pages, int nr, int cold)
265 306
266 if (unlikely(PageCompound(page))) { 307 if (unlikely(PageCompound(page))) {
267 if (zone) { 308 if (zone) {
268 spin_unlock_irq(&zone->lru_lock); 309 spin_unlock_irqrestore(&zone->lru_lock, flags);
269 zone = NULL; 310 zone = NULL;
270 } 311 }
271 put_compound_page(page); 312 put_compound_page(page);
@@ -279,9 +320,10 @@ void release_pages(struct page **pages, int nr, int cold)
279 struct zone *pagezone = page_zone(page); 320 struct zone *pagezone = page_zone(page);
280 if (pagezone != zone) { 321 if (pagezone != zone) {
281 if (zone) 322 if (zone)
282 spin_unlock_irq(&zone->lru_lock); 323 spin_unlock_irqrestore(&zone->lru_lock,
324 flags);
283 zone = pagezone; 325 zone = pagezone;
284 spin_lock_irq(&zone->lru_lock); 326 spin_lock_irqsave(&zone->lru_lock, flags);
285 } 327 }
286 VM_BUG_ON(!PageLRU(page)); 328 VM_BUG_ON(!PageLRU(page));
287 __ClearPageLRU(page); 329 __ClearPageLRU(page);
@@ -290,7 +332,7 @@ void release_pages(struct page **pages, int nr, int cold)
290 332
291 if (!pagevec_add(&pages_to_free, page)) { 333 if (!pagevec_add(&pages_to_free, page)) {
292 if (zone) { 334 if (zone) {
293 spin_unlock_irq(&zone->lru_lock); 335 spin_unlock_irqrestore(&zone->lru_lock, flags);
294 zone = NULL; 336 zone = NULL;
295 } 337 }
296 __pagevec_free(&pages_to_free); 338 __pagevec_free(&pages_to_free);
@@ -298,7 +340,7 @@ void release_pages(struct page **pages, int nr, int cold)
298 } 340 }
299 } 341 }
300 if (zone) 342 if (zone)
301 spin_unlock_irq(&zone->lru_lock); 343 spin_unlock_irqrestore(&zone->lru_lock, flags);
302 344
303 pagevec_free(&pages_to_free); 345 pagevec_free(&pages_to_free);
304} 346}
@@ -491,7 +533,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
491 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 533 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
492 atomic_add(*committed, &vm_committed_space); 534 atomic_add(*committed, &vm_committed_space);
493 *committed = 0; 535 *committed = 0;
494 __lru_add_drain((long)hcpu); 536 drain_cpu_pagevecs((long)hcpu);
495 } 537 }
496 return NOTIFY_OK; 538 return NOTIFY_OK;
497} 539}
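
The swap.c hunks replace the per-page zone->lru_lock acquisition in rotate_reclaimable_page() with a third per-CPU pagevec, lru_rotate_pvecs, which is drained under local_irq_save() either when it fills or from drain_cpu_pagevecs(). A self-contained userspace model of that batching follows; the _model names are invented, and the batch size of 14 is the kernel's usual PAGEVEC_SIZE of this era.

#include <stdio.h>

#define PAGEVEC_SIZE 14		/* batch size; matches the kernel pagevec */

/* Userspace model of the per-CPU lru_rotate_pvecs batching added above:
 * rotate_reclaimable_page() no longer takes zone->lru_lock per page, it
 * queues the page in a per-CPU pagevec and the whole batch is moved to
 * the tail of the inactive list in one locked pass. */
struct pagevec_model {
	unsigned int nr;
	int pages[PAGEVEC_SIZE];
};

/* pagevec_add() returns the space left, so 0 means "now full, drain it" */
static int pagevec_add_model(struct pagevec_model *pvec, int page)
{
	pvec->pages[pvec->nr++] = page;
	return PAGEVEC_SIZE - pvec->nr;
}

/* Model of pagevec_move_tail(): the kernel walks the batch with IRQs
 * off, taking each zone's lru_lock once per run of same-zone pages. */
static void pagevec_move_tail_model(struct pagevec_model *pvec)
{
	printf("draining %u pages to the tail of the inactive list\n",
	       pvec->nr);
	pvec->nr = 0;
}

static void rotate_reclaimable_model(struct pagevec_model *pvec, int page)
{
	/* kernel: page_cache_get(page); local_irq_save(flags); */
	if (!pagevec_add_model(pvec, page))
		pagevec_move_tail_model(pvec);
	/* kernel: local_irq_restore(flags); */
}

int main(void)
{
	struct pagevec_model pvec = { 0 };
	int i;

	for (i = 0; i < 30; i++)	/* drains twice, leaves 2 queued */
		rotate_reclaimable_model(&pvec, i);
	return 0;
}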
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 67daecb6031a..b52635601dfe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
74{ 74{
75 int error; 75 int error;
76 76
77 BUG_ON(!PageLocked(page));
77 BUG_ON(PageSwapCache(page)); 78 BUG_ON(PageSwapCache(page));
78 BUG_ON(PagePrivate(page)); 79 BUG_ON(PagePrivate(page));
79 error = radix_tree_preload(gfp_mask); 80 error = radix_tree_preload(gfp_mask);
@@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
83 entry.val, page); 84 entry.val, page);
84 if (!error) { 85 if (!error) {
85 page_cache_get(page); 86 page_cache_get(page);
86 SetPageLocked(page);
87 SetPageSwapCache(page); 87 SetPageSwapCache(page);
88 set_page_private(page, entry.val); 88 set_page_private(page, entry.val);
89 total_swapcache_pages++; 89 total_swapcache_pages++;
@@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry)
99{ 99{
100 int error; 100 int error;
101 101
102 BUG_ON(PageLocked(page));
102 if (!swap_duplicate(entry)) { 103 if (!swap_duplicate(entry)) {
103 INC_CACHE_INFO(noent_race); 104 INC_CACHE_INFO(noent_race);
104 return -ENOENT; 105 return -ENOENT;
105 } 106 }
107 SetPageLocked(page);
106 error = __add_to_swap_cache(page, entry, GFP_KERNEL); 108 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
107 /* 109 /*
108 * Anon pages are already on the LRU, we don't run lru_cache_add here. 110 * Anon pages are already on the LRU, we don't run lru_cache_add here.
109 */ 111 */
110 if (error) { 112 if (error) {
113 ClearPageLocked(page);
111 swap_free(entry); 114 swap_free(entry);
112 if (error == -EEXIST) 115 if (error == -EEXIST)
113 INC_CACHE_INFO(exist_race); 116 INC_CACHE_INFO(exist_race);
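
The swap_state.c hunks move the page-lock handling out to the caller: __add_to_swap_cache() now asserts that the page is already locked, while add_to_swap_cache() sets PG_locked itself just before the insert and clears it again if the insert fails. A compact model of that contract follows, with a plain int standing in for the page flag and invented _model names.

#include <assert.h>
#include <errno.h>
#include <stdio.h>

/* 1 = locked, 0 = unlocked; a plain int stands in for PG_locked. */
static int __add_to_swap_cache_model(int *page_locked, int insert_fails)
{
	assert(*page_locked);		/* new BUG_ON(!PageLocked(page)) */
	/* radix tree insert, SetPageSwapCache, set_page_private, ... */
	return insert_fails ? -EEXIST : 0;
}

static int add_to_swap_cache_model(int *page_locked, int insert_fails)
{
	int error;

	assert(!*page_locked);		/* new BUG_ON(PageLocked(page)) */
	*page_locked = 1;		/* SetPageLocked(page) */
	error = __add_to_swap_cache_model(page_locked, insert_fails);
	if (error)
		*page_locked = 0;	/* ClearPageLocked(page) on failure */
	return error;
}

int main(void)
{
	int locked = 0;

	printf("ok path: %d, locked=%d\n",
	       add_to_swap_cache_model(&locked, 0), locked);
	locked = 0;
	printf("race path: %d, locked=%d\n",
	       add_to_swap_cache_model(&locked, 1), locked);
	return 0;
}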
diff --git a/mm/util.c b/mm/util.c
index bf340d806868..5f64026cbb4d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup);
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 81void *krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 82{
83 void *ret; 83 void *ret;
84 size_t ks; 84 size_t ks = 0;
85 85
86 if (unlikely(!new_size)) { 86 if (unlikely(!new_size)) {
87 kfree(p); 87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 } 89 }
90 90
91 ks = ksize(p); 91 if (p)
92 ks = ksize(p);
93
92 if (ks >= new_size) 94 if (ks >= new_size)
93 return (void *)p; 95 return (void *)p;
94 96
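
The krealloc() fix above initialises ks to 0 and only asks ksize() for the size of a non-NULL pointer, so krealloc(NULL, size, flags) now behaves as a plain allocation rather than calling ksize(NULL). A userspace sketch of the fixed flow, with malloc()/free() standing in for the slab calls and an invented fake_ksize():

#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in: the real ksize() reports the usable size of a
 * slab object and must not be handed a NULL pointer. */
static size_t fake_ksize(const void *p)
{
	return p ? 16 : 0;	/* pretend every object has 16 usable bytes */
}

static void *krealloc_model(const void *p, size_t new_size)
{
	size_t ks = 0;		/* the fix: default to 0 ... */
	void *ret;

	if (new_size == 0) {
		free((void *)p);
		return NULL;	/* the kernel returns ZERO_SIZE_PTR here */
	}

	if (p)			/* ... and only query the size of a real object */
		ks = fake_ksize(p);

	if (ks >= new_size)	/* current object is already big enough */
		return (void *)p;

	ret = malloc(new_size);
	if (ret && p) {
		memcpy(ret, p, ks);
		free((void *)p);
	}
	return ret;
}

int main(void)
{
	void *p = krealloc_model(NULL, 16);	/* NULL input now means "just allocate" */
	p = krealloc_model(p, 8);		/* fits in the old object, returned as-is */
	p = krealloc_model(p, 64);		/* grows: new buffer, old one freed */
	free(p);
	return 0;
}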
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3cee76a8c9f0..2e01af365848 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
190 if (unlikely(!size)) 190 if (unlikely(!size))
191 return NULL; 191 return NULL;
192 192
193 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); 193 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
194
194 if (unlikely(!area)) 195 if (unlikely(!area))
195 return NULL; 196 return NULL;
196 197
@@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
439 area->flags |= VM_VPAGES; 440 area->flags |= VM_VPAGES;
440 } else { 441 } else {
441 pages = kmalloc_node(array_size, 442 pages = kmalloc_node(array_size,
442 (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, 443 (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
443 node); 444 node);
444 } 445 }
445 area->pages = pages; 446 area->pages = pages;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a6e65d024995..bbd194630c5b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
932 long mapped_ratio; 932 long mapped_ratio;
933 long distress; 933 long distress;
934 long swap_tendency; 934 long swap_tendency;
935 long imbalance;
935 936
936 if (zone_is_near_oom(zone)) 937 if (zone_is_near_oom(zone))
937 goto force_reclaim_mapped; 938 goto force_reclaim_mapped;
@@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
967 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 968 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
968 969
969 /* 970 /*
 971 * If there's a huge imbalance between active and inactive
 972 * (think active 100 times larger than inactive) we should
 973 * become more permissive, or the system will take too much
 974 * cpu before it starts swapping during memory pressure.
 975 * Distress is about avoiding an early oom; this is about
 976 * keeping swappiness graceful even when it is set to low
 977 * values.
978 *
979 * Avoid div by zero with nr_inactive+1, and max resulting
980 * value is vm_total_pages.
981 */
982 imbalance = zone_page_state(zone, NR_ACTIVE);
983 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
984
985 /*
 986 * Reduce the effect of imbalance if swappiness is low:
 987 * with a very low swappiness, the imbalance must be
 988 * much higher than 100 for this logic to make a
 989 * difference.
990 *
991 * Max temporary value is vm_total_pages*100.
992 */
993 imbalance *= (vm_swappiness + 1);
994 imbalance /= 100;
995
996 /*
 997 * If not much of the ram is mapped, make the imbalance
 998 * less relevant: refilling the inactive list with mapped
 999 * pages is only a high priority when a high ratio of the
L1000 * ram is mapped.
1001 *
1002 * Max temporary value is vm_total_pages*100.
1003 */
1004 imbalance *= mapped_ratio;
1005 imbalance /= 100;
1006
1007 /* apply imbalance feedback to swap_tendency */
1008 swap_tendency += imbalance;
1009
1010 /*
970 * Now use this metric to decide whether to start moving mapped 1011 * Now use this metric to decide whether to start moving mapped
971 * memory onto the inactive list. 1012 * memory onto the inactive list.
972 */ 1013 */
@@ -1371,7 +1412,13 @@ loop_again:
1371 temp_priority[i] = priority; 1412 temp_priority[i] = priority;
1372 sc.nr_scanned = 0; 1413 sc.nr_scanned = 0;
1373 note_zone_scanning_priority(zone, priority); 1414 note_zone_scanning_priority(zone, priority);
1374 nr_reclaimed += shrink_zone(priority, zone, &sc); 1415 /*
1416 * We put equal pressure on every zone, unless one
1417 * zone has way too many pages free already.
1418 */
1419 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1420 end_zone, 0))
1421 nr_reclaimed += shrink_zone(priority, zone, &sc);
1375 reclaim_state->reclaimed_slab = 0; 1422 reclaim_state->reclaimed_slab = 0;
1376 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1423 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1377 lru_pages); 1424 lru_pages);
@@ -1688,9 +1735,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1688{ 1735{
1689 pg_data_t *pgdat; 1736 pg_data_t *pgdat;
1690 cpumask_t mask; 1737 cpumask_t mask;
1738 int nid;
1691 1739
1692 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 1740 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1693 for_each_online_pgdat(pgdat) { 1741 for_each_node_state(nid, N_HIGH_MEMORY) {
1742 pgdat = NODE_DATA(nid);
1694 mask = node_to_cpumask(pgdat->node_id); 1743 mask = node_to_cpumask(pgdat->node_id);
1695 if (any_online_cpu(mask) != NR_CPUS) 1744 if (any_online_cpu(mask) != NR_CPUS)
1696 /* One of our CPUs online: restore mask */ 1745 /* One of our CPUs online: restore mask */
@@ -1727,7 +1776,7 @@ static int __init kswapd_init(void)
1727 int nid; 1776 int nid;
1728 1777
1729 swap_setup(); 1778 swap_setup();
1730 for_each_online_node(nid) 1779 for_each_node_state(nid, N_HIGH_MEMORY)
1731 kswapd_run(nid); 1780 kswapd_run(nid);
1732 hotcpu_notifier(cpu_callback, 0); 1781 hotcpu_notifier(cpu_callback, 0);
1733 return 0; 1782 return 0;
@@ -1847,7 +1896,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1847 1896
1848int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1897int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1849{ 1898{
1850 cpumask_t mask;
1851 int node_id; 1899 int node_id;
1852 1900
1853 /* 1901 /*
@@ -1884,8 +1932,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1884 * as wide as possible. 1932 * as wide as possible.
1885 */ 1933 */
1886 node_id = zone_to_nid(zone); 1934 node_id = zone_to_nid(zone);
1887 mask = node_to_cpumask(node_id); 1935 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
1888 if (!cpus_empty(mask) && node_id != numa_node_id())
1889 return 0; 1936 return 0;
1890 return __zone_reclaim(zone, gfp_mask, order); 1937 return __zone_reclaim(zone, gfp_mask, order);
1891} 1938}
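
Taken together, the new shrink_active_list() hunks compute imbalance = active / (inactive + 1), scale it by (swappiness + 1) / 100 and then by mapped_ratio / 100 using the integer divisions shown, and add the result to swap_tendency before the existing check that decides whether mapped memory should start moving to the inactive list. A small worked example with purely illustrative numbers:

#include <stdio.h>

/* Illustrative inputs only: active list 200000 pages, inactive 1000,
 * swappiness 10, 60% of ram mapped, distress 0. */
int main(void)
{
	long nr_active = 200000, nr_inactive = 1000;
	long swappiness = 10, mapped_ratio = 60, distress = 0;
	long imbalance;

	long swap_tendency = mapped_ratio / 2 + distress + swappiness; /* 40 */

	imbalance = nr_active;
	imbalance /= nr_inactive + 1;	/* 200000 / 1001 = 199 */
	imbalance *= swappiness + 1;	/* 199 * 11     = 2189 */
	imbalance /= 100;		/* 21 */
	imbalance *= mapped_ratio;	/* 21 * 60      = 1260 */
	imbalance /= 100;		/* 12 */

	swap_tendency += imbalance;
	printf("swap_tendency = %ld\n", swap_tendency);	/* 40 + 12 = 52 */
	return 0;
}

With these inputs the heuristic adds 12 to a base swap_tendency of 40, so a large active/inactive imbalance raises the pressure to deactivate mapped pages even at a low swappiness.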
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c64d169537bf..3b5e9043e7db 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu)
353 } 353 }
354} 354}
355 355
356static void __refresh_cpu_vm_stats(void *dummy)
357{
358 refresh_cpu_vm_stats(smp_processor_id());
359}
360
361/*
362 * Consolidate all counters.
363 *
364 * Note that the result is less inaccurate but still inaccurate
365 * if concurrent processes are allowed to run.
366 */
367void refresh_vm_stats(void)
368{
369 on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
370}
371EXPORT_SYMBOL(refresh_vm_stats);
372
373#endif 356#endif
374 357
375#ifdef CONFIG_NUMA 358#ifdef CONFIG_NUMA
@@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
398 381
399#include <linux/seq_file.h> 382#include <linux/seq_file.h>
400 383
384static char * const migratetype_names[MIGRATE_TYPES] = {
385 "Unmovable",
386 "Reclaimable",
387 "Movable",
388 "Reserve",
389};
390
401static void *frag_start(struct seq_file *m, loff_t *pos) 391static void *frag_start(struct seq_file *m, loff_t *pos)
402{ 392{
403 pg_data_t *pgdat; 393 pg_data_t *pgdat;
@@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg)
422{ 412{
423} 413}
424 414
425/* 415/* Walk all the zones in a node and print using a callback */
426 * This walks the free areas for each zone. 416static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
427 */ 417 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
428static int frag_show(struct seq_file *m, void *arg)
429{ 418{
430 pg_data_t *pgdat = (pg_data_t *)arg;
431 struct zone *zone; 419 struct zone *zone;
432 struct zone *node_zones = pgdat->node_zones; 420 struct zone *node_zones = pgdat->node_zones;
433 unsigned long flags; 421 unsigned long flags;
434 int order;
435 422
436 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 423 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
437 if (!populated_zone(zone)) 424 if (!populated_zone(zone))
438 continue; 425 continue;
439 426
440 spin_lock_irqsave(&zone->lock, flags); 427 spin_lock_irqsave(&zone->lock, flags);
441 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 428 print(m, pgdat, zone);
442 for (order = 0; order < MAX_ORDER; ++order)
443 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
444 spin_unlock_irqrestore(&zone->lock, flags); 429 spin_unlock_irqrestore(&zone->lock, flags);
430 }
431}
432
433static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
434 struct zone *zone)
435{
436 int order;
437
438 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
439 for (order = 0; order < MAX_ORDER; ++order)
440 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
441 seq_putc(m, '\n');
442}
443
444/*
445 * This walks the free areas for each zone.
446 */
447static int frag_show(struct seq_file *m, void *arg)
448{
449 pg_data_t *pgdat = (pg_data_t *)arg;
450 walk_zones_in_node(m, pgdat, frag_show_print);
451 return 0;
452}
453
454static void pagetypeinfo_showfree_print(struct seq_file *m,
455 pg_data_t *pgdat, struct zone *zone)
456{
457 int order, mtype;
458
459 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
460 seq_printf(m, "Node %4d, zone %8s, type %12s ",
461 pgdat->node_id,
462 zone->name,
463 migratetype_names[mtype]);
464 for (order = 0; order < MAX_ORDER; ++order) {
465 unsigned long freecount = 0;
466 struct free_area *area;
467 struct list_head *curr;
468
469 area = &(zone->free_area[order]);
470
471 list_for_each(curr, &area->free_list[mtype])
472 freecount++;
473 seq_printf(m, "%6lu ", freecount);
474 }
445 seq_putc(m, '\n'); 475 seq_putc(m, '\n');
446 } 476 }
477}
478
479/* Print out the free pages at each order for each migratetype */
480static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
481{
482 int order;
483 pg_data_t *pgdat = (pg_data_t *)arg;
484
485 /* Print header */
486 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
487 for (order = 0; order < MAX_ORDER; ++order)
488 seq_printf(m, "%6d ", order);
489 seq_putc(m, '\n');
490
491 walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
492
493 return 0;
494}
495
496static void pagetypeinfo_showblockcount_print(struct seq_file *m,
497 pg_data_t *pgdat, struct zone *zone)
498{
499 int mtype;
500 unsigned long pfn;
501 unsigned long start_pfn = zone->zone_start_pfn;
502 unsigned long end_pfn = start_pfn + zone->spanned_pages;
503 unsigned long count[MIGRATE_TYPES] = { 0, };
504
505 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
506 struct page *page;
507
508 if (!pfn_valid(pfn))
509 continue;
510
511 page = pfn_to_page(pfn);
512 mtype = get_pageblock_migratetype(page);
513
514 count[mtype]++;
515 }
516
517 /* Print counts */
518 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
519 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
520 seq_printf(m, "%12lu ", count[mtype]);
521 seq_putc(m, '\n');
522}
523
524/* Print out the number of pageblocks for each migratetype */
525static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
526{
527 int mtype;
528 pg_data_t *pgdat = (pg_data_t *)arg;
529
530 seq_printf(m, "\n%-23s", "Number of blocks type ");
531 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
532 seq_printf(m, "%12s ", migratetype_names[mtype]);
533 seq_putc(m, '\n');
534 walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
535
536 return 0;
537}
538
539/*
540 * This prints out statistics in relation to grouping pages by mobility.
541 * It is expensive to collect so do not constantly read the file.
542 */
543static int pagetypeinfo_show(struct seq_file *m, void *arg)
544{
545 pg_data_t *pgdat = (pg_data_t *)arg;
546
547 seq_printf(m, "Page block order: %d\n", pageblock_order);
548 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
549 seq_putc(m, '\n');
550 pagetypeinfo_showfree(m, pgdat);
551 pagetypeinfo_showblockcount(m, pgdat);
552
447 return 0; 553 return 0;
448} 554}
449 555
@@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = {
454 .show = frag_show, 560 .show = frag_show,
455}; 561};
456 562
563const struct seq_operations pagetypeinfo_op = {
564 .start = frag_start,
565 .next = frag_next,
566 .stop = frag_stop,
567 .show = pagetypeinfo_show,
568};
569
457#ifdef CONFIG_ZONE_DMA 570#ifdef CONFIG_ZONE_DMA
458#define TEXT_FOR_DMA(xx) xx "_dma", 571#define TEXT_FOR_DMA(xx) xx "_dma",
459#else 572#else
@@ -532,84 +645,78 @@ static const char * const vmstat_text[] = {
532#endif 645#endif
533}; 646};
534 647
535/* 648static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
536 * Output information about zones in @pgdat. 649 struct zone *zone)
537 */
538static int zoneinfo_show(struct seq_file *m, void *arg)
539{ 650{
540 pg_data_t *pgdat = arg; 651 int i;
541 struct zone *zone; 652 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
542 struct zone *node_zones = pgdat->node_zones; 653 seq_printf(m,
543 unsigned long flags; 654 "\n pages free %lu"
544 655 "\n min %lu"
545 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 656 "\n low %lu"
546 int i; 657 "\n high %lu"
547 658 "\n scanned %lu (a: %lu i: %lu)"
548 if (!populated_zone(zone)) 659 "\n spanned %lu"
549 continue; 660 "\n present %lu",
550 661 zone_page_state(zone, NR_FREE_PAGES),
551 spin_lock_irqsave(&zone->lock, flags); 662 zone->pages_min,
552 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 663 zone->pages_low,
553 seq_printf(m, 664 zone->pages_high,
554 "\n pages free %lu" 665 zone->pages_scanned,
555 "\n min %lu" 666 zone->nr_scan_active, zone->nr_scan_inactive,
556 "\n low %lu" 667 zone->spanned_pages,
557 "\n high %lu" 668 zone->present_pages);
558 "\n scanned %lu (a: %lu i: %lu)"
559 "\n spanned %lu"
560 "\n present %lu",
561 zone_page_state(zone, NR_FREE_PAGES),
562 zone->pages_min,
563 zone->pages_low,
564 zone->pages_high,
565 zone->pages_scanned,
566 zone->nr_scan_active, zone->nr_scan_inactive,
567 zone->spanned_pages,
568 zone->present_pages);
569 669
570 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 670 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
571 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 671 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
572 zone_page_state(zone, i)); 672 zone_page_state(zone, i));
573 673
574 seq_printf(m, 674 seq_printf(m,
575 "\n protection: (%lu", 675 "\n protection: (%lu",
576 zone->lowmem_reserve[0]); 676 zone->lowmem_reserve[0]);
577 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 677 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
578 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 678 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
579 seq_printf(m, 679 seq_printf(m,
580 ")" 680 ")"
581 "\n pagesets"); 681 "\n pagesets");
582 for_each_online_cpu(i) { 682 for_each_online_cpu(i) {
583 struct per_cpu_pageset *pageset; 683 struct per_cpu_pageset *pageset;
584 int j; 684 int j;
585 685
586 pageset = zone_pcp(zone, i); 686 pageset = zone_pcp(zone, i);
587 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 687 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
588 seq_printf(m, 688 seq_printf(m,
589 "\n cpu: %i pcp: %i" 689 "\n cpu: %i pcp: %i"
590 "\n count: %i" 690 "\n count: %i"
591 "\n high: %i" 691 "\n high: %i"
592 "\n batch: %i", 692 "\n batch: %i",
593 i, j, 693 i, j,
594 pageset->pcp[j].count, 694 pageset->pcp[j].count,
595 pageset->pcp[j].high, 695 pageset->pcp[j].high,
596 pageset->pcp[j].batch); 696 pageset->pcp[j].batch);
597 } 697 }
598#ifdef CONFIG_SMP 698#ifdef CONFIG_SMP
599 seq_printf(m, "\n vm stats threshold: %d", 699 seq_printf(m, "\n vm stats threshold: %d",
600 pageset->stat_threshold); 700 pageset->stat_threshold);
601#endif 701#endif
602 }
603 seq_printf(m,
604 "\n all_unreclaimable: %u"
605 "\n prev_priority: %i"
606 "\n start_pfn: %lu",
607 zone->all_unreclaimable,
608 zone->prev_priority,
609 zone->zone_start_pfn);
610 spin_unlock_irqrestore(&zone->lock, flags);
611 seq_putc(m, '\n');
612 } 702 }
703 seq_printf(m,
704 "\n all_unreclaimable: %u"
705 "\n prev_priority: %i"
706 "\n start_pfn: %lu",
707 zone->all_unreclaimable,
708 zone->prev_priority,
709 zone->zone_start_pfn);
710 seq_putc(m, '\n');
711}
712
713/*
714 * Output information about zones in @pgdat.
715 */
716static int zoneinfo_show(struct seq_file *m, void *arg)
717{
718 pg_data_t *pgdat = (pg_data_t *)arg;
719 walk_zones_in_node(m, pgdat, zoneinfo_show_print);
613 return 0; 720 return 0;
614} 721}
615 722
@@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
741static struct notifier_block __cpuinitdata vmstat_notifier = 848static struct notifier_block __cpuinitdata vmstat_notifier =
742 { &vmstat_cpuup_callback, NULL, 0 }; 849 { &vmstat_cpuup_callback, NULL, 0 };
743 850
744int __init setup_vmstat(void) 851static int __init setup_vmstat(void)
745{ 852{
746 int cpu; 853 int cpu;
747 854