path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  91
-rw-r--r--  mm/Makefile          |   2
-rw-r--r--  mm/bootmem.c         |  23
-rw-r--r--  mm/fadvise.c         |   4
-rw-r--r--  mm/filemap.c         |  91
-rw-r--r--  mm/filemap.h         |  94
-rw-r--r--  mm/filemap_xip.c     | 440
-rw-r--r--  mm/hugetlb.c         |  11
-rw-r--r--  mm/madvise.c         |  17
-rw-r--r--  mm/memory.c          |  79
-rw-r--r--  mm/mempolicy.c       |   4
-rw-r--r--  mm/mempool.c         |  22
-rw-r--r--  mm/mmap.c            |   6
-rw-r--r--  mm/mremap.c          |   2
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/oom_kill.c        |  10
-rw-r--r--  mm/page-writeback.c  |   2
-rw-r--r--  mm/page_alloc.c      |  98
-rw-r--r--  mm/page_io.c         |   2
-rw-r--r--  mm/pdflush.c         |   2
-rw-r--r--  mm/rmap.c            |   4
-rw-r--r--  mm/shmem.c           |  17
-rw-r--r--  mm/slab.c            |  37
-rw-r--r--  mm/sparse.c          | 137
-rw-r--r--  mm/vmscan.c          |   6
25 files changed, 1006 insertions(+), 201 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 000000000000..cd379936cac6
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,91 @@
+config SELECT_MEMORY_MODEL
+	def_bool y
+	depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
+
+choice
+	prompt "Memory model"
+	depends on SELECT_MEMORY_MODEL
+	default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+	default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+	default FLATMEM_MANUAL
+
+config FLATMEM_MANUAL
+	bool "Flat Memory"
+	depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+	help
+	  This option allows you to change some of the ways that
+	  Linux manages its memory internally.  Most users will
+	  only have one option here: FLATMEM.  This is normal
+	  and a correct option.
+
+	  Some users of more advanced features like NUMA and
+	  memory hotplug may have different options here.
+	  DISCONTIGMEM is a more mature, better tested system,
+	  but is incompatible with memory hotplug and may suffer
+	  decreased performance over SPARSEMEM.  If unsure between
+	  "Sparse Memory" and "Discontiguous Memory", choose
+	  "Discontiguous Memory".
+
+	  If unsure, choose this option (Flat Memory) over any other.
+
+config DISCONTIGMEM_MANUAL
+	bool "Discontiguous Memory"
+	depends on ARCH_DISCONTIGMEM_ENABLE
+	help
+	  This option provides enhanced support for discontiguous
+	  memory systems, over FLATMEM.  These systems have holes
+	  in their physical address spaces, and this option provides
+	  more efficient handling of these holes.  However, the vast
+	  majority of hardware has quite flat address spaces, and
+	  can have degraded performance from the extra overhead that
+	  this option imposes.
+
+	  Many NUMA configurations will have this as the only option.
+
+	  If unsure, choose "Flat Memory" over this option.
+
+config SPARSEMEM_MANUAL
+	bool "Sparse Memory"
+	depends on ARCH_SPARSEMEM_ENABLE
+	help
+	  This will be the only option for some systems, including
+	  memory hotplug systems.  This is normal.
+
+	  For many other systems, this will be an alternative to
+	  "Discontiguous Memory".  This option provides some potential
+	  performance benefits, along with decreased code complexity,
+	  but it is newer, and more experimental.
+
+	  If unsure, choose "Discontiguous Memory" or "Flat Memory"
+	  over this option.
+
+endchoice
+
+config DISCONTIGMEM
+	def_bool y
+	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+
+config SPARSEMEM
+	def_bool y
+	depends on SPARSEMEM_MANUAL
+
+config FLATMEM
+	def_bool y
+	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+	def_bool y
+	depends on !SPARSEMEM
+
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory.  This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+	def_bool y
+	depends on DISCONTIGMEM || NUMA
+
+config HAVE_MEMORY_PRESENT
+	def_bool y
+	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
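
The practical effect of these symbols is to let generic code pick a mem_map access strategy at compile time. Below is a minimal sketch of that kind of conditional, not part of the patch; example_first_page() is an invented helper name.

#include <linux/mmzone.h>

/*
 * Illustrative only: with FLAT_NODE_MEM_MAP each pg_data_t carries a
 * contiguous node_mem_map, while SPARSEMEM resolves struct page
 * addresses per section through pfn_to_page().
 */
#ifdef CONFIG_FLAT_NODE_MEM_MAP
static inline struct page *example_first_page(struct pglist_data *pgdat)
{
	return pgdat->node_mem_map;		/* direct pointer into the flat map */
}
#else
static inline struct page *example_first_page(struct pglist_data *pgdat)
{
	return pfn_to_page(pgdat->node_start_pfn);	/* per-section lookup */
}
#endif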
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..4cd69e3ce421 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
+obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM)	+= shmem.o
 obj-$(CONFIG_TINY_SHMEM)	+= tiny-shmem.o
 
+obj-$(CONFIG_FS_XIP)	+= filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,14 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so
33 * dma_get_required_mask(), which uses 33 * dma_get_required_mask(), which uses
34 * it, can be an inline function */ 34 * it, can be an inline function */
35 35
36#ifdef CONFIG_CRASH_DUMP
37/*
38 * If we have booted due to a crash, max_pfn will be a very low value. We need
39 * to know the amount of memory that the previous kernel used.
40 */
41unsigned long saved_max_pfn;
42#endif
43
36/* return the number of _pages_ that will be allocated for the boot bitmap */ 44/* return the number of _pages_ that will be allocated for the boot bitmap */
37unsigned long __init bootmem_bootmap_pages (unsigned long pages) 45unsigned long __init bootmem_bootmap_pages (unsigned long pages)
38{ 46{
@@ -57,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
57 pgdat->pgdat_next = pgdat_list; 65 pgdat->pgdat_next = pgdat_list;
58 pgdat_list = pgdat; 66 pgdat_list = pgdat;
59 67
60 mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); 68 mapsize = ALIGN(mapsize, sizeof(long));
61 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 69 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
62 bdata->node_boot_start = (start << PAGE_SHIFT); 70 bdata->node_boot_start = (start << PAGE_SHIFT);
63 bdata->node_low_pfn = end; 71 bdata->node_low_pfn = end;
@@ -178,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
178 } else 186 } else
179 preferred = 0; 187 preferred = 0;
180 188
181 preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; 189 preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
182 preferred += offset; 190 preferred += offset;
183 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; 191 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
184 incr = align >> PAGE_SHIFT ? : 1; 192 incr = align >> PAGE_SHIFT ? : 1;
@@ -219,7 +227,7 @@ found:
219 */ 227 */
220 if (align < PAGE_SIZE && 228 if (align < PAGE_SIZE &&
221 bdata->last_offset && bdata->last_pos+1 == start) { 229 bdata->last_offset && bdata->last_pos+1 == start) {
222 offset = (bdata->last_offset+align-1) & ~(align-1); 230 offset = ALIGN(bdata->last_offset, align);
223 BUG_ON(offset > PAGE_SIZE); 231 BUG_ON(offset > PAGE_SIZE);
224 remaining_size = PAGE_SIZE-offset; 232 remaining_size = PAGE_SIZE-offset;
225 if (size < remaining_size) { 233 if (size < remaining_size) {
@@ -256,6 +264,7 @@ found:
256static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 264static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
257{ 265{
258 struct page *page; 266 struct page *page;
267 unsigned long pfn;
259 bootmem_data_t *bdata = pgdat->bdata; 268 bootmem_data_t *bdata = pgdat->bdata;
260 unsigned long i, count, total = 0; 269 unsigned long i, count, total = 0;
261 unsigned long idx; 270 unsigned long idx;
@@ -266,7 +275,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
266 275
267 count = 0; 276 count = 0;
268 /* first extant page of the node */ 277 /* first extant page of the node */
269 page = virt_to_page(phys_to_virt(bdata->node_boot_start)); 278 pfn = bdata->node_boot_start >> PAGE_SHIFT;
270 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 279 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
271 map = bdata->node_bootmem_map; 280 map = bdata->node_bootmem_map;
272 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ 281 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +284,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
275 gofast = 1; 284 gofast = 1;
276 for (i = 0; i < idx; ) { 285 for (i = 0; i < idx; ) {
277 unsigned long v = ~map[i / BITS_PER_LONG]; 286 unsigned long v = ~map[i / BITS_PER_LONG];
287
278 if (gofast && v == ~0UL) { 288 if (gofast && v == ~0UL) {
279 int j, order; 289 int j, order;
280 290
291 page = pfn_to_page(pfn);
281 count += BITS_PER_LONG; 292 count += BITS_PER_LONG;
282 __ClearPageReserved(page); 293 __ClearPageReserved(page);
283 order = ffs(BITS_PER_LONG) - 1; 294 order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +303,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
292 page += BITS_PER_LONG; 303 page += BITS_PER_LONG;
293 } else if (v) { 304 } else if (v) {
294 unsigned long m; 305 unsigned long m;
306
307 page = pfn_to_page(pfn);
295 for (m = 1; m && i < idx; m<<=1, page++, i++) { 308 for (m = 1; m && i < idx; m<<=1, page++, i++) {
296 if (v & m) { 309 if (v & m) {
297 count++; 310 count++;
@@ -302,8 +315,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
302 } 315 }
303 } else { 316 } else {
304 i+=BITS_PER_LONG; 317 i+=BITS_PER_LONG;
305 page += BITS_PER_LONG;
306 } 318 }
319 pfn += BITS_PER_LONG;
307 } 320 }
308 total += count; 321 total += count;
309 322
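
The bootmem changes are mostly mechanical: open-coded round-up expressions become ALIGN(), and free_all_bootmem_core() recomputes the struct page pointer from a pfn instead of marching one pointer across the whole node, since mem_map is no longer guaranteed to be contiguous. A sketch of the round-up identity, using a stand-in macro name rather than the real kernel header:

/*
 * Illustrative only: EXAMPLE_ALIGN mirrors the kernel's ALIGN() and is
 * exactly the expression the patch removes.  'a' must be a power of two.
 */
#define EXAMPLE_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* EXAMPLE_ALIGN(13, 8) == 16,  EXAMPLE_ALIGN(16, 8) == 16 */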
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 57264d74b8bf..5f19e87bc5af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -43,6 +43,10 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		goto out;
 	}
 
+	if (mapping->a_ops->get_xip_page)
+		/* no bad return value, but ignore advice */
+		goto out;
+
 	/* Careful about overflows. Len == 0 means "as much as possible" */
 	endbyte = offset + len;
 	if (!len || endbyte < len)
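
get_xip_page() is the new address_space operation that marks a mapping as execute-in-place; readahead-style advice is meaningless for such mappings, hence the early exit above. A rough sketch of how a filesystem might provide it follows; the prototype is assumed from the ext2 XIP work of the same era and the examplefs_* names are invented.

static struct page *examplefs_get_xip_page(struct address_space *mapping,
					    sector_t sector, int create)
{
	/*
	 * Map 'sector' to a page of directly addressable backing store.
	 * Returning ERR_PTR(-ENODATA) reports a hole; callers substitute
	 * ZERO_PAGE for reads and may retry with create=1 for writes.
	 */
	return ERR_PTR(-ENODATA);
}

static struct address_space_operations examplefs_aops = {
	.get_xip_page	= examplefs_get_xip_page,
	/* ... the usual readpage/writepage operations ... */
};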
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a2fee2cb62b..c11418dd94e8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/security.h> 29#include <linux/security.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include "filemap.h"
31/* 32/*
32 * FIXME: remove all knowledge of the buffer layer from the core VM 33 * FIXME: remove all knowledge of the buffer layer from the core VM
33 */ 34 */
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
1714} 1715}
1715EXPORT_SYMBOL(remove_suid); 1716EXPORT_SYMBOL(remove_suid);
1716 1717
1717/* 1718size_t
1718 * Copy as much as we can into the page and return the number of bytes which
1719 * were sucessfully copied. If a fault is encountered then clear the page
1720 * out to (offset+bytes) and return the number of bytes which were copied.
1721 */
1722static inline size_t
1723filemap_copy_from_user(struct page *page, unsigned long offset,
1724 const char __user *buf, unsigned bytes)
1725{
1726 char *kaddr;
1727 int left;
1728
1729 kaddr = kmap_atomic(page, KM_USER0);
1730 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1731 kunmap_atomic(kaddr, KM_USER0);
1732
1733 if (left != 0) {
1734 /* Do it the slow way */
1735 kaddr = kmap(page);
1736 left = __copy_from_user(kaddr + offset, buf, bytes);
1737 kunmap(page);
1738 }
1739 return bytes - left;
1740}
1741
1742static size_t
1743__filemap_copy_from_user_iovec(char *vaddr, 1719__filemap_copy_from_user_iovec(char *vaddr,
1744 const struct iovec *iov, size_t base, size_t bytes) 1720 const struct iovec *iov, size_t base, size_t bytes)
1745{ 1721{
@@ -1767,52 +1743,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
1767} 1743}
1768 1744
1769/* 1745/*
1770 * This has the same sideeffects and return value as filemap_copy_from_user().
1771 * The difference is that on a fault we need to memset the remainder of the
1772 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
1773 * single-segment behaviour.
1774 */
1775static inline size_t
1776filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1777 const struct iovec *iov, size_t base, size_t bytes)
1778{
1779 char *kaddr;
1780 size_t copied;
1781
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1784 base, bytes);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 if (copied != bytes) {
1787 kaddr = kmap(page);
1788 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1789 base, bytes);
1790 kunmap(page);
1791 }
1792 return copied;
1793}
1794
1795static inline void
1796filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1797{
1798 const struct iovec *iov = *iovp;
1799 size_t base = *basep;
1800
1801 while (bytes) {
1802 int copy = min(bytes, iov->iov_len - base);
1803
1804 bytes -= copy;
1805 base += copy;
1806 if (iov->iov_len == base) {
1807 iov++;
1808 base = 0;
1809 }
1810 }
1811 *iovp = iov;
1812 *basep = base;
1813}
1814
1815/*
1816 * Performs necessary checks before doing a write 1746 * Performs necessary checks before doing a write
1817 * 1747 *
1818 * Can adjust writing position aor amount of bytes to write. 1748 * Can adjust writing position aor amount of bytes to write.
@@ -1827,12 +1757,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1827 if (unlikely(*pos < 0)) 1757 if (unlikely(*pos < 0))
1828 return -EINVAL; 1758 return -EINVAL;
1829 1759
1830 if (unlikely(file->f_error)) {
1831 int err = file->f_error;
1832 file->f_error = 0;
1833 return err;
1834 }
1835
1836 if (!isblk) { 1760 if (!isblk) {
1837 /* FIXME: this is for backwards compatibility with 2.4 */ 1761 /* FIXME: this is for backwards compatibility with 2.4 */
1838 if (file->f_flags & O_APPEND) 1762 if (file->f_flags & O_APPEND)
@@ -1927,8 +1851,11 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1927 * i_sem is held, which protects generic_osync_inode() from 1851 * i_sem is held, which protects generic_osync_inode() from
1928 * livelocking. 1852 * livelocking.
1929 */ 1853 */
1930 if (written >= 0 && file->f_flags & O_SYNC) 1854 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1931 generic_osync_inode(inode, mapping, OSYNC_METADATA); 1855 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1856 if (err < 0)
1857 written = err;
1858 }
1932 if (written == count && !is_sync_kiocb(iocb)) 1859 if (written == count && !is_sync_kiocb(iocb))
1933 written = -EIOCBQUEUED; 1860 written = -EIOCBQUEUED;
1934 return written; 1861 return written;
@@ -2027,7 +1954,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2027 if (unlikely(nr_segs > 1)) { 1954 if (unlikely(nr_segs > 1)) {
2028 filemap_set_next_iovec(&cur_iov, 1955 filemap_set_next_iovec(&cur_iov,
2029 &iov_base, status); 1956 &iov_base, status);
2030 buf = cur_iov->iov_base + iov_base; 1957 if (count)
1958 buf = cur_iov->iov_base +
1959 iov_base;
2031 } else { 1960 } else {
2032 iov_base += status; 1961 iov_base += status;
2033 } 1962 }
diff --git a/mm/filemap.h b/mm/filemap.h
new file mode 100644
index 000000000000..13793ba0ce17
--- /dev/null
+++ b/mm/filemap.h
@@ -0,0 +1,94 @@
1/*
2 * linux/mm/filemap.h
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7#ifndef __FILEMAP_H
8#define __FILEMAP_H
9
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/highmem.h>
14#include <linux/uio.h>
15#include <linux/config.h>
16#include <asm/uaccess.h>
17
18size_t
19__filemap_copy_from_user_iovec(char *vaddr,
20 const struct iovec *iov,
21 size_t base,
22 size_t bytes);
23
24/*
25 * Copy as much as we can into the page and return the number of bytes which
26 * were successfully copied. If a fault is encountered then clear the page
27 * out to (offset+bytes) and return the number of bytes which were copied.
28 */
29static inline size_t
30filemap_copy_from_user(struct page *page, unsigned long offset,
31 const char __user *buf, unsigned bytes)
32{
33 char *kaddr;
34 int left;
35
36 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0);
39
40 if (left != 0) {
41 /* Do it the slow way */
42 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes);
44 kunmap(page);
45 }
46 return bytes - left;
47}
48
49/*
50 * This has the same sideeffects and return value as filemap_copy_from_user().
51 * The difference is that on a fault we need to memset the remainder of the
52 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
53 * single-segment behaviour.
54 */
55static inline size_t
56filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
57 const struct iovec *iov, size_t base, size_t bytes)
58{
59 char *kaddr;
60 size_t copied;
61
62 kaddr = kmap_atomic(page, KM_USER0);
63 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
64 base, bytes);
65 kunmap_atomic(kaddr, KM_USER0);
66 if (copied != bytes) {
67 kaddr = kmap(page);
68 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
69 base, bytes);
70 kunmap(page);
71 }
72 return copied;
73}
74
75static inline void
76filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
77{
78 const struct iovec *iov = *iovp;
79 size_t base = *basep;
80
81 while (bytes) {
82 int copy = min(bytes, iov->iov_len - base);
83
84 bytes -= copy;
85 base += copy;
86 if (iov->iov_len == base) {
87 iov++;
88 base = 0;
89 }
90 }
91 *iovp = iov;
92 *basep = base;
93}
94#endif
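
These helpers are shared between the buffered write path in filemap.c and the new XIP write path. A minimal sketch of the intended calling pattern is below; the function name is invented for illustration.

static size_t example_copy_into_page(struct page *page, unsigned long offset,
				     const char __user *buf, size_t bytes)
{
	size_t copied;

	/* pre-fault the source to avoid deadlocking on the destination page */
	fault_in_pages_readable(buf, bytes);
	copied = filemap_copy_from_user(page, offset, buf, bytes);
	flush_dcache_page(page);
	return copied;		/* copied < bytes is treated as -EFAULT upstream */
}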
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 000000000000..8c199f537732
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,440 @@
1/*
2 * linux/mm/filemap_xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte <cotte@de.ibm.com>
6 *
7 * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
8 *
9 */
10
11#include <linux/fs.h>
12#include <linux/pagemap.h>
13#include <linux/module.h>
14#include <linux/uio.h>
15#include <linux/rmap.h>
16#include <asm/tlbflush.h>
17#include "filemap.h"
18
19/*
20 * This is a file read routine for execute in place files, and uses
21 * the mapping->a_ops->get_xip_page() function for the actual low-level
22 * stuff.
23 *
24 * Note the struct file* is not used at all. It may be NULL.
25 */
26static void
27do_xip_mapping_read(struct address_space *mapping,
28 struct file_ra_state *_ra,
29 struct file *filp,
30 loff_t *ppos,
31 read_descriptor_t *desc,
32 read_actor_t actor)
33{
34 struct inode *inode = mapping->host;
35 unsigned long index, end_index, offset;
36 loff_t isize;
37
38 BUG_ON(!mapping->a_ops->get_xip_page);
39
40 index = *ppos >> PAGE_CACHE_SHIFT;
41 offset = *ppos & ~PAGE_CACHE_MASK;
42
43 isize = i_size_read(inode);
44 if (!isize)
45 goto out;
46
47 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
48 for (;;) {
49 struct page *page;
50 unsigned long nr, ret;
51
52 /* nr is the maximum number of bytes to copy from this page */
53 nr = PAGE_CACHE_SIZE;
54 if (index >= end_index) {
55 if (index > end_index)
56 goto out;
57 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
58 if (nr <= offset) {
59 goto out;
60 }
61 }
62 nr = nr - offset;
63
64 page = mapping->a_ops->get_xip_page(mapping,
65 index*(PAGE_SIZE/512), 0);
66 if (!page)
67 goto no_xip_page;
68 if (unlikely(IS_ERR(page))) {
69 if (PTR_ERR(page) == -ENODATA) {
70 /* sparse */
71 page = ZERO_PAGE(0);
72 } else {
73 desc->error = PTR_ERR(page);
74 goto out;
75 }
76 }
77
78 /* If users can be writing to this page using arbitrary
79 * virtual addresses, take care about potential aliasing
80 * before reading the page on the kernel side.
81 */
82 if (mapping_writably_mapped(mapping))
83 flush_dcache_page(page);
84
85 /*
86 * Ok, we have the page, so now we can copy it to user space...
87 *
88 * The actor routine returns how many bytes were actually used..
89 * NOTE! This may not be the same as how much of a user buffer
90 * we filled up (we may be padding etc), so we can only update
91 * "pos" here (the actor routine has to update the user buffer
92 * pointers and the remaining count).
93 */
94 ret = actor(desc, page, offset, nr);
95 offset += ret;
96 index += offset >> PAGE_CACHE_SHIFT;
97 offset &= ~PAGE_CACHE_MASK;
98
99 if (ret == nr && desc->count)
100 continue;
101 goto out;
102
103no_xip_page:
104 /* Did not get the page. Report it */
105 desc->error = -EIO;
106 goto out;
107 }
108
109out:
110 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
111 if (filp)
112 file_accessed(filp);
113}
114
115ssize_t
116xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
117{
118 read_descriptor_t desc;
119
120 if (!access_ok(VERIFY_WRITE, buf, len))
121 return -EFAULT;
122
123 desc.written = 0;
124 desc.arg.buf = buf;
125 desc.count = len;
126 desc.error = 0;
127
128 do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
129 ppos, &desc, file_read_actor);
130
131 if (desc.written)
132 return desc.written;
133 else
134 return desc.error;
135}
136EXPORT_SYMBOL_GPL(xip_file_read);
137
138ssize_t
139xip_file_sendfile(struct file *in_file, loff_t *ppos,
140 size_t count, read_actor_t actor, void *target)
141{
142 read_descriptor_t desc;
143
144 if (!count)
145 return 0;
146
147 desc.written = 0;
148 desc.count = count;
149 desc.arg.data = target;
150 desc.error = 0;
151
152 do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
153 ppos, &desc, actor);
154 if (desc.written)
155 return desc.written;
156 return desc.error;
157}
158EXPORT_SYMBOL_GPL(xip_file_sendfile);
159
160/*
161 * __xip_unmap is invoked from xip_unmap and
162 * xip_write
163 *
164 * This function walks all vmas of the address_space and unmaps the
165 * ZERO_PAGE when found at pgoff. Should it go in rmap.c?
166 */
167static void
168__xip_unmap (struct address_space * mapping,
169 unsigned long pgoff)
170{
171 struct vm_area_struct *vma;
172 struct mm_struct *mm;
173 struct prio_tree_iter iter;
174 unsigned long address;
175 pte_t *pte;
176 pte_t pteval;
177
178 spin_lock(&mapping->i_mmap_lock);
179 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
180 mm = vma->vm_mm;
181 address = vma->vm_start +
182 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
183 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
184 /*
185 * We need the page_table_lock to protect us from page faults,
186 * munmap, fork, etc...
187 */
188 pte = page_check_address(ZERO_PAGE(address), mm,
189 address);
190 if (!IS_ERR(pte)) {
191 /* Nuke the page table entry. */
192 flush_cache_page(vma, address, pte_pfn(*pte));
193 pteval = ptep_clear_flush(vma, address, pte);
194 BUG_ON(pte_dirty(pteval));
195 pte_unmap(pte);
196 spin_unlock(&mm->page_table_lock);
197 }
198 }
199 spin_unlock(&mapping->i_mmap_lock);
200}
201
202/*
203 * xip_nopage() is invoked via the vma operations vector for a
204 * mapped memory region to read in file data during a page fault.
205 *
206 * This function is derived from filemap_nopage, but used for execute in place
207 */
208static struct page *
209xip_file_nopage(struct vm_area_struct * area,
210 unsigned long address,
211 int *type)
212{
213 struct file *file = area->vm_file;
214 struct address_space *mapping = file->f_mapping;
215 struct inode *inode = mapping->host;
216 struct page *page;
217 unsigned long size, pgoff, endoff;
218
219 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
220 + area->vm_pgoff;
221 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
222 + area->vm_pgoff;
223
224 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
225 if (pgoff >= size) {
226 return NULL;
227 }
228
229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
230 if (!IS_ERR(page)) {
231 return page;
232 }
233 if (PTR_ERR(page) != -ENODATA)
234 return NULL;
235
236 /* sparse block */
237 if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
238 (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
239 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
240 /* maybe shared writable, allocate new block */
241 page = mapping->a_ops->get_xip_page (mapping,
242 pgoff*(PAGE_SIZE/512), 1);
243 if (IS_ERR(page))
244 return NULL;
245 /* unmap page at pgoff from all other vmas */
246 __xip_unmap(mapping, pgoff);
247 } else {
248 /* not shared and writable, use ZERO_PAGE() */
249 page = ZERO_PAGE(address);
250 }
251
252 return page;
253}
254
255static struct vm_operations_struct xip_file_vm_ops = {
256 .nopage = xip_file_nopage,
257};
258
259int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
260{
261 BUG_ON(!file->f_mapping->a_ops->get_xip_page);
262
263 file_accessed(file);
264 vma->vm_ops = &xip_file_vm_ops;
265 return 0;
266}
267EXPORT_SYMBOL_GPL(xip_file_mmap);
268
269static ssize_t
270__xip_file_write(struct file *filp, const char __user *buf,
271 size_t count, loff_t pos, loff_t *ppos)
272{
273 struct address_space * mapping = filp->f_mapping;
274 struct address_space_operations *a_ops = mapping->a_ops;
275 struct inode *inode = mapping->host;
276 long status = 0;
277 struct page *page;
278 size_t bytes;
279 ssize_t written = 0;
280
281 BUG_ON(!mapping->a_ops->get_xip_page);
282
283 do {
284 unsigned long index;
285 unsigned long offset;
286 size_t copied;
287
288 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
289 index = pos >> PAGE_CACHE_SHIFT;
290 bytes = PAGE_CACHE_SIZE - offset;
291 if (bytes > count)
292 bytes = count;
293
294 /*
295 * Bring in the user page that we will copy from _first_.
296 * Otherwise there's a nasty deadlock on copying from the
297 * same page as we're writing to, without it being marked
298 * up-to-date.
299 */
300 fault_in_pages_readable(buf, bytes);
301
302 page = a_ops->get_xip_page(mapping,
303 index*(PAGE_SIZE/512), 0);
304 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
305 /* we allocate a new page and unmap it */
306 page = a_ops->get_xip_page(mapping,
307 index*(PAGE_SIZE/512), 1);
308 if (!IS_ERR(page))
309 /* unmap page at pgoff from all other vmas */
310 __xip_unmap(mapping, index);
311 }
312
313 if (IS_ERR(page)) {
314 status = PTR_ERR(page);
315 break;
316 }
317
318 copied = filemap_copy_from_user(page, offset, buf, bytes);
319 flush_dcache_page(page);
320 if (likely(copied > 0)) {
321 status = copied;
322
323 if (status >= 0) {
324 written += status;
325 count -= status;
326 pos += status;
327 buf += status;
328 }
329 }
330 if (unlikely(copied != bytes))
331 if (status >= 0)
332 status = -EFAULT;
333 if (status < 0)
334 break;
335 } while (count);
336 *ppos = pos;
337 /*
338 * No need to use i_size_read() here, the i_size
339 * cannot change under us because we hold i_sem.
340 */
341 if (pos > inode->i_size) {
342 i_size_write(inode, pos);
343 mark_inode_dirty(inode);
344 }
345
346 return written ? written : status;
347}
348
349ssize_t
350xip_file_write(struct file *filp, const char __user *buf, size_t len,
351 loff_t *ppos)
352{
353 struct address_space *mapping = filp->f_mapping;
354 struct inode *inode = mapping->host;
355 size_t count;
356 loff_t pos;
357 ssize_t ret;
358
359 down(&inode->i_sem);
360
361 if (!access_ok(VERIFY_READ, buf, len)) {
362 ret=-EFAULT;
363 goto out_up;
364 }
365
366 pos = *ppos;
367 count = len;
368
369 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
370
371 /* We can write back this queue in page reclaim */
372 current->backing_dev_info = mapping->backing_dev_info;
373
374 ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
375 if (ret)
376 goto out_backing;
377 if (count == 0)
378 goto out_backing;
379
380 ret = remove_suid(filp->f_dentry);
381 if (ret)
382 goto out_backing;
383
384 inode_update_time(inode, 1);
385
386 ret = __xip_file_write (filp, buf, count, pos, ppos);
387
388 out_backing:
389 current->backing_dev_info = NULL;
390 out_up:
391 up(&inode->i_sem);
392 return ret;
393}
394EXPORT_SYMBOL_GPL(xip_file_write);
395
396/*
397 * truncate a page used for execute in place
398 * functionality is analogous to block_truncate_page but uses get_xip_page
399 * to get the page instead of the page cache
400 */
401int
402xip_truncate_page(struct address_space *mapping, loff_t from)
403{
404 pgoff_t index = from >> PAGE_CACHE_SHIFT;
405 unsigned offset = from & (PAGE_CACHE_SIZE-1);
406 unsigned blocksize;
407 unsigned length;
408 struct page *page;
409 void *kaddr;
410
411 BUG_ON(!mapping->a_ops->get_xip_page);
412
413 blocksize = 1 << mapping->host->i_blkbits;
414 length = offset & (blocksize - 1);
415
416 /* Block boundary? Nothing to do */
417 if (!length)
418 return 0;
419
420 length = blocksize - length;
421
422 page = mapping->a_ops->get_xip_page(mapping,
423 index*(PAGE_SIZE/512), 0);
424 if (!page)
425 return -ENOMEM;
426 if (unlikely(IS_ERR(page))) {
427 if (PTR_ERR(page) == -ENODATA)
428 /* Hole? No need to truncate */
429 return 0;
430 else
431 return PTR_ERR(page);
432 }
433 kaddr = kmap_atomic(page, KM_USER0);
434 memset(kaddr + offset, 0, length);
435 kunmap_atomic(kaddr, KM_USER0);
436
437 flush_dcache_page(page);
438 return 0;
439}
440EXPORT_SYMBOL_GPL(xip_truncate_page);
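
A filesystem opts into these entry points through its file_operations for XIP-capable inodes. The sketch below is modelled on the ext2 XIP support from the same patch series; the struct name is invented.

static struct file_operations examplefs_xip_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= xip_file_read,
	.write		= xip_file_write,
	.mmap		= xip_file_mmap,
	.sendfile	= xip_file_sendfile,
};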
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbd1111ea119..6bf720bc662c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -301,6 +301,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
+	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
 
@@ -309,9 +310,17 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+		ptep = huge_pte_offset(mm, address);
+		if (! ptep)
+			/* This can happen on truncate, or if an
+			 * mmap() is aborted due to an error before
+			 * the prefault */
+			continue;
+
+		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
+
 		page = pte_page(pte);
 		put_page(page);
 	}
diff --git a/mm/madvise.c b/mm/madvise.c
index e3108054733c..c8c01a12fea4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,7 +65,6 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
-	VM_ClearReadHint(vma);
 	vma->vm_flags = new_flags;
 
 out:
@@ -84,8 +83,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
-	if (!file)
-		return -EBADF;
+	if (file->f_mapping->a_ops->get_xip_page) {
+		/* no bad return value, but ignore advice */
+		return 0;
+	}
 
 	*prev = vma;
 	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -136,11 +137,16 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 		return 0;
 }
 
-static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-			unsigned long start, unsigned long end, int behavior)
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+	    unsigned long start, unsigned long end, int behavior)
 {
+	struct file *filp = vma->vm_file;
 	long error = -EBADF;
 
+	if (!filp)
+		goto out;
+
 	switch (behavior) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
@@ -161,6 +167,7 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev
 		break;
 	}
 
+out:
 	return error;
 }
 
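
From userspace the change is only visible as advice being accepted on XIP-backed mappings. A minimal illustration, not from the patch:

#include <stdio.h>
#include <sys/mman.h>

/*
 * Illustrative only: MADV_WILLNEED on an execute-in-place mapping returns 0;
 * the advice is accepted but has no effect, since there is no page cache to
 * populate.
 */
static void example_advise(void *addr, size_t len)
{
	if (madvise(addr, len, MADV_WILLNEED) != 0)
		perror("madvise");
}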
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf9986..e046b7e4b530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
58#include <linux/swapops.h> 58#include <linux/swapops.h>
59#include <linux/elf.h> 59#include <linux/elf.h>
60 60
61#ifndef CONFIG_DISCONTIGMEM 61#ifndef CONFIG_NEED_MULTIPLE_NODES
62/* use the per-pgdat data instead for discontigmem - mbligh */ 62/* use the per-pgdat data instead for discontigmem - mbligh */
63unsigned long max_mapnr; 63unsigned long max_mapnr;
64struct page *mem_map; 64struct page *mem_map;
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
776 * Do a quick page-table lookup for a single page. 776 * Do a quick page-table lookup for a single page.
777 * mm->page_table_lock must be held. 777 * mm->page_table_lock must be held.
778 */ 778 */
779static struct page * 779static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
780__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) 780 int read, int write, int accessed)
781{ 781{
782 pgd_t *pgd; 782 pgd_t *pgd;
783 pud_t *pud; 783 pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
818 pfn = pte_pfn(pte); 818 pfn = pte_pfn(pte);
819 if (pfn_valid(pfn)) { 819 if (pfn_valid(pfn)) {
820 page = pfn_to_page(pfn); 820 page = pfn_to_page(pfn);
821 if (write && !pte_dirty(pte) && !PageDirty(page)) 821 if (accessed) {
822 set_page_dirty(page); 822 if (write && !pte_dirty(pte) &&!PageDirty(page))
823 mark_page_accessed(page); 823 set_page_dirty(page);
824 mark_page_accessed(page);
825 }
824 return page; 826 return page;
825 } 827 }
826 } 828 }
@@ -829,16 +831,19 @@ out:
829 return NULL; 831 return NULL;
830} 832}
831 833
832struct page * 834inline struct page *
833follow_page(struct mm_struct *mm, unsigned long address, int write) 835follow_page(struct mm_struct *mm, unsigned long address, int write)
834{ 836{
835 return __follow_page(mm, address, /*read*/0, write); 837 return __follow_page(mm, address, 0, write, 1);
836} 838}
837 839
838int 840/*
839check_user_page_readable(struct mm_struct *mm, unsigned long address) 841 * check_user_page_readable() can be called frm niterrupt context by oprofile,
842 * so we need to avoid taking any non-irq-safe locks
843 */
844int check_user_page_readable(struct mm_struct *mm, unsigned long address)
840{ 845{
841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; 846 return __follow_page(mm, address, 1, 0, 0) != NULL;
842} 847}
843EXPORT_SYMBOL(check_user_page_readable); 848EXPORT_SYMBOL(check_user_page_readable);
844 849
@@ -908,9 +913,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
908 pud = pud_offset(pgd, pg); 913 pud = pud_offset(pgd, pg);
909 BUG_ON(pud_none(*pud)); 914 BUG_ON(pud_none(*pud));
910 pmd = pmd_offset(pud, pg); 915 pmd = pmd_offset(pud, pg);
911 BUG_ON(pmd_none(*pmd)); 916 if (pmd_none(*pmd))
917 return i ? : -EFAULT;
912 pte = pte_offset_map(pmd, pg); 918 pte = pte_offset_map(pmd, pg);
913 BUG_ON(pte_none(*pte)); 919 if (pte_none(*pte)) {
920 pte_unmap(pte);
921 return i ? : -EFAULT;
922 }
914 if (pages) { 923 if (pages) {
915 pages[i] = pte_page(*pte); 924 pages[i] = pte_page(*pte);
916 get_page(pages[i]); 925 get_page(pages[i]);
@@ -935,11 +944,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
935 } 944 }
936 spin_lock(&mm->page_table_lock); 945 spin_lock(&mm->page_table_lock);
937 do { 946 do {
947 int write_access = write;
938 struct page *page; 948 struct page *page;
939 int lookup_write = write;
940 949
941 cond_resched_lock(&mm->page_table_lock); 950 cond_resched_lock(&mm->page_table_lock);
942 while (!(page = follow_page(mm, start, lookup_write))) { 951 while (!(page = follow_page(mm, start, write_access))) {
952 int ret;
953
943 /* 954 /*
944 * Shortcut for anonymous pages. We don't want 955 * Shortcut for anonymous pages. We don't want
945 * to force the creation of pages tables for 956 * to force the creation of pages tables for
@@ -947,13 +958,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
947 * nobody touched so far. This is important 958 * nobody touched so far. This is important
948 * for doing a core dump for these mappings. 959 * for doing a core dump for these mappings.
949 */ 960 */
950 if (!lookup_write && 961 if (!write && untouched_anonymous_page(mm,vma,start)) {
951 untouched_anonymous_page(mm,vma,start)) {
952 page = ZERO_PAGE(start); 962 page = ZERO_PAGE(start);
953 break; 963 break;
954 } 964 }
955 spin_unlock(&mm->page_table_lock); 965 spin_unlock(&mm->page_table_lock);
956 switch (handle_mm_fault(mm,vma,start,write)) { 966 ret = __handle_mm_fault(mm, vma, start, write_access);
967
968 /*
969 * The VM_FAULT_WRITE bit tells us that do_wp_page has
970 * broken COW when necessary, even if maybe_mkwrite
971 * decided not to set pte_write. We can thus safely do
972 * subsequent page lookups as if they were reads.
973 */
974 if (ret & VM_FAULT_WRITE)
975 write_access = 0;
976
977 switch (ret & ~VM_FAULT_WRITE) {
957 case VM_FAULT_MINOR: 978 case VM_FAULT_MINOR:
958 tsk->min_flt++; 979 tsk->min_flt++;
959 break; 980 break;
@@ -967,14 +988,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
967 default: 988 default:
968 BUG(); 989 BUG();
969 } 990 }
970 /*
971 * Now that we have performed a write fault
972 * and surely no longer have a shared page we
973 * shouldn't write, we shouldn't ignore an
974 * unwritable page in the page table if
975 * we are forcing write access.
976 */
977 lookup_write = write && !force;
978 spin_lock(&mm->page_table_lock); 991 spin_lock(&mm->page_table_lock);
979 } 992 }
980 if (pages) { 993 if (pages) {
@@ -1139,7 +1152,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1139{ 1152{
1140 pgd_t *pgd; 1153 pgd_t *pgd;
1141 unsigned long next; 1154 unsigned long next;
1142 unsigned long end = addr + size; 1155 unsigned long end = addr + PAGE_ALIGN(size);
1143 struct mm_struct *mm = vma->vm_mm; 1156 struct mm_struct *mm = vma->vm_mm;
1144 int err; 1157 int err;
1145 1158
@@ -1224,6 +1237,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1224 struct page *old_page, *new_page; 1237 struct page *old_page, *new_page;
1225 unsigned long pfn = pte_pfn(pte); 1238 unsigned long pfn = pte_pfn(pte);
1226 pte_t entry; 1239 pte_t entry;
1240 int ret;
1227 1241
1228 if (unlikely(!pfn_valid(pfn))) { 1242 if (unlikely(!pfn_valid(pfn))) {
1229 /* 1243 /*
@@ -1251,7 +1265,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1251 lazy_mmu_prot_update(entry); 1265 lazy_mmu_prot_update(entry);
1252 pte_unmap(page_table); 1266 pte_unmap(page_table);
1253 spin_unlock(&mm->page_table_lock); 1267 spin_unlock(&mm->page_table_lock);
1254 return VM_FAULT_MINOR; 1268 return VM_FAULT_MINOR|VM_FAULT_WRITE;
1255 } 1269 }
1256 } 1270 }
1257 pte_unmap(page_table); 1271 pte_unmap(page_table);
@@ -1278,6 +1292,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1278 /* 1292 /*
1279 * Re-check the pte - we dropped the lock 1293 * Re-check the pte - we dropped the lock
1280 */ 1294 */
1295 ret = VM_FAULT_MINOR;
1281 spin_lock(&mm->page_table_lock); 1296 spin_lock(&mm->page_table_lock);
1282 page_table = pte_offset_map(pmd, address); 1297 page_table = pte_offset_map(pmd, address);
1283 if (likely(pte_same(*page_table, pte))) { 1298 if (likely(pte_same(*page_table, pte))) {
@@ -1294,12 +1309,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1294 1309
1295 /* Free the old page.. */ 1310 /* Free the old page.. */
1296 new_page = old_page; 1311 new_page = old_page;
1312 ret |= VM_FAULT_WRITE;
1297 } 1313 }
1298 pte_unmap(page_table); 1314 pte_unmap(page_table);
1299 page_cache_release(new_page); 1315 page_cache_release(new_page);
1300 page_cache_release(old_page); 1316 page_cache_release(old_page);
1301 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1302 return VM_FAULT_MINOR; 1318 return ret;
1303 1319
1304no_new_page: 1320no_new_page:
1305 page_cache_release(old_page); 1321 page_cache_release(old_page);
@@ -1458,7 +1474,7 @@ restart:
1458 * unmap_mapping_range - unmap the portion of all mmaps 1474 * unmap_mapping_range - unmap the portion of all mmaps
1459 * in the specified address_space corresponding to the specified 1475 * in the specified address_space corresponding to the specified
1460 * page range in the underlying file. 1476 * page range in the underlying file.
1461 * @address_space: the address space containing mmaps to be unmapped. 1477 * @mapping: the address space containing mmaps to be unmapped.
1462 * @holebegin: byte in first page to unmap, relative to the start of 1478 * @holebegin: byte in first page to unmap, relative to the start of
1463 * the underlying file. This will be rounded down to a PAGE_SIZE 1479 * the underlying file. This will be rounded down to a PAGE_SIZE
1464 * boundary. Note that this is different from vmtruncate(), which 1480 * boundary. Note that this is different from vmtruncate(), which
@@ -1991,7 +2007,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
1991 if (write_access) { 2007 if (write_access) {
1992 if (!pte_write(entry)) 2008 if (!pte_write(entry))
1993 return do_wp_page(mm, vma, address, pte, pmd, entry); 2009 return do_wp_page(mm, vma, address, pte, pmd, entry);
1994
1995 entry = pte_mkdirty(entry); 2010 entry = pte_mkdirty(entry);
1996 } 2011 }
1997 entry = pte_mkyoung(entry); 2012 entry = pte_mkyoung(entry);
@@ -2006,7 +2021,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2006/* 2021/*
2007 * By the time we get here, we already hold the mm semaphore 2022 * By the time we get here, we already hold the mm semaphore
2008 */ 2023 */
2009int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, 2024int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2010 unsigned long address, int write_access) 2025 unsigned long address, int write_access)
2011{ 2026{
2012 pgd_t *pgd; 2027 pgd_t *pgd;
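
Since the mm/ side renames the fault entry point to __handle_mm_fault() and lets it OR VM_FAULT_WRITE into its return value, other callers presumably keep the old interface through a masking wrapper along the following lines; the real definition would live in include/linux/mm.h, outside this diffstat.

static inline int handle_mm_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		int write_access)
{
	/* hide the "COW already broken" hint from callers that only want status */
	return __handle_mm_fault(mm, vma, address, write_access) &
							(~VM_FAULT_WRITE);
}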
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cb41c31e7c87..b4eababc8198 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -443,7 +443,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	struct mempolicy *new;
 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (mode > MPOL_MAX)
+	if (mode < 0 || mode > MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(nodes, nmask, maxnode, mode);
 	if (err)
@@ -1138,11 +1138,11 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
+		rb_erase(&n->nd, &p->root);
 		mpol_free(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
 	spin_unlock(&p->lock);
-	p->root = RB_ROOT;
 }
 
 /* assumes fs == KERNEL_DS */
diff --git a/mm/mempool.c b/mm/mempool.c
index c9f3d4620428..65f2957b8d51 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool)
51 * functions might sleep - as long as the mempool_alloc function is not called 51 * functions might sleep - as long as the mempool_alloc function is not called
52 * from IRQ contexts. 52 * from IRQ contexts.
53 */ 53 */
54mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 54mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
55 mempool_free_t *free_fn, void *pool_data) 55 mempool_free_t *free_fn, void *pool_data)
56{ 56{
57 mempool_t *pool; 57 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
58}
59EXPORT_SYMBOL(mempool_create);
58 60
59 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 61mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
62 mempool_free_t *free_fn, void *pool_data, int node_id)
63{
64 mempool_t *pool;
65 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
60 if (!pool) 66 if (!pool)
61 return NULL; 67 return NULL;
62 memset(pool, 0, sizeof(*pool)); 68 memset(pool, 0, sizeof(*pool));
63 pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); 69 pool->elements = kmalloc_node(min_nr * sizeof(void *),
70 GFP_KERNEL, node_id);
64 if (!pool->elements) { 71 if (!pool->elements) {
65 kfree(pool); 72 kfree(pool);
66 return NULL; 73 return NULL;
@@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
87 } 94 }
88 return pool; 95 return pool;
89} 96}
90EXPORT_SYMBOL(mempool_create); 97EXPORT_SYMBOL(mempool_create_node);
91 98
92/** 99/**
93 * mempool_resize - resize an existing memory pool 100 * mempool_resize - resize an existing memory pool
@@ -197,8 +204,8 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
197{ 204{
198 void *element; 205 void *element;
199 unsigned long flags; 206 unsigned long flags;
200 DEFINE_WAIT(wait); 207 wait_queue_t wait;
201 int gfp_temp; 208 unsigned int gfp_temp;
202 209
203 might_sleep_if(gfp_mask & __GFP_WAIT); 210 might_sleep_if(gfp_mask & __GFP_WAIT);
204 211
@@ -228,6 +235,7 @@ repeat_alloc:
228 235
229 /* Now start performing page reclaim */ 236 /* Now start performing page reclaim */
230 gfp_temp = gfp_mask; 237 gfp_temp = gfp_mask;
238 init_wait(&wait);
231 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 239 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
232 smp_mb(); 240 smp_mb();
233 if (!pool->curr_nr) 241 if (!pool->curr_nr)
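
mempool_create_node() lets NUMA-aware callers place both the pool and its element array on a chosen node; mempool_create() is now just the node-agnostic (-1) case. A rough usage sketch with invented callback and function names:

static void *example_pool_alloc(unsigned int gfp_mask, void *pool_data)
{
	return kmem_cache_alloc(pool_data, gfp_mask);
}

static void example_pool_free(void *element, void *pool_data)
{
	kmem_cache_free(pool_data, element);
}

static mempool_t *example_create_pool(kmem_cache_t *cachep, int nid)
{
	/* reserve 16 elements, allocated on NUMA node 'nid' (-1 = any node) */
	return mempool_create_node(16, example_pool_alloc, example_pool_free,
				   cachep, nid);
}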
diff --git a/mm/mmap.c b/mm/mmap.c
index da3fa90a0aae..404319477e71 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	   leave 3% of the size of this process for other processes */
 	allowed -= current->mm->total_vm / 32;
 
-	if (atomic_read(&vm_committed_space) < allowed)
+	/*
+	 * cast `allowed' as a signed long because vm_committed_space
+	 * sometimes has a negative value
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
 		return 0;
 
 	vm_unacct_memory(pages);
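
The cast matters because atomic_read() yields an int while 'allowed' is unsigned long; without it a negative vm_committed_space is converted to a huge unsigned value and the check wrongly fails. A small standalone illustration, not kernel code:

#include <stdio.h>

int main(void)
{
	long committed = -8;		/* vm_committed_space dipped below zero */
	unsigned long allowed = 1000;

	/* usual arithmetic conversions make this an unsigned compare: false */
	printf("%d\n", committed < allowed);		/* prints 0 */
	/* with the cast the compare stays signed and behaves as intended */
	printf("%d\n", committed < (long)allowed);	/* prints 1 */
	return 0;
}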
diff --git a/mm/mremap.c b/mm/mremap.c
index ec7238a78f36..fc45dc9a617b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -229,6 +229,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	 * since do_munmap() will decrement it by old_len == new_len
 	 */
 	mm->total_vm += new_len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
@@ -243,7 +244,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
diff --git a/mm/nommu.c b/mm/nommu.c
index ce74452c02d9..fd4e8df0f02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1167,7 +1167,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	   leave 3% of the size of this process for other processes */
 	allowed -= current->mm->total_vm / 32;
 
-	if (atomic_read(&vm_committed_space) < allowed)
+	/*
+	 * cast `allowed' as a signed long because vm_committed_space
+	 * sometimes has a negative value
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
 		return 0;
 
 	vm_unacct_memory(pages);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 59666d905f19..1e56076672f5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -253,14 +253,16 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(unsigned int __nocast gfp_mask)
+void out_of_memory(unsigned int __nocast gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t * p;
 
-	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
-	/* print memory stats */
-	show_mem();
+	if (printk_ratelimit()) {
+		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
+			gfp_mask, order);
+		show_mem();
+	}
 
 	read_lock(&tasklist_lock);
 retry:
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 613b99a55917..a6329fa8f862 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -354,7 +354,7 @@ static void background_writeout(unsigned long _min_pages)
  * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
  * -1 if all pdflush threads were busy.
  */
-int wakeup_bdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages)
 {
 	if (nr_pages == 0) {
 		struct writeback_state wbs;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 206920796f5f..8d088371196a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
68 * Used by page_zone() to look up the address of the struct zone whose 68 * Used by page_zone() to look up the address of the struct zone whose
69 * id is encoded in the upper bits of page->flags 69 * id is encoded in the upper bits of page->flags
70 */ 70 */
71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; 71struct zone *zone_table[1 << ZONETABLE_SHIFT];
72EXPORT_SYMBOL(zone_table); 72EXPORT_SYMBOL(zone_table);
73 73
74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -897,12 +897,6 @@ rebalance:
897 cond_resched(); 897 cond_resched();
898 898
899 if (likely(did_some_progress)) { 899 if (likely(did_some_progress)) {
900 /*
901 * Go through the zonelist yet one more time, keep
902 * very high watermark here, this is only to catch
903 * a parallel oom killing, we must fail if we're still
904 * under heavy pressure.
905 */
906 for (i = 0; (z = zones[i]) != NULL; i++) { 900 for (i = 0; (z = zones[i]) != NULL; i++) {
907 if (!zone_watermark_ok(z, order, z->pages_min, 901 if (!zone_watermark_ok(z, order, z->pages_min,
908 classzone_idx, can_try_harder, 902 classzone_idx, can_try_harder,
@@ -936,7 +930,7 @@ rebalance:
936 goto got_pg; 930 goto got_pg;
937 } 931 }
938 932
939 out_of_memory(gfp_mask); 933 out_of_memory(gfp_mask, order);
940 goto restart; 934 goto restart;
941 } 935 }
942 936
@@ -1067,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1067 1061
1068static unsigned int nr_free_zone_pages(int offset) 1062static unsigned int nr_free_zone_pages(int offset)
1069{ 1063{
1070 pg_data_t *pgdat; 1064 /* Just pick one node, since fallback list is circular */
1065 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1071 unsigned int sum = 0; 1066 unsigned int sum = 0;
1072 1067
1073 for_each_pgdat(pgdat) { 1068 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1074 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1069 struct zone **zonep = zonelist->zones;
1075 struct zone **zonep = zonelist->zones; 1070 struct zone *zone;
1076 struct zone *zone;
1077 1071
1078 for (zone = *zonep++; zone; zone = *zonep++) { 1072 for (zone = *zonep++; zone; zone = *zonep++) {
1079 unsigned long size = zone->present_pages; 1073 unsigned long size = zone->present_pages;
1080 unsigned long high = zone->pages_high; 1074 unsigned long high = zone->pages_high;
1081 if (size > high) 1075 if (size > high)
1082 sum += size - high; 1076 sum += size - high;
1083 }
1084 } 1077 }
1085 1078
1086 return sum; 1079 return sum;
@@ -1649,11 +1642,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1649void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1642void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1650 unsigned long start_pfn) 1643 unsigned long start_pfn)
1651{ 1644{
1652 struct page *start = pfn_to_page(start_pfn);
1653 struct page *page; 1645 struct page *page;
1646 unsigned long end_pfn = start_pfn + size;
1647 unsigned long pfn;
1654 1648
1655 for (page = start; page < (start + size); page++) { 1649 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1656 set_page_zone(page, NODEZONE(nid, zone)); 1650 if (!early_pfn_valid(pfn))
1651 continue;
1652 if (!early_pfn_in_nid(pfn, nid))
1653 continue;
1654 page = pfn_to_page(pfn);
1655 set_page_links(page, zone, nid, pfn);
1657 set_page_count(page, 0); 1656 set_page_count(page, 0);
1658 reset_page_mapcount(page); 1657 reset_page_mapcount(page);
1659 SetPageReserved(page); 1658 SetPageReserved(page);
@@ -1661,9 +1660,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1661#ifdef WANT_PAGE_VIRTUAL 1660#ifdef WANT_PAGE_VIRTUAL
1662 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1661 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1663 if (!is_highmem_idx(zone)) 1662 if (!is_highmem_idx(zone))
1664 set_page_address(page, __va(start_pfn << PAGE_SHIFT)); 1663 set_page_address(page, __va(pfn << PAGE_SHIFT));
1665#endif 1664#endif
1666 start_pfn++;
1667 } 1665 }
1668} 1666}
1669 1667
@@ -1677,6 +1675,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1677 } 1675 }
1678} 1676}
1679 1677
1678#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1679void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1680 unsigned long size)
1681{
1682 unsigned long snum = pfn_to_section_nr(pfn);
1683 unsigned long end = pfn_to_section_nr(pfn + size);
1684
1685 if (FLAGS_HAS_NODE)
1686 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1687 else
1688 for (; snum <= end; snum++)
1689 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1690}
1691
1680#ifndef __HAVE_ARCH_MEMMAP_INIT 1692#ifndef __HAVE_ARCH_MEMMAP_INIT
1681#define memmap_init(size, nid, zone, start_pfn) \ 1693#define memmap_init(size, nid, zone, start_pfn) \
1682 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1694 memmap_init_zone((size), (nid), (zone), (start_pfn))
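
zone_table[] appears to be indexed either by (node, zone) when the node id is carried in page->flags, or by (section, zone) otherwise. A worked instance of the index computation, assuming ZONES_SHIFT is 2 purely for the arithmetic:

/* Illustrative only: mirrors ZONETABLE_INDEX() above with ZONES_SHIFT == 2. */
static inline unsigned long example_zonetable_index(unsigned long x, int zone_nr)
{
	return (x << 2) | zone_nr;	/* node 3, zone 1  ->  (3 << 2) | 1 == 13 */
}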
@@ -1742,10 +1754,17 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1742 * with interrupts disabled. 1754 * with interrupts disabled.
1743 * 1755 *
1744 * Some NUMA counter updates may also be caught by the boot pagesets. 1756 * Some NUMA counter updates may also be caught by the boot pagesets.
1745 * These will be discarded when bootup is complete. 1757 *
1758 * The boot_pagesets must be kept even after bootup is complete for
1759 * unused processors and/or zones. They do play a role for bootstrapping
1760 * hotplugged processors.
1761 *
1762 * zoneinfo_show() and maybe other functions do
1763 * not check if the processor is online before following the pageset pointer.
1764 * Other parts of the kernel may not check if the zone is available.
1746 */ 1765 */
1747static struct per_cpu_pageset 1766static struct per_cpu_pageset
1748 boot_pageset[NR_CPUS] __initdata; 1767 boot_pageset[NR_CPUS];
1749 1768
1750/* 1769/*
1751 * Dynamically allocate memory for the 1770 * Dynamically allocate memory for the
@@ -1841,7 +1860,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1841 unsigned long *zones_size, unsigned long *zholes_size) 1860 unsigned long *zones_size, unsigned long *zholes_size)
1842{ 1861{
1843 unsigned long i, j; 1862 unsigned long i, j;
1844 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1845 int cpu, nid = pgdat->node_id; 1863 int cpu, nid = pgdat->node_id;
1846 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1864 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1847 1865
@@ -1854,7 +1872,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1854 unsigned long size, realsize; 1872 unsigned long size, realsize;
1855 unsigned long batch; 1873 unsigned long batch;
1856 1874
1857 zone_table[NODEZONE(nid, j)] = zone;
1858 realsize = size = zones_size[j]; 1875 realsize = size = zones_size[j];
1859 if (zholes_size) 1876 if (zholes_size)
1860 realsize -= zholes_size[j]; 1877 realsize -= zholes_size[j];
@@ -1915,11 +1932,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1915 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1932 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1916 zone->zone_start_pfn = zone_start_pfn; 1933 zone->zone_start_pfn = zone_start_pfn;
1917 1934
1918 if ((zone_start_pfn) & (zone_required_alignment-1))
1919 printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
1920
1921 memmap_init(size, nid, j, zone_start_pfn); 1935 memmap_init(size, nid, j, zone_start_pfn);
1922 1936
1937 zonetable_add(zone, nid, j, zone_start_pfn, size);
1938
1923 zone_start_pfn += size; 1939 zone_start_pfn += size;
1924 1940
1925 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1941 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1928,24 +1944,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1928 1944
1929static void __init alloc_node_mem_map(struct pglist_data *pgdat) 1945static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1930{ 1946{
1931 unsigned long size;
1932
1933 /* Skip empty nodes */ 1947 /* Skip empty nodes */
1934 if (!pgdat->node_spanned_pages) 1948 if (!pgdat->node_spanned_pages)
1935 return; 1949 return;
1936 1950
1951#ifdef CONFIG_FLAT_NODE_MEM_MAP
1937 /* ia64 gets its own node_mem_map, before this, without bootmem */ 1952 /* ia64 gets its own node_mem_map, before this, without bootmem */
1938 if (!pgdat->node_mem_map) { 1953 if (!pgdat->node_mem_map) {
1954 unsigned long size;
1955 struct page *map;
1956
1939 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 1957 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1940 pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); 1958 map = alloc_remap(pgdat->node_id, size);
1959 if (!map)
1960 map = alloc_bootmem_node(pgdat, size);
1961 pgdat->node_mem_map = map;
1941 } 1962 }
1942#ifndef CONFIG_DISCONTIGMEM 1963#ifdef CONFIG_FLATMEM
1943 /* 1964 /*
1944 * With no DISCONTIG, the global mem_map is just set as node 0's 1965 * With no DISCONTIG, the global mem_map is just set as node 0's
1945 */ 1966 */
1946 if (pgdat == NODE_DATA(0)) 1967 if (pgdat == NODE_DATA(0))
1947 mem_map = NODE_DATA(0)->node_mem_map; 1968 mem_map = NODE_DATA(0)->node_mem_map;
1948#endif 1969#endif
1970#endif /* CONFIG_FLAT_NODE_MEM_MAP */
1949} 1971}
1950 1972
1951void __init free_area_init_node(int nid, struct pglist_data *pgdat, 1973void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1961,18 +1983,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1961 free_area_init_core(pgdat, zones_size, zholes_size); 1983 free_area_init_core(pgdat, zones_size, zholes_size);
1962} 1984}
1963 1985
1964#ifndef CONFIG_DISCONTIGMEM 1986#ifndef CONFIG_NEED_MULTIPLE_NODES
1965static bootmem_data_t contig_bootmem_data; 1987static bootmem_data_t contig_bootmem_data;
1966struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 1988struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1967 1989
1968EXPORT_SYMBOL(contig_page_data); 1990EXPORT_SYMBOL(contig_page_data);
1991#endif
1969 1992
1970void __init free_area_init(unsigned long *zones_size) 1993void __init free_area_init(unsigned long *zones_size)
1971{ 1994{
1972 free_area_init_node(0, &contig_page_data, zones_size, 1995 free_area_init_node(0, NODE_DATA(0), zones_size,
1973 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 1996 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1974} 1997}
1975#endif
1976 1998
1977#ifdef CONFIG_PROC_FS 1999#ifdef CONFIG_PROC_FS
1978 2000
diff --git a/mm/page_io.c b/mm/page_io.c
index 667c76df1ec2..2e605a19ce57 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -127,7 +127,7 @@ out:
127 return ret; 127 return ret;
128} 128}
129 129
130#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK) 130#ifdef CONFIG_SOFTWARE_SUSPEND
131/* 131/*
132 * A scruffy utility function to read or write an arbitrary swap page 132 * A scruffy utility function to read or write an arbitrary swap page
133 * and wait on the I/O. The caller must have a ref on the page. 133 * and wait on the I/O. The caller must have a ref on the page.
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
105 spin_unlock_irq(&pdflush_lock); 105 spin_unlock_irq(&pdflush_lock);
106 106
107 schedule(); 107 schedule();
108 if (try_to_freeze(PF_FREEZE)) { 108 if (try_to_freeze()) {
109 spin_lock_irq(&pdflush_lock); 109 spin_lock_irq(&pdflush_lock);
110 continue; 110 continue;
111 } 111 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 89770bd25f31..08ac5c7fa91f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,8 +247,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
247 * 247 *
248 * On success returns with mapped pte and locked mm->page_table_lock. 248 * On success returns with mapped pte and locked mm->page_table_lock.
249 */ 249 */
250static pte_t *page_check_address(struct page *page, struct mm_struct *mm, 250pte_t *page_check_address(struct page *page, struct mm_struct *mm,
251 unsigned long address) 251 unsigned long address)
252{ 252{
253 pgd_t *pgd; 253 pgd_t *pgd;
254 pud_t *pud; 254 pud_t *pud;
diff --git a/mm/shmem.c b/mm/shmem.c
index e64fa726a790..5a81b1ee4f7a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1773,32 +1773,27 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1773 return 0; 1773 return 0;
1774} 1774}
1775 1775
1776static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1776static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1777{ 1777{
1778 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); 1778 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1779 return 0; 1779 return NULL;
1780} 1780}
1781 1781
1782static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1782static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1783{ 1783{
1784 struct page *page = NULL; 1784 struct page *page = NULL;
1785 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1785 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1786 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1786 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1787 return 0; 1787 return page;
1788} 1788}
1789 1789
1790static void shmem_put_link(struct dentry *dentry, struct nameidata *nd) 1790static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1791{ 1791{
1792 if (!IS_ERR(nd_get_link(nd))) { 1792 if (!IS_ERR(nd_get_link(nd))) {
1793 struct page *page; 1793 struct page *page = cookie;
1794
1795 page = find_get_page(dentry->d_inode->i_mapping, 0);
1796 if (!page)
1797 BUG();
1798 kunmap(page); 1794 kunmap(page);
1799 mark_page_accessed(page); 1795 mark_page_accessed(page);
1800 page_cache_release(page); 1796 page_cache_release(page);
1801 page_cache_release(page);
1802 } 1797 }
1803} 1798}
1804 1799
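The shmem hunk above tracks a VFS interface change: ->follow_link() now returns an opaque cookie that the VFS hands back to ->put_link(), so the page kmap()ed while resolving the link no longer has to be looked up a second time. A minimal sketch of the pattern as a filesystem might use it, omitting the inode_operations wiring (the examplefs_* names and the get_link_page() helper are hypothetical):

static void *examplefs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
        struct page *page = NULL;
        int res = examplefs_get_link_page(dentry->d_inode, &page); /* hypothetical helper */

        nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
        return page;            /* handed back to ->put_link() as the cookie */
}

static void examplefs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
        if (!IS_ERR(nd_get_link(nd))) {
                struct page *page = cookie;  /* no find_get_page() round trip needed */

                kunmap(page);
                mark_page_accessed(page);
                page_cache_release(page);
        }
}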
diff --git a/mm/slab.c b/mm/slab.c
index 93cbbbb39f42..c9e706db4634 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -92,6 +92,7 @@
92#include <linux/sysctl.h> 92#include <linux/sysctl.h>
93#include <linux/module.h> 93#include <linux/module.h>
94#include <linux/rcupdate.h> 94#include <linux/rcupdate.h>
95#include <linux/string.h>
95 96
96#include <asm/uaccess.h> 97#include <asm/uaccess.h>
97#include <asm/cacheflush.h> 98#include <asm/cacheflush.h>
@@ -583,7 +584,8 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
583 return cachep->array[smp_processor_id()]; 584 return cachep->array[smp_processor_id()];
584} 585}
585 586
586static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) 587static inline kmem_cache_t *__find_general_cachep(size_t size,
588 unsigned int __nocast gfpflags)
587{ 589{
588 struct cache_sizes *csizep = malloc_sizes; 590 struct cache_sizes *csizep = malloc_sizes;
589 591
@@ -607,7 +609,8 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags)
607 return csizep->cs_cachep; 609 return csizep->cs_cachep;
608} 610}
609 611
610kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) 612kmem_cache_t *kmem_find_general_cachep(size_t size,
613 unsigned int __nocast gfpflags)
611{ 614{
612 return __find_general_cachep(size, gfpflags); 615 return __find_general_cachep(size, gfpflags);
613} 616}
@@ -2099,7 +2102,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
2099#if DEBUG 2102#if DEBUG
2100static void * 2103static void *
2101cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2104cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2102 unsigned long flags, void *objp, void *caller) 2105 unsigned int __nocast flags, void *objp, void *caller)
2103{ 2106{
2104 if (!objp) 2107 if (!objp)
2105 return objp; 2108 return objp;
@@ -2371,6 +2374,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2371 struct slab *slabp; 2374 struct slab *slabp;
2372 kmem_bufctl_t next; 2375 kmem_bufctl_t next;
2373 2376
2377 if (nodeid == -1)
2378 return kmem_cache_alloc(cachep, flags);
2379
2374 for (loop = 0;;loop++) { 2380 for (loop = 0;;loop++) {
2375 struct list_head *q; 2381 struct list_head *q;
2376 2382
@@ -2438,7 +2444,7 @@ got_slabp:
2438} 2444}
2439EXPORT_SYMBOL(kmem_cache_alloc_node); 2445EXPORT_SYMBOL(kmem_cache_alloc_node);
2440 2446
2441void *kmalloc_node(size_t size, int flags, int node) 2447void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
2442{ 2448{
2443 kmem_cache_t *cachep; 2449 kmem_cache_t *cachep;
2444 2450
@@ -3082,3 +3088,26 @@ unsigned int ksize(const void *objp)
3082 3088
3083 return size; 3089 return size;
3084} 3090}
3091
3092
3093/*
3094 * kstrdup - allocate space for and copy an existing string
3095 *
3096 * @s: the string to duplicate
3097 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3098 */
3099char *kstrdup(const char *s, unsigned int __nocast gfp)
3100{
3101 size_t len;
3102 char *buf;
3103
3104 if (!s)
3105 return NULL;
3106
3107 len = strlen(s) + 1;
3108 buf = kmalloc(len, gfp);
3109 if (buf)
3110 memcpy(buf, s, len);
3111 return buf;
3112}
3113EXPORT_SYMBOL(kstrdup);
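The new kstrdup() pairs naturally with kfree(); a trivial usage sketch (example_item and its field are made up for illustration):

struct example_item {
        char *name;              /* hypothetical structure, not part of this patch */
};

static int example_item_set_name(struct example_item *item, const char *name)
{
        char *copy = kstrdup(name, GFP_KERNEL);

        if (!copy)
                return -ENOMEM;
        kfree(item->name);       /* drop any previous copy; kfree(NULL) is a no-op */
        item->name = copy;
        return 0;
}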
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..b54e304df4a7
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,137 @@
1/*
2 * sparse memory mappings.
3 */
4#include <linux/config.h>
5#include <linux/mm.h>
6#include <linux/mmzone.h>
7#include <linux/bootmem.h>
8#include <linux/module.h>
9#include <asm/dma.h>
10
11/*
12 * Permanent SPARSEMEM data:
13 *
14 * 1) mem_section - memory sections, mem_map's for valid memory
15 */
16struct mem_section mem_section[NR_MEM_SECTIONS];
17EXPORT_SYMBOL(mem_section);
18
19/* Record a memory area against a node. */
20void memory_present(int nid, unsigned long start, unsigned long end)
21{
22 unsigned long pfn;
23
24 start &= PAGE_SECTION_MASK;
25 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
26 unsigned long section = pfn_to_section_nr(pfn);
27 if (!mem_section[section].section_mem_map)
28 mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
29 }
30}
31
32/*
33 * Only used by the i386 NUMA architectures, but relatively
34 * generic code.
35 */
36unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
37 unsigned long end_pfn)
38{
39 unsigned long pfn;
40 unsigned long nr_pages = 0;
41
42 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
43 if (nid != early_pfn_to_nid(pfn))
44 continue;
45
46 if (pfn_valid(pfn))
47 nr_pages += PAGES_PER_SECTION;
48 }
49
50 return nr_pages * sizeof(struct page);
51}
52
53/*
54 * Subtle: we encode the section's starting pfn into its mem_map pointer,
55 * so that the identity page - section_mem_map returns the page's actual
56 * physical page frame number.
57 */
58static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
59{
60 return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
61}
62
63/*
64 * We need this if we ever free the mem_maps. While not implemented yet,
65 * this function is included for parity with its sibling.
66 */
67static __attribute((unused))
68struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
69{
70 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
71}
72
73static int sparse_init_one_section(struct mem_section *ms,
74 unsigned long pnum, struct page *mem_map)
75{
76 if (!valid_section(ms))
77 return -EINVAL;
78
79 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
80
81 return 1;
82}
83
84static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
85{
86 struct page *map;
87 int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
88
89 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
90 if (map)
91 return map;
92
93 map = alloc_bootmem_node(NODE_DATA(nid),
94 sizeof(struct page) * PAGES_PER_SECTION);
95 if (map)
96 return map;
97
98 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
99 mem_section[pnum].section_mem_map = 0;
100 return NULL;
101}
102
103/*
104 * Allocate the accumulated non-linear sections, allocate a mem_map
105 * for each and record the physical to section mapping.
106 */
107void sparse_init(void)
108{
109 unsigned long pnum;
110 struct page *map;
111
112 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
113 if (!valid_section_nr(pnum))
114 continue;
115
116 map = sparse_early_mem_map_alloc(pnum);
117 if (map)
118 sparse_init_one_section(&mem_section[pnum], pnum, map);
119 }
120}
121
122/*
123 * Returns the number of sections whose mem_maps were properly
124 * set.  If this is <= 0, the passed-in map was not consumed and
125 * must be freed.
126 */
127int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
128{
129 struct mem_section *ms = __pfn_to_section(start_pfn);
130
131 if (ms->section_mem_map & SECTION_MARKED_PRESENT)
132 return -EEXIST;
133
134 ms->section_mem_map |= SECTION_MARKED_PRESENT;
135
136 return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
137}
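sparse_encode_mem_map() stores mem_map biased back by the section's first pfn, so pfn_to_page() collapses to a single addition and the pointer difference page - section_mem_map recovers the pfn. A user-space sketch of the same arithmetic (PAGES_PER_SECTION == 4 and the section number are made-up values; the kernel keeps the biased pointer as an unsigned long inside section_mem_map):

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION 4UL            /* illustrative value only */

struct page { int unused; };

int main(void)
{
        struct page mem_map[PAGES_PER_SECTION];          /* this section's mem_map */
        unsigned long pnum = 10;                         /* pretend section number */
        unsigned long start_pfn = pnum * PAGES_PER_SECTION;

        /* encode: bias the pointer back by the section's first pfn ... */
        struct page *coded = mem_map - start_pfn;

        /* ... so a lookup is coded + pfn, and page - coded gives back the pfn. */
        for (unsigned long pfn = start_pfn; pfn < start_pfn + PAGES_PER_SECTION; pfn++) {
                struct page *page = coded + pfn;

                assert(page == &mem_map[pfn - start_pfn]);
                assert((unsigned long)(page - coded) == pfn);
        }
        printf("ok\n");
        return 0;
}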
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b8e62a19370..cfffe5098d53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -972,7 +972,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
972 * writeout. So in laptop mode, write out the whole world. 972 * writeout. So in laptop mode, write out the whole world.
973 */ 973 */
974 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 974 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
975 wakeup_bdflush(laptop_mode ? 0 : total_scanned); 975 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
976 sc.may_writepage = 1; 976 sc.may_writepage = 1;
977 } 977 }
978 978
@@ -1216,8 +1216,8 @@ static int kswapd(void *p)
1216 order = 0; 1216 order = 0;
1217 for ( ; ; ) { 1217 for ( ; ; ) {
1218 unsigned long new_order; 1218 unsigned long new_order;
1219 if (current->flags & PF_FREEZE) 1219
1220 refrigerator(PF_FREEZE); 1220 try_to_freeze();
1221 1221
1222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1223 new_order = pgdat->kswapd_max_order; 1223 new_order = pgdat->kswapd_max_order;