Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig       |  91
-rw-r--r--  mm/Makefile      |   2
-rw-r--r--  mm/bootmem.c     |  23
-rw-r--r--  mm/fadvise.c     |   4
-rw-r--r--  mm/filemap.c     |  91
-rw-r--r--  mm/filemap.h     |  94
-rw-r--r--  mm/filemap_xip.c | 447
-rw-r--r--  mm/madvise.c     |   6
-rw-r--r--  mm/memory.c      |   6
-rw-r--r--  mm/mempool.c     |  20
-rw-r--r--  mm/page_alloc.c  |  51
-rw-r--r--  mm/page_io.c     |   2
-rw-r--r--  mm/pdflush.c     |   2
-rw-r--r--  mm/rmap.c        |   4
-rw-r--r--  mm/slab.c        |  24
-rw-r--r--  mm/sparse.c      | 137
-rw-r--r--  mm/vmscan.c      |   4
17 files changed, 894 insertions(+), 114 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 000000000000..cd379936cac6
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,91 @@
1config SELECT_MEMORY_MODEL
2 def_bool y
3 depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
4
5choice
6 prompt "Memory model"
7 depends on SELECT_MEMORY_MODEL
8 default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
9 default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
10 default FLATMEM_MANUAL
11
12config FLATMEM_MANUAL
13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
15 help
16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will
18 only have one option here: FLATMEM. This is normal
19 and a correct option.
20
21 Some users of more advanced features like NUMA and
22 memory hotplug may have different options here.
23 DISCONTIGMEM is a more mature, better tested system,
24 but is incompatible with memory hotplug and may suffer
25 decreased performance compared to SPARSEMEM. If unsure between
26 "Sparse Memory" and "Discontiguous Memory", choose
27 "Discontiguous Memory".
28
29 If unsure, choose this option (Flat Memory) over any other.
30
31config DISCONTIGMEM_MANUAL
32 bool "Discontiguous Memory"
33 depends on ARCH_DISCONTIGMEM_ENABLE
34 help
35 This option provides enhanced support for discontiguous
36 memory systems, over FLATMEM. These systems have holes
37 in their physical address spaces, and this option provides
38 more efficient handling of these holes. However, the vast
39 majority of hardware has quite flat address spaces, and
40 can have degraded performance from extra overhead that
41 this option imposes.
42
43 Many NUMA configurations will have this as the only option.
44
45 If unsure, choose "Flat Memory" over this option.
46
47config SPARSEMEM_MANUAL
48 bool "Sparse Memory"
49 depends on ARCH_SPARSEMEM_ENABLE
50 help
51 This will be the only option for some systems, including
52 memory hotplug systems. This is normal.
53
54 For many other systems, this will be an alternative to
55 "Discontiguous Memory". This option provides some potential
56 performance benefits, along with decreased code complexity,
57 but it is newer and more experimental.
58
59 If unsure, choose "Discontiguous Memory" or "Flat Memory"
60 over this option.
61
62endchoice
63
64config DISCONTIGMEM
65 def_bool y
66 depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
67
68config SPARSEMEM
69 def_bool y
70 depends on SPARSEMEM_MANUAL
71
72config FLATMEM
73 def_bool y
74 depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
75
76config FLAT_NODE_MEM_MAP
77 def_bool y
78 depends on !SPARSEMEM
79
80#
81# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
82# to represent different areas of memory. This variable allows
83# those dependencies to exist individually.
84#
85config NEED_MULTIPLE_NODES
86 def_bool y
87 depends on DISCONTIGMEM || NUMA
88
89config HAVE_MEMORY_PRESENT
90 def_bool y
91 depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
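For illustration of how the new symbols are meant to be consumed by C code, here is a sketch under assumptions, not part of this patch: with CONFIG_NEED_MULTIPLE_NODES unset there is a single pg_data_t, the contig_page_data also touched in page_alloc.c below, while NUMA and DISCONTIGMEM architectures typically keep a per-node array. The node_data[] name is a common arch convention, not something this patch defines.

/*
 * Sketch only: typical shape of NODE_DATA() on either side of the new
 * CONFIG_NEED_MULTIPLE_NODES symbol.  node_data[] is a hypothetical
 * arch-provided array, not introduced by this patch.
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES
extern struct pglist_data *node_data[];
#define NODE_DATA(nid)		(node_data[nid])
#else
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid)		(&contig_page_data)
#endif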
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..4cd69e3ce421 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
17obj-$(CONFIG_NUMA) += mempolicy.o 17obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o
18obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
19obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
20 21
22obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,14 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so
33 * dma_get_required_mask(), which uses 33 * dma_get_required_mask(), which uses
34 * it, can be an inline function */ 34 * it, can be an inline function */
35 35
36#ifdef CONFIG_CRASH_DUMP
37/*
38 * If we have booted due to a crash, max_pfn will be a very low value. We need
39 * to know the amount of memory that the previous kernel used.
40 */
41unsigned long saved_max_pfn;
42#endif
43
36/* return the number of _pages_ that will be allocated for the boot bitmap */ 44/* return the number of _pages_ that will be allocated for the boot bitmap */
37unsigned long __init bootmem_bootmap_pages (unsigned long pages) 45unsigned long __init bootmem_bootmap_pages (unsigned long pages)
38{ 46{
@@ -57,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
57 pgdat->pgdat_next = pgdat_list; 65 pgdat->pgdat_next = pgdat_list;
58 pgdat_list = pgdat; 66 pgdat_list = pgdat;
59 67
60 mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); 68 mapsize = ALIGN(mapsize, sizeof(long));
61 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 69 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
62 bdata->node_boot_start = (start << PAGE_SHIFT); 70 bdata->node_boot_start = (start << PAGE_SHIFT);
63 bdata->node_low_pfn = end; 71 bdata->node_low_pfn = end;
@@ -178,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
178 } else 186 } else
179 preferred = 0; 187 preferred = 0;
180 188
181 preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; 189 preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
182 preferred += offset; 190 preferred += offset;
183 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; 191 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
184 incr = align >> PAGE_SHIFT ? : 1; 192 incr = align >> PAGE_SHIFT ? : 1;
@@ -219,7 +227,7 @@ found:
219 */ 227 */
220 if (align < PAGE_SIZE && 228 if (align < PAGE_SIZE &&
221 bdata->last_offset && bdata->last_pos+1 == start) { 229 bdata->last_offset && bdata->last_pos+1 == start) {
222 offset = (bdata->last_offset+align-1) & ~(align-1); 230 offset = ALIGN(bdata->last_offset, align);
223 BUG_ON(offset > PAGE_SIZE); 231 BUG_ON(offset > PAGE_SIZE);
224 remaining_size = PAGE_SIZE-offset; 232 remaining_size = PAGE_SIZE-offset;
225 if (size < remaining_size) { 233 if (size < remaining_size) {
@@ -256,6 +264,7 @@ found:
256static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 264static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
257{ 265{
258 struct page *page; 266 struct page *page;
267 unsigned long pfn;
259 bootmem_data_t *bdata = pgdat->bdata; 268 bootmem_data_t *bdata = pgdat->bdata;
260 unsigned long i, count, total = 0; 269 unsigned long i, count, total = 0;
261 unsigned long idx; 270 unsigned long idx;
@@ -266,7 +275,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
266 275
267 count = 0; 276 count = 0;
268 /* first extant page of the node */ 277 /* first extant page of the node */
269 page = virt_to_page(phys_to_virt(bdata->node_boot_start)); 278 pfn = bdata->node_boot_start >> PAGE_SHIFT;
270 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 279 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
271 map = bdata->node_bootmem_map; 280 map = bdata->node_bootmem_map;
272 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ 281 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +284,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
275 gofast = 1; 284 gofast = 1;
276 for (i = 0; i < idx; ) { 285 for (i = 0; i < idx; ) {
277 unsigned long v = ~map[i / BITS_PER_LONG]; 286 unsigned long v = ~map[i / BITS_PER_LONG];
287
278 if (gofast && v == ~0UL) { 288 if (gofast && v == ~0UL) {
279 int j, order; 289 int j, order;
280 290
291 page = pfn_to_page(pfn);
281 count += BITS_PER_LONG; 292 count += BITS_PER_LONG;
282 __ClearPageReserved(page); 293 __ClearPageReserved(page);
283 order = ffs(BITS_PER_LONG) - 1; 294 order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +303,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
292 page += BITS_PER_LONG; 303 page += BITS_PER_LONG;
293 } else if (v) { 304 } else if (v) {
294 unsigned long m; 305 unsigned long m;
306
307 page = pfn_to_page(pfn);
295 for (m = 1; m && i < idx; m<<=1, page++, i++) { 308 for (m = 1; m && i < idx; m<<=1, page++, i++) {
296 if (v & m) { 309 if (v & m) {
297 count++; 310 count++;
@@ -302,8 +315,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
302 } 315 }
303 } else { 316 } else {
304 i+=BITS_PER_LONG; 317 i+=BITS_PER_LONG;
305 page += BITS_PER_LONG;
306 } 318 }
319 pfn += BITS_PER_LONG;
307 } 320 }
308 total += count; 321 total += count;
309 322
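The conversions above replace open-coded round-up arithmetic with ALIGN(). For reference, the expression being replaced is a power-of-two align-up; below is a standalone userspace sketch with a worked example. The align_up name is ours; the kernel's ALIGN() macro expands to the same expression.

#include <assert.h>
#include <stdio.h>

/* Round x up to the next multiple of a; a must be a power of two. */
static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	assert(align_up(13, 8) == 16);	/* rounded up to the next boundary */
	assert(align_up(16, 8) == 16);	/* already aligned values are unchanged */
	printf("%lu\n", align_up(13, 8));
	return 0;
}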
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 57264d74b8bf..5f19e87bc5af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -43,6 +43,10 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
43 goto out; 43 goto out;
44 } 44 }
45 45
46 if (mapping->a_ops->get_xip_page)
47 /* no bad return value, but ignore advice */
48 goto out;
49
46 /* Careful about overflows. Len == 0 means "as much as possible" */ 50 /* Careful about overflows. Len == 0 means "as much as possible" */
47 endbyte = offset + len; 51 endbyte = offset + len;
48 if (!len || endbyte < len) 52 if (!len || endbyte < len)
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a2fee2cb62b..c11418dd94e8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/security.h> 29#include <linux/security.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include "filemap.h"
31/* 32/*
32 * FIXME: remove all knowledge of the buffer layer from the core VM 33 * FIXME: remove all knowledge of the buffer layer from the core VM
33 */ 34 */
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
1714} 1715}
1715EXPORT_SYMBOL(remove_suid); 1716EXPORT_SYMBOL(remove_suid);
1716 1717
1717/* 1718size_t
1718 * Copy as much as we can into the page and return the number of bytes which
1719 * were sucessfully copied. If a fault is encountered then clear the page
1720 * out to (offset+bytes) and return the number of bytes which were copied.
1721 */
1722static inline size_t
1723filemap_copy_from_user(struct page *page, unsigned long offset,
1724 const char __user *buf, unsigned bytes)
1725{
1726 char *kaddr;
1727 int left;
1728
1729 kaddr = kmap_atomic(page, KM_USER0);
1730 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1731 kunmap_atomic(kaddr, KM_USER0);
1732
1733 if (left != 0) {
1734 /* Do it the slow way */
1735 kaddr = kmap(page);
1736 left = __copy_from_user(kaddr + offset, buf, bytes);
1737 kunmap(page);
1738 }
1739 return bytes - left;
1740}
1741
1742static size_t
1743__filemap_copy_from_user_iovec(char *vaddr, 1719__filemap_copy_from_user_iovec(char *vaddr,
1744 const struct iovec *iov, size_t base, size_t bytes) 1720 const struct iovec *iov, size_t base, size_t bytes)
1745{ 1721{
@@ -1767,52 +1743,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
1767} 1743}
1768 1744
1769/* 1745/*
1770 * This has the same sideeffects and return value as filemap_copy_from_user().
1771 * The difference is that on a fault we need to memset the remainder of the
1772 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
1773 * single-segment behaviour.
1774 */
1775static inline size_t
1776filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1777 const struct iovec *iov, size_t base, size_t bytes)
1778{
1779 char *kaddr;
1780 size_t copied;
1781
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1784 base, bytes);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 if (copied != bytes) {
1787 kaddr = kmap(page);
1788 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1789 base, bytes);
1790 kunmap(page);
1791 }
1792 return copied;
1793}
1794
1795static inline void
1796filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1797{
1798 const struct iovec *iov = *iovp;
1799 size_t base = *basep;
1800
1801 while (bytes) {
1802 int copy = min(bytes, iov->iov_len - base);
1803
1804 bytes -= copy;
1805 base += copy;
1806 if (iov->iov_len == base) {
1807 iov++;
1808 base = 0;
1809 }
1810 }
1811 *iovp = iov;
1812 *basep = base;
1813}
1814
1815/*
1816 * Performs necessary checks before doing a write 1746 * Performs necessary checks before doing a write
1817 * 1747 *
1818 * Can adjust writing position or amount of bytes to write. 1748
@@ -1827,12 +1757,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1827 if (unlikely(*pos < 0)) 1757 if (unlikely(*pos < 0))
1828 return -EINVAL; 1758 return -EINVAL;
1829 1759
1830 if (unlikely(file->f_error)) {
1831 int err = file->f_error;
1832 file->f_error = 0;
1833 return err;
1834 }
1835
1836 if (!isblk) { 1760 if (!isblk) {
1837 /* FIXME: this is for backwards compatibility with 2.4 */ 1761 /* FIXME: this is for backwards compatibility with 2.4 */
1838 if (file->f_flags & O_APPEND) 1762 if (file->f_flags & O_APPEND)
@@ -1927,8 +1851,11 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1927 * i_sem is held, which protects generic_osync_inode() from 1851 * i_sem is held, which protects generic_osync_inode() from
1928 * livelocking. 1852 * livelocking.
1929 */ 1853 */
1930 if (written >= 0 && file->f_flags & O_SYNC) 1854 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1931 generic_osync_inode(inode, mapping, OSYNC_METADATA); 1855 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1856 if (err < 0)
1857 written = err;
1858 }
1932 if (written == count && !is_sync_kiocb(iocb)) 1859 if (written == count && !is_sync_kiocb(iocb))
1933 written = -EIOCBQUEUED; 1860 written = -EIOCBQUEUED;
1934 return written; 1861 return written;
@@ -2027,7 +1954,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2027 if (unlikely(nr_segs > 1)) { 1954 if (unlikely(nr_segs > 1)) {
2028 filemap_set_next_iovec(&cur_iov, 1955 filemap_set_next_iovec(&cur_iov,
2029 &iov_base, status); 1956 &iov_base, status);
2030 buf = cur_iov->iov_base + iov_base; 1957 if (count)
1958 buf = cur_iov->iov_base +
1959 iov_base;
2031 } else { 1960 } else {
2032 iov_base += status; 1961 iov_base += status;
2033 } 1962 }
diff --git a/mm/filemap.h b/mm/filemap.h
new file mode 100644
index 000000000000..13793ba0ce17
--- /dev/null
+++ b/mm/filemap.h
@@ -0,0 +1,94 @@
1/*
2 * linux/mm/filemap.h
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7#ifndef __FILEMAP_H
8#define __FILEMAP_H
9
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/highmem.h>
14#include <linux/uio.h>
15#include <linux/config.h>
16#include <asm/uaccess.h>
17
18size_t
19__filemap_copy_from_user_iovec(char *vaddr,
20 const struct iovec *iov,
21 size_t base,
22 size_t bytes);
23
24/*
25 * Copy as much as we can into the page and return the number of bytes which
26 * were successfully copied. If a fault is encountered then clear the page
27 * out to (offset+bytes) and return the number of bytes which were copied.
28 */
29static inline size_t
30filemap_copy_from_user(struct page *page, unsigned long offset,
31 const char __user *buf, unsigned bytes)
32{
33 char *kaddr;
34 int left;
35
36 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0);
39
40 if (left != 0) {
41 /* Do it the slow way */
42 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes);
44 kunmap(page);
45 }
46 return bytes - left;
47}
48
49/*
50 * This has the same side effects and return value as filemap_copy_from_user().
51 * The difference is that on a fault we need to memset the remainder of the
52 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
53 * single-segment behaviour.
54 */
55static inline size_t
56filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
57 const struct iovec *iov, size_t base, size_t bytes)
58{
59 char *kaddr;
60 size_t copied;
61
62 kaddr = kmap_atomic(page, KM_USER0);
63 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
64 base, bytes);
65 kunmap_atomic(kaddr, KM_USER0);
66 if (copied != bytes) {
67 kaddr = kmap(page);
68 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
69 base, bytes);
70 kunmap(page);
71 }
72 return copied;
73}
74
75static inline void
76filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
77{
78 const struct iovec *iov = *iovp;
79 size_t base = *basep;
80
81 while (bytes) {
82 int copy = min(bytes, iov->iov_len - base);
83
84 bytes -= copy;
85 base += copy;
86 if (iov->iov_len == base) {
87 iov++;
88 base = 0;
89 }
90 }
91 *iovp = iov;
92 *basep = base;
93}
94#endif
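The iovec helpers moved into this header are easiest to see with a worked example. The sketch below reimplements the cursor advance of filemap_set_next_iovec() with userspace types only (an illustration, not kernel code): after consuming 14 bytes from a two-segment iovec of 10 bytes each, the cursor ends up 4 bytes into the second segment.

#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

/* Same advance logic as filemap_set_next_iovec(), userspace types. */
static void advance(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	while (bytes) {
		size_t copy = iov->iov_len - base;

		if (copy > bytes)
			copy = bytes;
		bytes -= copy;
		base += copy;
		if (iov->iov_len == base) {
			iov++;
			base = 0;
		}
	}
	*iovp = iov;
	*basep = base;
}

int main(void)
{
	char a[10], b[10];
	struct iovec vec[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	const struct iovec *cur = vec;
	size_t base = 0;

	advance(&cur, &base, 14);
	assert(cur == &vec[1] && base == 4);
	return 0;
}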
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 000000000000..3b6e384b98a6
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,447 @@
1/*
2 * linux/mm/filemap_xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte <cotte@de.ibm.com>
6 *
7 * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
8 *
9 */
10
11#include <linux/fs.h>
12#include <linux/pagemap.h>
13#include <linux/module.h>
14#include <linux/uio.h>
15#include <linux/rmap.h>
16#include <asm/tlbflush.h>
17#include "filemap.h"
18
19/*
20 * This is a file read routine for execute in place files, and uses
21 * the mapping->a_ops->get_xip_page() function for the actual low-level
22 * stuff.
23 *
24 * Note the struct file* is not used at all. It may be NULL.
25 */
26static void
27do_xip_mapping_read(struct address_space *mapping,
28 struct file_ra_state *_ra,
29 struct file *filp,
30 loff_t *ppos,
31 read_descriptor_t *desc,
32 read_actor_t actor)
33{
34 struct inode *inode = mapping->host;
35 unsigned long index, end_index, offset;
36 loff_t isize;
37
38 BUG_ON(!mapping->a_ops->get_xip_page);
39
40 index = *ppos >> PAGE_CACHE_SHIFT;
41 offset = *ppos & ~PAGE_CACHE_MASK;
42
43 isize = i_size_read(inode);
44 if (!isize)
45 goto out;
46
47 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
48 for (;;) {
49 struct page *page;
50 unsigned long nr, ret;
51
52 /* nr is the maximum number of bytes to copy from this page */
53 nr = PAGE_CACHE_SIZE;
54 if (index >= end_index) {
55 if (index > end_index)
56 goto out;
57 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
58 if (nr <= offset) {
59 goto out;
60 }
61 }
62 nr = nr - offset;
63
64 page = mapping->a_ops->get_xip_page(mapping,
65 index*(PAGE_SIZE/512), 0);
66 if (!page)
67 goto no_xip_page;
68 if (unlikely(IS_ERR(page))) {
69 if (PTR_ERR(page) == -ENODATA) {
70 /* sparse */
71 page = virt_to_page(empty_zero_page);
72 } else {
73 desc->error = PTR_ERR(page);
74 goto out;
75 }
76 } else
77 BUG_ON(!PageUptodate(page));
78
79 /* If users can be writing to this page using arbitrary
80 * virtual addresses, take care about potential aliasing
81 * before reading the page on the kernel side.
82 */
83 if (mapping_writably_mapped(mapping))
84 flush_dcache_page(page);
85
86 /*
87 * Ok, we have the page, and it's up-to-date, so
88 * now we can copy it to user space...
89 *
90 * The actor routine returns how many bytes were actually used..
91 * NOTE! This may not be the same as how much of a user buffer
92 * we filled up (we may be padding etc), so we can only update
93 * "pos" here (the actor routine has to update the user buffer
94 * pointers and the remaining count).
95 */
96 ret = actor(desc, page, offset, nr);
97 offset += ret;
98 index += offset >> PAGE_CACHE_SHIFT;
99 offset &= ~PAGE_CACHE_MASK;
100
101 if (ret == nr && desc->count)
102 continue;
103 goto out;
104
105no_xip_page:
106 /* Did not get the page. Report it */
107 desc->error = -EIO;
108 goto out;
109 }
110
111out:
112 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
113 if (filp)
114 file_accessed(filp);
115}
116
117ssize_t
118xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
119{
120 read_descriptor_t desc;
121
122 if (!access_ok(VERIFY_WRITE, buf, len))
123 return -EFAULT;
124
125 desc.written = 0;
126 desc.arg.buf = buf;
127 desc.count = len;
128 desc.error = 0;
129
130 do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
131 ppos, &desc, file_read_actor);
132
133 if (desc.written)
134 return desc.written;
135 else
136 return desc.error;
137}
138EXPORT_SYMBOL_GPL(xip_file_read);
139
140ssize_t
141xip_file_sendfile(struct file *in_file, loff_t *ppos,
142 size_t count, read_actor_t actor, void *target)
143{
144 read_descriptor_t desc;
145
146 if (!count)
147 return 0;
148
149 desc.written = 0;
150 desc.count = count;
151 desc.arg.data = target;
152 desc.error = 0;
153
154 do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
155 ppos, &desc, actor);
156 if (desc.written)
157 return desc.written;
158 return desc.error;
159}
160EXPORT_SYMBOL_GPL(xip_file_sendfile);
161
162/*
163 * __xip_unmap is invoked from xip_file_nopage and
164 * __xip_file_write
165 *
166 * This function walks all vmas of the address_space and unmaps the
167 * empty_zero_page when found at pgoff. Should it go in rmap.c?
168 */
169static void
170__xip_unmap (struct address_space * mapping,
171 unsigned long pgoff)
172{
173 struct vm_area_struct *vma;
174 struct mm_struct *mm;
175 struct prio_tree_iter iter;
176 unsigned long address;
177 pte_t *pte;
178 pte_t pteval;
179
180 spin_lock(&mapping->i_mmap_lock);
181 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
182 mm = vma->vm_mm;
183 address = vma->vm_start +
184 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
185 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
186 /*
187 * We need the page_table_lock to protect us from page faults,
188 * munmap, fork, etc...
189 */
190 pte = page_check_address(virt_to_page(empty_zero_page), mm,
191 address);
192 if (!IS_ERR(pte)) {
193 /* Nuke the page table entry. */
194 flush_cache_page(vma, address, pte_pfn(*pte));
195 pteval = ptep_clear_flush(vma, address, pte);
196 BUG_ON(pte_dirty(pteval));
197 pte_unmap(pte);
198 spin_unlock(&mm->page_table_lock);
199 }
200 }
201 spin_unlock(&mapping->i_mmap_lock);
202}
203
204/*
205 * xip_nopage() is invoked via the vma operations vector for a
206 * mapped memory region to read in file data during a page fault.
207 *
208 * This function is derived from filemap_nopage, but used for execute in place
209 */
210static struct page *
211xip_file_nopage(struct vm_area_struct * area,
212 unsigned long address,
213 int *type)
214{
215 struct file *file = area->vm_file;
216 struct address_space *mapping = file->f_mapping;
217 struct inode *inode = mapping->host;
218 struct page *page;
219 unsigned long size, pgoff, endoff;
220
221 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
222 + area->vm_pgoff;
223 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
224 + area->vm_pgoff;
225
226 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
227 if (pgoff >= size) {
228 return NULL;
229 }
230
231 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
232 if (!IS_ERR(page)) {
233 BUG_ON(!PageUptodate(page));
234 return page;
235 }
236 if (PTR_ERR(page) != -ENODATA)
237 return NULL;
238
239 /* sparse block */
240 if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
241 (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
242 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
243 /* maybe shared writable, allocate new block */
244 page = mapping->a_ops->get_xip_page (mapping,
245 pgoff*(PAGE_SIZE/512), 1);
246 if (IS_ERR(page))
247 return NULL;
248 BUG_ON(!PageUptodate(page));
249 /* unmap page at pgoff from all other vmas */
250 __xip_unmap(mapping, pgoff);
251 } else {
252 /* not shared and writable, use empty_zero_page */
253 page = virt_to_page(empty_zero_page);
254 }
255
256 return page;
257}
258
259static struct vm_operations_struct xip_file_vm_ops = {
260 .nopage = xip_file_nopage,
261};
262
263int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
264{
265 BUG_ON(!file->f_mapping->a_ops->get_xip_page);
266
267 file_accessed(file);
268 vma->vm_ops = &xip_file_vm_ops;
269 return 0;
270}
271EXPORT_SYMBOL_GPL(xip_file_mmap);
272
273static ssize_t
274__xip_file_write(struct file *filp, const char __user *buf,
275 size_t count, loff_t pos, loff_t *ppos)
276{
277 struct address_space * mapping = filp->f_mapping;
278 struct address_space_operations *a_ops = mapping->a_ops;
279 struct inode *inode = mapping->host;
280 long status = 0;
281 struct page *page;
282 size_t bytes;
283 ssize_t written = 0;
284
285 BUG_ON(!mapping->a_ops->get_xip_page);
286
287 do {
288 unsigned long index;
289 unsigned long offset;
290 size_t copied;
291
292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
293 index = pos >> PAGE_CACHE_SHIFT;
294 bytes = PAGE_CACHE_SIZE - offset;
295 if (bytes > count)
296 bytes = count;
297
298 /*
299 * Bring in the user page that we will copy from _first_.
300 * Otherwise there's a nasty deadlock on copying from the
301 * same page as we're writing to, without it being marked
302 * up-to-date.
303 */
304 fault_in_pages_readable(buf, bytes);
305
306 page = a_ops->get_xip_page(mapping,
307 index*(PAGE_SIZE/512), 0);
308 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
309 /* we allocate a new page and unmap it */
310 page = a_ops->get_xip_page(mapping,
311 index*(PAGE_SIZE/512), 1);
312 if (!IS_ERR(page))
313 /* unmap page at pgoff from all other vmas */
314 __xip_unmap(mapping, index);
315 }
316
317 if (IS_ERR(page)) {
318 status = PTR_ERR(page);
319 break;
320 }
321
322 BUG_ON(!PageUptodate(page));
323
324 copied = filemap_copy_from_user(page, offset, buf, bytes);
325 flush_dcache_page(page);
326 if (likely(copied > 0)) {
327 status = copied;
328
329 if (status >= 0) {
330 written += status;
331 count -= status;
332 pos += status;
333 buf += status;
334 }
335 }
336 if (unlikely(copied != bytes))
337 if (status >= 0)
338 status = -EFAULT;
339 if (status < 0)
340 break;
341 } while (count);
342 *ppos = pos;
343 /*
344 * No need to use i_size_read() here, the i_size
345 * cannot change under us because we hold i_sem.
346 */
347 if (pos > inode->i_size) {
348 i_size_write(inode, pos);
349 mark_inode_dirty(inode);
350 }
351
352 return written ? written : status;
353}
354
355ssize_t
356xip_file_write(struct file *filp, const char __user *buf, size_t len,
357 loff_t *ppos)
358{
359 struct address_space *mapping = filp->f_mapping;
360 struct inode *inode = mapping->host;
361 size_t count;
362 loff_t pos;
363 ssize_t ret;
364
365 down(&inode->i_sem);
366
367 if (!access_ok(VERIFY_READ, buf, len)) {
368 ret=-EFAULT;
369 goto out_up;
370 }
371
372 pos = *ppos;
373 count = len;
374
375 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
376
377 /* We can write back this queue in page reclaim */
378 current->backing_dev_info = mapping->backing_dev_info;
379
380 ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
381 if (ret)
382 goto out_backing;
383 if (count == 0)
384 goto out_backing;
385
386 ret = remove_suid(filp->f_dentry);
387 if (ret)
388 goto out_backing;
389
390 inode_update_time(inode, 1);
391
392 ret = __xip_file_write (filp, buf, count, pos, ppos);
393
394 out_backing:
395 current->backing_dev_info = NULL;
396 out_up:
397 up(&inode->i_sem);
398 return ret;
399}
400EXPORT_SYMBOL_GPL(xip_file_write);
401
402/*
403 * truncate a page used for execute in place
404 * functionality is analog to block_truncate_page but does use get_xip_page
405 * to get the page instead of page cache
406 */
407int
408xip_truncate_page(struct address_space *mapping, loff_t from)
409{
410 pgoff_t index = from >> PAGE_CACHE_SHIFT;
411 unsigned offset = from & (PAGE_CACHE_SIZE-1);
412 unsigned blocksize;
413 unsigned length;
414 struct page *page;
415 void *kaddr;
416
417 BUG_ON(!mapping->a_ops->get_xip_page);
418
419 blocksize = 1 << mapping->host->i_blkbits;
420 length = offset & (blocksize - 1);
421
422 /* Block boundary? Nothing to do */
423 if (!length)
424 return 0;
425
426 length = blocksize - length;
427
428 page = mapping->a_ops->get_xip_page(mapping,
429 index*(PAGE_SIZE/512), 0);
430 if (!page)
431 return -ENOMEM;
432 if (unlikely(IS_ERR(page))) {
433 if (PTR_ERR(page) == -ENODATA)
434 /* Hole? No need to truncate */
435 return 0;
436 else
437 return PTR_ERR(page);
438 } else
439 BUG_ON(!PageUptodate(page));
440 kaddr = kmap_atomic(page, KM_USER0);
441 memset(kaddr + offset, 0, length);
442 kunmap_atomic(kaddr, KM_USER0);
443
444 flush_dcache_page(page);
445 return 0;
446}
447EXPORT_SYMBOL_GPL(xip_truncate_page);
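A filesystem that implements get_xip_page() in its address_space_operations can plug the helpers exported above straight into its file_operations. The wiring below is a hypothetical sketch: the example_ name and the choice of generic llseek/open helpers are assumptions, not part of this patch.

/* Sketch: file_operations for an XIP-capable filesystem. */
static struct file_operations example_xip_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= xip_file_read,
	.write		= xip_file_write,
	.mmap		= xip_file_mmap,
	.sendfile	= xip_file_sendfile,
	.open		= generic_file_open,
};

Reads, writes and faults then go through the memory returned by get_xip_page() directly, with no page cache in between.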
diff --git a/mm/madvise.c b/mm/madvise.c
index e3108054733c..73180a22877e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,7 +65,6 @@ static long madvise_behavior(struct vm_area_struct * vma,
65 /* 65 /*
66 * vm_flags is protected by the mmap_sem held in write mode. 66 * vm_flags is protected by the mmap_sem held in write mode.
67 */ 67 */
68 VM_ClearReadHint(vma);
69 vma->vm_flags = new_flags; 68 vma->vm_flags = new_flags;
70 69
71out: 70out:
@@ -87,6 +86,11 @@ static long madvise_willneed(struct vm_area_struct * vma,
87 if (!file) 86 if (!file)
88 return -EBADF; 87 return -EBADF;
89 88
89 if (file->f_mapping->a_ops->get_xip_page) {
90 /* no bad return value, but ignore advice */
91 return 0;
92 }
93
90 *prev = vma; 94 *prev = vma;
91 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 95 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
92 if (end > vma->vm_end) 96 if (end > vma->vm_end)
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf9986..beabdefa6254 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
58#include <linux/swapops.h> 58#include <linux/swapops.h>
59#include <linux/elf.h> 59#include <linux/elf.h>
60 60
61#ifndef CONFIG_DISCONTIGMEM 61#ifndef CONFIG_NEED_MULTIPLE_NODES
62/* use the per-pgdat data instead for discontigmem - mbligh */ 62/* use the per-pgdat data instead for discontigmem - mbligh */
63unsigned long max_mapnr; 63unsigned long max_mapnr;
64struct page *mem_map; 64struct page *mem_map;
@@ -1139,7 +1139,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1139{ 1139{
1140 pgd_t *pgd; 1140 pgd_t *pgd;
1141 unsigned long next; 1141 unsigned long next;
1142 unsigned long end = addr + size; 1142 unsigned long end = addr + PAGE_ALIGN(size);
1143 struct mm_struct *mm = vma->vm_mm; 1143 struct mm_struct *mm = vma->vm_mm;
1144 int err; 1144 int err;
1145 1145
@@ -1458,7 +1458,7 @@ restart:
1458 * unmap_mapping_range - unmap the portion of all mmaps 1458 * unmap_mapping_range - unmap the portion of all mmaps
1459 * in the specified address_space corresponding to the specified 1459 * in the specified address_space corresponding to the specified
1460 * page range in the underlying file. 1460 * page range in the underlying file.
1461 * @address_space: the address space containing mmaps to be unmapped. 1461 * @mapping: the address space containing mmaps to be unmapped.
1462 * @holebegin: byte in first page to unmap, relative to the start of 1462 * @holebegin: byte in first page to unmap, relative to the start of
1463 * the underlying file. This will be rounded down to a PAGE_SIZE 1463 * the underlying file. This will be rounded down to a PAGE_SIZE
1464 * boundary. Note that this is different from vmtruncate(), which 1464 * boundary. Note that this is different from vmtruncate(), which
diff --git a/mm/mempool.c b/mm/mempool.c
index c9f3d4620428..9a72f7d918fa 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool)
51 * functions might sleep - as long as the mempool_alloc function is not called 51 * functions might sleep - as long as the mempool_alloc function is not called
52 * from IRQ contexts. 52 * from IRQ contexts.
53 */ 53 */
54mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 54mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
55 mempool_free_t *free_fn, void *pool_data) 55 mempool_free_t *free_fn, void *pool_data)
56{ 56{
57 mempool_t *pool; 57 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
58}
59EXPORT_SYMBOL(mempool_create);
58 60
59 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 61mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
62 mempool_free_t *free_fn, void *pool_data, int node_id)
63{
64 mempool_t *pool;
65 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
60 if (!pool) 66 if (!pool)
61 return NULL; 67 return NULL;
62 memset(pool, 0, sizeof(*pool)); 68 memset(pool, 0, sizeof(*pool));
63 pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); 69 pool->elements = kmalloc_node(min_nr * sizeof(void *),
70 GFP_KERNEL, node_id);
64 if (!pool->elements) { 71 if (!pool->elements) {
65 kfree(pool); 72 kfree(pool);
66 return NULL; 73 return NULL;
@@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
87 } 94 }
88 return pool; 95 return pool;
89} 96}
90EXPORT_SYMBOL(mempool_create); 97EXPORT_SYMBOL(mempool_create_node);
91 98
92/** 99/**
93 * mempool_resize - resize an existing memory pool 100 * mempool_resize - resize an existing memory pool
@@ -197,7 +204,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
197{ 204{
198 void *element; 205 void *element;
199 unsigned long flags; 206 unsigned long flags;
200 DEFINE_WAIT(wait); 207 wait_queue_t wait;
201 int gfp_temp; 208 int gfp_temp;
202 209
203 might_sleep_if(gfp_mask & __GFP_WAIT); 210 might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -228,6 +235,7 @@ repeat_alloc:
228 235
229 /* Now start performing page reclaim */ 236 /* Now start performing page reclaim */
230 gfp_temp = gfp_mask; 237 gfp_temp = gfp_mask;
238 init_wait(&wait);
231 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 239 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
232 smp_mb(); 240 smp_mb();
233 if (!pool->curr_nr) 241 if (!pool->curr_nr)
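mempool_create_node() behaves like mempool_create() but places the mempool_t and its element pointer array on the given NUMA node (node_id == -1 preserves the old behaviour, as the re-implemented mempool_create() above shows). Below is a usage sketch with the stock slab-backed callbacks; the example_ names are assumptions.

/* Sketch: a reserve pool whose bookkeeping lives on a specific node. */
static kmem_cache_t *example_cache;	/* assumed to be created elsewhere */
static mempool_t *example_pool;

static int example_pool_init(int node_id)
{
	example_pool = mempool_create_node(16, mempool_alloc_slab,
					   mempool_free_slab,
					   example_cache, node_id);
	return example_pool ? 0 : -ENOMEM;
}

Element allocation itself is unchanged; only the pool's own bookkeeping becomes node-local.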
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 559336de9687..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
68 * Used by page_zone() to look up the address of the struct zone whose 68 * Used by page_zone() to look up the address of the struct zone whose
69 * id is encoded in the upper bits of page->flags 69 * id is encoded in the upper bits of page->flags
70 */ 70 */
71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; 71struct zone *zone_table[1 << ZONETABLE_SHIFT];
72EXPORT_SYMBOL(zone_table); 72EXPORT_SYMBOL(zone_table);
73 73
74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -1649,11 +1649,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1649void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1649void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1650 unsigned long start_pfn) 1650 unsigned long start_pfn)
1651{ 1651{
1652 struct page *start = pfn_to_page(start_pfn);
1653 struct page *page; 1652 struct page *page;
1653 unsigned long end_pfn = start_pfn + size;
1654 unsigned long pfn;
1654 1655
1655 for (page = start; page < (start + size); page++) { 1656 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1656 set_page_zone(page, NODEZONE(nid, zone)); 1657 if (!early_pfn_valid(pfn))
1658 continue;
1659 if (!early_pfn_in_nid(pfn, nid))
1660 continue;
1661 page = pfn_to_page(pfn);
1662 set_page_links(page, zone, nid, pfn);
1657 set_page_count(page, 0); 1663 set_page_count(page, 0);
1658 reset_page_mapcount(page); 1664 reset_page_mapcount(page);
1659 SetPageReserved(page); 1665 SetPageReserved(page);
@@ -1677,6 +1683,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1677 } 1683 }
1678} 1684}
1679 1685
1686#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1687void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1688 unsigned long size)
1689{
1690 unsigned long snum = pfn_to_section_nr(pfn);
1691 unsigned long end = pfn_to_section_nr(pfn + size);
1692
1693 if (FLAGS_HAS_NODE)
1694 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1695 else
1696 for (; snum <= end; snum++)
1697 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1698}
1699
1680#ifndef __HAVE_ARCH_MEMMAP_INIT 1700#ifndef __HAVE_ARCH_MEMMAP_INIT
1681#define memmap_init(size, nid, zone, start_pfn) \ 1701#define memmap_init(size, nid, zone, start_pfn) \
1682 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1702 memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1861,7 +1881,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1861 unsigned long size, realsize; 1881 unsigned long size, realsize;
1862 unsigned long batch; 1882 unsigned long batch;
1863 1883
1864 zone_table[NODEZONE(nid, j)] = zone;
1865 realsize = size = zones_size[j]; 1884 realsize = size = zones_size[j];
1866 if (zholes_size) 1885 if (zholes_size)
1867 realsize -= zholes_size[j]; 1886 realsize -= zholes_size[j];
@@ -1927,6 +1946,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1927 1946
1928 memmap_init(size, nid, j, zone_start_pfn); 1947 memmap_init(size, nid, j, zone_start_pfn);
1929 1948
1949 zonetable_add(zone, nid, j, zone_start_pfn, size);
1950
1930 zone_start_pfn += size; 1951 zone_start_pfn += size;
1931 1952
1932 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1953 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1935,24 +1956,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1935 1956
1936static void __init alloc_node_mem_map(struct pglist_data *pgdat) 1957static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1937{ 1958{
1938 unsigned long size;
1939
1940 /* Skip empty nodes */ 1959 /* Skip empty nodes */
1941 if (!pgdat->node_spanned_pages) 1960 if (!pgdat->node_spanned_pages)
1942 return; 1961 return;
1943 1962
1963#ifdef CONFIG_FLAT_NODE_MEM_MAP
1944 /* ia64 gets its own node_mem_map, before this, without bootmem */ 1964 /* ia64 gets its own node_mem_map, before this, without bootmem */
1945 if (!pgdat->node_mem_map) { 1965 if (!pgdat->node_mem_map) {
1966 unsigned long size;
1967 struct page *map;
1968
1946 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 1969 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1947 pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); 1970 map = alloc_remap(pgdat->node_id, size);
1971 if (!map)
1972 map = alloc_bootmem_node(pgdat, size);
1973 pgdat->node_mem_map = map;
1948 } 1974 }
1949#ifndef CONFIG_DISCONTIGMEM 1975#ifdef CONFIG_FLATMEM
1950 /* 1976 /*
1951 * With no DISCONTIG, the global mem_map is just set as node 0's 1977 * With no DISCONTIG, the global mem_map is just set as node 0's
1952 */ 1978 */
1953 if (pgdat == NODE_DATA(0)) 1979 if (pgdat == NODE_DATA(0))
1954 mem_map = NODE_DATA(0)->node_mem_map; 1980 mem_map = NODE_DATA(0)->node_mem_map;
1955#endif 1981#endif
1982#endif /* CONFIG_FLAT_NODE_MEM_MAP */
1956} 1983}
1957 1984
1958void __init free_area_init_node(int nid, struct pglist_data *pgdat, 1985void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1968,18 +1995,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1968 free_area_init_core(pgdat, zones_size, zholes_size); 1995 free_area_init_core(pgdat, zones_size, zholes_size);
1969} 1996}
1970 1997
1971#ifndef CONFIG_DISCONTIGMEM 1998#ifndef CONFIG_NEED_MULTIPLE_NODES
1972static bootmem_data_t contig_bootmem_data; 1999static bootmem_data_t contig_bootmem_data;
1973struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2000struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1974 2001
1975EXPORT_SYMBOL(contig_page_data); 2002EXPORT_SYMBOL(contig_page_data);
2003#endif
1976 2004
1977void __init free_area_init(unsigned long *zones_size) 2005void __init free_area_init(unsigned long *zones_size)
1978{ 2006{
1979 free_area_init_node(0, &contig_page_data, zones_size, 2007 free_area_init_node(0, NODE_DATA(0), zones_size,
1980 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2008 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1981} 2009}
1982#endif
1983 2010
1984#ifdef CONFIG_PROC_FS 2011#ifdef CONFIG_PROC_FS
1985 2012
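zonetable_add() fills zone_table[] so that the zone and node/section bits kept in page->flags can be turned back into a struct zone pointer; with FLAGS_HAS_NODE the table is indexed by node id, otherwise by section number. The standalone sketch below shows only the index arithmetic; the ZONES_SHIFT value is made up for illustration, and the macro mirrors the ZONETABLE_INDEX() added above.

#include <assert.h>

#define ZONES_SHIFT	2	/* illustration only */
#define ZONETABLE_INDEX(x, zone_nr)	(((x) << ZONES_SHIFT) | (zone_nr))

int main(void)
{
	/* FLAGS_HAS_NODE: index by node id -- node 1, zone 2 (HighMem) */
	assert(ZONETABLE_INDEX(1, 2) == 6);
	/* otherwise: index by section number -- section 5, zone 0 (DMA) */
	assert(ZONETABLE_INDEX(5, 0) == 20);
	return 0;
}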
diff --git a/mm/page_io.c b/mm/page_io.c
index 667c76df1ec2..2e605a19ce57 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -127,7 +127,7 @@ out:
127 return ret; 127 return ret;
128} 128}
129 129
130#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK) 130#ifdef CONFIG_SOFTWARE_SUSPEND
131/* 131/*
132 * A scruffy utility function to read or write an arbitrary swap page 132 * A scruffy utility function to read or write an arbitrary swap page
133 * and wait on the I/O. The caller must have a ref on the page. 133 * and wait on the I/O. The caller must have a ref on the page.
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
105 spin_unlock_irq(&pdflush_lock); 105 spin_unlock_irq(&pdflush_lock);
106 106
107 schedule(); 107 schedule();
108 if (try_to_freeze(PF_FREEZE)) { 108 if (try_to_freeze()) {
109 spin_lock_irq(&pdflush_lock); 109 spin_lock_irq(&pdflush_lock);
110 continue; 110 continue;
111 } 111 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 89770bd25f31..08ac5c7fa91f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,8 +247,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
247 * 247 *
248 * On success returns with mapped pte and locked mm->page_table_lock. 248 * On success returns with mapped pte and locked mm->page_table_lock.
249 */ 249 */
250static pte_t *page_check_address(struct page *page, struct mm_struct *mm, 250pte_t *page_check_address(struct page *page, struct mm_struct *mm,
251 unsigned long address) 251 unsigned long address)
252{ 252{
253 pgd_t *pgd; 253 pgd_t *pgd;
254 pud_t *pud; 254 pud_t *pud;
diff --git a/mm/slab.c b/mm/slab.c
index 93cbbbb39f42..122d031baab2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -92,6 +92,7 @@
92#include <linux/sysctl.h> 92#include <linux/sysctl.h>
93#include <linux/module.h> 93#include <linux/module.h>
94#include <linux/rcupdate.h> 94#include <linux/rcupdate.h>
95#include <linux/string.h>
95 96
96#include <asm/uaccess.h> 97#include <asm/uaccess.h>
97#include <asm/cacheflush.h> 98#include <asm/cacheflush.h>
@@ -3082,3 +3083,26 @@ unsigned int ksize(const void *objp)
3082 3083
3083 return size; 3084 return size;
3084} 3085}
3086
3087
3088/*
3089 * kstrdup - allocate space for and copy an existing string
3090 *
3091 * @s: the string to duplicate
3092 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3093 */
3094char *kstrdup(const char *s, int gfp)
3095{
3096 size_t len;
3097 char *buf;
3098
3099 if (!s)
3100 return NULL;
3101
3102 len = strlen(s) + 1;
3103 buf = kmalloc(len, gfp);
3104 if (buf)
3105 memcpy(buf, s, len);
3106 return buf;
3107}
3108EXPORT_SYMBOL(kstrdup);
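A short usage sketch for the new helper (the surrounding names are hypothetical): the duplicate is an ordinary kmalloc() allocation and is released with kfree().

/* Sketch: keep a private copy of a caller-supplied string. */
static char *saved_label;

static int example_set_label(const char *label)
{
	char *copy = kstrdup(label, GFP_KERNEL);

	if (!copy)
		return -ENOMEM;
	kfree(saved_label);		/* kfree(NULL) is a no-op */
	saved_label = copy;
	return 0;
}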
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..b54e304df4a7
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,137 @@
1/*
2 * sparse memory mappings.
3 */
4#include <linux/config.h>
5#include <linux/mm.h>
6#include <linux/mmzone.h>
7#include <linux/bootmem.h>
8#include <linux/module.h>
9#include <asm/dma.h>
10
11/*
12 * Permanent SPARSEMEM data:
13 *
14 * 1) mem_section - memory sections, mem_map's for valid memory
15 */
16struct mem_section mem_section[NR_MEM_SECTIONS];
17EXPORT_SYMBOL(mem_section);
18
19/* Record a memory area against a node. */
20void memory_present(int nid, unsigned long start, unsigned long end)
21{
22 unsigned long pfn;
23
24 start &= PAGE_SECTION_MASK;
25 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
26 unsigned long section = pfn_to_section_nr(pfn);
27 if (!mem_section[section].section_mem_map)
28 mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
29 }
30}
31
32/*
33 * Only used by the i386 NUMA architectures, but relatively
34 * generic code.
35 */
36unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
37 unsigned long end_pfn)
38{
39 unsigned long pfn;
40 unsigned long nr_pages = 0;
41
42 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
43 if (nid != early_pfn_to_nid(pfn))
44 continue;
45
46 if (pfn_valid(pfn))
47 nr_pages += PAGES_PER_SECTION;
48 }
49
50 return nr_pages * sizeof(struct page);
51}
52
53/*
54 * Subtle, we encode the real pfn into the mem_map such that
55 * the identity pfn - section_mem_map will return the actual
56 * physical page frame number.
57 */
58static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
59{
60 return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
61}
62
63/*
64 * We need this if we ever free the mem_maps. While not implemented yet,
65 * this function is included for parity with its sibling.
66 */
67static __attribute((unused))
68struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
69{
70 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
71}
72
73static int sparse_init_one_section(struct mem_section *ms,
74 unsigned long pnum, struct page *mem_map)
75{
76 if (!valid_section(ms))
77 return -EINVAL;
78
79 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
80
81 return 1;
82}
83
84static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
85{
86 struct page *map;
87 int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
88
89 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
90 if (map)
91 return map;
92
93 map = alloc_bootmem_node(NODE_DATA(nid),
94 sizeof(struct page) * PAGES_PER_SECTION);
95 if (map)
96 return map;
97
98 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
99 mem_section[pnum].section_mem_map = 0;
100 return NULL;
101}
102
103/*
104 * Allocate the accumulated non-linear sections, allocate a mem_map
105 * for each and record the physical to section mapping.
106 */
107void sparse_init(void)
108{
109 unsigned long pnum;
110 struct page *map;
111
112 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
113 if (!valid_section_nr(pnum))
114 continue;
115
116 map = sparse_early_mem_map_alloc(pnum);
117 if (map)
118 sparse_init_one_section(&mem_section[pnum], pnum, map);
119 }
120}
121
122/*
123 * returns the number of sections whose mem_maps were properly
124 * set. If this is <=0, then that means that the passed-in
125 * map was not consumed and must be freed.
126 */
127int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
128{
129 struct mem_section *ms = __pfn_to_section(start_pfn);
130
131 if (ms->section_mem_map & SECTION_MARKED_PRESENT)
132 return -EEXIST;
133
134 ms->section_mem_map |= SECTION_MARKED_PRESENT;
135
136 return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
137}
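The encode/decode pair above stores each section's mem_map biased by the section's first pfn, so that "coded mem_map + pfn" lands directly on the struct page for that pfn. Below is a standalone model of that identity, with simplified types, a made-up PAGES_PER_SECTION, and flat pointer arithmetic assumed as in the kernel code.

#include <assert.h>

#define PAGES_PER_SECTION	8UL	/* illustration only */

struct page { int dummy; };

static unsigned long section_nr_to_pfn(unsigned long pnum)
{
	return pnum * PAGES_PER_SECTION;
}

/* Same trick as sparse_encode_mem_map()/sparse_decode_mem_map(). */
static unsigned long encode(struct page *mem_map, unsigned long pnum)
{
	return (unsigned long)(mem_map - section_nr_to_pfn(pnum));
}

static struct page *decode(unsigned long coded, unsigned long pnum)
{
	return (struct page *)coded + section_nr_to_pfn(pnum);
}

int main(void)
{
	static struct page section_map[PAGES_PER_SECTION];
	unsigned long pnum = 3;
	unsigned long coded = encode(section_map, pnum);
	unsigned long pfn = section_nr_to_pfn(pnum) + 5;

	assert(decode(coded, pnum) == section_map);		/* round trip */
	assert((struct page *)coded + pfn == &section_map[5]);	/* direct pfn index */
	return 0;
}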
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b8e62a19370..1fa312a8db77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1216,8 +1216,8 @@ static int kswapd(void *p)
1216 order = 0; 1216 order = 0;
1217 for ( ; ; ) { 1217 for ( ; ; ) {
1218 unsigned long new_order; 1218 unsigned long new_order;
1219 if (current->flags & PF_FREEZE) 1219
1220 refrigerator(PF_FREEZE); 1220 try_to_freeze();
1221 1221
1222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1223 new_order = pgdat->kswapd_max_order; 1223 new_order = pgdat->kswapd_max_order;