17 files changed, 894 insertions, 114 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 000000000000..cd379936cac6
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,91 @@
+config SELECT_MEMORY_MODEL
+        def_bool y
+        depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
+choice
+        prompt "Memory model"
+        depends on SELECT_MEMORY_MODEL
+        default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+        default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+        default FLATMEM_MANUAL
+config FLATMEM_MANUAL
+        bool "Flat Memory"
+        depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+        help
+          This option allows you to change some of the ways that
+          Linux manages its memory internally.  Most users will
+          only have one option here: FLATMEM.  This is normal
+          and a correct option.
+          Some users of more advanced features like NUMA and
+          memory hotplug may have different options here.
+          DISCONTIGMEM is a more mature, better tested system,
+          but is incompatible with memory hotplug and may suffer
+          decreased performance over SPARSEMEM.  If unsure between
+          "Sparse Memory" and "Discontiguous Memory", choose
+          "Discontiguous Memory".
+          If unsure, choose this option (Flat Memory) over any other.
+config DISCONTIGMEM_MANUAL
+        bool "Discontiguous Memory"
+        depends on ARCH_DISCONTIGMEM_ENABLE
+        help
+          This option provides enhanced support for discontiguous
+          memory systems, over FLATMEM.  These systems have holes
+          in their physical address spaces, and this option provides
+          more efficient handling of these holes.  However, the vast
+          majority of hardware has quite flat address spaces, and
+          can have degraded performance from extra overhead that
+          this option imposes.
+          Many NUMA configurations will have this as the only option.
+          If unsure, choose "Flat Memory" over this option.
+config SPARSEMEM_MANUAL
+        bool "Sparse Memory"
+        depends on ARCH_SPARSEMEM_ENABLE
+        help
+          This will be the only option for some systems, including
+          memory hotplug systems.  This is normal.
+          For many other systems, this will be an alternative to
+          "Discontiguous Memory".  This option provides some potential
+          performance benefits, along with decreased code complexity,
+          but it is newer, and more experimental.
+          If unsure, choose "Discontiguous Memory" or "Flat Memory"
+          over this option.
+endchoice
+config DISCONTIGMEM
+        def_bool y
+        depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+config SPARSEMEM
+        def_bool y
+        depends on SPARSEMEM_MANUAL
+config FLATMEM
+        def_bool y
+        depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+config FLAT_NODE_MEM_MAP
+        def_bool y
+        depends on !SPARSEMEM
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory.  This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+        def_bool y
+        depends on DISCONTIGMEM || NUMA
+config HAVE_MEMORY_PRESENT
+        def_bool y
+        depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..4cd69e3ce421 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 obj-$(CONFIG_SWAP)      += page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA)      += mempolicy.o
+obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,14 @@ EXPORT_SYMBOL(max_pfn);		/* This is exported so
                                 * dma_get_required_mask(), which uses
                                 * it, can be an inline function */
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+#endif
 /* return the number of _pages_ that will be allocated for the boot bitmap */
 unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 {
@@ -57,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
        pgdat->pgdat_next = pgdat_list;
        pgdat_list = pgdat;
-        mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
+        mapsize = ALIGN(mapsize, sizeof(long));
        bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
        bdata->node_boot_start = (start << PAGE_SHIFT);
        bdata->node_low_pfn = end;
@@ -178,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
        } else
                preferred = 0;
-        preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
+        preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
        preferred += offset;
        areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
        incr = align >> PAGE_SHIFT ? : 1;
@@ -219,7 +227,7 @@ found:
         */
        if (align < PAGE_SIZE &&
            bdata->last_offset && bdata->last_pos+1 == start) {
-                offset = (bdata->last_offset+align-1) & ~(align-1);
+                offset = ALIGN(bdata->last_offset, align);
                BUG_ON(offset > PAGE_SIZE);
                remaining_size = PAGE_SIZE-offset;
                if (size < remaining_size) {
@@ -256,6 +264,7 @@ found:
 static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 {
        struct page *page;
+        unsigned long pfn;
        bootmem_data_t *bdata = pgdat->bdata;
        unsigned long i, count, total = 0;
        unsigned long idx;
@@ -266,7 +275,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
        count = 0;
        /* first extant page of the node */
-        page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+        pfn = bdata->node_boot_start >> PAGE_SHIFT;
        idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
        map = bdata->node_bootmem_map;
        /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +284,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
                gofast = 1;
        for (i = 0; i < idx; ) {
                unsigned long v = ~map[i / BITS_PER_LONG];
                if (gofast && v == ~0UL) {
                        int j, order;
+                        page = pfn_to_page(pfn);
                        count += BITS_PER_LONG;
                        __ClearPageReserved(page);
                        order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +303,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
                        page += BITS_PER_LONG;
                } else if (v) {
                        unsigned long m;
+                        page = pfn_to_page(pfn);
                        for (m = 1; m && i < idx; m<<=1, page++, i++) {
                                if (v & m) {
                                        count++;
@@ -302,8 +315,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
                        }
                } else {
                        i+=BITS_PER_LONG;
-                        page += BITS_PER_LONG;
                }
+                pfn += BITS_PER_LONG;
        }
        total += count;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 57264d74b8bf..5f19e87bc5af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -43,6 +43,10 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                goto out;
        }
+        if (mapping->a_ops->get_xip_page)
+                /* no bad return value, but ignore advice */
+                goto out;
        /* Careful about overflows. Len == 0 means "as much as possible" */
        endbyte = offset + len;
        if (!len || endbyte < len)
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a2fee2cb62b..c11418dd94e8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include "filemap.h"
 /*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(remove_suid);
-/*
+size_t
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
-                        const char __user *buf, unsigned bytes)
-{
-        char *kaddr;
-        int left;
-        kaddr = kmap_atomic(page, KM_USER0);
-        left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-        kunmap_atomic(kaddr, KM_USER0);
-        if (left != 0) {
-                /* Do it the slow way */
-                kaddr = kmap(page);
-                left = __copy_from_user(kaddr + offset, buf, bytes);
-                kunmap(page);
-        }
-        return bytes - left;
-}
-static size_t
 __filemap_copy_from_user_iovec(char *vaddr, 
                        const struct iovec *iov, size_t base, size_t bytes)
 {
@@ -1767,52 +1743,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
 }
 /*
- * This has the same sideeffects and return value as filemap_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
- * single-segment behaviour.
- */
-static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
-                        const struct iovec *iov, size_t base, size_t bytes)
-{
-        char *kaddr;
-        size_t copied;
-        kaddr = kmap_atomic(page, KM_USER0);
-        copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-                                                base, bytes);
-        kunmap_atomic(kaddr, KM_USER0);
-        if (copied != bytes) {
-                kaddr = kmap(page);
-                copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-                                                        base, bytes);
-                kunmap(page);
-        }
-        return copied;
-}
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
-        const struct iovec *iov = *iovp;
-        size_t base = *basep;
-        while (bytes) {
-                int copy = min(bytes, iov->iov_len - base);
-                bytes -= copy;
-                base += copy;
-                if (iov->iov_len == base) {
-                        iov++;
-                        base = 0;
-                }
-        }
-        *iovp = iov;
-        *basep = base;
-}
-/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position aor amount of bytes to write.
@@ -1827,12 +1757,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
        if (unlikely(*pos < 0))
                return -EINVAL;
-        if (unlikely(file->f_error)) {
-                int err = file->f_error;
-                file->f_error = 0;
-                return err;
-        }
        if (!isblk) {
                /* FIXME: this is for backwards compatibility with 2.4 */
                if (file->f_flags & O_APPEND)
@@ -1927,8 +1851,11 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
         * i_sem is held, which protects generic_osync_inode() from
         * livelocking.
         */
-        if (written >= 0 && file->f_flags & O_SYNC)
+        if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-                generic_osync_inode(inode, mapping, OSYNC_METADATA);
+                int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+                if (err < 0)
+                        written = err;
+        }
        if (written == count && !is_sync_kiocb(iocb))
                written = -EIOCBQUEUED;
        return written;
@@ -2027,7 +1954,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                                if (unlikely(nr_segs > 1)) {
                                        filemap_set_next_iovec(&cur_iov,
                                                        &iov_base, status);
-                                        buf = cur_iov->iov_base + iov_base;
+                                        if (count)
+                                                buf = cur_iov->iov_base +
+                                                        iov_base;
                                } else {
                                        iov_base += status;
                                }
diff --git a/mm/filemap.h b/mm/filemap.h
new file mode 100644
index 000000000000..13793ba0ce17
--- /dev/null
+++ b/mm/filemap.h
@@ -0,0 +1,94 @@
+/*
+ *      linux/mm/filemap.h
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+#ifndef __FILEMAP_H
+#define __FILEMAP_H
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uio.h>
+#include <linux/config.h>
+#include <asm/uaccess.h>
+size_t
+__filemap_copy_from_user_iovec(char *vaddr,
+                               const struct iovec *iov,
+                               size_t base,
+                               size_t bytes);
+/*
+ * Copy up to @bytes from the user buffer @buf into @page at @offset and
+ * return the number of bytes which were successfully copied.  If a fault is
+ * encountered then clear the page out to (offset+bytes) and return the
+ * number of bytes which were copied.
+ *
+ * The copy is first attempted through an atomic mapping of the page; only
+ * when that attempt leaves bytes uncopied is it repeated through kmap().
+ */
+static inline size_t
+filemap_copy_from_user(struct page *page, unsigned long offset,
+                        const char __user *buf, unsigned bytes)
+{
+        char *dst = kmap_atomic(page, KM_USER0);
+        int uncopied = __copy_from_user_inatomic(dst + offset, buf, bytes);
+        kunmap_atomic(dst, KM_USER0);
+        if (uncopied == 0)
+                return bytes;
+        /* The atomic attempt came up short: do it the slow way */
+        dst = kmap(page);
+        uncopied = __copy_from_user(dst + offset, buf, bytes);
+        kunmap(page);
+        return bytes - uncopied;
+}
+/*
+ * Multi-segment counterpart of filemap_copy_from_user(): same side effects
+ * and return value, but the data comes from @iov starting @base bytes into
+ * its first segment.  On a fault the remainder of the page (out to
+ * offset+bytes) has to be memset, to emulate filemap_copy_from_user()'s
+ * single-segment behaviour.
+ */
+static inline size_t
+filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+                        const struct iovec *iov, size_t base, size_t bytes)
+{
+        char *dst = kmap_atomic(page, KM_USER0);
+        size_t done = __filemap_copy_from_user_iovec(dst + offset, iov,
+                                                     base, bytes);
+        kunmap_atomic(dst, KM_USER0);
+        if (done == bytes)
+                return done;
+        /* Short copy under the atomic mapping: repeat it through kmap() */
+        dst = kmap(page);
+        done = __filemap_copy_from_user_iovec(dst + offset, iov,
+                                              base, bytes);
+        kunmap(page);
+        return done;
+}
+/*
+ * Advance an iovec cursor by @bytes.
+ *
+ * @iovp:  in/out, the segment the cursor is currently in
+ * @basep: in/out, how much of *iovp has already been consumed
+ * @bytes: distance to move forward
+ *
+ * Whenever a segment is used up completely the cursor steps to the start
+ * of the following one, so after consuming the final segment *iovp points
+ * past it and must not be dereferenced by the caller.
+ */
+static inline void
+filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+        const struct iovec *iov = *iovp;
+        size_t base = *basep;
+        while (bytes) {
+                /* size_t, not int: both operands of min() are size_t */
+                size_t copy = min(bytes, iov->iov_len - base);
+                bytes -= copy;
+                base += copy;
+                if (iov->iov_len == base) {
+                        iov++;
+                        base = 0;
+                }
+        }
+        *iovp = iov;
+        *basep = base;
+}
+#endif
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 000000000000..3b6e384b98a6
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,447 @@
+/*
+ *      linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <asm/tlbflush.h>
+#include "filemap.h"
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_page() function for the actual low-level
+ * stuff.
+ *
+ * @mapping: address space to read from; must provide get_xip_page()
+ * @_ra:     not referenced anywhere in this function
+ * @filp:    only handed to file_accessed() at the end; may be NULL
+ * @ppos:    file position, in and out; advanced by what the actor consumed
+ * @desc:    read descriptor; desc->error is set on failure and desc->count
+ *           is checked to decide whether another page should be read
+ * @actor:   called once per page as actor(desc, page, offset, nr); its
+ *           return value is taken as the number of bytes it used
+ *
+ * A NULL page from get_xip_page() is reported as -EIO, an ERR_PTR of
+ * -ENODATA is treated as a hole and served from empty_zero_page, and any
+ * other ERR_PTR is passed through in desc->error.
+ */
+static void
+do_xip_mapping_read(struct address_space *mapping,
+                    struct file_ra_state *_ra,
+                    struct file *filp,
+                    loff_t *ppos,
+                    read_descriptor_t *desc,
+                    read_actor_t actor)
+{
+        struct inode *inode = mapping->host;
+        unsigned long index, end_index, offset;
+        loff_t isize;
+        BUG_ON(!mapping->a_ops->get_xip_page);
+        /* split the position into a page index and an offset in that page */
+        index = *ppos >> PAGE_CACHE_SHIFT;
+        offset = *ppos & ~PAGE_CACHE_MASK;
+        isize = i_size_read(inode);
+        if (!isize)
+                goto out;
+        /* index of the page holding the last byte of the file */
+        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+        for (;;) {
+                struct page *page;
+                unsigned long nr, ret;
+                /* nr is the maximum number of bytes to copy from this page */
+                nr = PAGE_CACHE_SIZE;
+                if (index >= end_index) {
+                        if (index > end_index)
+                                goto out;
+                        /* last page: only the bytes up to i_size are valid */
+                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+                        if (nr <= offset) {
+                                goto out;
+                        }
+                }
+                nr = nr - offset;
+                /*
+                 * The page index is scaled by PAGE_SIZE/512, i.e. the
+                 * offset is passed in 512-byte units (presumably sectors;
+                 * confirm against the get_xip_page() implementation).
+                 */
+                page = mapping->a_ops->get_xip_page(mapping,
+                        index*(PAGE_SIZE/512), 0);
+                if (!page)
+                        goto no_xip_page;
+                if (unlikely(IS_ERR(page))) {
+                        if (PTR_ERR(page) == -ENODATA) {
+                                /* sparse */
+                                page = virt_to_page(empty_zero_page);
+                        } else {
+                                desc->error = PTR_ERR(page);
+                                goto out;
+                        }
+                } else
+                        BUG_ON(!PageUptodate(page));
+                /* If users can be writing to this page using arbitrary
+                 * virtual addresses, take care about potential aliasing
+                 * before reading the page on the kernel side.
+                 */
+                if (mapping_writably_mapped(mapping))
+                        flush_dcache_page(page);
+                /*
+                 * Ok, we have the page, and it's up-to-date, so
+                 * now we can copy it to user space...
+                 *
+                 * The actor routine returns how many bytes were actually used..
+                 * NOTE! This may not be the same as how much of a user buffer
+                 * we filled up (we may be padding etc), so we can only update
+                 * "pos" here (the actor routine has to update the user buffer
+                 * pointers and the remaining count).
+                 */
+                ret = actor(desc, page, offset, nr);
+                /* carry a full page of progress over into index */
+                offset += ret;
+                index += offset >> PAGE_CACHE_SHIFT;
+                offset &= ~PAGE_CACHE_MASK;
+                /* go on only if the page was fully used and more is wanted */
+                if (ret == nr && desc->count)
+                        continue;
+                goto out;
+no_xip_page:
+                /* Did not get the page. Report it */
+                desc->error = -EIO;
+                goto out;
+        }
+out:
+        /* hand the (possibly advanced) position back to the caller */
+        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+        if (filp)
+                file_accessed(filp);
+}
+/*
+ * xip_file_read - read from an execute in place file into a user buffer.
+ *
+ * Checks @buf with access_ok(), sets up a read descriptor for @len bytes
+ * and lets do_xip_mapping_read() do the work with file_read_actor.
+ * Returns the number of bytes the descriptor reports as written, or the
+ * descriptor's error code when nothing was written.
+ */
+ssize_t
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+        read_descriptor_t rd;
+        if (!access_ok(VERIFY_WRITE, buf, len))
+                return -EFAULT;
+        rd.error = 0;
+        rd.written = 0;
+        rd.count = len;
+        rd.arg.buf = buf;
+        do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+                            ppos, &rd, file_read_actor);
+        /* the error is only reported when no bytes were written */
+        if (!rd.written)
+                return rd.error;
+        return rd.written;
+}
+EXPORT_SYMBOL_GPL(xip_file_read);
+/*
+ * xip_file_sendfile - feed up to @count bytes of an execute in place file
+ * to @actor, passing @target along in the read descriptor.
+ *
+ * Returns 0 for an empty request, otherwise the number of bytes the
+ * descriptor reports as written, or the descriptor's error code when
+ * nothing was written.
+ */
+ssize_t
+xip_file_sendfile(struct file *in_file, loff_t *ppos,
+             size_t count, read_actor_t actor, void *target)
+{
+        read_descriptor_t rd;
+        if (count == 0)
+                return 0;
+        rd.error = 0;
+        rd.written = 0;
+        rd.arg.data = target;
+        rd.count = count;
+        do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
+                            ppos, &rd, actor);
+        /* the error is only reported when no bytes were written */
+        if (!rd.written)
+                return rd.error;
+        return rd.written;
+}
+EXPORT_SYMBOL_GPL(xip_file_sendfile);
+/*
+ * __xip_unmap is invoked from xip_file_nopage and __xip_file_write,
+ * in both cases right after get_xip_page() was asked to create the block
+ * at pgoff.
+ *
+ * This function walks all vmas of the address_space that cover pgoff and
+ * unmaps the empty_zero_page when found there. Should it go in rmap.c?
+ *
+ * @mapping: address space whose i_mmap tree is searched (under i_mmap_lock)
+ * @pgoff:   page offset in the file whose zero-page mappings are removed
+ */
+static void
+__xip_unmap (struct address_space * mapping,
+                     unsigned long pgoff)
+{
+        struct vm_area_struct *vma;
+        struct mm_struct *mm;
+        struct prio_tree_iter iter;
+        unsigned long address;
+        pte_t *pte;
+        pte_t pteval;
+        spin_lock(&mapping->i_mmap_lock);
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+                mm = vma->vm_mm;
+                /* user virtual address at which this vma maps pgoff */
+                address = vma->vm_start +
+                        ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+                BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+                /*
+                 * We need the page_table_lock to protect us from page faults,
+                 * munmap, fork, etc...
+                 * NOTE(review): page_check_address() appears to return with
+                 * mm->page_table_lock held on success, since it is dropped
+                 * below only in that case - confirm in rmap.c.
+                 */
+                pte = page_check_address(virt_to_page(empty_zero_page), mm,
+                                         address);
+                if (!IS_ERR(pte)) {
+                        /*
+                         * Nuke the page table entry.  pte is a pointer to
+                         * the entry; pte_pfn() needs the entry itself.
+                         */
+                        flush_cache_page(vma, address, pte_pfn(*pte));
+                        pteval = ptep_clear_flush(vma, address, pte);
+                        /* the zero page must never have been written to */
+                        BUG_ON(pte_dirty(pteval));
+                        pte_unmap(pte);
+                        spin_unlock(&mm->page_table_lock);
+                }
+        }
+        spin_unlock(&mapping->i_mmap_lock);
+}
+/*
+ * xip_file_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_nopage, but used for execute in place
+ *
+ * @area:    vma the fault happened in
+ * @address: faulting address, used to compute the page offset in the file
+ * @type:    not touched by this function
+ *
+ * Returns the page backing the faulting offset, empty_zero_page for a hole
+ * that does not get a block of its own, or NULL when the offset lies beyond
+ * i_size or get_xip_page() did not deliver a page.
+ */
+static struct page *
+xip_file_nopage(struct vm_area_struct * area,
+                   unsigned long address,
+                   int *type)
+{
+        struct file *file = area->vm_file;
+        struct address_space *mapping = file->f_mapping;
+        struct inode *inode = mapping->host;
+        struct page *page;
+        unsigned long size, pgoff;
+        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+                + area->vm_pgoff;
+        /* file size in pages, rounded up */
+        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        if (pgoff >= size) {
+                return NULL;
+        }
+        page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
+        /*
+         * do_xip_mapping_read() and xip_truncate_page() treat a NULL page
+         * as a failure; do the same here instead of dereferencing it.
+         */
+        if (!page)
+                return NULL;
+        if (!IS_ERR(page)) {
+                BUG_ON(!PageUptodate(page));
+                return page;
+        }
+        if (PTR_ERR(page) != -ENODATA)
+                return NULL;
+        /* sparse block */
+        if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+            (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
+            (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+                /* maybe shared writable, allocate new block */
+                page = mapping->a_ops->get_xip_page (mapping,
+                        pgoff*(PAGE_SIZE/512), 1);
+                if (!page || IS_ERR(page))
+                        return NULL;
+                BUG_ON(!PageUptodate(page));
+                /* unmap page at pgoff from all other vmas */
+                __xip_unmap(mapping, pgoff);
+        } else {
+                /* not shared and writable, use empty_zero_page */
+                page = virt_to_page(empty_zero_page);
+        }
+        return page;
+}
+/* vma operations for execute in place mappings: only faults are handled */
+static struct vm_operations_struct xip_file_vm_ops = {
+        .nopage         = xip_file_nopage,
+};
+/*
+ * xip_file_mmap - mmap for execute in place files.
+ *
+ * Installs xip_file_vm_ops on @vma and marks @file as accessed.  The
+ * mapping must provide get_xip_page().  Always returns 0.
+ */
+int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+        struct address_space *mapping = file->f_mapping;
+        BUG_ON(!mapping->a_ops->get_xip_page);
+        vma->vm_ops = &xip_file_vm_ops;
+        file_accessed(file);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+/*
+ * __xip_file_write - copy @count bytes from @buf into the file at @pos,
+ * one page at a time, straight into the pages get_xip_page() hands out.
+ *
+ * A block reported as -ENODATA (hole) is requested again with the create
+ * flag set, and existing mappings of pgoff are then dropped through
+ * __xip_unmap().  *@ppos is set to the position reached and i_size is
+ * extended when the write went past it.
+ *
+ * Returns the number of bytes written, or a negative error if nothing was
+ * written.  The caller holds i_sem (see xip_file_write()).
+ */
+static ssize_t
+__xip_file_write(struct file *filp, const char __user *buf,
+                  size_t count, loff_t pos, loff_t *ppos)
+{
+        struct address_space * mapping = filp->f_mapping;
+        struct address_space_operations *a_ops = mapping->a_ops;
+        struct inode    *inode = mapping->host;
+        long            status = 0;
+        struct page     *page;
+        size_t          bytes;
+        ssize_t         written = 0;
+        BUG_ON(!mapping->a_ops->get_xip_page);
+        do {
+                unsigned long index;
+                unsigned long offset;
+                size_t copied;
+                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+                index = pos >> PAGE_CACHE_SHIFT;
+                bytes = PAGE_CACHE_SIZE - offset;
+                if (bytes > count)
+                        bytes = count;
+                /*
+                 * Bring in the user page that we will copy from _first_.
+                 * Otherwise there's a nasty deadlock on copying from the
+                 * same page as we're writing to, without it being marked
+                 * up-to-date.
+                 */
+                fault_in_pages_readable(buf, bytes);
+                page = a_ops->get_xip_page(mapping,
+                                           index*(PAGE_SIZE/512), 0);
+                if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
+                        /* hole: ask for the block to be created ... */
+                        page = a_ops->get_xip_page(mapping,
+                                                   index*(PAGE_SIZE/512), 1);
+                        if (page && !IS_ERR(page))
+                                /* unmap page at pgoff from all other vmas */
+                                __xip_unmap(mapping, index);
+                }
+                if (!page) {
+                        /*
+                         * No page and no error code: report it the way
+                         * do_xip_mapping_read() does instead of
+                         * dereferencing NULL below.
+                         */
+                        status = -EIO;
+                        break;
+                }
+                if (IS_ERR(page)) {
+                        status = PTR_ERR(page);
+                        break;
+                }
+                BUG_ON(!PageUptodate(page));
+                copied = filemap_copy_from_user(page, offset, buf, bytes);
+                flush_dcache_page(page);
+                if (likely(copied > 0)) {
+                        /* account for whatever made it into the page */
+                        status = copied;
+                        written += status;
+                        count -= status;
+                        pos += status;
+                        buf += status;
+                }
+                /* a short copy means the user buffer faulted */
+                if (unlikely(copied != bytes))
+                        if (status >= 0)
+                                status = -EFAULT;
+                if (status < 0)
+                        break;
+        } while (count);
+        *ppos = pos;
+        /*
+         * No need to use i_size_read() here, the i_size
+         * cannot change under us because we hold i_sem.
+         */
+        if (pos > inode->i_size) {
+                i_size_write(inode, pos);
+                mark_inode_dirty(inode);
+        }
+        /* partial progress wins over the error that stopped the loop */
+        return written ? written : status;
+}
+/*
+ * xip_file_write - write entry point for execute in place files.
+ *
+ * Holds inode->i_sem around the whole operation.  The user buffer is
+ * checked with access_ok(), generic_write_checks() gets a chance to adjust
+ * pos and count, remove_suid() and inode_update_time() are called, and the
+ * data is then written by __xip_file_write().
+ *
+ * Returns what __xip_file_write() returns on the normal path, 0 when
+ * generic_write_checks() left nothing to write, -EFAULT for a bad buffer,
+ * or the error from generic_write_checks() / remove_suid().
+ */
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+               loff_t *ppos)
+{
+        struct address_space *mapping = filp->f_mapping;
+        struct inode *inode = mapping->host;
+        size_t count;
+        loff_t pos;
+        ssize_t ret;
+        down(&inode->i_sem);
+        if (!access_ok(VERIFY_READ, buf, len)) {
+                ret=-EFAULT;
+                goto out_up;
+        }
+        /* work on local copies; *ppos is only updated by __xip_file_write */
+        pos = *ppos;
+        count = len;
+        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        /* We can write back this queue in page reclaim */
+        current->backing_dev_info = mapping->backing_dev_info;
+        ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+        if (ret)
+                goto out_backing;
+        /* nothing left to write: ret is still 0 here */
+        if (count == 0)
+                goto out_backing;
+        ret = remove_suid(filp->f_dentry);
+        if (ret)
+                goto out_backing;
+        inode_update_time(inode, 1);
+        ret = __xip_file_write (filp, buf, count, pos, ppos);
+ out_backing:
+        /* undo the backing_dev_info assignment made above */
+        current->backing_dev_info = NULL;
+ out_up:
+        up(&inode->i_sem);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_write);
+/*
+ * truncate a page used for execute in place
+ * functionality is analogous to block_truncate_page but uses get_xip_page
+ * to get the page instead of the page cache
+ *
+ * Zeroes the range from @from up to the end of the filesystem block that
+ * contains it.  Returns 0 on success, when @from sits on a block boundary,
+ * or when the block is a hole; -ENOMEM when get_xip_page() returns NULL;
+ * otherwise the error get_xip_page() reported.
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+        pgoff_t index = from >> PAGE_CACHE_SHIFT;
+        unsigned in_page = from & (PAGE_CACHE_SIZE-1);
+        unsigned blocksize, in_block;
+        struct page *page;
+        void *kaddr;
+        BUG_ON(!mapping->a_ops->get_xip_page);
+        blocksize = 1 << mapping->host->i_blkbits;
+        in_block = in_page & (blocksize - 1);
+        /* Block boundary? Nothing to do */
+        if (in_block == 0)
+                return 0;
+        page = mapping->a_ops->get_xip_page(mapping,
+                                            index*(PAGE_SIZE/512), 0);
+        if (!page)
+                return -ENOMEM;
+        if (unlikely(IS_ERR(page))) {
+                /* Hole? No need to truncate */
+                if (PTR_ERR(page) == -ENODATA)
+                        return 0;
+                return PTR_ERR(page);
+        }
+        BUG_ON(!PageUptodate(page));
+        /* clear from the truncation point to the end of its block */
+        kaddr = kmap_atomic(page, KM_USER0);
+        memset(kaddr + in_page, 0, blocksize - in_block);
+        kunmap_atomic(kaddr, KM_USER0);
+        flush_dcache_page(page);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/madvise.c b/mm/madvise.c
index e3108054733c..73180a22877e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,7 +65,6 @@ static long madvise_behavior(struct vm_area_struct * vma,
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
-        VM_ClearReadHint(vma);
        vma->vm_flags = new_flags;
 out:
@@ -87,6 +86,11 @@ static long madvise_willneed(struct vm_area_struct * vma,
        if (!file)
                return -EBADF;
+        if (file->f_mapping->a_ops->get_xip_page) {
+                /* no bad return value, but ignore advice */
+                return 0;
+        }
        *prev = vma;
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf9986..beabdefa6254 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
 struct page *mem_map;
@@ -1139,7 +1139,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 {
        pgd_t *pgd;
        unsigned long next;
-        unsigned long end = addr + size;
+        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
        int err;
@@ -1458,7 +1458,7 @@ restart:
 * unmap_mapping_range - unmap the portion of all mmaps
 * in the specified address_space corresponding to the specified
 * page range in the underlying file.
- * @address_space: the address space containing mmaps to be unmapped.
+ * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from vmtruncate(), which
diff --git a/mm/mempool.c b/mm/mempool.c
index c9f3d4620428..9a72f7d918fa 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool)
 * functions might sleep - as long as the mempool_alloc function is not called
 * from IRQ contexts.
 */
-mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
                                mempool_free_t *free_fn, void *pool_data)
 {
-        mempool_t *pool;
+        return  mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
+}
+EXPORT_SYMBOL(mempool_create);
-        pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+                        mempool_free_t *free_fn, void *pool_data, int node_id)
+{
+        mempool_t *pool;
+        pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
        if (!pool)
                return NULL;
        memset(pool, 0, sizeof(*pool));
-        pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
+        pool->elements = kmalloc_node(min_nr * sizeof(void *),
+                                        GFP_KERNEL, node_id);
        if (!pool->elements) {
                kfree(pool);
                return NULL;
@@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
        }
        return pool;
 }
-EXPORT_SYMBOL(mempool_create);
+EXPORT_SYMBOL(mempool_create_node);
 /**
 * mempool_resize - resize an existing memory pool
@@ -197,7 +204,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
 {
        void *element;
        unsigned long flags;
-        DEFINE_WAIT(wait);
+        wait_queue_t wait;
        int gfp_temp;
        might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -228,6 +235,7 @@ repeat_alloc:
        /* Now start performing page reclaim */
        gfp_temp = gfp_mask;
+        init_wait(&wait);
        prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
        smp_mb();
        if (!pool->curr_nr)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 559336de9687..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
 * Used by page_zone() to look up the address of the struct zone whose
 * id is encoded in the upper bits of page->flags
 */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -1649,11 +1649,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn)
 {
-        struct page *start = pfn_to_page(start_pfn);
        struct page *page;
+        unsigned long end_pfn = start_pfn + size;
+        unsigned long pfn;
-        for (page = start; page < (start + size); page++) {
+        for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
-                set_page_zone(page, NODEZONE(nid, zone));
+                if (!early_pfn_valid(pfn))
+                        continue;
+                if (!early_pfn_in_nid(pfn, nid))
+                        continue;
+                page = pfn_to_page(pfn);
+                set_page_links(page, zone, nid, pfn);
                set_page_count(page, 0);
                reset_page_mapcount(page);
                SetPageReserved(page);
@@ -1677,6 +1683,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
        }
 }
+/*
+ * Build an index into zone_table[]: @x (a node id or a section number,
+ * depending on the caller) is shifted above the low ZONES_SHIFT bits,
+ * which carry @zone_nr.  Both arguments are parenthesized so that
+ * expression arguments expand with the intended precedence.
+ */
+#define ZONETABLE_INDEX(x, zone_nr)     (((x) << ZONES_SHIFT) | (zone_nr))
+/*
+ * zonetable_add - publish a zone in zone_table[]
+ * @zone: the zone to record
+ * @nid:  node id; used as the table key when FLAGS_HAS_NODE is set
+ * @zid:  zone index; forms the low bits of every key written
+ * @pfn:  first page frame number of the zone
+ * @size: number of page frames counted from @pfn
+ *
+ * With FLAGS_HAS_NODE a single entry keyed by (@nid, @zid) is written.
+ * Otherwise one entry keyed by (section number, @zid) is written for
+ * each section from the one holding @pfn up to and including the one
+ * holding @pfn + @size.
+ */
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+                unsigned long size)
+{
+        unsigned long snum = pfn_to_section_nr(pfn);
+        /*
+         * NOTE(review): @pfn + @size is one frame past the zone, yet the
+         * loop below treats its section as inclusive.  Confirm whether
+         * pfn + size - 1 was intended before changing this.
+         */
+        unsigned long end = pfn_to_section_nr(pfn + size);
+        if (FLAGS_HAS_NODE)
+                zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+        else
+                for (; snum <= end; snum++)
+                        zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
        memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1861,7 +1881,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                unsigned long size, realsize;
                unsigned long batch;
-                zone_table[NODEZONE(nid, j)] = zone;
                realsize = size = zones_size[j];
                if (zholes_size)
                        realsize -= zholes_size[j];
@@ -1927,6 +1946,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                memmap_init(size, nid, j, zone_start_pfn);
+                zonetable_add(zone, nid, j, zone_start_pfn, size);
                zone_start_pfn += size;
                zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1935,24 +1956,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
-        unsigned long size;
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
        /* ia64 gets its own node_mem_map, before this, without bootmem */
        if (!pgdat->node_mem_map) {
+                unsigned long size;
+                struct page *map;
                size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-                pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+                map = alloc_remap(pgdat->node_id, size);
+                if (!map)
+                        map = alloc_bootmem_node(pgdat, size);
+                pgdat->node_mem_map = map;
        }
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
        /*
         * With no DISCONTIG, the global mem_map is just set as node 0's
         */
        if (pgdat == NODE_DATA(0))
                mem_map = NODE_DATA(0)->node_mem_map;
 #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1968,18 +1995,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
        free_area_init_core(pgdat, zones_size, zholes_size);
 }
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
 EXPORT_SYMBOL(contig_page_data);
+#endif
 void __init free_area_init(unsigned long *zones_size)
 {
-        free_area_init_node(0, &contig_page_data, zones_size,
+        free_area_init_node(0, NODE_DATA(0), zones_size,
                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
-#endif
 #ifdef CONFIG_PROC_FS
diff --git a/mm/page_io.c b/mm/page_io.c
index 667c76df1ec2..2e605a19ce57 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -127,7 +127,7 @@ out:
        return ret;
 }
-#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
+#ifdef CONFIG_SOFTWARE_SUSPEND
 /*
 * A scruffy utility function to read or write an arbitrary swap page
 * and wait on the I/O.  The caller must have a ref on the page.
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
                spin_unlock_irq(&pdflush_lock);
                schedule();
-                if (try_to_freeze(PF_FREEZE)) {
+                if (try_to_freeze()) {
                        spin_lock_irq(&pdflush_lock);
                        continue;
                }
diff --git a/mm/rmap.c b/mm/rmap.c
index 89770bd25f31..08ac5c7fa91f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,8 +247,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 *
 * On success returns with mapped pte and locked mm->page_table_lock.
 */
-static pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-                                        unsigned long address)
+                          unsigned long address)
 {
        pgd_t *pgd;
        pud_t *pud;
diff --git a/mm/slab.c b/mm/slab.c
index 93cbbbb39f42..122d031baab2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -92,6 +92,7 @@
 #include        <linux/sysctl.h>
 #include        <linux/module.h>
 #include        <linux/rcupdate.h>
+#include        <linux/string.h>
 #include        <asm/uaccess.h>
 #include        <asm/cacheflush.h>
@@ -3082,3 +3083,26 @@ unsigned int ksize(const void *objp)
        return size;
 }
+/*
+ * kstrdup - duplicate a NUL-terminated string into kmalloc()ed memory
+ *
+ * @s: the string to copy; may be NULL
+ * @gfp: the GFP mask handed to kmalloc() for the new buffer
+ *
+ * Returns the newly allocated copy, or NULL when @s is NULL or the
+ * allocation fails.
+ */
+char *kstrdup(const char *s, int gfp)
+{
+        char *copy = NULL;
+        if (s) {
+                /* strlen() excludes the terminator, so add one for it */
+                size_t nbytes = strlen(s) + 1;
+                copy = kmalloc(nbytes, gfp);
+                if (copy)
+                        memcpy(copy, s, nbytes);
+        }
+        return copy;
+}
+EXPORT_SYMBOL(kstrdup);
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..b54e304df4a7
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,137 @@
+/*
+ * sparse memory mappings.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section       - memory sections, mem_map's for valid memory
+ */
+struct mem_section mem_section[NR_MEM_SECTIONS];
+EXPORT_SYMBOL(mem_section);
+/*
+ * Record a memory area against a node: flag each memory section touched
+ * by [start, end) as present.  @start is first masked with
+ * PAGE_SECTION_MASK, then the range is walked one section at a time.
+ * A section whose section_mem_map is already non-zero is left alone.
+ * @nid is not consulted by this implementation.
+ */
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+        unsigned long pfn = start & PAGE_SECTION_MASK;
+        while (pfn < end) {
+                struct mem_section *ms = &mem_section[pfn_to_section_nr(pfn)];
+                if (!ms->section_mem_map)
+                        ms->section_mem_map = SECTION_MARKED_PRESENT;
+                pfn += PAGES_PER_SECTION;
+        }
+}
+/*
+ * Only used by the i386 NUMA architectures, but relatively
+ * generic code.
+ *
+ * Steps through [start_pfn, end_pfn) in PAGES_PER_SECTION strides and,
+ * for every stride whose first pfn belongs to @nid and is valid, counts
+ * PAGES_PER_SECTION pages.  Returns the bytes of struct page needed to
+ * describe the counted pages.
+ */
+unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
+                                                     unsigned long end_pfn)
+{
+        unsigned long counted = 0;
+        unsigned long pfn = start_pfn;
+        while (pfn < end_pfn) {
+                /* the node check runs first; pfn_valid() only if it matches */
+                if (nid == early_pfn_to_nid(pfn) && pfn_valid(pfn))
+                        counted += PAGES_PER_SECTION;
+                pfn += PAGES_PER_SECTION;
+        }
+        return counted * sizeof(struct page);
+}
+/*
+ * Subtle: the stored value is the section's mem_map pointer biased
+ * downwards by the section's first pfn.  For any page of the section,
+ * subtracting this encoded pointer from the page's address therefore
+ * yields the page's actual physical page frame number.
+ */
+static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
+{
+        struct page *biased = mem_map - section_nr_to_pfn(pnum);
+        return (unsigned long)biased;
+}
+/*
+ * Inverse of sparse_encode_mem_map(): add the section's first pfn back
+ * onto the encoded pointer to recover the mem_map.  We need this if we
+ * ever free the mem_maps; that is not implemented yet, so the function
+ * is kept (marked unused) for parity with its sibling.
+ */
+static __attribute((unused))
+struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
+{
+        struct page *biased = (struct page *)coded_mem_map;
+        return biased + section_nr_to_pfn(pnum);
+}
+/*
+ * Attach @mem_map to section @ms (section number @pnum).  The encoded
+ * map is OR-ed into section_mem_map, so bits already set there are kept.
+ * Returns 1 on success, or -EINVAL when @ms is not a valid section.
+ */
+static int sparse_init_one_section(struct mem_section *ms,
+                unsigned long pnum, struct page *mem_map)
+{
+        if (valid_section(ms)) {
+                ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+                return 1;
+        }
+        return -EINVAL;
+}
+/*
+ * Allocate the mem_map for section @pnum on the node that owns the
+ * section's first pfn.  alloc_remap() is tried first, with
+ * alloc_bootmem_node() as the fallback.  If both fail, a warning is
+ * logged, the section's section_mem_map is cleared and NULL is returned.
+ */
+static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
+{
+        const unsigned long bytes = sizeof(struct page) * PAGES_PER_SECTION;
+        int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+        struct page *map;
+        map = alloc_remap(nid, bytes);
+        if (!map)
+                map = alloc_bootmem_node(NODE_DATA(nid), bytes);
+        if (!map) {
+                printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+                mem_section[pnum].section_mem_map = 0;
+        }
+        return map;
+}
+/*
+ * Walk all NR_MEM_SECTIONS section numbers; for each one accepted by
+ * valid_section_nr(), allocate a mem_map and record it in the section.
+ * A section whose allocation fails is skipped.
+ */
+void sparse_init(void)
+{
+        unsigned long nr;
+        for (nr = 0; nr < NR_MEM_SECTIONS; nr++) {
+                struct page *section_map;
+                if (valid_section_nr(nr)) {
+                        section_map = sparse_early_mem_map_alloc(nr);
+                        if (section_map != NULL)
+                                sparse_init_one_section(&mem_section[nr], nr,
+                                                        section_map);
+                }
+        }
+}
+/*
+ * Mark the section containing @start_pfn present and attach @map to it.
+ *
+ * Returns the number of sections whose mem_maps were properly set.
+ * A result <= 0 means the passed-in map was not consumed and must be
+ * freed by the caller; -EEXIST is returned when the section is already
+ * marked present.  @nr_pages is not consulted by this implementation.
+ */
+int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+{
+        struct mem_section *section = __pfn_to_section(start_pfn);
+        const int already_present =
+                (section->section_mem_map & SECTION_MARKED_PRESENT) != 0;
+        if (already_present)
+                return -EEXIST;
+        section->section_mem_map |= SECTION_MARKED_PRESENT;
+        return sparse_init_one_section(section, pfn_to_section_nr(start_pfn),
+                                       map);
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b8e62a19370..1fa312a8db77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1216,8 +1216,8 @@ static int kswapd(void *p)
        order = 0;
        for ( ; ; ) {
                unsigned long new_order;
-                if (current->flags & PF_FREEZE)
-                        refrigerator(PF_FREEZE);
+                try_to_freeze();
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
                new_order = pgdat->kswapd_max_order;