path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  91
-rw-r--r--  mm/Makefile          |   2
-rw-r--r--  mm/bootmem.c         |  23
-rw-r--r--  mm/fadvise.c         |   4
-rw-r--r--  mm/filemap.c         |  91
-rw-r--r--  mm/filemap.h         |  94
-rw-r--r--  mm/filemap_xip.c     | 440
-rw-r--r--  mm/hugetlb.c         |  11
-rw-r--r--  mm/madvise.c         |  17
-rw-r--r--  mm/memory.c          |  79
-rw-r--r--  mm/mempolicy.c       |   4
-rw-r--r--  mm/mempool.c         |  22
-rw-r--r--  mm/mmap.c            |   6
-rw-r--r--  mm/mremap.c          |   2
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/oom_kill.c        |  10
-rw-r--r--  mm/page-writeback.c  |   2
-rw-r--r--  mm/page_alloc.c      |  98
-rw-r--r--  mm/page_io.c         |   2
-rw-r--r--  mm/pdflush.c         |   2
-rw-r--r--  mm/rmap.c            |   4
-rw-r--r--  mm/shmem.c           |  17
-rw-r--r--  mm/slab.c            |  37
-rw-r--r--  mm/sparse.c          | 137
-rw-r--r--  mm/vmscan.c          |   6
25 files changed, 1006 insertions(+), 201 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 000000000000..cd379936cac6
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,91 @@
+config SELECT_MEMORY_MODEL
+	def_bool y
+	depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
+
+choice
+	prompt "Memory model"
+	depends on SELECT_MEMORY_MODEL
+	default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+	default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+	default FLATMEM_MANUAL
+
+config FLATMEM_MANUAL
+	bool "Flat Memory"
+	depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+	help
+	  This option allows you to change some of the ways that
+	  Linux manages its memory internally.  Most users will
+	  only have one option here: FLATMEM.  This is normal
+	  and a correct option.
+
+	  Some users of more advanced features like NUMA and
+	  memory hotplug may have different options here.
+	  DISCONTIGMEM is a more mature, better tested system,
+	  but is incompatible with memory hotplug and may suffer
+	  decreased performance over SPARSEMEM.  If unsure between
+	  "Sparse Memory" and "Discontiguous Memory", choose
+	  "Discontiguous Memory".
+
+	  If unsure, choose this option (Flat Memory) over any other.
+
+config DISCONTIGMEM_MANUAL
+	bool "Discontiguous Memory"
+	depends on ARCH_DISCONTIGMEM_ENABLE
+	help
+	  This option provides enhanced support for discontiguous
+	  memory systems, over FLATMEM.  These systems have holes
+	  in their physical address spaces, and this option provides
+	  more efficient handling of these holes.  However, the vast
+	  majority of hardware has quite flat address spaces, and
+	  can have degraded performance from the extra overhead that
+	  this option imposes.
+
+	  Many NUMA configurations will have this as the only option.
+
+	  If unsure, choose "Flat Memory" over this option.
+
+config SPARSEMEM_MANUAL
+	bool "Sparse Memory"
+	depends on ARCH_SPARSEMEM_ENABLE
+	help
+	  This will be the only option for some systems, including
+	  memory hotplug systems.  This is normal.
+
+	  For many other systems, this will be an alternative to
+	  "Discontiguous Memory".  This option provides some potential
+	  performance benefits, along with decreased code complexity,
+	  but it is newer, and more experimental.
+
+	  If unsure, choose "Discontiguous Memory" or "Flat Memory"
+	  over this option.
+
+endchoice
+
+config DISCONTIGMEM
+	def_bool y
+	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+
+config SPARSEMEM
+	def_bool y
+	depends on SPARSEMEM_MANUAL
+
+config FLATMEM
+	def_bool y
+	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+	def_bool y
+	depends on !SPARSEMEM
+
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory.  This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+	def_bool y
+	depends on DISCONTIGMEM || NUMA
+
+config HAVE_MEMORY_PRESENT
+	def_bool y
+	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
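
The practical effect of these symbols is to let generic code pick a mem_map access strategy at compile time. Below is a minimal sketch of that kind of conditional, not part of the patch; example_first_page() is an invented helper name.

#include <linux/mmzone.h>

/*
 * Illustrative only: with FLAT_NODE_MEM_MAP each pg_data_t carries a
 * contiguous node_mem_map, while SPARSEMEM resolves struct page
 * addresses per section through pfn_to_page().
 */
#ifdef CONFIG_FLAT_NODE_MEM_MAP
static inline struct page *example_first_page(struct pglist_data *pgdat)
{
	return pgdat->node_mem_map;		/* direct pointer into the flat map */
}
#else
static inline struct page *example_first_page(struct pglist_data *pgdat)
{
	return pfn_to_page(pgdat->node_start_pfn);	/* per-section lookup */
}
#endif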
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..4cd69e3ce421 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
+obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM)	+= shmem.o
 obj-$(CONFIG_TINY_SHMEM)	+= tiny-shmem.o
 
+obj-$(CONFIG_FS_XIP)	+= filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,14 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so
33 * dma_get_required_mask(), which uses 33 * dma_get_required_mask(), which uses
34 * it, can be an inline function */ 34 * it, can be an inline function */
35 35
36#ifdef CONFIG_CRASH_DUMP
37/*
38 * If we have booted due to a crash, max_pfn will be a very low value. We need
39 * to know the amount of memory that the previous kernel used.
40 */
41unsigned long saved_max_pfn;
42#endif
43
36/* return the number of _pages_ that will be allocated for the boot bitmap */ 44/* return the number of _pages_ that will be allocated for the boot bitmap */
37unsigned long __init bootmem_bootmap_pages (unsigned long pages) 45unsigned long __init bootmem_bootmap_pages (unsigned long pages)
38{ 46{
@@ -57,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
57 pgdat->pgdat_next = pgdat_list; 65 pgdat->pgdat_next = pgdat_list;
58 pgdat_list = pgdat; 66 pgdat_list = pgdat;
59 67
60 mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL); 68 mapsize = ALIGN(mapsize, sizeof(long));
61 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 69 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
62 bdata->node_boot_start = (start << PAGE_SHIFT); 70 bdata->node_boot_start = (start << PAGE_SHIFT);
63 bdata->node_low_pfn = end; 71 bdata->node_low_pfn = end;
@@ -178,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
178 } else 186 } else
179 preferred = 0; 187 preferred = 0;
180 188
181 preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; 189 preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
182 preferred += offset; 190 preferred += offset;
183 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; 191 areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
184 incr = align >> PAGE_SHIFT ? : 1; 192 incr = align >> PAGE_SHIFT ? : 1;
@@ -219,7 +227,7 @@ found:
219 */ 227 */
220 if (align < PAGE_SIZE && 228 if (align < PAGE_SIZE &&
221 bdata->last_offset && bdata->last_pos+1 == start) { 229 bdata->last_offset && bdata->last_pos+1 == start) {
222 offset = (bdata->last_offset+align-1) & ~(align-1); 230 offset = ALIGN(bdata->last_offset, align);
223 BUG_ON(offset > PAGE_SIZE); 231 BUG_ON(offset > PAGE_SIZE);
224 remaining_size = PAGE_SIZE-offset; 232 remaining_size = PAGE_SIZE-offset;
225 if (size < remaining_size) { 233 if (size < remaining_size) {
@@ -256,6 +264,7 @@ found:
256static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 264static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
257{ 265{
258 struct page *page; 266 struct page *page;
267 unsigned long pfn;
259 bootmem_data_t *bdata = pgdat->bdata; 268 bootmem_data_t *bdata = pgdat->bdata;
260 unsigned long i, count, total = 0; 269 unsigned long i, count, total = 0;
261 unsigned long idx; 270 unsigned long idx;
@@ -266,7 +275,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
266 275
267 count = 0; 276 count = 0;
268 /* first extant page of the node */ 277 /* first extant page of the node */
269 page = virt_to_page(phys_to_virt(bdata->node_boot_start)); 278 pfn = bdata->node_boot_start >> PAGE_SHIFT;
270 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); 279 idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
271 map = bdata->node_bootmem_map; 280 map = bdata->node_bootmem_map;
272 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ 281 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +284,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
275 gofast = 1; 284 gofast = 1;
276 for (i = 0; i < idx; ) { 285 for (i = 0; i < idx; ) {
277 unsigned long v = ~map[i / BITS_PER_LONG]; 286 unsigned long v = ~map[i / BITS_PER_LONG];
287
278 if (gofast && v == ~0UL) { 288 if (gofast && v == ~0UL) {
279 int j, order; 289 int j, order;
280 290
291 page = pfn_to_page(pfn);
281 count += BITS_PER_LONG; 292 count += BITS_PER_LONG;
282 __ClearPageReserved(page); 293 __ClearPageReserved(page);
283 order = ffs(BITS_PER_LONG) - 1; 294 order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +303,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
292 page += BITS_PER_LONG; 303 page += BITS_PER_LONG;
293 } else if (v) { 304 } else if (v) {
294 unsigned long m; 305 unsigned long m;
306
307 page = pfn_to_page(pfn);
295 for (m = 1; m && i < idx; m<<=1, page++, i++) { 308 for (m = 1; m && i < idx; m<<=1, page++, i++) {
296 if (v & m) { 309 if (v & m) {
297 count++; 310 count++;
@@ -302,8 +315,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
302 } 315 }
303 } else { 316 } else {
304 i+=BITS_PER_LONG; 317 i+=BITS_PER_LONG;
305 page += BITS_PER_LONG;
306 } 318 }
319 pfn += BITS_PER_LONG;
307 } 320 }
308 total += count; 321 total += count;
309 322
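
The bootmem changes are mostly mechanical: open-coded round-up expressions become ALIGN(), and free_all_bootmem_core() recomputes the struct page pointer from a pfn instead of marching one pointer across the whole node, since mem_map is no longer guaranteed to be contiguous. A sketch of the round-up identity, using a stand-in macro name rather than the real kernel header:

/*
 * Illustrative only: EXAMPLE_ALIGN mirrors the kernel's ALIGN() and is
 * exactly the expression the patch removes.  'a' must be a power of two.
 */
#define EXAMPLE_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* EXAMPLE_ALIGN(13, 8) == 16,  EXAMPLE_ALIGN(16, 8) == 16 */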
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 57264d74b8bf..5f19e87bc5af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -43,6 +43,10 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		goto out;
 	}
 
+	if (mapping->a_ops->get_xip_page)
+		/* no bad return value, but ignore advice */
+		goto out;
+
 	/* Careful about overflows. Len == 0 means "as much as possible" */
 	endbyte = offset + len;
 	if (!len || endbyte < len)
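
get_xip_page() is the new address_space operation that marks a mapping as execute-in-place; readahead-style advice is meaningless for such mappings, hence the early exit above. A rough sketch of how a filesystem might provide it follows; the prototype is assumed from the ext2 XIP work of the same era and the examplefs_* names are invented.

static struct page *examplefs_get_xip_page(struct address_space *mapping,
					    sector_t sector, int create)
{
	/*
	 * Map 'sector' to a page of directly addressable backing store.
	 * Returning ERR_PTR(-ENODATA) reports a hole; callers substitute
	 * ZERO_PAGE for reads and may retry with create=1 for writes.
	 */
	return ERR_PTR(-ENODATA);
}

static struct address_space_operations examplefs_aops = {
	.get_xip_page	= examplefs_get_xip_page,
	/* ... the usual readpage/writepage operations ... */
};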
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a2fee2cb62b..c11418dd94e8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/security.h> 29#include <linux/security.h>
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include "filemap.h"
31/* 32/*
32 * FIXME: remove all knowledge of the buffer layer from the core VM 33 * FIXME: remove all knowledge of the buffer layer from the core VM
33 */ 34 */
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
1714} 1715}
1715EXPORT_SYMBOL(remove_suid); 1716EXPORT_SYMBOL(remove_suid);
1716 1717
1717/* 1718size_t
1718 * Copy as much as we can into the page and return the number of bytes which
1719 * were sucessfully copied. If a fault is encountered then clear the page
1720 * out to (offset+bytes) and return the number of bytes which were copied.
1721 */
1722static inline size_t
1723filemap_copy_from_user(struct page *page, unsigned long offset,
1724 const char __user *buf, unsigned bytes)
1725{
1726 char *kaddr;
1727 int left;
1728
1729 kaddr = kmap_atomic(page, KM_USER0);
1730 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1731 kunmap_atomic(kaddr, KM_USER0);
1732
1733 if (left != 0) {
1734 /* Do it the slow way */
1735 kaddr = kmap(page);
1736 left = __copy_from_user(kaddr + offset, buf, bytes);
1737 kunmap(page);
1738 }
1739 return bytes - left;
1740}
1741
1742static size_t
1743__filemap_copy_from_user_iovec(char *vaddr, 1719__filemap_copy_from_user_iovec(char *vaddr,
1744 const struct iovec *iov, size_t base, size_t bytes) 1720 const struct iovec *iov, size_t base, size_t bytes)
1745{ 1721{
@@ -1767,52 +1743,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
1767} 1743}
1768 1744
1769/* 1745/*
1770 * This has the same sideeffects and return value as filemap_copy_from_user().
1771 * The difference is that on a fault we need to memset the remainder of the
1772 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
1773 * single-segment behaviour.
1774 */
1775static inline size_t
1776filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1777 const struct iovec *iov, size_t base, size_t bytes)
1778{
1779 char *kaddr;
1780 size_t copied;
1781
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1784 base, bytes);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 if (copied != bytes) {
1787 kaddr = kmap(page);
1788 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
1789 base, bytes);
1790 kunmap(page);
1791 }
1792 return copied;
1793}
1794
1795static inline void
1796filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1797{
1798 const struct iovec *iov = *iovp;
1799 size_t base = *basep;
1800
1801 while (bytes) {
1802 int copy = min(bytes, iov->iov_len - base);
1803
1804 bytes -= copy;
1805 base += copy;
1806 if (iov->iov_len == base) {
1807 iov++;
1808 base = 0;
1809 }
1810 }
1811 *iovp = iov;
1812 *basep = base;
1813}
1814
1815/*
1816 * Performs necessary checks before doing a write 1746 * Performs necessary checks before doing a write
1817 * 1747 *
1818 * Can adjust writing position aor amount of bytes to write. 1748 * Can adjust writing position aor amount of bytes to write.
@@ -1827,12 +1757,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1827 if (unlikely(*pos < 0)) 1757 if (unlikely(*pos < 0))
1828 return -EINVAL; 1758 return -EINVAL;
1829 1759
1830 if (unlikely(file->f_error)) {
1831 int err = file->f_error;
1832 file->f_error = 0;
1833 return err;
1834 }
1835
1836 if (!isblk) { 1760 if (!isblk) {
1837 /* FIXME: this is for backwards compatibility with 2.4 */ 1761 /* FIXME: this is for backwards compatibility with 2.4 */
1838 if (file->f_flags & O_APPEND) 1762 if (file->f_flags & O_APPEND)
@@ -1927,8 +1851,11 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1927 * i_sem is held, which protects generic_osync_inode() from 1851 * i_sem is held, which protects generic_osync_inode() from
1928 * livelocking. 1852 * livelocking.
1929 */ 1853 */
1930 if (written >= 0 && file->f_flags & O_SYNC) 1854 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1931 generic_osync_inode(inode, mapping, OSYNC_METADATA); 1855 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1856 if (err < 0)
1857 written = err;
1858 }
1932 if (written == count && !is_sync_kiocb(iocb)) 1859 if (written == count && !is_sync_kiocb(iocb))
1933 written = -EIOCBQUEUED; 1860 written = -EIOCBQUEUED;
1934 return written; 1861 return written;
@@ -2027,7 +1954,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2027 if (unlikely(nr_segs > 1)) { 1954 if (unlikely(nr_segs > 1)) {
2028 filemap_set_next_iovec(&cur_iov, 1955 filemap_set_next_iovec(&cur_iov,
2029 &iov_base, status); 1956 &iov_base, status);
2030 buf = cur_iov->iov_base + iov_base; 1957 if (count)
1958 buf = cur_iov->iov_base +
1959 iov_base;
2031 } else { 1960 } else {
2032 iov_base += status; 1961 iov_base += status;
2033 } 1962 }
diff --git a/mm/filemap.h b/mm/filemap.h
new file mode 100644
index 000000000000..13793ba0ce17
--- /dev/null
+++ b/mm/filemap.h
@@ -0,0 +1,94 @@
1/*
2 * linux/mm/filemap.h
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7#ifndef __FILEMAP_H
8#define __FILEMAP_H
9
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/highmem.h>
14#include <linux/uio.h>
15#include <linux/config.h>
16#include <asm/uaccess.h>
17
18size_t
19__filemap_copy_from_user_iovec(char *vaddr,
20 const struct iovec *iov,
21 size_t base,
22 size_t bytes);
23
24/*
25 * Copy as much as we can into the page and return the number of bytes which
26 * were successfully copied. If a fault is encountered then clear the page
27 * out to (offset+bytes) and return the number of bytes which were copied.
28 */
29static inline size_t
30filemap_copy_from_user(struct page *page, unsigned long offset,
31 const char __user *buf, unsigned bytes)
32{
33 char *kaddr;
34 int left;
35
36 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0);
39
40 if (left != 0) {
41 /* Do it the slow way */
42 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes);
44 kunmap(page);
45 }
46 return bytes - left;
47}
48
49/*
50 * This has the same sideeffects and return value as filemap_copy_from_user().
51 * The difference is that on a fault we need to memset the remainder of the
52 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
53 * single-segment behaviour.
54 */
55static inline size_t
56filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
57 const struct iovec *iov, size_t base, size_t bytes)
58{
59 char *kaddr;
60 size_t copied;
61
62 kaddr = kmap_atomic(page, KM_USER0);
63 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
64 base, bytes);
65 kunmap_atomic(kaddr, KM_USER0);
66 if (copied != bytes) {
67 kaddr = kmap(page);
68 copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
69 base, bytes);
70 kunmap(page);
71 }
72 return copied;
73}
74
75static inline void
76filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
77{
78 const struct iovec *iov = *iovp;
79 size_t base = *basep;
80
81 while (bytes) {
82 int copy = min(bytes, iov->iov_len - base);
83
84 bytes -= copy;
85 base += copy;
86 if (iov->iov_len == base) {
87 iov++;
88 base = 0;
89 }
90 }
91 *iovp = iov;
92 *basep = base;
93}
94#endif
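
These helpers are shared between the buffered write path in filemap.c and the new XIP write path. A minimal sketch of the intended calling pattern is below; the function name is invented for illustration.

static size_t example_copy_into_page(struct page *page, unsigned long offset,
				     const char __user *buf, size_t bytes)
{
	size_t copied;

	/* pre-fault the source to avoid deadlocking on the destination page */
	fault_in_pages_readable(buf, bytes);
	copied = filemap_copy_from_user(page, offset, buf, bytes);
	flush_dcache_page(page);
	return copied;		/* copied < bytes is treated as -EFAULT upstream */
}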
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 000000000000..8c199f537732
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,440 @@
1/*
2 * linux/mm/filemap_xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte <cotte@de.ibm.com>
6 *
7 * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
8 *
9 */
10
11#include <linux/fs.h>
12#include <linux/pagemap.h>
13#include <linux/module.h>
14#include <linux/uio.h>
15#include <linux/rmap.h>
16#include <asm/tlbflush.h>
17#include "filemap.h"
18
19/*
20 * This is a file read routine for execute in place files, and uses
21 * the mapping->a_ops->get_xip_page() function for the actual low-level
22 * stuff.
23 *
24 * Note the struct file* is not used at all. It may be NULL.
25 */
26static void
27do_xip_mapping_read(struct address_space *mapping,
28 struct file_ra_state *_ra,
29 struct file *filp,
30 loff_t *ppos,
31 read_descriptor_t *desc,
32 read_actor_t actor)
33{
34 struct inode *inode = mapping->host;
35 unsigned long index, end_index, offset;
36 loff_t isize;
37
38 BUG_ON(!mapping->a_ops->get_xip_page);
39
40 index = *ppos >> PAGE_CACHE_SHIFT;
41 offset = *ppos & ~PAGE_CACHE_MASK;
42
43 isize = i_size_read(inode);
44 if (!isize)
45 goto out;
46
47 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
48 for (;;) {
49 struct page *page;
50 unsigned long nr, ret;
51
52 /* nr is the maximum number of bytes to copy from this page */
53 nr = PAGE_CACHE_SIZE;
54 if (index >= end_index) {
55 if (index > end_index)
56 goto out;
57 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
58 if (nr <= offset) {
59 goto out;
60 }
61 }
62 nr = nr - offset;
63
64 page = mapping->a_ops->get_xip_page(mapping,
65 index*(PAGE_SIZE/512), 0);
66 if (!page)
67 goto no_xip_page;
68 if (unlikely(IS_ERR(page))) {
69 if (PTR_ERR(page) == -ENODATA) {
70 /* sparse */
71 page = ZERO_PAGE(0);
72 } else {
73 desc->error = PTR_ERR(page);
74 goto out;
75 }
76 }
77
78 /* If users can be writing to this page using arbitrary
79 * virtual addresses, take care about potential aliasing
80 * before reading the page on the kernel side.
81 */
82 if (mapping_writably_mapped(mapping))
83 flush_dcache_page(page);
84
85 /*
86 * Ok, we have the page, so now we can copy it to user space...
87 *
88 * The actor routine returns how many bytes were actually used..
89 * NOTE! This may not be the same as how much of a user buffer
90 * we filled up (we may be padding etc), so we can only update
91 * "pos" here (the actor routine has to update the user buffer
92 * pointers and the remaining count).
93 */
94 ret = actor(desc, page, offset, nr);
95 offset += ret;
96 index += offset >> PAGE_CACHE_SHIFT;
97 offset &= ~PAGE_CACHE_MASK;
98
99 if (ret == nr && desc->count)
100 continue;
101 goto out;
102
103no_xip_page:
104 /* Did not get the page. Report it */
105 desc->error = -EIO;
106 goto out;
107 }
108
109out:
110 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
111 if (filp)
112 file_accessed(filp);
113}
114
115ssize_t
116xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
117{
118 read_descriptor_t desc;
119
120 if (!access_ok(VERIFY_WRITE, buf, len))
121 return -EFAULT;
122
123 desc.written = 0;
124 desc.arg.buf = buf;
125 desc.count = len;
126 desc.error = 0;
127
128 do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
129 ppos, &desc, file_read_actor);
130
131 if (desc.written)
132 return desc.written;
133 else
134 return desc.error;
135}
136EXPORT_SYMBOL_GPL(xip_file_read);
137
138ssize_t
139xip_file_sendfile(struct file *in_file, loff_t *ppos,
140 size_t count, read_actor_t actor, void *target)
141{
142 read_descriptor_t desc;
143
144 if (!count)
145 return 0;
146
147 desc.written = 0;
148 desc.count = count;
149 desc.arg.data = target;
150 desc.error = 0;
151
152 do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
153 ppos, &desc, actor);
154 if (desc.written)
155 return desc.written;
156 return desc.error;
157}
158EXPORT_SYMBOL_GPL(xip_file_sendfile);
159
160/*
161 * __xip_unmap is invoked from xip_unmap and
162 * xip_write
163 *
164 * This function walks all vmas of the address_space and unmaps the
165 * ZERO_PAGE when found at pgoff. Should it go in rmap.c?
166 */
167static void
168__xip_unmap (struct address_space * mapping,
169 unsigned long pgoff)
170{
171 struct vm_area_struct *vma;
172 struct mm_struct *mm;
173 struct prio_tree_iter iter;
174 unsigned long address;
175 pte_t *pte;
176 pte_t pteval;
177
178 spin_lock(&mapping->i_mmap_lock);
179 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
180 mm = vma->vm_mm;
181 address = vma->vm_start +
182 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
183 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
184 /*
185 * We need the page_table_lock to protect us from page faults,
186 * munmap, fork, etc...
187 */
188 pte = page_check_address(ZERO_PAGE(address), mm,
189 address);
190 if (!IS_ERR(pte)) {
191 /* Nuke the page table entry. */
192 flush_cache_page(vma, address, pte_pfn(*pte));
193 pteval = ptep_clear_flush(vma, address, pte);
194 BUG_ON(pte_dirty(pteval));
195 pte_unmap(pte);
196 spin_unlock(&mm->page_table_lock);
197 }
198 }
199 spin_unlock(&mapping->i_mmap_lock);
200}
201
202/*
203 * xip_nopage() is invoked via the vma operations vector for a
204 * mapped memory region to read in file data during a page fault.
205 *
206 * This function is derived from filemap_nopage, but used for execute in place
207 */
208static struct page *
209xip_file_nopage(struct vm_area_struct * area,
210 unsigned long address,
211 int *type)
212{
213 struct file *file = area->vm_file;
214 struct address_space *mapping = file->f_mapping;
215 struct inode *inode = mapping->host;
216 struct page *page;
217 unsigned long size, pgoff, endoff;
218
219 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
220 + area->vm_pgoff;
221 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
222 + area->vm_pgoff;
223
224 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
225 if (pgoff >= size) {
226 return NULL;
227 }
228
229 page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
230 if (!IS_ERR(page)) {
231 return page;
232 }
233 if (PTR_ERR(page) != -ENODATA)
234 return NULL;
235
236 /* sparse block */
237 if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
238 (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
239 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
240 /* maybe shared writable, allocate new block */
241 page = mapping->a_ops->get_xip_page (mapping,
242 pgoff*(PAGE_SIZE/512), 1);
243 if (IS_ERR(page))
244 return NULL;
245 /* unmap page at pgoff from all other vmas */
246 __xip_unmap(mapping, pgoff);
247 } else {
248 /* not shared and writable, use ZERO_PAGE() */
249 page = ZERO_PAGE(address);
250 }
251
252 return page;
253}
254
255static struct vm_operations_struct xip_file_vm_ops = {
256 .nopage = xip_file_nopage,
257};
258
259int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
260{
261 BUG_ON(!file->f_mapping->a_ops->get_xip_page);
262
263 file_accessed(file);
264 vma->vm_ops = &xip_file_vm_ops;
265 return 0;
266}
267EXPORT_SYMBOL_GPL(xip_file_mmap);
268
269static ssize_t
270__xip_file_write(struct file *filp, const char __user *buf,
271 size_t count, loff_t pos, loff_t *ppos)
272{
273 struct address_space * mapping = filp->f_mapping;
274 struct address_space_operations *a_ops = mapping->a_ops;
275 struct inode *inode = mapping->host;
276 long status = 0;
277 struct page *page;
278 size_t bytes;
279 ssize_t written = 0;
280
281 BUG_ON(!mapping->a_ops->get_xip_page);
282
283 do {
284 unsigned long index;
285 unsigned long offset;
286 size_t copied;
287
288 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
289 index = pos >> PAGE_CACHE_SHIFT;
290 bytes = PAGE_CACHE_SIZE - offset;
291 if (bytes > count)
292 bytes = count;
293
294 /*
295 * Bring in the user page that we will copy from _first_.
296 * Otherwise there's a nasty deadlock on copying from the
297 * same page as we're writing to, without it being marked
298 * up-to-date.
299 */
300 fault_in_pages_readable(buf, bytes);
301
302 page = a_ops->get_xip_page(mapping,
303 index*(PAGE_SIZE/512), 0);
304 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
305 /* we allocate a new page and unmap it */
306 page = a_ops->get_xip_page(mapping,
307 index*(PAGE_SIZE/512), 1);
308 if (!IS_ERR(page))
309 /* unmap page at pgoff from all other vmas */
310 __xip_unmap(mapping, index);
311 }
312
313 if (IS_ERR(page)) {
314 status = PTR_ERR(page);
315 break;
316 }
317
318 copied = filemap_copy_from_user(page, offset, buf, bytes);
319 flush_dcache_page(page);
320 if (likely(copied > 0)) {
321 status = copied;
322
323 if (status >= 0) {
324 written += status;
325 count -= status;
326 pos += status;
327 buf += status;
328 }
329 }
330 if (unlikely(copied != bytes))
331 if (status >= 0)
332 status = -EFAULT;
333 if (status < 0)
334 break;
335 } while (count);
336 *ppos = pos;
337 /*
338 * No need to use i_size_read() here, the i_size
339 * cannot change under us because we hold i_sem.
340 */
341 if (pos > inode->i_size) {
342 i_size_write(inode, pos);
343 mark_inode_dirty(inode);
344 }
345
346 return written ? written : status;
347}
348
349ssize_t
350xip_file_write(struct file *filp, const char __user *buf, size_t len,
351 loff_t *ppos)
352{
353 struct address_space *mapping = filp->f_mapping;
354 struct inode *inode = mapping->host;
355 size_t count;
356 loff_t pos;
357 ssize_t ret;
358
359 down(&inode->i_sem);
360
361 if (!access_ok(VERIFY_READ, buf, len)) {
362 ret=-EFAULT;
363 goto out_up;
364 }
365
366 pos = *ppos;
367 count = len;
368
369 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
370
371 /* We can write back this queue in page reclaim */
372 current->backing_dev_info = mapping->backing_dev_info;
373
374 ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
375 if (ret)
376 goto out_backing;
377 if (count == 0)
378 goto out_backing;
379
380 ret = remove_suid(filp->f_dentry);
381 if (ret)
382 goto out_backing;
383
384 inode_update_time(inode, 1);
385
386 ret = __xip_file_write (filp, buf, count, pos, ppos);
387
388 out_backing:
389 current->backing_dev_info = NULL;
390 out_up:
391 up(&inode->i_sem);
392 return ret;
393}
394EXPORT_SYMBOL_GPL(xip_file_write);
395
396/*
397 * truncate a page used for execute in place
398 * functionality is analogous to block_truncate_page but uses get_xip_page
399 * to get the page instead of the page cache
400 */
401int
402xip_truncate_page(struct address_space *mapping, loff_t from)
403{
404 pgoff_t index = from >> PAGE_CACHE_SHIFT;
405 unsigned offset = from & (PAGE_CACHE_SIZE-1);
406 unsigned blocksize;
407 unsigned length;
408 struct page *page;
409 void *kaddr;
410
411 BUG_ON(!mapping->a_ops->get_xip_page);
412
413 blocksize = 1 << mapping->host->i_blkbits;
414 length = offset & (blocksize - 1);
415
416 /* Block boundary? Nothing to do */
417 if (!length)
418 return 0;
419
420 length = blocksize - length;
421
422 page = mapping->a_ops->get_xip_page(mapping,
423 index*(PAGE_SIZE/512), 0);
424 if (!page)
425 return -ENOMEM;
426 if (unlikely(IS_ERR(page))) {
427 if (PTR_ERR(page) == -ENODATA)
428 /* Hole? No need to truncate */
429 return 0;
430 else
431 return PTR_ERR(page);
432 }
433 kaddr = kmap_atomic(page, KM_USER0);
434 memset(kaddr + offset, 0, length);
435 kunmap_atomic(kaddr, KM_USER0);
436
437 flush_dcache_page(page);
438 return 0;
439}
440EXPORT_SYMBOL_GPL(xip_truncate_page);
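
A filesystem opts into these entry points through its file_operations for XIP-capable inodes. The sketch below is modelled on the ext2 XIP support from the same patch series; the struct name is invented.

static struct file_operations examplefs_xip_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= xip_file_read,
	.write		= xip_file_write,
	.mmap		= xip_file_mmap,
	.sendfile	= xip_file_sendfile,
};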
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbd1111ea119..6bf720bc662c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -301,6 +301,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
+	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
 
@@ -309,9 +310,17 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+		ptep = huge_pte_offset(mm, address);
+		if (! ptep)
+			/* This can happen on truncate, or if an
+			 * mmap() is aborted due to an error before
+			 * the prefault */
+			continue;
+
+		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
+
 		page = pte_page(pte);
 		put_page(page);
 	}
diff --git a/mm/madvise.c b/mm/madvise.c
index e3108054733c..c8c01a12fea4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,7 +65,6 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
-	VM_ClearReadHint(vma);
 	vma->vm_flags = new_flags;
 
 out:
@@ -84,8 +83,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
-	if (!file)
-		return -EBADF;
+	if (file->f_mapping->a_ops->get_xip_page) {
+		/* no bad return value, but ignore advice */
+		return 0;
+	}
 
 	*prev = vma;
 	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -136,11 +137,16 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 		return 0;
 }
 
-static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-			unsigned long start, unsigned long end, int behavior)
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+	    unsigned long start, unsigned long end, int behavior)
 {
+	struct file *filp = vma->vm_file;
 	long error = -EBADF;
 
+	if (!filp)
+		goto out;
+
 	switch (behavior) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
@@ -161,6 +167,7 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev
 		break;
 	}
 
+out:
 	return error;
 }
 
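
From userspace the change is only visible as advice being accepted on XIP-backed mappings. A minimal illustration, not from the patch:

#include <stdio.h>
#include <sys/mman.h>

/*
 * Illustrative only: MADV_WILLNEED on an execute-in-place mapping returns 0;
 * the advice is accepted but has no effect, since there is no page cache to
 * populate.
 */
static void example_advise(void *addr, size_t len)
{
	if (madvise(addr, len, MADV_WILLNEED) != 0)
		perror("madvise");
}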
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf9986..e046b7e4b530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
58#include <linux/swapops.h> 58#include <linux/swapops.h>
59#include <linux/elf.h> 59#include <linux/elf.h>
60 60
61#ifndef CONFIG_DISCONTIGMEM 61#ifndef CONFIG_NEED_MULTIPLE_NODES
62/* use the per-pgdat data instead for discontigmem - mbligh */ 62/* use the per-pgdat data instead for discontigmem - mbligh */
63unsigned long max_mapnr; 63unsigned long max_mapnr;
64struct page *mem_map; 64struct page *mem_map;
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
776 * Do a quick page-table lookup for a single page. 776 * Do a quick page-table lookup for a single page.
777 * mm->page_table_lock must be held. 777 * mm->page_table_lock must be held.
778 */ 778 */
779static struct page * 779static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
780__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) 780 int read, int write, int accessed)
781{ 781{
782 pgd_t *pgd; 782 pgd_t *pgd;
783 pud_t *pud; 783 pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
818 pfn = pte_pfn(pte); 818 pfn = pte_pfn(pte);
819 if (pfn_valid(pfn)) { 819 if (pfn_valid(pfn)) {
820 page = pfn_to_page(pfn); 820 page = pfn_to_page(pfn);
821 if (write && !pte_dirty(pte) && !PageDirty(page)) 821 if (accessed) {
822 set_page_dirty(page); 822 if (write && !pte_dirty(pte) &&!PageDirty(page))
823 mark_page_accessed(page); 823 set_page_dirty(page);
824 mark_page_accessed(page);
825 }
824 return page; 826 return page;
825 } 827 }
826 } 828 }
@@ -829,16 +831,19 @@ out:
829 return NULL; 831 return NULL;
830} 832}
831 833
832struct page * 834inline struct page *
833follow_page(struct mm_struct *mm, unsigned long address, int write) 835follow_page(struct mm_struct *mm, unsigned long address, int write)
834{ 836{
835 return __follow_page(mm, address, /*read*/0, write); 837 return __follow_page(mm, address, 0, write, 1);
836} 838}
837 839
838int 840/*
839check_user_page_readable(struct mm_struct *mm, unsigned long address) 841 * check_user_page_readable() can be called frm niterrupt context by oprofile,
842 * so we need to avoid taking any non-irq-safe locks
843 */
844int check_user_page_readable(struct mm_struct *mm, unsigned long address)
840{ 845{
841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; 846 return __follow_page(mm, address, 1, 0, 0) != NULL;
842} 847}
843EXPORT_SYMBOL(check_user_page_readable); 848EXPORT_SYMBOL(check_user_page_readable);
844 849
@@ -908,9 +913,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
908 pud = pud_offset(pgd, pg); 913 pud = pud_offset(pgd, pg);
909 BUG_ON(pud_none(*pud)); 914 BUG_ON(pud_none(*pud));
910 pmd = pmd_offset(pud, pg); 915 pmd = pmd_offset(pud, pg);
911 BUG_ON(pmd_none(*pmd)); 916 if (pmd_none(*pmd))
917 return i ? : -EFAULT;
912 pte = pte_offset_map(pmd, pg); 918 pte = pte_offset_map(pmd, pg);
913 BUG_ON(pte_none(*pte)); 919 if (pte_none(*pte)) {
920 pte_unmap(pte);
921 return i ? : -EFAULT;
922 }
914 if (pages) { 923 if (pages) {
915 pages[i] = pte_page(*pte); 924 pages[i] = pte_page(*pte);
916 get_page(pages[i]); 925 get_page(pages[i]);
@@ -935,11 +944,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
935 } 944 }
936 spin_lock(&mm->page_table_lock); 945 spin_lock(&mm->page_table_lock);
937 do { 946 do {
947 int write_access = write;
938 struct page *page; 948 struct page *page;
939 int lookup_write = write;
940 949
941 cond_resched_lock(&mm->page_table_lock); 950 cond_resched_lock(&mm->page_table_lock);
942 while (!(page = follow_page(mm, start, lookup_write))) { 951 while (!(page = follow_page(mm, start, write_access))) {
952 int ret;
953
943 /* 954 /*
944 * Shortcut for anonymous pages. We don't want 955 * Shortcut for anonymous pages. We don't want
945 * to force the creation of pages tables for 956 * to force the creation of pages tables for
@@ -947,13 +958,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
947 * nobody touched so far. This is important 958 * nobody touched so far. This is important
948 * for doing a core dump for these mappings. 959 * for doing a core dump for these mappings.
949 */ 960 */
950 if (!lookup_write && 961 if (!write && untouched_anonymous_page(mm,vma,start)) {
951 untouched_anonymous_page(mm,vma,start)) {
952 page = ZERO_PAGE(start); 962 page = ZERO_PAGE(start);
953 break; 963 break;
954 } 964 }
955 spin_unlock(&mm->page_table_lock); 965 spin_unlock(&mm->page_table_lock);
956 switch (handle_mm_fault(mm,vma,start,write)) { 966 ret = __handle_mm_fault(mm, vma, start, write_access);
967
968 /*
969 * The VM_FAULT_WRITE bit tells us that do_wp_page has
970 * broken COW when necessary, even if maybe_mkwrite
971 * decided not to set pte_write. We can thus safely do
972 * subsequent page lookups as if they were reads.
973 */
974 if (ret & VM_FAULT_WRITE)
975 write_access = 0;
976
977 switch (ret & ~VM_FAULT_WRITE) {
957 case VM_FAULT_MINOR: 978 case VM_FAULT_MINOR:
958 tsk->min_flt++; 979 tsk->min_flt++;
959 break; 980 break;
@@ -967,14 +988,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
967 default: 988 default:
968 BUG(); 989 BUG();
969 } 990 }
970 /*
971 * Now that we have performed a write fault
972 * and surely no longer have a shared page we
973 * shouldn't write, we shouldn't ignore an
974 * unwritable page in the page table if
975 * we are forcing write access.
976 */
977 lookup_write = write && !force;
978 spin_lock(&mm->page_table_lock); 991 spin_lock(&mm->page_table_lock);
979 } 992 }
980 if (pages) { 993 if (pages) {
@@ -1139,7 +1152,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1139{ 1152{
1140 pgd_t *pgd; 1153 pgd_t *pgd;
1141 unsigned long next; 1154 unsigned long next;
1142 unsigned long end = addr + size; 1155 unsigned long end = addr + PAGE_ALIGN(size);
1143 struct mm_struct *mm = vma->vm_mm; 1156 struct mm_struct *mm = vma->vm_mm;
1144 int err; 1157 int err;
1145 1158
@@ -1224,6 +1237,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1224 struct page *old_page, *new_page; 1237 struct page *old_page, *new_page;
1225 unsigned long pfn = pte_pfn(pte); 1238 unsigned long pfn = pte_pfn(pte);
1226 pte_t entry; 1239 pte_t entry;
1240 int ret;
1227 1241
1228 if (unlikely(!pfn_valid(pfn))) { 1242 if (unlikely(!pfn_valid(pfn))) {
1229 /* 1243 /*
@@ -1251,7 +1265,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1251 lazy_mmu_prot_update(entry); 1265 lazy_mmu_prot_update(entry);
1252 pte_unmap(page_table); 1266 pte_unmap(page_table);
1253 spin_unlock(&mm->page_table_lock); 1267 spin_unlock(&mm->page_table_lock);
1254 return VM_FAULT_MINOR; 1268 return VM_FAULT_MINOR|VM_FAULT_WRITE;
1255 } 1269 }
1256 } 1270 }
1257 pte_unmap(page_table); 1271 pte_unmap(page_table);
@@ -1278,6 +1292,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1278 /* 1292 /*
1279 * Re-check the pte - we dropped the lock 1293 * Re-check the pte - we dropped the lock
1280 */ 1294 */
1295 ret = VM_FAULT_MINOR;
1281 spin_lock(&mm->page_table_lock); 1296 spin_lock(&mm->page_table_lock);
1282 page_table = pte_offset_map(pmd, address); 1297 page_table = pte_offset_map(pmd, address);
1283 if (likely(pte_same(*page_table, pte))) { 1298 if (likely(pte_same(*page_table, pte))) {
@@ -1294,12 +1309,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1294 1309
1295 /* Free the old page.. */ 1310 /* Free the old page.. */
1296 new_page = old_page; 1311 new_page = old_page;
1312 ret |= VM_FAULT_WRITE;
1297 } 1313 }
1298 pte_unmap(page_table); 1314 pte_unmap(page_table);
1299 page_cache_release(new_page); 1315 page_cache_release(new_page);
1300 page_cache_release(old_page); 1316 page_cache_release(old_page);
1301 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1302 return VM_FAULT_MINOR; 1318 return ret;
1303 1319
1304no_new_page: 1320no_new_page:
1305 page_cache_release(old_page); 1321 page_cache_release(old_page);
@@ -1458,7 +1474,7 @@ restart:
1458 * unmap_mapping_range - unmap the portion of all mmaps 1474 * unmap_mapping_range - unmap the portion of all mmaps
1459 * in the specified address_space corresponding to the specified 1475 * in the specified address_space corresponding to the specified
1460 * page range in the underlying file. 1476 * page range in the underlying file.
1461 * @address_space: the address space containing mmaps to be unmapped. 1477 * @mapping: the address space containing mmaps to be unmapped.
1462 * @holebegin: byte in first page to unmap, relative to the start of 1478 * @holebegin: byte in first page to unmap, relative to the start of
1463 * the underlying file. This will be rounded down to a PAGE_SIZE 1479 * the underlying file. This will be rounded down to a PAGE_SIZE
1464 * boundary. Note that this is different from vmtruncate(), which 1480 * boundary. Note that this is different from vmtruncate(), which
@@ -1991,7 +2007,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
1991 if (write_access) { 2007 if (write_access) {
1992 if (!pte_write(entry)) 2008 if (!pte_write(entry))
1993 return do_wp_page(mm, vma, address, pte, pmd, entry); 2009 return do_wp_page(mm, vma, address, pte, pmd, entry);
1994
1995 entry = pte_mkdirty(entry); 2010 entry = pte_mkdirty(entry);
1996 } 2011 }
1997 entry = pte_mkyoung(entry); 2012 entry = pte_mkyoung(entry);
@@ -2006,7 +2021,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2006/* 2021/*
2007 * By the time we get here, we already hold the mm semaphore 2022 * By the time we get here, we already hold the mm semaphore
2008 */ 2023 */
2009int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, 2024int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2010 unsigned long address, int write_access) 2025 unsigned long address, int write_access)
2011{ 2026{
2012 pgd_t *pgd; 2027 pgd_t *pgd;
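
Since the mm/ side renames the fault entry point to __handle_mm_fault() and lets it OR VM_FAULT_WRITE into its return value, other callers presumably keep the old interface through a masking wrapper along the following lines; the real definition would live in include/linux/mm.h, outside this diffstat.

static inline int handle_mm_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		int write_access)
{
	/* hide the "COW already broken" hint from callers that only want status */
	return __handle_mm_fault(mm, vma, address, write_access) &
							(~VM_FAULT_WRITE);
}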
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cb41c31e7c87..b4eababc8198 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -443,7 +443,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	struct mempolicy *new;
 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (mode > MPOL_MAX)
+	if (mode < 0 || mode > MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(nodes, nmask, maxnode, mode);
 	if (err)
@@ -1138,11 +1138,11 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
+		rb_erase(&n->nd, &p->root);
 		mpol_free(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
 	spin_unlock(&p->lock);
-	p->root = RB_ROOT;
 }
 
 /* assumes fs == KERNEL_DS */
diff --git a/mm/mempool.c b/mm/mempool.c
index c9f3d4620428..65f2957b8d51 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool)
51 * functions might sleep - as long as the mempool_alloc function is not called 51 * functions might sleep - as long as the mempool_alloc function is not called
52 * from IRQ contexts. 52 * from IRQ contexts.
53 */ 53 */
54mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 54mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
55 mempool_free_t *free_fn, void *pool_data) 55 mempool_free_t *free_fn, void *pool_data)
56{ 56{
57 mempool_t *pool; 57 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
58}
59EXPORT_SYMBOL(mempool_create);
58 60
59 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 61mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
62 mempool_free_t *free_fn, void *pool_data, int node_id)
63{
64 mempool_t *pool;
65 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
60 if (!pool) 66 if (!pool)
61 return NULL; 67 return NULL;
62 memset(pool, 0, sizeof(*pool)); 68 memset(pool, 0, sizeof(*pool));
63 pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); 69 pool->elements = kmalloc_node(min_nr * sizeof(void *),
70 GFP_KERNEL, node_id);
64 if (!pool->elements) { 71 if (!pool->elements) {
65 kfree(pool); 72 kfree(pool);
66 return NULL; 73 return NULL;
@@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
87 } 94 }
88 return pool; 95 return pool;
89} 96}
90EXPORT_SYMBOL(mempool_create); 97EXPORT_SYMBOL(mempool_create_node);
91 98
92/** 99/**
93 * mempool_resize - resize an existing memory pool 100 * mempool_resize - resize an existing memory pool
@@ -197,8 +204,8 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
197{ 204{
198 void *element; 205 void *element;
199 unsigned long flags; 206 unsigned long flags;
200 DEFINE_WAIT(wait); 207 wait_queue_t wait;
201 int gfp_temp; 208 unsigned int gfp_temp;
202 209
203 might_sleep_if(gfp_mask & __GFP_WAIT); 210 might_sleep_if(gfp_mask & __GFP_WAIT);
204 211
@@ -228,6 +235,7 @@ repeat_alloc:
228 235
229 /* Now start performing page reclaim */ 236 /* Now start performing page reclaim */
230 gfp_temp = gfp_mask; 237 gfp_temp = gfp_mask;
238 init_wait(&wait);
231 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 239 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
232 smp_mb(); 240 smp_mb();
233 if (!pool->curr_nr) 241 if (!pool->curr_nr)
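
mempool_create_node() lets NUMA-aware callers place both the pool and its element array on a chosen node; mempool_create() is now just the node-agnostic (-1) case. A rough usage sketch with invented callback and function names:

static void *example_pool_alloc(unsigned int gfp_mask, void *pool_data)
{
	return kmem_cache_alloc(pool_data, gfp_mask);
}

static void example_pool_free(void *element, void *pool_data)
{
	kmem_cache_free(pool_data, element);
}

static mempool_t *example_create_pool(kmem_cache_t *cachep, int nid)
{
	/* reserve 16 elements, allocated on NUMA node 'nid' (-1 = any node) */
	return mempool_create_node(16, example_pool_alloc, example_pool_free,
				   cachep, nid);
}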
diff --git a/mm/mmap.c b/mm/mmap.c
index da3fa90a0aae..404319477e71 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	   leave 3% of the size of this process for other processes */
 	allowed -= current->mm->total_vm / 32;
 
-	if (atomic_read(&vm_committed_space) < allowed)
+	/*
+	 * cast `allowed' as a signed long because vm_committed_space
+	 * sometimes has a negative value
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
 		return 0;
 
 	vm_unacct_memory(pages);
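
The cast matters because atomic_read() yields an int while 'allowed' is unsigned long; without it a negative vm_committed_space is converted to a huge unsigned value and the check wrongly fails. A small standalone illustration, not kernel code:

#include <stdio.h>

int main(void)
{
	long committed = -8;		/* vm_committed_space dipped below zero */
	unsigned long allowed = 1000;

	/* usual arithmetic conversions make this an unsigned compare: false */
	printf("%d\n", committed < allowed);		/* prints 0 */
	/* with the cast the compare stays signed and behaves as intended */
	printf("%d\n", committed < (long)allowed);	/* prints 1 */
	return 0;
}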
diff --git a/mm/mremap.c b/mm/mremap.c
index ec7238a78f36..fc45dc9a617b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -229,6 +229,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	 * since do_munmap() will decrement it by old_len == new_len
 	 */
 	mm->total_vm += new_len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
@@ -243,7 +244,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
diff --git a/mm/nommu.c b/mm/nommu.c
index ce74452c02d9..fd4e8df0f02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1167,7 +1167,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	   leave 3% of the size of this process for other processes */
 	allowed -= current->mm->total_vm / 32;
 
-	if (atomic_read(&vm_committed_space) < allowed)
+	/*
+	 * cast `allowed' as a signed long because vm_committed_space
+	 * sometimes has a negative value
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
 		return 0;
 
 	vm_unacct_memory(pages);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 59666d905f19..1e56076672f5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -253,14 +253,16 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(unsigned int __nocast gfp_mask)
+void out_of_memory(unsigned int __nocast gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t * p;
 
-	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
-	/* print memory stats */
-	show_mem();
+	if (printk_ratelimit()) {
+		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
+			gfp_mask, order);
+		show_mem();
+	}
 
 	read_lock(&tasklist_lock);
 retry:
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 613b99a55917..a6329fa8f862 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -354,7 +354,7 @@ static void background_writeout(unsigned long _min_pages)
  * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
  * -1 if all pdflush threads were busy.
  */
-int wakeup_bdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages)
 {
 	if (nr_pages == 0) {
 		struct writeback_state wbs;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 206920796f5f..8d088371196a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
68 * Used by page_zone() to look up the address of the struct zone whose 68 * Used by page_zone() to look up the address of the struct zone whose
69 * id is encoded in the upper bits of page->flags 69 * id is encoded in the upper bits of page->flags
70 */ 70 */
71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; 71struct zone *zone_table[1 << ZONETABLE_SHIFT];
72EXPORT_SYMBOL(zone_table); 72EXPORT_SYMBOL(zone_table);
73 73
74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -897,12 +897,6 @@ rebalance:
897 cond_resched(); 897 cond_resched();
898 898
899 if (likely(did_some_progress)) { 899 if (likely(did_some_progress)) {
900 /*
901 * Go through the zonelist yet one more time, keep
902 * very high watermark here, this is only to catch
903 * a parallel oom killing, we must fail if we're still
904 * under heavy pressure.
905 */
906 for (i = 0; (z = zones[i]) != NULL; i++) { 900 for (i = 0; (z = zones[i]) != NULL; i++) {
907 if (!zone_watermark_ok(z, order, z->pages_min, 901 if (!zone_watermark_ok(z, order, z->pages_min,
908 classzone_idx, can_try_harder, 902 classzone_idx, can_try_harder,
@@ -936,7 +930,7 @@ rebalance:
936 goto got_pg; 930 goto got_pg;
937 } 931 }
938 932
939 out_of_memory(gfp_mask); 933 out_of_memory(gfp_mask, order);
940 goto restart; 934 goto restart;
941 } 935 }
942 936
@@ -1067,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1067 1061
1068static unsigned int nr_free_zone_pages(int offset) 1062static unsigned int nr_free_zone_pages(int offset)
1069{ 1063{
1070 pg_data_t *pgdat; 1064 /* Just pick one node, since fallback list is circular */
1065 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1071 unsigned int sum = 0; 1066 unsigned int sum = 0;
1072 1067
1073 for_each_pgdat(pgdat) { 1068 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1074 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1069 struct zone **zonep = zonelist->zones;
1075 struct zone **zonep = zonelist->zones; 1070 struct zone *zone;
1076 struct zone *zone;
1077 1071
1078 for (zone = *zonep++; zone; zone = *zonep++) { 1072 for (zone = *zonep++; zone; zone = *zonep++) {
1079 unsigned long size = zone->present_pages; 1073 unsigned long size = zone->present_pages;
1080 unsigned long high = zone->pages_high; 1074 unsigned long high = zone->pages_high;
1081 if (size > high) 1075 if (size > high)
1082 sum += size - high; 1076 sum += size - high;
1083 }
1084 } 1077 }
1085 1078
1086 return sum; 1079 return sum;
@@ -1649,11 +1642,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1649void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1642void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1650 unsigned long start_pfn) 1643 unsigned long start_pfn)
1651{ 1644{
1652 struct page *start = pfn_to_page(start_pfn);
1653 struct page *page; 1645 struct page *page;
1646 unsigned long end_pfn = start_pfn + size;
1647 unsigned long pfn;
1654 1648
1655 for (page = start; page < (start + size); page++) { 1649 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1656 set_page_zone(page, NODEZONE(nid, zone)); 1650 if (!early_pfn_valid(pfn))
1651 continue;
1652 if (!early_pfn_in_nid(pfn, nid))
1653 continue;
1654 page = pfn_to_page(pfn);
1655 set_page_links(page, zone, nid, pfn);
1657 set_page_count(page, 0); 1656 set_page_count(page, 0);
1658 reset_page_mapcount(page); 1657 reset_page_mapcount(page);
1659 SetPageReserved(page); 1658 SetPageReserved(page);
@@ -1661,9 +1660,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1661#ifdef WANT_PAGE_VIRTUAL 1660#ifdef WANT_PAGE_VIRTUAL
1662 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1661 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1663 if (!is_highmem_idx(zone)) 1662 if (!is_highmem_idx(zone))
1664 set_page_address(page, __va(start_pfn << PAGE_SHIFT)); 1663 set_page_address(page, __va(pfn << PAGE_SHIFT));
1665#endif 1664#endif
1666 start_pfn++;
1667 } 1665 }
1668} 1666}
1669 1667
@@ -1677,6 +1675,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1677 } 1675 }
1678} 1676}
1679 1677
1678#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1679void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1680 unsigned long size)
1681{
1682 unsigned long snum = pfn_to_section_nr(pfn);
1683 unsigned long end = pfn_to_section_nr(pfn + size);
1684
1685 if (FLAGS_HAS_NODE)
1686 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1687 else
1688 for (; snum <= end; snum++)
1689 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1690}
1691
1680#ifndef __HAVE_ARCH_MEMMAP_INIT 1692#ifndef __HAVE_ARCH_MEMMAP_INIT
1681#define memmap_init(size, nid, zone, start_pfn) \ 1693#define memmap_init(size, nid, zone, start_pfn) \
1682 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1694 memmap_init_zone((size), (nid), (zone), (start_pfn))
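
zone_table[] appears to be indexed either by (node, zone) when the node id is carried in page->flags, or by (section, zone) otherwise. A worked instance of the index computation, assuming ZONES_SHIFT is 2 purely for the arithmetic:

/* Illustrative only: mirrors ZONETABLE_INDEX() above with ZONES_SHIFT == 2. */
static inline unsigned long example_zonetable_index(unsigned long x, int zone_nr)
{
	return (x << 2) | zone_nr;	/* node 3, zone 1  ->  (3 << 2) | 1 == 13 */
}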
@@ -1742,10 +1754,17 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1742 * with interrupts disabled. 1754 * with interrupts disabled.
1743 * 1755 *
1744 * Some NUMA counter updates may also be caught by the boot pagesets. 1756 * Some NUMA counter updates may also be caught by the boot pagesets.
1745 * These will be discarded when bootup is complete. 1757 *
1758 * The boot_pagesets must be kept even after bootup is complete for
1759 * unused processors and/or zones. They do play a role for bootstrapping
1760 * hotplugged processors.
1761 *
1762 * zoneinfo_show() and maybe other functions do
1763 * not check if the processor is online before following the pageset pointer.
1764 * Other parts of the kernel may not check if the zone is available.
1746 */ 1765 */
1747static struct per_cpu_pageset 1766static struct per_cpu_pageset
1748 boot_pageset[NR_CPUS] __initdata; 1767 boot_pageset[NR_CPUS];
1749 1768
1750/* 1769/*
1751 * Dynamically allocate memory for the 1770 * Dynamically allocate memory for the
@@ -1841,7 +1860,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1841 unsigned long *zones_size, unsigned long *zholes_size) 1860 unsigned long *zones_size, unsigned long *zholes_size)
1842{ 1861{
1843 unsigned long i, j; 1862 unsigned long i, j;
1844 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
1845 int cpu, nid = pgdat->node_id; 1863 int cpu, nid = pgdat->node_id;
1846 unsigned long zone_start_pfn = pgdat->node_start_pfn; 1864 unsigned long zone_start_pfn = pgdat->node_start_pfn;
1847 1865
@@ -1854,7 +1872,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1854 unsigned long size, realsize; 1872 unsigned long size, realsize;
1855 unsigned long batch; 1873 unsigned long batch;
1856 1874
1857 zone_table[NODEZONE(nid, j)] = zone;
1858 realsize = size = zones_size[j]; 1875 realsize = size = zones_size[j];
1859 if (zholes_size) 1876 if (zholes_size)
1860 realsize -= zholes_size[j]; 1877 realsize -= zholes_size[j];
@@ -1915,11 +1932,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1915 zone->zone_mem_map = pfn_to_page(zone_start_pfn); 1932 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
1916 zone->zone_start_pfn = zone_start_pfn; 1933 zone->zone_start_pfn = zone_start_pfn;
1917 1934
1918 if ((zone_start_pfn) & (zone_required_alignment-1))
1919 printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
1920
1921 memmap_init(size, nid, j, zone_start_pfn); 1935 memmap_init(size, nid, j, zone_start_pfn);
1922 1936
1937 zonetable_add(zone, nid, j, zone_start_pfn, size);
1938
1923 zone_start_pfn += size; 1939 zone_start_pfn += size;
1924 1940
1925 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 1941 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1928,24 +1944,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1928 1944
1929static void __init alloc_node_mem_map(struct pglist_data *pgdat) 1945static void __init alloc_node_mem_map(struct pglist_data *pgdat)
1930{ 1946{
1931 unsigned long size;
1932
1933 /* Skip empty nodes */ 1947 /* Skip empty nodes */
1934 if (!pgdat->node_spanned_pages) 1948 if (!pgdat->node_spanned_pages)
1935 return; 1949 return;
1936 1950
1951#ifdef CONFIG_FLAT_NODE_MEM_MAP
1937 /* ia64 gets its own node_mem_map, before this, without bootmem */ 1952 /* ia64 gets its own node_mem_map, before this, without bootmem */
1938 if (!pgdat->node_mem_map) { 1953 if (!pgdat->node_mem_map) {
1954 unsigned long size;
1955 struct page *map;
1956
1939 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); 1957 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
1940 pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); 1958 map = alloc_remap(pgdat->node_id, size);
1959 if (!map)
1960 map = alloc_bootmem_node(pgdat, size);
1961 pgdat->node_mem_map = map;
1941 } 1962 }
1942#ifndef CONFIG_DISCONTIGMEM 1963#ifdef CONFIG_FLATMEM
1943 /* 1964 /*
1944 * With no DISCONTIG, the global mem_map is just set as node 0's 1965 * With no DISCONTIG, the global mem_map is just set as node 0's
1945 */ 1966 */
1946 if (pgdat == NODE_DATA(0)) 1967 if (pgdat == NODE_DATA(0))
1947 mem_map = NODE_DATA(0)->node_mem_map; 1968 mem_map = NODE_DATA(0)->node_mem_map;
1948#endif 1969#endif
1970#endif /* CONFIG_FLAT_NODE_MEM_MAP */
1949} 1971}
1950 1972
1951void __init free_area_init_node(int nid, struct pglist_data *pgdat, 1973void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1961,18 +1983,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
1961 free_area_init_core(pgdat, zones_size, zholes_size); 1983 free_area_init_core(pgdat, zones_size, zholes_size);
1962} 1984}
1963 1985
1964#ifndef CONFIG_DISCONTIGMEM 1986#ifndef CONFIG_NEED_MULTIPLE_NODES
1965static bootmem_data_t contig_bootmem_data; 1987static bootmem_data_t contig_bootmem_data;
1966struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 1988struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
1967 1989
1968EXPORT_SYMBOL(contig_page_data); 1990EXPORT_SYMBOL(contig_page_data);
1991#endif
1969 1992
1970void __init free_area_init(unsigned long *zones_size) 1993void __init free_area_init(unsigned long *zones_size)
1971{ 1994{
1972 free_area_init_node(0, &contig_page_data, zones_size, 1995 free_area_init_node(0, NODE_DATA(0), zones_size,
1973 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 1996 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
1974} 1997}
1975#endif
1976 1998
1977#ifdef CONFIG_PROC_FS 1999#ifdef CONFIG_PROC_FS
1978 2000
diff --git a/mm/page_io.c b/mm/page_io.c
index 667c76df1ec2..2e605a19ce57 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -127,7 +127,7 @@ out:
127 return ret; 127 return ret;
128} 128}
129 129
130#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK) 130#ifdef CONFIG_SOFTWARE_SUSPEND
131/* 131/*
132 * A scruffy utility function to read or write an arbitrary swap page 132 * A scruffy utility function to read or write an arbitrary swap page
133 * and wait on the I/O. The caller must have a ref on the page. 133 * and wait on the I/O. The caller must have a ref on the page.
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
105 spin_unlock_irq(&pdflush_lock); 105 spin_unlock_irq(&pdflush_lock);
106 106
107 schedule(); 107 schedule();
108 if (try_to_freeze(PF_FREEZE)) { 108 if (try_to_freeze()) {
109 spin_lock_irq(&pdflush_lock); 109 spin_lock_irq(&pdflush_lock);
110 continue; 110 continue;
111 } 111 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 89770bd25f31..08ac5c7fa91f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,8 +247,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
247 * 247 *
248 * On success returns with mapped pte and locked mm->page_table_lock. 248 * On success returns with mapped pte and locked mm->page_table_lock.
249 */ 249 */
250static pte_t *page_check_address(struct page *page, struct mm_struct *mm, 250pte_t *page_check_address(struct page *page, struct mm_struct *mm,
251 unsigned long address) 251 unsigned long address)
252{ 252{
253 pgd_t *pgd; 253 pgd_t *pgd;
254 pud_t *pud; 254 pud_t *pud;
diff --git a/mm/shmem.c b/mm/shmem.c
index e64fa726a790..5a81b1ee4f7a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1773,32 +1773,27 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1773 return 0; 1773 return 0;
1774} 1774}
1775 1775
1776static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1776static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1777{ 1777{
1778 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); 1778 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1779 return 0; 1779 return NULL;
1780} 1780}
1781 1781
1782static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1782static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1783{ 1783{
1784 struct page *page = NULL; 1784 struct page *page = NULL;
1785 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1785 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1786 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1786 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1787 return 0; 1787 return page;
1788} 1788}
1789 1789
1790static void shmem_put_link(struct dentry *dentry, struct nameidata *nd) 1790static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1791{ 1791{
1792 if (!IS_ERR(nd_get_link(nd))) { 1792 if (!IS_ERR(nd_get_link(nd))) {
1793 struct page *page; 1793 struct page *page = cookie;
1794
1795 page = find_get_page(dentry->d_inode->i_mapping, 0);
1796 if (!page)
1797 BUG();
1798 kunmap(page); 1794 kunmap(page);
1799 mark_page_accessed(page); 1795 mark_page_accessed(page);
1800 page_cache_release(page); 1796 page_cache_release(page);
1801 page_cache_release(page);
1802 } 1797 }
1803} 1798}
1804 1799
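The shmem hunk above tracks a VFS interface change: ->follow_link() now returns an opaque cookie that the VFS hands back to ->put_link(), so the page kmap()ed while resolving the link no longer has to be looked up a second time. A minimal sketch of the pattern as a filesystem might use it, omitting the inode_operations wiring (the examplefs_* names and the get_link_page() helper are hypothetical):

static void *examplefs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
        struct page *page = NULL;
        int res = examplefs_get_link_page(dentry->d_inode, &page); /* hypothetical helper */

        nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
        return page;            /* handed back to ->put_link() as the cookie */
}

static void examplefs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
        if (!IS_ERR(nd_get_link(nd))) {
                struct page *page = cookie;  /* no find_get_page() round trip needed */

                kunmap(page);
                mark_page_accessed(page);
                page_cache_release(page);
        }
}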
diff --git a/mm/slab.c b/mm/slab.c
index 93cbbbb39f42..c9e706db4634 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -92,6 +92,7 @@
92#include <linux/sysctl.h> 92#include <linux/sysctl.h>
93#include <linux/module.h> 93#include <linux/module.h>
94#include <linux/rcupdate.h> 94#include <linux/rcupdate.h>
95#include <linux/string.h>
95 96
96#include <asm/uaccess.h> 97#include <asm/uaccess.h>
97#include <asm/cacheflush.h> 98#include <asm/cacheflush.h>
@@ -583,7 +584,8 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
583 return cachep->array[smp_processor_id()]; 584 return cachep->array[smp_processor_id()];
584} 585}
585 586
586static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) 587static inline kmem_cache_t *__find_general_cachep(size_t size,
588 unsigned int __nocast gfpflags)
587{ 589{
588 struct cache_sizes *csizep = malloc_sizes; 590 struct cache_sizes *csizep = malloc_sizes;
589 591
@@ -607,7 +609,8 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags)
607 return csizep->cs_cachep; 609 return csizep->cs_cachep;
608} 610}
609 611
610kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) 612kmem_cache_t *kmem_find_general_cachep(size_t size,
613 unsigned int __nocast gfpflags)
611{ 614{
612 return __find_general_cachep(size, gfpflags); 615 return __find_general_cachep(size, gfpflags);
613} 616}
@@ -2099,7 +2102,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
2099#if DEBUG 2102#if DEBUG
2100static void * 2103static void *
2101cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2104cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2102 unsigned long flags, void *objp, void *caller) 2105 unsigned int __nocast flags, void *objp, void *caller)
2103{ 2106{
2104 if (!objp) 2107 if (!objp)
2105 return objp; 2108 return objp;
@@ -2371,6 +2374,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2371 struct slab *slabp; 2374 struct slab *slabp;
2372 kmem_bufctl_t next; 2375 kmem_bufctl_t next;
2373 2376
2377 if (nodeid == -1)
2378 return kmem_cache_alloc(cachep, flags);
2379
2374 for (loop = 0;;loop++) { 2380 for (loop = 0;;loop++) {
2375 struct list_head *q; 2381 struct list_head *q;
2376 2382
@@ -2438,7 +2444,7 @@ got_slabp:
2438} 2444}
2439EXPORT_SYMBOL(kmem_cache_alloc_node); 2445EXPORT_SYMBOL(kmem_cache_alloc_node);
2440 2446
2441void *kmalloc_node(size_t size, int flags, int node) 2447void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
2442{ 2448{
2443 kmem_cache_t *cachep; 2449 kmem_cache_t *cachep;
2444 2450
@@ -3082,3 +3088,26 @@ unsigned int ksize(const void *objp)
3082 3088
3083 return size; 3089 return size;
3084} 3090}
3091
3092
3093/*
3094 * kstrdup - allocate space for and copy an existing string
3095 *
3096 * @s: the string to duplicate
3097 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3098 */
3099char *kstrdup(const char *s, unsigned int __nocast gfp)
3100{
3101 size_t len;
3102 char *buf;
3103
3104 if (!s)
3105 return NULL;
3106
3107 len = strlen(s) + 1;
3108 buf = kmalloc(len, gfp);
3109 if (buf)
3110 memcpy(buf, s, len);
3111 return buf;
3112}
3113EXPORT_SYMBOL(kstrdup);
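The new kstrdup() pairs naturally with kfree(); a trivial usage sketch (example_item and its field are made up for illustration):

struct example_item {
        char *name;              /* hypothetical structure, not part of this patch */
};

static int example_item_set_name(struct example_item *item, const char *name)
{
        char *copy = kstrdup(name, GFP_KERNEL);

        if (!copy)
                return -ENOMEM;
        kfree(item->name);       /* drop any previous copy; kfree(NULL) is a no-op */
        item->name = copy;
        return 0;
}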
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..b54e304df4a7
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,137 @@
1/*
2 * sparse memory mappings.
3 */
4#include <linux/config.h>
5#include <linux/mm.h>
6#include <linux/mmzone.h>
7#include <linux/bootmem.h>
8#include <linux/module.h>
9#include <asm/dma.h>
10
11/*
12 * Permanent SPARSEMEM data:
13 *
14 * 1) mem_section - memory sections, mem_map's for valid memory
15 */
16struct mem_section mem_section[NR_MEM_SECTIONS];
17EXPORT_SYMBOL(mem_section);
18
19/* Record a memory area against a node. */
20void memory_present(int nid, unsigned long start, unsigned long end)
21{
22 unsigned long pfn;
23
24 start &= PAGE_SECTION_MASK;
25 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
26 unsigned long section = pfn_to_section_nr(pfn);
27 if (!mem_section[section].section_mem_map)
28 mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
29 }
30}
31
32/*
33 * Only used by the i386 NUMA architectures, but relatively
34 * generic code.
35 */
36unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
37 unsigned long end_pfn)
38{
39 unsigned long pfn;
40 unsigned long nr_pages = 0;
41
42 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
43 if (nid != early_pfn_to_nid(pfn))
44 continue;
45
46 if (pfn_valid(pfn))
47 nr_pages += PAGES_PER_SECTION;
48 }
49
50 return nr_pages * sizeof(struct page);
51}
52
53/*
54 * Subtle: we encode the section's starting pfn into its mem_map pointer,
55 * so that the identity page - section_mem_map returns the page's actual
56 * physical page frame number.
57 */
58static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
59{
60 return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
61}
62
63/*
64 * We need this if we ever free the mem_maps. While not implemented yet,
65 * this function is included for parity with its sibling.
66 */
67static __attribute((unused))
68struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
69{
70 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
71}
72
73static int sparse_init_one_section(struct mem_section *ms,
74 unsigned long pnum, struct page *mem_map)
75{
76 if (!valid_section(ms))
77 return -EINVAL;
78
79 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
80
81 return 1;
82}
83
84static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
85{
86 struct page *map;
87 int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
88
89 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
90 if (map)
91 return map;
92
93 map = alloc_bootmem_node(NODE_DATA(nid),
94 sizeof(struct page) * PAGES_PER_SECTION);
95 if (map)
96 return map;
97
98 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
99 mem_section[pnum].section_mem_map = 0;
100 return NULL;
101}
102
103/*
104 * Allocate the accumulated non-linear sections, allocate a mem_map
105 * for each and record the physical to section mapping.
106 */
107void sparse_init(void)
108{
109 unsigned long pnum;
110 struct page *map;
111
112 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
113 if (!valid_section_nr(pnum))
114 continue;
115
116 map = sparse_early_mem_map_alloc(pnum);
117 if (map)
118 sparse_init_one_section(&mem_section[pnum], pnum, map);
119 }
120}
121
122/*
123 * Returns the number of sections whose mem_maps were properly
124 * set.  If this is <= 0, the passed-in map was not consumed and
125 * must be freed.
126 */
127int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
128{
129 struct mem_section *ms = __pfn_to_section(start_pfn);
130
131 if (ms->section_mem_map & SECTION_MARKED_PRESENT)
132 return -EEXIST;
133
134 ms->section_mem_map |= SECTION_MARKED_PRESENT;
135
136 return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
137}
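sparse_encode_mem_map() stores mem_map biased back by the section's first pfn, so pfn_to_page() collapses to a single addition and the pointer difference page - section_mem_map recovers the pfn. A user-space sketch of the same arithmetic (PAGES_PER_SECTION == 4 and the section number are made-up values; the kernel keeps the biased pointer as an unsigned long inside section_mem_map):

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION 4UL            /* illustrative value only */

struct page { int unused; };

int main(void)
{
        struct page mem_map[PAGES_PER_SECTION];          /* this section's mem_map */
        unsigned long pnum = 10;                         /* pretend section number */
        unsigned long start_pfn = pnum * PAGES_PER_SECTION;

        /* encode: bias the pointer back by the section's first pfn ... */
        struct page *coded = mem_map - start_pfn;

        /* ... so a lookup is coded + pfn, and page - coded gives back the pfn. */
        for (unsigned long pfn = start_pfn; pfn < start_pfn + PAGES_PER_SECTION; pfn++) {
                struct page *page = coded + pfn;

                assert(page == &mem_map[pfn - start_pfn]);
                assert((unsigned long)(page - coded) == pfn);
        }
        printf("ok\n");
        return 0;
}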
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b8e62a19370..cfffe5098d53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -972,7 +972,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
972 * writeout. So in laptop mode, write out the whole world. 972 * writeout. So in laptop mode, write out the whole world.
973 */ 973 */
974 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 974 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
975 wakeup_bdflush(laptop_mode ? 0 : total_scanned); 975 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
976 sc.may_writepage = 1; 976 sc.may_writepage = 1;
977 } 977 }
978 978
@@ -1216,8 +1216,8 @@ static int kswapd(void *p)
1216 order = 0; 1216 order = 0;
1217 for ( ; ; ) { 1217 for ( ; ; ) {
1218 unsigned long new_order; 1218 unsigned long new_order;
1219 if (current->flags & PF_FREEZE) 1219
1220 refrigerator(PF_FREEZE); 1220 try_to_freeze();
1221 1221
1222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1222 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1223 new_order = pgdat->kswapd_max_order; 1223 new_order = pgdat->kswapd_max_order;