Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             4
-rw-r--r--  mm/backing-dev.c      16
-rw-r--r--  mm/filemap.c          26
-rw-r--r--  mm/filemap_xip.c      22
-rw-r--r--  mm/hugetlb.c          24
-rw-r--r--  mm/madvise.c           6
-rw-r--r--  mm/memory.c           25
-rw-r--r--  mm/mempolicy.c        51
-rw-r--r--  mm/mempool.c           3
-rw-r--r--  mm/mlock.c             5
-rw-r--r--  mm/mmap.c             38
-rw-r--r--  mm/mremap.c           13
-rw-r--r--  mm/nommu.c             7
-rw-r--r--  mm/page-writeback.c   10
-rw-r--r--  mm/page_alloc.c      332
-rw-r--r--  mm/rmap.c             24
-rw-r--r--  mm/shmem.c            44
-rw-r--r--  mm/slab.c             67
-rw-r--r--  mm/slob.c            538
-rw-r--r--  mm/slub.c            142
-rw-r--r--  mm/sparse.c           42
-rw-r--r--  mm/swap_state.c        2
-rw-r--r--  mm/swapfile.c          2
-rw-r--r--  mm/truncate.c         42
-rw-r--r--  mm/vmstat.c            2
25 files changed, 1009 insertions, 478 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 62e5d0d0bd5a..086af703da43 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -168,3 +168,7 @@ config NR_QUICK
 	depends on QUICKLIST
 	default "2" if (SUPERH && !SUPERH64)
 	default "1"
+
+config VIRT_TO_BUS
+	def_bool y
+	depends on !ARCH_NO_VIRT_TO_BUS
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3fe..f50a2811f9dc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-long congestion_wait_interruptible(int rw, long timeout)
-{
-	long ret;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
-	if (signal_pending(current))
-		ret = -ERESTARTSYS;
-	else
-		ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
-	return ret;
-}
-EXPORT_SYMBOL(congestion_wait_interruptible);
-
 /**
  * congestion_end - wake up sleepers on a congested backing_dev_info
  * @rw: READ or WRITE
diff --git a/mm/filemap.c b/mm/filemap.c
index edb1b0b5cc8d..100b99c2d504 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page)
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
+	BUG_ON(page_mapped(page));
 }
 
 void remove_from_page_cache(struct page *page)
@@ -1218,6 +1219,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 				retval = retval ?: desc.error;
 				break;
 			}
+			if (desc.count > 0)
+				break;
 		}
 	}
 out:
@@ -1245,26 +1248,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o
 	return written;
 }
 
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
-			 size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_generic_file_read(in_file, ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
 static ssize_t
 do_readahead(struct address_space *mapping, struct file *filp,
 	     unsigned long index, unsigned long nr)
@@ -1786,7 +1769,6 @@ retry:
 	page = __read_cache_page(mapping, index, filler, data);
 	if (IS_ERR(page))
 		return page;
-	mark_page_accessed(page);
 	if (PageUptodate(page))
 		goto out;
 
@@ -1985,7 +1967,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	if (unlikely(*pos + *count > MAX_NON_LFS &&
 				!(file->f_flags & O_LARGEFILE))) {
 		if (*pos >= MAX_NON_LFS) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
@@ -2003,7 +1984,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	if (likely(!isblk)) {
 		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
 			if (*count || *pos > inode->i_sb->s_maxbytes) {
-				send_sig(SIGXFSZ, current, 0);
 				return -EFBIG;
 			}
 			/* zero-length writes at ->s_maxbytes are OK */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index fa360e566d88..65ffc321f0c0 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -159,28 +159,6 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 }
 EXPORT_SYMBOL_GPL(xip_file_read);
 
-ssize_t
-xip_file_sendfile(struct file *in_file, loff_t *ppos,
-	     size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
-			    ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL_GPL(xip_file_sendfile);
-
 /*
  * __xip_unmap is invoked from xip_unmap and
  * xip_write
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eb7180db3033..acc0fb3cf067 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,7 +66,7 @@ static void enqueue_huge_page(struct page *page)
 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 				unsigned long address)
 {
-	int nid = numa_node_id();
+	int nid;
 	struct page *page = NULL;
 	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
@@ -101,13 +101,20 @@ static void free_huge_page(struct page *page)
 
 static int alloc_fresh_huge_page(void)
 {
-	static int nid = 0;
+	static int prev_nid;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
-			HUGETLB_PAGE_ORDER);
-	nid = next_node(nid, node_online_map);
+	static DEFINE_SPINLOCK(nid_lock);
+	int nid;
+
+	spin_lock(&nid_lock);
+	nid = next_node(prev_nid, node_online_map);
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
+	prev_nid = nid;
+	spin_unlock(&nid_lock);
+
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+			HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
@@ -326,9 +333,10 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	pte_t entry;
 
 	entry = pte_mkwrite(pte_mkdirty(*ptep));
-	ptep_set_access_flags(vma, address, ptep, entry, 1);
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+	}
 }
 
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 60542d006ec1..93ee375b38e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 	struct vm_area_struct * vma, *prev;
 	int unmapped_error = 0;
 	int error = -EINVAL;
+	int write;
 	size_t len;
 
-	if (madvise_need_mmap_write(behavior))
+	write = madvise_need_mmap_write(behavior);
+	if (write)
 		down_write(&current->mm->mmap_sem);
 	else
 		down_read(&current->mm->mmap_sem);
@@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 		vma = find_vma(current->mm, start);
 	}
 out:
-	if (madvise_need_mmap_write(behavior))
+	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
 		up_read(&current->mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index cb94488ab96d..b3d73bb1f680 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
@@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	do {
 		struct page *page;
 
+		/*
+		 * If tsk is ooming, cut off its access to large memory
+		 * allocations. It has a pending SIGKILL, but it can't
+		 * be processed until returning to user space.
+		 */
+		if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+			return -ENOMEM;
+
 		if (write)
 			foll_flags |= FOLL_WRITE;
 
@@ -1691,9 +1697,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		ptep_set_access_flags(vma, address, page_table, entry, 1);
-		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
+		if (ptep_set_access_flags(vma, address, page_table, entry,1)) {
+			update_mmu_cache(vma, address, entry);
+			lazy_mmu_prot_update(entry);
+		}
 		ret |= VM_FAULT_WRITE;
 		goto unlock;
 	}
@@ -2525,10 +2532,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
-	pte_t old_entry;
 	spinlock_t *ptl;
 
-	old_entry = entry = *pte;
+	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
@@ -2561,8 +2567,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	if (!pte_same(old_entry, entry)) {
-		ptep_set_access_flags(vma, address, pte, entry, write_access);
+	if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 	} else {
@@ -2674,7 +2679,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	write = (vma->vm_flags & VM_WRITE) != 0;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
-	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
 	ret = get_user_pages(current, current->mm, addr,
 			len, write, 0, NULL, NULL);
 	if (ret < 0)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d76e8eb342d0..188f8d9c4aed 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -101,8 +101,6 @@
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
-#define PDprintk(fmt...)
-
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
 enum zone_type policy_zone = 0;
@@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+	pr_debug("setting mode %d nodes[0] %lx\n",
+		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 	int err = 0;
 	struct mempolicy *old = vma->vm_policy;
 
-	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
@@ -776,8 +776,8 @@ long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-		mode,nodes_addr(nodes)[0]);
+	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
 	vma = check_range(mm, start, end, nmask,
@@ -1434,7 +1434,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
 	}
 	rb_link_node(&new->nd, parent, p);
 	rb_insert_color(&new->nd, &sp->root);
-	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
 		 new->policy ? new->policy->policy : 0);
 }
 
@@ -1459,7 +1459,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
-	PDprintk("deleting %lx-l%x\n", n->start, n->end);
+	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
 	mpol_free(n->policy);
 	kmem_cache_free(sn_cache, n);
@@ -1558,10 +1558,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1597,6 +1597,10 @@ void mpol_free_shared_policy(struct shared_policy *p)
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
+	nodemask_t interleave_nodes;
+	unsigned long largest = 0;
+	int nid, prefer = 0;
+
 	policy_cache = kmem_cache_create("numa_policy",
 					 sizeof(struct mempolicy),
 					 0, SLAB_PANIC, NULL, NULL);
@@ -1605,10 +1609,31 @@ void __init numa_policy_init(void)
 				     sizeof(struct sp_node),
 				     0, SLAB_PANIC, NULL, NULL);
 
-	/* Set interleaving policy for system init. This way not all
-	   the data structures allocated at system boot end up in node zero. */
+	/*
+	 * Set interleaving policy for system init. Interleaving is only
+	 * enabled across suitably sized nodes (default is >= 16MB), or
+	 * fall back to the largest node if they're all smaller.
+	 */
+	nodes_clear(interleave_nodes);
+	for_each_online_node(nid) {
+		unsigned long total_pages = node_present_pages(nid);
+
+		/* Preserve the largest node */
+		if (largest < total_pages) {
+			largest = total_pages;
+			prefer = nid;
+		}
+
+		/* Interleave this node? */
+		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+			node_set(nid, interleave_nodes);
+	}
+
+	/* All too small, use the largest */
+	if (unlikely(nodes_empty(interleave_nodes)))
+		node_set(prefer, interleave_nodes);
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
diff --git a/mm/mempool.c b/mm/mempool.c
index cc1ca86dfc24..3e8f1fed0e1f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -263,6 +263,9 @@ void mempool_free(void *element, mempool_t *pool)
 {
 	unsigned long flags;
 
+	if (unlikely(element == NULL))
+		return;
+
 	smp_mb();
 	if (pool->curr_nr < pool->min_nr) {
 		spin_lock_irqsave(&pool->lock, flags);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4d3fea267e0d..7b2656055d6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user)
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	if (lock_limit == RLIM_INFINITY)
+		allowed = 1;
 	lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+	if (!allowed &&
+	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
 		goto out;
 	get_uid(user);
 	user->locked_shm += locked;
diff --git a/mm/mmap.c b/mm/mmap.c
index 68b9ad2ef1d6..144b4a290f2c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			unsigned long flags, unsigned long pgoff)
 {
 	struct mm_struct * mm = current->mm;
-	struct vm_area_struct * vma, * prev;
 	struct inode *inode;
 	unsigned int vm_flags;
-	int correct_wcount = 0;
 	int error;
-	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
-	unsigned long charged = 0, reqprot = prot;
+	unsigned long reqprot = prot;
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		}
 	}
 
-	error = security_file_mmap(file, reqprot, prot, flags);
+	error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (error)
 		return error;
 
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+			   accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+			  unsigned long len, unsigned long flags,
+			  unsigned int vm_flags, unsigned long pgoff,
+			  int accountable)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	int correct_wcount = 0;
+	int error;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long charged = 0;
+	struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
@@ -1175,8 +1190,6 @@ unacct_error:
 	return error;
 }
 
-EXPORT_SYMBOL(do_mmap_pgoff);
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
@@ -1536,9 +1549,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode. We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 * Also guard against wrapping around to address 0.
 	 */
-	address += 4 + PAGE_SIZE - 1;
-	address &= PAGE_MASK;
+	if (address < PAGE_ALIGN(address+4))
+		address = PAGE_ALIGN(address+4);
+	else {
+		anon_vma_unlock(vma);
+		return -ENOMEM;
+	}
 	error = 0;
 
 	/* Somebody else might have raced and expanded it already */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5d4bd4f95b8e..bc7c52efc71b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
+		ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+		if (ret)
+			goto out;
+
 		ret = do_munmap(mm, new_addr, new_len);
 		if (ret)
 			goto out;
@@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr,
 
 			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
 						vma->vm_pgoff, map_flags);
-			ret = new_addr;
-			if (new_addr & ~PAGE_MASK)
+			if (new_addr & ~PAGE_MASK) {
+				ret = new_addr;
+				goto out;
+			}
+
+			ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+			if (ret)
 				goto out;
 		}
 		ret = move_vma(vma, addr, old_len, new_len, new_addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b16b00a5b11..8bbbf147a794 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 	return find_vma(mm, addr);
 }
 
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return -ENOMEM;
+}
+
 /*
  * look up the first VMA exactly that exactly matches addr
  * - should be called with mm->mmap_sem at least held readlocked
@@ -639,7 +644,7 @@ static int validate_mmap_request(struct file *file,
 	}
 
 	/* allow the security API to have its say */
-	ret = security_file_mmap(file, reqprot, prot, flags);
+	ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (ret < 0)
 		return ret;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eec1481ba44f..ea9da3bed3e9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg)
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_interval) {
-		mod_timer(&wb_timer,
-			jiffies + dirty_writeback_interval);
-	} else {
+	if (dirty_writeback_interval)
+		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	else
 		del_timer(&wb_timer);
-	}
 	return 0;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd8e33582d25..f9e4e647d7e8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,13 @@ static unsigned long __meminitdata dma_reserve;
 #endif
 #endif
 
-  struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
-  int __meminitdata nr_nodemap_entries;
-  unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+  static int __meminitdata nr_nodemap_entries;
+  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
-  unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
+  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
@@ -900,11 +900,13 @@ static struct fail_page_alloc_attr {
 
 	u32 ignore_gfp_highmem;
 	u32 ignore_gfp_wait;
+	u32 min_order;
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 
 	struct dentry *ignore_gfp_highmem_file;
 	struct dentry *ignore_gfp_wait_file;
+	struct dentry *min_order_file;
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 
@@ -912,6 +914,7 @@ static struct fail_page_alloc_attr {
 	.attr = FAULT_ATTR_INITIALIZER,
 	.ignore_gfp_wait = 1,
 	.ignore_gfp_highmem = 1,
+	.min_order = 1,
 };
 
 static int __init setup_fail_page_alloc(char *str)
@@ -922,6 +925,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
 
 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
+	if (order < fail_page_alloc.min_order)
+		return 0;
 	if (gfp_mask & __GFP_NOFAIL)
 		return 0;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
@@ -953,12 +958,17 @@ static int __init fail_page_alloc_debugfs(void)
 	fail_page_alloc.ignore_gfp_highmem_file =
 		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
 				      &fail_page_alloc.ignore_gfp_highmem);
+	fail_page_alloc.min_order_file =
+		debugfs_create_u32("min-order", mode, dir,
+				   &fail_page_alloc.min_order);
 
 	if (!fail_page_alloc.ignore_gfp_wait_file ||
-	    !fail_page_alloc.ignore_gfp_highmem_file) {
+	    !fail_page_alloc.ignore_gfp_highmem_file ||
+	    !fail_page_alloc.min_order_file) {
 		err = -ENOMEM;
 		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
 		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+		debugfs_remove(fail_page_alloc.min_order_file);
 		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
 	}
 
@@ -1621,8 +1631,8 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
-	struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+				int nr_zones, enum zone_type zone_type)
 {
 	struct zone *zone;
 
@@ -1641,9 +1651,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 	return nr_zones;
 }
 
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT 0
+#define ZONELIST_ORDER_NODE 1
+#define ZONELIST_ORDER_ZONE 2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
 #ifdef CONFIG_NUMA
+/* The value user specified ....changed by config */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN 16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configure zonelist ordering.
+ * command line option "numa_zonelist_order"
+ *	= "[dD]efault	- default, automatic configuration.
+ *	= "[nN]ode	- order by node locality, then by zone within node
+ *	= "[zZ]one	- order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+	if (*s == 'd' || *s == 'D') {
+		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+	} else if (*s == 'n' || *s == 'N') {
+		user_zonelist_order = ZONELIST_ORDER_NODE;
+	} else if (*s == 'z' || *s == 'Z') {
+		user_zonelist_order = ZONELIST_ORDER_ZONE;
+	} else {
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value: "
+			"%s\n", s);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+	if (s)
+		return __parse_numa_zonelist_order(s);
+	return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	int ret;
+
+	if (write)
+		strncpy(saved_string, (char*)table->data,
+			NUMA_ZONELIST_ORDER_LEN);
+	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		int oldval = user_zonelist_order;
+		if (__parse_numa_zonelist_order((char*)table->data)) {
+			/*
+			 * bogus value. restore saved string
+			 */
+			strncpy((char*)table->data, saved_string,
+				NUMA_ZONELIST_ORDER_LEN);
+			user_zonelist_order = oldval;
+		} else if (oldval != user_zonelist_order)
+			build_all_zonelists();
+	}
+	return 0;
+}
+
+
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
+
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1658,7 +1761,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1704,13 +1807,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	int j, node, local_node;
 	enum zone_type i;
-	int prev_node, load;
+	int j;
 	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		for (j = 0; zonelist->zones[j] != NULL; j++)
+			;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	enum zone_type i;
+	int pos, j, node;
+	int zone_type;		/* needs to be signed */
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < nr_nodes; j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z)) {
+					zonelist->zones[pos++] = z;
+					check_highest_zone(zone_type);
+				}
+			}
+		}
+		zonelist->zones[pos] = NULL;
+	}
+}
+
+static int default_zonelist_order(void)
+{
+	int nid, zone_type;
+	unsigned long low_kmem_size,total_size;
+	struct zone *z;
+	int average_size;
+	/*
+	 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+	 * If they are really small and used heavily, the system can fall
+	 * into OOM very easily.
+	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 */
+	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+	low_kmem_size = 0;
+	total_size = 0;
+	for_each_online_node(nid) {
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+	}
+	if (!low_kmem_size ||  /* there are no DMA area. */
+	    low_kmem_size > total_size/2)  /* DMA/DMA32 is big. */
+		return ZONELIST_ORDER_NODE;
+	/*
+	 * look into each node's config.
+	 * If there is a node whose DMA/DMA32 memory is very big area on
+	 * local memory, NODE_ORDER may be suitable.
+	 */
+	average_size = total_size / (num_online_nodes() + 1);
+	for_each_online_node(nid) {
+		low_kmem_size = 0;
+		total_size = 0;
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+		if (low_kmem_size &&
+		    total_size > average_size && /* ignore small node */
+		    low_kmem_size > total_size * 70/100)
+			return ZONELIST_ORDER_NODE;
+	}
+	return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+		current_zonelist_order = default_zonelist_order();
+	else
+		current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+	int j, node, load;
+	enum zone_type i;
 	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zonelist *zonelist;
+	int order = current_zonelist_order;
 
 	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1942,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+
+	memset(node_load, 0, sizeof(node_load));
+	memset(node_order, 0, sizeof(node_order));
+	j = 0;
+
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
 
@@ -1738,23 +1962,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-
 		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] = load;
+
 		prev_node = node;
 		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
+		if (order == ZONELIST_ORDER_NODE)
+			build_zonelists_in_node_order(pgdat, node);
+		else
+			node_order[j++] = node;	/* remember order */
+	}
 
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
-		}
+	if (order == ZONELIST_ORDER_ZONE) {
+		/* calculate node order -- i.e., DMA last! */
+		build_zonelists_in_zone_order(pgdat, j);
 	}
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1771,9 +1997,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 	}
 }
 
+
 #else /* CONFIG_NUMA */
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+	current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
 	enum zone_type i,j;
@@ -1809,7 +2041,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1820,7 +2052,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
@@ -1831,8 +2063,10 @@ static int __meminit __build_all_zonelists(void *dummy)
 	return 0;
 }
 
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
 {
+	set_zonelist_order();
+
 	if (system_state == SYSTEM_BOOTING) {
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
@@ -1843,8 +2077,13 @@ void __meminit build_all_zonelists(void)
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
-	printk("Built %i zonelists. Total pages: %ld\n",
-			num_online_nodes(), vm_total_pages);
+	printk("Built %i zonelists in %s order. Total pages: %ld\n",
+			num_online_nodes(),
+			zonelist_order_name[current_zonelist_order],
+			vm_total_pages);
+#ifdef CONFIG_NUMA
+	printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
 }
 
 /*
@@ -1953,8 +2192,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	}
 }
 
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
-		unsigned long size)
+static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
+				struct zone *zone, unsigned long size)
 {
 	int order;
 	for (order = 0; order < MAX_ORDER ; order++) {
@@ -1968,7 +2207,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int __cpuinit zone_batchsize(struct zone *zone)
+static int __devinit zone_batchsize(struct zone *zone)
 {
 	int batch;
 
@@ -2370,7 +2609,7 @@ void __init push_node_boundaries(unsigned int nid,
 }
 
 /* If necessary, push the node boundary out for reserve hotadd */
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
 		unsigned long *start_pfn, unsigned long *end_pfn)
 {
 	printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
@@ -2390,7 +2629,7 @@ static void __init account_node_boundary(unsigned int nid,
 void __init push_node_boundaries(unsigned int nid,
 			unsigned long start_pfn, unsigned long end_pfn) {}
 
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn) {}
 #endif
 
@@ -2431,7 +2670,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
  * Return the number of pages a zone spans in a node, including holes
  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
  */
-unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *ignored)
 {
@@ -2519,7 +2758,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 }
 
 /* Return the number of page frames in holes in a zone on a node */
-unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *ignored)
 {
@@ -2536,14 +2775,14 @@ unsigned long __meminit zone_absent_pages_in_node(int nid,
 }
 
 #else
-static inline unsigned long zone_spanned_pages_in_node(int nid,
+static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *zones_size)
 {
 	return zones_size[zone_type];
 }
 
-static inline unsigned long zone_absent_pages_in_node(int nid,
+static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *zholes_size)
 {
@@ -3355,13 +3594,28 @@ void *__init alloc_large_system_hash(const char *tablename,
 		for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
 			;
 		table = (void*) __get_free_pages(GFP_ATOMIC, order);
+		/*
+		 * If bucketsize is not a power-of-two, we may free
+		 * some pages at the end of hash table.
+		 */
+		if (table) {
+			unsigned long alloc_end = (unsigned long)table +
+					(PAGE_SIZE << order);
+			unsigned long used = (unsigned long)table +
+					PAGE_ALIGN(size);
+			split_page(virt_to_page(table), order);
+			while (used < alloc_end) {
+				free_page(used);
+				used += PAGE_SIZE;
+			}
+		}
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
 
 	if (!table)
 		panic("Failed to allocate %s hash table\n", tablename);
 
-	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+	printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
 	       tablename,
 	       (1U << log2qty),
 	       ilog2(size) - PAGE_SHIFT,
diff --git a/mm/rmap.c b/mm/rmap.c
index 850165d32b7a..61e492597a0b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@
 
 struct kmem_cache *anon_vma_cachep;
 
-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
-	struct anon_vma *anon_vma = find_vma->anon_vma;
-	struct vm_area_struct *vma;
-	unsigned int mapcount = 0;
-	int found = 0;
-
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		mapcount++;
-		BUG_ON(mapcount > 100000);
-		if (vma == find_vma)
-			found = 1;
-	}
-	BUG_ON(!found);
-#endif
-}
-
 /* This must be called under the mmap_sem. */
 int anon_vma_prepare(struct vm_area_struct *vma)
 {
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 
-	if (anon_vma) {
+	if (anon_vma)
 		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		validate_anon_vma(vma);
-	}
 }
 
 void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
 	if (anon_vma) {
 		spin_lock(&anon_vma->lock);
 		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		validate_anon_vma(vma);
 		spin_unlock(&anon_vma->lock);
 	}
 }
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		return;
 
 	spin_lock(&anon_vma->lock);
-	validate_anon_vma(vma);
 	list_del(&vma->anon_vma_node);
 
 	/* We must garbage collect the anon_vma if it's empty */
diff --git a/mm/shmem.c b/mm/shmem.c
index e537317bec4d..0493e4d0bcaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -967,6 +967,8 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
967 *nodelist++ = '\0'; 967 *nodelist++ = '\0';
968 if (nodelist_parse(nodelist, *policy_nodes)) 968 if (nodelist_parse(nodelist, *policy_nodes))
969 goto out; 969 goto out;
970 if (!nodes_subset(*policy_nodes, node_online_map))
971 goto out;
970 } 972 }
971 if (!strcmp(value, "default")) { 973 if (!strcmp(value, "default")) {
972 *policy = MPOL_DEFAULT; 974 *policy = MPOL_DEFAULT;
@@ -1098,9 +1100,9 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1098 * Normally, filepage is NULL on entry, and either found 1100 * Normally, filepage is NULL on entry, and either found
1099 * uptodate immediately, or allocated and zeroed, or read 1101 * uptodate immediately, or allocated and zeroed, or read
1100 * in under swappage, which is then assigned to filepage. 1102 * in under swappage, which is then assigned to filepage.
1101 * But shmem_prepare_write passes in a locked filepage, 1103 * But shmem_readpage and shmem_prepare_write pass in a locked
1102 * which may be found not uptodate by other callers too, 1104 * filepage, which may be found not uptodate by other callers
1103 * and may need to be copied from the swappage read in. 1105 * too, and may need to be copied from the swappage read in.
1104 */ 1106 */
1105repeat: 1107repeat:
1106 if (!filepage) 1108 if (!filepage)
@@ -1483,9 +1485,18 @@ static const struct inode_operations shmem_symlink_inode_operations;
1483static const struct inode_operations shmem_symlink_inline_operations; 1485static const struct inode_operations shmem_symlink_inline_operations;
1484 1486
1485/* 1487/*
1486 * Normally tmpfs makes no use of shmem_prepare_write, but it 1488 * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write;
1487 * lets a tmpfs file be used read-write below the loop driver. 1489 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1490 * below the loop driver, in the generic fashion that many filesystems support.
1488 */ 1491 */
1492static int shmem_readpage(struct file *file, struct page *page)
1493{
1494 struct inode *inode = page->mapping->host;
1495 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1496 unlock_page(page);
1497 return error;
1498}
1499
1489static int 1500static int
1490shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) 1501shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1491{ 1502{
@@ -1709,25 +1720,6 @@ static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count
1709 return desc.error; 1720 return desc.error;
1710} 1721}
1711 1722
1712static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1713 size_t count, read_actor_t actor, void *target)
1714{
1715 read_descriptor_t desc;
1716
1717 if (!count)
1718 return 0;
1719
1720 desc.written = 0;
1721 desc.count = count;
1722 desc.arg.data = target;
1723 desc.error = 0;
1724
1725 do_shmem_file_read(in_file, ppos, &desc, actor);
1726 if (desc.written)
1727 return desc.written;
1728 return desc.error;
1729}
1730
1731static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1723static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1732{ 1724{
1733 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1725 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2384,6 +2376,7 @@ static const struct address_space_operations shmem_aops = {
2384 .writepage = shmem_writepage, 2376 .writepage = shmem_writepage,
2385 .set_page_dirty = __set_page_dirty_no_writeback, 2377 .set_page_dirty = __set_page_dirty_no_writeback,
2386#ifdef CONFIG_TMPFS 2378#ifdef CONFIG_TMPFS
2379 .readpage = shmem_readpage,
2387 .prepare_write = shmem_prepare_write, 2380 .prepare_write = shmem_prepare_write,
2388 .commit_write = simple_commit_write, 2381 .commit_write = simple_commit_write,
2389#endif 2382#endif
@@ -2397,7 +2390,8 @@ static const struct file_operations shmem_file_operations = {
2397 .read = shmem_file_read, 2390 .read = shmem_file_read,
2398 .write = shmem_file_write, 2391 .write = shmem_file_write,
2399 .fsync = simple_sync_file, 2392 .fsync = simple_sync_file,
2400 .sendfile = shmem_file_sendfile, 2393 .splice_read = generic_file_splice_read,
2394 .splice_write = generic_file_splice_write,
2401#endif 2395#endif
2402}; 2396};
2403 2397
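
With ->sendfile gone from shmem_file_operations, user space reaches tmpfs data through the splice path wired up above. Below is a hedged user-space sketch (not part of this patch; the /dev/shm path, buffer size and error handling are arbitrary and minimal) of driving that path with splice(2) through a pipe, as the syscall requires:

/* Hypothetical sketch: copy a tmpfs file to stdout via splice(2).
 * Short writes and error reporting are elided for brevity. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/dev/shm/example", O_RDONLY);
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;
	/* file -> pipe goes through shmem's new ->splice_read */
	while ((n = splice(fd, NULL, pipefd[1], NULL, 65536, 0)) > 0) {
		/* pipe -> stdout */
		if (splice(pipefd[0], NULL, STDOUT_FILENO, NULL, (size_t)n, 0) < 0)
			return 1;
	}
	return n < 0;
}

The file-to-pipe half exercises generic_file_splice_read(), which reads missing pages through the address_space ->readpage hook; that is why shmem now provides shmem_readpage().
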
diff --git a/mm/slab.c b/mm/slab.c
index 2e71a328aa09..a453383333fc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -137,6 +137,7 @@
137 137
138/* Shouldn't this be in a header file somewhere? */ 138/* Shouldn't this be in a header file somewhere? */
139#define BYTES_PER_WORD sizeof(void *) 139#define BYTES_PER_WORD sizeof(void *)
140#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
140 141
141#ifndef cache_line_size 142#ifndef cache_line_size
142#define cache_line_size() L1_CACHE_BYTES 143#define cache_line_size() L1_CACHE_BYTES
@@ -547,7 +548,7 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
547 if (cachep->flags & SLAB_STORE_USER) 548 if (cachep->flags & SLAB_STORE_USER)
548 return (unsigned long long *)(objp + cachep->buffer_size - 549 return (unsigned long long *)(objp + cachep->buffer_size -
549 sizeof(unsigned long long) - 550 sizeof(unsigned long long) -
550 BYTES_PER_WORD); 551 REDZONE_ALIGN);
551 return (unsigned long long *) (objp + cachep->buffer_size - 552 return (unsigned long long *) (objp + cachep->buffer_size -
552 sizeof(unsigned long long)); 553 sizeof(unsigned long long));
553} 554}
@@ -774,7 +775,6 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
774 */ 775 */
775 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 776 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
776#endif 777#endif
777 WARN_ON_ONCE(size == 0);
778 while (size > csizep->cs_size) 778 while (size > csizep->cs_size)
779 csizep++; 779 csizep++;
780 780
@@ -929,7 +929,7 @@ static void next_reap_node(void)
929 * the CPUs getting into lockstep and contending for the global cache chain 929 * the CPUs getting into lockstep and contending for the global cache chain
930 * lock. 930 * lock.
931 */ 931 */
932static void __devinit start_cpu_timer(int cpu) 932static void __cpuinit start_cpu_timer(int cpu)
933{ 933{
934 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 934 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
935 935
@@ -2179,7 +2179,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2179 * above the next power of two: caches with object sizes just above a 2179 * above the next power of two: caches with object sizes just above a
2180 * power of two have a significant amount of internal fragmentation. 2180 * power of two have a significant amount of internal fragmentation.
2181 */ 2181 */
2182 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) 2182 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2183 2 * sizeof(unsigned long long)))
2183 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2184 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2184 if (!(flags & SLAB_DESTROY_BY_RCU)) 2185 if (!(flags & SLAB_DESTROY_BY_RCU))
2185 flags |= SLAB_POISON; 2186 flags |= SLAB_POISON;
@@ -2220,12 +2221,20 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2220 } 2221 }
2221 2222
2222 /* 2223 /*
2223 * Redzoning and user store require word alignment. Note this will be 2224 * Redzoning and user store require word alignment or possibly larger.
2224 * overridden by architecture or caller mandated alignment if either 2225 * Note this will be overridden by architecture or caller mandated
2225 * is greater than BYTES_PER_WORD. 2226 * alignment if either is greater than BYTES_PER_WORD.
2226 */ 2227 */
2227 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2228 if (flags & SLAB_STORE_USER)
2228 ralign = __alignof__(unsigned long long); 2229 ralign = BYTES_PER_WORD;
2230
2231 if (flags & SLAB_RED_ZONE) {
2232 ralign = REDZONE_ALIGN;
2233 /* If redzoning, ensure that the second redzone is suitably
2234 * aligned, by adjusting the object size accordingly. */
2235 size += REDZONE_ALIGN - 1;
2236 size &= ~(REDZONE_ALIGN - 1);
2237 }
2229 2238
2230 /* 2) arch mandated alignment */ 2239 /* 2) arch mandated alignment */
2231 if (ralign < ARCH_SLAB_MINALIGN) { 2240 if (ralign < ARCH_SLAB_MINALIGN) {
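
The two-line adjustment in the SLAB_RED_ZONE branch is the standard round-up-to-an-alignment-boundary idiom. A standalone sketch follows (the helper name is hypothetical; REDZONE_ALIGN is assumed to be a power of two, as max(BYTES_PER_WORD, __alignof__(unsigned long long)) is on common architectures):

/* Sketch of the size adjustment done when SLAB_RED_ZONE is set:
 * round size up to a multiple of a power-of-two alignment. */
#include <assert.h>
#include <stddef.h>

static size_t redzone_round_up(size_t size, size_t redzone_align)
{
	size += redzone_align - 1;
	size &= ~(redzone_align - 1);
	return size;
}

int main(void)
{
	/* e.g. with an 8-byte REDZONE_ALIGN, 13 -> 16 and 16 stays 16 */
	assert(redzone_round_up(13, 8) == 16);
	assert(redzone_round_up(16, 8) == 16);
	return 0;
}
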
@@ -2262,9 +2271,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2262 } 2271 }
2263 if (flags & SLAB_STORE_USER) { 2272 if (flags & SLAB_STORE_USER) {
2264 /* user store requires one word storage behind the end of 2273 /* user store requires one word storage behind the end of
2265 * the real object. 2274 * the real object. But if the second red zone needs to be
2275 * aligned to 64 bits, we must allow that much space.
2266 */ 2276 */
2267 size += BYTES_PER_WORD; 2277 if (flags & SLAB_RED_ZONE)
2278 size += REDZONE_ALIGN;
2279 else
2280 size += BYTES_PER_WORD;
2268 } 2281 }
2269#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2282#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2270 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2283 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
@@ -3539,7 +3552,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3539 check_irq_off(); 3552 check_irq_off();
3540 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3553 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3541 3554
3542 if (use_alien_caches && cache_free_alien(cachep, objp)) 3555 if (cache_free_alien(cachep, objp))
3543 return; 3556 return;
3544 3557
3545 if (likely(ac->avail < ac->limit)) { 3558 if (likely(ac->avail < ac->limit)) {
@@ -4144,26 +4157,17 @@ static void print_slabinfo_header(struct seq_file *m)
4144static void *s_start(struct seq_file *m, loff_t *pos) 4157static void *s_start(struct seq_file *m, loff_t *pos)
4145{ 4158{
4146 loff_t n = *pos; 4159 loff_t n = *pos;
4147 struct list_head *p;
4148 4160
4149 mutex_lock(&cache_chain_mutex); 4161 mutex_lock(&cache_chain_mutex);
4150 if (!n) 4162 if (!n)
4151 print_slabinfo_header(m); 4163 print_slabinfo_header(m);
4152 p = cache_chain.next; 4164
4153 while (n--) { 4165 return seq_list_start(&cache_chain, *pos);
4154 p = p->next;
4155 if (p == &cache_chain)
4156 return NULL;
4157 }
4158 return list_entry(p, struct kmem_cache, next);
4159} 4166}
4160 4167
4161static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4168static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4162{ 4169{
4163 struct kmem_cache *cachep = p; 4170 return seq_list_next(p, &cache_chain, pos);
4164 ++*pos;
4165 return cachep->next.next == &cache_chain ?
4166 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4167} 4171}
4168 4172
4169static void s_stop(struct seq_file *m, void *p) 4173static void s_stop(struct seq_file *m, void *p)
@@ -4173,7 +4177,7 @@ static void s_stop(struct seq_file *m, void *p)
4173 4177
4174static int s_show(struct seq_file *m, void *p) 4178static int s_show(struct seq_file *m, void *p)
4175{ 4179{
4176 struct kmem_cache *cachep = p; 4180 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4177 struct slab *slabp; 4181 struct slab *slabp;
4178 unsigned long active_objs; 4182 unsigned long active_objs;
4179 unsigned long num_objs; 4183 unsigned long num_objs;
@@ -4342,17 +4346,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4342 4346
4343static void *leaks_start(struct seq_file *m, loff_t *pos) 4347static void *leaks_start(struct seq_file *m, loff_t *pos)
4344{ 4348{
4345 loff_t n = *pos;
4346 struct list_head *p;
4347
4348 mutex_lock(&cache_chain_mutex); 4349 mutex_lock(&cache_chain_mutex);
4349 p = cache_chain.next; 4350 return seq_list_start(&cache_chain, *pos);
4350 while (n--) {
4351 p = p->next;
4352 if (p == &cache_chain)
4353 return NULL;
4354 }
4355 return list_entry(p, struct kmem_cache, next);
4356} 4351}
4357 4352
4358static inline int add_caller(unsigned long *n, unsigned long v) 4353static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4417,7 +4412,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4417 4412
4418static int leaks_show(struct seq_file *m, void *p) 4413static int leaks_show(struct seq_file *m, void *p)
4419{ 4414{
4420 struct kmem_cache *cachep = p; 4415 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4421 struct slab *slabp; 4416 struct slab *slabp;
4422 struct kmem_list3 *l3; 4417 struct kmem_list3 *l3;
4423 const char *name; 4418 const char *name;
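
The s_start/s_next and leaks_start conversions above delegate cursor handling to the seq_list_* helpers from <linux/seq_file.h>. A hedged sketch of the same pattern for an arbitrary list (the example_* names are illustrative only, not part of this patch):

/* Hypothetical seq_file iterator over a list, using the same
 * seq_list_start()/seq_list_next() helpers adopted for /proc/slabinfo. */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

struct example_entry {
	struct list_head next;
	const char *name;
};

static LIST_HEAD(example_chain);		/* hypothetical list */
static DEFINE_MUTEX(example_mutex);

static void *example_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&example_mutex);
	return seq_list_start(&example_chain, *pos);
}

static void *example_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &example_chain, pos);
}

static void example_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&example_mutex);
}

static int example_show(struct seq_file *m, void *p)
{
	struct example_entry *e = list_entry(p, struct example_entry, next);

	seq_printf(m, "%s\n", e->name);
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= example_start,
	.next	= example_next,
	.stop	= example_stop,
	.show	= example_show,
};
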
diff --git a/mm/slob.c b/mm/slob.c
index 71976c5d40d3..b4899079d8b0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,57 +3,159 @@
3 * 3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03 4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 * 5 *
6 * NUMA support by Paul Mundt, 2007.
7 *
6 * How SLOB works: 8 * How SLOB works:
7 * 9 *
8 * The core of SLOB is a traditional K&R style heap allocator, with 10 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this 11 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce 12 * allocator is as little as 2 bytes, however typically most architectures
11 * this to 4 if it's deemed worth the effort. The slob heap is a 13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
12 * singly-linked list of pages from __get_free_page, grown on demand 14 *
13 * and allocation from the heap is currently first-fit. 15 * The slob heap is a linked list of pages from alloc_pages(), and
16 * within each page, there is a singly-linked list of free blocks (slob_t).
17 * The heap is grown on demand and allocation from the heap is currently
18 * first-fit.
14 * 19 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned 20 * Above this is an implementation of kmalloc/kfree. Blocks returned
16 * from kmalloc are 8-byte aligned and prepended with a 8-byte header. 21 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls 22 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks 23 * alloc_pages() directly, allocating compound pages so the page order
19 * and keeps a linked list of such pages and their orders. These 24 * does not have to be separately tracked, and also stores the exact
20 * objects are detected in kfree() by their page alignment. 25 * allocation size in page->private so that it can be used to accurately
26 * provide ksize(). These objects are detected in kfree() because slob_page()
27 * is false for them.
21 * 28 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and 29 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with 30 * destructors for every SLAB allocation. Objects are returned with the
24 * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is 31 * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
25 * set, in which case the low-level allocator will fragment blocks to 32 * case the low-level allocator will fragment blocks to create the proper
26 * create the proper alignment. Again, objects of page-size or greater 33 * alignment. Again, objects of page-size or greater are allocated by
27 * are allocated by calling __get_free_pages. As SLAB objects know 34 * calling alloc_pages(). As SLAB objects know their size, no separate
28 * their size, no separate size bookkeeping is necessary and there is 35 * size bookkeeping is necessary and there is essentially no allocation
29 * essentially no allocation space overhead. 36 * space overhead, and compound pages aren't needed for multi-page
37 * allocations.
38 *
39 * NUMA support in SLOB is fairly simplistic, pushing most of the real
40 * logic down to the page allocator, and simply doing the node accounting
41 * on the upper levels. In the event that a node id is explicitly
42 * provided, alloc_pages_node() with the specified node id is used
43 * instead. The common case (or when the node id isn't explicitly provided)
44 * will default to the current node, as per numa_node_id().
45 *
 46 * Node-aware pages are still inserted into the global freelist, and

47 * these are scanned for by matching against the node id encoded in the
48 * page flags. As a result, block allocations that can be satisfied from
49 * the freelist will only be done so on pages residing on the same node,
50 * in order to prevent random node placement.
30 */ 51 */
31 52
53#include <linux/kernel.h>
32#include <linux/slab.h> 54#include <linux/slab.h>
33#include <linux/mm.h> 55#include <linux/mm.h>
34#include <linux/cache.h> 56#include <linux/cache.h>
35#include <linux/init.h> 57#include <linux/init.h>
36#include <linux/module.h> 58#include <linux/module.h>
37#include <linux/timer.h>
38#include <linux/rcupdate.h> 59#include <linux/rcupdate.h>
60#include <linux/list.h>
61#include <asm/atomic.h>
62
63/*
64 * slob_block has a field 'units', which indicates size of block if +ve,
65 * or offset of next block if -ve (in SLOB_UNITs).
66 *
67 * Free blocks of size 1 unit simply contain the offset of the next block.
68 * Those with larger size contain their size in the first SLOB_UNIT of
69 * memory, and the offset of the next free block in the second SLOB_UNIT.
70 */
71#if PAGE_SIZE <= (32767 * 2)
72typedef s16 slobidx_t;
73#else
74typedef s32 slobidx_t;
75#endif
39 76
40struct slob_block { 77struct slob_block {
41 int units; 78 slobidx_t units;
42 struct slob_block *next;
43}; 79};
44typedef struct slob_block slob_t; 80typedef struct slob_block slob_t;
45 81
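
The units encoding described in the comment above can be checked with a small user-space model. The sketch below mirrors set_slob()/slob_units()/slob_next() against a toy 16-unit page instead of deriving the block base from PAGE_MASK; it is illustrative only, not kernel code:

/* Toy model of the free-block encoding: a block of size > 1 stores
 * (size, next-offset) in its first two units; a one-unit block stores
 * the negated next-offset in its single unit. */
#include <assert.h>
#include <stdint.h>

typedef int16_t slobidx_t;
typedef struct { slobidx_t units; } slob_t;

#define UNITS_PER_PAGE 16		/* toy page of 16 units */

static slob_t page[UNITS_PER_PAGE];	/* stands in for one slob page */

static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
{
	slobidx_t offset = next - page;

	if (size > 1) {
		s[0].units = size;
		s[1].units = offset;
	} else
		s[0].units = -offset;
}

static slobidx_t slob_units(slob_t *s)
{
	return s->units > 0 ? s->units : 1;
}

static slob_t *slob_next(slob_t *s)
{
	return page + (s[0].units < 0 ? -s[0].units : s[1].units);
}

int main(void)
{
	set_slob(&page[0], 3, &page[5]);	/* 3-unit block, next at unit 5 */
	set_slob(&page[5], 1, &page[9]);	/* 1-unit block, next at unit 9 */

	assert(slob_units(&page[0]) == 3 && slob_next(&page[0]) == &page[5]);
	assert(slob_units(&page[5]) == 1 && slob_next(&page[5]) == &page[9]);
	return 0;
}
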
82/*
83 * We use struct page fields to manage some slob allocation aspects,
84 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
85 * just define our own struct page type variant here.
86 */
87struct slob_page {
88 union {
89 struct {
90 unsigned long flags; /* mandatory */
91 atomic_t _count; /* mandatory */
92 slobidx_t units; /* free units left in page */
93 unsigned long pad[2];
94 slob_t *free; /* first free slob_t in page */
95 struct list_head list; /* linked list of free pages */
96 };
97 struct page page;
98 };
99};
100static inline void struct_slob_page_wrong_size(void)
101{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
102
103/*
104 * free_slob_page: call before a slob_page is returned to the page allocator.
105 */
106static inline void free_slob_page(struct slob_page *sp)
107{
108 reset_page_mapcount(&sp->page);
109 sp->page.mapping = NULL;
110}
111
112/*
113 * All (partially) free slob pages go on this list.
114 */
115static LIST_HEAD(free_slob_pages);
116
117/*
118 * slob_page: True for all slob pages (false for bigblock pages)
119 */
120static inline int slob_page(struct slob_page *sp)
121{
122 return test_bit(PG_active, &sp->flags);
123}
124
125static inline void set_slob_page(struct slob_page *sp)
126{
127 __set_bit(PG_active, &sp->flags);
128}
129
130static inline void clear_slob_page(struct slob_page *sp)
131{
132 __clear_bit(PG_active, &sp->flags);
133}
134
135/*
136 * slob_page_free: true for pages on free_slob_pages list.
137 */
138static inline int slob_page_free(struct slob_page *sp)
139{
140 return test_bit(PG_private, &sp->flags);
141}
142
143static inline void set_slob_page_free(struct slob_page *sp)
144{
145 list_add(&sp->list, &free_slob_pages);
146 __set_bit(PG_private, &sp->flags);
147}
148
149static inline void clear_slob_page_free(struct slob_page *sp)
150{
151 list_del(&sp->list);
152 __clear_bit(PG_private, &sp->flags);
153}
154
46#define SLOB_UNIT sizeof(slob_t) 155#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 156#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES 157#define SLOB_ALIGN L1_CACHE_BYTES
49 158
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57/* 159/*
58 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 160 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
59 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free 161 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
@@ -64,133 +166,285 @@ struct slob_rcu {
64 int size; 166 int size;
65}; 167};
66 168
67static slob_t arena = { .next = &arena, .units = 1 }; 169/*
68static slob_t *slobfree = &arena; 170 * slob_lock protects all slob allocator structures.
69static bigblock_t *bigblocks; 171 */
70static DEFINE_SPINLOCK(slob_lock); 172static DEFINE_SPINLOCK(slob_lock);
71static DEFINE_SPINLOCK(block_lock);
72 173
73static void slob_free(void *b, int size); 174/*
74static void slob_timer_cbk(void); 175 * Encode the given size and next info into a free slob block s.
176 */
177static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
178{
179 slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
180 slobidx_t offset = next - base;
75 181
182 if (size > 1) {
183 s[0].units = size;
184 s[1].units = offset;
185 } else
186 s[0].units = -offset;
187}
76 188
77static void *slob_alloc(size_t size, gfp_t gfp, int align) 189/*
190 * Return the size of a slob block.
191 */
192static slobidx_t slob_units(slob_t *s)
193{
194 if (s->units > 0)
195 return s->units;
196 return 1;
197}
198
199/*
200 * Return the next free slob block pointer after this one.
201 */
202static slob_t *slob_next(slob_t *s)
203{
204 slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
205 slobidx_t next;
206
207 if (s[0].units < 0)
208 next = -s[0].units;
209 else
210 next = s[1].units;
211 return base+next;
212}
213
214/*
215 * Returns true if s is the last free block in its page.
216 */
217static int slob_last(slob_t *s)
218{
219 return !((unsigned long)slob_next(s) & ~PAGE_MASK);
220}
221
222static void *slob_new_page(gfp_t gfp, int order, int node)
223{
224 void *page;
225
226#ifdef CONFIG_NUMA
227 if (node != -1)
228 page = alloc_pages_node(node, gfp, order);
229 else
230#endif
231 page = alloc_pages(gfp, order);
232
233 if (!page)
234 return NULL;
235
236 return page_address(page);
237}
238
239/*
240 * Allocate a slob block within a given slob_page sp.
241 */
242static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
78{ 243{
79 slob_t *prev, *cur, *aligned = 0; 244 slob_t *prev, *cur, *aligned = 0;
80 int delta = 0, units = SLOB_UNITS(size); 245 int delta = 0, units = SLOB_UNITS(size);
81 unsigned long flags;
82 246
83 spin_lock_irqsave(&slob_lock, flags); 247 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
84 prev = slobfree; 248 slobidx_t avail = slob_units(cur);
85 for (cur = prev->next; ; prev = cur, cur = cur->next) { 249
86 if (align) { 250 if (align) {
87 aligned = (slob_t *)ALIGN((unsigned long)cur, align); 251 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
88 delta = aligned - cur; 252 delta = aligned - cur;
89 } 253 }
90 if (cur->units >= units + delta) { /* room enough? */ 254 if (avail >= units + delta) { /* room enough? */
255 slob_t *next;
256
91 if (delta) { /* need to fragment head to align? */ 257 if (delta) { /* need to fragment head to align? */
92 aligned->units = cur->units - delta; 258 next = slob_next(cur);
93 aligned->next = cur->next; 259 set_slob(aligned, avail - delta, next);
94 cur->next = aligned; 260 set_slob(cur, delta, aligned);
95 cur->units = delta;
96 prev = cur; 261 prev = cur;
97 cur = aligned; 262 cur = aligned;
263 avail = slob_units(cur);
98 } 264 }
99 265
100 if (cur->units == units) /* exact fit? */ 266 next = slob_next(cur);
101 prev->next = cur->next; /* unlink */ 267 if (avail == units) { /* exact fit? unlink. */
102 else { /* fragment */ 268 if (prev)
103 prev->next = cur + units; 269 set_slob(prev, slob_units(prev), next);
104 prev->next->units = cur->units - units; 270 else
105 prev->next->next = cur->next; 271 sp->free = next;
106 cur->units = units; 272 } else { /* fragment */
273 if (prev)
274 set_slob(prev, slob_units(prev), cur + units);
275 else
276 sp->free = cur + units;
277 set_slob(cur + units, avail - units, next);
107 } 278 }
108 279
109 slobfree = prev; 280 sp->units -= units;
110 spin_unlock_irqrestore(&slob_lock, flags); 281 if (!sp->units)
282 clear_slob_page_free(sp);
111 return cur; 283 return cur;
112 } 284 }
113 if (cur == slobfree) { 285 if (slob_last(cur))
114 spin_unlock_irqrestore(&slob_lock, flags); 286 return NULL;
115 287 }
116 if (size == PAGE_SIZE) /* trying to shrink arena? */ 288}
117 return 0;
118 289
119 cur = (slob_t *)__get_free_page(gfp); 290/*
120 if (!cur) 291 * slob_alloc: entry point into the slob allocator.
121 return 0; 292 */
293static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
294{
295 struct slob_page *sp;
296 slob_t *b = NULL;
297 unsigned long flags;
122 298
123 slob_free(cur, PAGE_SIZE); 299 spin_lock_irqsave(&slob_lock, flags);
124 spin_lock_irqsave(&slob_lock, flags); 300 /* Iterate through each partially free page, try to find room */
125 cur = slobfree; 301 list_for_each_entry(sp, &free_slob_pages, list) {
302#ifdef CONFIG_NUMA
303 /*
304 * If there's a node specification, search for a partial
305 * page with a matching node id in the freelist.
306 */
307 if (node != -1 && page_to_nid(&sp->page) != node)
308 continue;
309#endif
310
311 if (sp->units >= SLOB_UNITS(size)) {
312 b = slob_page_alloc(sp, size, align);
313 if (b)
314 break;
126 } 315 }
127 } 316 }
317 spin_unlock_irqrestore(&slob_lock, flags);
318
319 /* Not enough space: must allocate a new page */
320 if (!b) {
321 b = slob_new_page(gfp, 0, node);
322 if (!b)
323 return 0;
324 sp = (struct slob_page *)virt_to_page(b);
325 set_slob_page(sp);
326
327 spin_lock_irqsave(&slob_lock, flags);
328 sp->units = SLOB_UNITS(PAGE_SIZE);
329 sp->free = b;
330 INIT_LIST_HEAD(&sp->list);
331 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
332 set_slob_page_free(sp);
333 b = slob_page_alloc(sp, size, align);
334 BUG_ON(!b);
335 spin_unlock_irqrestore(&slob_lock, flags);
336 }
337 return b;
128} 338}
129 339
340/*
341 * slob_free: entry point into the slob allocator.
342 */
130static void slob_free(void *block, int size) 343static void slob_free(void *block, int size)
131{ 344{
132 slob_t *cur, *b = (slob_t *)block; 345 struct slob_page *sp;
346 slob_t *prev, *next, *b = (slob_t *)block;
347 slobidx_t units;
133 unsigned long flags; 348 unsigned long flags;
134 349
135 if (!block) 350 if (!block)
136 return; 351 return;
352 BUG_ON(!size);
137 353
138 if (size) 354 sp = (struct slob_page *)virt_to_page(block);
139 b->units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
140 356
141 /* Find reinsertion point */
142 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
143 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
144 if (cur >= cur->next && (b > cur || b < cur->next))
145 break;
146 358
147 if (b + b->units == cur->next) { 359 if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
148 b->units += cur->next->units; 360 /* Go directly to page allocator. Do not pass slob allocator */
149 b->next = cur->next->next; 361 if (slob_page_free(sp))
150 } else 362 clear_slob_page_free(sp);
151 b->next = cur->next; 363 clear_slob_page(sp);
364 free_slob_page(sp);
365 free_page((unsigned long)b);
366 goto out;
367 }
152 368
153 if (cur + cur->units == b) { 369 if (!slob_page_free(sp)) {
154 cur->units += b->units; 370 /* This slob page is about to become partially free. Easy! */
155 cur->next = b->next; 371 sp->units = units;
156 } else 372 sp->free = b;
157 cur->next = b; 373 set_slob(b, units,
374 (void *)((unsigned long)(b +
375 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
376 set_slob_page_free(sp);
377 goto out;
378 }
158 379
159 slobfree = cur; 380 /*
381 * Otherwise the page is already partially free, so find reinsertion
382 * point.
383 */
384 sp->units += units;
160 385
386 if (b < sp->free) {
387 set_slob(b, units, sp->free);
388 sp->free = b;
389 } else {
390 prev = sp->free;
391 next = slob_next(prev);
392 while (b > next) {
393 prev = next;
394 next = slob_next(prev);
395 }
396
397 if (!slob_last(prev) && b + units == next) {
398 units += slob_units(next);
399 set_slob(b, units, slob_next(next));
400 } else
401 set_slob(b, units, next);
402
403 if (prev + slob_units(prev) == b) {
404 units = slob_units(b) + slob_units(prev);
405 set_slob(prev, units, slob_next(b));
406 } else
407 set_slob(prev, slob_units(prev), b);
408 }
409out:
161 spin_unlock_irqrestore(&slob_lock, flags); 410 spin_unlock_irqrestore(&slob_lock, flags);
162} 411}
163 412
164void *__kmalloc(size_t size, gfp_t gfp) 413/*
165{ 414 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
166 slob_t *m; 415 */
167 bigblock_t *bb;
168 unsigned long flags;
169 416
170 if (size < PAGE_SIZE - SLOB_UNIT) { 417#ifndef ARCH_KMALLOC_MINALIGN
171 m = slob_alloc(size + SLOB_UNIT, gfp, 0); 418#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
172 return m ? (void *)(m + 1) : 0; 419#endif
173 }
174 420
175 bb = slob_alloc(sizeof(bigblock_t), gfp, 0); 421#ifndef ARCH_SLAB_MINALIGN
176 if (!bb) 422#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
177 return 0; 423#endif
178 424
179 bb->order = get_order(size); 425void *__kmalloc_node(size_t size, gfp_t gfp, int node)
180 bb->pages = (void *)__get_free_pages(gfp, bb->order); 426{
427 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
428
429 if (size < PAGE_SIZE - align) {
430 unsigned int *m;
431 m = slob_alloc(size + align, gfp, align, node);
432 if (m)
433 *m = size;
434 return (void *)m + align;
435 } else {
436 void *ret;
181 437
182 if (bb->pages) { 438 ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
183 spin_lock_irqsave(&block_lock, flags); 439 if (ret) {
184 bb->next = bigblocks; 440 struct page *page;
185 bigblocks = bb; 441 page = virt_to_page(ret);
186 spin_unlock_irqrestore(&block_lock, flags); 442 page->private = size;
187 return bb->pages; 443 }
444 return ret;
188 } 445 }
189
190 slob_free(bb, sizeof(bigblock_t));
191 return 0;
192} 446}
193EXPORT_SYMBOL(__kmalloc); 447EXPORT_SYMBOL(__kmalloc_node);
194 448
195/** 449/**
196 * krealloc - reallocate memory. The contents will remain unchanged. 450 * krealloc - reallocate memory. The contents will remain unchanged.
@@ -227,52 +481,34 @@ EXPORT_SYMBOL(krealloc);
227 481
228void kfree(const void *block) 482void kfree(const void *block)
229{ 483{
230 bigblock_t *bb, **last = &bigblocks; 484 struct slob_page *sp;
231 unsigned long flags;
232 485
233 if (!block) 486 if (!block)
234 return; 487 return;
235 488
236 if (!((unsigned long)block & (PAGE_SIZE-1))) { 489 sp = (struct slob_page *)virt_to_page(block);
237 /* might be on the big block list */ 490 if (slob_page(sp)) {
238 spin_lock_irqsave(&block_lock, flags); 491 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
239 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { 492 unsigned int *m = (unsigned int *)(block - align);
240 if (bb->pages == block) { 493 slob_free(m, *m + align);
241 *last = bb->next; 494 } else
242 spin_unlock_irqrestore(&block_lock, flags); 495 put_page(&sp->page);
243 free_pages((unsigned long)block, bb->order);
244 slob_free(bb, sizeof(bigblock_t));
245 return;
246 }
247 }
248 spin_unlock_irqrestore(&block_lock, flags);
249 }
250
251 slob_free((slob_t *)block - 1, 0);
252 return;
253} 496}
254
255EXPORT_SYMBOL(kfree); 497EXPORT_SYMBOL(kfree);
256 498
499/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
257size_t ksize(const void *block) 500size_t ksize(const void *block)
258{ 501{
259 bigblock_t *bb; 502 struct slob_page *sp;
260 unsigned long flags;
261 503
262 if (!block) 504 if (!block)
263 return 0; 505 return 0;
264 506
265 if (!((unsigned long)block & (PAGE_SIZE-1))) { 507 sp = (struct slob_page *)virt_to_page(block);
266 spin_lock_irqsave(&block_lock, flags); 508 if (slob_page(sp))
267 for (bb = bigblocks; bb; bb = bb->next) 509 return ((slob_t *)block - 1)->units + SLOB_UNIT;
268 if (bb->pages == block) { 510 else
269 spin_unlock_irqrestore(&slob_lock, flags); 511 return sp->page.private;
270 return PAGE_SIZE << bb->order;
271 }
272 spin_unlock_irqrestore(&block_lock, flags);
273 }
274
275 return ((slob_t *)block - 1)->units * SLOB_UNIT;
276} 512}
277 513
278struct kmem_cache { 514struct kmem_cache {
@@ -289,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
289{ 525{
290 struct kmem_cache *c; 526 struct kmem_cache *c;
291 527
292 c = slob_alloc(sizeof(struct kmem_cache), flags, 0); 528 c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
293 529
294 if (c) { 530 if (c) {
295 c->name = name; 531 c->name = name;
@@ -302,6 +538,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
302 c->ctor = ctor; 538 c->ctor = ctor;
303 /* ignore alignment unless it's forced */ 539 /* ignore alignment unless it's forced */
304 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; 540 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
541 if (c->align < ARCH_SLAB_MINALIGN)
542 c->align = ARCH_SLAB_MINALIGN;
305 if (c->align < align) 543 if (c->align < align)
306 c->align = align; 544 c->align = align;
307 } else if (flags & SLAB_PANIC) 545 } else if (flags & SLAB_PANIC)
@@ -317,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
317} 555}
318EXPORT_SYMBOL(kmem_cache_destroy); 556EXPORT_SYMBOL(kmem_cache_destroy);
319 557
320void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) 558void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
321{ 559{
322 void *b; 560 void *b;
323 561
324 if (c->size < PAGE_SIZE) 562 if (c->size < PAGE_SIZE)
325 b = slob_alloc(c->size, flags, c->align); 563 b = slob_alloc(c->size, flags, c->align, node);
326 else 564 else
327 b = (void *)__get_free_pages(flags, get_order(c->size)); 565 b = slob_new_page(flags, get_order(c->size), node);
328 566
329 if (c->ctor) 567 if (c->ctor)
330 c->ctor(b, c, 0); 568 c->ctor(b, c, 0);
331 569
332 return b; 570 return b;
333} 571}
334EXPORT_SYMBOL(kmem_cache_alloc); 572EXPORT_SYMBOL(kmem_cache_alloc_node);
335 573
336void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) 574void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
337{ 575{
@@ -385,9 +623,6 @@ const char *kmem_cache_name(struct kmem_cache *c)
385} 623}
386EXPORT_SYMBOL(kmem_cache_name); 624EXPORT_SYMBOL(kmem_cache_name);
387 625
388static struct timer_list slob_timer = TIMER_INITIALIZER(
389 (void (*)(unsigned long))slob_timer_cbk, 0, 0);
390
391int kmem_cache_shrink(struct kmem_cache *d) 626int kmem_cache_shrink(struct kmem_cache *d)
392{ 627{
393 return 0; 628 return 0;
@@ -399,17 +634,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b)
399 return 0; 634 return 0;
400} 635}
401 636
402void __init kmem_cache_init(void) 637static unsigned int slob_ready __read_mostly;
638
639int slab_is_available(void)
403{ 640{
404 slob_timer_cbk(); 641 return slob_ready;
405} 642}
406 643
407static void slob_timer_cbk(void) 644void __init kmem_cache_init(void)
408{ 645{
409 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); 646 slob_ready = 1;
410
411 if (p)
412 free_page((unsigned long)p);
413
414 mod_timer(&slob_timer, jiffies + HZ);
415} 647}
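
For the small-object case, the header comment's "4-byte header with the kmalloc size" works out to a prefix of 'align' bytes holding the requested size, as __kmalloc_node() and kfree() above show. A simplified user-space model of that layout follows (the toy_* names are hypothetical, and the real ksize() arithmetic for small objects differs slightly):

/* Toy model of the small-object kmalloc layout: the requested size sits in
 * an align-sized header immediately before the pointer handed to the caller,
 * so it can be recovered on free. */
#include <assert.h>
#include <stdlib.h>

#define KMALLOC_ALIGN	sizeof(unsigned long)	/* stands in for the min align */

static void *toy_kmalloc(size_t size)
{
	unsigned int *m = malloc(size + KMALLOC_ALIGN);

	if (!m)
		return NULL;
	*m = size;				/* header: exact request size */
	return (char *)m + KMALLOC_ALIGN;
}

static size_t toy_ksize(const void *block)
{
	const unsigned int *m =
		(const unsigned int *)((const char *)block - KMALLOC_ALIGN);

	return *m;
}

static void toy_kfree(void *block)
{
	free((char *)block - KMALLOC_ALIGN);
}

int main(void)
{
	char *p = toy_kmalloc(100);

	assert(p && toy_ksize(p) == 100);
	toy_kfree(p);
	return 0;
}

Allocations of PAGE_SIZE or more skip this header entirely: they come straight from alloc_pages() with __GFP_COMP and record the exact size in page->private, which is what the large-object branch of ksize() returns.
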
diff --git a/mm/slub.c b/mm/slub.c
index 51663a3c3c24..6aea48942c29 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -323,7 +323,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
323/* 323/*
324 * Debug settings: 324 * Debug settings:
325 */ 325 */
326#ifdef CONFIG_SLUB_DEBUG_ON
327static int slub_debug = DEBUG_DEFAULT_FLAGS;
328#else
326static int slub_debug; 329static int slub_debug;
330#endif
327 331
328static char *slub_debug_slabs; 332static char *slub_debug_slabs;
329 333
@@ -888,38 +892,57 @@ fail:
888 892
889static int __init setup_slub_debug(char *str) 893static int __init setup_slub_debug(char *str)
890{ 894{
891 if (!str || *str != '=') 895 slub_debug = DEBUG_DEFAULT_FLAGS;
892 slub_debug = DEBUG_DEFAULT_FLAGS; 896 if (*str++ != '=' || !*str)
893 else { 897 /*
894 str++; 898 * No options specified. Switch on full debugging.
895 if (*str == 0 || *str == ',') 899 */
896 slub_debug = DEBUG_DEFAULT_FLAGS; 900 goto out;
897 else 901
898 for( ;*str && *str != ','; str++) 902 if (*str == ',')
899 switch (*str) { 903 /*
900 case 'f' : case 'F' : 904 * No options but restriction on slabs. This means full
901 slub_debug |= SLAB_DEBUG_FREE; 905 * debugging for slabs matching a pattern.
902 break; 906 */
903 case 'z' : case 'Z' : 907 goto check_slabs;
904 slub_debug |= SLAB_RED_ZONE; 908
905 break; 909 slub_debug = 0;
906 case 'p' : case 'P' : 910 if (*str == '-')
907 slub_debug |= SLAB_POISON; 911 /*
908 break; 912 * Switch off all debugging measures.
909 case 'u' : case 'U' : 913 */
910 slub_debug |= SLAB_STORE_USER; 914 goto out;
911 break; 915
912 case 't' : case 'T' : 916 /*
913 slub_debug |= SLAB_TRACE; 917 * Determine which debug features should be switched on
914 break; 918 */
915 default: 919 for ( ;*str && *str != ','; str++) {
916 printk(KERN_ERR "slub_debug option '%c' " 920 switch (tolower(*str)) {
917 "unknown. skipped\n",*str); 921 case 'f':
918 } 922 slub_debug |= SLAB_DEBUG_FREE;
923 break;
924 case 'z':
925 slub_debug |= SLAB_RED_ZONE;
926 break;
927 case 'p':
928 slub_debug |= SLAB_POISON;
929 break;
930 case 'u':
931 slub_debug |= SLAB_STORE_USER;
932 break;
933 case 't':
934 slub_debug |= SLAB_TRACE;
935 break;
936 default:
937 printk(KERN_ERR "slub_debug option '%c' "
938 "unknown. skipped\n",*str);
939 }
919 } 940 }
920 941
942check_slabs:
921 if (*str == ',') 943 if (*str == ',')
922 slub_debug_slabs = str + 1; 944 slub_debug_slabs = str + 1;
945out:
923 return 1; 946 return 1;
924} 947}
925 948
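
The rewritten parser keeps the existing slub_debug= grammar: an optional set of single-letter options (f, z, p, u, t), optionally followed by a comma and a slab-name restriction, with '-' switching everything off. A few example boot command lines it accepts (the dentry cache is an arbitrary choice here):

	slub_debug                 enable all debug options for every slab
	slub_debug=,dentry         enable all debug options, only for the dentry cache
	slub_debug=FZ              sanity checks (F) and red zoning (Z) everywhere
	slub_debug=ZPU,dentry      red zoning, poisoning and user tracking for dentry only
	slub_debug=-               switch all debugging off
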
@@ -1798,8 +1821,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1798 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 1821 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
1799 1822
1800 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); 1823 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
1801 /* new_slab() disables interupts */
1802 local_irq_enable();
1803 1824
1804 BUG_ON(!page); 1825 BUG_ON(!page);
1805 n = page->freelist; 1826 n = page->freelist;
@@ -1811,6 +1832,12 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1811 init_kmem_cache_node(n); 1832 init_kmem_cache_node(n);
1812 atomic_long_inc(&n->nr_slabs); 1833 atomic_long_inc(&n->nr_slabs);
1813 add_partial(n, page); 1834 add_partial(n, page);
1835
1836 /*
1837 * new_slab() disables interupts. If we do not reenable interrupts here
1838 * then bootup would continue with interrupts disabled.
1839 */
1840 local_irq_enable();
1814 return n; 1841 return n;
1815} 1842}
1816 1843
@@ -2016,7 +2043,6 @@ error:
2016 s->offset, flags); 2043 s->offset, flags);
2017 return 0; 2044 return 0;
2018} 2045}
2019EXPORT_SYMBOL(kmem_cache_open);
2020 2046
2021/* 2047/*
2022 * Check if a given pointer is valid 2048 * Check if a given pointer is valid
@@ -2241,7 +2267,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2241 2267
2242 if (s) 2268 if (s)
2243 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2269 return slab_alloc(s, flags, -1, __builtin_return_address(0));
2244 return NULL; 2270 return ZERO_SIZE_PTR;
2245} 2271}
2246EXPORT_SYMBOL(__kmalloc); 2272EXPORT_SYMBOL(__kmalloc);
2247 2273
@@ -2252,16 +2278,20 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
2252 2278
2253 if (s) 2279 if (s)
2254 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2280 return slab_alloc(s, flags, node, __builtin_return_address(0));
2255 return NULL; 2281 return ZERO_SIZE_PTR;
2256} 2282}
2257EXPORT_SYMBOL(__kmalloc_node); 2283EXPORT_SYMBOL(__kmalloc_node);
2258#endif 2284#endif
2259 2285
2260size_t ksize(const void *object) 2286size_t ksize(const void *object)
2261{ 2287{
2262 struct page *page = get_object_page(object); 2288 struct page *page;
2263 struct kmem_cache *s; 2289 struct kmem_cache *s;
2264 2290
2291 if (object == ZERO_SIZE_PTR)
2292 return 0;
2293
2294 page = get_object_page(object);
2265 BUG_ON(!page); 2295 BUG_ON(!page);
2266 s = page->slab; 2296 s = page->slab;
2267 BUG_ON(!s); 2297 BUG_ON(!s);
@@ -2293,7 +2323,13 @@ void kfree(const void *x)
2293 struct kmem_cache *s; 2323 struct kmem_cache *s;
2294 struct page *page; 2324 struct page *page;
2295 2325
2296 if (!x) 2326 /*
2327 * This has to be an unsigned comparison. According to Linus
2328	 * some gcc versions treat a pointer as a signed entity. Then
2329 * this comparison would be true for all "negative" pointers
2330 * (which would cover the whole upper half of the address space).
2331 */
2332 if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR)
2297 return; 2333 return;
2298 2334
2299 page = virt_to_head_page(x); 2335 page = virt_to_head_page(x);
@@ -2398,12 +2434,12 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
2398 void *ret; 2434 void *ret;
2399 size_t ks; 2435 size_t ks;
2400 2436
2401 if (unlikely(!p)) 2437 if (unlikely(!p || p == ZERO_SIZE_PTR))
2402 return kmalloc(new_size, flags); 2438 return kmalloc(new_size, flags);
2403 2439
2404 if (unlikely(!new_size)) { 2440 if (unlikely(!new_size)) {
2405 kfree(p); 2441 kfree(p);
2406 return NULL; 2442 return ZERO_SIZE_PTR;
2407 } 2443 }
2408 2444
2409 ks = ksize(p); 2445 ks = ksize(p);
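
Taken together, the __kmalloc(), ksize(), kfree() and krealloc() hunks give zero-byte requests a distinguished non-NULL return value. A hedged kernel-context sketch of the caller-visible contract (assuming, as the __kmalloc() hunk implies, that get_slab() finds no cache for a zero-byte request):

/* Sketch of the contract established above for zero-length allocations. */
#include <linux/kernel.h>
#include <linux/slab.h>

static void zero_size_contract(void)
{
	void *p = kmalloc(0, GFP_KERNEL);

	/* kmalloc(0) now returns ZERO_SIZE_PTR rather than NULL, so the
	 * usual "allocation failed" check does not fire for it ... */
	BUG_ON(p == NULL);

	/* ... it must never be dereferenced ... */

	/* ... but ksize() reports it as empty and kfree() accepts it. */
	BUG_ON(ksize(p) != 0);
	kfree(p);
}
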
@@ -2426,6 +2462,7 @@ EXPORT_SYMBOL(krealloc);
2426void __init kmem_cache_init(void) 2462void __init kmem_cache_init(void)
2427{ 2463{
2428 int i; 2464 int i;
2465 int caches = 0;
2429 2466
2430#ifdef CONFIG_NUMA 2467#ifdef CONFIG_NUMA
2431 /* 2468 /*
@@ -2436,20 +2473,29 @@ void __init kmem_cache_init(void)
2436 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2473 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2437 sizeof(struct kmem_cache_node), GFP_KERNEL); 2474 sizeof(struct kmem_cache_node), GFP_KERNEL);
2438 kmalloc_caches[0].refcount = -1; 2475 kmalloc_caches[0].refcount = -1;
2476 caches++;
2439#endif 2477#endif
2440 2478
2441 /* Able to allocate the per node structures */ 2479 /* Able to allocate the per node structures */
2442 slab_state = PARTIAL; 2480 slab_state = PARTIAL;
2443 2481
2444 /* Caches that are not of the two-to-the-power-of size */ 2482 /* Caches that are not of the two-to-the-power-of size */
2445 create_kmalloc_cache(&kmalloc_caches[1], 2483 if (KMALLOC_MIN_SIZE <= 64) {
2484 create_kmalloc_cache(&kmalloc_caches[1],
2446 "kmalloc-96", 96, GFP_KERNEL); 2485 "kmalloc-96", 96, GFP_KERNEL);
2447 create_kmalloc_cache(&kmalloc_caches[2], 2486 caches++;
2487 }
2488 if (KMALLOC_MIN_SIZE <= 128) {
2489 create_kmalloc_cache(&kmalloc_caches[2],
2448 "kmalloc-192", 192, GFP_KERNEL); 2490 "kmalloc-192", 192, GFP_KERNEL);
2491 caches++;
2492 }
2449 2493
2450 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2494 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
2451 create_kmalloc_cache(&kmalloc_caches[i], 2495 create_kmalloc_cache(&kmalloc_caches[i],
2452 "kmalloc", 1 << i, GFP_KERNEL); 2496 "kmalloc", 1 << i, GFP_KERNEL);
2497 caches++;
2498 }
2453 2499
2454 slab_state = UP; 2500 slab_state = UP;
2455 2501
@@ -2466,8 +2512,8 @@ void __init kmem_cache_init(void)
2466 nr_cpu_ids * sizeof(struct page *); 2512 nr_cpu_ids * sizeof(struct page *);
2467 2513
2468 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2514 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2469 " Processors=%d, Nodes=%d\n", 2515 " CPUs=%d, Nodes=%d\n",
2470 KMALLOC_SHIFT_HIGH, cache_line_size(), 2516 caches, cache_line_size(),
2471 slub_min_order, slub_max_order, slub_min_objects, 2517 slub_min_order, slub_max_order, slub_min_objects,
2472 nr_cpu_ids, nr_node_ids); 2518 nr_cpu_ids, nr_node_ids);
2473} 2519}
@@ -2652,7 +2698,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2652 struct kmem_cache *s = get_slab(size, gfpflags); 2698 struct kmem_cache *s = get_slab(size, gfpflags);
2653 2699
2654 if (!s) 2700 if (!s)
2655 return NULL; 2701 return ZERO_SIZE_PTR;
2656 2702
2657 return slab_alloc(s, gfpflags, -1, caller); 2703 return slab_alloc(s, gfpflags, -1, caller);
2658} 2704}
@@ -2663,7 +2709,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2663 struct kmem_cache *s = get_slab(size, gfpflags); 2709 struct kmem_cache *s = get_slab(size, gfpflags);
2664 2710
2665 if (!s) 2711 if (!s)
2666 return NULL; 2712 return ZERO_SIZE_PTR;
2667 2713
2668 return slab_alloc(s, gfpflags, node, caller); 2714 return slab_alloc(s, gfpflags, node, caller);
2669} 2715}
@@ -2857,7 +2903,7 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max)
2857 2903
2858 order = get_order(sizeof(struct location) * max); 2904 order = get_order(sizeof(struct location) * max);
2859 2905
2860 l = (void *)__get_free_pages(GFP_KERNEL, order); 2906 l = (void *)__get_free_pages(GFP_ATOMIC, order);
2861 2907
2862 if (!l) 2908 if (!l)
2863 return 0; 2909 return 0;
@@ -3022,13 +3068,15 @@ static int list_locations(struct kmem_cache *s, char *buf,
3022 n += sprintf(buf + n, " pid=%ld", 3068 n += sprintf(buf + n, " pid=%ld",
3023 l->min_pid); 3069 l->min_pid);
3024 3070
3025 if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { 3071 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3072 n < PAGE_SIZE - 60) {
3026 n += sprintf(buf + n, " cpus="); 3073 n += sprintf(buf + n, " cpus=");
3027 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3074 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
3028 l->cpus); 3075 l->cpus);
3029 } 3076 }
3030 3077
3031 if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { 3078 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3079 n < PAGE_SIZE - 60) {
3032 n += sprintf(buf + n, " nodes="); 3080 n += sprintf(buf + n, " nodes=");
3033 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3081 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
3034 l->nodes); 3082 l->nodes);
diff --git a/mm/sparse.c b/mm/sparse.c
index 545e4d3afcdf..e03b39f3540f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -240,6 +240,27 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
240 return NULL; 240 return NULL;
241} 241}
242 242
243/*
244 * Allocate the accumulated non-linear sections, allocate a mem_map
245 * for each and record the physical to section mapping.
246 */
247void __init sparse_init(void)
248{
249 unsigned long pnum;
250 struct page *map;
251
252 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
253 if (!valid_section_nr(pnum))
254 continue;
255
256 map = sparse_early_mem_map_alloc(pnum);
257 if (!map)
258 continue;
259 sparse_init_one_section(__nr_to_section(pnum), pnum, map);
260 }
261}
262
263#ifdef CONFIG_MEMORY_HOTPLUG
243static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 264static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
244{ 265{
245 struct page *page, *ret; 266 struct page *page, *ret;
@@ -280,27 +301,6 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
280} 301}
281 302
282/* 303/*
283 * Allocate the accumulated non-linear sections, allocate a mem_map
284 * for each and record the physical to section mapping.
285 */
286void __init sparse_init(void)
287{
288 unsigned long pnum;
289 struct page *map;
290
291 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
292 if (!valid_section_nr(pnum))
293 continue;
294
295 map = sparse_early_mem_map_alloc(pnum);
296 if (!map)
297 continue;
298 sparse_init_one_section(__nr_to_section(pnum), pnum, map);
299 }
300}
301
302#ifdef CONFIG_MEMORY_HOTPLUG
303/*
304 * returns the number of sections whose mem_maps were properly 304 * returns the number of sections whose mem_maps were properly
305 * set. If this is <=0, then that means that the passed-in 305 * set. If this is <=0, then that means that the passed-in
306 * map was not consumed and must be freed. 306 * map was not consumed and must be freed.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5f7cf2a4cb55..925d5c50f18d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
21 21
22/* 22/*
23 * swapper_space is a fiction, retained to simplify the path through 23 * swapper_space is a fiction, retained to simplify the path through
24 * vmscan's shrink_list, to make sync_page look nicer, and to allow 24 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
25 * future use of radix_tree tags in the swap cache. 25 * future use of radix_tree tags in the swap cache.
26 */ 26 */
27static const struct address_space_operations swap_aops = { 27static const struct address_space_operations swap_aops = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index acc172cbe3aa..7ff0a81c7b01 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type)
885 /* 885 /*
886 * So we could skip searching mms once swap count went 886 * So we could skip searching mms once swap count went
887 * to 1, we did not mark any present ptes as dirty: must 887 * to 1, we did not mark any present ptes as dirty: must
888 * mark page dirty so shrink_list will preserve it. 888 * mark page dirty so shrink_page_list will preserve it.
889 */ 889 */
890 SetPageDirty(page); 890 SetPageDirty(page);
891 unlock_page(page); 891 unlock_page(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 4fbe1a2da5fb..7c994f2d6145 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
253} 253}
254EXPORT_SYMBOL(truncate_inode_pages); 254EXPORT_SYMBOL(truncate_inode_pages);
255 255
256/** 256unsigned long __invalidate_mapping_pages(struct address_space *mapping,
257 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode 257 pgoff_t start, pgoff_t end, bool be_atomic)
258 * @mapping: the address_space which holds the pages to invalidate
259 * @start: the offset 'from' which to invalidate
260 * @end: the offset 'to' which to invalidate (inclusive)
261 *
262 * This function only removes the unlocked pages, if you want to
263 * remove all the pages of one inode, you must call truncate_inode_pages.
264 *
265 * invalidate_mapping_pages() will not block on IO activity. It will not
266 * invalidate pages which are dirty, locked, under writeback or mapped into
267 * pagetables.
268 */
269unsigned long invalidate_mapping_pages(struct address_space *mapping,
270 pgoff_t start, pgoff_t end)
271{ 258{
272 struct pagevec pvec; 259 struct pagevec pvec;
273 pgoff_t next = start; 260 pgoff_t next = start;
@@ -308,17 +295,38 @@ unlock:
308 break; 295 break;
309 } 296 }
310 pagevec_release(&pvec); 297 pagevec_release(&pvec);
298 if (likely(!be_atomic))
299 cond_resched();
311 } 300 }
312 return ret; 301 return ret;
313} 302}
303
304/**
305 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
306 * @mapping: the address_space which holds the pages to invalidate
307 * @start: the offset 'from' which to invalidate
308 * @end: the offset 'to' which to invalidate (inclusive)
309 *
310 * This function only removes the unlocked pages, if you want to
311 * remove all the pages of one inode, you must call truncate_inode_pages.
312 *
313 * invalidate_mapping_pages() will not block on IO activity. It will not
314 * invalidate pages which are dirty, locked, under writeback or mapped into
315 * pagetables.
316 */
317unsigned long invalidate_mapping_pages(struct address_space *mapping,
318 pgoff_t start, pgoff_t end)
319{
320 return __invalidate_mapping_pages(mapping, start, end, false);
321}
314EXPORT_SYMBOL(invalidate_mapping_pages); 322EXPORT_SYMBOL(invalidate_mapping_pages);
315 323
316/* 324/*
317 * This is like invalidate_complete_page(), except it ignores the page's 325 * This is like invalidate_complete_page(), except it ignores the page's
318 * refcount. We do this because invalidate_inode_pages2() needs stronger 326 * refcount. We do this because invalidate_inode_pages2() needs stronger
319 * invalidation guarantees, and cannot afford to leave pages behind because 327 * invalidation guarantees, and cannot afford to leave pages behind because
320 * shrink_list() has a temp ref on them, or because they're transiently sitting 328 * shrink_page_list() has a temp ref on them, or because they're transiently
321 * in the lru_cache_add() pagevecs. 329 * sitting in the lru_cache_add() pagevecs.
322 */ 330 */
323static int 331static int
324invalidate_complete_page2(struct address_space *mapping, struct page *page) 332invalidate_complete_page2(struct address_space *mapping, struct page *page)
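
The split keeps the exported invalidate_mapping_pages() interface unchanged while letting in-kernel callers that cannot sleep pass be_atomic to suppress the cond_resched() between pagevec batches. A hedged sketch of both call styles (the surrounding functions are hypothetical, and both entry points are assumed to be declared in <linux/fs.h>):

/* Sketch of how the two entry points above are meant to be used. */
#include <linux/fs.h>

static void drop_clean_cache(struct address_space *mapping)
{
	/* Normal, sleepable context: the wrapper may reschedule between
	 * pagevec batches. */
	invalidate_mapping_pages(mapping, 0, -1);
}

static void drop_clean_cache_atomic(struct address_space *mapping)
{
	/* A caller that must not sleep passes be_atomic=true, which skips
	 * the cond_resched() calls. */
	__invalidate_mapping_pages(mapping, 0, -1, true);
}
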
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 38254297a494..eceaf496210f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -477,8 +477,8 @@ const struct seq_operations fragmentation_op = {
477static const char * const vmstat_text[] = { 477static const char * const vmstat_text[] = {
478 /* Zoned VM counters */ 478 /* Zoned VM counters */
479 "nr_free_pages", 479 "nr_free_pages",
480 "nr_active",
481 "nr_inactive", 480 "nr_inactive",
481 "nr_active",
482 "nr_anon_pages", 482 "nr_anon_pages",
483 "nr_mapped", 483 "nr_mapped",
484 "nr_file_pages", 484 "nr_file_pages",