Diffstat (limited to 'mm')
-rw-r--r--   mm/filemap.c        |  46
-rw-r--r--   mm/highmem.c        |   2
-rw-r--r--   mm/mempool.c        |  35
-rw-r--r--   mm/mmap.c           |  34
-rw-r--r--   mm/mremap.c         |   6
-rw-r--r--   mm/page-writeback.c |   6
-rw-r--r--   mm/page_alloc.c     |  36
-rw-r--r--   mm/rmap.c           | 113
-rw-r--r--   mm/slab.c           |  47
-rw-r--r--   mm/swap_state.c     |  27
-rw-r--r--   mm/truncate.c       |   4
-rw-r--r--   mm/vmalloc.c        |   8
12 files changed, 202 insertions, 162 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 93595c327bbd..d5fdae2eb183 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -123,8 +123,7 @@ void remove_from_page_cache(struct page *page) | |||
123 | { | 123 | { |
124 | struct address_space *mapping = page->mapping; | 124 | struct address_space *mapping = page->mapping; |
125 | 125 | ||
126 | if (unlikely(!PageLocked(page))) | 126 | BUG_ON(!PageLocked(page)); |
127 | PAGE_BUG(page); | ||
128 | 127 | ||
129 | write_lock_irq(&mapping->tree_lock); | 128 | write_lock_irq(&mapping->tree_lock); |
130 | __remove_from_page_cache(page); | 129 | __remove_from_page_cache(page); |
@@ -139,7 +138,25 @@ static int sync_page(void *word) | |||
139 | page = container_of((page_flags_t *)word, struct page, flags); | 138 | page = container_of((page_flags_t *)word, struct page, flags); |
140 | 139 | ||
141 | /* | 140 | /* |
142 | * FIXME, fercrissake. What is this barrier here for? | 141 | * page_mapping() is being called without PG_locked held. |
142 | * Some knowledge of the state and use of the page is used to | ||
143 | * reduce the requirements down to a memory barrier. | ||
144 | * The danger here is of a stale page_mapping() return value | ||
145 | * indicating a struct address_space different from the one it's | ||
146 | * associated with when it is associated with one. | ||
147 | * After smp_mb(), it's either the correct page_mapping() for | ||
148 | * the page, or an old page_mapping() and the page's own | ||
149 | * page_mapping() has gone NULL. | ||
150 | * The ->sync_page() address_space operation must tolerate | ||
151 | * page_mapping() going NULL. By an amazing coincidence, | ||
152 | * this comes about because none of the users of the page | ||
153 | * in the ->sync_page() methods make essential use of the | ||
154 | * page_mapping(), merely passing the page down to the backing | ||
155 | * device's unplug functions when it's non-NULL, which in turn | ||
156 | * ignore it for all cases but swap, where only page->private is | ||
157 | * of interest. When page_mapping() does go NULL, the entire | ||
158 | * call stack gracefully ignores the page and returns. | ||
159 | * -- wli | ||
143 | */ | 160 | */ |
144 | smp_mb(); | 161 | smp_mb(); |
145 | mapping = page_mapping(page); | 162 | mapping = page_mapping(page); |
@@ -152,9 +169,10 @@ static int sync_page(void *word) | |||
152 | /** | 169 | /** |
153 | * filemap_fdatawrite_range - start writeback against all of a mapping's | 170 | * filemap_fdatawrite_range - start writeback against all of a mapping's |
154 | * dirty pages that lie within the byte offsets <start, end> | 171 | * dirty pages that lie within the byte offsets <start, end> |
155 | * @mapping: address space structure to write | 172 | * @mapping: address space structure to write |
156 | * @start: offset in bytes where the range starts | 173 | * @start: offset in bytes where the range starts |
157 | * @end : offset in bytes where the range ends | 174 | * @end: offset in bytes where the range ends |
175 | * @sync_mode: enable synchronous operation | ||
158 | * | 176 | * |
159 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 177 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
160 | * opposed to a regular memory * cleansing writeback. The difference between | 178 | * opposed to a regular memory * cleansing writeback. The difference between |
@@ -518,8 +536,8 @@ EXPORT_SYMBOL(find_trylock_page); | |||
518 | /** | 536 | /** |
519 | * find_lock_page - locate, pin and lock a pagecache page | 537 | * find_lock_page - locate, pin and lock a pagecache page |
520 | * | 538 | * |
521 | * @mapping - the address_space to search | 539 | * @mapping: the address_space to search |
522 | * @offset - the page index | 540 | * @offset: the page index |
523 | * | 541 | * |
524 | * Locates the desired pagecache page, locks it, increments its reference | 542 | * Locates the desired pagecache page, locks it, increments its reference |
525 | * count and returns its address. | 543 | * count and returns its address. |
@@ -558,9 +576,9 @@ EXPORT_SYMBOL(find_lock_page); | |||
558 | /** | 576 | /** |
559 | * find_or_create_page - locate or add a pagecache page | 577 | * find_or_create_page - locate or add a pagecache page |
560 | * | 578 | * |
561 | * @mapping - the page's address_space | 579 | * @mapping: the page's address_space |
562 | * @index - the page's index into the mapping | 580 | * @index: the page's index into the mapping |
563 | * @gfp_mask - page allocation mode | 581 | * @gfp_mask: page allocation mode |
564 | * | 582 | * |
565 | * Locates a page in the pagecache. If the page is not present, a new page | 583 | * Locates a page in the pagecache. If the page is not present, a new page |
566 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's | 584 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's |
@@ -1949,7 +1967,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1949 | buf = iov->iov_base + written; | 1967 | buf = iov->iov_base + written; |
1950 | else { | 1968 | else { |
1951 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | 1969 | filemap_set_next_iovec(&cur_iov, &iov_base, written); |
1952 | buf = iov->iov_base + iov_base; | 1970 | buf = cur_iov->iov_base + iov_base; |
1953 | } | 1971 | } |
1954 | 1972 | ||
1955 | do { | 1973 | do { |
@@ -2007,9 +2025,11 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2007 | count -= status; | 2025 | count -= status; |
2008 | pos += status; | 2026 | pos += status; |
2009 | buf += status; | 2027 | buf += status; |
2010 | if (unlikely(nr_segs > 1)) | 2028 | if (unlikely(nr_segs > 1)) { |
2011 | filemap_set_next_iovec(&cur_iov, | 2029 | filemap_set_next_iovec(&cur_iov, |
2012 | &iov_base, status); | 2030 | &iov_base, status); |
2031 | buf = cur_iov->iov_base + iov_base; | ||
2032 | } | ||
2013 | } | 2033 | } |
2014 | } | 2034 | } |
2015 | if (unlikely(copied != bytes)) | 2035 | if (unlikely(copied != bytes)) |
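Note on the filemap.c change above: after a partial copy, generic_file_buffered_write() must recompute its source pointer from the iovec segment it has actually advanced to (cur_iov plus the in-segment offset), not from the original iov pointer; that is what both hunks fix. The user-space sketch below shows the same bookkeeping. It is illustrative only, not kernel code: set_next_iovec() and the 4-byte "partial copy" are invented for the example.

/*
 * Illustrative sketch only (not kernel code): walking an iovec array after
 * partial copies, mirroring the bookkeeping generic_file_buffered_write()
 * needs. set_next_iovec() and the 4-byte "partial copy" are invented here.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Skip whole segments already consumed; leave an offset into the current one. */
static void set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
    const struct iovec *iov = *iovp;
    size_t base = *basep;

    while (bytes) {
        size_t copy = iov->iov_len - base;

        if (copy > bytes)
            copy = bytes;
        bytes -= copy;
        base += copy;
        if (base == iov->iov_len) {
            iov++;
            base = 0;
        }
    }
    *iovp = iov;
    *basep = base;
}

int main(void)
{
    char a[] = "hello ", b[] = "world";
    struct iovec vec[2] = {
        { .iov_base = a, .iov_len = 6 },
        { .iov_base = b, .iov_len = 5 },
    };
    const struct iovec *cur_iov = vec;
    size_t iov_base = 0, written = 0, total = 11;
    char dst[16] = "";

    while (written < total) {
        size_t chunk = total - written < 4 ? total - written : 4;
        size_t space = cur_iov->iov_len - iov_base;
        /* The bug fixed above: buf must come from cur_iov, not from vec[0]. */
        const char *buf = (const char *)cur_iov->iov_base + iov_base;

        if (chunk > space)
            chunk = space;
        memcpy(dst + written, buf, chunk);
        written += chunk;
        set_next_iovec(&cur_iov, &iov_base, chunk);
    }
    printf("%s\n", dst);    /* prints "hello world" */
    return 0;
}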
diff --git a/mm/highmem.c b/mm/highmem.c
index d01276506b00..400911599468 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -325,6 +325,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | |||
325 | continue; | 325 | continue; |
326 | 326 | ||
327 | mempool_free(bvec->bv_page, pool); | 327 | mempool_free(bvec->bv_page, pool); |
328 | dec_page_state(nr_bounce); | ||
328 | } | 329 | } |
329 | 330 | ||
330 | bio_endio(bio_orig, bio_orig->bi_size, err); | 331 | bio_endio(bio_orig, bio_orig->bi_size, err); |
@@ -405,6 +406,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
405 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | 406 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); |
406 | to->bv_len = from->bv_len; | 407 | to->bv_len = from->bv_len; |
407 | to->bv_offset = from->bv_offset; | 408 | to->bv_offset = from->bv_offset; |
409 | inc_page_state(nr_bounce); | ||
408 | 410 | ||
409 | if (rw == WRITE) { | 411 | if (rw == WRITE) { |
410 | char *vto, *vfrom; | 412 | char *vto, *vfrom; |
diff --git a/mm/mempool.c b/mm/mempool.c
index b014ffeaa413..c9f3d4620428 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -198,31 +198,22 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) | |||
198 | void *element; | 198 | void *element; |
199 | unsigned long flags; | 199 | unsigned long flags; |
200 | DEFINE_WAIT(wait); | 200 | DEFINE_WAIT(wait); |
201 | int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); | 201 | int gfp_temp; |
202 | 202 | ||
203 | might_sleep_if(gfp_mask & __GFP_WAIT); | 203 | might_sleep_if(gfp_mask & __GFP_WAIT); |
204 | |||
205 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ | ||
206 | gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ | ||
207 | gfp_mask |= __GFP_NOWARN; /* failures are OK */ | ||
208 | |||
209 | gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); | ||
210 | |||
204 | repeat_alloc: | 211 | repeat_alloc: |
205 | element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data); | 212 | |
213 | element = pool->alloc(gfp_temp, pool->pool_data); | ||
206 | if (likely(element != NULL)) | 214 | if (likely(element != NULL)) |
207 | return element; | 215 | return element; |
208 | 216 | ||
209 | /* | ||
210 | * If the pool is less than 50% full and we can perform effective | ||
211 | * page reclaim then try harder to allocate an element. | ||
212 | */ | ||
213 | mb(); | ||
214 | if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) && | ||
215 | (pool->curr_nr <= pool->min_nr/2)) { | ||
216 | element = pool->alloc(gfp_mask, pool->pool_data); | ||
217 | if (likely(element != NULL)) | ||
218 | return element; | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * Kick the VM at this point. | ||
223 | */ | ||
224 | wakeup_bdflush(0); | ||
225 | |||
226 | spin_lock_irqsave(&pool->lock, flags); | 217 | spin_lock_irqsave(&pool->lock, flags); |
227 | if (likely(pool->curr_nr)) { | 218 | if (likely(pool->curr_nr)) { |
228 | element = remove_element(pool); | 219 | element = remove_element(pool); |
@@ -235,8 +226,10 @@ repeat_alloc: | |||
235 | if (!(gfp_mask & __GFP_WAIT)) | 226 | if (!(gfp_mask & __GFP_WAIT)) |
236 | return NULL; | 227 | return NULL; |
237 | 228 | ||
229 | /* Now start performing page reclaim */ | ||
230 | gfp_temp = gfp_mask; | ||
238 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | 231 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); |
239 | mb(); | 232 | smp_mb(); |
240 | if (!pool->curr_nr) | 233 | if (!pool->curr_nr) |
241 | io_schedule(); | 234 | io_schedule(); |
242 | finish_wait(&pool->wait, &wait); | 235 | finish_wait(&pool->wait, &wait); |
@@ -257,7 +250,7 @@ void mempool_free(void *element, mempool_t *pool) | |||
257 | { | 250 | { |
258 | unsigned long flags; | 251 | unsigned long flags; |
259 | 252 | ||
260 | mb(); | 253 | smp_mb(); |
261 | if (pool->curr_nr < pool->min_nr) { | 254 | if (pool->curr_nr < pool->min_nr) { |
262 | spin_lock_irqsave(&pool->lock, flags); | 255 | spin_lock_irqsave(&pool->lock, flags); |
263 | if (pool->curr_nr < pool->min_nr) { | 256 | if (pool->curr_nr < pool->min_nr) { |
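Note on the mempool.c change above: mempool_alloc() now always ORs in __GFP_NOMEMALLOC, __GFP_NORETRY and __GFP_NOWARN, makes a first attempt with __GFP_WAIT and __GFP_IO masked off so it fails fast, falls back to the reserved pool, and only retries with the full mask (allowing reclaim) after a caller that may sleep has found the pool empty. The user-space mock below models just that control flow; the flag values, backing_alloc() and the pool array are made up for the example.

/*
 * Toy model of the two-stage mempool_alloc() flow above. The flag values
 * and the failing "backing allocator" are invented for illustration.
 */
#include <stdio.h>
#include <stdlib.h>

#define GFP_WAIT        0x1     /* caller may sleep / reclaim */
#define GFP_IO          0x2
#define GFP_NORETRY     0x4
#define GFP_NOWARN      0x8
#define GFP_NOMEMALLOC  0x10

static void *pool[4];
static int curr_nr;

/* Backing allocator: in this mock it only succeeds when reclaim is allowed. */
static void *backing_alloc(int gfp)
{
    return (gfp & GFP_WAIT) ? malloc(32) : NULL;
}

static void *mock_mempool_alloc(int gfp_mask)
{
    int gfp_temp;
    void *element;

    gfp_mask |= GFP_NOMEMALLOC | GFP_NORETRY | GFP_NOWARN;
    gfp_temp = gfp_mask & ~(GFP_WAIT | GFP_IO);     /* first pass: fail fast */

    for (;;) {
        element = backing_alloc(gfp_temp);
        if (element)
            return element;
        if (curr_nr)                    /* fall back to the reserve pool */
            return pool[--curr_nr];
        if (!(gfp_mask & GFP_WAIT))     /* atomic caller: give up */
            return NULL;
        gfp_temp = gfp_mask;            /* now allow reclaim and retry */
    }
}

int main(void)
{
    pool[curr_nr++] = malloc(32);
    printf("atomic alloc:  %p\n", mock_mempool_alloc(0));        /* pool element */
    printf("atomic alloc:  %p\n", mock_mempool_alloc(0));        /* NULL: pool empty */
    printf("waiting alloc: %p\n", mock_mempool_alloc(GFP_WAIT)); /* reclaim path */
    return 0;
}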
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -937,9 +937,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
937 | /* mlock MCL_FUTURE? */ | 937 | /* mlock MCL_FUTURE? */ |
938 | if (vm_flags & VM_LOCKED) { | 938 | if (vm_flags & VM_LOCKED) { |
939 | unsigned long locked, lock_limit; | 939 | unsigned long locked, lock_limit; |
940 | locked = mm->locked_vm << PAGE_SHIFT; | 940 | locked = len >> PAGE_SHIFT; |
941 | locked += mm->locked_vm; | ||
941 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 942 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
942 | locked += len; | 943 | lock_limit >>= PAGE_SHIFT; |
943 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 944 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
944 | return -EAGAIN; | 945 | return -EAGAIN; |
945 | } | 946 | } |
@@ -1009,8 +1010,7 @@ munmap_back: | |||
1009 | } | 1010 | } |
1010 | 1011 | ||
1011 | /* Check against address space limit. */ | 1012 | /* Check against address space limit. */ |
1012 | if ((mm->total_vm << PAGE_SHIFT) + len | 1013 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
1013 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
1014 | return -ENOMEM; | 1014 | return -ENOMEM; |
1015 | 1015 | ||
1016 | if (accountable && (!(flags & MAP_NORESERVE) || | 1016 | if (accountable && (!(flags & MAP_NORESERVE) || |
@@ -1421,7 +1421,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
1421 | struct rlimit *rlim = current->signal->rlim; | 1421 | struct rlimit *rlim = current->signal->rlim; |
1422 | 1422 | ||
1423 | /* address space limit tests */ | 1423 | /* address space limit tests */ |
1424 | if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) | 1424 | if (!may_expand_vm(mm, grow)) |
1425 | return -ENOMEM; | 1425 | return -ENOMEM; |
1426 | 1426 | ||
1427 | /* Stack limit test */ | 1427 | /* Stack limit test */ |
@@ -1823,9 +1823,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1823 | */ | 1823 | */ |
1824 | if (mm->def_flags & VM_LOCKED) { | 1824 | if (mm->def_flags & VM_LOCKED) { |
1825 | unsigned long locked, lock_limit; | 1825 | unsigned long locked, lock_limit; |
1826 | locked = mm->locked_vm << PAGE_SHIFT; | 1826 | locked = len >> PAGE_SHIFT; |
1827 | locked += mm->locked_vm; | ||
1827 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 1828 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
1828 | locked += len; | 1829 | lock_limit >>= PAGE_SHIFT; |
1829 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 1830 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
1830 | return -EAGAIN; | 1831 | return -EAGAIN; |
1831 | } | 1832 | } |
@@ -1848,8 +1849,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1848 | } | 1849 | } |
1849 | 1850 | ||
1850 | /* Check against address space limits *after* clearing old maps... */ | 1851 | /* Check against address space limits *after* clearing old maps... */ |
1851 | if ((mm->total_vm << PAGE_SHIFT) + len | 1852 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
1852 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
1853 | return -ENOMEM; | 1853 | return -ENOMEM; |
1854 | 1854 | ||
1855 | if (mm->map_count > sysctl_max_map_count) | 1855 | if (mm->map_count > sysctl_max_map_count) |
@@ -2019,3 +2019,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2019 | } | 2019 | } |
2020 | return new_vma; | 2020 | return new_vma; |
2021 | } | 2021 | } |
2022 | |||
2023 | /* | ||
2024 | * Return true if the calling process may expand its vm space by the passed | ||
2025 | * number of pages | ||
2026 | */ | ||
2027 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) | ||
2028 | { | ||
2029 | unsigned long cur = mm->total_vm; /* pages */ | ||
2030 | unsigned long lim; | ||
2031 | |||
2032 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | ||
2033 | |||
2034 | if (cur + npages > lim) | ||
2035 | return 0; | ||
2036 | return 1; | ||
2037 | } | ||
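Note on the new may_expand_vm() helper above: it centralizes the RLIMIT_AS test that do_mmap_pgoff(), do_brk() and acct_stack_growth() used to open-code (and that the mremap.c hunk below also switches to), and it compares in pages rather than shifting total_vm up by PAGE_SHIFT. The user-space sketch below applies the same page-based comparison to the calling process's own RLIMIT_AS; may_expand() and its arguments are illustrative stand-ins, not kernel code.

/*
 * Illustrative only: the same "compare in pages" idea as may_expand_vm(),
 * applied from user space to this process's own RLIMIT_AS.
 */
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static int may_expand(unsigned long cur_pages, unsigned long npages)
{
    struct rlimit rl;
    unsigned long long lim_pages;
    long page_size = sysconf(_SC_PAGESIZE);

    if (getrlimit(RLIMIT_AS, &rl) != 0)
        return 0;
    if (rl.rlim_cur == RLIM_INFINITY)
        return 1;
    lim_pages = rl.rlim_cur / (unsigned long long)page_size;
    return cur_pages + npages <= lim_pages;     /* same test as the kernel helper */
}

int main(void)
{
    /* Pretend the process already maps 10000 pages and wants 256 more. */
    printf("may expand: %d\n", may_expand(10000, 256));
    return 0;
}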
diff --git a/mm/mremap.c b/mm/mremap.c
index 0d1c1b9c7a0a..0dd7ace94e51 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -347,10 +347,10 @@ unsigned long do_mremap(unsigned long addr, | |||
347 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 347 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
348 | goto out; | 348 | goto out; |
349 | } | 349 | } |
350 | ret = -ENOMEM; | 350 | if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { |
351 | if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) | 351 | ret = -ENOMEM; |
352 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
353 | goto out; | 352 | goto out; |
353 | } | ||
354 | 354 | ||
355 | if (vma->vm_flags & VM_ACCOUNT) { | 355 | if (vma->vm_flags & VM_ACCOUNT) { |
356 | charged = (new_len - old_len) >> PAGE_SHIFT; | 356 | charged = (new_len - old_len) >> PAGE_SHIFT; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6ddd6a29c73b..613b99a55917 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -255,7 +255,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
255 | 255 | ||
256 | /** | 256 | /** |
257 | * balance_dirty_pages_ratelimited - balance dirty memory state | 257 | * balance_dirty_pages_ratelimited - balance dirty memory state |
258 | * @mapping - address_space which was dirtied | 258 | * @mapping: address_space which was dirtied |
259 | * | 259 | * |
260 | * Processes which are dirtying memory should call in here once for each page | 260 | * Processes which are dirtying memory should call in here once for each page |
261 | * which was newly dirtied. The function will periodically check the system's | 261 | * which was newly dirtied. The function will periodically check the system's |
@@ -562,8 +562,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
562 | /** | 562 | /** |
563 | * write_one_page - write out a single page and optionally wait on I/O | 563 | * write_one_page - write out a single page and optionally wait on I/O |
564 | * | 564 | * |
565 | * @page - the page to write | 565 | * @page: the page to write |
566 | * @wait - if true, wait on writeout | 566 | * @wait: if true, wait on writeout |
567 | * | 567 | * |
568 | * The page must be locked by the caller and will be unlocked upon return. | 568 | * The page must be locked by the caller and will be unlocked upon return. |
569 | * | 569 | * |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c73dbbc1cd8f..fc1b1064c505 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -799,14 +799,18 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, | |||
799 | } | 799 | } |
800 | 800 | ||
801 | /* This allocation should allow future memory freeing. */ | 801 | /* This allocation should allow future memory freeing. */ |
802 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { | 802 | |
803 | /* go through the zonelist yet again, ignoring mins */ | 803 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
804 | for (i = 0; (z = zones[i]) != NULL; i++) { | 804 | && !in_interrupt()) { |
805 | if (!cpuset_zone_allowed(z)) | 805 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
806 | continue; | 806 | /* go through the zonelist yet again, ignoring mins */ |
807 | page = buffered_rmqueue(z, order, gfp_mask); | 807 | for (i = 0; (z = zones[i]) != NULL; i++) { |
808 | if (page) | 808 | if (!cpuset_zone_allowed(z)) |
809 | goto got_pg; | 809 | continue; |
810 | page = buffered_rmqueue(z, order, gfp_mask); | ||
811 | if (page) | ||
812 | goto got_pg; | ||
813 | } | ||
810 | } | 814 | } |
811 | goto nopage; | 815 | goto nopage; |
812 | } | 816 | } |
@@ -1351,8 +1355,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli | |||
1351 | #define MAX_NODE_LOAD (num_online_nodes()) | 1355 | #define MAX_NODE_LOAD (num_online_nodes()) |
1352 | static int __initdata node_load[MAX_NUMNODES]; | 1356 | static int __initdata node_load[MAX_NUMNODES]; |
1353 | /** | 1357 | /** |
1354 | * find_next_best_node - find the next node that should appear in a given | 1358 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1355 | * node's fallback list | ||
1356 | * @node: node whose fallback list we're appending | 1359 | * @node: node whose fallback list we're appending |
1357 | * @used_node_mask: nodemask_t of already used nodes | 1360 | * @used_node_mask: nodemask_t of already used nodes |
1358 | * | 1361 | * |
@@ -1671,6 +1674,18 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1671 | if (batch < 1) | 1674 | if (batch < 1) |
1672 | batch = 1; | 1675 | batch = 1; |
1673 | 1676 | ||
1677 | /* | ||
1678 | * Clamp the batch to a 2^n - 1 value. Having a power | ||
1679 | * of 2 value was found to be more likely to have | ||
1680 | * suboptimal cache aliasing properties in some cases. | ||
1681 | * | ||
1682 | * For example if 2 tasks are alternately allocating | ||
1683 | * batches of pages, one task can end up with a lot | ||
1684 | * of pages of one half of the possible page colors | ||
1685 | * and the other with pages of the other colors. | ||
1686 | */ | ||
1687 | batch = (1 << fls(batch + batch/2)) - 1; | ||
1688 | |||
1674 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1689 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
1675 | struct per_cpu_pages *pcp; | 1690 | struct per_cpu_pages *pcp; |
1676 | 1691 | ||
@@ -1881,6 +1896,7 @@ static char *vmstat_text[] = { | |||
1881 | "allocstall", | 1896 | "allocstall", |
1882 | 1897 | ||
1883 | "pgrotated", | 1898 | "pgrotated", |
1899 | "nr_bounce", | ||
1884 | }; | 1900 | }; |
1885 | 1901 | ||
1886 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | 1902 | static void *vmstat_start(struct seq_file *m, loff_t *pos) |
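Note on the per-cpu pageset change above: the batch size is now rounded to a 2^n - 1 value with (1 << fls(batch + batch/2)) - 1, so two tasks allocating in alternating batches are less likely to end up with disjoint halves of the page colors. The short program below just evaluates that expression for a few inputs; fls() is re-implemented from __builtin_clz() since the real one is a kernel helper.

/*
 * Evaluates the clamp expression added above. fls() here is a stand-in for
 * the kernel helper: index of the highest set bit, 1-based, 0 for zero.
 */
#include <stdio.h>

static int fls(unsigned int x)
{
    return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
    int examples[] = { 1, 4, 10, 16, 31, 63, 100 };
    unsigned int i;

    for (i = 0; i < sizeof(examples) / sizeof(examples[0]); i++) {
        int batch = examples[i];
        int clamped = (1 << fls(batch + batch / 2)) - 1;

        printf("batch %3d -> %3d\n", batch, clamped);   /* always 2^n - 1 */
    }
    return 0;
}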
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -243,6 +243,42 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
243 | } | 243 | } |
244 | 244 | ||
245 | /* | 245 | /* |
246 | * Check that @page is mapped at @address into @mm. | ||
247 | * | ||
248 | * On success returns with mapped pte and locked mm->page_table_lock. | ||
249 | */ | ||
250 | static pte_t *page_check_address(struct page *page, struct mm_struct *mm, | ||
251 | unsigned long address) | ||
252 | { | ||
253 | pgd_t *pgd; | ||
254 | pud_t *pud; | ||
255 | pmd_t *pmd; | ||
256 | pte_t *pte; | ||
257 | |||
258 | /* | ||
259 | * We need the page_table_lock to protect us from page faults, | ||
260 | * munmap, fork, etc... | ||
261 | */ | ||
262 | spin_lock(&mm->page_table_lock); | ||
263 | pgd = pgd_offset(mm, address); | ||
264 | if (likely(pgd_present(*pgd))) { | ||
265 | pud = pud_offset(pgd, address); | ||
266 | if (likely(pud_present(*pud))) { | ||
267 | pmd = pmd_offset(pud, address); | ||
268 | if (likely(pmd_present(*pmd))) { | ||
269 | pte = pte_offset_map(pmd, address); | ||
270 | if (likely(pte_present(*pte) && | ||
271 | page_to_pfn(page) == pte_pfn(*pte))) | ||
272 | return pte; | ||
273 | pte_unmap(pte); | ||
274 | } | ||
275 | } | ||
276 | } | ||
277 | spin_unlock(&mm->page_table_lock); | ||
278 | return ERR_PTR(-ENOENT); | ||
279 | } | ||
280 | |||
281 | /* | ||
246 | * Subfunctions of page_referenced: page_referenced_one called | 282 | * Subfunctions of page_referenced: page_referenced_one called |
247 | * repeatedly from either page_referenced_anon or page_referenced_file. | 283 | * repeatedly from either page_referenced_anon or page_referenced_file. |
248 | */ | 284 | */ |
@@ -251,9 +287,6 @@ static int page_referenced_one(struct page *page, | |||
251 | { | 287 | { |
252 | struct mm_struct *mm = vma->vm_mm; | 288 | struct mm_struct *mm = vma->vm_mm; |
253 | unsigned long address; | 289 | unsigned long address; |
254 | pgd_t *pgd; | ||
255 | pud_t *pud; | ||
256 | pmd_t *pmd; | ||
257 | pte_t *pte; | 290 | pte_t *pte; |
258 | int referenced = 0; | 291 | int referenced = 0; |
259 | 292 | ||
@@ -263,39 +296,18 @@ static int page_referenced_one(struct page *page, | |||
263 | if (address == -EFAULT) | 296 | if (address == -EFAULT) |
264 | goto out; | 297 | goto out; |
265 | 298 | ||
266 | spin_lock(&mm->page_table_lock); | 299 | pte = page_check_address(page, mm, address); |
267 | 300 | if (!IS_ERR(pte)) { | |
268 | pgd = pgd_offset(mm, address); | 301 | if (ptep_clear_flush_young(vma, address, pte)) |
269 | if (!pgd_present(*pgd)) | 302 | referenced++; |
270 | goto out_unlock; | ||
271 | |||
272 | pud = pud_offset(pgd, address); | ||
273 | if (!pud_present(*pud)) | ||
274 | goto out_unlock; | ||
275 | |||
276 | pmd = pmd_offset(pud, address); | ||
277 | if (!pmd_present(*pmd)) | ||
278 | goto out_unlock; | ||
279 | |||
280 | pte = pte_offset_map(pmd, address); | ||
281 | if (!pte_present(*pte)) | ||
282 | goto out_unmap; | ||
283 | |||
284 | if (page_to_pfn(page) != pte_pfn(*pte)) | ||
285 | goto out_unmap; | ||
286 | |||
287 | if (ptep_clear_flush_young(vma, address, pte)) | ||
288 | referenced++; | ||
289 | |||
290 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) | ||
291 | referenced++; | ||
292 | 303 | ||
293 | (*mapcount)--; | 304 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) |
305 | referenced++; | ||
294 | 306 | ||
295 | out_unmap: | 307 | (*mapcount)--; |
296 | pte_unmap(pte); | 308 | pte_unmap(pte); |
297 | out_unlock: | 309 | spin_unlock(&mm->page_table_lock); |
298 | spin_unlock(&mm->page_table_lock); | 310 | } |
299 | out: | 311 | out: |
300 | return referenced; | 312 | return referenced; |
301 | } | 313 | } |
@@ -502,9 +514,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
502 | { | 514 | { |
503 | struct mm_struct *mm = vma->vm_mm; | 515 | struct mm_struct *mm = vma->vm_mm; |
504 | unsigned long address; | 516 | unsigned long address; |
505 | pgd_t *pgd; | ||
506 | pud_t *pud; | ||
507 | pmd_t *pmd; | ||
508 | pte_t *pte; | 517 | pte_t *pte; |
509 | pte_t pteval; | 518 | pte_t pteval; |
510 | int ret = SWAP_AGAIN; | 519 | int ret = SWAP_AGAIN; |
@@ -515,30 +524,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
515 | if (address == -EFAULT) | 524 | if (address == -EFAULT) |
516 | goto out; | 525 | goto out; |
517 | 526 | ||
518 | /* | 527 | pte = page_check_address(page, mm, address); |
519 | * We need the page_table_lock to protect us from page faults, | 528 | if (IS_ERR(pte)) |
520 | * munmap, fork, etc... | 529 | goto out; |
521 | */ | ||
522 | spin_lock(&mm->page_table_lock); | ||
523 | |||
524 | pgd = pgd_offset(mm, address); | ||
525 | if (!pgd_present(*pgd)) | ||
526 | goto out_unlock; | ||
527 | |||
528 | pud = pud_offset(pgd, address); | ||
529 | if (!pud_present(*pud)) | ||
530 | goto out_unlock; | ||
531 | |||
532 | pmd = pmd_offset(pud, address); | ||
533 | if (!pmd_present(*pmd)) | ||
534 | goto out_unlock; | ||
535 | |||
536 | pte = pte_offset_map(pmd, address); | ||
537 | if (!pte_present(*pte)) | ||
538 | goto out_unmap; | ||
539 | |||
540 | if (page_to_pfn(page) != pte_pfn(*pte)) | ||
541 | goto out_unmap; | ||
542 | 530 | ||
543 | /* | 531 | /* |
544 | * If the page is mlock()d, we cannot swap it out. | 532 | * If the page is mlock()d, we cannot swap it out. |
@@ -604,7 +592,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
604 | 592 | ||
605 | out_unmap: | 593 | out_unmap: |
606 | pte_unmap(pte); | 594 | pte_unmap(pte); |
607 | out_unlock: | ||
608 | spin_unlock(&mm->page_table_lock); | 595 | spin_unlock(&mm->page_table_lock); |
609 | out: | 596 | out: |
610 | return ret; | 597 | return ret; |
@@ -708,7 +695,6 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
708 | } | 695 | } |
709 | 696 | ||
710 | pte_unmap(pte); | 697 | pte_unmap(pte); |
711 | |||
712 | out_unlock: | 698 | out_unlock: |
713 | spin_unlock(&mm->page_table_lock); | 699 | spin_unlock(&mm->page_table_lock); |
714 | } | 700 | } |
@@ -860,3 +846,4 @@ int try_to_unmap(struct page *page) | |||
860 | ret = SWAP_SUCCESS; | 846 | ret = SWAP_SUCCESS; |
861 | return ret; | 847 | return ret; |
862 | } | 848 | } |
849 | |||
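Note on the rmap.c refactor above: page_check_address() folds the pgd/pud/pmd/pte walk into one helper that returns the mapped pte (with mm->page_table_lock held) on success and ERR_PTR(-ENOENT) on failure, which lets the callers test IS_ERR() and drop most of the goto labels. The sketch below models the ERR_PTR()/IS_ERR()/PTR_ERR() idiom itself with simplified macros; lookup() is a made-up stand-in, and the real definitions live in include/linux/err.h.

/*
 * Simplified model of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() idiom, as
 * used by page_check_address() above. Error codes are encoded in the last
 * page of the address space, so a returned pointer can carry either a
 * valid address or a small negative errno.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
    return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
    return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical lookup standing in for page_check_address(): on success it
 * would return the object (with a lock held, in the kernel case). */
static void *lookup(int exists)
{
    static int object = 42;

    if (!exists)
        return ERR_PTR(-ENOENT);
    return &object;
}

int main(void)
{
    void *p = lookup(0);

    if (IS_ERR(p))
        printf("lookup failed: errno %ld\n", -PTR_ERR(p));

    p = lookup(1);
    if (!IS_ERR(p))
        printf("lookup succeeded: value %d\n", *(int *)p);
    return 0;
}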
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -583,7 +583,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
583 | return cachep->array[smp_processor_id()]; | 583 | return cachep->array[smp_processor_id()]; |
584 | } | 584 | } |
585 | 585 | ||
586 | static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | 586 | static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) |
587 | { | 587 | { |
588 | struct cache_sizes *csizep = malloc_sizes; | 588 | struct cache_sizes *csizep = malloc_sizes; |
589 | 589 | ||
@@ -607,6 +607,12 @@ static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | |||
607 | return csizep->cs_cachep; | 607 | return csizep->cs_cachep; |
608 | } | 608 | } |
609 | 609 | ||
610 | kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | ||
611 | { | ||
612 | return __find_general_cachep(size, gfpflags); | ||
613 | } | ||
614 | EXPORT_SYMBOL(kmem_find_general_cachep); | ||
615 | |||
610 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ | 616 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ |
611 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | 617 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, |
612 | int flags, size_t *left_over, unsigned int *num) | 618 | int flags, size_t *left_over, unsigned int *num) |
@@ -672,14 +678,11 @@ static struct array_cache *alloc_arraycache(int cpu, int entries, | |||
672 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); | 678 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); |
673 | struct array_cache *nc = NULL; | 679 | struct array_cache *nc = NULL; |
674 | 680 | ||
675 | if (cpu != -1) { | 681 | if (cpu == -1) |
676 | kmem_cache_t *cachep; | ||
677 | cachep = kmem_find_general_cachep(memsize, GFP_KERNEL); | ||
678 | if (cachep) | ||
679 | nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu)); | ||
680 | } | ||
681 | if (!nc) | ||
682 | nc = kmalloc(memsize, GFP_KERNEL); | 682 | nc = kmalloc(memsize, GFP_KERNEL); |
683 | else | ||
684 | nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu)); | ||
685 | |||
683 | if (nc) { | 686 | if (nc) { |
684 | nc->avail = 0; | 687 | nc->avail = 0; |
685 | nc->limit = entries; | 688 | nc->limit = entries; |
@@ -1663,7 +1666,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
1663 | } | 1666 | } |
1664 | 1667 | ||
1665 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 1668 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) |
1666 | synchronize_kernel(); | 1669 | synchronize_rcu(); |
1667 | 1670 | ||
1668 | /* no cpu_online check required here since we clear the percpu | 1671 | /* no cpu_online check required here since we clear the percpu |
1669 | * array on cpu offline and set this to NULL. | 1672 | * array on cpu offline and set this to NULL. |
@@ -2361,7 +2364,7 @@ out: | |||
2361 | * and can sleep. And it will allocate memory on the given node, which | 2364 | * and can sleep. And it will allocate memory on the given node, which |
2362 | * can improve the performance for cpu bound structures. | 2365 | * can improve the performance for cpu bound structures. |
2363 | */ | 2366 | */ |
2364 | void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) | 2367 | void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) |
2365 | { | 2368 | { |
2366 | int loop; | 2369 | int loop; |
2367 | void *objp; | 2370 | void *objp; |
@@ -2393,7 +2396,7 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) | |||
2393 | spin_unlock_irq(&cachep->spinlock); | 2396 | spin_unlock_irq(&cachep->spinlock); |
2394 | 2397 | ||
2395 | local_irq_disable(); | 2398 | local_irq_disable(); |
2396 | if (!cache_grow(cachep, GFP_KERNEL, nodeid)) { | 2399 | if (!cache_grow(cachep, flags, nodeid)) { |
2397 | local_irq_enable(); | 2400 | local_irq_enable(); |
2398 | return NULL; | 2401 | return NULL; |
2399 | } | 2402 | } |
@@ -2435,6 +2438,16 @@ got_slabp: | |||
2435 | } | 2438 | } |
2436 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 2439 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
2437 | 2440 | ||
2441 | void *kmalloc_node(size_t size, int flags, int node) | ||
2442 | { | ||
2443 | kmem_cache_t *cachep; | ||
2444 | |||
2445 | cachep = kmem_find_general_cachep(size, flags); | ||
2446 | if (unlikely(cachep == NULL)) | ||
2447 | return NULL; | ||
2448 | return kmem_cache_alloc_node(cachep, flags, node); | ||
2449 | } | ||
2450 | EXPORT_SYMBOL(kmalloc_node); | ||
2438 | #endif | 2451 | #endif |
2439 | 2452 | ||
2440 | /** | 2453 | /** |
@@ -2462,7 +2475,12 @@ void *__kmalloc(size_t size, unsigned int __nocast flags) | |||
2462 | { | 2475 | { |
2463 | kmem_cache_t *cachep; | 2476 | kmem_cache_t *cachep; |
2464 | 2477 | ||
2465 | cachep = kmem_find_general_cachep(size, flags); | 2478 | /* If you want to save a few bytes .text space: replace |
2479 | * __ with kmem_. | ||
2480 | * Then kmalloc uses the uninlined functions instead of the inline | ||
2481 | * functions. | ||
2482 | */ | ||
2483 | cachep = __find_general_cachep(size, flags); | ||
2466 | if (unlikely(cachep == NULL)) | 2484 | if (unlikely(cachep == NULL)) |
2467 | return NULL; | 2485 | return NULL; |
2468 | return __cache_alloc(cachep, flags); | 2486 | return __cache_alloc(cachep, flags); |
@@ -2489,9 +2507,8 @@ void *__alloc_percpu(size_t size, size_t align) | |||
2489 | for (i = 0; i < NR_CPUS; i++) { | 2507 | for (i = 0; i < NR_CPUS; i++) { |
2490 | if (!cpu_possible(i)) | 2508 | if (!cpu_possible(i)) |
2491 | continue; | 2509 | continue; |
2492 | pdata->ptrs[i] = kmem_cache_alloc_node( | 2510 | pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, |
2493 | kmem_find_general_cachep(size, GFP_KERNEL), | 2511 | cpu_to_node(i)); |
2494 | cpu_to_node(i)); | ||
2495 | 2512 | ||
2496 | if (!pdata->ptrs[i]) | 2513 | if (!pdata->ptrs[i]) |
2497 | goto unwind_oom; | 2514 | goto unwind_oom; |
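Note on the slab.c change above: the old inline kmem_find_general_cachep() becomes __find_general_cachep(), and a thin exported wrapper plus kmalloc_node() are layered on top, so internal callers keep the inlined fast path while other code links against a real symbol. The sketch below shows that "static inline helper plus out-of-line wrapper" shape with generic names; none of it is the kernel's actual slab code.

/*
 * Sketch of the pattern used above in slab.c: a static inline helper for
 * hot internal callers, and an out-of-line wrapper for external users.
 * Names and the caches[] table are invented for the example.
 */
#include <stddef.h>
#include <stdio.h>

struct cache { size_t size; const char *name; };

static struct cache caches[] = {
    { 32, "size-32" }, { 64, "size-64" }, { 128, "size-128" }, { 0, NULL },
};

/* Internal fast path: intended to be inlined into callers in this file. */
static inline struct cache *__find_cache(size_t size)
{
    struct cache *c = caches;

    while (c->size && c->size < size)
        c++;
    return c->size ? c : NULL;
}

/* Out-of-line wrapper: the symbol other code would actually link against. */
struct cache *find_cache(size_t size)
{
    return __find_cache(size);
}

int main(void)
{
    struct cache *c = find_cache(48);

    printf("%s\n", c ? c->name : "(none)");    /* prints "size-64" */
    return 0;
}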
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a063a902ed03..4f251775ef90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -143,7 +143,6 @@ void __delete_from_swap_cache(struct page *page) | |||
143 | int add_to_swap(struct page * page) | 143 | int add_to_swap(struct page * page) |
144 | { | 144 | { |
145 | swp_entry_t entry; | 145 | swp_entry_t entry; |
146 | int pf_flags; | ||
147 | int err; | 146 | int err; |
148 | 147 | ||
149 | if (!PageLocked(page)) | 148 | if (!PageLocked(page)) |
@@ -154,29 +153,19 @@ int add_to_swap(struct page * page) | |||
154 | if (!entry.val) | 153 | if (!entry.val) |
155 | return 0; | 154 | return 0; |
156 | 155 | ||
157 | /* Radix-tree node allocations are performing | 156 | /* |
158 | * GFP_ATOMIC allocations under PF_MEMALLOC. | 157 | * Radix-tree node allocations from PF_MEMALLOC contexts could |
159 | * They can completely exhaust the page allocator. | 158 | * completely exhaust the page allocator. __GFP_NOMEMALLOC |
160 | * | 159 | * stops emergency reserves from being allocated. |
161 | * So PF_MEMALLOC is dropped here. This causes the slab | ||
162 | * allocations to fail earlier, so radix-tree nodes will | ||
163 | * then be allocated from the mempool reserves. | ||
164 | * | 160 | * |
165 | * We're still using __GFP_HIGH for radix-tree node | 161 | * TODO: this could cause a theoretical memory reclaim |
166 | * allocations, so some of the emergency pools are available, | 162 | * deadlock in the swap out path. |
167 | * just not all of them. | ||
168 | */ | 163 | */ |
169 | |||
170 | pf_flags = current->flags; | ||
171 | current->flags &= ~PF_MEMALLOC; | ||
172 | |||
173 | /* | 164 | /* |
174 | * Add it to the swap cache and mark it dirty | 165 | * Add it to the swap cache and mark it dirty |
175 | */ | 166 | */ |
176 | err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN); | 167 | err = __add_to_swap_cache(page, entry, |
177 | 168 | GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); | |
178 | if (pf_flags & PF_MEMALLOC) | ||
179 | current->flags |= PF_MEMALLOC; | ||
180 | 169 | ||
181 | switch (err) { | 170 | switch (err) { |
182 | case 0: /* Success */ | 171 | case 0: /* Success */ |
diff --git a/mm/truncate.c b/mm/truncate.c
index c9a63f0b69a2..60c8764bfac2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -242,7 +242,7 @@ EXPORT_SYMBOL(invalidate_inode_pages); | |||
242 | 242 | ||
243 | /** | 243 | /** |
244 | * invalidate_inode_pages2_range - remove range of pages from an address_space | 244 | * invalidate_inode_pages2_range - remove range of pages from an address_space |
245 | * @mapping - the address_space | 245 | * @mapping: the address_space |
246 | * @start: the page offset 'from' which to invalidate | 246 | * @start: the page offset 'from' which to invalidate |
247 | * @end: the page offset 'to' which to invalidate (inclusive) | 247 | * @end: the page offset 'to' which to invalidate (inclusive) |
248 | * | 248 | * |
@@ -322,7 +322,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | |||
322 | 322 | ||
323 | /** | 323 | /** |
324 | * invalidate_inode_pages2 - remove all pages from an address_space | 324 | * invalidate_inode_pages2 - remove all pages from an address_space |
325 | * @mapping - the address_space | 325 | * @mapping: the address_space |
326 | * | 326 | * |
327 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 327 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
328 | * invalidation. | 328 | * invalidation. |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c6182f6f1305..2bd83e5c2bbf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -475,6 +475,10 @@ void *vmalloc(unsigned long size) | |||
475 | 475 | ||
476 | EXPORT_SYMBOL(vmalloc); | 476 | EXPORT_SYMBOL(vmalloc); |
477 | 477 | ||
478 | #ifndef PAGE_KERNEL_EXEC | ||
479 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
480 | #endif | ||
481 | |||
478 | /** | 482 | /** |
479 | * vmalloc_exec - allocate virtually contiguous, executable memory | 483 | * vmalloc_exec - allocate virtually contiguous, executable memory |
480 | * | 484 | * |
@@ -488,10 +492,6 @@ EXPORT_SYMBOL(vmalloc); | |||
488 | * use __vmalloc() instead. | 492 | * use __vmalloc() instead. |
489 | */ | 493 | */ |
490 | 494 | ||
491 | #ifndef PAGE_KERNEL_EXEC | ||
492 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
493 | #endif | ||
494 | |||
495 | void *vmalloc_exec(unsigned long size) | 495 | void *vmalloc_exec(unsigned long size) |
496 | { | 496 | { |
497 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | 497 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); |