Diffstat (limited to 'mm')
-rw-r--r--  mm/allocpercpu.c       9
-rw-r--r--  mm/bootmem.c           6
-rw-r--r--  mm/filemap.c           1
-rw-r--r--  mm/fremap.c            2
-rw-r--r--  mm/hugetlb.c          22
-rw-r--r--  mm/memory.c            3
-rw-r--r--  mm/memory_hotplug.c    1
-rw-r--r--  mm/mempolicy.c        10
-rw-r--r--  mm/migrate.c          19
-rw-r--r--  mm/mlock.c             2
-rw-r--r--  mm/mmap.c              4
-rw-r--r--  mm/mmzone.c            5
-rw-r--r--  mm/nommu.c             6
-rw-r--r--  mm/oom_kill.c         41
-rw-r--r--  mm/page_alloc.c      281
-rw-r--r--  mm/page_io.c          45
-rw-r--r--  mm/pdflush.c           1
-rw-r--r--  mm/readahead.c         8
-rw-r--r--  mm/shmem.c             8
-rw-r--r--  mm/slab.c            286
-rw-r--r--  mm/sparse.c           23
-rw-r--r--  mm/swap.c              6
-rw-r--r--  mm/swapfile.c         92
-rw-r--r--  mm/thrash.c          116
-rw-r--r--  mm/vmscan.c           13
-rw-r--r--  mm/vmstat.c           22
26 files changed, 654 insertions, 378 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea536..b2486cf887a0 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
 void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	if (pdata->ptrs[cpu]) {
-		kfree(pdata->ptrs[cpu]);
-		pdata->ptrs[cpu] = NULL;
-	}
+
+	kfree(pdata->ptrs[cpu]);
+	pdata->ptrs[cpu] = NULL;
 }
 EXPORT_SYMBOL_GPL(percpu_depopulate);
 
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
  */
 void percpu_free(void *__pdata)
 {
+	if (unlikely(!__pdata))
+		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb404..00a96970b237 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-EXPORT_UNUSED_SYMBOL(max_pfn);  /* June 2006 */
-
 static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	if (limit && bdata->node_boot_start >= limit)
 		return NULL;
 
+	/* on nodes without memory - bootmem_map is NULL */
+	if (!bdata->node_bootmem_map)
+		return NULL;
+
 	end_pfn = bdata->node_low_pfn;
 	limit = PFN_DOWN(limit);
 	if (limit && end_pfn > limit)
diff --git a/mm/filemap.c b/mm/filemap.c
index 13df01c50479..af7e2f5caea9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1445,7 +1445,6 @@ no_cached_page:
 	 * effect.
 	 */
 	error = page_cache_read(file, pgoff);
-	grab_swap_token();
 
 	/*
 	 * The page we want has now been added to the page cache.
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d246d..b77a002c3352 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	int err = -ENOMEM;
 	pte_t *pte;
-	pte_t pte_val;
 	spinlock_t *ptl;
 
 	pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
-	pte_val = *pte;
 	/*
 	 * We don't need to run update_mmu_cache() here because the "file pte"
 	 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f593a807..0ccc7f230252 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
 	if (page) {
-		page[1].lru.next = (void *)free_huge_page;	/* dtor */
+		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		entry = *src_pte;
 		ptepage = pte_page(entry);
 		get_page(ptepage);
-		add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(dst, addr, dst_pte, entry);
 	}
 	spin_unlock(&src->page_table_lock);
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	/*
+	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * lock is used to avoid list corruption from multiple unmapping
+	 * of the same page since we are using page->lru.
+	 */
 	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	spin_lock(&mm->page_table_lock);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
 
 		page = pte_page(pte);
 		list_add(&page->lru, &page_list);
-		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
-
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
@@ -515,7 +516,6 @@ retry:
 	if (!pte_none(*ptep))
 		goto backout;
 
-	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
@@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
+	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 156861fcac43..4198df0dff1c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1902,7 +1902,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 
 	return 0;
 }
-EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /* June 2006 */
 
 /**
  * swapin_readahead - swap in pages in hope we need them soon
@@ -1991,6 +1990,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
+		grab_swap_token(); /* Contend for token _before_ read-in */
 		swapin_readahead(entry, address, vma);
 		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
@@ -2008,7 +2008,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
-		grab_swap_token();
 	}
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662eae..0c055a090f4d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -72,7 +72,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 		return ret;
 	}
 	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
-	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..b917d6fdc1bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next
 	   lower zones etc. Avoid empty zones because the memory allocator
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		struct page *page;
-		unsigned int nid;
+		int nid;
 
 		if (!pte_present(*pte))
 			continue;
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
 		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
+		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
 		if (!new->v.zonelist) {
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  * Display pages allocated per node and memory policy via /proc.
  */
 
-static const char *policy_types[] = { "default", "prefer", "bind",
-				      "interleave" };
+static const char * const policy_types[] =
+	{ "default", "prefer", "bind", "interleave" };
 
 /*
  * Convert a mempolicy into a string.
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page)
 {
-	struct page **radix_pointer;
+	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
-			*radix_pointer != page) {
+		(struct page *)radix_tree_deref_slot(pslot) != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	/*
 	 * Now we know that no one else is looking at the page.
 	 */
-	get_page(newpage);
+	get_page(newpage);	/* add cache reference */
 #ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	}
 #endif
 
-	*radix_pointer = newpage;
+	radix_tree_replace_slot(pslot, newpage);
+
+	/*
+	 * Drop cache reference from old page.
+	 * We know this isn't the last reference.
+	 */
 	__put_page(page);
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573abf..3446b7ef731e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
 		ret = make_pages_present(start, end);
 	}
 
-	vma->vm_mm->locked_vm -= pages;
+	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b40abd7cba2..7be110e98d4c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			vma_start < new_vma->vm_end)
 		*vmap = new_vma;
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			pol = mpol_copy(vma_policy(vma));
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c98168..eb5838634f18 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
 	return NODE_DATA(first_online_node);
 }
 
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
 	int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
 
 /*
  * next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
 	}
 	return zone;
 }
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 6a2a8aada401..af874569d0f1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -808,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
 
 	/* we're going to need to record the mapping if it works */
-	vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+	vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
 	if (!vml)
 		goto error_getting_vml;
-	memset(vml, 0, sizeof(*vml));
 
 	down_write(&nommu_vma_sem);
 
@@ -887,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
 	}
 
 	/* we're going to need a VMA struct as well */
-	vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+	vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
 	if (!vma)
 		goto error_getting_vma;
 
-	memset(vma, 0, sizeof(*vma));
 	INIT_LIST_HEAD(&vma->anon_vma_node);
 	atomic_set(&vma->vm_usage, 1);
 	if (file)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a928b9..223d9ccb7d64 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -264,7 +264,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
  * set.
  */
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
 {
 	if (is_init(p)) {
 		WARN_ON(1);
@@ -278,10 +278,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 
-	if (message) {
-		printk(KERN_ERR "%s: Killed process %d (%s).\n",
-				message, p->pid, p->comm);
-	}
+	if (verbose)
+		printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -294,7 +292,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 	force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(struct task_struct *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
 {
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
@@ -313,15 +311,25 @@ static int oom_kill_task(struct task_struct *p, const char *message)
 	if (mm == NULL)
 		return 1;
 
-	__oom_kill_task(p, message);
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
+	__oom_kill_task(p, 1);
+
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
+	 * but are in a different thread group. Don't let them have access
+	 * to memory reserves though, otherwise we might deplete all memory.
 	 */
-	do_each_thread(g, q)
+	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q, message);
-	while_each_thread(g, q);
+			force_sig(SIGKILL, p);
+	} while_each_thread(g, q);
 
 	return 0;
 }
@@ -337,21 +345,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
 	if (p->flags & PF_EXITING) {
-		__oom_kill_task(p, NULL);
+		__oom_kill_task(p, 0);
 		return 0;
 	}
 
-	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
-			" and children.\n", p->pid, p->comm, points);
+	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+					message, p->pid, p->comm, points);
+
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
 		if (c->mm == p->mm)
 			continue;
-		if (!oom_kill_task(c, message))
+		if (!oom_kill_task(c))
 			return 0;
 	}
-	return oom_kill_task(p, message);
+	return oom_kill_task(p);
 }
 
 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7ca66f..cace22b3ac25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,14 +83,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 
 EXPORT_SYMBOL(totalram_pages);
 
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
 	"DMA",
 #ifdef CONFIG_ZONE_DMA32
 	"DMA32",
@@ -237,7 +230,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	set_compound_page_dtor(page, free_compound_page);
 	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
@@ -486,7 +479,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
 	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
-	__free_one_page(page, zone ,order);
+	__free_one_page(page, zone, order);
 	spin_unlock(&zone->lock);
 }
 
@@ -605,6 +598,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
+
+	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 
 	if (gfp_flags & __GFP_ZERO)
@@ -690,9 +685,15 @@ void drain_node_pages(int nodeid)
 
 		pcp = &pset->pcp[i];
 		if (pcp->count) {
+			int to_drain;
+
 			local_irq_save(flags);
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
+			if (pcp->count >= pcp->batch)
+				to_drain = pcp->batch;
+			else
+				to_drain = pcp->count;
+			free_pages_bulk(zone, to_drain, &pcp->list, 0);
+			pcp->count -= to_drain;
 			local_irq_restore(flags);
 		}
 	}
@@ -700,7 +701,6 @@ void drain_node_pages(int nodeid)
 }
 #endif
 
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
 	unsigned long flags;
@@ -722,7 +722,6 @@ static void __drain_pages(unsigned int cpu)
 		}
 	}
 }
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM
 
@@ -925,31 +924,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full.  See further
+ * comments in mmzone.h.  Reduces cache footprint of zonelist scans
+ * that have to skip over alot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * tasks mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	nodemask_t *allowednodes;	/* zonelist_cache approximation */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return NULL;
+
+	if (jiffies - zlc->last_full_zap > 1 * HZ) {
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		zlc->last_full_zap = jiffies;
+	}
+
+	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+					&cpuset_current_mems_allowed :
+					&node_online_map;
+	return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ *  1) Check that the zone isn't thought to be full (doesn't have its
+ *     bit set in the zonelist_cache fullzones BITMAP).
+ *  2) Check that the zones node (obtained from the zonelist_cache
+ *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+						nodemask_t *allowednodes)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+	int n;				/* node that zone *z is on */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return 1;
+
+	i = z - zonelist->zones;
+	n = zlc->z_to_n[i];
+
+	/* This zone is worth trying if it is allowed but not full */
+	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
 /*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
+	int i;				/* index of *z in zonelist zones */
+
+	zlc = zonelist->zlcache_ptr;
+	if (!zlc)
+		return;
+
+	i = z - zonelist->zones;
+
+	set_bit(i, zlc->fullzones);
+}
+
+#else	/* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+	return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+				nodemask_t *allowednodes)
+{
+	return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif	/* CONFIG_NUMA */
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist, int alloc_flags)
 {
-	struct zone **z = zonelist->zones;
+	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(*z);
+	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
+	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+	int zlc_active = 0;		/* set if using zonelist_cache */
+	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
+zonelist_scan:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
+	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
+	z = zonelist->zones;
+
 	do {
+		if (NUMA_BUILD && zlc_active &&
+			!zlc_zone_worth_trying(zonelist, z, allowednodes))
+				continue;
 		zone = *z;
 		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed(zone, gfp_mask))
-				continue;
+				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
@@ -959,18 +1087,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = zone->pages_low;
 			else
 				mark = zone->pages_high;
-			if (!zone_watermark_ok(zone , order, mark,
-				       classzone_idx, alloc_flags))
+			if (!zone_watermark_ok(zone, order, mark,
+				       classzone_idx, alloc_flags)) {
 				if (!zone_reclaim_mode ||
 				    !zone_reclaim(zone, gfp_mask, order))
-					continue;
+					goto this_zone_full;
+			}
 		}
 
 		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-		if (page) {
+		if (page)
 			break;
+this_zone_full:
+		if (NUMA_BUILD)
+			zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+		if (NUMA_BUILD && !did_zlc_setup) {
+			/* we do zlc_setup after the first zone is tried */
+			allowednodes = zlc_setup(zonelist, alloc_flags);
+			zlc_active = 1;
+			did_zlc_setup = 1;
 		}
 	} while (*(++z) != NULL);
+
+	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		goto zonelist_scan;
+	}
 	return page;
 }
 
@@ -1005,9 +1149,19 @@ restart:
 	if (page)
 		goto got_pg;
 
-	do {
+	/*
+	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+	 * __GFP_NOWARN set) should not cause reclaim since the subsystem
+	 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+	 * using a larger set of nodes after it has established that the
+	 * allowed per node queues are empty and that nodes are
+	 * over allocated.
+	 */
+	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+		goto nopage;
+
+	for (z = zonelist->zones; *z; z++)
 		wakeup_kswapd(*z, order);
-	} while (*(++z));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1041,6 +1195,7 @@ restart:
 
 	/* This allocation should allow future memory freeing. */
 
+rebalance:
 	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1062,7 +1217,6 @@ nofail_alloc:
 	if (!wait)
 		goto nopage;
 
-rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
@@ -1262,7 +1416,7 @@ unsigned int nr_free_pagecache_pages(void)
 static inline void show_node(struct zone *zone)
 {
 	if (NUMA_BUILD)
-		printk("Node %ld ", zone_to_nid(zone));
+		printk("Node %d ", zone_to_nid(zone));
 }
 
 void si_meminfo(struct sysinfo *val)
@@ -1542,6 +1696,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		struct zonelist *zonelist;
+		struct zonelist_cache *zlc;
+		struct zone **z;
+
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+		for (z = zonelist->zones; *z; z++)
+			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+	}
+}
+
 #else	/* CONFIG_NUMA */
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1579,14 +1751,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	}
 }
 
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+	int i;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
 static int __meminit __build_all_zonelists(void *dummy)
 {
 	int nid;
-	for_each_online_node(nid)
+
+	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
+		build_zonelist_cache(NODE_DATA(nid));
+	}
 	return 0;
 }
 
@@ -1715,20 +1899,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
-#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
-		   unsigned long pfn, unsigned long size)
-{
-	unsigned long snum = pfn_to_section_nr(pfn);
-	unsigned long end = pfn_to_section_nr(pfn + size);
-
-	if (FLAGS_HAS_NODE)
-		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
-	else
-		for (; snum <= end; snum++)
-			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1881,16 +2051,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
 	int ret = NOTIFY_OK;
 
 	switch (action) {
-		case CPU_UP_PREPARE:
-			if (process_zones(cpu))
-				ret = NOTIFY_BAD;
-			break;
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			free_zone_pagesets(cpu);
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+	default:
+		break;
 	}
 	return ret;
 }
@@ -2421,7 +2591,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		zonetable_add(zone, nid, j, zone_start_pfn, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
 		BUG_ON(ret);
 		zone_start_pfn += size;
@@ -2736,7 +2905,6 @@ void __init free_area_init(unsigned long *zones_size)
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
 static int page_alloc_cpu_notify(struct notifier_block *self,
 				 unsigned long action, void *hcpu)
 {
@@ -2751,7 +2919,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init page_alloc_init(void)
 {
@@ -3055,7 +3222,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	/* allow the kernel cmdline to have a say */
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
-		numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+		numentries = nr_kernel_pages;
 		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
 		numentries >>= 20 - PAGE_SHIFT;
 		numentries <<= 20 - PAGE_SHIFT;
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f9..dbffec0d78c9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
 out:
 	return ret;
 }
-
-#ifdef CONFIG_SOFTWARE_SUSPEND
-/*
- * A scruffy utility function to read or write an arbitrary swap page
- * and wait on the I/O.  The caller must have a ref on the page.
- *
- * We use end_swap_bio_read() even for writes, because it happens to do what
- * we want.
- */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
-			struct bio **bio_chain)
-{
-	struct bio *bio;
-	int ret = 0;
-	int bio_rw;
-
-	lock_page(page);
-
-	bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
-	if (bio == NULL) {
-		unlock_page(page);
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	bio_rw = rw;
-	if (!bio_chain)
-		bio_rw |= (1 << BIO_RW_SYNC);
-	if (bio_chain)
-		bio_get(bio);
-	submit_bio(bio_rw, bio);
-	if (bio_chain == NULL) {
-		wait_on_page_locked(page);
-
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-	}
-	if (bio_chain) {
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-	}
-out:
-	return ret;
-}
-#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb4b..8ce0900dc95c 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>	// Prototypes pdflush_operation()
 #include <linux/kthread.h>
 #include <linux/cpuset.h>
+#include <linux/freezer.h>
 
 
 /*
diff --git a/mm/readahead.c b/mm/readahead.c
index 23cb61a01c6e..a386f2b6b335 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -148,13 +148,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 		if (!pagevec_add(&lru_pvec, page))
 			__pagevec_lru_add(&lru_pvec);
 		if (ret) {
-			while (!list_empty(pages)) {
-				struct page *victim;
-
-				victim = list_to_page(pages);
-				list_del(&victim->lru);
-				page_cache_release(victim);
-			}
+			put_pages_list(pages);
 			break;
 		}
 	}
diff --git a/mm/shmem.c b/mm/shmem.c
index 4959535fc14c..c820b4f77b8d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 
 static struct super_operations shmem_ops;
 static const struct address_space_operations shmem_aops;
-static struct file_operations shmem_file_operations;
+static const struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
 static struct inode_operations shmem_special_inode_operations;
@@ -1943,7 +1943,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name,
 	return security_inode_setsecurity(inode, name, value, size, flags);
 }
 
-struct xattr_handler shmem_xattr_security_handler = {
+static struct xattr_handler shmem_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list   = shmem_xattr_security_list,
 	.get    = shmem_xattr_security_get,
@@ -2263,7 +2263,7 @@ static struct kmem_cache *shmem_inode_cachep;
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
 	struct shmem_inode_info *p;
-	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
+	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!p)
 		return NULL;
 	return &p->vfs_inode;
@@ -2319,7 +2319,7 @@ static const struct address_space_operations shmem_aops = {
 	.migratepage	= migrate_page,
 };
 
-static struct file_operations shmem_file_operations = {
+static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
 	.llseek		= generic_file_llseek,
diff --git a/mm/slab.c b/mm/slab.c
index 5de81473df34..068cb4503c15 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,12 @@
103#include <linux/module.h> 103#include <linux/module.h>
104#include <linux/rcupdate.h> 104#include <linux/rcupdate.h>
105#include <linux/string.h> 105#include <linux/string.h>
106#include <linux/uaccess.h>
106#include <linux/nodemask.h> 107#include <linux/nodemask.h>
107#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
108#include <linux/mutex.h> 109#include <linux/mutex.h>
109#include <linux/rtmutex.h> 110#include <linux/rtmutex.h>
110 111
111#include <asm/uaccess.h>
112#include <asm/cacheflush.h> 112#include <asm/cacheflush.h>
113#include <asm/tlbflush.h> 113#include <asm/tlbflush.h>
114#include <asm/page.h> 114#include <asm/page.h>
@@ -730,7 +730,10 @@ static inline void init_lock_keys(void)
730} 730}
731#endif 731#endif
732 732
733/* Guard access to the cache-chain. */ 733/*
734 * 1. Guard access to the cache-chain.
735 * 2. Protect sanity of cpu_online_map against cpu hotplug events
736 */
734static DEFINE_MUTEX(cache_chain_mutex); 737static DEFINE_MUTEX(cache_chain_mutex);
735static struct list_head cache_chain; 738static struct list_head cache_chain;
736 739
@@ -866,6 +869,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
866 dump_stack(); 869 dump_stack();
867} 870}
868 871
872/*
873 * By default on NUMA we use alien caches to stage the freeing of
874 * objects allocated from other nodes. This causes massive memory
875 * inefficiencies when using fake NUMA setup to split memory into a
876 * large number of small nodes, so it can be disabled on the command
877 * line
878 */
879
880static int use_alien_caches __read_mostly = 1;
881static int __init noaliencache_setup(char *s)
882{
883 use_alien_caches = 0;
884 return 1;
885}
886__setup("noaliencache", noaliencache_setup);
887
869#ifdef CONFIG_NUMA 888#ifdef CONFIG_NUMA
870/* 889/*
871 * Special reaping functions for NUMA systems called from cache_reap(). 890 * Special reaping functions for NUMA systems called from cache_reap().
@@ -996,7 +1015,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
996 return NULL; 1015 return NULL;
997} 1016}
998 1017
999static inline void *__cache_alloc_node(struct kmem_cache *cachep, 1018static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1000 gfp_t flags, int nodeid) 1019 gfp_t flags, int nodeid)
1001{ 1020{
1002 return NULL; 1021 return NULL;
@@ -1004,7 +1023,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1004 1023
1005#else /* CONFIG_NUMA */ 1024#else /* CONFIG_NUMA */
1006 1025
1007static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1026static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1008static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1027static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1009 1028
1010static struct array_cache **alloc_alien_cache(int node, int limit) 1029static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1114,7 +1133,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1114 * Make sure we are not freeing a object from another node to the array 1133 * Make sure we are not freeing a object from another node to the array
1115 * cache on this cpu. 1134 * cache on this cpu.
1116 */ 1135 */
1117 if (likely(slabp->nodeid == node)) 1136 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1118 return 0; 1137 return 0;
1119 1138
1120 l3 = cachep->nodelists[node]; 1139 l3 = cachep->nodelists[node];
@@ -1192,7 +1211,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1192 list_for_each_entry(cachep, &cache_chain, next) { 1211 list_for_each_entry(cachep, &cache_chain, next) {
1193 struct array_cache *nc; 1212 struct array_cache *nc;
1194 struct array_cache *shared; 1213 struct array_cache *shared;
1195 struct array_cache **alien; 1214 struct array_cache **alien = NULL;
1196 1215
1197 nc = alloc_arraycache(node, cachep->limit, 1216 nc = alloc_arraycache(node, cachep->limit,
1198 cachep->batchcount); 1217 cachep->batchcount);
@@ -1204,9 +1223,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1204 if (!shared) 1223 if (!shared)
1205 goto bad; 1224 goto bad;
1206 1225
1207 alien = alloc_alien_cache(node, cachep->limit); 1226 if (use_alien_caches) {
1208 if (!alien) 1227 alien = alloc_alien_cache(node, cachep->limit);
1209 goto bad; 1228 if (!alien)
1229 goto bad;
1230 }
1210 cachep->array[cpu] = nc; 1231 cachep->array[cpu] = nc;
1211 l3 = cachep->nodelists[node]; 1232 l3 = cachep->nodelists[node];
1212 BUG_ON(!l3); 1233 BUG_ON(!l3);
@@ -1230,12 +1251,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1230 kfree(shared); 1251 kfree(shared);
1231 free_alien_cache(alien); 1252 free_alien_cache(alien);
1232 } 1253 }
1233 mutex_unlock(&cache_chain_mutex);
1234 break; 1254 break;
1235 case CPU_ONLINE: 1255 case CPU_ONLINE:
1256 mutex_unlock(&cache_chain_mutex);
1236 start_cpu_timer(cpu); 1257 start_cpu_timer(cpu);
1237 break; 1258 break;
1238#ifdef CONFIG_HOTPLUG_CPU 1259#ifdef CONFIG_HOTPLUG_CPU
1260 case CPU_DOWN_PREPARE:
1261 mutex_lock(&cache_chain_mutex);
1262 break;
1263 case CPU_DOWN_FAILED:
1264 mutex_unlock(&cache_chain_mutex);
1265 break;
1239 case CPU_DEAD: 1266 case CPU_DEAD:
1240 /* 1267 /*
1241 * Even if all the cpus of a node are down, we don't free the 1268 * Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1273,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1246 * gets destroyed at kmem_cache_destroy(). 1273 * gets destroyed at kmem_cache_destroy().
1247 */ 1274 */
1248 /* fall thru */ 1275 /* fall thru */
1276#endif
1249 case CPU_UP_CANCELED: 1277 case CPU_UP_CANCELED:
1250 mutex_lock(&cache_chain_mutex);
1251 list_for_each_entry(cachep, &cache_chain, next) { 1278 list_for_each_entry(cachep, &cache_chain, next) {
1252 struct array_cache *nc; 1279 struct array_cache *nc;
1253 struct array_cache *shared; 1280 struct array_cache *shared;
@@ -1308,11 +1335,9 @@ free_array_cache:
1308 } 1335 }
1309 mutex_unlock(&cache_chain_mutex); 1336 mutex_unlock(&cache_chain_mutex);
1310 break; 1337 break;
1311#endif
1312 } 1338 }
1313 return NOTIFY_OK; 1339 return NOTIFY_OK;
1314bad: 1340bad:
1315 mutex_unlock(&cache_chain_mutex);
1316 return NOTIFY_BAD; 1341 return NOTIFY_BAD;
1317} 1342}
1318 1343
@@ -1580,12 +1605,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1580 flags |= __GFP_COMP; 1605 flags |= __GFP_COMP;
1581#endif 1606#endif
1582 1607
1583 /* 1608 flags |= cachep->gfpflags;
1584 * Under NUMA we want memory on the indicated node. We will handle
1585 * the needed fallback ourselves since we want to serve from our
1586 * per node object lists first for other nodes.
1587 */
1588 flags |= cachep->gfpflags | GFP_THISNODE;
1589 1609
1590 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1610 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1591 if (!page) 1611 if (!page)
@@ -2098,15 +2118,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2098 } 2118 }
2099 2119
2100 /* 2120 /*
2101 * Prevent CPUs from coming and going. 2121 * We use cache_chain_mutex to ensure a consistent view of
2102 * lock_cpu_hotplug() nests outside cache_chain_mutex 2122 * cpu_online_map as well. Please see cpuup_callback
2103 */ 2123 */
2104 lock_cpu_hotplug();
2105
2106 mutex_lock(&cache_chain_mutex); 2124 mutex_lock(&cache_chain_mutex);
2107 2125
2108 list_for_each_entry(pc, &cache_chain, next) { 2126 list_for_each_entry(pc, &cache_chain, next) {
2109 mm_segment_t old_fs = get_fs();
2110 char tmp; 2127 char tmp;
2111 int res; 2128 int res;
2112 2129
@@ -2115,9 +2132,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2115 * destroy its slab cache and no-one else reuses the vmalloc 2132 * destroy its slab cache and no-one else reuses the vmalloc
2116 * area of the module. Print a warning. 2133 * area of the module. Print a warning.
2117 */ 2134 */
2118 set_fs(KERNEL_DS); 2135 res = probe_kernel_address(pc->name, tmp);
2119 res = __get_user(tmp, pc->name);
2120 set_fs(old_fs);
2121 if (res) { 2136 if (res) {
2122 printk("SLAB: cache with size %d has lost its name\n", 2137 printk("SLAB: cache with size %d has lost its name\n",
2123 pc->buffer_size); 2138 pc->buffer_size);
@@ -2197,25 +2212,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
 		ralign = BYTES_PER_WORD;
 
-	/* 2) arch mandated alignment: disables debug if necessary */
+	/* 2) arch mandated alignment */
 	if (ralign < ARCH_SLAB_MINALIGN) {
 		ralign = ARCH_SLAB_MINALIGN;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
-	/* 3) caller mandated alignment: disables debug if necessary */
+	/* 3) caller mandated alignment */
 	if (ralign < align) {
 		ralign = align;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
+	/* disable debug if necessary */
+	if (ralign > BYTES_PER_WORD)
+		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
 	 */
 	align = ralign;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
 	if (!cachep)
 		goto oops;
 
@@ -2326,7 +2340,6 @@ oops:
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
-	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2457,7 @@ out:
 	return nr_freed;
 }
 
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
 static int __cache_shrink(struct kmem_cache *cachep)
 {
 	int ret = 0, i = 0;
@@ -2474,9 +2488,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
  */
 int kmem_cache_shrink(struct kmem_cache *cachep)
 {
+	int ret;
 	BUG_ON(!cachep || in_interrupt());
 
-	return __cache_shrink(cachep);
+	mutex_lock(&cache_chain_mutex);
+	ret = __cache_shrink(cachep);
+	mutex_unlock(&cache_chain_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
@@ -2500,23 +2518,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 {
 	BUG_ON(!cachep || in_interrupt());
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
-
 	/* Find the cache in the chain of caches. */
 	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	mutex_unlock(&cache_chain_mutex);
-
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		mutex_lock(&cache_chain_mutex);
 		list_add(&cachep->next, &cache_chain);
 		mutex_unlock(&cache_chain_mutex);
-		unlock_cpu_hotplug();
 		return;
 	}
 
@@ -2524,7 +2535,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 	synchronize_rcu();
 
 	__kmem_cache_destroy(cachep);
-	unlock_cpu_hotplug();
+	mutex_unlock(&cache_chain_mutex);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
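With the hunks above, kmem_cache_shrink() and kmem_cache_destroy() take cache_chain_mutex themselves instead of bracketing the operation with lock_cpu_hotplug(). A minimal, hypothetical user of that lifecycle; the cache name, object type and init/exit function names are invented, while the six-argument kmem_cache_create() signature matches this kernel generation:

	#include <linux/module.h>
	#include <linux/slab.h>

	struct my_obj {				/* made-up object type */
		int id;
		void *payload;
	};

	static struct kmem_cache *my_cachep;

	static int __init my_cache_init(void)
	{
		my_cachep = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
					      0, 0, NULL, NULL);
		return my_cachep ? 0 : -ENOMEM;
	}

	static void __exit my_cache_exit(void)
	{
		/* Serialized against cpu hotplug via cache_chain_mutex internally. */
		kmem_cache_destroy(my_cachep);
	}

	module_init(my_cache_init);
	module_exit(my_cache_exit);
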
@@ -2548,7 +2559,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	if (OFF_SLAB(cachep)) {
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
-					      local_flags, nodeid);
+					      local_flags & ~GFP_THISNODE, nodeid);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -2618,7 +2629,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (flags & SLAB_DMA)
+	if (flags & GFP_DMA)
 		BUG_ON(!(cachep->gfpflags & GFP_DMA));
 	else
 		BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2689,10 +2700,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * Grow (by 1) the number of slabs within a cache. This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+		gfp_t flags, int nodeid, void *objp)
 {
 	struct slab *slabp;
-	void *objp;
 	size_t offset;
 	gfp_t local_flags;
 	unsigned long ctor_flags;
@@ -2702,12 +2713,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Be lazy and only check for valid flags here, keeping it out of the
 	 * critical path in kmem_cache_alloc().
 	 */
-	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
-	if (flags & SLAB_NO_GROW)
+	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
+	if (flags & __GFP_NO_GROW)
 		return 0;
 
 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
-	local_flags = (flags & SLAB_LEVEL_MASK);
+	local_flags = (flags & GFP_LEVEL_MASK);
 	if (!(local_flags & __GFP_WAIT))
 		/*
 		 * Not allowed to sleep. Need to tell a constructor about
@@ -2744,12 +2755,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Get mem for the objs. Attempt to allocate a physical page from
 	 * 'nodeid'.
 	 */
-	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
+		objp = kmem_getpages(cachep, flags, nodeid);
 	if (!objp)
 		goto failed;
 
 	/* Get slab management. */
-	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+	slabp = alloc_slabmgmt(cachep, objp, offset,
+			local_flags & ~GFP_THISNODE, nodeid);
 	if (!slabp)
 		goto opps1;
 
@@ -2987,7 +3000,7 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, node);
+		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
@@ -3063,6 +3076,12 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 
 		cachep->ctor(objp, cachep, ctor_flags);
 	}
+#if ARCH_SLAB_MINALIGN
+	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+		       objp, ARCH_SLAB_MINALIGN);
+	}
+#endif
 	return objp;
 }
 #else
@@ -3105,10 +3124,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 	objp = ____cache_alloc(cachep, flags);
 	/*
 	 * We may just have run out of memory on the local node.
-	 * __cache_alloc_node() knows how to locate memory on other nodes
+	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
 	if (NUMA_BUILD && !objp)
-		objp = __cache_alloc_node(cachep, flags, numa_node_id());
+		objp = ____cache_alloc_node(cachep, flags, numa_node_id());
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
 					    caller);
@@ -3135,15 +3154,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
 	if (nid_alloc != nid_here)
-		return __cache_alloc_node(cachep, flags, nid_alloc);
+		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
 }
 
 /*
  * Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
  */
 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
@@ -3151,15 +3172,51 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 		->node_zonelists[gfp_zone(flags)];
 	struct zone **z;
 	void *obj = NULL;
+	int nid;
 
+retry:
+	/*
+	 * Look through allowed nodes for objects available
+	 * from existing per node queues.
+	 */
 	for (z = zonelist->zones; *z && !obj; z++) {
-		int nid = zone_to_nid(*z);
+		nid = zone_to_nid(*z);
+
+		if (cpuset_zone_allowed(*z, flags) &&
+			cache->nodelists[nid] &&
+			cache->nodelists[nid]->free_objects)
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+	}
 
-		if (zone_idx(*z) <= ZONE_NORMAL &&
-				cpuset_zone_allowed(*z, flags) &&
-				cache->nodelists[nid])
-			obj = __cache_alloc_node(cache,
-					flags | __GFP_THISNODE, nid);
+	if (!obj) {
+		/*
+		 * This allocation will be performed within the constraints
+		 * of the current cpuset / memory policy requirements.
+		 * We may trigger various forms of reclaim on the allowed
+		 * set and go into memory reserves if necessary.
+		 */
+		obj = kmem_getpages(cache, flags, -1);
+		if (obj) {
+			/*
+			 * Insert into the appropriate per node queues
+			 */
+			nid = page_to_nid(virt_to_page(obj));
+			if (cache_grow(cache, flags, nid, obj)) {
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+				if (!obj)
+					/*
+					 * Another processor may allocate the
+					 * objects in the slab since we are
+					 * not holding any locks.
+					 */
+					goto retry;
+			} else {
+				kmem_freepages(cache, obj);
+				obj = NULL;
+			}
+		}
 	}
 	return obj;
 }
@@ -3167,7 +3224,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 /*
  * A interface to enable slab creation on nodeid
  */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 				int nodeid)
 {
 	struct list_head *entry;
@@ -3216,7 +3273,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags, nodeid);
+	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
 	if (x)
 		goto retry;
 
@@ -3434,35 +3491,59 @@ out:
  * @flags: See kmalloc().
  * @nodeid: node number of the target node.
  *
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
  */
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+		int nodeid, void *caller)
 {
 	unsigned long save_flags;
-	void *ptr;
+	void *ptr = NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
-	if (nodeid == -1 || nodeid == numa_node_id() ||
-			!cachep->nodelists[nodeid])
-		ptr = ____cache_alloc(cachep, flags);
-	else
-		ptr = __cache_alloc_node(cachep, flags, nodeid);
-	local_irq_restore(save_flags);
+	if (unlikely(nodeid == -1))
+		nodeid = numa_node_id();
+
+	if (likely(cachep->nodelists[nodeid])) {
+		if (nodeid == numa_node_id()) {
+			/*
+			 * Use the locally cached objects if possible.
+			 * However ____cache_alloc does not allow fallback
+			 * to other nodes. It may fail while we still have
+			 * objects on other nodes available.
+			 */
+			ptr = ____cache_alloc(cachep, flags);
+		}
+		if (!ptr) {
+			/* ___cache_alloc_node can fall back to other nodes */
+			ptr = ____cache_alloc_node(cachep, flags, nodeid);
+		}
+	} else {
+		/* Node not bootstrapped yet */
+		if (!(flags & __GFP_THISNODE))
+			ptr = fallback_alloc(cachep, flags);
+	}
 
-	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
-					__builtin_return_address(0));
+	local_irq_restore(save_flags);
+	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
 	return ptr;
 }
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+	return __cache_alloc_node(cachep, flags, nodeid,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 {
 	struct kmem_cache *cachep;
 
@@ -3471,8 +3552,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 		return NULL;
 	return kmem_cache_alloc_node(cachep, flags, node);
 }
+
+#ifdef CONFIG_DEBUG_SLAB
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+		int node, void *caller)
+{
+	return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node, NULL);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
 
 /**
  * __do_kmalloc - allocate memory
@@ -3583,13 +3685,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	int node;
 	struct kmem_list3 *l3;
 	struct array_cache *new_shared;
-	struct array_cache **new_alien;
+	struct array_cache **new_alien = NULL;
 
 	for_each_online_node(node) {
 
-		new_alien = alloc_alien_cache(node, cachep->limit);
-		if (!new_alien)
-			goto fail;
+		if (use_alien_caches) {
+			new_alien = alloc_alien_cache(node, cachep->limit);
+			if (!new_alien)
+				goto fail;
+		}
 
 		new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
@@ -4038,7 +4142,7 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 
-struct seq_operations slabinfo_op = {
+const struct seq_operations slabinfo_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -4236,7 +4340,7 @@ static int leaks_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-struct seq_operations slabstats_op = {
+const struct seq_operations slabstats_op = {
 	.start = leaks_start,
 	.next = s_next,
 	.stop = s_stop,
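Taken together, the slab changes above route node-specific requests through GFP_THISNODE internally and fall back via fallback_alloc() only when the caller allows it. A hedged caller-side sketch of the resulting semantics, using only the exported kmem_cache_alloc_node() interface; the helper name and the two-step retry are illustrative, not part of the patch:

	#include <linux/slab.h>
	#include <linux/gfp.h>

	/* Hypothetical helper: prefer node 'nid', tolerate any node as a last resort. */
	static void *alloc_prefer_node(struct kmem_cache *cachep, int nid)
	{
		void *obj;

		/* Hard constraint: no fallback, may return NULL if 'nid' is exhausted. */
		obj = kmem_cache_alloc_node(cachep, GFP_KERNEL | __GFP_THISNODE, nid);
		if (obj)
			return obj;

		/* Without __GFP_THISNODE, fallback_alloc() may serve the request
		 * from another node's lists or grow a new slab wherever the page
		 * allocator finds memory. */
		return kmem_cache_alloc_node(cachep, GFP_KERNEL, nid);
	}
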
diff --git a/mm/sparse.c b/mm/sparse.c
index b3c82ba30012..ac26eb0d73cd 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
 #endif
 EXPORT_SYMBOL(mem_section);
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+int page_to_nid(struct page *page)
+{
+	return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+#endif
+
 #ifdef CONFIG_SPARSEMEM_EXTREME
 static struct mem_section *sparse_index_alloc(int nid)
 {
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
 	struct mem_section *section;
 	int ret = 0;
 
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	section_to_node_table[section_nr] = nid;
+#endif
+
 	if (mem_section[root])
 		return -EEXIST;
 
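With NODE_NOT_IN_PAGE_FLAGS, page_to_nid() above becomes a per-section table lookup instead of a field extracted from page->flags. A small hypothetical caller, just to show that users are unaffected by which variant is compiled in (the per-node counter array is invented):

	#include <linux/mm.h>

	/* Hypothetical per-node accounting keyed by the page's home node. */
	static void account_page_node(struct page *page, atomic_long_t *node_count)
	{
		int nid = page_to_nid(page);	/* table lookup or page->flags, as configured */

		atomic_long_inc(&node_count[nid]);
	}
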
diff --git a/mm/swap.c b/mm/swap.c
index d9a3770d8f3c..2ed7be39795e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
 {
 	page = (struct page *)page_private(page);
 	if (put_page_testzero(page)) {
-		void (*dtor)(struct page *page);
+		compound_page_dtor *dtor;
 
-		dtor = (void (*)(struct page *))page[1].lru.next;
+		dtor = get_compound_page_dtor(page);
 		(*dtor)(page);
 	}
 }
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#ifdef CONFIG_HOTPLUG_CPU
 	hotcpu_notifier(cpu_swap_callback, 0);
+#endif
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a15def63f28f..c5431072f422 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,48 @@ void free_swap_and_cache(swp_entry_t entry)
 
 #ifdef CONFIG_SOFTWARE_SUSPEND
 /*
- * Find the swap type that corresponds to given device (if any)
+ * Find the swap type that corresponds to given device (if any).
  *
- * This is needed for software suspend and is done in such a way that inode
- * aliasing is allowed.
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
+ *
+ * This is needed for the suspend to disk (aka swsusp).
 */
-int swap_type_of(dev_t device)
+int swap_type_of(dev_t device, sector_t offset)
 {
+	struct block_device *bdev = NULL;
 	int i;
 
+	if (device)
+		bdev = bdget(device);
+
 	spin_lock(&swap_lock);
 	for (i = 0; i < nr_swapfiles; i++) {
-		struct inode *inode;
+		struct swap_info_struct *sis = swap_info + i;
 
-		if (!(swap_info[i].flags & SWP_WRITEOK))
+		if (!(sis->flags & SWP_WRITEOK))
 			continue;
 
-		if (!device) {
+		if (!bdev) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info[i].swap_file->f_dentry->d_inode;
-		if (S_ISBLK(inode->i_mode) &&
-		    device == MKDEV(imajor(inode), iminor(inode))) {
-			spin_unlock(&swap_lock);
-			return i;
+		if (bdev == sis->bdev) {
+			struct swap_extent *se;
+
+			se = list_entry(sis->extent_list.next,
+					struct swap_extent, list);
+			if (se->start_block == offset) {
+				spin_unlock(&swap_lock);
+				bdput(bdev);
+				return i;
+			}
 		}
 	}
 	spin_unlock(&swap_lock);
+	if (bdev)
+		bdput(bdev);
+
 	return -ENODEV;
 }
 
@@ -931,6 +945,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
 	}
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int swap_type, pgoff_t offset)
+{
+	struct swap_info_struct *sis;
+
+	if (swap_type >= nr_swapfiles)
+		return 0;
+
+	sis = swap_info + swap_type;
+	return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
+
 /*
  * Free all of a swapdev's extent information
  */
@@ -1274,10 +1305,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 
 	mutex_lock(&swapon_mutex);
 
+	if (!l)
+		return SEQ_START_TOKEN;
+
 	for (i = 0; i < nr_swapfiles; i++, ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
 			continue;
-		if (!l--)
+		if (!--l)
 			return ptr;
 	}
 
@@ -1286,10 +1320,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 
 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 {
-	struct swap_info_struct *ptr = v;
+	struct swap_info_struct *ptr;
 	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
 
-	for (++ptr; ptr < endptr; ptr++) {
+	if (v == SEQ_START_TOKEN)
+		ptr = swap_info;
+	else {
+		ptr = v;
+		ptr++;
+	}
+
+	for (; ptr < endptr; ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
 			continue;
 		++*pos;
@@ -1310,8 +1351,10 @@ static int swap_show(struct seq_file *swap, void *v)
 	struct file *file;
 	int len;
 
-	if (v == swap_info)
-		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	if (ptr == SEQ_START_TOKEN) {
+		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+		return 0;
+	}
 
 	file = ptr->swap_file;
 	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
@@ -1325,7 +1368,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	return 0;
 }
 
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
 	.start = swap_start,
 	.next = swap_next,
 	.stop = swap_stop,
@@ -1337,7 +1380,7 @@ static int swaps_open(struct inode *inode, struct file *file)
 	return seq_open(file, &swaps_op);
 }
 
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
 	.open = swaps_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -1540,6 +1583,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = -EINVAL;
 		if (!maxpages)
 			goto bad_swap;
+		if (swapfilesize && maxpages > swapfilesize) {
+			printk(KERN_WARNING
+			       "Swap area shorter than signature indicates\n");
+			goto bad_swap;
+		}
 		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
 			goto bad_swap;
 		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1567,12 +1615,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
-	if (swapfilesize && maxpages > swapfilesize) {
-		printk(KERN_WARNING
-		       "Swap area shorter than signature indicates\n");
-		error = -EINVAL;
-		goto bad_swap;
-	}
 	if (nr_good_pages) {
 		p->swap_map[0] = SWAP_MAP_BAD;
 		p->max = maxpages;
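The /proc/swaps iterator above adopts the usual SEQ_START_TOKEN convention: ->start() returns the sentinel at position 0 so that ->show() can emit the header exactly once before the real records. A generic sketch of that pattern; the my_* names are placeholders, not code from this patch:

	#include <linux/seq_file.h>

	static void *my_start(struct seq_file *m, loff_t *pos)
	{
		return *pos == 0 ? SEQ_START_TOKEN : NULL;	/* header-only example */
	}

	static void *my_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;
		return NULL;					/* nothing after the header */
	}

	static void my_stop(struct seq_file *m, void *v)
	{
	}

	static int my_show(struct seq_file *m, void *v)
	{
		if (v == SEQ_START_TOKEN)
			seq_puts(m, "Header line\n");
		return 0;
	}

	static const struct seq_operations my_seq_ops = {
		.start	= my_start,
		.next	= my_next,
		.stop	= my_stop,
		.show	= my_show,
	};
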
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b4a2b7..9ef9071f99bc 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
  *
  * Simple token based thrashing protection, using the algorithm
  * described in: http://www.cs.wm.edu/~sjiang/token.pdf
+ *
+ * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
+ * Improved algorithm to pass token:
+ * Each task has a priority which is incremented if it contended
+ * for the token in an interval less than its previous attempt.
+ * If the token is acquired, that task's priority is boosted to prevent
+ * the token from bouncing around too often and to let the task make
+ * some progress in its execution.
  */
+
 #include <linux/jiffies.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
 
 static DEFINE_SPINLOCK(swap_token_lock);
-static unsigned long swap_token_timeout;
-static unsigned long swap_token_check;
-struct mm_struct * swap_token_mm = &init_mm;
-
-#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT (300 * HZ)
-/*
- * Currently disabled; Needs further code to work at HZ * 300.
- */
-unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
-
-/*
- * Take the token away if the process had no page faults
- * in the last interval, or if it has held the token for
- * too long.
- */
-#define SWAP_TOKEN_ENOUGH_RSS 1
-#define SWAP_TOKEN_TIMED_OUT 2
-static int should_release_swap_token(struct mm_struct *mm)
-{
-	int ret = 0;
-	if (!mm->recent_pagein)
-		ret = SWAP_TOKEN_ENOUGH_RSS;
-	else if (time_after(jiffies, swap_token_timeout))
-		ret = SWAP_TOKEN_TIMED_OUT;
-	mm->recent_pagein = 0;
-	return ret;
-}
+struct mm_struct *swap_token_mm;
+static unsigned int global_faults;
 
-/*
- * Try to grab the swapout protection token. We only try to
- * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
- * SMP lock contention and to check that the process that held
- * the token before is no longer thrashing.
- */
 void grab_swap_token(void)
 {
-	struct mm_struct *mm;
-	int reason;
+	int current_interval;
 
-	/* We have the token. Let others know we still need it. */
-	if (has_swap_token(current->mm)) {
-		current->mm->recent_pagein = 1;
-		if (unlikely(!swap_token_default_timeout))
-			disable_swap_token();
-		return;
-	}
-
-	if (time_after(jiffies, swap_token_check)) {
+	global_faults++;
 
-		if (!swap_token_default_timeout) {
-			swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-			return;
-		}
-
-		/* ... or if we recently held the token. */
-		if (time_before(jiffies, current->mm->swap_token_time))
-			return;
+	current_interval = global_faults - current->mm->faultstamp;
 
 	if (!spin_trylock(&swap_token_lock))
 		return;
 
-		swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
+	/* First come first served */
+	if (swap_token_mm == NULL) {
+		current->mm->token_priority = current->mm->token_priority + 2;
+		swap_token_mm = current->mm;
+		goto out;
+	}
 
-		mm = swap_token_mm;
-		if ((reason = should_release_swap_token(mm))) {
-			unsigned long eligible = jiffies;
-			if (reason == SWAP_TOKEN_TIMED_OUT) {
-				eligible += swap_token_default_timeout;
-			}
-			mm->swap_token_time = eligible;
-			swap_token_timeout = jiffies + swap_token_default_timeout;
+	if (current->mm != swap_token_mm) {
+		if (current_interval < current->mm->last_interval)
+			current->mm->token_priority++;
+		else {
+			current->mm->token_priority--;
+			if (unlikely(current->mm->token_priority < 0))
+				current->mm->token_priority = 0;
+		}
+		/* Check if we deserve the token */
+		if (current->mm->token_priority >
+				swap_token_mm->token_priority) {
+			current->mm->token_priority += 2;
 			swap_token_mm = current->mm;
 		}
-		spin_unlock(&swap_token_lock);
+	} else {
+		/* Token holder came in again! */
+		current->mm->token_priority += 2;
 	}
-	return;
+
+out:
+	current->mm->faultstamp = global_faults;
+	current->mm->last_interval = current_interval;
+	spin_unlock(&swap_token_lock);
+return;
 }
 
 /* Called on process exit. */
 void __put_swap_token(struct mm_struct *mm)
{
 	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm)) {
-		mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-		swap_token_mm = &init_mm;
-		swap_token_check = jiffies;
-	}
+	if (likely(mm == swap_token_mm))
+		swap_token_mm = NULL;
 	spin_unlock(&swap_token_lock);
 }
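The rewritten grab_swap_token() replaces the timeout scheme with the priority rule described in the new header comment. A userspace toy model of that rule, with invented structures and a two-task main() purely to show how the token migrates toward the task that keeps faulting:

	/* Toy model of the priority rule above; everything here is invented
	 * for illustration and compiles as a standalone C program. */
	#include <stdio.h>

	struct task_model {
		int token_priority;
		int last_interval;
		int faultstamp;
	};

	static struct task_model *token_holder;
	static int global_faults;

	static void fault(struct task_model *t)
	{
		int interval = ++global_faults - t->faultstamp;

		if (!token_holder) {
			t->token_priority += 2;		/* first come, first served */
			token_holder = t;
		} else if (t != token_holder) {
			if (interval < t->last_interval)
				t->token_priority++;	/* faulting more often than before */
			else if (--t->token_priority < 0)
				t->token_priority = 0;
			if (t->token_priority > token_holder->token_priority) {
				t->token_priority += 2;	/* take the token over */
				token_holder = t;
			}
		} else {
			t->token_priority += 2;		/* holder keeps re-faulting */
		}
		t->faultstamp = global_faults;
		t->last_interval = interval;
	}

	int main(void)
	{
		struct task_model a = {0}, b = {0};
		int i;

		for (i = 0; i < 5; i++) {
			fault(&a);
			fault(&b);
		}
		printf("holder is %s\n", token_holder == &a ? "a" : "b");
		return 0;
	}
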
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 518540a4a2a6..093f5fe6dd77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1172,11 +1173,12 @@ loop_again:
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
 					       0, 0)) {
 				end_zone = i;
-				goto scan;
+				break;
 			}
 		}
-		goto out;
-scan:
+		if (i < 0)
+			goto out;
+
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
@@ -1259,6 +1261,9 @@ out:
 	}
 	if (!all_zones_ok) {
 		cond_resched();
+
+		try_to_freeze();
+
 		goto loop_again;
 	}
 
@@ -1508,7 +1513,6 @@ out:
 }
 #endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness. So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
@@ -1529,7 +1533,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	}
 	return NOTIFY_OK;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
 /*
  * This kswapd start function will be called by init and node-hot-add.
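The new linux/freezer.h include and the try_to_freeze() call put kswapd's main loop into the standard shape for freezable kernel threads. A generic sketch of that shape, with the thread function and its work step invented for illustration:

	#include <linux/kthread.h>
	#include <linux/freezer.h>
	#include <linux/sched.h>

	/* Hypothetical freezable worker thread. */
	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			try_to_freeze();		/* park here during suspend */

			/* ... do one unit of work, then sleep ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}
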
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8614e8f6743b..dc005a0c96ae 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-struct seq_operations fragmentation_op = {
+const struct seq_operations fragmentation_op = {
 	.start = frag_start,
 	.next = frag_next,
 	.stop = frag_stop,
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = {
 #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
 					TEXT_FOR_HIGHMEM(xx)
 
-static char *vmstat_text[] = {
+static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_anon_pages",
 	"nr_mapped",
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	return 0;
 }
 
-struct seq_operations zoneinfo_op = {
+const struct seq_operations zoneinfo_op = {
 	.start = frag_start, /* iterate over all zones. The same as in
 			      * fragmentation. */
 	.next = frag_next,
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
 	m->private = NULL;
 }
 
-struct seq_operations vmstat_op = {
+const struct seq_operations vmstat_op = {
 	.start = vmstat_start,
 	.next = vmstat_next,
 	.stop = vmstat_stop,
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		void *hcpu)
 {
 	switch (action) {
-		case CPU_UP_PREPARE:
-		case CPU_UP_CANCELED:
-		case CPU_DEAD:
-			refresh_zone_stat_thresholds();
-			break;
-		default:
-			break;
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
 	}
 	return NOTIFY_OK;
 }
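vmstat_cpuup_callback() above is a plain CPU hotplug notifier that refreshes the per-zone stat thresholds whenever a CPU comes or goes. A hedged sketch of how such a callback is typically wired up with the notifier API of this era; the my_cpu_callback/my_cpu_notifier names are illustrative, not taken from the patch:

	#include <linux/notifier.h>
	#include <linux/cpu.h>
	#include <linux/init.h>

	static int my_cpu_callback(struct notifier_block *nfb,
				   unsigned long action, void *hcpu)
	{
		switch (action) {
		case CPU_UP_PREPARE:
		case CPU_UP_CANCELED:
		case CPU_DEAD:
			/* recompute per-cpu derived state here */
			break;
		default:
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block my_cpu_notifier = {
		.notifier_call = my_cpu_callback,
	};

	static int __init my_notifier_init(void)
	{
		register_cpu_notifier(&my_cpu_notifier);
		return 0;
	}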