Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            2
-rw-r--r--  mm/filemap.c          2
-rw-r--r--  mm/hugetlb.c          1
-rw-r--r--  mm/memory.c          89
-rw-r--r--  mm/mmap.c             4
-rw-r--r--  mm/nommu.c            4
-rw-r--r--  mm/page-writeback.c   1
-rw-r--r--  mm/page_alloc.c     249
-rw-r--r--  mm/readahead.c       31
-rw-r--r--  mm/slab.c           131
-rw-r--r--  mm/swap.c             1
-rw-r--r--  mm/swap_state.c       1
-rw-r--r--  mm/swapfile.c         2
-rw-r--r--  mm/vmalloc.c          4
-rw-r--r--  mm/vmscan.c           6
15 files changed, 276 insertions, 252 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1a4473fcb2ca..ae9ce6b73e8a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -126,9 +126,11 @@ comment "Memory hotplug is currently incompatible with Software Suspend"
126# Default to 4 for wider testing, though 8 might be more appropriate. 126# Default to 4 for wider testing, though 8 might be more appropriate.
127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. 127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
128# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. 128# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
129# ARM26 and SPARC32 and PPC64 may use one page for multiple page tables.
129# 130#
130config SPLIT_PTLOCK_CPUS 131config SPLIT_PTLOCK_CPUS
131 int 132 int
132 default "4096" if ARM && !CPU_CACHE_VIPT 133 default "4096" if ARM && !CPU_CACHE_VIPT
133 default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT 134 default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
135 default "4096" if ARM26 || SPARC32 || PPC64
134 default "4" 136 default "4"
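The new Kconfig defaults keep SPLIT_PTLOCK_CPUS effectively disabled (value 4096) on the architectures listed in the comments and at 4 everywhere else. As a rough illustration of how such a CPU-count threshold is typically consumed, here is a minimal user-space sketch; the macro and helper names other than CONFIG_SPLIT_PTLOCK_CPUS and NR_CPUS are simplified stand-ins, not the kernel's real mm.h definitions.

/*
 * Illustrative sketch: a CPU-count threshold like CONFIG_SPLIT_PTLOCK_CPUS
 * is compared against NR_CPUS at build time to choose between one mm-wide
 * page_table_lock and a per-page-table-page lock.  Names below are
 * stand-ins for illustration only.
 */
#include <stdio.h>

#define NR_CPUS                  8   /* build-time CPU limit (assumed) */
#define CONFIG_SPLIT_PTLOCK_CPUS 4   /* value selected by mm/Kconfig   */

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define USE_SPLIT_PTLOCKS 1          /* lock per page-table page        */
#else
#define USE_SPLIT_PTLOCKS 0          /* fall back to mm-wide lock       */
#endif

int main(void)
{
    printf("split pte locks: %s\n", USE_SPLIT_PTLOCKS ? "yes" : "no");
    return 0;
}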
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d6e4c2000dc..33a28bfde158 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -134,7 +134,7 @@ static int sync_page(void *word)
134 struct address_space *mapping; 134 struct address_space *mapping;
135 struct page *page; 135 struct page *page;
136 136
137 page = container_of((page_flags_t *)word, struct page, flags); 137 page = container_of((unsigned long *)word, struct page, flags);
138 138
139 /* 139 /*
140 * page_mapping() is being called without PG_locked held. 140 * page_mapping() is being called without PG_locked held.
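The hunk above only changes the cast (page->flags is now a plain unsigned long rather than page_flags_t), but the container_of() pattern it relies on is worth seeing in isolation: from a pointer to the flags word handed back by the bit-wait machinery, sync_page() recovers the enclosing struct page. A minimal user-space sketch, with a toy struct page:

/*
 * Minimal sketch of the container_of() pattern used by sync_page():
 * given a pointer to the 'flags' member, recover the enclosing struct.
 * 'struct page' here is a toy stand-in, not the kernel's.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct page {
    unsigned long flags;
    void *mapping;
};

int main(void)
{
    struct page pg = { .flags = 0x5, .mapping = NULL };
    unsigned long *word = &pg.flags;  /* what the wait-on-bit path hands back */
    struct page *p = container_of(word, struct page, flags);

    printf("recovered page: %p (expected %p)\n", (void *)p, (void *)&pg);
    return 0;
}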
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a565808da3f..728e9bda12ea 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -237,7 +237,6 @@ unsigned long hugetlb_total_pages(void)
237{ 237{
238 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 238 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
239} 239}
240EXPORT_SYMBOL(hugetlb_total_pages);
241 240
242/* 241/*
243 * We cannot handle pagefaults against hugetlb pages at all. They cause 242 * We cannot handle pagefaults against hugetlb pages at all. They cause
diff --git a/mm/memory.c b/mm/memory.c
index 0f60baf6f69b..2998cfc12f5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -549,10 +549,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
549 return 0; 549 return 0;
550} 550}
551 551
552static void zap_pte_range(struct mmu_gather *tlb, 552static unsigned long zap_pte_range(struct mmu_gather *tlb,
553 struct vm_area_struct *vma, pmd_t *pmd, 553 struct vm_area_struct *vma, pmd_t *pmd,
554 unsigned long addr, unsigned long end, 554 unsigned long addr, unsigned long end,
555 struct zap_details *details) 555 long *zap_work, struct zap_details *details)
556{ 556{
557 struct mm_struct *mm = tlb->mm; 557 struct mm_struct *mm = tlb->mm;
558 pte_t *pte; 558 pte_t *pte;
@@ -563,10 +563,15 @@ static void zap_pte_range(struct mmu_gather *tlb,
563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
564 do { 564 do {
565 pte_t ptent = *pte; 565 pte_t ptent = *pte;
566 if (pte_none(ptent)) 566 if (pte_none(ptent)) {
567 (*zap_work)--;
567 continue; 568 continue;
569 }
568 if (pte_present(ptent)) { 570 if (pte_present(ptent)) {
569 struct page *page = NULL; 571 struct page *page = NULL;
572
573 (*zap_work) -= PAGE_SIZE;
574
570 if (!(vma->vm_flags & VM_RESERVED)) { 575 if (!(vma->vm_flags & VM_RESERVED)) {
571 unsigned long pfn = pte_pfn(ptent); 576 unsigned long pfn = pte_pfn(ptent);
572 if (unlikely(!pfn_valid(pfn))) 577 if (unlikely(!pfn_valid(pfn)))
@@ -624,16 +629,18 @@ static void zap_pte_range(struct mmu_gather *tlb,
624 if (!pte_file(ptent)) 629 if (!pte_file(ptent))
625 free_swap_and_cache(pte_to_swp_entry(ptent)); 630 free_swap_and_cache(pte_to_swp_entry(ptent));
626 pte_clear_full(mm, addr, pte, tlb->fullmm); 631 pte_clear_full(mm, addr, pte, tlb->fullmm);
627 } while (pte++, addr += PAGE_SIZE, addr != end); 632 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
628 633
629 add_mm_rss(mm, file_rss, anon_rss); 634 add_mm_rss(mm, file_rss, anon_rss);
630 pte_unmap_unlock(pte - 1, ptl); 635 pte_unmap_unlock(pte - 1, ptl);
636
637 return addr;
631} 638}
632 639
633static inline void zap_pmd_range(struct mmu_gather *tlb, 640static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
634 struct vm_area_struct *vma, pud_t *pud, 641 struct vm_area_struct *vma, pud_t *pud,
635 unsigned long addr, unsigned long end, 642 unsigned long addr, unsigned long end,
636 struct zap_details *details) 643 long *zap_work, struct zap_details *details)
637{ 644{
638 pmd_t *pmd; 645 pmd_t *pmd;
639 unsigned long next; 646 unsigned long next;
@@ -641,16 +648,21 @@ static inline void zap_pmd_range(struct mmu_gather *tlb,
641 pmd = pmd_offset(pud, addr); 648 pmd = pmd_offset(pud, addr);
642 do { 649 do {
643 next = pmd_addr_end(addr, end); 650 next = pmd_addr_end(addr, end);
644 if (pmd_none_or_clear_bad(pmd)) 651 if (pmd_none_or_clear_bad(pmd)) {
652 (*zap_work)--;
645 continue; 653 continue;
646 zap_pte_range(tlb, vma, pmd, addr, next, details); 654 }
647 } while (pmd++, addr = next, addr != end); 655 next = zap_pte_range(tlb, vma, pmd, addr, next,
656 zap_work, details);
657 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
658
659 return addr;
648} 660}
649 661
650static inline void zap_pud_range(struct mmu_gather *tlb, 662static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
651 struct vm_area_struct *vma, pgd_t *pgd, 663 struct vm_area_struct *vma, pgd_t *pgd,
652 unsigned long addr, unsigned long end, 664 unsigned long addr, unsigned long end,
653 struct zap_details *details) 665 long *zap_work, struct zap_details *details)
654{ 666{
655 pud_t *pud; 667 pud_t *pud;
656 unsigned long next; 668 unsigned long next;
@@ -658,15 +670,21 @@ static inline void zap_pud_range(struct mmu_gather *tlb,
658 pud = pud_offset(pgd, addr); 670 pud = pud_offset(pgd, addr);
659 do { 671 do {
660 next = pud_addr_end(addr, end); 672 next = pud_addr_end(addr, end);
661 if (pud_none_or_clear_bad(pud)) 673 if (pud_none_or_clear_bad(pud)) {
674 (*zap_work)--;
662 continue; 675 continue;
663 zap_pmd_range(tlb, vma, pud, addr, next, details); 676 }
664 } while (pud++, addr = next, addr != end); 677 next = zap_pmd_range(tlb, vma, pud, addr, next,
678 zap_work, details);
679 } while (pud++, addr = next, (addr != end && *zap_work > 0));
680
681 return addr;
665} 682}
666 683
667static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 684static unsigned long unmap_page_range(struct mmu_gather *tlb,
685 struct vm_area_struct *vma,
668 unsigned long addr, unsigned long end, 686 unsigned long addr, unsigned long end,
669 struct zap_details *details) 687 long *zap_work, struct zap_details *details)
670{ 688{
671 pgd_t *pgd; 689 pgd_t *pgd;
672 unsigned long next; 690 unsigned long next;
@@ -679,11 +697,16 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
679 pgd = pgd_offset(vma->vm_mm, addr); 697 pgd = pgd_offset(vma->vm_mm, addr);
680 do { 698 do {
681 next = pgd_addr_end(addr, end); 699 next = pgd_addr_end(addr, end);
682 if (pgd_none_or_clear_bad(pgd)) 700 if (pgd_none_or_clear_bad(pgd)) {
701 (*zap_work)--;
683 continue; 702 continue;
684 zap_pud_range(tlb, vma, pgd, addr, next, details); 703 }
685 } while (pgd++, addr = next, addr != end); 704 next = zap_pud_range(tlb, vma, pgd, addr, next,
705 zap_work, details);
706 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
686 tlb_end_vma(tlb, vma); 707 tlb_end_vma(tlb, vma);
708
709 return addr;
687} 710}
688 711
689#ifdef CONFIG_PREEMPT 712#ifdef CONFIG_PREEMPT
@@ -724,7 +747,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
724 unsigned long end_addr, unsigned long *nr_accounted, 747 unsigned long end_addr, unsigned long *nr_accounted,
725 struct zap_details *details) 748 struct zap_details *details)
726{ 749{
727 unsigned long zap_bytes = ZAP_BLOCK_SIZE; 750 long zap_work = ZAP_BLOCK_SIZE;
728 unsigned long tlb_start = 0; /* For tlb_finish_mmu */ 751 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
729 int tlb_start_valid = 0; 752 int tlb_start_valid = 0;
730 unsigned long start = start_addr; 753 unsigned long start = start_addr;
@@ -745,27 +768,25 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
745 *nr_accounted += (end - start) >> PAGE_SHIFT; 768 *nr_accounted += (end - start) >> PAGE_SHIFT;
746 769
747 while (start != end) { 770 while (start != end) {
748 unsigned long block;
749
750 if (!tlb_start_valid) { 771 if (!tlb_start_valid) {
751 tlb_start = start; 772 tlb_start = start;
752 tlb_start_valid = 1; 773 tlb_start_valid = 1;
753 } 774 }
754 775
755 if (is_vm_hugetlb_page(vma)) { 776 if (unlikely(is_vm_hugetlb_page(vma))) {
756 block = end - start;
757 unmap_hugepage_range(vma, start, end); 777 unmap_hugepage_range(vma, start, end);
758 } else { 778 zap_work -= (end - start) /
759 block = min(zap_bytes, end - start); 779 (HPAGE_SIZE / PAGE_SIZE);
760 unmap_page_range(*tlbp, vma, start, 780 start = end;
761 start + block, details); 781 } else
782 start = unmap_page_range(*tlbp, vma,
783 start, end, &zap_work, details);
784
785 if (zap_work > 0) {
786 BUG_ON(start != end);
787 break;
762 } 788 }
763 789
764 start += block;
765 zap_bytes -= block;
766 if ((long)zap_bytes > 0)
767 continue;
768
769 tlb_finish_mmu(*tlbp, tlb_start, start); 790 tlb_finish_mmu(*tlbp, tlb_start, start);
770 791
771 if (need_resched() || 792 if (need_resched() ||
@@ -779,7 +800,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
779 800
780 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); 801 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
781 tlb_start_valid = 0; 802 tlb_start_valid = 0;
782 zap_bytes = ZAP_BLOCK_SIZE; 803 zap_work = ZAP_BLOCK_SIZE;
783 } 804 }
784 } 805 }
785out: 806out:
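The memory.c changes replace the fixed zap_bytes byte counter with a signed zap_work budget that every level of the page-table walk decrements (by PAGE_SIZE for each present pte, by one for each empty entry it merely skips) and that each loop condition checks, so unmap_vmas() regains control often enough to call tlb_finish_mmu() and reschedule even across large sparse ranges. The following is a simplified, self-contained sketch of that budget-driven loop shape; the array "page table", the constants and the batch handling are all invented for illustration.

/*
 * Simplified sketch of the zap_work idea: walk a range with a signed
 * work budget, charge more for real work than for skipping empty slots,
 * and bail out of the inner loop so the caller can flush and resched.
 */
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)   /* per-batch budget (illustrative) */
#define NPTES          64

static int present[NPTES];               /* 1 = populated entry */

/* Returns the index it stopped at, like zap_pte_range() returns 'addr'. */
static unsigned long zap_range(unsigned long idx, unsigned long end,
                               long *zap_work)
{
    do {
        if (!present[idx]) {
            (*zap_work)--;               /* cheap: entry was empty   */
            continue;
        }
        present[idx] = 0;                /* "unmap" the entry        */
        *zap_work -= PAGE_SIZE;          /* expensive: real teardown */
    } while (++idx != end && *zap_work > 0);

    return idx;
}

int main(void)
{
    unsigned long idx = 0, end = NPTES;
    long zap_work = ZAP_BLOCK_SIZE;
    int batches = 0;

    for (int i = 0; i < NPTES; i += 2)
        present[i] = 1;                  /* half the entries populated */

    while (idx != end) {
        idx = zap_range(idx, end, &zap_work);
        batches++;                       /* flush TLB batch, cond_resched() here */
        zap_work = ZAP_BLOCK_SIZE;
    }
    printf("finished in %d batches\n", batches);
    return 0;
}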
diff --git a/mm/mmap.c b/mm/mmap.c
index 320dda1778c3..6c997b159600 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -155,10 +155,6 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
155 return -ENOMEM; 155 return -ENOMEM;
156} 156}
157 157
158EXPORT_SYMBOL(sysctl_overcommit_memory);
159EXPORT_SYMBOL(sysctl_overcommit_ratio);
160EXPORT_SYMBOL(sysctl_max_map_count);
161EXPORT_SYMBOL(vm_committed_space);
162EXPORT_SYMBOL(__vm_enough_memory); 158EXPORT_SYMBOL(__vm_enough_memory);
163 159
164/* 160/*
diff --git a/mm/nommu.c b/mm/nommu.c
index d1e076a487cb..6deb6ab3d6ad 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,10 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
44int heap_stack_gap = 0; 44int heap_stack_gap = 0;
45 45
46EXPORT_SYMBOL(mem_map); 46EXPORT_SYMBOL(mem_map);
47EXPORT_SYMBOL(sysctl_max_map_count);
48EXPORT_SYMBOL(sysctl_overcommit_memory);
49EXPORT_SYMBOL(sysctl_overcommit_ratio);
50EXPORT_SYMBOL(vm_committed_space);
51EXPORT_SYMBOL(__vm_enough_memory); 47EXPORT_SYMBOL(__vm_enough_memory);
52 48
53/* list of shareable VMAs */ 49/* list of shareable VMAs */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9ee..74138c9a22b9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -750,7 +750,6 @@ int clear_page_dirty_for_io(struct page *page)
750 } 750 }
751 return TestClearPageDirty(page); 751 return TestClearPageDirty(page);
752} 752}
753EXPORT_SYMBOL(clear_page_dirty_for_io);
754 753
755int test_clear_page_writeback(struct page *page) 754int test_clear_page_writeback(struct page *page)
756{ 755{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2dbdd98426fd..bd4de592dc23 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,11 +60,13 @@ long nr_swap_pages;
60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
62 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 62 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
63 *
64 * TBD: should special case ZONE_DMA32 machines here - in those we normally
65 * don't need any ZONE_NORMAL reservation
63 */ 66 */
64int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 67int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
65 68
66EXPORT_SYMBOL(totalram_pages); 69EXPORT_SYMBOL(totalram_pages);
67EXPORT_SYMBOL(nr_swap_pages);
68 70
69/* 71/*
70 * Used by page_zone() to look up the address of the struct zone whose 72 * Used by page_zone() to look up the address of the struct zone whose
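The figures in the comment above follow from one rule: the reserve a lower zone keeps against allocations aimed at a higher zone is the total size of the zones above it (up to the target zone) divided by that lower zone's sysctl_lowmem_reserve_ratio entry, which this patch extends to { 256, 256, 32 } for the new DMA32 slot. A worked sketch of that arithmetic using the zone sizes quoted in the comment; this is an illustration, not the kernel's setup_per_zone_lowmem_reserve().

/*
 * Worked example of the lowmem_reserve arithmetic: reserve kept in a
 * lower zone against a higher-zone allocation is (pages above the lower
 * zone, up to the target) / ratio[lower zone].  Pre-DMA32 values.
 */
#include <stdio.h>

enum zone_id { DMA, NORMAL, HIGHMEM, NR_ZONES };

static const char *name[NR_ZONES]       = { "DMA", "Normal", "HighMem" };
static const long  size_mb[NR_ZONES]    = { 16, 784, 224 };  /* ~1G x86 box */
static const long  ratio[NR_ZONES - 1]  = { 256, 32 };

int main(void)
{
    for (int lower = 0; lower < NR_ZONES - 1; lower++) {
        long above = 0;
        for (int target = lower + 1; target < NR_ZONES; target++) {
            above += size_mb[target];
            printf("%-7s allocation reserves %ldM/%ld = %ldM in ZONE_%s\n",
                   name[target], above, ratio[lower],
                   above / ratio[lower], name[lower]);
        }
    }
    return 0;
}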
@@ -73,7 +75,7 @@ EXPORT_SYMBOL(nr_swap_pages);
73struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 75struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
74EXPORT_SYMBOL(zone_table); 76EXPORT_SYMBOL(zone_table);
75 77
76static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 78static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
77int min_free_kbytes = 1024; 79int min_free_kbytes = 1024;
78 80
79unsigned long __initdata nr_kernel_pages; 81unsigned long __initdata nr_kernel_pages;
@@ -125,7 +127,7 @@ static void bad_page(const char *function, struct page *page)
125 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
126 function, current->comm, page); 128 function, current->comm, page);
127 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
128 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
129 page->mapping, page_mapcount(page), page_count(page)); 131 page->mapping, page_mapcount(page), page_count(page));
130 printk(KERN_EMERG "Backtrace:\n"); 132 printk(KERN_EMERG "Backtrace:\n");
131 dump_stack(); 133 dump_stack();
@@ -733,9 +735,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
733 } 735 }
734 local_irq_restore(flags); 736 local_irq_restore(flags);
735 put_cpu(); 737 put_cpu();
736 } 738 } else {
737
738 if (page == NULL) {
739 spin_lock_irqsave(&zone->lock, flags); 739 spin_lock_irqsave(&zone->lock, flags);
740 page = __rmqueue(zone, order); 740 page = __rmqueue(zone, order);
741 spin_unlock_irqrestore(&zone->lock, flags); 741 spin_unlock_irqrestore(&zone->lock, flags);
@@ -755,20 +755,25 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
755 return page; 755 return page;
756} 756}
757 757
758#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
759#define ALLOC_HARDER 0x02 /* try to alloc harder */
760#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */
761#define ALLOC_CPUSET 0x08 /* check for correct cpuset */
762
758/* 763/*
759 * Return 1 if free pages are above 'mark'. This takes into account the order 764 * Return 1 if free pages are above 'mark'. This takes into account the order
760 * of the allocation. 765 * of the allocation.
761 */ 766 */
762int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 767int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
763 int classzone_idx, int can_try_harder, gfp_t gfp_high) 768 int classzone_idx, int alloc_flags)
764{ 769{
765 /* free_pages my go negative - that's OK */ 770 /* free_pages my go negative - that's OK */
766 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 771 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
767 int o; 772 int o;
768 773
769 if (gfp_high) 774 if (alloc_flags & ALLOC_HIGH)
770 min -= min / 2; 775 min -= min / 2;
771 if (can_try_harder) 776 if (alloc_flags & ALLOC_HARDER)
772 min -= min / 4; 777 min -= min / 4;
773 778
774 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 779 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
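With this hunk, zone_watermark_ok() takes a single ALLOC_ flag mask instead of separate can_try_harder/gfp_high arguments: ALLOC_HIGH halves the effective minimum, ALLOC_HARDER shaves off another quarter, and the remaining free pages must still exceed that minimum plus the classzone's lowmem reserve. A condensed user-space sketch of the order-0 case (the real function also walks the per-order free lists, omitted here); the struct is a stand-in, not the kernel's struct zone.

/*
 * Condensed sketch of the reworked watermark check for an order-0
 * allocation: ALLOC_HIGH halves the threshold, ALLOC_HARDER removes
 * another quarter, then free pages must beat threshold + lowmem reserve.
 */
#include <stdio.h>

#define ALLOC_NO_WATERMARKS 0x01
#define ALLOC_HARDER        0x02
#define ALLOC_HIGH          0x04
#define ALLOC_CPUSET        0x08

struct zone_sketch {
    long free_pages;
    long lowmem_reserve;   /* already selected for the classzone index */
};

static int watermark_ok(struct zone_sketch *z, long mark, int alloc_flags)
{
    long min = mark;
    long free_pages = z->free_pages;   /* order 0: no (1 << order) - 1 term */

    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;

    return free_pages > min + z->lowmem_reserve;
}

int main(void)
{
    struct zone_sketch z = { .free_pages = 90, .lowmem_reserve = 20 };

    printf("normal request:  %d\n", watermark_ok(&z, 100, 0));
    printf("atomic-ish:      %d\n", watermark_ok(&z, 100, ALLOC_HIGH | ALLOC_HARDER));
    return 0;
}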
@@ -786,14 +791,40 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
786 return 1; 791 return 1;
787} 792}
788 793
789static inline int 794/*
790should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 795 * get_page_from_freeliest goes through the zonelist trying to allocate
796 * a page.
797 */
798static struct page *
799get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
800 struct zonelist *zonelist, int alloc_flags)
791{ 801{
792 if (!z->reclaim_pages) 802 struct zone **z = zonelist->zones;
793 return 0; 803 struct page *page = NULL;
794 if (gfp_mask & __GFP_NORECLAIM) 804 int classzone_idx = zone_idx(*z);
795 return 0; 805
796 return 1; 806 /*
807 * Go through the zonelist once, looking for a zone with enough free.
808 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
809 */
810 do {
811 if ((alloc_flags & ALLOC_CPUSET) &&
812 !cpuset_zone_allowed(*z, gfp_mask))
813 continue;
814
815 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
816 if (!zone_watermark_ok(*z, order, (*z)->pages_low,
817 classzone_idx, alloc_flags))
818 continue;
819 }
820
821 page = buffered_rmqueue(*z, order, gfp_mask);
822 if (page) {
823 zone_statistics(zonelist, *z);
824 break;
825 }
826 } while (*(++z) != NULL);
827 return page;
797} 828}
798 829
799/* 830/*
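get_page_from_freelist(), added above, centralizes the loop that __alloc_pages() previously open-coded in several places: walk the NULL-terminated zonelist, skip zones the cpuset disallows, skip zones below their watermark unless ALLOC_NO_WATERMARKS is set, and take the first page that comes back. A stripped-down sketch of that walk, with toy stand-ins for the cpuset check, the watermark check and buffered_rmqueue():

/*
 * Stripped-down sketch of the zonelist walk in get_page_from_freelist().
 * Types and helper behaviour are illustrative stand-ins only.
 */
#include <stddef.h>
#include <stdio.h>

#define ALLOC_NO_WATERMARKS 0x01
#define ALLOC_CPUSET        0x08

struct zone_sketch {
    const char *name;
    int allowed;           /* stand-in for cpuset_zone_allowed() */
    long free_pages;
    long pages_low;
};

static void *try_alloc(struct zone_sketch *z)
{
    return z->free_pages > 0 ? (void *)z : NULL;   /* stand-in for buffered_rmqueue() */
}

static void *get_page_from_freelist(struct zone_sketch **zonelist, int alloc_flags)
{
    for (struct zone_sketch **z = zonelist; *z != NULL; z++) {
        if ((alloc_flags & ALLOC_CPUSET) && !(*z)->allowed)
            continue;
        if (!(alloc_flags & ALLOC_NO_WATERMARKS) &&
            (*z)->free_pages <= (*z)->pages_low)
            continue;
        void *page = try_alloc(*z);
        if (page) {
            printf("allocated from %s\n", (*z)->name);
            return page;
        }
    }
    return NULL;
}

int main(void)
{
    struct zone_sketch high = { "HighMem", 1, 10, 50 };   /* below watermark */
    struct zone_sketch norm = { "Normal",  1, 500, 100 };
    struct zone_sketch *zonelist[] = { &high, &norm, NULL };

    get_page_from_freelist(zonelist, ALLOC_CPUSET);
    return 0;
}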
@@ -804,105 +835,76 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
804 struct zonelist *zonelist) 835 struct zonelist *zonelist)
805{ 836{
806 const gfp_t wait = gfp_mask & __GFP_WAIT; 837 const gfp_t wait = gfp_mask & __GFP_WAIT;
807 struct zone **zones, *z; 838 struct zone **z;
808 struct page *page; 839 struct page *page;
809 struct reclaim_state reclaim_state; 840 struct reclaim_state reclaim_state;
810 struct task_struct *p = current; 841 struct task_struct *p = current;
811 int i;
812 int classzone_idx;
813 int do_retry; 842 int do_retry;
814 int can_try_harder; 843 int alloc_flags;
815 int did_some_progress; 844 int did_some_progress;
816 845
817 might_sleep_if(wait); 846 might_sleep_if(wait);
818 847
819 /* 848restart:
820 * The caller may dip into page reserves a bit more if the caller 849 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
821 * cannot run direct reclaim, or is the caller has realtime scheduling
822 * policy
823 */
824 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
825
826 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
827 850
828 if (unlikely(zones[0] == NULL)) { 851 if (unlikely(*z == NULL)) {
829 /* Should this ever happen?? */ 852 /* Should this ever happen?? */
830 return NULL; 853 return NULL;
831 } 854 }
832 855
833 classzone_idx = zone_idx(zones[0]); 856 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
857 zonelist, ALLOC_CPUSET);
858 if (page)
859 goto got_pg;
860
861 do {
862 wakeup_kswapd(*z, order);
863 } while (*(++z));
834 864
835restart:
836 /* 865 /*
837 * Go through the zonelist once, looking for a zone with enough free. 866 * OK, we're below the kswapd watermark and have kicked background
838 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 867 * reclaim. Now things get more complex, so set up alloc_flags according
868 * to how we want to proceed.
869 *
870 * The caller may dip into page reserves a bit more if the caller
871 * cannot run direct reclaim, or if the caller has realtime scheduling
872 * policy.
839 */ 873 */
840 for (i = 0; (z = zones[i]) != NULL; i++) { 874 alloc_flags = 0;
841 int do_reclaim = should_reclaim_zone(z, gfp_mask); 875 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
842 876 alloc_flags |= ALLOC_HARDER;
843 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 877 if (gfp_mask & __GFP_HIGH)
844 continue; 878 alloc_flags |= ALLOC_HIGH;
845 879 if (wait)
846 /* 880 alloc_flags |= ALLOC_CPUSET;
847 * If the zone is to attempt early page reclaim then this loop
848 * will try to reclaim pages and check the watermark a second
849 * time before giving up and falling back to the next zone.
850 */
851zone_reclaim_retry:
852 if (!zone_watermark_ok(z, order, z->pages_low,
853 classzone_idx, 0, 0)) {
854 if (!do_reclaim)
855 continue;
856 else {
857 zone_reclaim(z, gfp_mask, order);
858 /* Only try reclaim once */
859 do_reclaim = 0;
860 goto zone_reclaim_retry;
861 }
862 }
863
864 page = buffered_rmqueue(z, order, gfp_mask);
865 if (page)
866 goto got_pg;
867 }
868
869 for (i = 0; (z = zones[i]) != NULL; i++)
870 wakeup_kswapd(z, order);
871 881
872 /* 882 /*
873 * Go through the zonelist again. Let __GFP_HIGH and allocations 883 * Go through the zonelist again. Let __GFP_HIGH and allocations
874 * coming from realtime tasks to go deeper into reserves 884 * coming from realtime tasks go deeper into reserves.
875 * 885 *
876 * This is the last chance, in general, before the goto nopage. 886 * This is the last chance, in general, before the goto nopage.
877 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 887 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
878 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 888 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
879 */ 889 */
880 for (i = 0; (z = zones[i]) != NULL; i++) { 890 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
881 if (!zone_watermark_ok(z, order, z->pages_min, 891 if (page)
882 classzone_idx, can_try_harder, 892 goto got_pg;
883 gfp_mask & __GFP_HIGH))
884 continue;
885
886 if (wait && !cpuset_zone_allowed(z, gfp_mask))
887 continue;
888
889 page = buffered_rmqueue(z, order, gfp_mask);
890 if (page)
891 goto got_pg;
892 }
893 893
894 /* This allocation should allow future memory freeing. */ 894 /* This allocation should allow future memory freeing. */
895 895
896 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 896 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
897 && !in_interrupt()) { 897 && !in_interrupt()) {
898 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 898 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
899nofail_alloc:
899 /* go through the zonelist yet again, ignoring mins */ 900 /* go through the zonelist yet again, ignoring mins */
900 for (i = 0; (z = zones[i]) != NULL; i++) { 901 page = get_page_from_freelist(gfp_mask, order,
901 if (!cpuset_zone_allowed(z, gfp_mask)) 902 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
902 continue; 903 if (page)
903 page = buffered_rmqueue(z, order, gfp_mask); 904 goto got_pg;
904 if (page) 905 if (gfp_mask & __GFP_NOFAIL) {
905 goto got_pg; 906 blk_congestion_wait(WRITE, HZ/50);
907 goto nofail_alloc;
906 } 908 }
907 } 909 }
908 goto nopage; 910 goto nopage;
@@ -920,7 +922,7 @@ rebalance:
920 reclaim_state.reclaimed_slab = 0; 922 reclaim_state.reclaimed_slab = 0;
921 p->reclaim_state = &reclaim_state; 923 p->reclaim_state = &reclaim_state;
922 924
923 did_some_progress = try_to_free_pages(zones, gfp_mask); 925 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
924 926
925 p->reclaim_state = NULL; 927 p->reclaim_state = NULL;
926 p->flags &= ~PF_MEMALLOC; 928 p->flags &= ~PF_MEMALLOC;
@@ -928,19 +930,10 @@ rebalance:
928 cond_resched(); 930 cond_resched();
929 931
930 if (likely(did_some_progress)) { 932 if (likely(did_some_progress)) {
931 for (i = 0; (z = zones[i]) != NULL; i++) { 933 page = get_page_from_freelist(gfp_mask, order,
932 if (!zone_watermark_ok(z, order, z->pages_min, 934 zonelist, alloc_flags);
933 classzone_idx, can_try_harder, 935 if (page)
934 gfp_mask & __GFP_HIGH)) 936 goto got_pg;
935 continue;
936
937 if (!cpuset_zone_allowed(z, gfp_mask))
938 continue;
939
940 page = buffered_rmqueue(z, order, gfp_mask);
941 if (page)
942 goto got_pg;
943 }
944 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 937 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
945 /* 938 /*
946 * Go through the zonelist yet one more time, keep 939 * Go through the zonelist yet one more time, keep
@@ -948,18 +941,10 @@ rebalance:
948 * a parallel oom killing, we must fail if we're still 941 * a parallel oom killing, we must fail if we're still
949 * under heavy pressure. 942 * under heavy pressure.
950 */ 943 */
951 for (i = 0; (z = zones[i]) != NULL; i++) { 944 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
952 if (!zone_watermark_ok(z, order, z->pages_high, 945 zonelist, ALLOC_CPUSET);
953 classzone_idx, 0, 0)) 946 if (page)
954 continue; 947 goto got_pg;
955
956 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
957 continue;
958
959 page = buffered_rmqueue(z, order, gfp_mask);
960 if (page)
961 goto got_pg;
962 }
963 948
964 out_of_memory(gfp_mask, order); 949 out_of_memory(gfp_mask, order);
965 goto restart; 950 goto restart;
@@ -992,9 +977,7 @@ nopage:
992 dump_stack(); 977 dump_stack();
993 show_mem(); 978 show_mem();
994 } 979 }
995 return NULL;
996got_pg: 980got_pg:
997 zone_statistics(zonelist, z);
998 return page; 981 return page;
999} 982}
1000 983
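The restructured slow path derives one alloc_flags mask up front, per the comment in the hunk: callers that cannot reclaim (no __GFP_WAIT) or are realtime tasks outside interrupt context get ALLOC_HARDER, __GFP_HIGH maps to ALLOC_HIGH, and only sleeping allocations keep honouring cpusets. A small sketch of that mapping; the gfp bits and task predicates are simplified stand-ins.

/*
 * Sketch of the alloc_flags derivation used by the reworked
 * __alloc_pages() slow path.  Bit values are illustrative.
 */
#include <stdio.h>

#define __GFP_WAIT 0x10u
#define __GFP_HIGH 0x20u

#define ALLOC_NO_WATERMARKS 0x01
#define ALLOC_HARDER        0x02
#define ALLOC_HIGH          0x04
#define ALLOC_CPUSET        0x08

static int derive_alloc_flags(unsigned int gfp_mask, int rt_task, int in_interrupt)
{
    int can_wait = gfp_mask & __GFP_WAIT;
    int alloc_flags = 0;

    if ((rt_task && !in_interrupt) || !can_wait)
        alloc_flags |= ALLOC_HARDER;     /* may dig deeper into reserves */
    if (gfp_mask & __GFP_HIGH)
        alloc_flags |= ALLOC_HIGH;
    if (can_wait)
        alloc_flags |= ALLOC_CPUSET;     /* only sleepers respect cpusets */

    return alloc_flags;
}

int main(void)
{
    /* GFP_ATOMIC-like: __GFP_HIGH set, cannot wait */
    printf("atomic:   0x%x\n", derive_alloc_flags(__GFP_HIGH, 0, 1));
    /* GFP_KERNEL-like: may sleep */
    printf("sleeping: 0x%x\n", derive_alloc_flags(__GFP_WAIT, 0, 0));
    return 0;
}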
@@ -1331,7 +1314,7 @@ void show_free_areas(void)
1331 } else 1314 } else
1332 printk("\n"); 1315 printk("\n");
1333 1316
1334 for_each_cpu(cpu) { 1317 for_each_online_cpu(cpu) {
1335 struct per_cpu_pageset *pageset; 1318 struct per_cpu_pageset *pageset;
1336 1319
1337 pageset = zone_pcp(zone, cpu); 1320 pageset = zone_pcp(zone, cpu);
@@ -1442,6 +1425,10 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
1442 zone = pgdat->node_zones + ZONE_NORMAL; 1425 zone = pgdat->node_zones + ZONE_NORMAL;
1443 if (zone->present_pages) 1426 if (zone->present_pages)
1444 zonelist->zones[j++] = zone; 1427 zonelist->zones[j++] = zone;
1428 case ZONE_DMA32:
1429 zone = pgdat->node_zones + ZONE_DMA32;
1430 if (zone->present_pages)
1431 zonelist->zones[j++] = zone;
1445 case ZONE_DMA: 1432 case ZONE_DMA:
1446 zone = pgdat->node_zones + ZONE_DMA; 1433 zone = pgdat->node_zones + ZONE_DMA;
1447 if (zone->present_pages) 1434 if (zone->present_pages)
@@ -1456,6 +1443,8 @@ static inline int highest_zone(int zone_bits)
1456 int res = ZONE_NORMAL; 1443 int res = ZONE_NORMAL;
1457 if (zone_bits & (__force int)__GFP_HIGHMEM) 1444 if (zone_bits & (__force int)__GFP_HIGHMEM)
1458 res = ZONE_HIGHMEM; 1445 res = ZONE_HIGHMEM;
1446 if (zone_bits & (__force int)__GFP_DMA32)
1447 res = ZONE_DMA32;
1459 if (zone_bits & (__force int)__GFP_DMA) 1448 if (zone_bits & (__force int)__GFP_DMA)
1460 res = ZONE_DMA; 1449 res = ZONE_DMA;
1461 return res; 1450 return res;
@@ -1867,11 +1856,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1867 if (process_zones(cpu)) 1856 if (process_zones(cpu))
1868 ret = NOTIFY_BAD; 1857 ret = NOTIFY_BAD;
1869 break; 1858 break;
1870#ifdef CONFIG_HOTPLUG_CPU 1859 case CPU_UP_CANCELED:
1871 case CPU_DEAD: 1860 case CPU_DEAD:
1872 free_zone_pagesets(cpu); 1861 free_zone_pagesets(cpu);
1873 break; 1862 break;
1874#endif
1875 default: 1863 default:
1876 break; 1864 break;
1877 } 1865 }
@@ -1976,7 +1964,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1976 if (zholes_size) 1964 if (zholes_size)
1977 realsize -= zholes_size[j]; 1965 realsize -= zholes_size[j];
1978 1966
1979 if (j == ZONE_DMA || j == ZONE_NORMAL) 1967 if (j < ZONE_HIGHMEM)
1980 nr_kernel_pages += realsize; 1968 nr_kernel_pages += realsize;
1981 nr_all_pages += realsize; 1969 nr_all_pages += realsize;
1982 1970
@@ -2418,13 +2406,18 @@ void setup_per_zone_pages_min(void)
2418 } 2406 }
2419 2407
2420 for_each_zone(zone) { 2408 for_each_zone(zone) {
2409 unsigned long tmp;
2421 spin_lock_irqsave(&zone->lru_lock, flags); 2410 spin_lock_irqsave(&zone->lru_lock, flags);
2411 tmp = (pages_min * zone->present_pages) / lowmem_pages;
2422 if (is_highmem(zone)) { 2412 if (is_highmem(zone)) {
2423 /* 2413 /*
2424 * Often, highmem doesn't need to reserve any pages. 2414 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
2425 * But the pages_min/low/high values are also used for 2415 * need highmem pages, so cap pages_min to a small
2426 * batching up page reclaim activity so we need a 2416 * value here.
2427 * decent value here. 2417 *
2418 * The (pages_high-pages_low) and (pages_low-pages_min)
2419 * deltas controls asynch page reclaim, and so should
2420 * not be capped for highmem.
2428 */ 2421 */
2429 int min_pages; 2422 int min_pages;
2430 2423
@@ -2435,19 +2428,15 @@ void setup_per_zone_pages_min(void)
2435 min_pages = 128; 2428 min_pages = 128;
2436 zone->pages_min = min_pages; 2429 zone->pages_min = min_pages;
2437 } else { 2430 } else {
2438 /* if it's a lowmem zone, reserve a number of pages 2431 /*
2432 * If it's a lowmem zone, reserve a number of pages
2439 * proportionate to the zone's size. 2433 * proportionate to the zone's size.
2440 */ 2434 */
2441 zone->pages_min = (pages_min * zone->present_pages) / 2435 zone->pages_min = tmp;
2442 lowmem_pages;
2443 } 2436 }
2444 2437
2445 /* 2438 zone->pages_low = zone->pages_min + tmp / 4;
2446 * When interpreting these watermarks, just keep in mind that: 2439 zone->pages_high = zone->pages_min + tmp / 2;
2447 * zone->pages_min == (zone->pages_min * 4) / 4;
2448 */
2449 zone->pages_low = (zone->pages_min * 5) / 4;
2450 zone->pages_high = (zone->pages_min * 6) / 4;
2451 spin_unlock_irqrestore(&zone->lru_lock, flags); 2440 spin_unlock_irqrestore(&zone->lru_lock, flags);
2452 } 2441 }
2453} 2442}
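After this hunk, each zone's proportional share of pages_min is computed once as tmp = pages_min * present_pages / lowmem_pages; lowmem zones use tmp directly as pages_min, highmem caps pages_min to a small value, and both derive pages_low = pages_min + tmp/4 and pages_high = pages_min + tmp/2, so the reclaim deltas stay proportional even where pages_min is capped. A short worked example with invented zone sizes; the highmem clamp bounds mirror the real code only loosely.

/*
 * Worked example of the new watermark derivation in
 * setup_per_zone_pages_min().  Zone sizes and the 32..128 clamp are
 * illustrative.
 */
#include <stdio.h>

struct zone_sketch {
    const char   *name;
    int           highmem;
    unsigned long present_pages;
    unsigned long pages_min, pages_low, pages_high;
};

int main(void)
{
    struct zone_sketch zones[] = {
        { "Normal",  0, 200000 },
        { "HighMem", 1,  56000 },
    };
    unsigned long pages_min = 256;        /* min_free_kbytes=1024, 4K pages */
    unsigned long lowmem_pages = 200000;  /* highmem not counted            */

    for (int i = 0; i < 2; i++) {
        struct zone_sketch *z = &zones[i];
        unsigned long tmp = pages_min * z->present_pages / lowmem_pages;

        if (z->highmem) {
            unsigned long min = z->present_pages / 1024;
            if (min < 32)  min = 32;      /* SWAP_CLUSTER_MAX stand-in */
            if (min > 128) min = 128;
            z->pages_min = min;           /* capped: highmem rarely needs reserves */
        } else {
            z->pages_min = tmp;           /* proportional to zone size */
        }
        z->pages_low  = z->pages_min + tmp / 4;
        z->pages_high = z->pages_min + tmp / 2;

        printf("%-7s min=%lu low=%lu high=%lu\n", z->name,
               z->pages_min, z->pages_low, z->pages_high);
    }
    return 0;
}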
diff --git a/mm/readahead.c b/mm/readahead.c
index d0b50034e245..72e7adbb87c7 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -254,7 +254,7 @@ out:
254 */ 254 */
255static int 255static int
256__do_page_cache_readahead(struct address_space *mapping, struct file *filp, 256__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
257 unsigned long offset, unsigned long nr_to_read) 257 pgoff_t offset, unsigned long nr_to_read)
258{ 258{
259 struct inode *inode = mapping->host; 259 struct inode *inode = mapping->host;
260 struct page *page; 260 struct page *page;
@@ -274,7 +274,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
274 */ 274 */
275 read_lock_irq(&mapping->tree_lock); 275 read_lock_irq(&mapping->tree_lock);
276 for (page_idx = 0; page_idx < nr_to_read; page_idx++) { 276 for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
277 unsigned long page_offset = offset + page_idx; 277 pgoff_t page_offset = offset + page_idx;
278 278
279 if (page_offset > end_index) 279 if (page_offset > end_index)
280 break; 280 break;
@@ -311,7 +311,7 @@ out:
311 * memory at once. 311 * memory at once.
312 */ 312 */
313int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 313int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
314 unsigned long offset, unsigned long nr_to_read) 314 pgoff_t offset, unsigned long nr_to_read)
315{ 315{
316 int ret = 0; 316 int ret = 0;
317 317
@@ -368,7 +368,7 @@ static inline int check_ra_success(struct file_ra_state *ra,
368 * request queues. 368 * request queues.
369 */ 369 */
370int do_page_cache_readahead(struct address_space *mapping, struct file *filp, 370int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
371 unsigned long offset, unsigned long nr_to_read) 371 pgoff_t offset, unsigned long nr_to_read)
372{ 372{
373 if (bdi_read_congested(mapping->backing_dev_info)) 373 if (bdi_read_congested(mapping->backing_dev_info))
374 return -1; 374 return -1;
@@ -385,7 +385,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
385 */ 385 */
386static int 386static int
387blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, 387blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
388 unsigned long offset, unsigned long nr_to_read, 388 pgoff_t offset, unsigned long nr_to_read,
389 struct file_ra_state *ra, int block) 389 struct file_ra_state *ra, int block)
390{ 390{
391 int actual; 391 int actual;
@@ -430,14 +430,27 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
430 return ret; 430 return ret;
431} 431}
432 432
433/* 433/**
434 * page_cache_readahead is the main function. If performs the adaptive 434 * page_cache_readahead - generic adaptive readahead
435 * @mapping: address_space which holds the pagecache and I/O vectors
436 * @ra: file_ra_state which holds the readahead state
437 * @filp: passed on to ->readpage() and ->readpages()
438 * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
439 * @req_size: hint: total size of the read which the caller is performing in
440 * PAGE_CACHE_SIZE units
441 *
442 * page_cache_readahead() is the main function. If performs the adaptive
435 * readahead window size management and submits the readahead I/O. 443 * readahead window size management and submits the readahead I/O.
444 *
445 * Note that @filp is purely used for passing on to the ->readpage[s]()
446 * handler: it may refer to a different file from @mapping (so we may not use
447 * @filp->f_mapping or @filp->f_dentry->d_inode here).
448 * Also, @ra may not be equal to &@filp->f_ra.
449 *
436 */ 450 */
437unsigned long 451unsigned long
438page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, 452page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
439 struct file *filp, unsigned long offset, 453 struct file *filp, pgoff_t offset, unsigned long req_size)
440 unsigned long req_size)
441{ 454{
442 unsigned long max, newsize; 455 unsigned long max, newsize;
443 int sequential; 456 int sequential;
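The readahead changes switch these offsets to pgoff_t, making explicit that they are page-cache indices in PAGE_CACHE_SIZE units rather than byte offsets, as the new kerneldoc spells out. A tiny illustration of that convention and of the end-of-file clamp visible in __do_page_cache_readahead() above; the shift value and file size are invented and the clamp is mirrored only loosely.

/*
 * Illustration: page-cache indices are file positions shifted down by
 * PAGE_CACHE_SHIFT, and a readahead window stops at the index of the
 * last page backed by i_size.
 */
#include <stdio.h>

typedef unsigned long pgoff_t;

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
    unsigned long long isize = 10 * PAGE_CACHE_SIZE + 100;  /* ~10.02 pages */
    pgoff_t end_index = (isize - 1) >> PAGE_CACHE_SHIFT;    /* last valid index */
    pgoff_t offset = 8;                                     /* caller's start page */
    unsigned long nr_to_read = 16;

    for (unsigned long i = 0; i < nr_to_read; i++) {
        pgoff_t page_offset = offset + i;
        if (page_offset > end_index)
            break;                                          /* don't read past EOF */
        printf("would read page index %lu\n", page_offset);
    }
    return 0;
}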
diff --git a/mm/slab.c b/mm/slab.c
index 22bfb0b2ac8b..e5ec26e0c460 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -368,7 +368,7 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
368 * manages a cache. 368 * manages a cache.
369 */ 369 */
370 370
371struct kmem_cache_s { 371struct kmem_cache {
372/* 1) per-cpu data, touched during every alloc/free */ 372/* 1) per-cpu data, touched during every alloc/free */
373 struct array_cache *array[NR_CPUS]; 373 struct array_cache *array[NR_CPUS];
374 unsigned int batchcount; 374 unsigned int batchcount;
@@ -434,7 +434,7 @@ struct kmem_cache_s {
434/* Optimization question: fewer reaps means less 434/* Optimization question: fewer reaps means less
435 * probability for unnessary cpucache drain/refill cycles. 435 * probability for unnessary cpucache drain/refill cycles.
436 * 436 *
437 * OTHO the cpuarrays can contain lots of objects, 437 * OTOH the cpuarrays can contain lots of objects,
438 * which could lock up otherwise freeable slabs. 438 * which could lock up otherwise freeable slabs.
439 */ 439 */
440#define REAPTIMEOUT_CPUC (2*HZ) 440#define REAPTIMEOUT_CPUC (2*HZ)
@@ -565,14 +565,29 @@ static void **dbg_userword(kmem_cache_t *cachep, void *objp)
565#define BREAK_GFP_ORDER_LO 0 565#define BREAK_GFP_ORDER_LO 0
566static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 566static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
567 567
568/* Macros for storing/retrieving the cachep and or slab from the 568/* Functions for storing/retrieving the cachep and or slab from the
569 * global 'mem_map'. These are used to find the slab an obj belongs to. 569 * global 'mem_map'. These are used to find the slab an obj belongs to.
570 * With kfree(), these are used to find the cache which an obj belongs to. 570 * With kfree(), these are used to find the cache which an obj belongs to.
571 */ 571 */
572#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) 572static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
573#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) 573{
574#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) 574 page->lru.next = (struct list_head *)cache;
575#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) 575}
576
577static inline struct kmem_cache *page_get_cache(struct page *page)
578{
579 return (struct kmem_cache *)page->lru.next;
580}
581
582static inline void page_set_slab(struct page *page, struct slab *slab)
583{
584 page->lru.prev = (struct list_head *)slab;
585}
586
587static inline struct slab *page_get_slab(struct page *page)
588{
589 return (struct slab *)page->lru.prev;
590}
576 591
577/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 592/* These are the default caches for kmalloc. Custom caches can have other sizes. */
578struct cache_sizes malloc_sizes[] = { 593struct cache_sizes malloc_sizes[] = {
@@ -1190,11 +1205,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1190 int i; 1205 int i;
1191 1206
1192 flags |= cachep->gfpflags; 1207 flags |= cachep->gfpflags;
1193 if (likely(nodeid == -1)) { 1208 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1194 page = alloc_pages(flags, cachep->gfporder);
1195 } else {
1196 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1197 }
1198 if (!page) 1209 if (!page)
1199 return NULL; 1210 return NULL;
1200 addr = page_address(page); 1211 addr = page_address(page);
@@ -1368,7 +1379,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1368 /* Print some data about the neighboring objects, if they 1379 /* Print some data about the neighboring objects, if they
1369 * exist: 1380 * exist:
1370 */ 1381 */
1371 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); 1382 struct slab *slabp = page_get_slab(virt_to_page(objp));
1372 int objnr; 1383 int objnr;
1373 1384
1374 objnr = (objp-slabp->s_mem)/cachep->objsize; 1385 objnr = (objp-slabp->s_mem)/cachep->objsize;
@@ -1502,6 +1513,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1502{ 1513{
1503 size_t left_over, slab_size, ralign; 1514 size_t left_over, slab_size, ralign;
1504 kmem_cache_t *cachep = NULL; 1515 kmem_cache_t *cachep = NULL;
1516 struct list_head *p;
1505 1517
1506 /* 1518 /*
1507 * Sanity checks... these are all serious usage bugs. 1519 * Sanity checks... these are all serious usage bugs.
@@ -1516,6 +1528,35 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1516 BUG(); 1528 BUG();
1517 } 1529 }
1518 1530
1531 down(&cache_chain_sem);
1532
1533 list_for_each(p, &cache_chain) {
1534 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1535 mm_segment_t old_fs = get_fs();
1536 char tmp;
1537 int res;
1538
1539 /*
1540 * This happens when the module gets unloaded and doesn't
1541 * destroy its slab cache and no-one else reuses the vmalloc
1542 * area of the module. Print a warning.
1543 */
1544 set_fs(KERNEL_DS);
1545 res = __get_user(tmp, pc->name);
1546 set_fs(old_fs);
1547 if (res) {
1548 printk("SLAB: cache with size %d has lost its name\n",
1549 pc->objsize);
1550 continue;
1551 }
1552
1553 if (!strcmp(pc->name,name)) {
1554 printk("kmem_cache_create: duplicate cache %s\n", name);
1555 dump_stack();
1556 goto oops;
1557 }
1558 }
1559
1519#if DEBUG 1560#if DEBUG
1520 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 1561 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
1521 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1562 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
@@ -1592,7 +1633,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1592 /* Get cache's description obj. */ 1633 /* Get cache's description obj. */
1593 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 1634 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1594 if (!cachep) 1635 if (!cachep)
1595 goto opps; 1636 goto oops;
1596 memset(cachep, 0, sizeof(kmem_cache_t)); 1637 memset(cachep, 0, sizeof(kmem_cache_t));
1597 1638
1598#if DEBUG 1639#if DEBUG
@@ -1686,7 +1727,7 @@ next:
1686 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1727 printk("kmem_cache_create: couldn't create cache %s.\n", name);
1687 kmem_cache_free(&cache_cache, cachep); 1728 kmem_cache_free(&cache_cache, cachep);
1688 cachep = NULL; 1729 cachep = NULL;
1689 goto opps; 1730 goto oops;
1690 } 1731 }
1691 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1732 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
1692 + sizeof(struct slab), align); 1733 + sizeof(struct slab), align);
@@ -1781,43 +1822,14 @@ next:
1781 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1822 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1782 } 1823 }
1783 1824
1784 /* Need the semaphore to access the chain. */
1785 down(&cache_chain_sem);
1786 {
1787 struct list_head *p;
1788 mm_segment_t old_fs;
1789
1790 old_fs = get_fs();
1791 set_fs(KERNEL_DS);
1792 list_for_each(p, &cache_chain) {
1793 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1794 char tmp;
1795 /* This happens when the module gets unloaded and doesn't
1796 destroy its slab cache and noone else reuses the vmalloc
1797 area of the module. Print a warning. */
1798 if (__get_user(tmp,pc->name)) {
1799 printk("SLAB: cache with size %d has lost its name\n",
1800 pc->objsize);
1801 continue;
1802 }
1803 if (!strcmp(pc->name,name)) {
1804 printk("kmem_cache_create: duplicate cache %s\n",name);
1805 up(&cache_chain_sem);
1806 unlock_cpu_hotplug();
1807 BUG();
1808 }
1809 }
1810 set_fs(old_fs);
1811 }
1812
1813 /* cache setup completed, link it into the list */ 1825 /* cache setup completed, link it into the list */
1814 list_add(&cachep->next, &cache_chain); 1826 list_add(&cachep->next, &cache_chain);
1815 up(&cache_chain_sem);
1816 unlock_cpu_hotplug(); 1827 unlock_cpu_hotplug();
1817opps: 1828oops:
1818 if (!cachep && (flags & SLAB_PANIC)) 1829 if (!cachep && (flags & SLAB_PANIC))
1819 panic("kmem_cache_create(): failed to create slab `%s'\n", 1830 panic("kmem_cache_create(): failed to create slab `%s'\n",
1820 name); 1831 name);
1832 up(&cache_chain_sem);
1821 return cachep; 1833 return cachep;
1822} 1834}
1823EXPORT_SYMBOL(kmem_cache_create); 1835EXPORT_SYMBOL(kmem_cache_create);
@@ -2137,8 +2149,8 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2137 i = 1 << cachep->gfporder; 2149 i = 1 << cachep->gfporder;
2138 page = virt_to_page(objp); 2150 page = virt_to_page(objp);
2139 do { 2151 do {
2140 SET_PAGE_CACHE(page, cachep); 2152 page_set_cache(page, cachep);
2141 SET_PAGE_SLAB(page, slabp); 2153 page_set_slab(page, slabp);
2142 page++; 2154 page++;
2143 } while (--i); 2155 } while (--i);
2144} 2156}
@@ -2268,14 +2280,14 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2268 kfree_debugcheck(objp); 2280 kfree_debugcheck(objp);
2269 page = virt_to_page(objp); 2281 page = virt_to_page(objp);
2270 2282
2271 if (GET_PAGE_CACHE(page) != cachep) { 2283 if (page_get_cache(page) != cachep) {
2272 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2284 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2273 GET_PAGE_CACHE(page),cachep); 2285 page_get_cache(page),cachep);
2274 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2286 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2275 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 2287 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name);
2276 WARN_ON(1); 2288 WARN_ON(1);
2277 } 2289 }
2278 slabp = GET_PAGE_SLAB(page); 2290 slabp = page_get_slab(page);
2279 2291
2280 if (cachep->flags & SLAB_RED_ZONE) { 2292 if (cachep->flags & SLAB_RED_ZONE) {
2281 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2293 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
@@ -2627,7 +2639,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2627 struct slab *slabp; 2639 struct slab *slabp;
2628 unsigned int objnr; 2640 unsigned int objnr;
2629 2641
2630 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2642 slabp = page_get_slab(virt_to_page(objp));
2631 l3 = cachep->nodelists[node]; 2643 l3 = cachep->nodelists[node];
2632 list_del(&slabp->list); 2644 list_del(&slabp->list);
2633 objnr = (objp - slabp->s_mem) / cachep->objsize; 2645 objnr = (objp - slabp->s_mem) / cachep->objsize;
@@ -2743,7 +2755,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2743#ifdef CONFIG_NUMA 2755#ifdef CONFIG_NUMA
2744 { 2756 {
2745 struct slab *slabp; 2757 struct slab *slabp;
2746 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2758 slabp = page_get_slab(virt_to_page(objp));
2747 if (unlikely(slabp->nodeid != numa_node_id())) { 2759 if (unlikely(slabp->nodeid != numa_node_id())) {
2748 struct array_cache *alien = NULL; 2760 struct array_cache *alien = NULL;
2749 int nodeid = slabp->nodeid; 2761 int nodeid = slabp->nodeid;
@@ -2829,7 +2841,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2829 page = virt_to_page(ptr); 2841 page = virt_to_page(ptr);
2830 if (unlikely(!PageSlab(page))) 2842 if (unlikely(!PageSlab(page)))
2831 goto out; 2843 goto out;
2832 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2844 if (unlikely(page_get_cache(page) != cachep))
2833 goto out; 2845 goto out;
2834 return 1; 2846 return 1;
2835out: 2847out:
@@ -3025,7 +3037,7 @@ void kfree(const void *objp)
3025 return; 3037 return;
3026 local_irq_save(flags); 3038 local_irq_save(flags);
3027 kfree_debugcheck(objp); 3039 kfree_debugcheck(objp);
3028 c = GET_PAGE_CACHE(virt_to_page(objp)); 3040 c = page_get_cache(virt_to_page(objp));
3029 __cache_free(c, (void*)objp); 3041 __cache_free(c, (void*)objp);
3030 local_irq_restore(flags); 3042 local_irq_restore(flags);
3031} 3043}
@@ -3262,6 +3274,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
3262 3274
3263/** 3275/**
3264 * cache_reap - Reclaim memory from caches. 3276 * cache_reap - Reclaim memory from caches.
3277 * @unused: unused parameter
3265 * 3278 *
3266 * Called from workqueue/eventd every few seconds. 3279 * Called from workqueue/eventd every few seconds.
3267 * Purpose: 3280 * Purpose:
@@ -3278,7 +3291,7 @@ static void cache_reap(void *unused)
3278 3291
3279 if (down_trylock(&cache_chain_sem)) { 3292 if (down_trylock(&cache_chain_sem)) {
3280 /* Give up. Setup the next iteration. */ 3293 /* Give up. Setup the next iteration. */
3281 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 3294 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3282 return; 3295 return;
3283 } 3296 }
3284 3297
@@ -3347,7 +3360,7 @@ next:
3347 up(&cache_chain_sem); 3360 up(&cache_chain_sem);
3348 drain_remote_pages(); 3361 drain_remote_pages();
3349 /* Setup the next iteration */ 3362 /* Setup the next iteration */
3350 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 3363 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3351} 3364}
3352 3365
3353#ifdef CONFIG_PROC_FS 3366#ifdef CONFIG_PROC_FS
@@ -3594,7 +3607,7 @@ unsigned int ksize(const void *objp)
3594 if (unlikely(objp == NULL)) 3607 if (unlikely(objp == NULL))
3595 return 0; 3608 return 0;
3596 3609
3597 return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); 3610 return obj_reallen(page_get_cache(virt_to_page(objp)));
3598} 3611}
3599 3612
3600 3613
diff --git a/mm/swap.c b/mm/swap.c
index 154ae13d8b7e..d09cf7f03e76 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -413,7 +413,6 @@ void vm_acct_memory(long pages)
413 } 413 }
414 preempt_enable(); 414 preempt_enable();
415} 415}
416EXPORT_SYMBOL(vm_acct_memory);
417 416
418#ifdef CONFIG_HOTPLUG_CPU 417#ifdef CONFIG_HOTPLUG_CPU
419static void lru_drain_cache(unsigned int cpu) 418static void lru_drain_cache(unsigned int cpu)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index dfd9a46755b8..0df9a57b1de8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -40,7 +40,6 @@ struct address_space swapper_space = {
40 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 40 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
41 .backing_dev_info = &swap_backing_dev_info, 41 .backing_dev_info = &swap_backing_dev_info,
42}; 42};
43EXPORT_SYMBOL(swapper_space);
44 43
45#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 44#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
46 45
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8970c0b74194..edafeace301f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -36,8 +36,6 @@ unsigned int nr_swapfiles;
36long total_swap_pages; 36long total_swap_pages;
37static int swap_overflow; 37static int swap_overflow;
38 38
39EXPORT_SYMBOL(total_swap_pages);
40
41static const char Bad_file[] = "Bad swap file entry "; 39static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 40static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 41static const char Bad_offset[] = "Bad swap offset entry ";
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 54a90e83cb31..729eb3eec75f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -457,7 +457,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
457 * @size: allocation size 457 * @size: allocation size
458 * @gfp_mask: flags for the page level allocator 458 * @gfp_mask: flags for the page level allocator
459 * @prot: protection mask for the allocated pages 459 * @prot: protection mask for the allocated pages
460 * @node node to use for allocation or -1 460 * @node: node to use for allocation or -1
461 * 461 *
462 * Allocate enough pages to cover @size from the page level 462 * Allocate enough pages to cover @size from the page level
463 * allocator with @gfp_mask flags. Map them into contiguous 463 * allocator with @gfp_mask flags. Map them into contiguous
@@ -507,7 +507,7 @@ EXPORT_SYMBOL(vmalloc);
507 * vmalloc_node - allocate memory on a specific node 507 * vmalloc_node - allocate memory on a specific node
508 * 508 *
509 * @size: allocation size 509 * @size: allocation size
510 * @node; numa node 510 * @node: numa node
511 * 511 *
512 * Allocate enough pages to cover @size from the page level 512 * Allocate enough pages to cover @size from the page level
513 * allocator and map them into contiguous kernel virtual space. 513 * allocator and map them into contiguous kernel virtual space.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 135bf8ca96ee..28130541270f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1074,7 +1074,7 @@ loop_again:
1074 continue; 1074 continue;
1075 1075
1076 if (!zone_watermark_ok(zone, order, 1076 if (!zone_watermark_ok(zone, order,
1077 zone->pages_high, 0, 0, 0)) { 1077 zone->pages_high, 0, 0)) {
1078 end_zone = i; 1078 end_zone = i;
1079 goto scan; 1079 goto scan;
1080 } 1080 }
@@ -1111,7 +1111,7 @@ scan:
1111 1111
1112 if (nr_pages == 0) { /* Not software suspend */ 1112 if (nr_pages == 0) { /* Not software suspend */
1113 if (!zone_watermark_ok(zone, order, 1113 if (!zone_watermark_ok(zone, order,
1114 zone->pages_high, end_zone, 0, 0)) 1114 zone->pages_high, end_zone, 0))
1115 all_zones_ok = 0; 1115 all_zones_ok = 0;
1116 } 1116 }
1117 zone->temp_priority = priority; 1117 zone->temp_priority = priority;
@@ -1259,7 +1259,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1259 return; 1259 return;
1260 1260
1261 pgdat = zone->zone_pgdat; 1261 pgdat = zone->zone_pgdat;
1262 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) 1262 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1263 return; 1263 return;
1264 if (pgdat->kswapd_max_order < order) 1264 if (pgdat->kswapd_max_order < order)
1265 pgdat->kswapd_max_order = order; 1265 pgdat->kswapd_max_order = order;