Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap_xip.c     |   2
-rw-r--r--  mm/fremap.c          |   2
-rw-r--r--  mm/hugetlb.c         |   8
-rw-r--r--  mm/memory.c          |  14
-rw-r--r--  mm/mincore.c         | 183
-rw-r--r--  mm/oom_kill.c        |   9
-rw-r--r--  mm/page-writeback.c  |  77
-rw-r--r--  mm/page_alloc.c      |   2
-rw-r--r--  mm/rmap.c            |  36
-rw-r--r--  mm/shmem.c           |   7
-rw-r--r--  mm/slab.c            |  29
-rw-r--r--  mm/slob.c            |  27
-rw-r--r--  mm/truncate.c        |  31
-rw-r--r--  mm/vmscan.c          |  14
14 files changed, 241 insertions(+), 200 deletions(-)
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8d667617f558..45b3553865cf 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -189,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		pteval = ptep_clear_flush(vma, address, pte);
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		dec_mm_counter(mm, file_rss);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index b77a002c3352..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			page_cache_release(page);
 		}
 	} else {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ccc7f230252..cb362f761f17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr)
 }
 
 static void copy_huge_page(struct page *dst, struct page *src,
-			   unsigned long addr)
+			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 
 	might_sleep();
 	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
 		cond_resched();
-		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
@@ -442,7 +442,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	copy_huge_page(new_page, old_page, address);
+	copy_huge_page(new_page, old_page, address, vma);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
diff --git a/mm/memory.c b/mm/memory.c
index bf6100236e62..563792f4f687 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -1441,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
 	 * If the source page was a PFN mapping, we don't have
@@ -1464,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
 		return;
 
 	}
-	copy_user_highpage(dst, src, va);
+	copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1577,7 +1577,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, address);
+		cow_user_page(new_page, old_page, address, vma);
 	}
 
 	/*
@@ -1586,7 +1586,7 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page);
+			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -2200,7 +2200,7 @@ retry:
 			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 			if (!page)
 				goto oom;
-			copy_user_highpage(page, new_page, address);
+			copy_user_highpage(page, new_page, address, vma);
 			page_cache_release(new_page);
 			new_page = page;
 			anon = 1;
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
 /*
  *	linux/mm/mincore.c
  *
- * Copyright (C) 1994-1999 Linus Torvalds
+ * Copyright (C) 1994-2006 Linus Torvalds
  */
 
 /*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
 	return present;
 }
 
-static long mincore_vma(struct vm_area_struct * vma,
-	unsigned long start, unsigned long end, unsigned char __user * vec)
-{
-	long error, i, remaining;
-	unsigned char * tmp;
-
-	error = -ENOMEM;
-	if (!vma->vm_file)
-		return error;
-
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
-	error = -EAGAIN;
-	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
-	if (!tmp)
-		return error;
-
-	/* (end - start) is # of pages, and also # of bytes in "vec */
-	remaining = (end - start),
-
-	error = 0;
-	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
-		int j = 0;
-		long thispiece = (remaining < PAGE_SIZE) ?
-			remaining : PAGE_SIZE;
-
-		while (j < thispiece)
-			tmp[j++] = mincore_page(vma, start++);
-
-		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
-			error = -EFAULT;
-			break;
-		}
-	}
-
-	free_page((unsigned long) tmp);
-	return error;
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
+{
+	unsigned long i, nr, pgoff;
+	struct vm_area_struct *vma = find_vma(current->mm, addr);
+
+	/*
+	 * find_vma() didn't find anything above us, or we're
+	 * in an unmapped hole in the address space: ENOMEM.
+	 */
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
+
+	/*
+	 * Ok, got it. But check whether it's a segment we support
+	 * mincore() on. Right now, we don't do any anonymous mappings.
+	 *
+	 * FIXME: This is just stupid. And returning ENOMEM is
+	 * stupid too. We should just look at the page tables. But
+	 * this is what we've traditionally done, so we'll just
+	 * continue doing it.
+	 */
+	if (!vma->vm_file)
+		return -ENOMEM;
+
+	/*
+	 * Calculate how many pages there are left in the vma, and
+	 * what the pgoff is for our address.
+	 */
+	nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+	if (nr > pages)
+		nr = pages;
+
+	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgoff += vma->vm_pgoff;
+
+	/* And then we just fill the sucker in.. */
+	for (i = 0 ; i < nr; i++, pgoff++)
+		vec[i] = mincore_page(vma, pgoff);
+
+	return nr;
 }
 
 /*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 	unsigned char __user * vec)
 {
-	int index = 0;
-	unsigned long end, limit;
-	struct vm_area_struct * vma;
-	size_t max;
-	int unmapped_error = 0;
-	long error;
-
-	/* check the arguments */
-	if (start & ~PAGE_CACHE_MASK)
-		goto einval;
-
-	limit = TASK_SIZE;
-	if (start >= limit)
-		goto enomem;
-
-	if (!len)
-		return 0;
-
-	max = limit - start;
-	len = PAGE_CACHE_ALIGN(len);
-	if (len > max || !len)
-		goto enomem;
-
-	end = start + len;
-
-	/* check the output buffer whilst holding the lock */
-	error = -EFAULT;
-	down_read(&current->mm->mmap_sem);
-
-	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
-		goto out;
-
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 */
-	error = 0;
-
-	vma = find_vma(current->mm, start);
-	while (vma) {
-		/* Here start < vma->vm_end. */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-		}
-
-		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = mincore_vma(vma, start, end,
-							&vec[index]);
-				if (error)
-					goto out;
-			}
-			error = unmapped_error;
-			goto out;
-		}
-
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
-		if (error)
-			goto out;
-		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
-		start = vma->vm_end;
-		vma = vma->vm_next;
-	}
-
-	/* we found a hole in the area queried if we arrive here */
-	error = -ENOMEM;
-
-out:
-	up_read(&current->mm->mmap_sem);
-	return error;
-
-einval:
-	return -EINVAL;
-enomem:
-	return -ENOMEM;
+	long retval;
+	unsigned long pages;
+	unsigned char *tmp;
+
+	/* Check the start address: needs to be page-aligned.. */
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
+
+	/* ..and we need to be passed a valid user-space range */
+	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+		return -ENOMEM;
+
+	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
+	pages = len >> PAGE_SHIFT;
+	pages += (len & ~PAGE_MASK) != 0;
+
+	if (!access_ok(VERIFY_WRITE, vec, pages))
+		return -EFAULT;
+
+	tmp = (void *) __get_free_page(GFP_USER);
+	if (!tmp)
+		return -EAGAIN;
+
+	retval = 0;
+	while (pages) {
+		/*
+		 * Do at most PAGE_SIZE entries per iteration, due to
+		 * the temporary buffer size.
+		 */
+		down_read(&current->mm->mmap_sem);
+		retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+		up_read(&current->mm->mmap_sem);
+
+		if (retval <= 0)
+			break;
+		if (copy_to_user(vec, tmp, retval)) {
+			retval = -EFAULT;
+			break;
+		}
+		pages -= retval;
+		vec += retval;
+		start += retval << PAGE_SHIFT;
+		retval = 0;
+	}
+	free_page((unsigned long) tmp);
+	return retval;
 }
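
For reference, a minimal user-space caller of mincore(2) that matches what the rewritten sys_mincore() expects (a page-aligned start address and one vec byte per page of the range, rounded up) could look like the sketch below. It is illustrative only and not part of the patch; the file name and the bare error handling are arbitrary choices.

#define _DEFAULT_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/etc/hosts", O_RDONLY);	/* any regular file works */
	struct stat st;
	unsigned char *vec;
	void *map;
	size_t pages, i, resident = 0;

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	/* mmap() hands back a page-aligned address, as sys_mincore() requires */
	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	/* one vec byte per page, rounding up: the same arithmetic sys_mincore() does */
	pages = ((size_t)st.st_size + page - 1) / page;
	vec = malloc(pages);
	if (!vec)
		return 1;

	if (mincore(map, st.st_size, vec) == 0) {
		for (i = 0; i < pages; i++)
			resident += vec[i] & 1;
		printf("%zu of %zu pages resident\n", resident, pages);
	}

	free(vec);
	munmap(map, st.st_size);
	close(fd);
	return 0;
}
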
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 223d9ccb7d64..6969cfb33901 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes = node_online_map;
+	nodemask_t nodes;
+	int node;
+	/* node has memory ? */
+	for_each_online_node(node)
+		if (NODE_DATA(node)->node_present_pages)
+			node_set(node, nodes);
 
 	for (z = zonelist->zones; *z; z++)
-		if (cpuset_zone_allowed(*z, gfp_mask))
+		if (cpuset_zone_allowed_softwall(*z, gfp_mask))
 			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 237107c1b084..1d2fc89ca56d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -845,38 +845,6 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	unsigned long flags;
-
-	if (!mapping)
-		return TestClearPageDirty(page);
-
-	write_lock_irqsave(&mapping->tree_lock, flags);
-	if (TestClearPageDirty(page)) {
-		radix_tree_tag_clear(&mapping->page_tree,
-				page_index(page), PAGECACHE_TAG_DIRTY);
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-		/*
-		 * We can continue to use `mapping' here because the
-		 * page is locked, which pins the address_space
-		 */
-		if (mapping_cap_account_dirty(mapping)) {
-			page_mkclean(page);
-			dec_zone_page_state(page, NR_FILE_DIRTY);
-		}
-		return 1;
-	}
-	write_unlock_irqrestore(&mapping->tree_lock, flags);
-	return 0;
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
  *
@@ -894,17 +862,46 @@ int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
-	if (!mapping)
-		return TestClearPageDirty(page);
-
-	if (TestClearPageDirty(page)) {
-		if (mapping_cap_account_dirty(mapping)) {
-			page_mkclean(page);
-			dec_zone_page_state(page, NR_FILE_DIRTY);
-		}
-		return 1;
-	}
-	return 0;
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		/*
+		 * Yes, Virginia, this is indeed insane.
+		 *
+		 * We use this sequence to make sure that
+		 *  (a) we account for dirty stats properly
+		 *  (b) we tell the low-level filesystem to
+		 *      mark the whole page dirty if it was
+		 *      dirty in a pagetable. Only to then
+		 *  (c) clean the page again and return 1 to
+		 *      cause the writeback.
+		 *
+		 * This way we avoid all nasty races with the
+		 * dirty bit in multiple places and clearing
+		 * them concurrently from different threads.
+		 *
+		 * Note! Normally the "set_page_dirty(page)"
+		 * has no effect on the actual dirty bit - since
+		 * that will already usually be set. But we
+		 * need the side effects, and it can help us
+		 * avoid races.
+		 *
+		 * We basically use the page "master dirty bit"
+		 * as a serialization point for all the different
+		 * threads doing their things.
+		 *
+		 * FIXME! We still have a race here: if somebody
+		 * adds the page back to the page tables in
+		 * between the "page_mkclean()" and the "TestClearPageDirty()",
+		 * we might have it mapped without the dirty bit set.
+		 */
+		if (page_mkclean(page))
+			set_page_dirty(page);
+		if (TestClearPageDirty(page)) {
+			dec_zone_page_state(page, NR_FILE_DIRTY);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
 }
 EXPORT_SYMBOL(clear_page_dirty_for_io);
 
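
As a rough illustration of the ordering that the comment added above describes, the following stand-alone C11 sketch models the idea of using one "master" dirty flag as the serialization point: externally tracked dirty state is folded into the master flag first (the set_page_dirty() step), and only then is the master flag test-and-cleared to decide whether writeback is needed. The names and the atomics are invented for the example; this is an analogy, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool page_dirty;	/* stand-in for the page's master dirty bit */
static atomic_bool pte_dirty;	/* stand-in for dirty state tracked elsewhere */

/* roughly analogous to page_mkclean(): collect and clear the external dirty state */
static bool collect_external_dirty(void)
{
	return atomic_exchange(&pte_dirty, false);
}

/* roughly analogous to clear_page_dirty_for_io() */
static bool clear_dirty_for_io(void)
{
	/* fold any external dirty state into the master flag first... */
	if (collect_external_dirty())
		atomic_store(&page_dirty, true);	/* the set_page_dirty() step */
	/* ...then test-and-clear the master flag to decide on writeback */
	return atomic_exchange(&page_dirty, false);
}

int main(void)
{
	atomic_store(&pte_dirty, true);		/* the page was dirtied through a mapping */
	printf("writeback needed: %d\n", clear_dirty_for_io());	/* 1 */
	printf("writeback needed: %d\n", clear_dirty_for_io());	/* 0 */
	return 0;
}
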
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e6b17b2989e0..8c1a116875bc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1162,7 +1162,7 @@ zonelist_scan:
 				zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 					break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
diff --git a/mm/rmap.c b/mm/rmap.c
index d8a842a586db..669acb22b572 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/kallsyms.h>
 
 #include <asm/tlbflush.h>
 
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
-	pte_t *pte, entry;
+	pte_t *pte;
 	spinlock_t *ptl;
 	int ret = 0;
 
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 	if (!pte)
 		goto out;
 
-	if (!pte_dirty(*pte) && !pte_write(*pte))
-		goto unlock;
-
-	entry = ptep_get_and_clear(mm, address, pte);
-	entry = pte_mkclean(entry);
-	entry = pte_wrprotect(entry);
-	ptep_establish(vma, address, pte, entry);
-	lazy_mmu_prot_update(entry);
-	ret = 1;
+	if (pte_dirty(*pte) || pte_write(*pte)) {
+		pte_t entry;
+
+		flush_cache_page(vma, address, pte_pfn(*pte));
+		entry = ptep_clear_flush(vma, address, pte);
+		entry = pte_wrprotect(entry);
+		entry = pte_mkclean(entry);
+		set_pte_at(mm, address, pte, entry);
+		lazy_mmu_prot_update(entry);
+		ret = 1;
+	}
 
-unlock:
 	pte_unmap_unlock(pte, ptl);
 out:
 	return ret;
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page)
 		if (mapping)
 			ret = page_mkclean_file(mapping, page);
 	}
+	if (page_test_and_clear_dirty(page))
+		ret = 1;
 
 	return ret;
 }
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page)
  *
  * The caller needs to hold the pte lock.
  */
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		if (unlikely(page_mapcount(page) < 0)) {
 			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
 			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
+			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
+			if (vma->vm_ops)
+				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+			if (vma->vm_file && vma->vm_file->f_op)
+				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 			BUG();
 		}
 
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		dec_mm_counter(mm, file_rss);
 
 
-	page_remove_rmap(page);
+	page_remove_rmap(page, vma);
 	page_cache_release(page);
 
 out_unmap:
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		if (pte_dirty(pteval))
 			set_page_dirty(page);
 
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		page_cache_release(page);
 		dec_mm_counter(mm, file_rss);
 		(*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4bb28d218eb5..70da7a0981bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 		size = SHMEM_NR_DIRECT;
 		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
 	}
-	if (!topdir)
+
+	/*
+	 * If there are no indirect blocks or we are punching a hole
+	 * below indirect blocks, nothing to be done.
+	 */
+	if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
 		goto done2;
 
 	BUG_ON(limit <= SHMEM_NR_DIRECT);
diff --git a/mm/slab.c b/mm/slab.c
index 2c655532f5ef..0d4e57431de4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -109,6 +109,7 @@
 #include <linux/mutex.h>
 #include <linux/fault-inject.h>
 #include <linux/rtmutex.h>
+#include <linux/reciprocal_div.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -386,6 +387,7 @@ struct kmem_cache {
 	unsigned int shared;
 
 	unsigned int buffer_size;
+	u32 reciprocal_buffer_size;
 /* 3) touched by every alloc & free from the backend */
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
 
@@ -627,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 	return slab->s_mem + cache->buffer_size * idx;
 }
 
-static inline unsigned int obj_to_index(struct kmem_cache *cache,
-					struct slab *slab, void *obj)
+/*
+ * We want to avoid an expensive divide : (offset / cache->buffer_size)
+ *   Using the fact that buffer_size is a constant for a particular cache,
+ *   we can replace (offset / cache->buffer_size) by
+ *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
+ */
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct slab *slab, void *obj)
 {
-	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+	u32 offset = (obj - slab->s_mem);
+	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
 /*
@@ -1427,6 +1436,8 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
 					cache_line_size());
+	cache_cache.reciprocal_buffer_size =
+		reciprocal_value(cache_cache.buffer_size);
 
 	for (order = 0; order < MAX_ORDER; order++) {
 		cache_estimate(order, cache_cache.buffer_size,
@@ -2313,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_CACHE_DMA)
 		cachep->gfpflags |= GFP_DMA;
 	cachep->buffer_size = size;
+	cachep->reciprocal_buffer_size = reciprocal_value(size);
 
 	if (flags & CFLGS_OFF_SLAB) {
 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -3252,6 +3264,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	struct zone **z;
 	void *obj = NULL;
 	int nid;
+	gfp_t local_flags = (flags & GFP_LEVEL_MASK);
 
 retry:
 	/*
@@ -3261,7 +3274,7 @@ retry:
 	for (z = zonelist->zones; *z && !obj; z++) {
 		nid = zone_to_nid(*z);
 
-		if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) &&
+		if (cpuset_zone_allowed_hardwall(*z, flags) &&
 			cache->nodelists[nid] &&
 			cache->nodelists[nid]->free_objects)
 				obj = ____cache_alloc_node(cache,
@@ -3275,7 +3288,12 @@ retry:
 	 * We may trigger various forms of reclaim on the allowed
 	 * set and go into memory reserves if necessary.
 	 */
+	if (local_flags & __GFP_WAIT)
+		local_irq_enable();
+	kmem_flagcheck(cache, flags);
 	obj = kmem_getpages(cache, flags, -1);
+	if (local_flags & __GFP_WAIT)
+		local_irq_disable();
 	if (obj) {
 		/*
 		 * Insert into the appropriate per node queues
@@ -3535,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
  *
  * Currently only used for dentry validation.
  */
-int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
+int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
 {
 	unsigned long addr = (unsigned long)ptr;
 	unsigned long min_addr = PAGE_OFFSET;
@@ -3569,6 +3587,7 @@ out:
  * @cachep: The cache to allocate from.
  * @flags: See kmalloc().
  * @nodeid: node number of the target node.
+ * @caller: return address of caller, used for debug information
  *
  * Identical to kmem_cache_alloc but it will allocate memory on the given
  * node, which can improve the performance for cpu bound structures.
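
The obj_to_index() hunk above replaces a divide with a reciprocal multiplication. A user-space sketch of the trick is shown below, assuming the ceil(2^32 / k) style reciprocal that lib/reciprocal_div.c of this era appears to use; in the slab case the dividend is always an exact multiple of buffer_size (offset = index * buffer_size), which is where the result is exact. The sketch is illustrative only and is not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* precompute ceil(2^32 / k) once per cache */
static uint32_t reciprocal_value(uint32_t k)
{
	uint64_t val = (1ULL << 32) + (k - 1);
	return (uint32_t)(val / k);
}

/* replace "a / k" with a 32x32->64 multiply and a shift */
static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t buffer_size = 192;	/* hypothetical object size for the example */
	uint32_t r = reciprocal_value(buffer_size);
	uint32_t idx;

	for (idx = 0; idx < 5; idx++) {
		uint32_t offset = idx * buffer_size;	/* exact multiples, as in the slab */
		printf("offset %u -> index %u (expect %u)\n",
		       offset, reciprocal_divide(offset, r), idx);
	}
	return 0;
}
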
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a58..5adc29cb58dd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
 static DEFINE_SPINLOCK(block_lock);
 
 static void slob_free(void *b, int size);
+static void slob_timer_cbk(void);
+
 
 static void *slob_alloc(size_t size, gfp_t gfp, int align)
 {
@@ -157,7 +159,7 @@ static int fastcall find_order(int size)
 	return order;
 }
 
-void *kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc(size_t size, gfp_t gfp)
 {
 	slob_t *m;
 	bigblock_t *bb;
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp)
 	slob_free(bb, sizeof(bigblock_t));
 	return 0;
 }
-
-EXPORT_SYMBOL(kmalloc);
+EXPORT_SYMBOL(__kmalloc);
 
 void kfree(const void *block)
 {
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c)
 EXPORT_SYMBOL(kmem_cache_name);
 
 static struct timer_list slob_timer = TIMER_INITIALIZER(
-	(void (*)(unsigned long))kmem_cache_init, 0, 0);
+	(void (*)(unsigned long))slob_timer_cbk, 0, 0);
+
+int kmem_cache_shrink(struct kmem_cache *d)
+{
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+int kmem_ptr_validate(struct kmem_cache *a, const void *b)
+{
+	return 0;
+}
+
+void __init kmem_cache_init(void)
+{
+	slob_timer_cbk();
+}
 
-void kmem_cache_init(void)
+static void slob_timer_cbk(void)
 {
 	void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 9bfb8e853860..ecdfdcc50522 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -51,6 +51,26 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
 	do_invalidatepage(page, partial);
 }
 
+void cancel_dirty_page(struct page *page, unsigned int account_size)
+{
+	/* If we're cancelling the page, it had better not be mapped any more */
+	if (page_mapped(page)) {
+		static unsigned int warncount;
+
+		WARN_ON(++warncount < 5);
+	}
+
+	if (TestClearPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+		if (mapping && mapping_cap_account_dirty(mapping)) {
+			dec_zone_page_state(page, NR_FILE_DIRTY);
+			if (account_size)
+				task_io_account_cancelled_write(account_size);
+		}
+	}
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
 /*
  * If truncate cannot remove the fs-private metadata from the page, the page
  * becomes anonymous. It will be left on the LRU and may even be mapped into
@@ -67,11 +87,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return;
 
+	cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
 	if (PagePrivate(page))
 		do_invalidatepage(page, 0);
 
-	if (test_clear_page_dirty(page))
-		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
@@ -350,7 +370,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
-			int was_dirty;
 
 			lock_page(page);
 			if (page->mapping != mapping) {
@@ -386,12 +405,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 						  PAGE_CACHE_SIZE, 0);
 				}
 			}
-			was_dirty = test_clear_page_dirty(page);
-			if (!invalidate_complete_page2(mapping, page)) {
-				if (was_dirty)
-					set_page_dirty(page);
+			if (!invalidate_complete_page2(mapping, page))
 				ret = -EIO;
-			}
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 093f5fe6dd77..40fea4918390 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -692,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		__count_vm_events(KSWAPD_STEAL, nr_freed);
 	} else
 		__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
-	__count_vm_events(PGACTIVATE, nr_freed);
+	__count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
 	if (nr_taken == 0)
 		goto done;
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (!populated_zone(zone))
 			continue;
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		note_zone_scanning_priority(zone, priority);
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1089,7 +1089,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		zone->prev_priority = priority;
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
@@ -1369,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
  *
  * For pass > 3 we also try to shrink the LRU lists that contain a few pages
  */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
-				int prio, struct scan_control *sc)
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+				int pass, struct scan_control *sc)
 {
 	struct zone *zone;
 	unsigned long nr_to_scan, ret = 0;