author		Dmitry Torokhov <dtor@insightbb.com>	2007-02-10 01:26:32 -0500
committer	Dmitry Torokhov <dtor@insightbb.com>	2007-02-10 01:26:32 -0500
commit		b22364c8eec89e6b0c081a237f3b6348df87796f (patch)
tree		233a923281fb640106465d076997ff511efb6edf /mm
parent		2c8dc071517ec2843869024dc82be2e246f41064 (diff)
parent		66efc5a7e3061c3597ac43a8bb1026488d57e66b (diff)
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'mm')
-rw-r--r--	mm/bounce.c	4
-rw-r--r--	mm/fadvise.c	2
-rw-r--r--	mm/filemap.c	31
-rw-r--r--	mm/filemap_xip.c	8
-rw-r--r--	mm/fremap.c	2
-rw-r--r--	mm/hugetlb.c	10
-rw-r--r--	mm/memory.c	59
-rw-r--r--	mm/memory_hotplug.c	6
-rw-r--r--	mm/mempolicy.c	6
-rw-r--r--	mm/mincore.c	183
-rw-r--r--	mm/mmap.c	89
-rw-r--r--	mm/mremap.c	1
-rw-r--r--	mm/nommu.c	12
-rw-r--r--	mm/oom_kill.c	21
-rw-r--r--	mm/page-writeback.c	147
-rw-r--r--	mm/page_alloc.c	137
-rw-r--r--	mm/readahead.c	4
-rw-r--r--	mm/rmap.c	36
-rw-r--r--	mm/shmem.c	27
-rw-r--r--	mm/slab.c	119
-rw-r--r--	mm/slob.c	27
-rw-r--r--	mm/swapfile.c	12
-rw-r--r--	mm/tiny-shmem.c	4
-rw-r--r--	mm/truncate.c	49
-rw-r--r--	mm/vmscan.c	47
25 files changed, 685 insertions, 358 deletions
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a4024..643efbe82402 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
 	if (!bio)
 		return;
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
 	 * pages
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
 	/*
 	 * slow path
 	 */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121bb..0df4c899e979 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	if (!file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index af7e2f5caea9..f30ef28405d3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -606,26 +606,6 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 EXPORT_SYMBOL(find_get_page);
 
 /**
- * find_trylock_page - find and lock a page
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * Same as find_get_page(), but trylock it instead of incrementing the count.
- */
-struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
-{
-	struct page *page;
-
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
-		page = NULL;
-	read_unlock_irq(&mapping->tree_lock);
-	return page;
-}
-EXPORT_SYMBOL(find_trylock_page);
-
-/**
  * find_lock_page - locate, pin and lock a pagecache page
  * @mapping: the address_space to search
  * @offset: the page index
@@ -1181,8 +1161,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	if (pos < size) {
 		retval = generic_file_direct_IO(READ, iocb,
 					iov, pos, nr_segs);
-		if (retval > 0 && !is_sync_kiocb(iocb))
-			retval = -EIOCBQUEUED;
 		if (retval > 0)
 			*ppos = pos + retval;
 	}
@@ -2047,15 +2025,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
 	 * i_mutex is held, which protects generic_osync_inode() from
-	 * livelocking.
+	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
 	 */
-	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if ((written >= 0 || written == -EIOCBQUEUED) &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (err < 0)
 			written = err;
 	}
-	if (written == count && !is_sync_kiocb(iocb))
-		written = -EIOCBQUEUED;
 	return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);
@@ -2269,7 +2246,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_dentry);
+	err = remove_suid(file->f_path.dentry);
 	if (err)
 		goto out;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bfb..9dd9fbb75139 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,13 +183,13 @@ __xip_unmap (struct address_space * mapping,
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 		pte = page_check_address(page, mm, address, &ptl);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
@@ -246,7 +246,7 @@ xip_file_nopage(struct vm_area_struct * area,
 		__xip_unmap(mapping, pgoff);
 	} else {
 		/* not shared and writable, use ZERO_PAGE() */
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 	}
 
 out:
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_dentry);
+	ret = remove_suid(filp->f_path.dentry);
 	if (ret)
 		goto out_backing;
 
diff --git a/mm/fremap.c b/mm/fremap.c
index b77a002c3352..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			page_cache_release(page);
 		}
 	} else {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ccc7f230252..36db012b38dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr)
 }
 
 static void copy_huge_page(struct page *dst, struct page *src,
-			   unsigned long addr)
+			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 
 	might_sleep();
 	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
 		cond_resched();
-		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
@@ -389,6 +389,8 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			continue;
 
 		page = pte_page(pte);
+		if (pte_dirty(pte))
+			set_page_dirty(page);
 		list_add(&page->lru, &page_list);
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -442,7 +444,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	copy_huge_page(new_page, old_page, address);
+	copy_huge_page(new_page, old_page, address, vma);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
diff --git a/mm/memory.c b/mm/memory.c
index 4198df0dff1c..ef09f0acb1d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			mark_page_accessed(page);
 			file_rss--;
 		}
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		tlb_remove_page(tlb, page);
 		continue;
 	}
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages) {
 			pages[i] = page;
 
-			flush_anon_page(page, start);
+			flush_anon_page(vma, page, start);
 			flush_dcache_page(page);
 		}
 		if (vmas)
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	int err = 0;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		return -ENOMEM;
+		return -EAGAIN;
 	arch_enter_lazy_mmu_mode();
 	do {
 		struct page *page = ZERO_PAGE(addr);
 		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+		if (unlikely(!pte_none(*pte))) {
+			err = -EEXIST;
+			pte++;
+			break;
+		}
 		page_cache_get(page);
 		page_add_file_rmap(page);
 		inc_mm_counter(mm, file_rss);
-		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (zeromap_pte_range(mm, pmd, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pte_range(mm, pmd, addr, next, prot);
+		if (err)
+			break;
 	} while (pmd++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pud_addr_end(addr, end);
-		if (zeromap_pmd_range(mm, pud, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pmd_range(mm, pud, addr, next, prot);
+		if (err)
+			break;
 	} while (pud++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 int zeromap_page_range(struct vm_area_struct *vma,
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
 	 * If the source page was a PFN mapping, we don't have
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
 		return;
 
 	}
-	copy_user_highpage(dst, src, va);
+	copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1567,7 +1577,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, address);
+		cow_user_page(new_page, old_page, address, vma);
 	}
 
 	/*
@@ -1576,7 +1586,7 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page);
+			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -2190,7 +2200,7 @@ retry:
 			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 			if (!page)
 				goto oom;
-			copy_user_highpage(page, new_page, address);
+			copy_user_highpage(page, new_page, address, vma);
 			page_cache_release(new_page);
 			new_page = page;
 			anon = 1;
@@ -2596,8 +2606,15 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_mm = NULL;
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
-	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = 0;
+	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+	gate_vma.vm_page_prot = __P101;
+	/*
+	 * Make sure the vDSO gets into every core dump.
+	 * Dumping its contents makes post-mortem fully interpretable later
+	 * without matching up the same kernel and hardware config to see
+	 * what PC values meant.
+	 */
+	gate_vma.vm_flags |= VM_ALWAYSDUMP;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c055a090f4d..84279127fcd3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,11 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	zone_type = zone - pgdat->node_zones;
 	if (!populated_zone(zone)) {
 		int ret = 0;
-		ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+		ret = init_currently_empty_zone(zone, phys_start_pfn,
+						nr_pages, MEMMAP_HOTPLUG);
 		if (ret < 0)
 			return ret;
 	}
-	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+	memmap_init_zone(nr_pages, nid, zone_type,
+			 phys_start_pfn, MEMMAP_HOTPLUG);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b917d6fdc1bb..c2aec0e1090d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -884,6 +884,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
+#ifdef CONFIG_CPUSETS
+	/* Restrict the nodes to the allowed nodes in the cpuset */
+	nodes_and(nodes, nodes, current->mems_allowed);
+#endif
 	return do_mbind(start, len, mode, &nodes, flags);
 }
 
@@ -1857,7 +1861,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_printf(m, " file=");
-		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
 /*
  * linux/mm/mincore.c
  *
- * Copyright (C) 1994-1999  Linus Torvalds
+ * Copyright (C) 1994-2006  Linus Torvalds
  */
 
 /*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
 	return present;
 }
 
-static long mincore_vma(struct vm_area_struct * vma,
-	unsigned long start, unsigned long end, unsigned char __user * vec)
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
 {
-	long error, i, remaining;
-	unsigned char * tmp;
-
-	error = -ENOMEM;
-	if (!vma->vm_file)
-		return error;
-
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	unsigned long i, nr, pgoff;
+	struct vm_area_struct *vma = find_vma(current->mm, addr);
 
-	error = -EAGAIN;
-	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
-	if (!tmp)
-		return error;
+	/*
+	 * find_vma() didn't find anything above us, or we're
+	 * in an unmapped hole in the address space: ENOMEM.
+	 */
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
 
-	/* (end - start) is # of pages, and also # of bytes in "vec */
-	remaining = (end - start),
+	/*
+	 * Ok, got it. But check whether it's a segment we support
+	 * mincore() on. Right now, we don't do any anonymous mappings.
+	 *
+	 * FIXME: This is just stupid. And returning ENOMEM is
+	 * stupid too. We should just look at the page tables. But
+	 * this is what we've traditionally done, so we'll just
+	 * continue doing it.
+	 */
+	if (!vma->vm_file)
+		return -ENOMEM;
 
-	error = 0;
-	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
-		int j = 0;
-		long thispiece = (remaining < PAGE_SIZE) ?
-					remaining : PAGE_SIZE;
+	/*
+	 * Calculate how many pages there are left in the vma, and
+	 * what the pgoff is for our address.
+	 */
+	nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+	if (nr > pages)
+		nr = pages;
 
-		while (j < thispiece)
-			tmp[j++] = mincore_page(vma, start++);
+	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgoff += vma->vm_pgoff;
 
-		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
-			error = -EFAULT;
-			break;
-		}
-	}
+	/* And then we just fill the sucker in.. */
+	for (i = 0 ; i < nr; i++, pgoff++)
+		vec[i] = mincore_page(vma, pgoff);
 
-	free_page((unsigned long) tmp);
-	return error;
+	return nr;
 }
 
 /*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 	unsigned char __user * vec)
 {
-	int index = 0;
-	unsigned long end, limit;
-	struct vm_area_struct * vma;
-	size_t max;
-	int unmapped_error = 0;
-	long error;
-
-	/* check the arguments */
-	if (start & ~PAGE_CACHE_MASK)
-		goto einval;
-
-	limit = TASK_SIZE;
-	if (start >= limit)
-		goto enomem;
-
-	if (!len)
-		return 0;
-
-	max = limit - start;
-	len = PAGE_CACHE_ALIGN(len);
-	if (len > max || !len)
-		goto enomem;
+	long retval;
+	unsigned long pages;
+	unsigned char *tmp;
 
-	end = start + len;
+	/* Check the start address: needs to be page-aligned.. */
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
 
-	/* check the output buffer whilst holding the lock */
-	error = -EFAULT;
-	down_read(&current->mm->mmap_sem);
+	/* ..and we need to be passed a valid user-space range */
+	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+		return -ENOMEM;
 
-	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
-		goto out;
+	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
+	pages = len >> PAGE_SHIFT;
+	pages += (len & ~PAGE_MASK) != 0;
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 */
-	error = 0;
-
-	vma = find_vma(current->mm, start);
-	while (vma) {
-		/* Here start < vma->vm_end. */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-		}
+	if (!access_ok(VERIFY_WRITE, vec, pages))
+		return -EFAULT;
 
-		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = mincore_vma(vma, start, end,
-							&vec[index]);
-				if (error)
-					goto out;
-			}
-			error = unmapped_error;
-			goto out;
+	tmp = (void *) __get_free_page(GFP_USER);
+	if (!tmp)
+		return -EAGAIN;
+
+	retval = 0;
+	while (pages) {
+		/*
+		 * Do at most PAGE_SIZE entries per iteration, due to
+		 * the temporary buffer size.
+		 */
+		down_read(&current->mm->mmap_sem);
+		retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+		up_read(&current->mm->mmap_sem);
+
+		if (retval <= 0)
+			break;
+		if (copy_to_user(vec, tmp, retval)) {
+			retval = -EFAULT;
+			break;
 		}
-
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
-		if (error)
-			goto out;
-		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
-		start = vma->vm_end;
-		vma = vma->vm_next;
+		pages -= retval;
+		vec += retval;
+		start += retval << PAGE_SHIFT;
+		retval = 0;
 	}
-
-	/* we found a hole in the area queried if we arrive here */
-	error = -ENOMEM;
-
-out:
-	up_read(&current->mm->mmap_sem);
-	return error;
-
-einval:
-	return -EINVAL;
-enomem:
-	return -ENOMEM;
+	free_page((unsigned long) tmp);
+	return retval;
 }
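For reference, a minimal userspace sketch (not part of this commit) of the mincore() call whose kernel side is rewritten above. It exercises the behaviour the new do_mincore()/sys_mincore() path keeps: a page-aligned start address, one residency byte per page in vec, and -ENOMEM for unmapped holes. The file name "some_file" is an arbitrary placeholder.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("some_file", O_RDONLY);
	struct stat st;
	unsigned char *vec;
	void *map;
	size_t i, pages;

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	pages = (st.st_size + page - 1) / page;
	map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	vec = malloc(pages);
	if (map == MAP_FAILED || !vec)
		return 1;

	/* start must be page-aligned; vec receives one status byte per page */
	if (mincore(map, st.st_size, vec) == 0)
		for (i = 0; i < pages; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");

	free(vec);
	munmap(map, st.st_size);
	close(fd);
	return 0;
}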
diff --git a/mm/mmap.c b/mm/mmap.c
index 7be110e98d4c..eb509ae76553 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 			struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable--;
 
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
 	struct address_space *mapping = file->f_mapping;
 
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_dec(&file->f_dentry->d_inode->i_writecount);
+		atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable++;
 
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	 * mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
 	if (!len)
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		return -EAGAIN;
 	}
 
-	inode = file ? file->f_dentry->d_inode : NULL;
+	inode = file ? file->f_path.dentry->d_inode : NULL;
 
 	if (file) {
 		switch (flags & MAP_TYPE) {
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
@@ -1477,6 +1477,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct rlimit *rlim = current->signal->rlim;
+	unsigned long new_start;
 
 	/* address space limit tests */
 	if (!may_expand_vm(mm, grow))
@@ -1496,6 +1497,12 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 		return -ENOMEM;
 	}
 
+	/* Check to ensure the stack will not grow into a hugetlb-only region */
+	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+			vma->vm_end - size;
+	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+		return -EFAULT;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
@@ -2094,3 +2101,75 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
 		return 0;
 	return 1;
 }
+
+
+static struct page *special_mapping_nopage(struct vm_area_struct *vma,
+					   unsigned long address, int *type)
+{
+	struct page **pages;
+
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
+	address -= vma->vm_start;
+	for (pages = vma->vm_private_data; address > 0 && *pages; ++pages)
+		address -= PAGE_SIZE;
+
+	if (*pages) {
+		struct page *page = *pages;
+		get_page(page);
+		return page;
+	}
+
+	return NOPAGE_SIGBUS;
+}
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct special_mapping_vmops = {
+	.close = special_mapping_close,
+	.nopage	= special_mapping_nopage,
+};
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+int install_special_mapping(struct mm_struct *mm,
+			    unsigned long addr, unsigned long len,
+			    unsigned long vm_flags, struct page **pages)
+{
+	struct vm_area_struct *vma;
+
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (unlikely(vma == NULL))
+		return -ENOMEM;
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+
+	vma->vm_flags = vm_flags | mm->def_flags;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+
+	vma->vm_ops = &special_mapping_vmops;
+	vma->vm_private_data = pages;
+
+	if (unlikely(insert_vm_struct(mm, vma))) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return -ENOMEM;
+	}
+
+	mm->total_vm += len >> PAGE_SHIFT;
+
+	return 0;
+}
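As a hedged aside (not code from this tree), here is a sketch of how an architecture might call the new install_special_mapping() helper above to map a single vDSO-style page at exec time. The arch_setup_additional_pages() signature, the vdso_pages array, the chosen address and the flag set are illustrative assumptions; only install_special_mapping() itself comes from this merge.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/binfmts.h>
#include <linux/err.h>

/* One page of vDSO text, initialised elsewhere; NULL-terminated array. */
static struct page *vdso_pages[2];

int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr;
	int ret = 0;

	down_write(&mm->mmap_sem);
	addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	/*
	 * VM_ALWAYSDUMP mirrors what gate_vma_init() does in this merge,
	 * so the page shows up in core dumps.
	 */
	ret = install_special_mapping(mm, addr, PAGE_SIZE,
				      VM_READ | VM_EXEC |
				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
				      VM_ALWAYSDUMP,
				      vdso_pages);
up_fail:
	up_write(&mm->mmap_sem);
	return ret;
}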
diff --git a/mm/mremap.c b/mm/mremap.c
index 9c769fa29f32..5d4bd4f95b8e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -105,7 +105,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		if (pte_none(*old_pte))
 			continue;
 		pte = ptep_clear_flush(vma, old_addr, old_pte);
-		/* ZERO_PAGE can be dependant on virtual addr */
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index af874569d0f1..23fb033e596d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -523,7 +523,7 @@ static int validate_mmap_request(struct file *file,
 	 */
 	mapping = file->f_mapping;
 	if (!mapping)
-		mapping = file->f_dentry->d_inode->i_mapping;
+		mapping = file->f_path.dentry->d_inode->i_mapping;
 
 	capabilities = 0;
 	if (mapping && mapping->backing_dev_info)
@@ -532,7 +532,7 @@ static int validate_mmap_request(struct file *file,
 	if (!capabilities) {
 		/* no explicit capabilities set, so assume some
 		 * defaults */
-		switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+		switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
 		case S_IFREG:
 		case S_IFBLK:
 			capabilities = BDI_CAP_MAP_COPY;
@@ -563,11 +563,11 @@ static int validate_mmap_request(struct file *file,
 		    !(file->f_mode & FMODE_WRITE))
 			return -EACCES;
 
-		if (IS_APPEND(file->f_dentry->d_inode) &&
+		if (IS_APPEND(file->f_path.dentry->d_inode) &&
 		    (file->f_mode & FMODE_WRITE))
 			return -EACCES;
 
-		if (locks_verify_locked(file->f_dentry->d_inode))
+		if (locks_verify_locked(file->f_path.dentry->d_inode))
 			return -EAGAIN;
 
 		if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -598,7 +598,7 @@ static int validate_mmap_request(struct file *file,
 
 		/* handle executable mappings and implied executable
 		 * mappings */
-		if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 			if (prot & PROT_EXEC)
 				return -EPERM;
 		}
@@ -833,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			continue;
 
 		/* search for overlapping mappings on the same file */
-		if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+		if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
 			continue;
 
 		if (vma->vm_pgoff >= pgoff + pglen)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 223d9ccb7d64..b278b8d60eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	}
 
 	/*
-	 * swapoff can easily use up all memory, so kill those first.
-	 */
-	if (p->flags & PF_SWAPOFF)
-		return ULONG_MAX;
-
-	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = mm->total_vm;
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	task_unlock(p);
 
 	/*
+	 * swapoff can easily use up all memory, so kill those first.
+	 */
+	if (p->flags & PF_SWAPOFF)
+		return ULONG_MAX;
+
+	/*
 	 * Processes which fork a lot of child processes are likely
 	 * a good choice. We add half the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes = node_online_map;
+	nodemask_t nodes;
+	int node;
+	/* node has memory ? */
+	for_each_online_node(node)
+		if (NODE_DATA(node)->node_present_pages)
+			node_set(node, nodes);
 
 	for (z = zonelist->zones; *z; z++)
-		if (cpuset_zone_allowed(*z, gfp_mask))
+		if (cpuset_zone_allowed_softwall(*z, gfp_mask))
 			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8d9b19f239c3..be0efbde4994 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/init.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -132,11 +133,9 @@ get_dirty_limits(long *pbackground, long *pdirty,
 
 #ifdef CONFIG_HIGHMEM
 	/*
-	 * If this mapping can only allocate from low memory,
-	 * we exclude high memory from our count.
+	 * We always exclude high memory from our count.
 	 */
-	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
-		available_memory -= totalhigh_pages;
+	available_memory -= totalhigh_pages;
 #endif
 
 
@@ -525,28 +524,25 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
 };
 
 /*
- * If the machine has a large highmem:lowmem ratio then scale back the default
- * dirty memory thresholds: allowing too much dirty highmem pins an excessive
- * number of buffer_heads.
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages.
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
  */
 void __init page_writeback_init(void)
 {
-	long buffer_pages = nr_free_buffer_pages();
-	long correction;
-
-	correction = (100 * 4 * buffer_pages) / vm_total_pages;
-
-	if (correction < 100) {
-		dirty_background_ratio *= correction;
-		dirty_background_ratio /= 100;
-		vm_dirty_ratio *= correction;
-		vm_dirty_ratio /= 100;
-
-		if (dirty_background_ratio <= 0)
-			dirty_background_ratio = 1;
-		if (vm_dirty_ratio <= 0)
-			vm_dirty_ratio = 1;
-	}
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
@@ -761,23 +757,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
 
-		if (mapping) {
-			write_lock_irq(&mapping->tree_lock);
-			mapping2 = page_mapping(page);
-			if (mapping2) { /* Race with truncate? */
-				BUG_ON(mapping2 != mapping);
-				if (mapping_cap_account_dirty(mapping))
-					__inc_zone_page_state(page,
-							NR_FILE_DIRTY);
-				radix_tree_tag_set(&mapping->page_tree,
-					page_index(page), PAGECACHE_TAG_DIRTY);
-			}
-			write_unlock_irq(&mapping->tree_lock);
-			if (mapping->host) {
-				/* !PageAnon && !swapper_space */
-				__mark_inode_dirty(mapping->host,
-							I_DIRTY_PAGES);
-			}
+		if (!mapping)
+			return 1;
+
+		write_lock_irq(&mapping->tree_lock);
+		mapping2 = page_mapping(page);
+		if (mapping2) { /* Race with truncate? */
+			BUG_ON(mapping2 != mapping);
+			if (mapping_cap_account_dirty(mapping)) {
+				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				task_io_account_write(PAGE_CACHE_SIZE);
 			}
+			radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		write_unlock_irq(&mapping->tree_lock);
+		if (mapping->host) {
+			/* !PageAnon && !swapper_space */
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 		return 1;
 	}
@@ -843,39 +840,6 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	unsigned long flags;
-
-	if (mapping) {
-		write_lock_irqsave(&mapping->tree_lock, flags);
-		if (TestClearPageDirty(page)) {
-			radix_tree_tag_clear(&mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			/*
-			 * We can continue to use `mapping' here because the
-			 * page is locked, which pins the address_space
-			 */
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
-			return 1;
-		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-		return 0;
-	}
-	return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
  *
@@ -893,12 +857,41 @@ int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
-	if (mapping) {
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		/*
+		 * Yes, Virginia, this is indeed insane.
+		 *
+		 * We use this sequence to make sure that
+		 *  (a) we account for dirty stats properly
+		 *  (b) we tell the low-level filesystem to
+		 *      mark the whole page dirty if it was
+		 *      dirty in a pagetable. Only to then
+		 *  (c) clean the page again and return 1 to
+		 *      cause the writeback.
+		 *
		 * This way we avoid all nasty races with the
+		 * dirty bit in multiple places and clearing
+		 * them concurrently from different threads.
+		 *
+		 * Note! Normally the "set_page_dirty(page)"
+		 * has no effect on the actual dirty bit - since
+		 * that will already usually be set. But we
+		 * need the side effects, and it can help us
+		 * avoid races.
+		 *
+		 * We basically use the page "master dirty bit"
+		 * as a serialization point for all the different
+		 * threads doing their things.
+		 *
+		 * FIXME! We still have a race here: if somebody
+		 * adds the page back to the page tables in
+		 * between the "page_mkclean()" and the "TestClearPageDirty()",
+		 * we might have it mapped without the dirty bit set.
+		 */
+		if (page_mkclean(page))
+			set_page_dirty(page);
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
+			dec_zone_page_state(page, NR_FILE_DIRTY);
 			return 1;
 		}
 		return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cace22b3ac25..f12052dc23ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
+		if (!populated_zone(zone))
+			continue;
+
 		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
@@ -892,6 +896,91 @@ failed:
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+	struct fault_attr attr;
+
+	u32 ignore_gfp_highmem;
+	u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+	struct dentry *ignore_gfp_highmem_file;
+	struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+	.ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+	return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	if (gfp_mask & __GFP_NOFAIL)
+		return 0;
+	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+		return 0;
+	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+		return 0;
+
+	return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&fail_page_alloc.attr,
+				       "fail_page_alloc");
+	if (err)
+		return err;
+	dir = fail_page_alloc.attr.dentries.dir;
+
+	fail_page_alloc.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &fail_page_alloc.ignore_gfp_wait);
+
+	fail_page_alloc.ignore_gfp_highmem_file =
+		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+				      &fail_page_alloc.ignore_gfp_highmem);
+
+	if (!fail_page_alloc.ignore_gfp_wait_file ||
+	    !fail_page_alloc.ignore_gfp_highmem_file) {
+		err = -ENOMEM;
+		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+	}
+
+	return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
@@ -900,8 +989,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
-	unsigned long min = mark;
-	long free_pages = z->free_pages - (1 << order) + 1;
+	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1076,7 +1164,7 @@ zonelist_scan:
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1136,6 +1224,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	might_sleep_if(wait);
 
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
 restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -1488,8 +1579,8 @@ void show_free_areas(void)
 
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
-		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+		" free:%u slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
 		active,
 		inactive,
 		global_page_state(NR_FILE_DIRTY),
@@ -1499,7 +1590,8 @@ void show_free_areas(void)
 		global_page_state(NR_SLAB_RECLAIMABLE) +
 			global_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_page_state(NR_FILE_MAPPED),
-		global_page_state(NR_PAGETABLE));
+		global_page_state(NR_PAGETABLE),
+		global_page_state(NR_BOUNCE));
 
 	for_each_zone(zone) {
 		int i;
@@ -1864,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
 * done. Non-atomic initialization, single-pass.
 */
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
+		unsigned long start_pfn, enum memmap_context context)
 {
 	struct page *page;
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
+		/*
+		 * There can be holes in boot-time mem_map[]s
+		 * handed to this function.  They do not
+		 * exist on hotplugged memory.
+		 */
+		if (context == MEMMAP_EARLY) {
+			if (!early_pfn_valid(pfn))
+				continue;
+			if (!early_pfn_in_nid(pfn, nid))
+				continue;
+		}
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
@@ -1901,7 +2000,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
+	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
 static int __cpuinit zone_batchsize(struct zone *zone)
@@ -2147,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
 
 __meminit int init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
-					unsigned long size)
+					unsigned long size,
+					enum memmap_context context)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int ret;
@@ -2591,7 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn,
+						size, MEMMAP_EARLY);
 		BUG_ON(ret);
 		zone_start_pfn += size;
 	}
@@ -3232,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 			numentries >>= (scale - PAGE_SHIFT);
 		else
 			numentries <<= (PAGE_SHIFT - scale);
+
+		/* Make sure we've got at least a 0-order allocation.. */
+		if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+			numentries = PAGE_SIZE / bucketsize;
 	}
 	numentries = roundup_pow_of_two(numentries);
 
@@ -3244,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (numentries > max)
 		numentries = max;
 
-	log2qty = long_log2(numentries);
+	log2qty = ilog2(numentries);
 
 	do {
 		size = bucketsize << log2qty;
@@ -3266,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
 	       tablename,
 	       (1U << log2qty),
-	       long_log2(size) - PAGE_SHIFT,
+	       ilog2(size) - PAGE_SHIFT,
 	       size);
 
 	if (_hash_shift)
diff --git a/mm/readahead.c b/mm/readahead.c
index a386f2b6b335..0f539e8e827a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -151,6 +152,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 			put_pages_list(pages);
 			break;
 		}
+		task_io_account_read(PAGE_CACHE_SIZE);
 	}
 	pagevec_lru_add(&lru_pvec);
 	return ret;
@@ -450,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
 *
 * Note that @filp is purely used for passing on to the ->readpage[s]()
 * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
 * Also, @ra may not be equal to &@filp->f_ra.
 *
 */
diff --git a/mm/rmap.c b/mm/rmap.c
index d8a842a586db..669acb22b572 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/kallsyms.h>
 
 #include <asm/tlbflush.h>
 
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
-	pte_t *pte, entry;
+	pte_t *pte;
 	spinlock_t *ptl;
 	int ret = 0;
 
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 	if (!pte)
 		goto out;
 
-	if (!pte_dirty(*pte) && !pte_write(*pte))
-		goto unlock;
+	if (pte_dirty(*pte) || pte_write(*pte)) {
+		pte_t entry;
 
-	entry = ptep_get_and_clear(mm, address, pte);
-	entry = pte_mkclean(entry);
-	entry = pte_wrprotect(entry);
-	ptep_establish(vma, address, pte, entry);
-	lazy_mmu_prot_update(entry);
-	ret = 1;
+		flush_cache_page(vma, address, pte_pfn(*pte));
+		entry = ptep_clear_flush(vma, address, pte);
+		entry = pte_wrprotect(entry);
+		entry = pte_mkclean(entry);
+		set_pte_at(mm, address, pte, entry);
+		lazy_mmu_prot_update(entry);
+		ret = 1;
+	}
 
-unlock:
 	pte_unmap_unlock(pte, ptl);
 out:
 	return ret;
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page)
 		if (mapping)
 			ret = page_mkclean_file(mapping, page);
 	}
+	if (page_test_and_clear_dirty(page))
+		ret = 1;
 
 	return ret;
 }
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page)
 *
 * The caller needs to hold the pte lock.
 */
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		if (unlikely(page_mapcount(page) < 0)) {
 			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
 			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
+			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
+			if (vma->vm_ops)
+				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+			if (vma->vm_file && vma->vm_file->f_op)
+				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 			BUG();
 		}
 
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		dec_mm_counter(mm, file_rss);
 
 
-	page_remove_rmap(page);
+	page_remove_rmap(page, vma);
 	page_cache_release(page);
 
 out_unmap:
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		if (pte_dirty(pteval))
 			set_page_dirty(page);
 
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		page_cache_release(page);
 		dec_mm_counter(mm, file_rss);
775 (*mapcount)--; 785 (*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index c820b4f77b8d..70da7a0981bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
515 size = SHMEM_NR_DIRECT; 515 size = SHMEM_NR_DIRECT;
516 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); 516 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
517 } 517 }
518 if (!topdir) 518
519 /*
520 * If there are no indirect blocks or we are punching a hole
521 * below indirect blocks, nothing to be done.
522 */
523 if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
519 goto done2; 524 goto done2;
520 525
521 BUG_ON(limit <= SHMEM_NR_DIRECT); 526 BUG_ON(limit <= SHMEM_NR_DIRECT);
@@ -1225,7 +1230,7 @@ failed:
1225 1230
1226struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) 1231struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1227{ 1232{
1228 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1233 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1229 struct page *page = NULL; 1234 struct page *page = NULL;
1230 unsigned long idx; 1235 unsigned long idx;
1231 int error; 1236 int error;
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma,
1248 unsigned long addr, unsigned long len, 1253 unsigned long addr, unsigned long len,
1249 pgprot_t prot, unsigned long pgoff, int nonblock) 1254 pgprot_t prot, unsigned long pgoff, int nonblock)
1250{ 1255{
1251 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1256 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1252 struct mm_struct *mm = vma->vm_mm; 1257 struct mm_struct *mm = vma->vm_mm;
1253 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; 1258 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1254 unsigned long size; 1259 unsigned long size;
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma,
1293#ifdef CONFIG_NUMA 1298#ifdef CONFIG_NUMA
1294int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1299int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1295{ 1300{
1296 struct inode *i = vma->vm_file->f_dentry->d_inode; 1301 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1297 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1302 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1298} 1303}
1299 1304
1300struct mempolicy * 1305struct mempolicy *
1301shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1306shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1302{ 1307{
1303 struct inode *i = vma->vm_file->f_dentry->d_inode; 1308 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1304 unsigned long idx; 1309 unsigned long idx;
1305 1310
1306 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1311 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1310 1315
1311int shmem_lock(struct file *file, int lock, struct user_struct *user) 1316int shmem_lock(struct file *file, int lock, struct user_struct *user)
1312{ 1317{
1313 struct inode *inode = file->f_dentry->d_inode; 1318 struct inode *inode = file->f_path.dentry->d_inode;
1314 struct shmem_inode_info *info = SHMEM_I(inode); 1319 struct shmem_inode_info *info = SHMEM_I(inode);
1315 int retval = -ENOMEM; 1320 int retval = -ENOMEM;
1316 1321
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
1422static ssize_t 1427static ssize_t
1423shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 1428shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1424{ 1429{
1425 struct inode *inode = file->f_dentry->d_inode; 1430 struct inode *inode = file->f_path.dentry->d_inode;
1426 loff_t pos; 1431 loff_t pos;
1427 unsigned long written; 1432 unsigned long written;
1428 ssize_t err; 1433 ssize_t err;
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1442 if (err || !count) 1447 if (err || !count)
1443 goto out; 1448 goto out;
1444 1449
1445 err = remove_suid(file->f_dentry); 1450 err = remove_suid(file->f_path.dentry);
1446 if (err) 1451 if (err)
1447 goto out; 1452 goto out;
1448 1453
@@ -1524,7 +1529,7 @@ out:
1524 1529
1525static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1530static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1526{ 1531{
1527 struct inode *inode = filp->f_dentry->d_inode; 1532 struct inode *inode = filp->f_path.dentry->d_inode;
1528 struct address_space *mapping = inode->i_mapping; 1533 struct address_space *mapping = inode->i_mapping;
1529 unsigned long index, offset; 1534 unsigned long index, offset;
1530 1535
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2493 d_instantiate(dentry, inode); 2498 d_instantiate(dentry, inode);
2494 inode->i_size = size; 2499 inode->i_size = size;
2495 inode->i_nlink = 0; /* It is unlinked */ 2500 inode->i_nlink = 0; /* It is unlinked */
2496 file->f_vfsmnt = mntget(shm_mnt); 2501 file->f_path.mnt = mntget(shm_mnt);
2497 file->f_dentry = dentry; 2502 file->f_path.dentry = dentry;
2498 file->f_mapping = inode->i_mapping; 2503 file->f_mapping = inode->i_mapping;
2499 file->f_op = &shmem_file_operations; 2504 file->f_op = &shmem_file_operations;
2500 file->f_mode = FMODE_WRITE | FMODE_READ; 2505 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/slab.c b/mm/slab.c
index 068cb4503c15..c6100628a6ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,7 +107,9 @@
107#include <linux/nodemask.h> 107#include <linux/nodemask.h>
108#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
109#include <linux/mutex.h> 109#include <linux/mutex.h>
110#include <linux/fault-inject.h>
110#include <linux/rtmutex.h> 111#include <linux/rtmutex.h>
112#include <linux/reciprocal_div.h>
111 113
112#include <asm/cacheflush.h> 114#include <asm/cacheflush.h>
113#include <asm/tlbflush.h> 115#include <asm/tlbflush.h>
@@ -385,6 +387,7 @@ struct kmem_cache {
385 unsigned int shared; 387 unsigned int shared;
386 388
387 unsigned int buffer_size; 389 unsigned int buffer_size;
390 u32 reciprocal_buffer_size;
388/* 3) touched by every alloc & free from the backend */ 391/* 3) touched by every alloc & free from the backend */
389 struct kmem_list3 *nodelists[MAX_NUMNODES]; 392 struct kmem_list3 *nodelists[MAX_NUMNODES];
390 393
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
626 return slab->s_mem + cache->buffer_size * idx; 629 return slab->s_mem + cache->buffer_size * idx;
627} 630}
628 631
629static inline unsigned int obj_to_index(struct kmem_cache *cache, 632/*
630 struct slab *slab, void *obj) 633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 * Using the fact that buffer_size is a constant for a particular cache,
635 * we can replace (offset / cache->buffer_size) by
636 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639 const struct slab *slab, void *obj)
631{ 640{
632 return (unsigned)(obj - slab->s_mem) / cache->buffer_size; 641 u32 offset = (obj - slab->s_mem);
642 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
633} 643}
634 644
635/* 645/*
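
The new obj_to_index() trades the divide for a multiply-and-shift, using the reciprocal_buffer_size value precomputed in kmem_cache_init()/kmem_cache_create() further down in this diff. The trick is exact here because the offset passed in is always a whole multiple of buffer_size. A self-contained userspace check of that math; the two helpers mirror what the lib/reciprocal_div helpers compute, and the object sizes are arbitrary examples:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* R = ceil(2^32 / B); then, for A a multiple of B (A < 2^32), A/B == (A*R) >> 32 */
static uint32_t reciprocal_value(uint32_t b)
{
	uint64_t val = (1ULL << 32) + (b - 1);
	return (uint32_t)(val / b);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	/* a few plausible object sizes (illustrative, not a kernel list) */
	uint32_t sizes[] = { 32, 96, 192, 256, 1000, 4096 };
	unsigned int i;
	uint32_t idx;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		uint32_t buffer_size = sizes[i];
		uint32_t r = reciprocal_value(buffer_size);

		/* offsets are always idx * buffer_size, as in obj_to_index() */
		for (idx = 0; idx < 10000; idx++) {
			uint32_t offset = idx * buffer_size;
			assert(reciprocal_divide(offset, r) == offset / buffer_size);
		}
	}
	printf("reciprocal divide matches plain divide for multiples of B\n");
	return 0;
}

The point of the precomputation is that the per-cache reciprocal is calculated once, while obj_to_index() runs on every alloc/free debug check.
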
@@ -945,7 +955,8 @@ static void __devinit start_cpu_timer(int cpu)
945 if (keventd_up() && reap_work->work.func == NULL) { 955 if (keventd_up() && reap_work->work.func == NULL) {
946 init_reap_node(cpu); 956 init_reap_node(cpu);
947 INIT_DELAYED_WORK(reap_work, cache_reap); 957 INIT_DELAYED_WORK(reap_work, cache_reap);
948 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 958 schedule_delayed_work_on(cpu, reap_work,
959 __round_jiffies_relative(HZ, cpu));
949 } 960 }
950} 961}
951 962
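
start_cpu_timer() (and cache_reap() later in this file) now round their delays with __round_jiffies_relative()/round_jiffies_relative(), so the per-cpu reap timers tend to expire near the same whole-second boundary, with a small per-cpu skew, instead of waking the CPU at arbitrary offsets. A simplified userspace model of that rounding; the real kernel helpers differ in detail, and HZ and the skew factor here are assumptions for illustration:

#include <stdio.h>

#define HZ 250	/* illustrative tick rate */

/*
 * Round an absolute expiry (in jiffies) to a whole-second boundary,
 * skewed slightly per cpu so that all cpus do not fire on the exact
 * same tick.  Simplified model of what __round_jiffies() does.
 */
static unsigned long model_round_jiffies(unsigned long j, int cpu)
{
	unsigned long skew = cpu * 3;
	unsigned long rem;

	j += skew;
	rem = j % HZ;
	if (rem < HZ / 4)
		j -= rem;		/* round down to the second */
	else
		j += HZ - rem;		/* round up to the next second */
	return j - skew;
}

static unsigned long model_round_jiffies_relative(unsigned long delay,
						  unsigned long now, int cpu)
{
	return model_round_jiffies(now + delay, cpu) - now;
}

int main(void)
{
	unsigned long now = 100003;	/* arbitrary current jiffies value */
	int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%d: raw delay %d -> rounded delay %lu\n",
		       cpu, HZ + 3 * cpu,
		       model_round_jiffies_relative(HZ + 3 * cpu, now, cpu));
	return 0;
}

Batching the expirations this way lets an otherwise idle system stay in deeper sleep states between the grouped wakeups.
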
@@ -1425,6 +1436,8 @@ void __init kmem_cache_init(void)
1425 1436
1426 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1437 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1427 cache_line_size()); 1438 cache_line_size());
1439 cache_cache.reciprocal_buffer_size =
1440 reciprocal_value(cache_cache.buffer_size);
1428 1441
1429 for (order = 0; order < MAX_ORDER; order++) { 1442 for (order = 0; order < MAX_ORDER; order++) {
1430 cache_estimate(order, cache_cache.buffer_size, 1443 cache_estimate(order, cache_cache.buffer_size,
@@ -2311,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2311 if (flags & SLAB_CACHE_DMA) 2324 if (flags & SLAB_CACHE_DMA)
2312 cachep->gfpflags |= GFP_DMA; 2325 cachep->gfpflags |= GFP_DMA;
2313 cachep->buffer_size = size; 2326 cachep->buffer_size = size;
2327 cachep->reciprocal_buffer_size = reciprocal_value(size);
2314 2328
2315 if (flags & CFLGS_OFF_SLAB) { 2329 if (flags & CFLGS_OFF_SLAB) {
2316 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2330 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -3088,12 +3102,89 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3088#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3102#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3089#endif 3103#endif
3090 3104
3105#ifdef CONFIG_FAILSLAB
3106
3107static struct failslab_attr {
3108
3109 struct fault_attr attr;
3110
3111 u32 ignore_gfp_wait;
3112#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3113 struct dentry *ignore_gfp_wait_file;
3114#endif
3115
3116} failslab = {
3117 .attr = FAULT_ATTR_INITIALIZER,
3118 .ignore_gfp_wait = 1,
3119};
3120
3121static int __init setup_failslab(char *str)
3122{
3123 return setup_fault_attr(&failslab.attr, str);
3124}
3125__setup("failslab=", setup_failslab);
3126
3127static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3128{
3129 if (cachep == &cache_cache)
3130 return 0;
3131 if (flags & __GFP_NOFAIL)
3132 return 0;
3133 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3134 return 0;
3135
3136 return should_fail(&failslab.attr, obj_size(cachep));
3137}
3138
3139#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3140
3141static int __init failslab_debugfs(void)
3142{
3143 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3144 struct dentry *dir;
3145 int err;
3146
3147 err = init_fault_attr_dentries(&failslab.attr, "failslab");
3148 if (err)
3149 return err;
3150 dir = failslab.attr.dentries.dir;
3151
3152 failslab.ignore_gfp_wait_file =
3153 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3154 &failslab.ignore_gfp_wait);
3155
3156 if (!failslab.ignore_gfp_wait_file) {
3157 err = -ENOMEM;
3158 debugfs_remove(failslab.ignore_gfp_wait_file);
3159 cleanup_fault_attr_dentries(&failslab.attr);
3160 }
3161
3162 return err;
3163}
3164
3165late_initcall(failslab_debugfs);
3166
3167#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3168
3169#else /* CONFIG_FAILSLAB */
3170
3171static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3172{
3173 return 0;
3174}
3175
3176#endif /* CONFIG_FAILSLAB */
3177
3091static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3178static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3092{ 3179{
3093 void *objp; 3180 void *objp;
3094 struct array_cache *ac; 3181 struct array_cache *ac;
3095 3182
3096 check_irq_off(); 3183 check_irq_off();
3184
3185 if (should_failslab(cachep, flags))
3186 return NULL;
3187
3097 ac = cpu_cache_get(cachep); 3188 ac = cpu_cache_get(cachep);
3098 if (likely(ac->avail)) { 3189 if (likely(ac->avail)) {
3099 STATS_INC_ALLOCHIT(cachep); 3190 STATS_INC_ALLOCHIT(cachep);
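
The CONFIG_FAILSLAB block above hooks a generic fault_attr into ____cache_alloc(): should_failslab() consults tunable probability/interval/times knobs and occasionally forces an allocation to fail so that callers' error paths get exercised. A toy userspace version of the same gate; the fields mimic the kernel's fault_attr, but this struct, the random source, and checked_malloc() are my own simplification:

#include <stdio.h>
#include <stdlib.h>

/* simplified stand-in for the kernel's struct fault_attr */
struct fault_attr {
	unsigned long probability;	/* percent of eligible calls to fail */
	unsigned long interval;		/* only every Nth call is eligible */
	long times;			/* failures left to inject; -1 = unlimited */
	unsigned long count;		/* calls seen so far */
};

static struct fault_attr failslab = {
	.probability = 10,
	.interval = 1,
	.times = 5,
};

static int should_fail(struct fault_attr *attr, size_t size)
{
	(void)size;

	attr->count++;
	if (attr->times == 0)
		return 0;
	if (attr->interval > 1 && attr->count % attr->interval)
		return 0;
	if ((unsigned long)(rand() % 100) >= attr->probability)
		return 0;
	if (attr->times > 0)
		attr->times--;
	return 1;
}

/* allocation wrapper with the injection hook, analogous to ____cache_alloc() */
static void *checked_malloc(size_t size)
{
	if (should_fail(&failslab, size))
		return NULL;	/* pretend the allocator ran out of memory */
	return malloc(size);
}

int main(void)
{
	int i, failures = 0;

	srand(42);
	for (i = 0; i < 1000; i++) {
		void *p = checked_malloc(128);
		if (!p)
			failures++;
		free(p);
	}
	printf("injected %d failures out of 1000 allocations\n", failures);
	return 0;
}

In the kernel the same knobs are driven from the failslab= boot parameter parsed by setup_failslab() above, or from the debugfs entries created when CONFIG_FAULT_INJECTION_DEBUG_FS is enabled.
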
@@ -3173,6 +3264,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3173 struct zone **z; 3264 struct zone **z;
3174 void *obj = NULL; 3265 void *obj = NULL;
3175 int nid; 3266 int nid;
3267 gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3176 3268
3177retry: 3269retry:
3178 /* 3270 /*
@@ -3182,21 +3274,26 @@ retry:
3182 for (z = zonelist->zones; *z && !obj; z++) { 3274 for (z = zonelist->zones; *z && !obj; z++) {
3183 nid = zone_to_nid(*z); 3275 nid = zone_to_nid(*z);
3184 3276
3185 if (cpuset_zone_allowed(*z, flags) && 3277 if (cpuset_zone_allowed_hardwall(*z, flags) &&
3186 cache->nodelists[nid] && 3278 cache->nodelists[nid] &&
3187 cache->nodelists[nid]->free_objects) 3279 cache->nodelists[nid]->free_objects)
3188 obj = ____cache_alloc_node(cache, 3280 obj = ____cache_alloc_node(cache,
3189 flags | GFP_THISNODE, nid); 3281 flags | GFP_THISNODE, nid);
3190 } 3282 }
3191 3283
3192 if (!obj) { 3284 if (!obj && !(flags & __GFP_NO_GROW)) {
3193 /* 3285 /*
3194 * This allocation will be performed within the constraints 3286 * This allocation will be performed within the constraints
3195 * of the current cpuset / memory policy requirements. 3287 * of the current cpuset / memory policy requirements.
3196 * We may trigger various forms of reclaim on the allowed 3288 * We may trigger various forms of reclaim on the allowed
3197 * set and go into memory reserves if necessary. 3289 * set and go into memory reserves if necessary.
3198 */ 3290 */
3291 if (local_flags & __GFP_WAIT)
3292 local_irq_enable();
3293 kmem_flagcheck(cache, flags);
3199 obj = kmem_getpages(cache, flags, -1); 3294 obj = kmem_getpages(cache, flags, -1);
3295 if (local_flags & __GFP_WAIT)
3296 local_irq_disable();
3200 if (obj) { 3297 if (obj) {
3201 /* 3298 /*
3202 * Insert into the appropriate per node queues 3299 * Insert into the appropriate per node queues
@@ -3213,7 +3310,7 @@ retry:
3213 */ 3310 */
3214 goto retry; 3311 goto retry;
3215 } else { 3312 } else {
3216 kmem_freepages(cache, obj); 3313 /* cache_grow already freed obj */
3217 obj = NULL; 3314 obj = NULL;
3218 } 3315 }
3219 } 3316 }
@@ -3456,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
3456 * 3553 *
3457 * Currently only used for dentry validation. 3554 * Currently only used for dentry validation.
3458 */ 3555 */
3459int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) 3556int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3460{ 3557{
3461 unsigned long addr = (unsigned long)ptr; 3558 unsigned long addr = (unsigned long)ptr;
3462 unsigned long min_addr = PAGE_OFFSET; 3559 unsigned long min_addr = PAGE_OFFSET;
@@ -3490,6 +3587,7 @@ out:
3490 * @cachep: The cache to allocate from. 3587 * @cachep: The cache to allocate from.
3491 * @flags: See kmalloc(). 3588 * @flags: See kmalloc().
3492 * @nodeid: node number of the target node. 3589 * @nodeid: node number of the target node.
3590 * @caller: return address of caller, used for debug information
3493 * 3591 *
3494 * Identical to kmem_cache_alloc but it will allocate memory on the given 3592 * Identical to kmem_cache_alloc but it will allocate memory on the given
3495 * node, which can improve the performance for cpu bound structures. 3593 * node, which can improve the performance for cpu bound structures.
@@ -3928,7 +4026,7 @@ static void cache_reap(struct work_struct *unused)
3928 if (!mutex_trylock(&cache_chain_mutex)) { 4026 if (!mutex_trylock(&cache_chain_mutex)) {
3929 /* Give up. Setup the next iteration. */ 4027 /* Give up. Setup the next iteration. */
3930 schedule_delayed_work(&__get_cpu_var(reap_work), 4028 schedule_delayed_work(&__get_cpu_var(reap_work),
3931 REAPTIMEOUT_CPUC); 4029 round_jiffies_relative(REAPTIMEOUT_CPUC));
3932 return; 4030 return;
3933 } 4031 }
3934 4032
@@ -3974,7 +4072,8 @@ next:
3974 next_reap_node(); 4072 next_reap_node();
3975 refresh_cpu_vm_stats(smp_processor_id()); 4073 refresh_cpu_vm_stats(smp_processor_id());
3976 /* Set up the next iteration */ 4074 /* Set up the next iteration */
3977 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 4075 schedule_delayed_work(&__get_cpu_var(reap_work),
4076 round_jiffies_relative(REAPTIMEOUT_CPUC));
3978} 4077}
3979 4078
3980#ifdef CONFIG_PROC_FS 4079#ifdef CONFIG_PROC_FS
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a58..5adc29cb58dd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
60static DEFINE_SPINLOCK(block_lock); 60static DEFINE_SPINLOCK(block_lock);
61 61
62static void slob_free(void *b, int size); 62static void slob_free(void *b, int size);
63static void slob_timer_cbk(void);
64
63 65
64static void *slob_alloc(size_t size, gfp_t gfp, int align) 66static void *slob_alloc(size_t size, gfp_t gfp, int align)
65{ 67{
@@ -157,7 +159,7 @@ static int fastcall find_order(int size)
157 return order; 159 return order;
158} 160}
159 161
160void *kmalloc(size_t size, gfp_t gfp) 162void *__kmalloc(size_t size, gfp_t gfp)
161{ 163{
162 slob_t *m; 164 slob_t *m;
163 bigblock_t *bb; 165 bigblock_t *bb;
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp)
186 slob_free(bb, sizeof(bigblock_t)); 188 slob_free(bb, sizeof(bigblock_t));
187 return 0; 189 return 0;
188} 190}
189 191EXPORT_SYMBOL(__kmalloc);
190EXPORT_SYMBOL(kmalloc);
191 192
192void kfree(const void *block) 193void kfree(const void *block)
193{ 194{
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c)
327EXPORT_SYMBOL(kmem_cache_name); 328EXPORT_SYMBOL(kmem_cache_name);
328 329
329static struct timer_list slob_timer = TIMER_INITIALIZER( 330static struct timer_list slob_timer = TIMER_INITIALIZER(
330 (void (*)(unsigned long))kmem_cache_init, 0, 0); 331 (void (*)(unsigned long))slob_timer_cbk, 0, 0);
332
333int kmem_cache_shrink(struct kmem_cache *d)
334{
335 return 0;
336}
337EXPORT_SYMBOL(kmem_cache_shrink);
338
339int kmem_ptr_validate(struct kmem_cache *a, const void *b)
340{
341 return 0;
342}
343
344void __init kmem_cache_init(void)
345{
346 slob_timer_cbk();
347}
331 348
332void kmem_cache_init(void) 349static void slob_timer_cbk(void)
333{ 350{
334 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); 351 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
335 352
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c5431072f422..a2d9bb4e80df 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -434,7 +434,7 @@ void free_swap_and_cache(swp_entry_t entry)
434 * 434 *
435 * This is needed for the suspend to disk (aka swsusp). 435 * This is needed for the suspend to disk (aka swsusp).
436 */ 436 */
437int swap_type_of(dev_t device, sector_t offset) 437int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
438{ 438{
439 struct block_device *bdev = NULL; 439 struct block_device *bdev = NULL;
440 int i; 440 int i;
@@ -450,6 +450,9 @@ int swap_type_of(dev_t device, sector_t offset)
450 continue; 450 continue;
451 451
452 if (!bdev) { 452 if (!bdev) {
453 if (bdev_p)
454 *bdev_p = sis->bdev;
455
453 spin_unlock(&swap_lock); 456 spin_unlock(&swap_lock);
454 return i; 457 return i;
455 } 458 }
@@ -459,6 +462,9 @@ int swap_type_of(dev_t device, sector_t offset)
459 se = list_entry(sis->extent_list.next, 462 se = list_entry(sis->extent_list.next,
460 struct swap_extent, list); 463 struct swap_extent, list);
461 if (se->start_block == offset) { 464 if (se->start_block == offset) {
465 if (bdev_p)
466 *bdev_p = sis->bdev;
467
462 spin_unlock(&swap_lock); 468 spin_unlock(&swap_lock);
463 bdput(bdev); 469 bdput(bdev);
464 return i; 470 return i;
@@ -1357,10 +1363,10 @@ static int swap_show(struct seq_file *swap, void *v)
1357 } 1363 }
1358 1364
1359 file = ptr->swap_file; 1365 file = ptr->swap_file;
1360 len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); 1366 len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
1361 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1367 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1362 len < 40 ? 40 - len : 1, " ", 1368 len < 40 ? 40 - len : 1, " ",
1363 S_ISBLK(file->f_dentry->d_inode->i_mode) ? 1369 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1364 "partition" : "file\t", 1370 "partition" : "file\t",
1365 ptr->pages << (PAGE_SHIFT - 10), 1371 ptr->pages << (PAGE_SHIFT - 10),
1366 ptr->inuse_pages << (PAGE_SHIFT - 10), 1372 ptr->inuse_pages << (PAGE_SHIFT - 10),
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f153c..c7f6e1914bc4 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
79 d_instantiate(dentry, inode); 79 d_instantiate(dentry, inode);
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 81
82 file->f_vfsmnt = mntget(shm_mnt); 82 file->f_path.mnt = mntget(shm_mnt);
83 file->f_dentry = dentry; 83 file->f_path.dentry = dentry;
84 file->f_mapping = inode->i_mapping; 84 file->f_mapping = inode->i_mapping;
85 file->f_op = &ramfs_file_operations; 85 file->f_op = &ramfs_file_operations;
86 file->f_mode = FMODE_WRITE | FMODE_READ; 86 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index e07b1e682c38..5df947de7654 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h>
16#include <linux/buffer_head.h> /* grr. try_to_release_page, 17#include <linux/buffer_head.h> /* grr. try_to_release_page,
17 do_invalidatepage */ 18 do_invalidatepage */
18 19
@@ -51,6 +52,33 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
51} 52}
52 53
53/* 54/*
55 * This cancels just the dirty bit on the kernel page itself, it
56 * does NOT actually remove dirty bits on any mmap's that may be
57 * around. It also leaves the page tagged dirty, so any sync
58 * activity will still find it on the dirty lists, and in particular,
59 * clear_page_dirty_for_io() will still look at the dirty bits in
60 * the VM.
61 *
62 * Doing this should *normally* only ever be done when a page
63 * is truncated, and is not actually mapped anywhere at all. However,
64 * fs/buffer.c does this when it notices that somebody has cleaned
65 * out all the buffers on a page without actually doing it through
66 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
67 */
68void cancel_dirty_page(struct page *page, unsigned int account_size)
69{
70 if (TestClearPageDirty(page)) {
71 struct address_space *mapping = page->mapping;
72 if (mapping && mapping_cap_account_dirty(mapping)) {
73 dec_zone_page_state(page, NR_FILE_DIRTY);
74 if (account_size)
75 task_io_account_cancelled_write(account_size);
76 }
77 }
78}
79EXPORT_SYMBOL(cancel_dirty_page);
80
81/*
54 * If truncate cannot remove the fs-private metadata from the page, the page 82 * If truncate cannot remove the fs-private metadata from the page, the page
55 * becomes anonymous. It will be left on the LRU and may even be mapped into 83 * becomes anonymous. It will be left on the LRU and may even be mapped into
56 * user pagetables if we're racing with filemap_nopage(). 84 * user pagetables if we're racing with filemap_nopage().
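
A toy userspace model of the bookkeeping cancel_dirty_page() describes: clear the page's own dirty bit and back out the dirty-page and per-task write accounting, without touching any mappings or writing anything back. Names follow the kernel's, but the structures are my own simplification and ignore the radix-tree dirty tag the comment mentions:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_CACHE_SIZE 4096UL

/* toy page and counters; the real kernel uses page flags and zone counters */
struct toy_page {
	bool dirty;
	bool accounted;		/* mapping_cap_account_dirty() analog */
};

static unsigned long nr_file_dirty;		/* NR_FILE_DIRTY analog */
static unsigned long cancelled_write_bytes;	/* per-task accounting analog */

static void set_page_dirty(struct toy_page *page)
{
	if (!page->dirty) {
		page->dirty = true;
		if (page->accounted)
			nr_file_dirty++;
	}
}

/* analog of cancel_dirty_page(): undo the dirty accounting, write nothing */
static void cancel_dirty_page(struct toy_page *page, unsigned long account_size)
{
	if (page->dirty) {
		page->dirty = false;
		if (page->accounted) {
			nr_file_dirty--;
			if (account_size)
				cancelled_write_bytes += account_size;
		}
	}
}

int main(void)
{
	struct toy_page page = { .accounted = true };

	set_page_dirty(&page);
	printf("dirty pages before truncate: %lu\n", nr_file_dirty);

	/* truncation throws the data away, so the pending write is cancelled */
	cancel_dirty_page(&page, PAGE_CACHE_SIZE);
	printf("dirty pages after truncate: %lu, cancelled bytes: %lu\n",
	       nr_file_dirty, cancelled_write_bytes);
	return 0;
}

The next hunk shows the intended caller: truncate_complete_page() cancels the dirty state up front instead of calling clear_page_dirty() after invalidating the page's private data.
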
@@ -66,10 +94,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
66 if (page->mapping != mapping) 94 if (page->mapping != mapping)
67 return; 95 return;
68 96
97 cancel_dirty_page(page, PAGE_CACHE_SIZE);
98
69 if (PagePrivate(page)) 99 if (PagePrivate(page))
70 do_invalidatepage(page, 0); 100 do_invalidatepage(page, 0);
71 101
72 clear_page_dirty(page);
73 ClearPageUptodate(page); 102 ClearPageUptodate(page);
74 ClearPageMappedToDisk(page); 103 ClearPageMappedToDisk(page);
75 remove_from_page_cache(page); 104 remove_from_page_cache(page);
@@ -319,6 +348,15 @@ failed:
319 return 0; 348 return 0;
320} 349}
321 350
351static int do_launder_page(struct address_space *mapping, struct page *page)
352{
353 if (!PageDirty(page))
354 return 0;
355 if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
356 return 0;
357 return mapping->a_ops->launder_page(page);
358}
359
322/** 360/**
323 * invalidate_inode_pages2_range - remove range of pages from an address_space 361 * invalidate_inode_pages2_range - remove range of pages from an address_space
324 * @mapping: the address_space 362 * @mapping: the address_space
@@ -348,7 +386,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
348 for (i = 0; !ret && i < pagevec_count(&pvec); i++) { 386 for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
349 struct page *page = pvec.pages[i]; 387 struct page *page = pvec.pages[i];
350 pgoff_t page_index; 388 pgoff_t page_index;
351 int was_dirty;
352 389
353 lock_page(page); 390 lock_page(page);
354 if (page->mapping != mapping) { 391 if (page->mapping != mapping) {
@@ -384,18 +421,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
384 PAGE_CACHE_SIZE, 0); 421 PAGE_CACHE_SIZE, 0);
385 } 422 }
386 } 423 }
387 was_dirty = test_clear_page_dirty(page); 424 ret = do_launder_page(mapping, page);
388 if (!invalidate_complete_page2(mapping, page)) { 425 if (ret == 0 && !invalidate_complete_page2(mapping, page))
389 if (was_dirty)
390 set_page_dirty(page);
391 ret = -EIO; 426 ret = -EIO;
392 }
393 unlock_page(page); 427 unlock_page(page);
394 } 428 }
395 pagevec_release(&pvec); 429 pagevec_release(&pvec);
396 cond_resched(); 430 cond_resched();
397 } 431 }
398 WARN_ON_ONCE(ret);
399 return ret; 432 return ret;
400} 433}
401EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 434EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 093f5fe6dd77..7430df68cb64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -692,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
692 __count_vm_events(KSWAPD_STEAL, nr_freed); 692 __count_vm_events(KSWAPD_STEAL, nr_freed);
693 } else 693 } else
694 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 694 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
695 __count_vm_events(PGACTIVATE, nr_freed); 695 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
696 696
697 if (nr_taken == 0) 697 if (nr_taken == 0)
698 goto done; 698 goto done;
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
984 if (!populated_zone(zone)) 984 if (!populated_zone(zone))
985 continue; 985 continue;
986 986
987 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 987 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
988 continue; 988 continue;
989 989
990 note_zone_scanning_priority(zone, priority); 990 note_zone_scanning_priority(zone, priority);
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1034 for (i = 0; zones[i] != NULL; i++) { 1034 for (i = 0; zones[i] != NULL; i++) {
1035 struct zone *zone = zones[i]; 1035 struct zone *zone = zones[i];
1036 1036
1037 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1037 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1038 continue; 1038 continue;
1039 1039
1040 lru_pages += zone->nr_active + zone->nr_inactive; 1040 lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1089,7 +1089,7 @@ out:
1089 for (i = 0; zones[i] != 0; i++) { 1089 for (i = 0; zones[i] != 0; i++) {
1090 struct zone *zone = zones[i]; 1090 struct zone *zone = zones[i];
1091 1091
1092 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1092 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1093 continue; 1093 continue;
1094 1094
1095 zone->prev_priority = priority; 1095 zone->prev_priority = priority;
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1354 return; 1354 return;
1355 if (pgdat->kswapd_max_order < order) 1355 if (pgdat->kswapd_max_order < order)
1356 pgdat->kswapd_max_order = order; 1356 pgdat->kswapd_max_order = order;
1357 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1357 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1358 return; 1358 return;
1359 if (!waitqueue_active(&pgdat->kswapd_wait)) 1359 if (!waitqueue_active(&pgdat->kswapd_wait))
1360 return; 1360 return;
@@ -1369,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
1369 * 1369 *
1370 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1370 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1371 */ 1371 */
1372static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, 1372static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1373 int prio, struct scan_control *sc) 1373 int pass, struct scan_control *sc)
1374{ 1374{
1375 struct zone *zone; 1375 struct zone *zone;
1376 unsigned long nr_to_scan, ret = 0; 1376 unsigned long nr_to_scan, ret = 0;
@@ -1406,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1406 return ret; 1406 return ret;
1407} 1407}
1408 1408
1409static unsigned long count_lru_pages(void)
1410{
1411 struct zone *zone;
1412 unsigned long ret = 0;
1413
1414 for_each_zone(zone)
1415 ret += zone->nr_active + zone->nr_inactive;
1416 return ret;
1417}
1418
1409/* 1419/*
1410 * Try to free `nr_pages' of memory, system-wide, and return the number of 1420 * Try to free `nr_pages' of memory, system-wide, and return the number of
1411 * freed pages. 1421 * freed pages.
@@ -1420,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1420 unsigned long ret = 0; 1430 unsigned long ret = 0;
1421 int pass; 1431 int pass;
1422 struct reclaim_state reclaim_state; 1432 struct reclaim_state reclaim_state;
1423 struct zone *zone;
1424 struct scan_control sc = { 1433 struct scan_control sc = {
1425 .gfp_mask = GFP_KERNEL, 1434 .gfp_mask = GFP_KERNEL,
1426 .may_swap = 0, 1435 .may_swap = 0,
@@ -1431,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1431 1440
1432 current->reclaim_state = &reclaim_state; 1441 current->reclaim_state = &reclaim_state;
1433 1442
1434 lru_pages = 0; 1443 lru_pages = count_lru_pages();
1435 for_each_zone(zone)
1436 lru_pages += zone->nr_active + zone->nr_inactive;
1437
1438 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 1444 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1439 /* If slab caches are huge, it's better to hit them first */ 1445 /* If slab caches are huge, it's better to hit them first */
1440 while (nr_slab >= lru_pages) { 1446 while (nr_slab >= lru_pages) {
@@ -1461,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1461 for (pass = 0; pass < 5; pass++) { 1467 for (pass = 0; pass < 5; pass++) {
1462 int prio; 1468 int prio;
1463 1469
1464 /* Needed for shrinking slab caches later on */
1465 if (!lru_pages)
1466 for_each_zone(zone) {
1467 lru_pages += zone->nr_active;
1468 lru_pages += zone->nr_inactive;
1469 }
1470
1471 /* Force reclaiming mapped pages in the passes #3 and #4 */ 1470 /* Force reclaiming mapped pages in the passes #3 and #4 */
1472 if (pass > 2) { 1471 if (pass > 2) {
1473 sc.may_swap = 1; 1472 sc.may_swap = 1;
@@ -1483,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1483 goto out; 1482 goto out;
1484 1483
1485 reclaim_state.reclaimed_slab = 0; 1484 reclaim_state.reclaimed_slab = 0;
1486 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); 1485 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1486 count_lru_pages());
1487 ret += reclaim_state.reclaimed_slab; 1487 ret += reclaim_state.reclaimed_slab;
1488 if (ret >= nr_pages) 1488 if (ret >= nr_pages)
1489 goto out; 1489 goto out;
@@ -1491,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1491 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1491 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1492 congestion_wait(WRITE, HZ / 10); 1492 congestion_wait(WRITE, HZ / 10);
1493 } 1493 }
1494
1495 lru_pages = 0;
1496 } 1494 }
1497 1495
1498 /* 1496 /*
1499 * If ret = 0, we could not shrink LRUs, but there may be something 1497 * If ret = 0, we could not shrink LRUs, but there may be something
1500 * in slab caches 1498 * in slab caches
1501 */ 1499 */
1502 if (!ret) 1500 if (!ret) {
1503 do { 1501 do {
1504 reclaim_state.reclaimed_slab = 0; 1502 reclaim_state.reclaimed_slab = 0;
1505 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1503 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
1506 ret += reclaim_state.reclaimed_slab; 1504 ret += reclaim_state.reclaimed_slab;
1507 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1505 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1506 }
1508 1507
1509out: 1508out:
1510 current->reclaim_state = NULL; 1509 current->reclaim_state = NULL;