| author | Ingo Molnar <mingo@elte.hu> | 2008-09-05 12:56:57 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-09-05 12:56:57 -0400 |
| commit | 616ad8c44281c0c6711a72b560e01ec335ff27e0 | |
| tree | 0a20453ffedb09db6fb41a0c2208ccc2c7751d3a /mm | |
| parent | 99809963c99e1ed868d9ebeb4a5e7ee1cbe0309f | |
| parent | b380b0d4f7dffcc235c0facefa537d4655619101 | |
Merge branch 'linus' into x86/defconfig
Diffstat (limited to 'mm')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mm/Kconfig | 3 |
| -rw-r--r-- | mm/bootmem.c | 37 |
| -rw-r--r-- | mm/filemap.c | 11 |
| -rw-r--r-- | mm/filemap_xip.c | 65 |
| -rw-r--r-- | mm/hugetlb.c | 62 |
| -rw-r--r-- | mm/memcontrol.c | 2 |
| -rw-r--r-- | mm/mempolicy.c | 1 |
| -rw-r--r-- | mm/mm_init.c | 2 |
| -rw-r--r-- | mm/mmap.c | 24 |
| -rw-r--r-- | mm/oom_kill.c | 6 |
| -rw-r--r-- | mm/page_alloc.c | 11 |
| -rw-r--r-- | mm/page_isolation.c | 1 |
| -rw-r--r-- | mm/quicklist.c | 9 |
| -rw-r--r-- | mm/rmap.c | 39 |
| -rw-r--r-- | mm/slub.c | 4 |
| -rw-r--r-- | mm/sparse.c | 1 |
| -rw-r--r-- | mm/swap_state.c | 2 |
| -rw-r--r-- | mm/truncate.c | 4 |
| -rw-r--r-- | mm/util.c | 15 |
| -rw-r--r-- | mm/vmstat.c | 19 |
20 files changed, 243 insertions, 75 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 446c6588c753..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP | |||
| 77 | def_bool y | 77 | def_bool y |
| 78 | depends on !SPARSEMEM | 78 | depends on !SPARSEMEM |
| 79 | 79 | ||
| 80 | config HAVE_GET_USER_PAGES_FAST | ||
| 81 | bool | ||
| 82 | |||
| 83 | # | 80 | # |
| 84 | # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's | 81 | # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's |
| 85 | # to represent different areas of memory. This variable allows | 82 | # to represent different areas of memory. This variable allows |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4af15d0340ad..ad8eec6e44a8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
| 405 | } | 405 | } |
| 406 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 406 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ |
| 407 | 407 | ||
| 408 | static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | ||
| 409 | unsigned long step) | ||
| 410 | { | ||
| 411 | unsigned long base = bdata->node_min_pfn; | ||
| 412 | |||
| 413 | /* | ||
| 414 | * Align the index with respect to the node start so that the | ||
| 415 | * combination of both satisfies the requested alignment. | ||
| 416 | */ | ||
| 417 | |||
| 418 | return ALIGN(base + idx, step) - base; | ||
| 419 | } | ||
| 420 | |||
| 421 | static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, | ||
| 422 | unsigned long align) | ||
| 423 | { | ||
| 424 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); | ||
| 425 | |||
| 426 | /* Same as align_idx for byte offsets */ | ||
| 427 | |||
| 428 | return ALIGN(base + off, align) - base; | ||
| 429 | } | ||
| 430 | |||
| 408 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | 431 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, |
| 409 | unsigned long size, unsigned long align, | 432 | unsigned long size, unsigned long align, |
| 410 | unsigned long goal, unsigned long limit) | 433 | unsigned long goal, unsigned long limit) |
| @@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
| 441 | else | 464 | else |
| 442 | start = ALIGN(min, step); | 465 | start = ALIGN(min, step); |
| 443 | 466 | ||
| 444 | sidx = start - bdata->node_min_pfn;; | 467 | sidx = start - bdata->node_min_pfn; |
| 445 | midx = max - bdata->node_min_pfn; | 468 | midx = max - bdata->node_min_pfn; |
| 446 | 469 | ||
| 447 | if (bdata->hint_idx > sidx) { | 470 | if (bdata->hint_idx > sidx) { |
| @@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
| 450 | * catch the fallback below. | 473 | * catch the fallback below. |
| 451 | */ | 474 | */ |
| 452 | fallback = sidx + 1; | 475 | fallback = sidx + 1; |
| 453 | sidx = ALIGN(bdata->hint_idx, step); | 476 | sidx = align_idx(bdata, bdata->hint_idx, step); |
| 454 | } | 477 | } |
| 455 | 478 | ||
| 456 | while (1) { | 479 | while (1) { |
| @@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
| 459 | unsigned long eidx, i, start_off, end_off; | 482 | unsigned long eidx, i, start_off, end_off; |
| 460 | find_block: | 483 | find_block: |
| 461 | sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); | 484 | sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); |
| 462 | sidx = ALIGN(sidx, step); | 485 | sidx = align_idx(bdata, sidx, step); |
| 463 | eidx = sidx + PFN_UP(size); | 486 | eidx = sidx + PFN_UP(size); |
| 464 | 487 | ||
| 465 | if (sidx >= midx || eidx > midx) | 488 | if (sidx >= midx || eidx > midx) |
| @@ -467,15 +490,15 @@ find_block: | |||
| 467 | 490 | ||
| 468 | for (i = sidx; i < eidx; i++) | 491 | for (i = sidx; i < eidx; i++) |
| 469 | if (test_bit(i, bdata->node_bootmem_map)) { | 492 | if (test_bit(i, bdata->node_bootmem_map)) { |
| 470 | sidx = ALIGN(i, step); | 493 | sidx = align_idx(bdata, i, step); |
| 471 | if (sidx == i) | 494 | if (sidx == i) |
| 472 | sidx += step; | 495 | sidx += step; |
| 473 | goto find_block; | 496 | goto find_block; |
| 474 | } | 497 | } |
| 475 | 498 | ||
| 476 | if (bdata->last_end_off && | 499 | if (bdata->last_end_off & (PAGE_SIZE - 1) && |
| 477 | PFN_DOWN(bdata->last_end_off) + 1 == sidx) | 500 | PFN_DOWN(bdata->last_end_off) + 1 == sidx) |
| 478 | start_off = ALIGN(bdata->last_end_off, align); | 501 | start_off = align_off(bdata, bdata->last_end_off, align); |
| 479 | else | 502 | else |
| 480 | start_off = PFN_PHYS(sidx); | 503 | start_off = PFN_PHYS(sidx); |
| 481 | 504 | ||
| @@ -499,7 +522,7 @@ find_block: | |||
| 499 | } | 522 | } |
| 500 | 523 | ||
| 501 | if (fallback) { | 524 | if (fallback) { |
| 502 | sidx = ALIGN(fallback - 1, step); | 525 | sidx = align_idx(bdata, fallback - 1, step); |
| 503 | fallback = 0; | 526 | fallback = 0; |
| 504 | goto find_block; | 527 | goto find_block; |
| 505 | } | 528 | } |
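The align_idx()/align_off() helpers added above are needed because sidx and start_off are relative to bdata->node_min_pfn, so aligning the bare offset only produces an aligned PFN when the node itself happens to start on an aligned boundary. A standalone sketch of that arithmetic, with the ALIGN() macro reproduced here and made-up sample numbers:

```c
#include <stdio.h>

/* Round x up to the next multiple of a (a must be a power of two),
 * mirroring the kernel's ALIGN() macro. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

/* Align a node-relative index so that the *absolute* PFN is aligned. */
static unsigned long align_idx(unsigned long node_min_pfn,
                               unsigned long idx, unsigned long step)
{
        return ALIGN(node_min_pfn + idx, step) - node_min_pfn;
}

int main(void)
{
        unsigned long node_min_pfn = 5;   /* node starts on an odd PFN */
        unsigned long idx = 2, step = 4;

        /* Naive alignment of the relative index: absolute PFN 5 + 4 = 9,
         * which is not a multiple of 4. */
        printf("naive:   pfn %lu\n", node_min_pfn + ALIGN(idx, step));
        /* align_idx(): absolute PFN 5 + 3 = 8, properly aligned. */
        printf("aligned: pfn %lu\n",
               node_min_pfn + align_idx(node_min_pfn, idx, step));
        return 0;
}
```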
diff --git a/mm/filemap.c b/mm/filemap.c
index 54e968650855..876bc595d0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -2129,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2129 | * After a write we want buffered reads to be sure to go to disk to get | 2129 | * After a write we want buffered reads to be sure to go to disk to get |
| 2130 | * the new data. We invalidate clean cached page from the region we're | 2130 | * the new data. We invalidate clean cached page from the region we're |
| 2131 | * about to write. We do this *before* the write so that we can return | 2131 | * about to write. We do this *before* the write so that we can return |
| 2132 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | 2132 | * without clobbering -EIOCBQUEUED from ->direct_IO(). |
| 2133 | */ | 2133 | */ |
| 2134 | if (mapping->nrpages) { | 2134 | if (mapping->nrpages) { |
| 2135 | written = invalidate_inode_pages2_range(mapping, | 2135 | written = invalidate_inode_pages2_range(mapping, |
| 2136 | pos >> PAGE_CACHE_SHIFT, end); | 2136 | pos >> PAGE_CACHE_SHIFT, end); |
| 2137 | if (written) | 2137 | /* |
| 2138 | * If a page can not be invalidated, return 0 to fall back | ||
| 2139 | * to buffered write. | ||
| 2140 | */ | ||
| 2141 | if (written) { | ||
| 2142 | if (written == -EBUSY) | ||
| 2143 | return 0; | ||
| 2138 | goto out; | 2144 | goto out; |
| 2145 | } | ||
| 2139 | } | 2146 | } |
| 2140 | 2147 | ||
| 2141 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); | 2148 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); |
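Together with the mm/truncate.c hunk further down, which makes invalidate_inode_pages2_range() report -EBUSY instead of -EIO when a page merely could not be dropped, this change lets the direct-write path return 0 so the caller falls back to buffered I/O. A toy illustration of that return-value convention; nothing here is the real write path, and all names are hypothetical:

```c
#include <errno.h>
#include <stdio.h>

/* Stand-in for invalidate_inode_pages2_range(): -EBUSY means "a clean page
 * is still in use", any other negative value is a real error. */
static int invalidate_cached_pages(void)
{
        return -EBUSY;
}

static long toy_direct_write(void)
{
        int ret = invalidate_cached_pages();

        if (ret == -EBUSY)
                return 0;       /* caller falls back to the buffered path */
        if (ret < 0)
                return ret;     /* genuine failure, propagate it */

        /* ... would issue the direct I/O and return bytes written ... */
        return 4096;
}

int main(void)
{
        printf("toy_direct_write() = %ld\n", toy_direct_write());
        return 0;
}
```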
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 380ab402d711..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
| @@ -15,6 +15,8 @@ | |||
| 15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
| 16 | #include <linux/mmu_notifier.h> | 16 | #include <linux/mmu_notifier.h> |
| 17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
| 18 | #include <linux/seqlock.h> | ||
| 19 | #include <linux/mutex.h> | ||
| 18 | #include <asm/tlbflush.h> | 20 | #include <asm/tlbflush.h> |
| 19 | #include <asm/io.h> | 21 | #include <asm/io.h> |
| 20 | 22 | ||
| @@ -22,22 +24,18 @@ | |||
| 22 | * We do use our own empty page to avoid interference with other users | 24 | * We do use our own empty page to avoid interference with other users |
| 23 | * of ZERO_PAGE(), such as /dev/zero | 25 | * of ZERO_PAGE(), such as /dev/zero |
| 24 | */ | 26 | */ |
| 27 | static DEFINE_MUTEX(xip_sparse_mutex); | ||
| 28 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO; | ||
| 25 | static struct page *__xip_sparse_page; | 29 | static struct page *__xip_sparse_page; |
| 26 | 30 | ||
| 31 | /* called under xip_sparse_mutex */ | ||
| 27 | static struct page *xip_sparse_page(void) | 32 | static struct page *xip_sparse_page(void) |
| 28 | { | 33 | { |
| 29 | if (!__xip_sparse_page) { | 34 | if (!__xip_sparse_page) { |
| 30 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); | 35 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); |
| 31 | 36 | ||
| 32 | if (page) { | 37 | if (page) |
| 33 | static DEFINE_SPINLOCK(xip_alloc_lock); | 38 | __xip_sparse_page = page; |
| 34 | spin_lock(&xip_alloc_lock); | ||
| 35 | if (!__xip_sparse_page) | ||
| 36 | __xip_sparse_page = page; | ||
| 37 | else | ||
| 38 | __free_page(page); | ||
| 39 | spin_unlock(&xip_alloc_lock); | ||
| 40 | } | ||
| 41 | } | 39 | } |
| 42 | return __xip_sparse_page; | 40 | return __xip_sparse_page; |
| 43 | } | 41 | } |
| @@ -174,18 +172,23 @@ __xip_unmap (struct address_space * mapping, | |||
| 174 | pte_t pteval; | 172 | pte_t pteval; |
| 175 | spinlock_t *ptl; | 173 | spinlock_t *ptl; |
| 176 | struct page *page; | 174 | struct page *page; |
| 175 | unsigned count; | ||
| 176 | int locked = 0; | ||
| 177 | |||
| 178 | count = read_seqcount_begin(&xip_sparse_seq); | ||
| 177 | 179 | ||
| 178 | page = __xip_sparse_page; | 180 | page = __xip_sparse_page; |
| 179 | if (!page) | 181 | if (!page) |
| 180 | return; | 182 | return; |
| 181 | 183 | ||
| 184 | retry: | ||
| 182 | spin_lock(&mapping->i_mmap_lock); | 185 | spin_lock(&mapping->i_mmap_lock); |
| 183 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 186 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 184 | mm = vma->vm_mm; | 187 | mm = vma->vm_mm; |
| 185 | address = vma->vm_start + | 188 | address = vma->vm_start + |
| 186 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
| 187 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 190 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 188 | pte = page_check_address(page, mm, address, &ptl); | 191 | pte = page_check_address(page, mm, address, &ptl, 1); |
| 189 | if (pte) { | 192 | if (pte) { |
| 190 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
| 191 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| @@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping, | |||
| 198 | } | 201 | } |
| 199 | } | 202 | } |
| 200 | spin_unlock(&mapping->i_mmap_lock); | 203 | spin_unlock(&mapping->i_mmap_lock); |
| 204 | |||
| 205 | if (locked) { | ||
| 206 | mutex_unlock(&xip_sparse_mutex); | ||
| 207 | } else if (read_seqcount_retry(&xip_sparse_seq, count)) { | ||
| 208 | mutex_lock(&xip_sparse_mutex); | ||
| 209 | locked = 1; | ||
| 210 | goto retry; | ||
| 211 | } | ||
| 201 | } | 212 | } |
| 202 | 213 | ||
| 203 | /* | 214 | /* |
| @@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 218 | int error; | 229 | int error; |
| 219 | 230 | ||
| 220 | /* XXX: are VM_FAULT_ codes OK? */ | 231 | /* XXX: are VM_FAULT_ codes OK? */ |
| 221 | 232 | again: | |
| 222 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 233 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 223 | if (vmf->pgoff >= size) | 234 | if (vmf->pgoff >= size) |
| 224 | return VM_FAULT_SIGBUS; | 235 | return VM_FAULT_SIGBUS; |
| @@ -237,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 237 | int err; | 248 | int err; |
| 238 | 249 | ||
| 239 | /* maybe shared writable, allocate new block */ | 250 | /* maybe shared writable, allocate new block */ |
| 251 | mutex_lock(&xip_sparse_mutex); | ||
| 240 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, | 252 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, |
| 241 | &xip_mem, &xip_pfn); | 253 | &xip_mem, &xip_pfn); |
| 254 | mutex_unlock(&xip_sparse_mutex); | ||
| 242 | if (error) | 255 | if (error) |
| 243 | return VM_FAULT_SIGBUS; | 256 | return VM_FAULT_SIGBUS; |
| 244 | /* unmap sparse mappings at pgoff from all other vmas */ | 257 | /* unmap sparse mappings at pgoff from all other vmas */ |
| @@ -252,14 +265,34 @@ found: | |||
| 252 | BUG_ON(err); | 265 | BUG_ON(err); |
| 253 | return VM_FAULT_NOPAGE; | 266 | return VM_FAULT_NOPAGE; |
| 254 | } else { | 267 | } else { |
| 268 | int err, ret = VM_FAULT_OOM; | ||
| 269 | |||
| 270 | mutex_lock(&xip_sparse_mutex); | ||
| 271 | write_seqcount_begin(&xip_sparse_seq); | ||
| 272 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
| 273 | &xip_mem, &xip_pfn); | ||
| 274 | if (unlikely(!error)) { | ||
| 275 | write_seqcount_end(&xip_sparse_seq); | ||
| 276 | mutex_unlock(&xip_sparse_mutex); | ||
| 277 | goto again; | ||
| 278 | } | ||
| 279 | if (error != -ENODATA) | ||
| 280 | goto out; | ||
| 255 | /* not shared and writable, use xip_sparse_page() */ | 281 | /* not shared and writable, use xip_sparse_page() */ |
| 256 | page = xip_sparse_page(); | 282 | page = xip_sparse_page(); |
| 257 | if (!page) | 283 | if (!page) |
| 258 | return VM_FAULT_OOM; | 284 | goto out; |
| 285 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, | ||
| 286 | page); | ||
| 287 | if (err == -ENOMEM) | ||
| 288 | goto out; | ||
| 259 | 289 | ||
| 260 | page_cache_get(page); | 290 | ret = VM_FAULT_NOPAGE; |
| 261 | vmf->page = page; | 291 | out: |
| 262 | return 0; | 292 | write_seqcount_end(&xip_sparse_seq); |
| 293 | mutex_unlock(&xip_sparse_mutex); | ||
| 294 | |||
| 295 | return ret; | ||
| 263 | } | 296 | } |
| 264 | } | 297 | } |
| 265 | 298 | ||
| @@ -308,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 308 | &xip_mem, &xip_pfn); | 341 | &xip_mem, &xip_pfn); |
| 309 | if (status == -ENODATA) { | 342 | if (status == -ENODATA) { |
| 310 | /* we allocate a new page unmap it */ | 343 | /* we allocate a new page unmap it */ |
| 344 | mutex_lock(&xip_sparse_mutex); | ||
| 311 | status = a_ops->get_xip_mem(mapping, index, 1, | 345 | status = a_ops->get_xip_mem(mapping, index, 1, |
| 312 | &xip_mem, &xip_pfn); | 346 | &xip_mem, &xip_pfn); |
| 347 | mutex_unlock(&xip_sparse_mutex); | ||
| 313 | if (!status) | 348 | if (!status) |
| 314 | /* unmap page at pgoff from all other vmas */ | 349 | /* unmap page at pgoff from all other vmas */ |
| 315 | __xip_unmap(mapping, index); | 350 | __xip_unmap(mapping, index); |
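The filemap_xip.c changes replace the per-allocation spinlock with a seqcount/mutex pair: readers run lock-free and only redo their work under the mutex if a writer raced with them, mirroring __xip_unmap() above. A minimal kernel-style sketch of the same pattern, with hypothetical names rather than the actual XIP code:

```c
#include <linux/mutex.h>
#include <linux/seqlock.h>

static DEFINE_MUTEX(state_mutex);
static seqcount_t state_seq = SEQCNT_ZERO;
static void *shared_state;

/* Writer: take the mutex, bump the seqcount around the update. */
static void update_state(void *new)
{
	mutex_lock(&state_mutex);
	write_seqcount_begin(&state_seq);
	shared_state = new;
	write_seqcount_end(&state_seq);
	mutex_unlock(&state_mutex);
}

/* Reader: optimistic, lock-free pass first; if a writer raced with us,
 * do the walk once more while holding the mutex so it cannot race again. */
static void walk_state(void)
{
	unsigned seq;
	int locked = 0;

	seq = read_seqcount_begin(&state_seq);
retry:
	/* ... inspect shared_state ... */

	if (locked) {
		mutex_unlock(&state_mutex);
	} else if (read_seqcount_retry(&state_seq, seq)) {
		mutex_lock(&state_mutex);
		locked = 1;
		goto retry;
	}
}
```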
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 757ca983fd99..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
| 565 | huge_page_order(h)); | 565 | huge_page_order(h)); |
| 566 | if (page) { | 566 | if (page) { |
| 567 | if (arch_prepare_hugepage(page)) { | 567 | if (arch_prepare_hugepage(page)) { |
| 568 | __free_pages(page, HUGETLB_PAGE_ORDER); | 568 | __free_pages(page, huge_page_order(h)); |
| 569 | return NULL; | 569 | return NULL; |
| 570 | } | 570 | } |
| 571 | prep_new_huge_page(h, page, nid); | 571 | prep_new_huge_page(h, page, nid); |
| @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
| 665 | __GFP_REPEAT|__GFP_NOWARN, | 665 | __GFP_REPEAT|__GFP_NOWARN, |
| 666 | huge_page_order(h)); | 666 | huge_page_order(h)); |
| 667 | 667 | ||
| 668 | if (page && arch_prepare_hugepage(page)) { | ||
| 669 | __free_pages(page, huge_page_order(h)); | ||
| 670 | return NULL; | ||
| 671 | } | ||
| 672 | |||
| 668 | spin_lock(&hugetlb_lock); | 673 | spin_lock(&hugetlb_lock); |
| 669 | if (page) { | 674 | if (page) { |
| 670 | /* | 675 | /* |
| @@ -1937,6 +1942,18 @@ retry: | |||
| 1937 | lock_page(page); | 1942 | lock_page(page); |
| 1938 | } | 1943 | } |
| 1939 | 1944 | ||
| 1945 | /* | ||
| 1946 | * If we are going to COW a private mapping later, we examine the | ||
| 1947 | * pending reservations for this page now. This will ensure that | ||
| 1948 | * any allocations necessary to record that reservation occur outside | ||
| 1949 | * the spinlock. | ||
| 1950 | */ | ||
| 1951 | if (write_access && !(vma->vm_flags & VM_SHARED)) | ||
| 1952 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
| 1953 | ret = VM_FAULT_OOM; | ||
| 1954 | goto backout_unlocked; | ||
| 1955 | } | ||
| 1956 | |||
| 1940 | spin_lock(&mm->page_table_lock); | 1957 | spin_lock(&mm->page_table_lock); |
| 1941 | size = i_size_read(mapping->host) >> huge_page_shift(h); | 1958 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
| 1942 | if (idx >= size) | 1959 | if (idx >= size) |
| @@ -1962,6 +1979,7 @@ out: | |||
| 1962 | 1979 | ||
| 1963 | backout: | 1980 | backout: |
| 1964 | spin_unlock(&mm->page_table_lock); | 1981 | spin_unlock(&mm->page_table_lock); |
| 1982 | backout_unlocked: | ||
| 1965 | unlock_page(page); | 1983 | unlock_page(page); |
| 1966 | put_page(page); | 1984 | put_page(page); |
| 1967 | goto out; | 1985 | goto out; |
| @@ -1973,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1973 | pte_t *ptep; | 1991 | pte_t *ptep; |
| 1974 | pte_t entry; | 1992 | pte_t entry; |
| 1975 | int ret; | 1993 | int ret; |
| 1994 | struct page *pagecache_page = NULL; | ||
| 1976 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 1995 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
| 1977 | struct hstate *h = hstate_vma(vma); | 1996 | struct hstate *h = hstate_vma(vma); |
| 1978 | 1997 | ||
| @@ -1989,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1989 | entry = huge_ptep_get(ptep); | 2008 | entry = huge_ptep_get(ptep); |
| 1990 | if (huge_pte_none(entry)) { | 2009 | if (huge_pte_none(entry)) { |
| 1991 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 2010 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
| 1992 | mutex_unlock(&hugetlb_instantiation_mutex); | 2011 | goto out_unlock; |
| 1993 | return ret; | ||
| 1994 | } | 2012 | } |
| 1995 | 2013 | ||
| 1996 | ret = 0; | 2014 | ret = 0; |
| 1997 | 2015 | ||
| 2016 | /* | ||
| 2017 | * If we are going to COW the mapping later, we examine the pending | ||
| 2018 | * reservations for this page now. This will ensure that any | ||
| 2019 | * allocations necessary to record that reservation occur outside the | ||
| 2020 | * spinlock. For private mappings, we also lookup the pagecache | ||
| 2021 | * page now as it is used to determine if a reservation has been | ||
| 2022 | * consumed. | ||
| 2023 | */ | ||
| 2024 | if (write_access && !pte_write(entry)) { | ||
| 2025 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
| 2026 | ret = VM_FAULT_OOM; | ||
| 2027 | goto out_unlock; | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 2031 | pagecache_page = hugetlbfs_pagecache_page(h, | ||
| 2032 | vma, address); | ||
| 2033 | } | ||
| 2034 | |||
| 1998 | spin_lock(&mm->page_table_lock); | 2035 | spin_lock(&mm->page_table_lock); |
| 1999 | /* Check for a racing update before calling hugetlb_cow */ | 2036 | /* Check for a racing update before calling hugetlb_cow */ |
| 2000 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 2037 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) |
| 2001 | if (write_access && !pte_write(entry)) { | 2038 | if (write_access && !pte_write(entry)) |
| 2002 | struct page *page; | 2039 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
| 2003 | page = hugetlbfs_pagecache_page(h, vma, address); | 2040 | pagecache_page); |
| 2004 | ret = hugetlb_cow(mm, vma, address, ptep, entry, page); | ||
| 2005 | if (page) { | ||
| 2006 | unlock_page(page); | ||
| 2007 | put_page(page); | ||
| 2008 | } | ||
| 2009 | } | ||
| 2010 | spin_unlock(&mm->page_table_lock); | 2041 | spin_unlock(&mm->page_table_lock); |
| 2042 | |||
| 2043 | if (pagecache_page) { | ||
| 2044 | unlock_page(pagecache_page); | ||
| 2045 | put_page(pagecache_page); | ||
| 2046 | } | ||
| 2047 | |||
| 2048 | out_unlock: | ||
| 2011 | mutex_unlock(&hugetlb_instantiation_mutex); | 2049 | mutex_unlock(&hugetlb_instantiation_mutex); |
| 2012 | 2050 | ||
| 2013 | return ret; | 2051 | return ret; |
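The hugetlb.c hunks call vma_needs_reservation(), which may allocate and therefore sleep, before taking mm->page_table_lock. A hedged, generic sketch of that "allocate first, then take the spinlock" pattern, using invented names unrelated to hugetlb:

```c
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_entry {
	struct list_head list;
	int key;
};

static DEFINE_SPINLOCK(table_lock);

/* Anything that may sleep (here the allocation) happens before the
 * spinlock is taken; the section under the lock only links in the
 * pre-allocated object. */
static int add_entry(struct list_head *table, int key)
{
	struct my_entry *e = kmalloc(sizeof(*e), GFP_KERNEL);	/* may sleep */

	if (!e)
		return -ENOMEM;
	e->key = key;

	spin_lock(&table_lock);		/* atomic context from here on */
	list_add(&e->list, table);
	spin_unlock(&table_lock);
	return 0;
}
```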
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7056c3bdb478..0f1f7a7374ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -796,6 +796,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | |||
| 796 | 796 | ||
| 797 | if (mem_cgroup_subsys.disabled) | 797 | if (mem_cgroup_subsys.disabled) |
| 798 | return 0; | 798 | return 0; |
| 799 | if (!mm) | ||
| 800 | return 0; | ||
| 799 | 801 | ||
| 800 | rcu_read_lock(); | 802 | rcu_read_lock(); |
| 801 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 803 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e550bec20582..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
| 803 | int do_migrate_pages(struct mm_struct *mm, | 803 | int do_migrate_pages(struct mm_struct *mm, |
| 804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) |
| 805 | { | 805 | { |
| 806 | LIST_HEAD(pagelist); | ||
| 807 | int busy = 0; | 806 | int busy = 0; |
| 808 | int err = 0; | 807 | int err = 0; |
| 809 | nodemask_t tmp; | 808 | nodemask_t tmp; |
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 936ef2efd892..4e0e26591dfa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
| @@ -12,7 +12,7 @@ | |||
| 12 | #include "internal.h" | 12 | #include "internal.h" |
| 13 | 13 | ||
| 14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 14 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
| 15 | int __meminitdata mminit_loglevel; | 15 | int mminit_loglevel; |
| 16 | 16 | ||
| 17 | #ifndef SECTIONS_SHIFT | 17 | #ifndef SECTIONS_SHIFT |
| 18 | #define SECTIONS_SHIFT 0 | 18 | #define SECTIONS_SHIFT 0 |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
| @@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
| 1030 | } else { | 1030 | } else { |
| 1031 | switch (flags & MAP_TYPE) { | 1031 | switch (flags & MAP_TYPE) { |
| 1032 | case MAP_SHARED: | 1032 | case MAP_SHARED: |
| 1033 | /* | ||
| 1034 | * Ignore pgoff. | ||
| 1035 | */ | ||
| 1036 | pgoff = 0; | ||
| 1033 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1037 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
| 1034 | break; | 1038 | break; |
| 1035 | case MAP_PRIVATE: | 1039 | case MAP_PRIVATE: |
| @@ -2273,14 +2277,14 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2273 | 2277 | ||
| 2274 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2278 | static DEFINE_MUTEX(mm_all_locks_mutex); |
| 2275 | 2279 | ||
| 2276 | static void vm_lock_anon_vma(struct anon_vma *anon_vma) | 2280 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
| 2277 | { | 2281 | { |
| 2278 | if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { | 2282 | if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { |
| 2279 | /* | 2283 | /* |
| 2280 | * The LSB of head.next can't change from under us | 2284 | * The LSB of head.next can't change from under us |
| 2281 | * because we hold the mm_all_locks_mutex. | 2285 | * because we hold the mm_all_locks_mutex. |
| 2282 | */ | 2286 | */ |
| 2283 | spin_lock(&anon_vma->lock); | 2287 | spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); |
| 2284 | /* | 2288 | /* |
| 2285 | * We can safely modify head.next after taking the | 2289 | * We can safely modify head.next after taking the |
| 2286 | * anon_vma->lock. If some other vma in this mm shares | 2290 | * anon_vma->lock. If some other vma in this mm shares |
| @@ -2296,7 +2300,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma) | |||
| 2296 | } | 2300 | } |
| 2297 | } | 2301 | } |
| 2298 | 2302 | ||
| 2299 | static void vm_lock_mapping(struct address_space *mapping) | 2303 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) |
| 2300 | { | 2304 | { |
| 2301 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | 2305 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { |
| 2302 | /* | 2306 | /* |
| @@ -2310,7 +2314,7 @@ static void vm_lock_mapping(struct address_space *mapping) | |||
| 2310 | */ | 2314 | */ |
| 2311 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | 2315 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) |
| 2312 | BUG(); | 2316 | BUG(); |
| 2313 | spin_lock(&mapping->i_mmap_lock); | 2317 | spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); |
| 2314 | } | 2318 | } |
| 2315 | } | 2319 | } |
| 2316 | 2320 | ||
| @@ -2358,11 +2362,17 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
| 2358 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2362 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 2359 | if (signal_pending(current)) | 2363 | if (signal_pending(current)) |
| 2360 | goto out_unlock; | 2364 | goto out_unlock; |
| 2361 | if (vma->anon_vma) | ||
| 2362 | vm_lock_anon_vma(vma->anon_vma); | ||
| 2363 | if (vma->vm_file && vma->vm_file->f_mapping) | 2365 | if (vma->vm_file && vma->vm_file->f_mapping) |
| 2364 | vm_lock_mapping(vma->vm_file->f_mapping); | 2366 | vm_lock_mapping(mm, vma->vm_file->f_mapping); |
| 2367 | } | ||
| 2368 | |||
| 2369 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 2370 | if (signal_pending(current)) | ||
| 2371 | goto out_unlock; | ||
| 2372 | if (vma->anon_vma) | ||
| 2373 | vm_lock_anon_vma(mm, vma->anon_vma); | ||
| 2365 | } | 2374 | } |
| 2375 | |||
| 2366 | ret = 0; | 2376 | ret = 0; |
| 2367 | 2377 | ||
| 2368 | out_unlock: | 2378 | out_unlock: |
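The mmap.c hunks use spin_lock_nest_lock() and split mm_take_all_locks() into two passes, so lockdep knows that every i_mmap/anon_vma lock taken here nests under mmap_sem instead of flagging it as recursive locking. A sketch of the annotation with hypothetical lock names:

```c
#include <linux/rwsem.h>
#include <linux/spinlock.h>

static DECLARE_RWSEM(outer_sem);

/* Take an arbitrary number of locks of one class while holding an outer
 * lock; spin_lock_nest_lock() tells lockdep the outer lock serializes
 * them, so it does not warn about "possible recursive locking". */
static void lock_all(spinlock_t **locks, int nr)
{
	int i;

	down_write(&outer_sem);
	for (i = 0; i < nr; i++)
		spin_lock_nest_lock(locks[i], &outer_sem);

	/* ... operate on the fully locked structure ... */

	for (i = nr - 1; i >= 0; i--)
		spin_unlock(locks[i]);
	up_write(&outer_sem);
}
```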
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
| 27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
| 28 | #include <linux/memcontrol.h> | 28 | #include <linux/memcontrol.h> |
| 29 | #include <linux/security.h> | ||
| 29 | 30 | ||
| 30 | int sysctl_panic_on_oom; | 31 | int sysctl_panic_on_oom; |
| 31 | int sysctl_oom_kill_allocating_task; | 32 | int sysctl_oom_kill_allocating_task; |
| @@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 128 | * Superuser processes are usually more important, so we make it | 129 | * Superuser processes are usually more important, so we make it |
| 129 | * less likely that we kill those. | 130 | * less likely that we kill those. |
| 130 | */ | 131 | */ |
| 131 | if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) | 132 | if (has_capability(p, CAP_SYS_ADMIN) || |
| 133 | has_capability(p, CAP_SYS_RESOURCE)) | ||
| 132 | points /= 4; | 134 | points /= 4; |
| 133 | 135 | ||
| 134 | /* | 136 | /* |
| @@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 137 | * tend to only have this flag set on applications they think | 139 | * tend to only have this flag set on applications they think |
| 138 | * of as important. | 140 | * of as important. |
| 139 | */ | 141 | */ |
| 140 | if (__capable(p, CAP_SYS_RAWIO)) | 142 | if (has_capability(p, CAP_SYS_RAWIO)) |
| 141 | points /= 4; | 143 | points /= 4; |
| 142 | 144 | ||
| 143 | /* | 145 | /* |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 401d104d2bb6..e293c58bea58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -694,6 +694,9 @@ static int move_freepages(struct zone *zone, | |||
| 694 | #endif | 694 | #endif |
| 695 | 695 | ||
| 696 | for (page = start_page; page <= end_page;) { | 696 | for (page = start_page; page <= end_page;) { |
| 697 | /* Make sure we are not inadvertently changing nodes */ | ||
| 698 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | ||
| 699 | |||
| 697 | if (!pfn_valid_within(page_to_pfn(page))) { | 700 | if (!pfn_valid_within(page_to_pfn(page))) { |
| 698 | page++; | 701 | page++; |
| 699 | continue; | 702 | continue; |
| @@ -2516,6 +2519,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 2516 | continue; | 2519 | continue; |
| 2517 | page = pfn_to_page(pfn); | 2520 | page = pfn_to_page(pfn); |
| 2518 | 2521 | ||
| 2522 | /* Watch out for overlapping nodes */ | ||
| 2523 | if (page_to_nid(page) != zone_to_nid(zone)) | ||
| 2524 | continue; | ||
| 2525 | |||
| 2519 | /* Blocks with reserved pages will never free, skip them. */ | 2526 | /* Blocks with reserved pages will never free, skip them. */ |
| 2520 | if (PageReserved(page)) | 2527 | if (PageReserved(page)) |
| 2521 | continue; | 2528 | continue; |
| @@ -4064,7 +4071,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
| 4064 | } | 4071 | } |
| 4065 | 4072 | ||
| 4066 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4073 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 4067 | struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4074 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; |
| 4068 | EXPORT_SYMBOL(contig_page_data); | 4075 | EXPORT_SYMBOL(contig_page_data); |
| 4069 | #endif | 4076 | #endif |
| 4070 | 4077 | ||
| @@ -4437,7 +4444,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 4437 | do { | 4444 | do { |
| 4438 | size = bucketsize << log2qty; | 4445 | size = bucketsize << log2qty; |
| 4439 | if (flags & HASH_EARLY) | 4446 | if (flags & HASH_EARLY) |
| 4440 | table = alloc_bootmem(size); | 4447 | table = alloc_bootmem_nopanic(size); |
| 4441 | else if (hashdist) | 4448 | else if (hashdist) |
| 4442 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4449 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
| 4443 | else { | 4450 | else { |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..c69f84fe038d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
| @@ -2,7 +2,6 @@ | |||
| 2 | * linux/mm/page_isolation.c | 2 | * linux/mm/page_isolation.c |
| 3 | */ | 3 | */ |
| 4 | 4 | ||
| 5 | #include <stddef.h> | ||
| 6 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
| 7 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
| 8 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
| @@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; | |||
| 26 | static unsigned long max_pages(unsigned long min_pages) | 26 | static unsigned long max_pages(unsigned long min_pages) |
| 27 | { | 27 | { |
| 28 | unsigned long node_free_pages, max; | 28 | unsigned long node_free_pages, max; |
| 29 | struct zone *zones = NODE_DATA(numa_node_id())->node_zones; | 29 | int node = numa_node_id(); |
| 30 | struct zone *zones = NODE_DATA(node)->node_zones; | ||
| 31 | int num_cpus_on_node; | ||
| 32 | node_to_cpumask_ptr(cpumask_on_node, node); | ||
| 30 | 33 | ||
| 31 | node_free_pages = | 34 | node_free_pages = |
| 32 | #ifdef CONFIG_ZONE_DMA | 35 | #ifdef CONFIG_ZONE_DMA |
| @@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) | |||
| 38 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); | 41 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); |
| 39 | 42 | ||
| 40 | max = node_free_pages / FRACTION_OF_NODE_MEM; | 43 | max = node_free_pages / FRACTION_OF_NODE_MEM; |
| 44 | |||
| 45 | num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); | ||
| 46 | max /= num_cpus_on_node; | ||
| 47 | |||
| 41 | return max(max, min_pages); | 48 | return max(max, min_pages); |
| 42 | } | 49 | } |
| 43 | 50 | ||
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
| @@ -224,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 224 | /* | 224 | /* |
| 225 | * Check that @page is mapped at @address into @mm. | 225 | * Check that @page is mapped at @address into @mm. |
| 226 | * | 226 | * |
| 227 | * If @sync is false, page_check_address may perform a racy check to avoid | ||
| 228 | * the page table lock when the pte is not present (helpful when reclaiming | ||
| 229 | * highly shared pages). | ||
| 230 | * | ||
| 227 | * On success returns with pte mapped and locked. | 231 | * On success returns with pte mapped and locked. |
| 228 | */ | 232 | */ |
| 229 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 233 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
| 230 | unsigned long address, spinlock_t **ptlp) | 234 | unsigned long address, spinlock_t **ptlp, int sync) |
| 231 | { | 235 | { |
| 232 | pgd_t *pgd; | 236 | pgd_t *pgd; |
| 233 | pud_t *pud; | 237 | pud_t *pud; |
| @@ -249,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
| 249 | 253 | ||
| 250 | pte = pte_offset_map(pmd, address); | 254 | pte = pte_offset_map(pmd, address); |
| 251 | /* Make a quick check before getting the lock */ | 255 | /* Make a quick check before getting the lock */ |
| 252 | if (!pte_present(*pte)) { | 256 | if (!sync && !pte_present(*pte)) { |
| 253 | pte_unmap(pte); | 257 | pte_unmap(pte); |
| 254 | return NULL; | 258 | return NULL; |
| 255 | } | 259 | } |
| @@ -281,7 +285,7 @@ static int page_referenced_one(struct page *page, | |||
| 281 | if (address == -EFAULT) | 285 | if (address == -EFAULT) |
| 282 | goto out; | 286 | goto out; |
| 283 | 287 | ||
| 284 | pte = page_check_address(page, mm, address, &ptl); | 288 | pte = page_check_address(page, mm, address, &ptl, 0); |
| 285 | if (!pte) | 289 | if (!pte) |
| 286 | goto out; | 290 | goto out; |
| 287 | 291 | ||
| @@ -450,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
| 450 | if (address == -EFAULT) | 454 | if (address == -EFAULT) |
| 451 | goto out; | 455 | goto out; |
| 452 | 456 | ||
| 453 | pte = page_check_address(page, mm, address, &ptl); | 457 | pte = page_check_address(page, mm, address, &ptl, 1); |
| 454 | if (!pte) | 458 | if (!pte) |
| 455 | goto out; | 459 | goto out; |
| 456 | 460 | ||
| @@ -659,23 +663,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
| 659 | } | 663 | } |
| 660 | 664 | ||
| 661 | /* | 665 | /* |
| 662 | * It would be tidy to reset the PageAnon mapping here, | 666 | * Now that the last pte has gone, s390 must transfer dirty |
| 663 | * but that might overwrite a racing page_add_anon_rmap | 667 | * flag from storage key to struct page. We can usually skip |
| 664 | * which increments mapcount after us but sets mapping | 668 | * this if the page is anon, so about to be freed; but perhaps |
| 665 | * before us: so leave the reset to free_hot_cold_page, | 669 | * not if it's in swapcache - there might be another pte slot |
| 666 | * and remember that it's only reliable while mapped. | 670 | * containing the swap entry, but page not yet written to swap. |
| 667 | * Leaving it set also helps swapoff to reinstate ptes | ||
| 668 | * faster for those pages still in swapcache. | ||
| 669 | */ | 671 | */ |
| 670 | if ((!PageAnon(page) || PageSwapCache(page)) && | 672 | if ((!PageAnon(page) || PageSwapCache(page)) && |
| 671 | page_test_dirty(page)) { | 673 | page_test_dirty(page)) { |
| 672 | page_clear_dirty(page); | 674 | page_clear_dirty(page); |
| 673 | set_page_dirty(page); | 675 | set_page_dirty(page); |
| 674 | } | 676 | } |
| 675 | mem_cgroup_uncharge_page(page); | ||
| 676 | 677 | ||
| 678 | mem_cgroup_uncharge_page(page); | ||
| 677 | __dec_zone_page_state(page, | 679 | __dec_zone_page_state(page, |
| 678 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | 680 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); |
| 681 | /* | ||
| 682 | * It would be tidy to reset the PageAnon mapping here, | ||
| 683 | * but that might overwrite a racing page_add_anon_rmap | ||
| 684 | * which increments mapcount after us but sets mapping | ||
| 685 | * before us: so leave the reset to free_hot_cold_page, | ||
| 686 | * and remember that it's only reliable while mapped. | ||
| 687 | * Leaving it set also helps swapoff to reinstate ptes | ||
| 688 | * faster for those pages still in swapcache. | ||
| 689 | */ | ||
| 679 | } | 690 | } |
| 680 | } | 691 | } |
| 681 | 692 | ||
| @@ -697,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 697 | if (address == -EFAULT) | 708 | if (address == -EFAULT) |
| 698 | goto out; | 709 | goto out; |
| 699 | 710 | ||
| 700 | pte = page_check_address(page, mm, address, &ptl); | 711 | pte = page_check_address(page, mm, address, &ptl, 0); |
| 701 | if (!pte) | 712 | if (!pte) |
| 702 | goto out; | 713 | goto out; |
| 703 | 714 | ||
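page_check_address() now takes a sync flag: callers that tolerate a racy miss (reclaim, referenced checks) pass 0 and skip the pte lock when the pte looks non-present, while callers that must not miss a mapping (page_mkclean_one(), the XIP unmap above) pass 1. A loose, self-contained analogue of that flag, not the actual rmap code:

```c
#include <linux/mutex.h>
#include <linux/stddef.h>

static DEFINE_MUTEX(obj_lock);

struct obj {
	int present;
	/* ... */
};

/* When sync is 0 the caller tolerates a racy miss, so we may bail out on a
 * cheap unlocked check; when sync is 1 we always take the lock before
 * deciding. On success the object is returned with obj_lock held, much as
 * page_check_address() returns with the pte lock held. */
static struct obj *lookup(struct obj *o, int sync)
{
	if (!sync && !o->present)	/* quick, racy check */
		return NULL;

	mutex_lock(&obj_lock);
	if (!o->present) {
		mutex_unlock(&obj_lock);
		return NULL;
	}
	return o;			/* caller must drop obj_lock */
}
```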
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
| @@ -2312,7 +2312,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
| 2312 | 2312 | ||
| 2313 | s->refcount = 1; | 2313 | s->refcount = 1; |
| 2314 | #ifdef CONFIG_NUMA | 2314 | #ifdef CONFIG_NUMA |
| 2315 | s->remote_node_defrag_ratio = 100; | 2315 | s->remote_node_defrag_ratio = 1000; |
| 2316 | #endif | 2316 | #endif |
| 2317 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2317 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) |
| 2318 | goto error; | 2318 | goto error; |
| @@ -4058,7 +4058,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, | |||
| 4058 | if (err) | 4058 | if (err) |
| 4059 | return err; | 4059 | return err; |
| 4060 | 4060 | ||
| 4061 | if (ratio < 100) | 4061 | if (ratio <= 100) |
| 4062 | s->remote_node_defrag_ratio = ratio * 10; | 4062 | s->remote_node_defrag_ratio = ratio * 10; |
| 4063 | 4063 | ||
| 4064 | return length; | 4064 | return length; |
diff --git a/mm/sparse.c b/mm/sparse.c
index 5d9dbbb9d39e..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
| @@ -12,7 +12,6 @@ | |||
| 12 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
| 13 | #include <asm/pgalloc.h> | 13 | #include <asm/pgalloc.h> |
| 14 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
| 15 | #include "internal.h" | ||
| 16 | 15 | ||
| 17 | /* | 16 | /* |
| 18 | * Permanent SPARSEMEM data: | 17 | * Permanent SPARSEMEM data: |
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 167cf2dc8a03..797c3831cbec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
| @@ -60,7 +60,7 @@ void show_swap_cache_info(void) | |||
| 60 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | 60 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", |
| 61 | swap_cache_info.add_total, swap_cache_info.del_total, | 61 | swap_cache_info.add_total, swap_cache_info.del_total, |
| 62 | swap_cache_info.find_success, swap_cache_info.find_total); | 62 | swap_cache_info.find_success, swap_cache_info.find_total); |
| 63 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 63 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); |
| 64 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 64 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
| 65 | } | 65 | } |
| 66 | 66 | ||
diff --git a/mm/truncate.c b/mm/truncate.c
index 250505091d37..6650c1d878b4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
| @@ -380,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) | |||
| 380 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 380 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
| 381 | * invalidation. | 381 | * invalidation. |
| 382 | * | 382 | * |
| 383 | * Returns -EIO if any pages could not be invalidated. | 383 | * Returns -EBUSY if any pages could not be invalidated. |
| 384 | */ | 384 | */ |
| 385 | int invalidate_inode_pages2_range(struct address_space *mapping, | 385 | int invalidate_inode_pages2_range(struct address_space *mapping, |
| 386 | pgoff_t start, pgoff_t end) | 386 | pgoff_t start, pgoff_t end) |
| @@ -440,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
| 440 | ret2 = do_launder_page(mapping, page); | 440 | ret2 = do_launder_page(mapping, page); |
| 441 | if (ret2 == 0) { | 441 | if (ret2 == 0) { |
| 442 | if (!invalidate_complete_page2(mapping, page)) | 442 | if (!invalidate_complete_page2(mapping, page)) |
| 443 | ret2 = -EIO; | 443 | ret2 = -EBUSY; |
| 444 | } | 444 | } |
| 445 | if (ret2 < 0) | 445 | if (ret2 < 0) |
| 446 | ret = ret2; | 446 | ret = ret2; |
diff --git a/mm/util.c b/mm/util.c
--- a/mm/util.c
+++ b/mm/util.c
| @@ -171,3 +171,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
| 171 | mm->unmap_area = arch_unmap_area; | 171 | mm->unmap_area = arch_unmap_area; |
| 172 | } | 172 | } |
| 173 | #endif | 173 | #endif |
| 174 | |||
| 175 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | ||
| 176 | int nr_pages, int write, struct page **pages) | ||
| 177 | { | ||
| 178 | struct mm_struct *mm = current->mm; | ||
| 179 | int ret; | ||
| 180 | |||
| 181 | down_read(&mm->mmap_sem); | ||
| 182 | ret = get_user_pages(current, mm, start, nr_pages, | ||
| 183 | write, 0, pages, NULL); | ||
| 184 | up_read(&mm->mmap_sem); | ||
| 185 | |||
| 186 | return ret; | ||
| 187 | } | ||
| 188 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | ||
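The weak get_user_pages_fast() added to mm/util.c gives every architecture a working (if slow) fallback, so callers may use it unconditionally; architectures with a lockless page-table walker simply override it. A sketch of a typical caller pinning one user page (illustrative names, trimmed error handling):

```c
#include <linux/errno.h>
#include <linux/mm.h>

/* Pin the single user page containing 'uaddr' for writing and release it
 * again; a real caller would keep it pinned while DMA is in flight. */
static int pin_user_buffer(unsigned long uaddr)
{
	struct page *page;
	int ret;

	ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, 1, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... map the page / set up DMA here ... */

	put_page(page);
	return 0;
}
```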
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0d08e667ece..d7826af2fb07 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
| @@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
| 516 | continue; | 516 | continue; |
| 517 | 517 | ||
| 518 | page = pfn_to_page(pfn); | 518 | page = pfn_to_page(pfn); |
| 519 | #ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES | ||
| 520 | /* | ||
| 521 | * Ordinarily, memory holes in flatmem still have a valid | ||
| 522 | * memmap for the PFN range. However, an architecture for | ||
| 523 | * embedded systems (e.g. ARM) can free up the memmap backing | ||
| 524 | * holes to save memory on the assumption the memmap is | ||
| 525 | * never used. The page_zone linkages are then broken even | ||
| 526 | * though pfn_valid() returns true. Skip the page if the | ||
| 527 | * linkages are broken. Even if this test passed, the impact | ||
| 528 | * is that the counters for the movable type are off but | ||
| 529 | * fragmentation monitoring is likely meaningless on small | ||
| 530 | * systems. | ||
| 531 | */ | ||
| 532 | if (page_zone(page) != zone) | ||
| 533 | continue; | ||
| 534 | #endif | ||
| 519 | mtype = get_pageblock_migratetype(page); | 535 | mtype = get_pageblock_migratetype(page); |
| 520 | 536 | ||
| 521 | count[mtype]++; | 537 | if (mtype < MIGRATE_TYPES) |
| 538 | count[mtype]++; | ||
| 522 | } | 539 | } |
| 523 | 540 | ||
| 524 | /* Print counts */ | 541 | /* Print counts */ |
