Diffstat (limited to 'mm')
-rw-r--r-- | mm/bootmem.c        | 37
-rw-r--r-- | mm/filemap.c        | 11
-rw-r--r-- | mm/filemap_xip.c    | 65
-rw-r--r-- | mm/mm_init.c        |  2
-rw-r--r-- | mm/mmap.c           |  4
-rw-r--r-- | mm/oom_kill.c       |  6
-rw-r--r-- | mm/page_alloc.c     |  9
-rw-r--r-- | mm/page_isolation.c |  1
-rw-r--r-- | mm/quicklist.c      |  9
-rw-r--r-- | mm/rmap.c           | 39
-rw-r--r-- | mm/slub.c           |  4
-rw-r--r-- | mm/swap_state.c     |  2
-rw-r--r-- | mm/truncate.c       |  4
-rw-r--r-- | mm/vmstat.c         | 19
14 files changed, 162 insertions, 50 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4af15d0340ad..ad8eec6e44a8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+			unsigned long step)
+{
+	unsigned long base = bdata->node_min_pfn;
+
+	/*
+	 * Align the index with respect to the node start so that the
+	 * combination of both satisfies the requested alignment.
+	 */
+
+	return ALIGN(base + idx, step) - base;
+}
+
+static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+			unsigned long align)
+{
+	unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+	/* Same as align_idx for byte offsets */
+
+	return ALIGN(base + off, align) - base;
+}
+
 static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 					unsigned long size, unsigned long align,
 					unsigned long goal, unsigned long limit)
@@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	else
 		start = ALIGN(min, step);
 
-	sidx = start - bdata->node_min_pfn;;
+	sidx = start - bdata->node_min_pfn;
 	midx = max - bdata->node_min_pfn;
 
 	if (bdata->hint_idx > sidx) {
@@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 		 * catch the fallback below.
 		 */
 		fallback = sidx + 1;
-		sidx = ALIGN(bdata->hint_idx, step);
+		sidx = align_idx(bdata, bdata->hint_idx, step);
 	}
 
 	while (1) {
@@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 		unsigned long eidx, i, start_off, end_off;
 find_block:
 		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
-		sidx = ALIGN(sidx, step);
+		sidx = align_idx(bdata, sidx, step);
 		eidx = sidx + PFN_UP(size);
 
 		if (sidx >= midx || eidx > midx)
@@ -467,15 +490,15 @@ find_block:
 
 		for (i = sidx; i < eidx; i++)
 			if (test_bit(i, bdata->node_bootmem_map)) {
-				sidx = ALIGN(i, step);
+				sidx = align_idx(bdata, i, step);
 				if (sidx == i)
 					sidx += step;
 				goto find_block;
 			}
 
-		if (bdata->last_end_off &&
+		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
 				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
-			start_off = ALIGN(bdata->last_end_off, align);
+			start_off = align_off(bdata, bdata->last_end_off, align);
 		else
 			start_off = PFN_PHYS(sidx);
 
@@ -499,7 +522,7 @@ find_block:
 	}
 
 	if (fallback) {
-		sidx = ALIGN(fallback - 1, step);
+		sidx = align_idx(bdata, fallback - 1, step);
 		fallback = 0;
 		goto find_block;
 	}
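The point of the bootmem change above is that sidx and start_off are node-relative, so aligning them in isolation only yields an aligned block when the node itself starts on a step-aligned PFN. A minimal userspace sketch of the arithmetic (illustrative only, not part of the patch; ALIGN mirrors the kernel macro):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long node_min_pfn = 9;	/* node starts at an unaligned PFN */
	unsigned long idx = 4;		/* bitmap index, relative to the node */
	unsigned long step = 4;		/* allocation wants 4-page alignment */

	/* Old scheme: align the relative index only. */
	unsigned long old_idx = ALIGN(idx, step);
	/* New scheme (align_idx): align the absolute PFN, convert back. */
	unsigned long new_idx = ALIGN(node_min_pfn + idx, step) - node_min_pfn;

	printf("old: idx %lu -> pfn %lu (not step-aligned)\n",
	       old_idx, node_min_pfn + old_idx);
	printf("new: idx %lu -> pfn %lu (step-aligned)\n",
	       new_idx, node_min_pfn + new_idx);
	return 0;
}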
diff --git a/mm/filemap.c b/mm/filemap.c
index 54e968650855..876bc595d0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2129,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * After a write we want buffered reads to be sure to go to disk to get
 	 * the new data. We invalidate clean cached page from the region we're
 	 * about to write. We do this *before* the write so that we can return
-	 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+	 * without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
 	if (mapping->nrpages) {
 		written = invalidate_inode_pages2_range(mapping,
 					pos >> PAGE_CACHE_SHIFT, end);
-		if (written)
+		/*
+		 * If a page can not be invalidated, return 0 to fall back
+		 * to buffered write.
+		 */
+		if (written) {
+			if (written == -EBUSY)
+				return 0;
 			goto out;
+		}
 	}
 
 	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
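Together with the truncate.c change further down, this lets generic_file_direct_write() distinguish "a page is transiently busy" (-EBUSY from invalidate_inode_pages2_range(), reported upward as 0 bytes written) from real errors. A schematic caller-side sketch of the resulting convention, using hypothetical helper names rather than the real filesystem paths:

/*
 * Sketch only: example_direct_write() and example_buffered_write() are
 * hypothetical stand-ins for a filesystem's direct and buffered paths.
 */
static ssize_t example_write(struct file *filp, const char __user *buf,
			     size_t len, loff_t *ppos)
{
	ssize_t written = example_direct_write(filp, buf, len, ppos);

	if (written == 0)	/* invalidation hit -EBUSY: fall back */
		written = example_buffered_write(filp, buf, len, ppos);

	return written;		/* genuine errors remain negative */
}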
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 380ab402d711..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,6 +15,8 @@
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
@@ -22,22 +24,18 @@
  * We do use our own empty page to avoid interference with other users
  * of ZERO_PAGE(), such as /dev/zero
  */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
 static struct page *__xip_sparse_page;
 
+/* called under xip_sparse_mutex */
 static struct page *xip_sparse_page(void)
 {
 	if (!__xip_sparse_page) {
 		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
 
-		if (page) {
-			static DEFINE_SPINLOCK(xip_alloc_lock);
-			spin_lock(&xip_alloc_lock);
-			if (!__xip_sparse_page)
-				__xip_sparse_page = page;
-			else
-				__free_page(page);
-			spin_unlock(&xip_alloc_lock);
-		}
+		if (page)
+			__xip_sparse_page = page;
 	}
 	return __xip_sparse_page;
 }
@@ -174,18 +172,23 @@ __xip_unmap (struct address_space * mapping,
 	pte_t pteval;
 	spinlock_t *ptl;
 	struct page *page;
+	unsigned count;
+	int locked = 0;
+
+	count = read_seqcount_begin(&xip_sparse_seq);
 
 	page = __xip_sparse_page;
 	if (!page)
 		return;
 
+retry:
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		pte = page_check_address(page, mm, address, &ptl);
+		pte = page_check_address(page, mm, address, &ptl, 1);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
@@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
 		}
 	}
 	spin_unlock(&mapping->i_mmap_lock);
+
+	if (locked) {
+		mutex_unlock(&xip_sparse_mutex);
+	} else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+		mutex_lock(&xip_sparse_mutex);
+		locked = 1;
+		goto retry;
+	}
 }
 
 /*
@@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int error;
 
 	/* XXX: are VM_FAULT_ codes OK? */
-
+again:
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -237,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		int err;
 
 		/* maybe shared writable, allocate new block */
+		mutex_lock(&xip_sparse_mutex);
 		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
 							&xip_mem, &xip_pfn);
+		mutex_unlock(&xip_sparse_mutex);
 		if (error)
 			return VM_FAULT_SIGBUS;
 		/* unmap sparse mappings at pgoff from all other vmas */
@@ -252,14 +265,34 @@ found:
 		BUG_ON(err);
 		return VM_FAULT_NOPAGE;
 	} else {
+		int err, ret = VM_FAULT_OOM;
+
+		mutex_lock(&xip_sparse_mutex);
+		write_seqcount_begin(&xip_sparse_seq);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(!error)) {
+			write_seqcount_end(&xip_sparse_seq);
+			mutex_unlock(&xip_sparse_mutex);
+			goto again;
+		}
+		if (error != -ENODATA)
+			goto out;
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
 		if (!page)
-			return VM_FAULT_OOM;
+			goto out;
+		err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+							page);
+		if (err == -ENOMEM)
+			goto out;
 
-		page_cache_get(page);
-		vmf->page = page;
-		return 0;
+		ret = VM_FAULT_NOPAGE;
+out:
+		write_seqcount_end(&xip_sparse_seq);
+		mutex_unlock(&xip_sparse_mutex);
+
+		return ret;
 	}
 }
 
@@ -308,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
 						&xip_mem, &xip_pfn);
 		if (status == -ENODATA) {
 			/* we allocate a new page unmap it */
+			mutex_lock(&xip_sparse_mutex);
 			status = a_ops->get_xip_mem(mapping, index, 1,
 							&xip_mem, &xip_pfn);
+			mutex_unlock(&xip_sparse_mutex);
 			if (!status)
 				/* unmap page at pgoff from all other vmas */
 				__xip_unmap(mapping, index);
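The filemap_xip.c changes replace the old "hand the zero page back via vmf->page" fault path with direct vm_insert_page() plus a seqcount/mutex protocol: fault-side writers bump xip_sparse_seq under xip_sparse_mutex, and __xip_unmap() runs locklessly first, repeating its pass under the mutex only when a writer raced with it. A condensed kernel-style sketch of that retry pattern (names simplified; do_unmap_pass() and do_populate() are hypothetical stand-ins for the real bodies):

#include <linux/mutex.h>
#include <linux/seqlock.h>

static DEFINE_MUTEX(example_mutex);
static seqcount_t example_seq = SEQCNT_ZERO;

static void example_reader(void)		/* cf. __xip_unmap() */
{
	unsigned count;
	int locked = 0;

	count = read_seqcount_begin(&example_seq);
retry:
	do_unmap_pass();			/* may race with a writer */

	if (locked) {
		mutex_unlock(&example_mutex);
	} else if (read_seqcount_retry(&example_seq, count)) {
		mutex_lock(&example_mutex);	/* writer ran: redo under lock */
		locked = 1;
		goto retry;
	}
}

static void example_writer(void)		/* cf. the fault path */
{
	mutex_lock(&example_mutex);
	write_seqcount_begin(&example_seq);
	do_populate();				/* insert the sparse page */
	write_seqcount_end(&example_seq);
	mutex_unlock(&example_mutex);
}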
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 936ef2efd892..4e0e26591dfa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -12,7 +12,7 @@
 #include "internal.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
-int __meminitdata mminit_loglevel;
+int mminit_loglevel;
 
 #ifndef SECTIONS_SHIFT
 #define SECTIONS_SHIFT 0
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	} else {
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
+			/*
+			 * Ignore pgoff.
+			 */
+			pgoff = 0;
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			break;
 		case MAP_PRIVATE:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/memcontrol.h>
+#include <linux/security.h>
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * Superuser processes are usually more important, so we make it
 	 * less likely that we kill those.
 	 */
-	if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
+	if (has_capability(p, CAP_SYS_ADMIN) ||
+	    has_capability(p, CAP_SYS_RESOURCE))
 		points /= 4;
 
 	/*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * tend to only have this flag set on applications they think
 	 * of as important.
 	 */
-	if (__capable(p, CAP_SYS_RAWIO))
+	if (has_capability(p, CAP_SYS_RAWIO))
 		points /= 4;
 
 	/*
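The scoring itself is unchanged by the switch to has_capability(): each matching capability test still quarters the badness score. For illustration (numbers invented), a task with a raw score of 1200 that holds CAP_SYS_ADMIN drops to 1200 / 4 = 300, and if it also holds CAP_SYS_RAWIO it drops again to 300 / 4 = 75, making it a far less likely OOM victim.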
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index af982f7cdb2a..e293c58bea58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -694,6 +694,9 @@ static int move_freepages(struct zone *zone,
 #endif
 
 	for (page = start_page; page <= end_page;) {
+		/* Make sure we are not inadvertently changing nodes */
+		VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
 			continue;
@@ -2516,6 +2519,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 			continue;
 		page = pfn_to_page(pfn);
 
+		/* Watch out for overlapping nodes */
+		if (page_to_nid(page) != zone_to_nid(zone))
+			continue;
+
 		/* Blocks with reserved pages will never free, skip them. */
 		if (PageReserved(page))
 			continue;
@@ -4064,7 +4071,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
+struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
 EXPORT_SYMBOL(contig_page_data);
 #endif
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..c69f84fe038d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
  * linux/mm/page_isolation.c
  */
 
-#include <stddef.h>
 #include <linux/mm.h>
 #include <linux/page-isolation.h>
 #include <linux/pageblock-flags.h>
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
 static unsigned long max_pages(unsigned long min_pages)
 {
 	unsigned long node_free_pages, max;
-	struct zone *zones = NODE_DATA(numa_node_id())->node_zones;
+	int node = numa_node_id();
+	struct zone *zones = NODE_DATA(node)->node_zones;
+	int num_cpus_on_node;
+	node_to_cpumask_ptr(cpumask_on_node, node);
 
 	node_free_pages =
 #ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
 		zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
 
 	max = node_free_pages / FRACTION_OF_NODE_MEM;
+
+	num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
+	max /= num_cpus_on_node;
+
 	return max(max, min_pages);
 }
 
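The quicklist change caps each CPU's quicklist by its share of the node budget rather than by the whole node budget. As a rough worked example (assuming FRACTION_OF_NODE_MEM is 16, which is not shown in this hunk): on a node with 4,000,000 free pages, every CPU could previously cache up to 4,000,000 / 16 = 250,000 pages; with 8 CPUs on the node, each CPU is now limited to 250,000 / 8 = 31,250 pages.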
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -224,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 /*
  * Check that @page is mapped at @address into @mm.
  *
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
  * On success returns with pte mapped and locked.
  */
 pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-			  unsigned long address, spinlock_t **ptlp)
+			  unsigned long address, spinlock_t **ptlp, int sync)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -249,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 
 	pte = pte_offset_map(pmd, address);
 	/* Make a quick check before getting the lock */
-	if (!pte_present(*pte)) {
+	if (!sync && !pte_present(*pte)) {
 		pte_unmap(pte);
 		return NULL;
 	}
@@ -281,7 +285,7 @@ static int page_referenced_one(struct page *page,
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, 0);
 	if (!pte)
 		goto out;
 
@@ -450,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, 1);
 	if (!pte)
 		goto out;
 
@@ -659,23 +663,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 		}
 
 		/*
-		 * It would be tidy to reset the PageAnon mapping here,
-		 * but that might overwrite a racing page_add_anon_rmap
-		 * which increments mapcount after us but sets mapping
-		 * before us: so leave the reset to free_hot_cold_page,
-		 * and remember that it's only reliable while mapped.
-		 * Leaving it set also helps swapoff to reinstate ptes
-		 * faster for those pages still in swapcache.
+		 * Now that the last pte has gone, s390 must transfer dirty
+		 * flag from storage key to struct page. We can usually skip
+		 * this if the page is anon, so about to be freed; but perhaps
+		 * not if it's in swapcache - there might be another pte slot
+		 * containing the swap entry, but page not yet written to swap.
 		 */
 		if ((!PageAnon(page) || PageSwapCache(page)) &&
 		    page_test_dirty(page)) {
 			page_clear_dirty(page);
 			set_page_dirty(page);
 		}
-		mem_cgroup_uncharge_page(page);
 
+		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page,
 			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+		/*
+		 * It would be tidy to reset the PageAnon mapping here,
+		 * but that might overwrite a racing page_add_anon_rmap
+		 * which increments mapcount after us but sets mapping
+		 * before us: so leave the reset to free_hot_cold_page,
+		 * and remember that it's only reliable while mapped.
+		 * Leaving it set also helps swapoff to reinstate ptes
+		 * faster for those pages still in swapcache.
+		 */
 	}
 }
 
@@ -697,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	if (address == -EFAULT)
 		goto out;
 
-	pte = page_check_address(page, mm, address, &ptl);
+	pte = page_check_address(page, mm, address, &ptl, 0);
 	if (!pte)
 		goto out;
 
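The new page_check_address() sync flag picks between the old racy fast path (sync = 0: bail out without the pte lock if the pte looks non-present) and a strict mode (sync = 1: always take the lock, as page_mkclean_one() and __xip_unmap() now do, so a transiently non-present pte cannot be missed). A small hypothetical caller, sketched for illustration only:

/* Sketch: a caller that must not miss a mapping passes sync = 1. */
static int example_visit_mapping(struct page *page, struct mm_struct *mm,
				 unsigned long address)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = page_check_address(page, mm, address, &ptl, 1);
	if (!pte)
		return 0;

	/* ... operate on *pte while holding the pte lock ... */

	pte_unmap_unlock(pte, ptl);
	return 1;
}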
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2312,7 +2312,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
 
 	s->refcount = 1;
 #ifdef CONFIG_NUMA
-	s->remote_node_defrag_ratio = 100;
+	s->remote_node_defrag_ratio = 1000;
 #endif
 	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
 		goto error;
@@ -4058,7 +4058,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
 	if (err)
 		return err;
 
-	if (ratio < 100)
+	if (ratio <= 100)
 		s->remote_node_defrag_ratio = ratio * 10;
 
 	return length;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 167cf2dc8a03..797c3831cbec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -60,7 +60,7 @@ void show_swap_cache_info(void)
 	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total);
-	printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
 	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 250505091d37..6650c1d878b4 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -380,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
 */
 int invalidate_inode_pages2_range(struct address_space *mapping,
 				  pgoff_t start, pgoff_t end)
@@ -440,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 			ret2 = do_launder_page(mapping, page);
 			if (ret2 == 0) {
 				if (!invalidate_complete_page2(mapping, page))
-					ret2 = -EIO;
+					ret2 = -EBUSY;
 			}
 			if (ret2 < 0)
 				ret = ret2;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b0d08e667ece..d7826af2fb07 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 			continue;
 
 		page = pfn_to_page(pfn);
+#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
+		/*
+		 * Ordinarily, memory holes in flatmem still have a valid
+		 * memmap for the PFN range. However, an architecture for
+		 * embedded systems (e.g. ARM) can free up the memmap backing
+		 * holes to save memory on the assumption the memmap is
+		 * never used. The page_zone linkages are then broken even
+		 * though pfn_valid() returns true. Skip the page if the
+		 * linkages are broken. Even if this test passed, the impact
+		 * is that the counters for the movable type are off but
+		 * fragmentation monitoring is likely meaningless on small
+		 * systems.
+		 */
+		if (page_zone(page) != zone)
+			continue;
+#endif
 		mtype = get_pageblock_migratetype(page);
 
-		count[mtype]++;
+		if (mtype < MIGRATE_TYPES)
+			count[mtype]++;
 	}
 
 	/* Print counts */