author		Jens Axboe <axboe@kernel.dk>	2012-07-30 03:03:10 -0400
committer	Jens Axboe <axboe@kernel.dk>	2012-07-30 03:03:10 -0400
commit		72ea1f74fcdf874cca6d2c0962379523bbd99e2c (patch)
tree		4c67be6c73356086ff44ef1b8b1c9479702689ca /mm
parent		b1af9be5ef77898c05667bb9dbf3b180d91d3292 (diff)
parent		a73ff3231df59a4b92ccd0dd4e73897c5822489b (diff)
Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.6/drivers
Diffstat (limited to 'mm')
 mm/Kconfig          |  17
 mm/Makefile         |   1
 mm/bootmem.c        |   6
 mm/compaction.c     |   5
 mm/frontswap.c      | 314
 mm/madvise.c        |  18
 mm/memblock.c       | 115
 mm/memcontrol.c     |   6
 mm/memory.c         |  12
 mm/memory_hotplug.c |   2
 mm/mempolicy.c      |   2
 mm/nobootmem.c      |  40
 mm/nommu.c          |   2
 mm/oom_kill.c       |  21
 mm/page_alloc.c     |   7
 mm/page_cgroup.c    |   4
 mm/page_io.c        |  12
 mm/pagewalk.c       |   1
 mm/percpu-vm.c      |   1
 mm/shmem.c          | 248
 mm/sparse.c         |  20
 mm/swapfile.c       |  66
 mm/vmscan.c         |  12
 23 files changed, 666 insertions(+), 266 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -389,3 +389,20 @@ config CLEANCACHE
 	  in a negligible performance hit.
 
 	  If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+	bool "Enable frontswap to cache swap pages if tmem is present"
+	depends on SWAP
+	default n
+	help
+	  Frontswap is so named because it can be thought of as the opposite
+	  of a "backing" store for a swap device.  The data is stored into
+	  "transcendent memory", memory that is not directly accessible or
+	  addressable by the kernel and is of unknown and possibly
+	  time-varying size.  When space in transcendent memory is available,
+	  a significant swap I/O reduction may be achieved.  When none is
+	  available, all frontswap calls are reduced to a single pointer-
+	  compare-against-NULL resulting in a negligible performance hit
+	  and swap data is stored as normal on the matching swap device.
+
+	  If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				unsigned long size, unsigned long align,
 				unsigned long goal, unsigned long limit)
 {
@@ -710,6 +710,10 @@ again:
 	if (ptr)
 		return ptr;
 
+	/* do not panic in alloc_bootmem_bdata() */
+	if (limit && goal + size > limit)
+		limit = 0;
+
 	ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
 	if (ptr)
 		return ptr;
diff --git a/mm/compaction.c b/mm/compaction.c
index 7ea259d82a99..2f42d9528539 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -701,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		if (err) {
 			putback_lru_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
+			if (err == -ENOMEM) {
+				ret = COMPACT_PARTIAL;
+				goto out;
+			}
 		}
-
 	}
 
 out:
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,314 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap.  See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success.  As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache.  In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increased control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured).  These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+	frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+	frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+	frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+	frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+	struct frontswap_ops old = frontswap_ops;
+
+	frontswap_ops = *ops;
+	frontswap_enabled = true;
+	return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+	frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	if (frontswap_enabled)
+		(*frontswap_ops.init)(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset.  Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+	int ret = -1, dup = 0;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		dup = 1;
+	ret = (*frontswap_ops.store)(type, offset, page);
+	if (ret == 0) {
+		frontswap_set(sis, offset);
+		inc_frontswap_succ_stores();
+		if (!dup)
+			atomic_inc(&sis->frontswap_pages);
+	} else if (dup) {
+		/*
+		 * A failed dup always results in an automatic invalidate of
+		 * the (older) page from frontswap.
+		 */
+		frontswap_clear(sis, offset);
+		atomic_dec(&sis->frontswap_pages);
+		inc_frontswap_failed_stores();
+	} else
+		inc_frontswap_failed_stores();
+	if (frontswap_writethrough_enabled)
+		/* report failure so swap also writes to swap device */
+		ret = -1;
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data.  Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+	int ret = -1;
+	swp_entry_t entry = { .val = page_private(page), };
+	int type = swp_type(entry);
+	struct swap_info_struct *sis = swap_info[type];
+	pgoff_t offset = swp_offset(entry);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset))
+		ret = (*frontswap_ops.load)(type, offset, page);
+	if (ret == 0)
+		inc_frontswap_loads();
+	return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (frontswap_test(sis, offset)) {
+		(*frontswap_ops.invalidate_page)(type, offset);
+		atomic_dec(&sis->frontswap_pages);
+		frontswap_clear(sis, offset);
+		inc_frontswap_invalidates();
+	}
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+	struct swap_info_struct *sis = swap_info[type];
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+	(*frontswap_ops.invalidate_area)(type);
+	atomic_set(&sis->frontswap_pages, 0);
+	memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	int si_frontswap_pages;
+	unsigned long total_pages = 0, total_pages_to_unuse;
+	unsigned long pages = 0, pages_to_unuse = 0;
+	int type;
+	bool locked = false;
+
+	/*
+	 * we don't want to hold swap_lock while doing a very
+	 * lengthy try_to_unuse, but swap_list may change
+	 * so restart scan from swap_list.head each time
+	 */
+	spin_lock(&swap_lock);
+	locked = true;
+	total_pages = 0;
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		total_pages += atomic_read(&si->frontswap_pages);
+	}
+	if (total_pages <= target_pages)
+		goto out;
+	total_pages_to_unuse = total_pages - target_pages;
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		si_frontswap_pages = atomic_read(&si->frontswap_pages);
+		if (total_pages_to_unuse < si_frontswap_pages)
+			pages = pages_to_unuse = total_pages_to_unuse;
+		else {
+			pages = si_frontswap_pages;
+			pages_to_unuse = 0; /* unuse all */
+		}
+		/* ensure there is enough RAM to fetch pages from frontswap */
+		if (security_vm_enough_memory_mm(current->mm, pages))
+			continue;
+		vm_unacct_memory(pages);
+		break;
+	}
+	if (type < 0)
+		goto out;
+	locked = false;
+	spin_unlock(&swap_lock);
+	try_to_unuse(type, true, pages_to_unuse);
+out:
+	if (locked)
+		spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices.  This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	spin_lock(&swap_lock);
+	for (type = swap_list.head; type >= 0; type = si->next) {
+		si = swap_info[type];
+		totalpages += atomic_read(&si->frontswap_pages);
+	}
+	spin_unlock(&swap_lock);
+	return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *root = debugfs_create_dir("frontswap", NULL);
+	if (root == NULL)
+		return -ENXIO;
+	debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+	debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+	debugfs_create_u64("failed_stores", S_IRUGO, root,
+				&frontswap_failed_stores);
+	debugfs_create_u64("invalidates", S_IRUGO,
+				root, &frontswap_invalidates);
+#endif
+	return 0;
+}
+
+module_init(init_frontswap);
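The frontend above is a thin ops-struct dispatcher: a backend registers a struct of function pointers, and until one is registered every call short-circuits on the enabled flag. A minimal standalone model of that dispatch pattern, in plain userspace C with invented names (not the kernel API, and with no page or swap machinery):

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the kernel's frontswap_ops dispatch. */
struct demo_ops {
	int (*store)(unsigned type, unsigned long offset, const void *page);
	int (*load)(unsigned type, unsigned long offset, void *page);
};

static struct demo_ops demo_ops;	/* zero-initialized: no backend yet */
static int demo_enabled;		/* mirrors frontswap_enabled */

/* Register a backend, returning the previous ops (as the kernel does). */
static struct demo_ops demo_register_ops(const struct demo_ops *ops)
{
	struct demo_ops old = demo_ops;

	demo_ops = *ops;
	demo_enabled = 1;
	return old;
}

static int demo_store(unsigned type, unsigned long offset, const void *page)
{
	/* Without a backend this is just a flag test: negligible cost. */
	if (!demo_enabled)
		return -1;
	return (*demo_ops.store)(type, offset, page);
}

/* A trivial one-slot backend for the demo. */
static char slot[4096];
static int slot_store(unsigned type, unsigned long offset, const void *page)
{
	(void)type; (void)offset;
	memcpy(slot, page, sizeof(slot));
	return 0;
}
static int slot_load(unsigned type, unsigned long offset, void *page)
{
	(void)type; (void)offset;
	memcpy(page, slot, sizeof(slot));
	return 0;
}

int main(void)
{
	char page[4096] = "swapped data";
	struct demo_ops ops = { .store = slot_store, .load = slot_load };

	printf("store before register: %d\n", demo_store(0, 1, page)); /* -1 */
	demo_register_ops(&ops);
	printf("store after register:  %d\n", demo_store(0, 1, page)); /*  0 */
	return 0;
}

Returning the previous ops from registration is what lets a second backend detect (and potentially chain to) an already-installed one.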
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 {
 	loff_t offset;
 	int error;
+	struct file *f;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
 	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 		return -EINVAL;
 
-	if (!vma->vm_file || !vma->vm_file->f_mapping
-		|| !vma->vm_file->f_mapping->host) {
+	f = vma->vm_file;
+
+	if (!f || !f->f_mapping || !f->f_mapping->host) {
 		return -EINVAL;
 	}
 
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* filesystem's fallocate may need to take i_mutex */
+	/*
+	 * Filesystem's fallocate may need to take i_mutex.  We need to
+	 * explicitly grab a reference because the vma (and hence the
+	 * vma's reference to the file) can go away as soon as we drop
+	 * mmap_sem.
+	 */
+	get_file(f);
 	up_read(&current->mm->mmap_sem);
-	error = do_fallocate(vma->vm_file,
+	error = do_fallocate(f,
 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 				offset, end - start);
+	fput(f);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
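The madvise_remove() fix is an instance of a general rule: take your own reference on a refcounted object before dropping the lock that keeps it alive, and put the reference when done. A hedged userspace sketch of the same discipline, with a hand-rolled refcount standing in for struct file and a mutex standing in for mmap_sem (all names invented):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical refcounted object standing in for struct file. */
struct obj {
	int refcount;		/* protected by lock */
	const char *name;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void obj_get(struct obj *o) { o->refcount++; }
static void obj_put(struct obj *o)
{
	if (--o->refcount == 0) {
		printf("freeing %s\n", o->name);
		free(o);
	}
}

/* Long operation that must run without holding the lock. */
static void long_operation(struct obj *o)
{
	printf("operating on %s\n", o->name);
}

static void use_obj(struct obj *o)
{
	pthread_mutex_lock(&lock);
	obj_get(o);			/* like get_file(f) before up_read() */
	pthread_mutex_unlock(&lock);	/* others may now drop their refs */

	long_operation(o);		/* like do_fallocate(f, ...) */

	pthread_mutex_lock(&lock);
	obj_put(o);			/* like fput(f) */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	o->refcount = 1;
	o->name = "file";
	use_obj(o);
	pthread_mutex_lock(&lock);
	obj_put(o);			/* drop the original reference */
	pthread_mutex_unlock(&lock);
	return 0;
}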
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..5cc6731b00cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					   MAX_NUMNODES);
 }
 
-/*
- * Free memblock.reserved.regions
- */
-int __init_memblock memblock_free_reserved_regions(void)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	return memblock_free(__pa(memblock.reserved.regions),
-		 sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
-/*
- * Reserve memblock.reserved.regions
- */
-int __init_memblock memblock_reserve_reserved_regions(void)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	return memblock_reserve(__pa(memblock.reserved.regions),
-		 sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
 {
 	type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 	}
 }
 
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+					phys_addr_t *addr)
+{
+	if (memblock.reserved.regions == memblock_reserved_init_regions)
+		return 0;
+
+	*addr = __pa(memblock.reserved.regions);
+
+	return PAGE_ALIGN(sizeof(struct memblock_region) *
+			  memblock.reserved.max);
+}
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+						phys_addr_t new_area_start,
+						phys_addr_t new_area_size)
 {
 	struct memblock_region *new_array, *old_array;
+	phys_addr_t old_alloc_size, new_alloc_size;
 	phys_addr_t old_size, new_size, addr;
 	int use_slab = slab_is_available();
 	int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	/* Calculate new doubled size */
 	old_size = type->max * sizeof(struct memblock_region);
 	new_size = old_size << 1;
+	/*
+	 * We need to allocate the new array aligned to PAGE_SIZE,
+	 * so we can free it completely later.
+	 */
+	old_alloc_size = PAGE_ALIGN(old_size);
+	new_alloc_size = PAGE_ALIGN(new_size);
 
 	/* Retrieve the slab flag */
 	if (type == &memblock.memory)
@@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 		new_array = kmalloc(new_size, GFP_KERNEL);
 		addr = new_array ? __pa(new_array) : 0;
 	} else {
-		addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+		/* only exclude range when trying to double reserved.regions */
+		if (type != &memblock.reserved)
+			new_area_start = new_area_size = 0;
+
+		addr = memblock_find_in_range(new_area_start + new_area_size,
+						memblock.current_limit,
+						new_alloc_size, PAGE_SIZE);
+		if (!addr && new_area_size)
+			addr = memblock_find_in_range(0,
+					min(new_area_start, memblock.current_limit),
+					new_alloc_size, PAGE_SIZE);
+
 		new_array = addr ? __va(addr) : 0;
 	}
 	if (!addr) {
@@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 		kfree(old_array);
 	else if (old_array != memblock_memory_init_regions &&
 		 old_array != memblock_reserved_init_regions)
-		memblock_free(__pa(old_array), old_size);
+		memblock_free(__pa(old_array), old_alloc_size);
 
 	/* Reserve the new array if that comes from the memblock.
 	 * Otherwise, we needn't do it
 	 */
 	if (!use_slab)
-		BUG_ON(memblock_reserve(addr, new_size));
+		BUG_ON(memblock_reserve(addr, new_alloc_size));
 
 	/* Update slab flag */
 	*in_slab = use_slab;
@@ -399,7 +422,7 @@ repeat:
 	 */
 	if (!insert) {
 		while (type->cnt + nr_new > type->max)
-			if (memblock_double_array(type) < 0)
+			if (memblock_double_array(type, obase, size) < 0)
 				return -ENOMEM;
 		insert = true;
 		goto repeat;
@@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 
 	/* we'll create at most two more regions */
 	while (type->cnt + 2 > type->max)
-		if (memblock_double_array(type) < 0)
+		if (memblock_double_array(type, base, size) < 0)
 			return -ENOMEM;
 
 	for (i = 0; i < type->cnt; i++) {
@@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
  *
  * Find the first free area from *@idx which matches @nid, fill the out
  * parameters, and update *@idx for the next iteration.  The lower 32bit of
@@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
  * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
  *
  * Reverse of __next_free_mem_range().
  */
@@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
 	return memblock_search(&memblock.memory, addr) != -1;
 }
 
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
 	int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 		memblock.memory.regions[idx].size) >= end;
 }
 
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
 	memblock_cap_size(base, &size);
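The PAGE_ALIGN rounding introduced above matters because free_low_memory_core_early() later returns the array's range to the page allocator in whole pages; allocating a page-aligned size guarantees nothing else shares the trailing page. A small runnable illustration of the rounding, assuming 4 KiB pages and a made-up region struct:

#include <stdio.h>

/* Userspace copy of the kernel's PAGE_ALIGN, assuming 4 KiB pages. */
#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

struct region { unsigned long base, size; };	/* stand-in for memblock_region */

int main(void)
{
	unsigned long max, old_size, new_size;

	for (max = 128; max <= 512; max <<= 1) {
		old_size = max * sizeof(struct region);
		new_size = old_size << 1;	/* doubled, as in memblock_double_array */
		printf("max=%lu raw=%lu aligned=%lu\n",
		       max, new_size, PAGE_ALIGN(new_size));
	}
	return 0;
}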
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 {
 	if (root_memcg == memcg)
 		return true;
-	if (!root_memcg->use_hierarchy)
+	if (!root_memcg->use_hierarchy || !memcg)
 		return false;
 	return css_is_ancestor(&memcg->css, &root_memcg->css);
 }
@@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
  *
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
@@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 
 /**
  * test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
  * @nid: the node ID to be checked.
  * @noswap : specify true here if the user wants file only information.
  *
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE) {
-				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+						__func__, addr, end,
+						vma->vm_start,
+						vma->vm_end);
+					BUG();
+				}
+#endif
 				split_huge_page_pmd(vma->vm_mm, pmd);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 				goto next;
@@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb,
 /**
  * zap_page_range - remove user pages in a given range
  * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
  * @size: number of bytes to zap
  * @details: details of nonlinear truncation or shared cache invalidation
  *
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..427bb291dd0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
 		pgdat = hotadd_new_pgdat(nid, start);
 		ret = -ENOMEM;
 		if (!pgdat)
-			goto out;
+			goto error;
 		new_pgdat = 1;
 	}
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!list_empty(&pagelist)) {
 		nr_failed = migrate_pages(&pagelist, new_vma_page,
 						(unsigned long)vma,
-						false, true);
+						false, MIGRATE_SYNC);
 		if (nr_failed)
 			putback_lru_pages(&pagelist);
 	}
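The mempolicy one-liner is worth a close look: the last migrate_pages() argument changed type from bool to enum migrate_mode, so a literal true kept compiling but — once the enum had three values — selected value 1 rather than the strictest mode. A tiny C demonstration of the trap (enum values as I understand the migrate_mode of this era; treat them as an assumption):

#include <stdbool.h>
#include <stdio.h>

/* migrate_mode roughly as introduced around v3.3 (values assumed). */
enum migrate_mode {
	MIGRATE_ASYNC,		/* 0 */
	MIGRATE_SYNC_LIGHT,	/* 1 */
	MIGRATE_SYNC,		/* 2 */
};

static const char *mode_name(enum migrate_mode mode)
{
	switch (mode) {
	case MIGRATE_ASYNC:	 return "MIGRATE_ASYNC";
	case MIGRATE_SYNC_LIGHT: return "MIGRATE_SYNC_LIGHT";
	case MIGRATE_SYNC:	 return "MIGRATE_SYNC";
	}
	return "?";
}

int main(void)
{
	/* A bool 'true' silently converts to 1, i.e. SYNC_LIGHT, not SYNC. */
	enum migrate_mode from_bool = (bool)true;

	printf("true becomes %s\n", mode_name(from_bool));
	printf("intended:    %s\n", mode_name(MIGRATE_SYNC));
	return 0;
}

Spelling the enumerator out, as the patch does, makes the call site immune to future reordering of the enum.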
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 		__free_pages_bootmem(pfn_to_page(i), 0);
 }
 
+static unsigned long __init __free_memory_core(phys_addr_t start,
+				 phys_addr_t end)
+{
+	unsigned long start_pfn = PFN_UP(start);
+	unsigned long end_pfn = min_t(unsigned long,
+				      PFN_DOWN(end), max_low_pfn);
+
+	if (start_pfn > end_pfn)
+		return 0;
+
+	__free_pages_memory(start_pfn, end_pfn);
+
+	return end_pfn - start_pfn;
+}
+
 unsigned long __init free_low_memory_core_early(int nodeid)
 {
 	unsigned long count = 0;
-	phys_addr_t start, end;
+	phys_addr_t start, end, size;
 	u64 i;
 
-	/* free reserved array temporarily so that it's treated as free area */
-	memblock_free_reserved_regions();
-
-	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
-		unsigned long start_pfn = PFN_UP(start);
-		unsigned long end_pfn = min_t(unsigned long,
-					      PFN_DOWN(end), max_low_pfn);
-		if (start_pfn < end_pfn) {
-			__free_pages_memory(start_pfn, end_pfn);
-			count += end_pfn - start_pfn;
-		}
-	}
+	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+		count += __free_memory_core(start, end);
+
+	/* free range that is used for reserved array if we allocate it */
+	size = get_allocated_memblock_reserved_regions_info(&start);
+	if (size)
+		count += __free_memory_core(start, start + size);
 
-	/* put region array back? */
-	memblock_reserve_reserved_regions();
 	return count;
 }
131 139
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 	return ___alloc_bootmem(size, align, goal, limit);
 }
 
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 						unsigned long size,
 						unsigned long align,
 						unsigned long goal,
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
 	if (file)
 		fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 			  const nodemask_t *nodemask, unsigned long totalpages)
 {
-	unsigned long points;
+	long points;
+	long adj;
 
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
-	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+	adj = p->signal->oom_score_adj;
+	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
 		return 0;
 	}
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * implementation used by LSMs.
 	 */
 	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-		points -= 30 * totalpages / 1000;
+		adj -= 30;
 
-	/*
-	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
-	 * either completely disable oom killing or always prefer a certain
-	 * task.
-	 */
-	points += p->signal->oom_score_adj * totalpages / 1000;
+	/* Normalize to oom_score_adj units */
+	adj *= totalpages / 1000;
+	points += adj;
 
 	/*
 	 * Never return 0 for an eligible task regardless of the root bonus and
 	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
 	 */
-	return points ? points : 1;
+	return points > 0 ? points : 1;
 }
 
 /*
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
  * @nodemask: nodemask passed to page allocator for mempolicy ooms
  *
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
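The oom_badness() rework keeps points signed so that a strongly negative oom_score_adj can no longer wrap an unsigned value into an enormous score, and it folds the root bonus into adj before normalizing. A runnable sketch of the new arithmetic with hypothetical numbers (not the kernel function itself):

#include <stdio.h>

/* Sketch of the reworked score arithmetic, with made-up inputs. */
static unsigned long badness(long points, long adj, int root,
			     unsigned long totalpages)
{
	if (root)
		adj -= 30;		/* root bonus, now in adj units */
	adj *= totalpages / 1000;	/* normalize to oom_score_adj units */
	points += adj;
	/* signed compare: a large negative adj can no longer wrap around */
	return points > 0 ? points : 1;
}

int main(void)
{
	unsigned long totalpages = 1000000;	/* hypothetical */

	/* Base points of 5000, oom_score_adj -500: heavily shielded -> 1. */
	printf("%lu\n", badness(5000, -500, 0, totalpages));
	/* Same base with adj 0 and the root bonus: still clamped -> 1. */
	printf("%lu\n", badness(5000, 0, 1, totalpages));
	/* adj +100 boosts the score -> 105000. */
	printf("%lu\n", badness(5000, 100, 0, totalpages));
	return 0;
}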
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5635,7 +5635,12 @@ static struct page *
 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
 			     int **resultp)
 {
-	return alloc_page(GFP_HIGHUSER_MOVABLE);
+	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	return alloc_page(gfp_mask);
 }
 
 /* [start, end) must belong to a single zone. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
 
 /**
  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
  * @old: old id
  * @new: new id
  *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 /**
  * swap_cgroup_record - record mem_cgroup for this swp_entry.
  * @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
  *
  * Returns old value at success, 0 at failure.
  * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/swapops.h>
 #include <linux/writeback.h>
+#include <linux/frontswap.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
+	if (frontswap_store(page) == 0) {
+		set_page_writeback(page);
+		unlock_page(page);
+		end_page_writeback(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageUptodate(page));
+	if (frontswap_load(page) == 0) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		goto out;
+	}
 	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
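Both hooks follow the same shape: offer the page to frontswap first and only fall through to block I/O when the store or load is refused. A minimal userspace model of the write-side short-circuit (invented names, no bio or page structures):

#include <stdio.h>

/* Hypothetical stand-ins for the swap write path hook. */
static int frontswap_accepts;	/* toggled for the demo */

static int try_frontswap_store(const char *page)
{
	if (!frontswap_accepts || !page)
		return -1;
	return 0;
}

/* Mirrors swap_writepage(): try the in-memory path, else do real I/O. */
static void write_swap_page(const char *page)
{
	if (try_frontswap_store(page) == 0) {
		/* page is captured in memory; no block I/O issued */
		printf("stored via frontswap\n");
		return;
	}
	printf("submitted bio to swap device\n");
}

int main(void)
{
	write_swap_page("data");	/* backend refuses -> bio path */
	frontswap_accepts = 1;
	write_swap_page("data");	/* backend accepts -> no bio */
	return 0;
}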
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 
 /**
  * walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
  * @addr: starting address
  * @end: ending address
  * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
  * @chunk: chunk to depopulate
  * @off: offset to the area to depopulate
  * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
  *
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
 static int shmem_add_to_page_cache(struct page *page,
 				   struct address_space *mapping,
 				   pgoff_t index, gfp_t gfp, void *expected)
 {
-	int error = 0;
+	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapBacked(page));
 
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = index;
+
+	spin_lock_irq(&mapping->tree_lock);
 	if (!expected)
-		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		error = radix_tree_insert(&mapping->page_tree, index, page);
+	else
+		error = shmem_radix_tree_replace(mapping, index, expected,
+						 page);
 	if (!error) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = index;
-
-		spin_lock_irq(&mapping->tree_lock);
-		if (!expected)
-			error = radix_tree_insert(&mapping->page_tree,
-							index, page);
-		else
-			error = shmem_radix_tree_replace(mapping, index,
-							 expected, page);
-		if (!error) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			__inc_zone_page_state(page, NR_SHMEM);
-			spin_unlock_irq(&mapping->tree_lock);
-		} else {
-			page->mapping = NULL;
-			spin_unlock_irq(&mapping->tree_lock);
-			page_cache_release(page);
-		}
-		if (!expected)
-			radix_tree_preload_end();
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+	} else {
+		page->mapping = NULL;
+		spin_unlock_irq(&mapping->tree_lock);
+		page_cache_release(page);
 	}
-	if (error)
-		mem_cgroup_uncharge_cache_page(page);
 	return error;
 }
 
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	mutex_lock(&shmem_swaplist_mutex);
 	/*
 	 * We needed to drop mutex to make that restrictive page
-	 * allocation; but the inode might already be freed by now,
-	 * and we cannot refer to inode or mapping or info to check.
-	 * However, we do hold page lock on the PageSwapCache page,
-	 * so can check if that still has our reference remaining.
+	 * allocation, but the inode might have been freed while we
+	 * dropped it: although a racing shmem_evict_inode() cannot
+	 * complete without emptying the radix_tree, our page lock
+	 * on this swapcache page is not enough to prevent that -
+	 * free_swap_and_cache() of our swap entry will only
+	 * trylock_page(), removing swap from radix_tree whatever.
+	 *
+	 * We must not proceed to shmem_add_to_page_cache() if the
+	 * inode has been freed, but of course we cannot rely on
+	 * inode or mapping or info to check that.  However, we can
+	 * safely check if our swap entry is still in use (and here
+	 * it can't have got reused for another page): if it's still
+	 * in use, then the inode cannot have been freed yet, and we
+	 * can safely proceed (if it's no longer in use, that tells
+	 * nothing about the inode, but we don't need to unuse swap).
 	 */
 	if (!page_swapcount(*pagep))
 		error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 
 	/*
 	 * There's a faint possibility that swap page was replaced before
-	 * caller locked it: it will come back later with the right page.
+	 * caller locked it: caller will come back later with the right page.
 	 */
-	if (unlikely(!PageSwapCache(page)))
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
 		goto out;
 
 	/*
@@ -995,21 +1015,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	newpage = shmem_alloc_page(gfp, info, index);
 	if (!newpage)
 		return -ENOMEM;
-	VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
 
-	*pagep = newpage;
 	page_cache_get(newpage);
 	copy_highpage(newpage, oldpage);
+	flush_dcache_page(newpage);
 
-	VM_BUG_ON(!PageLocked(oldpage));
 	__set_page_locked(newpage);
-	VM_BUG_ON(!PageUptodate(oldpage));
 	SetPageUptodate(newpage);
-	VM_BUG_ON(!PageSwapBacked(oldpage));
 	SetPageSwapBacked(newpage);
-	VM_BUG_ON(!swap_index);
 	set_page_private(newpage, swap_index);
-	VM_BUG_ON(!PageSwapCache(oldpage));
 	SetPageSwapCache(newpage);
 
 	/*
@@ -1019,13 +1033,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	spin_lock_irq(&swap_mapping->tree_lock);
 	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
 								   newpage);
-	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	if (!error) {
+		__inc_zone_page_state(newpage, NR_FILE_PAGES);
+		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
+	}
 	spin_unlock_irq(&swap_mapping->tree_lock);
-	BUG_ON(error);
 
-	mem_cgroup_replace_page_cache(oldpage, newpage);
-	lru_cache_add_anon(newpage);
+	if (unlikely(error)) {
+		/*
+		 * Is this possible?  I think not, now that our callers check
+		 * both PageSwapCache and page_private after getting page lock;
+		 * but be defensive.  Reverse old to newpage for clear and free.
+		 */
+		oldpage = newpage;
+	} else {
+		mem_cgroup_replace_page_cache(oldpage, newpage);
+		lru_cache_add_anon(newpage);
+		*pagep = newpage;
+	}
 
 	ClearPageSwapCache(oldpage);
 	set_page_private(oldpage, 0);
@@ -1033,7 +1058,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	unlock_page(oldpage);
 	page_cache_release(oldpage);
 	page_cache_release(oldpage);
-	return 0;
+	return error;
 }
 
 /*
@@ -1107,9 +1132,10 @@ repeat:
 
 		/* We have to do this with page locked to prevent races */
 		lock_page(page);
-		if (!PageSwapCache(page) || page->mapping) {
+		if (!PageSwapCache(page) || page_private(page) != swap.val ||
+		    !shmem_confirm_swap(mapping, index, swap)) {
 			error = -EEXIST;	/* try again */
-			goto failed;
+			goto unlock;
 		}
 		if (!PageUptodate(page)) {
 			error = -EIO;
@@ -1125,9 +1151,12 @@ repeat:
 
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
+		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
+			/* We already confirmed swap, and make no allocation */
+			VM_BUG_ON(error);
+		}
 		if (error)
 			goto failed;
 
@@ -1164,11 +1193,18 @@ repeat:
 		__set_page_locked(page);
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
-		if (!error)
-			error = shmem_add_to_page_cache(page, mapping, index,
-						gfp, NULL);
 		if (error)
 			goto decused;
+		error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+		if (!error) {
+			error = shmem_add_to_page_cache(page, mapping, index,
+							gfp, NULL);
+			radix_tree_preload_end();
+		}
+		if (error) {
+			mem_cgroup_uncharge_cache_page(page);
+			goto decused;
+		}
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -1228,14 +1264,10 @@ decused:
 unacct:
 	shmem_unacct_blocks(info->flags, 1);
 failed:
-	if (swap.val && error != -EINVAL) {
-		struct page *test = find_get_page(mapping, index);
-		if (test && !radix_tree_exceptional_entry(test))
-			page_cache_release(test);
-		/* Have another try if the entry has changed */
-		if (test != swp_to_radix_entry(swap))
-			error = -EEXIST;
-	}
+	if (swap.val && error != -EINVAL &&
+	    !shmem_confirm_swap(mapping, index, swap))
+		error = -EEXIST;
+unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
@@ -1247,7 +1279,7 @@ failed:
 		spin_unlock(&info->lock);
 		goto repeat;
 	}
-	if (error == -EEXIST)
+	if (error == -EEXIST)	/* from above or from radix_tree_insert */
 		goto repeat;
 	return error;
 }
@@ -1675,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	return error;
 }
 
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-					pgoff_t index, pgoff_t end, int origin)
-{
-	struct page *page;
-	struct pagevec pvec;
-	pgoff_t indices[PAGEVEC_SIZE];
-	bool done = false;
-	int i;
-
-	pagevec_init(&pvec, 0);
-	pvec.nr = 1;		/* start small: we may be there already */
-	while (!done) {
-		pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-					pvec.nr, pvec.pages, indices);
-		if (!pvec.nr) {
-			if (origin == SEEK_DATA)
-				index = end;
-			break;
-		}
-		for (i = 0; i < pvec.nr; i++, index++) {
-			if (index < indices[i]) {
-				if (origin == SEEK_HOLE) {
-					done = true;
-					break;
-				}
-				index = indices[i];
-			}
-			page = pvec.pages[i];
-			if (page && !radix_tree_exceptional_entry(page)) {
-				if (!PageUptodate(page))
-					page = NULL;
-			}
-			if (index >= end ||
-			    (page && origin == SEEK_DATA) ||
-			    (!page && origin == SEEK_HOLE)) {
-				done = true;
-				break;
-			}
-		}
-		shmem_deswap_pagevec(&pvec);
-		pagevec_release(&pvec);
-		pvec.nr = PAGEVEC_SIZE;
-		cond_resched();
-	}
-	return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
-	struct address_space *mapping;
-	struct inode *inode;
-	pgoff_t start, end;
-	loff_t new_offset;
-
-	if (origin != SEEK_DATA && origin != SEEK_HOLE)
-		return generic_file_llseek_size(file, offset, origin,
-							MAX_LFS_FILESIZE);
-	mapping = file->f_mapping;
-	inode = mapping->host;
-	mutex_lock(&inode->i_mutex);
-	/* We're holding i_mutex so we can access i_size directly */
-
-	if (offset < 0)
-		offset = -EINVAL;
-	else if (offset >= inode->i_size)
-		offset = -ENXIO;
-	else {
-		start = offset >> PAGE_CACHE_SHIFT;
-		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-		new_offset = shmem_seek_hole_data(mapping, start, end, origin);
-		new_offset <<= PAGE_CACHE_SHIFT;
-		if (new_offset > offset) {
-			if (new_offset < inode->i_size)
-				offset = new_offset;
-			else if (origin == SEEK_DATA)
-				offset = -ENXIO;
-			else
-				offset = inode->i_size;
-		}
-	}
-
-	if (offset >= 0 && offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
-	mutex_unlock(&inode->i_mutex);
-	return offset;
-}
-
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 							 loff_t len)
 {
@@ -2770,7 +2710,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
 #ifdef CONFIG_TMPFS
-	.llseek		= shmem_file_llseek,
+	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= shmem_file_aio_read,
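The heart of the shmem changes is shmem_confirm_swap(): rather than inferring state from page flags, re-read the authoritative radix_tree slot and compare it against the exact swap entry you expect, retrying on mismatch. A standalone model of that confirm-then-proceed check, using a plain array and a tag bit in place of the radix_tree's exceptional entries (names invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSLOTS 16

/* Slot table standing in for the mapping's radix_tree. */
static void *slots[NSLOTS];

static void *swp_to_entry(unsigned long swap)
{
	/* low bit set marks "exceptional" (non-page) entries, as in shmem */
	return (void *)(uintptr_t)((swap << 1) | 1);
}

/* Re-check that slot 'index' still holds exactly the swap we expect. */
static bool confirm_swap(unsigned long index, unsigned long swap)
{
	return slots[index] == swp_to_entry(swap);
}

int main(void)
{
	slots[3] = swp_to_entry(42);
	printf("%d\n", confirm_swap(3, 42));	/* 1: safe to proceed */

	/* A racing thread brings the page back and reuses the slot. */
	slots[3] = swp_to_entry(99);
	printf("%d\n", confirm_swap(3, 42));	/* 0: caller must retry */
	return 0;
}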
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e85..c7bb952400c8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -275,8 +275,9 @@ static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
-	pg_data_t *host_pgdat;
-	unsigned long goal;
+	unsigned long goal, limit;
+	unsigned long *p;
+	int nid;
 	/*
 	 * A page may contain usemaps for other sections preventing the
 	 * page being freed and making a section unremovable while
@@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 288 * from the same section as the pgdat where possible to avoid
288 * this problem. 289 * this problem.
289 */ 290 */
290 goal = __pa(pgdat) & PAGE_SECTION_MASK; 291 goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
291 host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); 292 limit = goal + (1UL << PA_SECTION_SHIFT);
292 return __alloc_bootmem_node_nopanic(host_pgdat, size, 293 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
293 SMP_CACHE_BYTES, goal); 294again:
295 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
296 SMP_CACHE_BYTES, goal, limit);
297 if (!p && limit) {
298 limit = 0;
299 goto again;
300 }
301 return p;
294} 302}
295 303
296static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 304static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
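
Two fixes land in this sparse.c hunk: PAGE_SECTION_MASK is a mask on page frame numbers, so it must be shifted left by PAGE_SHIFT before it can mask the physical address returned by __pa(); and the bootmem allocation now retries with limit = 0, i.e. falls back to an unconstrained node allocation when nothing is free inside the pgdat's own section. A worked sketch of the masking, assuming the common x86_64 value SECTION_SIZE_BITS = 27 (128 MiB sections) and an invented address:

/* Assumed: PA_SECTION_SHIFT = 27; the physical address is made up.   */
unsigned long pa    = 0x123456789UL;           /* hypothetical __pa(pgdat)   */
unsigned long goal  = pa & ~((1UL << 27) - 1); /* 0x120000000: section start */
unsigned long limit = goal + (1UL << 27);      /* 0x128000000: section end   */
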
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
34 36
35#include <asm/pgtable.h> 37#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 44static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 45static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 46
45static DEFINE_SPINLOCK(swap_lock); 47DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 48static unsigned int nr_swapfiles;
47long nr_swap_pages; 49long nr_swap_pages;
48long total_swap_pages; 50long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 55static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 56static const char Unused_offset[] = "Unused swap offset entry ";
55 57
56static struct swap_list_t swap_list = {-1, -1}; 58struct swap_list_t swap_list = {-1, -1};
57 59
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 60struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 61
60static DEFINE_MUTEX(swapon_mutex); 62static DEFINE_MUTEX(swapon_mutex);
61 63
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset);
559 if ((p->flags & SWP_BLKDEV) && 562 if ((p->flags & SWP_BLKDEV) &&
560 disk->fops->swap_slot_free_notify) 563 disk->fops->swap_slot_free_notify)
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 disk->fops->swap_slot_free_notify(p->bdev, offset);
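
The frontswap_invalidate_page() hook added here keeps the out-of-kernel copy coherent: once a swap slot is freed, whatever the transcendent-memory backend holds for that slot is stale. A hedged paraphrase of what the mm/frontswap.c side of this call does (helper names approximate, not verified against this tree):

/* Sketch: tell the backend to drop a freed slot, then clear its bit. */
static void invalidate_page_sketch(unsigned type, pgoff_t offset)
{
        struct swap_info_struct *sis = swap_info[type];

        if (frontswap_test(sis, offset)) {
                frontswap_ops.invalidate_page(type, offset); /* backend drop */
                frontswap_clear(sis, offset);                /* bitmap clear */
        }
}
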
@@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm,
985} 988}
986 989
987/* 990/*
988 * Scan swap_map from current position to next entry still in use. 991 * Scan swap_map (or frontswap_map if frontswap parameter is true)
992 * from current position to next entry still in use.
989 * Recycle to start on reaching the end, returning 0 when empty. 993 * Recycle to start on reaching the end, returning 0 when empty.
990 */ 994 */
991static unsigned int find_next_to_unuse(struct swap_info_struct *si, 995static unsigned int find_next_to_unuse(struct swap_info_struct *si,
992 unsigned int prev) 996 unsigned int prev, bool frontswap)
993{ 997{
994 unsigned int max = si->max; 998 unsigned int max = si->max;
995 unsigned int i = prev; 999 unsigned int i = prev;
@@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1015 prev = 0; 1019 prev = 0;
1016 i = 1; 1020 i = 1;
1017 } 1021 }
1022 if (frontswap) {
1023 if (frontswap_test(si, i))
1024 break;
1025 else
1026 continue;
1027 }
1018 count = si->swap_map[i]; 1028 count = si->swap_map[i];
1019 if (count && swap_count(count) != SWAP_MAP_BAD) 1029 if (count && swap_count(count) != SWAP_MAP_BAD)
1020 break; 1030 break;
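
The frontswap_test() consulted in this scan is essentially a test_bit() against the bit-per-page map that swapon allocates below; a minimal sketch of the idea, assuming the frontswap_map_get() accessor seen elsewhere in this diff:

/* Sketch: a set bit means "this swap slot's page lives in frontswap". */
static inline bool frontswap_test_sketch(struct swap_info_struct *sis,
                                         pgoff_t offset)
{
        unsigned long *map = frontswap_map_get(sis);

        return map && test_bit(offset, map);
}
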
@@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1026 * We completely avoid races by reading each swap page in advance, 1036 * We completely avoid races by reading each swap page in advance,
1027 * and then search for the process using it. All the necessary 1037 * and then search for the process using it. All the necessary
1028 * page table adjustments can then be made atomically. 1038 * page table adjustments can then be made atomically.
1039 *
1040 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1041 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1029 */ 1042 */
1030static int try_to_unuse(unsigned int type) 1043int try_to_unuse(unsigned int type, bool frontswap,
1044 unsigned long pages_to_unuse)
1031{ 1045{
1032 struct swap_info_struct *si = swap_info[type]; 1046 struct swap_info_struct *si = swap_info[type];
1033 struct mm_struct *start_mm; 1047 struct mm_struct *start_mm;
@@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type)
1060 * one pass through swap_map is enough, but not necessarily: 1074 * one pass through swap_map is enough, but not necessarily:
1061 * there are races when an instance of an entry might be missed. 1075 * there are races when an instance of an entry might be missed.
1062 */ 1076 */
1063 while ((i = find_next_to_unuse(si, i)) != 0) { 1077 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1064 if (signal_pending(current)) { 1078 if (signal_pending(current)) {
1065 retval = -EINTR; 1079 retval = -EINTR;
1066 break; 1080 break;
@@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type)
1227 * interactive performance. 1241 * interactive performance.
1228 */ 1242 */
1229 cond_resched(); 1243 cond_resched();
1244 if (frontswap && pages_to_unuse > 0) {
1245 if (!--pages_to_unuse)
1246 break;
1247 }
1230 } 1248 }
1231 1249
1232 mmput(start_mm); 1250 mmput(start_mm);
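
The two new parameters exist so frontswap can reclaim a bounded number of its pages without a full swapoff: frontswap = true restricts the scan to slots whose frontswap bit is set, and pages_to_unuse caps how many get pulled back in (0 meaning all of them). A hedged sketch of such a caller; the real one is frontswap_shrink() in mm/frontswap.c:

/* Sketch: bring at most 'nr' frontswap-resident pages of swap device
 * 'type' back into memory; nr == 0 drains every such page. */
static int unuse_frontswap_pages(unsigned int type, unsigned long nr)
{
        return try_to_unuse(type, true, nr);
}
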
@@ -1486,7 +1504,8 @@ bad_bmap:
1486} 1504}
1487 1505
1488static void enable_swap_info(struct swap_info_struct *p, int prio, 1506static void enable_swap_info(struct swap_info_struct *p, int prio,
1489 unsigned char *swap_map) 1507 unsigned char *swap_map,
1508 unsigned long *frontswap_map)
1490{ 1509{
1491 int i, prev; 1510 int i, prev;
1492 1511
@@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1496 else 1515 else
1497 p->prio = --least_priority; 1516 p->prio = --least_priority;
1498 p->swap_map = swap_map; 1517 p->swap_map = swap_map;
1518 frontswap_map_set(p, frontswap_map);
1499 p->flags |= SWP_WRITEOK; 1519 p->flags |= SWP_WRITEOK;
1500 nr_swap_pages += p->pages; 1520 nr_swap_pages += p->pages;
1501 total_swap_pages += p->pages; 1521 total_swap_pages += p->pages;
@@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1512 swap_list.head = swap_list.next = p->type; 1532 swap_list.head = swap_list.next = p->type;
1513 else 1533 else
1514 swap_info[prev]->next = p->type; 1534 swap_info[prev]->next = p->type;
1535 frontswap_init(p->type);
1515 spin_unlock(&swap_lock); 1536 spin_unlock(&swap_lock);
1516} 1537}
1517 1538
@@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1585 spin_unlock(&swap_lock); 1606 spin_unlock(&swap_lock);
1586 1607
1587 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1608 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1588 err = try_to_unuse(type); 1609 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1589 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1610 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1590 1611
1591 if (err) { 1612 if (err) {
@@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1596 * sys_swapoff for this swap_info_struct at this point. 1617 * sys_swapoff for this swap_info_struct at this point.
1597 */ 1618 */
1598 /* re-insert swap space back into swap_list */ 1619 /* re-insert swap space back into swap_list */
1599 enable_swap_info(p, p->prio, p->swap_map); 1620 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1600 goto out_dput; 1621 goto out_dput;
1601 } 1622 }
1602 1623
@@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1622 swap_map = p->swap_map; 1643 swap_map = p->swap_map;
1623 p->swap_map = NULL; 1644 p->swap_map = NULL;
1624 p->flags = 0; 1645 p->flags = 0;
1646 frontswap_invalidate_area(type);
1625 spin_unlock(&swap_lock); 1647 spin_unlock(&swap_lock);
1626 mutex_unlock(&swapon_mutex); 1648 mutex_unlock(&swapon_mutex);
1627 vfree(swap_map); 1649 vfree(swap_map);
1650 vfree(frontswap_map_get(p));
1628 /* Destroy swap account information */ 1651 /* Destroy swap account information */
1629 swap_cgroup_swapoff(type); 1652 swap_cgroup_swapoff(type);
1630 1653
@@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1893 1916
1894 /* 1917 /*
1895 * Find out how many pages are allowed for a single swap 1918 * Find out how many pages are allowed for a single swap
1896 * device. There are three limiting factors: 1) the number 1919 * device. There are two limiting factors: 1) the number
1897 * of bits for the swap offset in the swp_entry_t type, and 1920 * of bits for the swap offset in the swp_entry_t type, and
1898 * 2) the number of bits in the swap pte as defined by the 1921 * 2) the number of bits in the swap pte as defined by the
1899 * the different architectures, and 3) the number of free bits 1922 * different architectures. In order to find the
1900 * in an exceptional radix_tree entry. In order to find the
1901 * largest possible bit mask, a swap entry with swap type 0 1923 * largest possible bit mask, a swap entry with swap type 0
1902 * and swap offset ~0UL is created, encoded to a swap pte, 1924 * and swap offset ~0UL is created, encoded to a swap pte,
1903 * decoded to a swp_entry_t again, and finally the swap 1925 * decoded to a swp_entry_t again, and finally the swap
1904 * offset is extracted. This will mask all the bits from 1926 * offset is extracted. This will mask all the bits from
1905 * the initial ~0UL mask that can't be encoded in either 1927 * the initial ~0UL mask that can't be encoded in either
1906 * the swp_entry_t or the architecture definition of a 1928 * the swp_entry_t or the architecture definition of a
1907 * swap pte. Then the same is done for a radix_tree entry. 1929 * swap pte.
1908 */ 1930 */
1909 maxpages = swp_offset(pte_to_swp_entry( 1931 maxpages = swp_offset(pte_to_swp_entry(
1910 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1932 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1911 maxpages = swp_offset(radix_to_swp_entry(
1912 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1913
1914 if (maxpages > swap_header->info.last_page) { 1933 if (maxpages > swap_header->info.last_page) {
1915 maxpages = swap_header->info.last_page + 1; 1934 maxpages = swap_header->info.last_page + 1;
1916 /* p->max is an unsigned int: don't overflow it */ 1935 /* p->max is an unsigned int: don't overflow it */
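
With the exceptional-radix-tree factor gone, only two bit widths remain, and the encode/decode round trip is simply a way of intersecting the two masks without hard-coding either width. Schematically, with invented widths of 58 offset bits in swp_entry_t and 52 in the architecture's swap pte:

unsigned long e   = ~0UL;                     /* swap offset ~0UL, type 0    */
unsigned long pte = e   & ((1UL << 52) - 1);  /* encode: pte keeps 52 bits   */
unsigned long off = pte & ((1UL << 58) - 1);  /* decode: entry keeps 58 bits */
unsigned long maxpages = off + 1;             /* == 1UL << 52, the minimum   */
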
@@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1988 sector_t span; 2007 sector_t span;
1989 unsigned long maxpages; 2008 unsigned long maxpages;
1990 unsigned char *swap_map = NULL; 2009 unsigned char *swap_map = NULL;
2010 unsigned long *frontswap_map = NULL;
1991 struct page *page = NULL; 2011 struct page *page = NULL;
1992 struct inode *inode = NULL; 2012 struct inode *inode = NULL;
1993 2013
@@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2071 error = nr_extents; 2091 error = nr_extents;
2072 goto bad_swap; 2092 goto bad_swap;
2073 } 2093 }
2094 /* frontswap enabled? set up bit-per-page map for frontswap */
2095 if (frontswap_enabled)
2096 frontswap_map = vzalloc(maxpages / sizeof(long));
2074 2097
2075 if (p->bdev) { 2098 if (p->bdev) {
2076 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2099 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
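
A sizing note on the new allocation: vzalloc() takes a byte count, and maxpages / sizeof(long) bytes equals one bit per page only when sizeof(long) == 8 and maxpages is a multiple of 8 (on a 32-bit build it allocates twice the bits needed). The exact, width-independent idiom for sizing a kernel bitmap is BITS_TO_LONGS(); a sketch of that alternative:

#include <linux/bitops.h>
#include <linux/vmalloc.h>

/* Sketch: enough zeroed longs to hold one bit for each of 'maxpages'. */
static unsigned long *alloc_frontswap_map(unsigned long maxpages)
{
        return vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
}
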
@@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2086 if (swap_flags & SWAP_FLAG_PREFER) 2109 if (swap_flags & SWAP_FLAG_PREFER)
2087 prio = 2110 prio =
2088 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2111 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2089 enable_swap_info(p, prio, swap_map); 2112 enable_swap_info(p, prio, swap_map, frontswap_map);
2090 2113
2091 printk(KERN_INFO "Adding %uk swap on %s. " 2114 printk(KERN_INFO "Adding %uk swap on %s. "
2092 "Priority:%d extents:%d across:%lluk %s%s\n", 2115 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2093 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2116 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2094 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2117 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2095 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2118 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2096 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2119 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2120 (frontswap_map) ? "FS" : "");
2097 2121
2098 mutex_unlock(&swapon_mutex); 2122 mutex_unlock(&swapon_mutex);
2099 atomic_inc(&proc_poll_event); 2123 atomic_inc(&proc_poll_event);
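
For reference, with the third %s in place a device carrying all three flags would log a line of the form (values invented for illustration):

  Adding 4194300k swap on /dev/sdb1. Priority:-1 extents:1 across:4194300k SSDFS
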
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d36..66e431060c05 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2688,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2688 * them before going back to sleep. 2688 * them before going back to sleep.
2689 */ 2689 */
2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2691 schedule(); 2691
2692 if (!kthread_should_stop())
2693 schedule();
2694
2692 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2695 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2693 } else { 2696 } else {
2694 if (remaining) 2697 if (remaining)
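
The added check is the standard stop-aware sleep idiom: kthread_stop() sets the stop flag and then wakes the thread, so re-testing kthread_should_stop() after setting the sleep state and before schedule() guarantees a concurrent stop request cannot be slept through. The generic shape of the pattern, as a sketch:

#include <linux/kthread.h>
#include <linux/sched.h>

static int worker_fn(void *data)
{
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!kthread_should_stop())     /* re-check before sleeping */
                        schedule();
                __set_current_state(TASK_RUNNING);
                /* ... one unit of work per wakeup ... */
        }
        return 0;
}
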
@@ -2955,14 +2958,17 @@ int kswapd_run(int nid)
2955} 2958}
2956 2959
2957/* 2960/*
2958 * Called by memory hotplug when all memory in a node is offlined. 2961 * Called by memory hotplug when all memory in a node is offlined. Caller must
2962 * hold lock_memory_hotplug().
2959 */ 2963 */
2960void kswapd_stop(int nid) 2964void kswapd_stop(int nid)
2961{ 2965{
2962 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 2966 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2963 2967
2964 if (kswapd) 2968 if (kswapd) {
2965 kthread_stop(kswapd); 2969 kthread_stop(kswapd);
2970 NODE_DATA(nid)->kswapd = NULL;
2971 }
2966} 2972}
2967 2973
2968static int __init kswapd_init(void) 2974static int __init kswapd_init(void)
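
The NODE_DATA(nid)->kswapd = NULL added to kswapd_stop() matters on the next online: kswapd_run() treats a non-NULL per-node pointer as an already-running daemon, so a stale pointer would both suppress the restart and hand a freed task to any later kthread_stop(). A hedged sketch of the run-side guard this pairs with (error handling abbreviated):

static int kswapd_run_sketch(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);

        if (pgdat->kswapd)              /* a stale pointer short-circuits here */
                return 0;
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                pgdat->kswapd = NULL;   /* keep the invariant on failure */
                return -ENOMEM;
        }
        return 0;
}
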