Merge tag 'v3.5'

Linux 3.5

* tag 'v3.5': (1242 commits)
  Linux 3.5
  Remove SYSTEM_SUSPEND_DISK system state
  kdb: Switch to nolock variants of kmsg_dump functions
  printk: Implement some unlocked kmsg_dump functions
  printk: Remove kdb_syslog_data
  kdb: Revive dmesg command
  dm raid1: set discard_zeroes_data_unsupported
  dm thin: do not send discards to shared blocks
  dm raid1: fix crash with mirror recovery and discard
  pnfs-obj: Fix __r4w_get_page when offset is beyond i_size
  pnfs-obj: don't leak objio_state if ore_write/read fails
  ore: Unlock r4w pages in exact reverse order of locking
  ore: Remove support of partial IO request (NFS crash)
  ore: Fix NFS crash by supporting any unaligned RAID IO
  UBIFS: fix a bug in empty space fix-up
  cx25821: Remove bad strcpy to read-only char*
  HID: hid-multitouch: add support for Zytronic panels
  MIPS: PCI: Move fixups from __init to __devinit.
  MIPS: Fix bug.h MIPS build regression
  MIPS: sync-r4k: remove redundant irq operation
  ...
author: Mauro Carvalho Chehab <mchehab@redhat.com> 2012-07-29 20:09:39 -0400
committer: Mauro Carvalho Chehab <mchehab@redhat.com> 2012-07-29 20:09:39 -0400
commit: 73bcc49959e4e40911dd0dd634bf1b353827df66 (patch)
tree: 6b0c1d440c490a65c51ab5cf5aee7095cb4089d3 /mm
parent: 8447c4d15e357a458c9051ddc84aa6c8b9c27000 (diff)
parent: 28a33cbc24e4256c143dce96c7d93bf423229f92 (diff)
18 files changed, 244 insertions, 232 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
        return ___alloc_bootmem(size, align, goal, limit);
 }
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                unsigned long size, unsigned long align,
                                unsigned long goal, unsigned long limit)
 {
@@ -710,6 +710,10 @@ again:
        if (ptr)
                return ptr;
+        /* do not panic in alloc_bootmem_bdata() */
+        if (limit && goal + size > limit)
+                limit = 0;
        ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
        if (ptr)
                return ptr;
diff --git a/mm/compaction.c b/mm/compaction.c
index 7ea259d82a99..2f42d9528539 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -701,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                if (err) {
                        putback_lru_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
+                        if (err == -ENOMEM) {
+                                ret = COMPACT_PARTIAL;
+                                goto out;
+                        }
                }
        }
 out:
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
+#include <linux/file.h>
 /*
 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 {
        loff_t offset;
        int error;
+        struct file *f;
        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
        if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                return -EINVAL;
-        if (!vma->vm_file || !vma->vm_file->f_mapping
+        f = vma->vm_file;
-                || !vma->vm_file->f_mapping->host) {
+        if (!f || !f->f_mapping || !f->f_mapping->host) {
                        return -EINVAL;
        }
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-        /* filesystem's fallocate may need to take i_mutex */
+        /*
+         * Filesystem's fallocate may need to take i_mutex.  We need to
+         * explicitly grab a reference because the vma (and hence the
+         * vma's reference to the file) can go away as soon as we drop
+         * mmap_sem.
+         */
+        get_file(f);
        up_read(&current->mm->mmap_sem);
-        error = do_fallocate(vma->vm_file,
+        error = do_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
+        fput(f);
        down_read(&current->mm->mmap_sem);
        return error;
 }
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..5cc6731b00cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
                                           MAX_NUMNODES);
 }
-/*
- * Free memblock.reserved.regions
- */
-int __init_memblock memblock_free_reserved_regions(void)
-{
-        if (memblock.reserved.regions == memblock_reserved_init_regions)
-                return 0;
-        return memblock_free(__pa(memblock.reserved.regions),
-                 sizeof(struct memblock_region) * memblock.reserved.max);
-}
-/*
- * Reserve memblock.reserved.regions
- */
-int __init_memblock memblock_reserve_reserved_regions(void)
-{
-        if (memblock.reserved.regions == memblock_reserved_init_regions)
-                return 0;
-        return memblock_reserve(__pa(memblock.reserved.regions),
-                 sizeof(struct memblock_region) * memblock.reserved.max);
-}
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
 {
        type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
        }
 }
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+                                        phys_addr_t *addr)
+{
+        if (memblock.reserved.regions == memblock_reserved_init_regions)
+                return 0;
+        *addr = __pa(memblock.reserved.regions);
+        return PAGE_ALIGN(sizeof(struct memblock_region) *
+                          memblock.reserved.max);
+}
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+                                                phys_addr_t new_area_start,
+                                                phys_addr_t new_area_size)
 {
        struct memblock_region *new_array, *old_array;
+        phys_addr_t old_alloc_size, new_alloc_size;
        phys_addr_t old_size, new_size, addr;
        int use_slab = slab_is_available();
        int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
        /* Calculate new doubled size */
        old_size = type->max * sizeof(struct memblock_region);
        new_size = old_size << 1;
+        /*
+         * We need to allocated new one align to PAGE_SIZE,
+         *   so we can free them completely later.
+         */
+        old_alloc_size = PAGE_ALIGN(old_size);
+        new_alloc_size = PAGE_ALIGN(new_size);
        /* Retrieve the slab flag */
        if (type == &memblock.memory)
@@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
                new_array = kmalloc(new_size, GFP_KERNEL);
                addr = new_array ? __pa(new_array) : 0;
        } else {
-                addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+                /* only exclude range when trying to double reserved.regions */
+                if (type != &memblock.reserved)
+                        new_area_start = new_area_size = 0;
+                addr = memblock_find_in_range(new_area_start + new_area_size,
+                                                memblock.current_limit,
+                                                new_alloc_size, PAGE_SIZE);
+                if (!addr && new_area_size)
+                        addr = memblock_find_in_range(0,
+                                        min(new_area_start, memblock.current_limit),
+                                        new_alloc_size, PAGE_SIZE);
                new_array = addr ? __va(addr) : 0;
        }
        if (!addr) {
@@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
                kfree(old_array);
        else if (old_array != memblock_memory_init_regions &&
                 old_array != memblock_reserved_init_regions)
-                memblock_free(__pa(old_array), old_size);
+                memblock_free(__pa(old_array), old_alloc_size);
        /* Reserve the new array if that comes from the memblock.
         * Otherwise, we needn't do it
         */
        if (!use_slab)
-                BUG_ON(memblock_reserve(addr, new_size));
+                BUG_ON(memblock_reserve(addr, new_alloc_size));
        /* Update slab flag */
        *in_slab = use_slab;
@@ -399,7 +422,7 @@ repeat:
         */
        if (!insert) {
                while (type->cnt + nr_new > type->max)
-                        if (memblock_double_array(type) < 0)
+                        if (memblock_double_array(type, obase, size) < 0)
                                return -ENOMEM;
                insert = true;
                goto repeat;
@@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
        /* we'll create at most two more regions */
        while (type->cnt + 2 > type->max)
-                if (memblock_double_array(type) < 0)
+                if (memblock_double_array(type, base, size) < 0)
                        return -ENOMEM;
        for (i = 0; i < type->cnt; i++) {
@@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 * __next_free_mem_range - next function for for_each_free_mem_range()
 * @idx: pointer to u64 loop variable
 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
 *
 * Find the first free area from *@idx which matches @nid, fill the out
 * parameters, and update *@idx for the next iteration.  The lower 32bit of
@@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
 * @idx: pointer to u64 loop variable
 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
 *
 * Reverse of __next_free_mem_range().
 */
@@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
        return memblock_search(&memblock.memory, addr) != -1;
 }
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
        int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
                 memblock.memory.regions[idx].size) >= end;
 }
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
        memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 {
        if (root_memcg == memcg)
                return true;
-        if (!root_memcg->use_hierarchy)
+        if (!root_memcg->use_hierarchy || !memcg)
                return false;
        return css_is_ancestor(&memcg->css, &root_memcg->css);
 }
@@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 /**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
@@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 /**
 * test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap : specify true here if the user wants flle only information.
 *
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
-                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+                                if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+                                        pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+                                                __func__, addr, end,
+                                                vma->vm_start,
+                                                vma->vm_end);
+                                        BUG();
+                                }
+#endif
                                split_huge_page_pmd(vma->vm_mm, pmd);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
@@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb,
 /**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..427bb291dd0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
                pgdat = hotadd_new_pgdat(nid, start);
                ret = -ENOMEM;
                if (!pgdat)
-                        goto out;
+                        goto error;
                new_pgdat = 1;
        }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (!list_empty(&pagelist)) {
                        nr_failed = migrate_pages(&pagelist, new_vma_page,
                                                (unsigned long)vma,
-                                                false, true);
+                                                false, MIGRATE_SYNC);
                        if (nr_failed)
                                putback_lru_pages(&pagelist);
                }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
                __free_pages_bootmem(pfn_to_page(i), 0);
 }
+static unsigned long __init __free_memory_core(phys_addr_t start,
+                                 phys_addr_t end)
+{
+        unsigned long start_pfn = PFN_UP(start);
+        unsigned long end_pfn = min_t(unsigned long,
+                                      PFN_DOWN(end), max_low_pfn);
+        if (start_pfn > end_pfn)
+                return 0;
+        __free_pages_memory(start_pfn, end_pfn);
+        return end_pfn - start_pfn;
+}
 unsigned long __init free_low_memory_core_early(int nodeid)
 {
        unsigned long count = 0;
-        phys_addr_t start, end;
+        phys_addr_t start, end, size;
        u64 i;
-        /* free reserved array temporarily so that it's treated as free area */
+        for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
-        memblock_free_reserved_regions();
+                count += __free_memory_core(start, end);
-        for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+        /* free range that is used for reserved array if we allocate it */
-                unsigned long start_pfn = PFN_UP(start);
+        size = get_allocated_memblock_reserved_regions_info(&start);
-                unsigned long end_pfn = min_t(unsigned long,
+        if (size)
-                                              PFN_DOWN(end), max_low_pfn);
+                count += __free_memory_core(start, start + size);
-                if (start_pfn < end_pfn) {
-                        __free_pages_memory(start_pfn, end_pfn);
-                        count += end_pfn - start_pfn;
-                }
-        }
-        /* put region array back? */
-        memblock_reserve_reserved_regions();
        return count;
 }
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
        return ___alloc_bootmem(size, align, goal, limit);
 }
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                                   unsigned long size,
                                                   unsigned long align,
                                                   unsigned long goal,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 416637f0e924..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -184,6 +184,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
                          const nodemask_t *nodemask, unsigned long totalpages)
 {
        long points;
+        long adj;
        if (oom_unkillable_task(p, memcg, nodemask))
                return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
        if (!p)
                return 0;
-        if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+        adj = p->signal->oom_score_adj;
+        if (adj == OOM_SCORE_ADJ_MIN) {
                task_unlock(p);
                return 0;
        }
@@ -210,14 +212,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
         * implementation used by LSMs.
         */
        if (has_capability_noaudit(p, CAP_SYS_ADMIN))
-                points -= 30 * totalpages / 1000;
+                adj -= 30;
-        /*
+        /* Normalize to oom_score_adj units */
-         * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+        adj *= totalpages / 1000;
-         * either completely disable oom killing or always prefer a certain
+        points += adj;
-         * task.
-         */
-        points += p->signal->oom_score_adj * totalpages / 1000;
        /*
         * Never return 0 for an eligible task regardless of the root bonus and
@@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 /**
 * dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks.  Tasks not in the same
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5635,7 +5635,12 @@ static struct page *
 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
                             int **resultp)
 {
-        return alloc_page(GFP_HIGHUSER_MOVABLE);
+        gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+        if (PageHighMem(page))
+                gfp_mask |= __GFP_HIGHMEM;
+        return alloc_page(gfp_mask);
 }
 /* [start, end) must belong to a single zone. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
 /**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 /**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
 *
 * Returns old value at success, 0 at failure.
 * (Of course, old value can be 0.)
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
 /**
 * walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
 * @addr: starting address
 * @end: ending address
 * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
 * @chunk: chunk to depopulate
 * @off: offset to the area to depopulate
 * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.  If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index a15a466d0d1d..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
 }
 /*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+                               pgoff_t index, swp_entry_t swap)
+{
+        void *item;
+        rcu_read_lock();
+        item = radix_tree_lookup(&mapping->page_tree, index);
+        rcu_read_unlock();
+        return item == swp_to_radix_entry(swap);
+}
+/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
 static int shmem_add_to_page_cache(struct page *page,
                                   struct address_space *mapping,
                                   pgoff_t index, gfp_t gfp, void *expected)
 {
-        int error = 0;
+        int error;
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(!PageSwapBacked(page));
+        page_cache_get(page);
+        page->mapping = mapping;
+        page->index = index;
+        spin_lock_irq(&mapping->tree_lock);
        if (!expected)
-                error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+                error = radix_tree_insert(&mapping->page_tree, index, page);
+        else
+                error = shmem_radix_tree_replace(mapping, index, expected,
+                                                                 page);
        if (!error) {
-                page_cache_get(page);
+                mapping->nrpages++;
-                page->mapping = mapping;
+                __inc_zone_page_state(page, NR_FILE_PAGES);
-                page->index = index;
+                __inc_zone_page_state(page, NR_SHMEM);
+                spin_unlock_irq(&mapping->tree_lock);
-                spin_lock_irq(&mapping->tree_lock);
+        } else {
-                if (!expected)
+                page->mapping = NULL;
-                        error = radix_tree_insert(&mapping->page_tree,
+                spin_unlock_irq(&mapping->tree_lock);
-                                                        index, page);
+                page_cache_release(page);
-                else
-                        error = shmem_radix_tree_replace(mapping, index,
-                                                        expected, page);
-                if (!error) {
-                        mapping->nrpages++;
-                        __inc_zone_page_state(page, NR_FILE_PAGES);
-                        __inc_zone_page_state(page, NR_SHMEM);
-                        spin_unlock_irq(&mapping->tree_lock);
-                } else {
-                        page->mapping = NULL;
-                        spin_unlock_irq(&mapping->tree_lock);
-                        page_cache_release(page);
-                }
-                if (!expected)
-                        radix_tree_preload_end();
        }
-        if (error)
-                mem_cgroup_uncharge_cache_page(page);
        return error;
 }
@@ -1124,9 +1133,9 @@ repeat:
                /* We have to do this with page locked to prevent races */
                lock_page(page);
                if (!PageSwapCache(page) || page_private(page) != swap.val ||
-                    page->mapping) {
+                    !shmem_confirm_swap(mapping, index, swap)) {
                        error = -EEXIST;        /* try again */
-                        goto failed;
+                        goto unlock;
                }
                if (!PageUptodate(page)) {
                        error = -EIO;
@@ -1142,9 +1151,12 @@ repeat:
                error = mem_cgroup_cache_charge(page, current->mm,
                                                gfp & GFP_RECLAIM_MASK);
-                if (!error)
+                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
                                                gfp, swp_to_radix_entry(swap));
+                        /* We already confirmed swap, and make no allocation */
+                        VM_BUG_ON(error);
+                }
                if (error)
                        goto failed;
@@ -1181,11 +1193,18 @@ repeat:
                __set_page_locked(page);
                error = mem_cgroup_cache_charge(page, current->mm,
                                                gfp & GFP_RECLAIM_MASK);
-                if (!error)
-                        error = shmem_add_to_page_cache(page, mapping, index,
-                                                gfp, NULL);
                if (error)
                        goto decused;
+                error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+                if (!error) {
+                        error = shmem_add_to_page_cache(page, mapping, index,
+                                                        gfp, NULL);
+                        radix_tree_preload_end();
+                }
+                if (error) {
+                        mem_cgroup_uncharge_cache_page(page);
+                        goto decused;
+                }
                lru_cache_add_anon(page);
                spin_lock(&info->lock);
@@ -1245,14 +1264,10 @@ decused:
 unacct:
        shmem_unacct_blocks(info->flags, 1);
 failed:
-        if (swap.val && error != -EINVAL) {
+        if (swap.val && error != -EINVAL &&
-                struct page *test = find_get_page(mapping, index);
+            !shmem_confirm_swap(mapping, index, swap))
-                if (test && !radix_tree_exceptional_entry(test))
+                error = -EEXIST;
-                        page_cache_release(test);
+unlock:
-                /* Have another try if the entry has changed */
-                if (test != swp_to_radix_entry(swap))
-                        error = -EEXIST;
-        }
        if (page) {
                unlock_page(page);
                page_cache_release(page);
@@ -1264,7 +1279,7 @@ failed:
                spin_unlock(&info->lock);
                goto repeat;
        }
-        if (error == -EEXIST)
+        if (error == -EEXIST)   /* from above or from radix_tree_insert */
                goto repeat;
        return error;
 }
@@ -1594,6 +1609,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
        struct splice_pipe_desc spd = {
                .pages = pages,
                .partial = partial,
+                .nr_pages_max = PIPE_DEF_BUFFERS,
                .flags = flags,
                .ops = &page_cache_pipe_buf_ops,
                .spd_release = spd_release_page,
@@ -1682,7 +1698,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
        if (spd.nr_pages)
                error = splice_to_pipe(pipe, &spd);
-        splice_shrink_spd(pipe, &spd);
+        splice_shrink_spd(&spd);
        if (error > 0) {
                *ppos += error;
@@ -1691,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
        return error;
 }
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
-                                    pgoff_t index, pgoff_t end, int origin)
-{
-        struct page *page;
-        struct pagevec pvec;
-        pgoff_t indices[PAGEVEC_SIZE];
-        bool done = false;
-        int i;
-        pagevec_init(&pvec, 0);
-        pvec.nr = 1;            /* start small: we may be there already */
-        while (!done) {
-                pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-                                        pvec.nr, pvec.pages, indices);
-                if (!pvec.nr) {
-                        if (origin == SEEK_DATA)
-                                index = end;
-                        break;
-                }
-                for (i = 0; i < pvec.nr; i++, index++) {
-                        if (index < indices[i]) {
-                                if (origin == SEEK_HOLE) {
-                                        done = true;
-                                        break;
-                                }
-                                index = indices[i];
-                        }
-                        page = pvec.pages[i];
-                        if (page && !radix_tree_exceptional_entry(page)) {
-                                if (!PageUptodate(page))
-                                        page = NULL;
-                        }
-                        if (index >= end ||
-                            (page && origin == SEEK_DATA) ||
-                            (!page && origin == SEEK_HOLE)) {
-                                done = true;
-                                break;
-                        }
-                }
-                shmem_deswap_pagevec(&pvec);
-                pagevec_release(&pvec);
-                pvec.nr = PAGEVEC_SIZE;
-                cond_resched();
-        }
-        return index;
-}
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
-        struct address_space *mapping;
-        struct inode *inode;
-        pgoff_t start, end;
-        loff_t new_offset;
-        if (origin != SEEK_DATA && origin != SEEK_HOLE)
-                return generic_file_llseek_size(file, offset, origin,
-                                                        MAX_LFS_FILESIZE);
-        mapping = file->f_mapping;
-        inode = mapping->host;
-        mutex_lock(&inode->i_mutex);
-        /* We're holding i_mutex so we can access i_size directly */
-        if (offset < 0)
-                offset = -EINVAL;
-        else if (offset >= inode->i_size)
-                offset = -ENXIO;
-        else {
-                start = offset >> PAGE_CACHE_SHIFT;
-                end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-                new_offset = shmem_seek_hole_data(mapping, start, end, origin);
-                new_offset <<= PAGE_CACHE_SHIFT;
-                if (new_offset > offset) {
-                        if (new_offset < inode->i_size)
-                                offset = new_offset;
-                        else if (origin == SEEK_DATA)
-                                offset = -ENXIO;
-                        else
-                                offset = inode->i_size;
-                }
-        }
-        if (offset >= 0 && offset != file->f_pos) {
-                file->f_pos = offset;
-                file->f_version = 0;
-        }
-        mutex_unlock(&inode->i_mutex);
-        return offset;
-}
 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                                                         loff_t len)
 {
@@ -2786,7 +2710,7 @@ static const struct address_space_operations shmem_aops = {
 static const struct file_operations shmem_file_operations = {
        .mmap           = shmem_mmap,
 #ifdef CONFIG_TMPFS
-        .llseek         = shmem_file_llseek,
+        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = shmem_file_aio_read,
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e85..c7bb952400c8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -275,8 +275,9 @@ static unsigned long * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
                                         unsigned long size)
 {
-        pg_data_t *host_pgdat;
+        unsigned long goal, limit;
-        unsigned long goal;
+        unsigned long *p;
+        int nid;
        /*
         * A page may contain usemaps for other sections preventing the
         * page being freed and making a section unremovable while
@@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
         * from the same section as the pgdat where possible to avoid
         * this problem.
         */
-        goal = __pa(pgdat) & PAGE_SECTION_MASK;
+        goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
-        host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
+        limit = goal + (1UL << PA_SECTION_SHIFT);
-        return __alloc_bootmem_node_nopanic(host_pgdat, size,
+        nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
-                                            SMP_CACHE_BYTES, goal);
+again:
+        p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
+                                          SMP_CACHE_BYTES, goal, limit);
+        if (!p && limit) {
+                limit = 0;
+                goto again;
+        }
+        return p;
 }
 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index de5bc51c4a66..71373d03fcee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1916,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
        /*
         * Find out how many pages are allowed for a single swap
-         * device. There are three limiting factors: 1) the number
+         * device. There are two limiting factors: 1) the number
         * of bits for the swap offset in the swp_entry_t type, and
         * 2) the number of bits in the swap pte as defined by the
-         * the different architectures, and 3) the number of free bits
+         * different architectures. In order to find the
-         * in an exceptional radix_tree entry. In order to find the
         * largest possible bit mask, a swap entry with swap type 0
         * and swap offset ~0UL is created, encoded to a swap pte,
         * decoded to a swp_entry_t again, and finally the swap
         * offset is extracted. This will mask all the bits from
         * the initial ~0UL mask that can't be encoded in either
         * the swp_entry_t or the architecture definition of a
-         * swap pte.  Then the same is done for a radix_tree entry.
+         * swap pte.
         */
        maxpages = swp_offset(pte_to_swp_entry(
-                        swp_entry_to_pte(swp_entry(0, ~0UL))));
+                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
-        maxpages = swp_offset(radix_to_swp_entry(
-                        swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
        if (maxpages > swap_header->info.last_page) {
                maxpages = swap_header->info.last_page + 1;
                /* p->max is an unsigned int: don't overflow it */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d36..66e431060c05 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2688,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
                 * them before going back to sleep.
                 */
                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-                schedule();
+                if (!kthread_should_stop())
+                        schedule();
                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
        } else {
                if (remaining)
@@ -2955,14 +2958,17 @@ int kswapd_run(int nid)
 }
 /*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined.  Caller must
+ * hold lock_memory_hotplug().
 */
 void kswapd_stop(int nid)
 {
        struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
-        if (kswapd)
+        if (kswapd) {
                kthread_stop(kswapd);
+                NODE_DATA(nid)->kswapd = NULL;
+        }
 }
 static int __init kswapd_init(void)
author	Mauro Carvalho Chehab <mchehab@redhat.com>	2012-07-29 20:09:39 -0400
committer	Mauro Carvalho Chehab <mchehab@redhat.com>	2012-07-29 20:09:39 -0400
commit	73bcc49959e4e40911dd0dd634bf1b353827df66 (patch)
tree	6b0c1d440c490a65c51ab5cf5aee7095cb4089d3 /mm
parent	8447c4d15e357a458c9051ddc84aa6c8b9c27000 (diff)
parent	28a33cbc24e4256c143dce96c7d93bf423229f92 (diff)