Merge branch 'for_rmk' of git://git.linaro.org/kernel/linux-linaro-next into devel-stable

author: Russell King <rmk+kernel@arm.linux.org.uk> 2010-10-04 17:23:26 -0400
committer: Russell King <rmk+kernel@arm.linux.org.uk> 2010-10-04 17:23:30 -0400
commit: a9fda02bfc91a281cd812ae15dabe6bfb9574f90 (patch)
tree: d7be703f341870f15f87a59a63976f650078b4d6 /mm
parent: aa3090005d27f3c7fba915ccea36b97b669fa3ab (diff)
parent: ec706dab290c486837d4a825870ab052bf200279 (diff)
9 files changed, 97 insertions, 54 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c2bf86f470ed..65d420499a61 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
 struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
+        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -243,6 +244,7 @@ static int __init default_bdi_init(void)
        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
+        err = bdi_init(&noop_backing_dev_info);
        return err;
 }
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dacf90a2..ec520c7b28df 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 {
        struct mm_struct *mm = current->mm;
        struct address_space *mapping;
-        unsigned long end = start + size;
        struct vm_area_struct *vma;
        int err = -EINVAL;
        int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        if (start + size <= start)
                return err;
+        /* Does pgoff wrap? */
+        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+                return err;
        /* Can we represent this offset inside this architecture's pte's? */
 #if PTE_FILE_MAX_BITS < BITS_PER_LONG
        if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        if (!(vma->vm_flags & VM_CAN_NONLINEAR))
                goto out;
-        if (end <= start || start < vma->vm_start || end > vma->vm_end)
+        if (start < vma->vm_start || start + size > vma->vm_end)
                goto out;
        /* Must set VM_NONLINEAR before any pages are populated. */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc5be788a39f..c03273807182 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2324,11 +2324,8 @@ retry_avoidcopy:
         * and just make the page writable */
        avoidcopy = (page_mapcount(old_page) == 1);
        if (avoidcopy) {
-                if (!trylock_page(old_page)) {
+                if (PageAnon(old_page))
-                        if (PageAnon(old_page))
+                        page_move_anon_rmap(old_page, vma, address);
-                                page_move_anon_rmap(old_page, vma, address);
-                } else
-                        unlock_page(old_page);
                set_huge_ptep_writable(vma, address, ptep);
                return 0;
        }
@@ -2404,7 +2401,7 @@ retry_avoidcopy:
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
                page_remove_rmap(old_page);
-                hugepage_add_anon_rmap(new_page, vma, address);
+                hugepage_add_new_anon_rmap(new_page, vma, address);
                /* Make the old page be freed below */
                new_page = old_page;
                mmu_notifier_invalidate_range_end(mm,
@@ -2631,10 +2628,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                                vma, address);
        }
-        if (!pagecache_page) {
+        /*
-                page = pte_page(entry);
+         * hugetlb_cow() requires page locks of pte_page(entry) and
+         * pagecache_page, so here we need to take the former one
+         * when page != pagecache_page or !pagecache_page.
+         * Note that locking order is always pagecache_page -> page,
+         * so no worry about deadlock.
+         */
+        page = pte_page(entry);
+        if (page != pagecache_page)
                lock_page(page);
-        }
        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
@@ -2661,9 +2664,8 @@ out_page_table_lock:
        if (pagecache_page) {
                unlock_page(pagecache_page);
                put_page(pagecache_page);
-        } else {
-                unlock_page(page);
        }
+        unlock_page(page);
 out_mutex:
        mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/memory.c b/mm/memory.c
index 71b161b73bb5..0e18b4d649ec 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2680,10 +2680,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        /*
-         * Make sure try_to_free_swap didn't release the swapcache
+         * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
-         * from under us. The page pin isn't enough to prevent that.
+         * release the swapcache from under us.  The page pin, and pte_same
+         * test below, are not enough to exclude that.  Even if it is still
+         * swapcache, we need to check that the page's swap has not changed.
         */
-        if (unlikely(!PageSwapCache(page)))
+        if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
                goto out_page;
        if (ksm_might_need_to_copy(page, vma, address)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 6128dc8e5ede..00161a48a451 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2009,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
                        removed_exe_file_vma(mm);
                fput(new->vm_file);
        }
+        unlink_anon_vmas(new);
 out_free_mpol:
        mpol_put(pol);
 out_free_vma:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fc81cb22869e..4029583a1024 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -121,8 +121,8 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
 }
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
+static bool oom_unkillable_task(struct task_struct *p,
-                           const nodemask_t *nodemask)
+                const struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
        if (is_global_init(p))
                return true;
@@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
         */
        points += p->signal->oom_score_adj;
-        if (points < 0)
+        /*
-                return 0;
+         * Never return 0 for an eligible task that may be killed since it's
+         * possible that no single user task uses more than 0.1% of memory and
+         * no single admin task uses more than 3.0%.
+         */
+        if (points <= 0)
+                return 1;
        return (points < 1000) ? points : 1000;
 }
@@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 /**
 * dump_tasks - dump current memory state of all system tasks
 * @mem: current's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
- * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * Dumps the current memory state of all eligible tasks.  Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
 * value, oom_score_adj value, and name.
 *
- * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
- * shown.
- *
 * Call with tasklist_lock read-locked.
 */
-static void dump_tasks(const struct mem_cgroup *mem)
+static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
        struct task_struct *p;
        struct task_struct *task;
        pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name\n");
        for_each_process(p) {
-                if (p->flags & PF_KTHREAD)
+                if (oom_unkillable_task(p, mem, nodemask))
-                        continue;
-                if (mem && !task_in_mem_cgroup(p, mem))
                        continue;
                task = find_lock_task_mm(p);
@@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 }
 static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
-                                                        struct mem_cgroup *mem)
+                        struct mem_cgroup *mem, const nodemask_t *nodemask)
 {
        task_lock(current);
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -394,7 +397,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
        mem_cgroup_print_oom_info(mem, p);
        show_mem();
        if (sysctl_oom_dump_tasks)
-                dump_tasks(mem);
+                dump_tasks(mem, nodemask);
 }
 #define K(x) ((x) << (PAGE_SHIFT-10))
@@ -436,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
        unsigned int victim_points = 0;
        if (printk_ratelimit())
-                dump_header(p, gfp_mask, order, mem);
+                dump_header(p, gfp_mask, order, mem, nodemask);
        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
@@ -482,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
 static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
-                                int order)
+                                int order, const nodemask_t *nodemask)
 {
        if (likely(!sysctl_panic_on_oom))
                return;
@@ -496,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
                        return;
        }
        read_lock(&tasklist_lock);
-        dump_header(NULL, gfp_mask, order, NULL);
+        dump_header(NULL, gfp_mask, order, NULL, nodemask);
        read_unlock(&tasklist_lock);
        panic("Out of memory: %s panic_on_oom is enabled\n",
                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -509,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
        unsigned int points = 0;
        struct task_struct *p;
-        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
        limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
        read_lock(&tasklist_lock);
 retry:
@@ -641,6 +644,7 @@ static void clear_system_oom(void)
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                int order, nodemask_t *nodemask)
 {
+        const nodemask_t *mpol_mask;
        struct task_struct *p;
        unsigned long totalpages;
        unsigned long freed = 0;
@@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
         */
        constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
                                                &totalpages);
-        check_panic_on_oom(constraint, gfp_mask, order);
+        mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+        check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
        read_lock(&tasklist_lock);
        if (sysctl_oom_kill_allocating_task &&
@@ -688,15 +693,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
        }
 retry:
-        p = select_bad_process(&points, totalpages, NULL,
+        p = select_bad_process(&points, totalpages, NULL, mpol_mask);
-                        constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
-                                                                 NULL);
        if (PTR_ERR(p) == -1UL)
                goto out;
        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (!p) {
-                dump_header(NULL, gfp_mask, order, NULL);
+                dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
                read_unlock(&tasklist_lock);
                panic("Out of memory and no killable processes...\n");
        }
diff --git a/mm/percpu.c b/mm/percpu.c
index 58c572b18b07..c76ef3891e0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1401,9 +1401,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                        if (pcpu_first_unit_cpu == NR_CPUS)
                                pcpu_first_unit_cpu = cpu;
+                        pcpu_last_unit_cpu = cpu;
                }
        }
-        pcpu_last_unit_cpu = cpu;
        pcpu_nr_units = unit;
        for_each_possible_cpu(cpu)
diff --git a/mm/rmap.c b/mm/rmap.c
index f6f0d2dda2ea..9d2ba01bd4f9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1564,13 +1564,14 @@ static void __hugepage_set_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
        struct anon_vma *anon_vma = vma->anon_vma;
        BUG_ON(!anon_vma);
-        if (!exclusive) {
-                struct anon_vma_chain *avc;
+        if (PageAnon(page))
-                avc = list_entry(vma->anon_vma_chain.prev,
+                return;
-                                 struct anon_vma_chain, same_vma);
+        if (!exclusive)
-                anon_vma = avc->anon_vma;
+                anon_vma = anon_vma->root;
-        }
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;
        page->index = linear_page_index(vma, address);
@@ -1581,6 +1582,8 @@ void hugepage_add_anon_rmap(struct page *page,
 {
        struct anon_vma *anon_vma = vma->anon_vma;
        int first;
+        BUG_ON(!PageLocked(page));
        BUG_ON(!anon_vma);
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        first = atomic_inc_and_test(&page->_mapcount);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c391c320dbaf..c5dfabf25f11 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone,
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
-static bool shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
-        bool all_unreclaimable = true;
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                }
                shrink_zone(priority, zone, sc);
-                all_unreclaimable = false;
        }
+}
+static bool zone_reclaimable(struct zone *zone)
+{
+        return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+/*
+ * As hibernation is going on, kswapd is frozen so that it can't mark
+ * the zone as all_unreclaimable. It can't handle OOM during hibernation.
+ * So check zone reclaimability in direct reclaim as well as in kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+                struct scan_control *sc)
+{
+        struct zoneref *z;
+        struct zone *zone;
+        bool all_unreclaimable = true;
+        for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                        gfp_zone(sc->gfp_mask), sc->nodemask) {
+                if (!populated_zone(zone))
+                        continue;
+                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                        continue;
+                if (zone_reclaimable(zone)) {
+                        all_unreclaimable = false;
+                        break;
+                }
+        }
        return all_unreclaimable;
 }
@@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
        int priority;
-        bool all_unreclaimable;
        unsigned long total_scanned = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct zoneref *z;
@@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token();
-                all_unreclaimable = shrink_zones(priority, zonelist, sc);
+                shrink_zones(priority, zonelist, sc);
                /*
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
@@ -1931,7 +1959,7 @@ out:
                return sc->nr_reclaimed;
        /* top priority shrink_zones still had more to do? don't OOM, then */
-        if (scanning_global_lru(sc) && !all_unreclaimable)
+        if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
                return 1;
        return 0;
@@ -2197,8 +2225,7 @@ loop_again:
                        total_scanned += sc.nr_scanned;
                        if (zone->all_unreclaimable)
                                continue;
-                        if (nr_slab == 0 &&
+                        if (nr_slab == 0 && !zone_reclaimable(zone))
-                            zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
                                zone->all_unreclaimable = 1;
                        /*
                         * If we've done a decent amount of scanning and
author	Russell King <rmk+kernel@arm.linux.org.uk>	2010-10-04 17:23:26 -0400
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2010-10-04 17:23:30 -0400
commit	a9fda02bfc91a281cd812ae15dabe6bfb9574f90 (patch)
tree	d7be703f341870f15f87a59a63976f650078b4d6 /mm
parent	aa3090005d27f3c7fba915ccea36b97b669fa3ab (diff)
parent	ec706dab290c486837d4a825870ab052bf200279 (diff)