author     Ingo Molnar <mingo@elte.hu>  2008-12-04 02:52:14 -0500
committer  Ingo Molnar <mingo@elte.hu>  2008-12-04 02:52:14 -0500
commit     cb9c34e6d090d376b77becaa5d29a65dec7f4272 (patch)
tree       3678abce20d6825aebe3fec218057d4131e13fd6 /mm
parent     470c66239ef0336429b35345f3f615d47341e13b (diff)
parent     061e41fdb5047b1fb161e89664057835935ca1d2 (diff)
Merge commit 'v2.6.28-rc7' into core/locking
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c          5
-rw-r--r--  mm/memory_hotplug.c  12
-rw-r--r--  mm/migrate.c          5
-rw-r--r--  mm/mlock.c           18
-rw-r--r--  mm/mmap.c             2
-rw-r--r--  mm/page_alloc.c       4
-rw-r--r--  mm/page_cgroup.c     60
-rw-r--r--  mm/slub.c             6
-rw-r--r--  mm/sparse.c           2
-rw-r--r--  mm/vmalloc.c         41
-rw-r--r--  mm/vmscan.c          46
11 files changed, 103 insertions(+), 98 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d143ab67be44..6058b53dcb89 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1796,6 +1796,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 				struct page *page, unsigned long address)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
 	struct prio_tree_iter iter;
@@ -1805,7 +1806,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
 	 * from page cache lookup which is in HPAGE_SIZE units.
 	 */
-	address = address & huge_page_mask(hstate_vma(vma));
+	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
 	mapping = (struct address_space *)page_private(page);
@@ -1824,7 +1825,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 		unmap_hugepage_range(iter_vma,
-			address, address + HPAGE_SIZE,
+			address, address + huge_page_size(h),
 			page);
 }
 
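The hugetlb change replaces the compile-time HPAGE_SIZE constant with the per-VMA huge page size, and hoists the repeated hstate_vma(vma) lookup into a local h so the alignment mask and the unmap length are guaranteed to come from the same hstate. A minimal userspace sketch of that hoisting pattern; the names (region_state, state_of, span_mask, span_size) are hypothetical stand-ins for the hstate helpers:

#include <stdio.h>

/* Hypothetical stand-ins for hstate_vma()/huge_page_mask()/huge_page_size() */
struct region_state { unsigned long size; };

static struct region_state *state_of(int vma_id)
{
	static struct region_state s = { .size = 2UL << 20 }; /* 2 MiB pages */
	(void)vma_id;
	return &s;
}

static unsigned long span_mask(struct region_state *h) { return ~(h->size - 1); }
static unsigned long span_size(struct region_state *h) { return h->size; }

int main(void)
{
	unsigned long address = 0x12345678;
	struct region_state *h = state_of(0);	/* look up once, reuse below */

	address &= span_mask(h);		/* align to the region size */
	printf("unmap [%#lx, %#lx)\n", address, address + span_size(h));
	return 0;
}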
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6837a1014372..b17371185468 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -22,7 +22,6 @@
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
-#include <linux/cpuset.h>
 #include <linux/delay.h>
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
@@ -190,7 +189,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 			pgdat->node_start_pfn;
 }
 
-static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nr_pages = PAGES_PER_SECTION;
@@ -217,7 +216,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	return 0;
 }
 
-static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
@@ -274,7 +273,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 		unsigned long nr_pages)
 {
 	unsigned long i;
@@ -471,7 +470,8 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
 
 
-int add_memory(int nid, u64 start, u64 size)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+int __ref add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat = NULL;
 	int new_pgdat = 0;
@@ -498,8 +498,6 @@ int add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
-	cpuset_track_online_nodes();
-
 	if (new_pgdat) {
 		ret = register_one_node(nid);
 		/*
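The memory_hotplug hunks are all about section annotations: __add_zone() and __add_section() move into __meminit, and callers such as __add_pages() and add_memory() are marked __ref to tell modpost the reference into init-section code is deliberate, since CONFIG_MEMORY_HOTPLUG keeps that code resident. A rough standalone illustration of the underlying mechanism using GCC section attributes; the macro expansions below are simplified stand-ins, not the kernel's:

#include <stdio.h>

/* Simplified stand-ins: the kernel's __meminit places code in .meminit.text,
 * and __ref suppresses the section-mismatch warning for intentional callers */
#define __meminit __attribute__((__section__(".meminit.text")))
#define __ref /* marker only: this caller knows the callee's section is kept */

static int __meminit add_zone_span(unsigned long pfn)
{
	return pfn ? 0 : -1;	/* placeholder for the real zone-growing work */
}

/* we are OK calling __meminit stuff here - memory hotplug keeps it around */
static int __ref add_memory_demo(unsigned long pfn)
{
	return add_zone_span(pfn);
}

int main(void)
{
	printf("add_memory_demo: %d\n", add_memory_demo(0x1000));
	return 0;
}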
diff --git a/mm/migrate.c b/mm/migrate.c
index 385db89f0c33..1e0d6b237f44 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -522,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page)
 	remove_migration_ptes(page, page);
 
 	rc = mapping->a_ops->writepage(page, &wbc);
-	if (rc < 0)
-		/* I/O Error writing */
-		return -EIO;
 
 	if (rc != AOP_WRITEPAGE_ACTIVATE)
 		/* unlocked. Relock */
 		lock_page(page);
 
-	return -EAGAIN;
+	return (rc < 0) ? -EIO : -EAGAIN;
 }
 
 /*
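The writeout() hunk fixes a lock imbalance: on a writepage() error the old code returned -EIO before the page was relocked, so the caller saw an unexpectedly unlocked page. Folding the error into the single exit keeps the conditional relock on both paths. A tiny sketch of the same single-exit consolidation; do_write() and relock() are hypothetical stand-ins:

#include <stdio.h>

static int do_write(int fail) { return fail ? -1 : 0; }
static void relock(void) { puts("page relocked"); }

static int writeout_demo(int fail)
{
	int rc = do_write(fail);

	/* cleanup that must happen on success AND failure */
	relock();

	/* classify the result only at the single exit point */
	return (rc < 0) ? -5 /* EIO */ : -11 /* EAGAIN */;
}

int main(void)
{
	printf("error path:   %d\n", writeout_demo(1));
	printf("success path: %d\n", writeout_demo(0));
	return 0;
}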
diff --git a/mm/mlock.c b/mm/mlock.c
index 008ea70b7afa..1ada366570cb 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -66,14 +66,10 @@ void __clear_page_mlock(struct page *page)
 		putback_lru_page(page);
 	} else {
 		/*
-		 * Page not on the LRU yet. Flush all pagevecs and retry.
+		 * We lost the race. the page already moved to evictable list.
 		 */
-		lru_add_drain_all();
-		if (!isolate_lru_page(page))
-			putback_lru_page(page);
-		else if (PageUnevictable(page))
+		if (PageUnevictable(page))
 			count_vm_event(UNEVICTABLE_PGSTRANDED);
-
 	}
 }
 
@@ -166,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	unsigned long addr = start;
 	struct page *pages[16]; /* 16 gives a reasonable batch */
 	int nr_pages = (end - start) / PAGE_SIZE;
-	int ret;
+	int ret = 0;
 	int gup_flags = 0;
 
 	VM_BUG_ON(start & ~PAGE_MASK);
@@ -187,8 +183,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_WRITE)
 		gup_flags |= GUP_FLAGS_WRITE;
 
-	lru_add_drain_all();	/* push cached pages to LRU */
-
 	while (nr_pages > 0) {
 		int i;
 
@@ -251,8 +245,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 		ret = 0;
 	}
 
-	lru_add_drain_all();	/* to update stats */
-
 	return ret;	/* count entire vma as locked_vm */
 }
 
@@ -546,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
 	if (!can_do_mlock())
 		return -EPERM;
 
+	lru_add_drain_all();	/* flush pagevec */
+
 	down_write(&current->mm->mmap_sem);
 	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 	start &= PAGE_MASK;
@@ -612,6 +606,8 @@ asmlinkage long sys_mlockall(int flags)
 	if (!can_do_mlock())
 		goto out;
 
+	lru_add_drain_all();	/* flush pagevec */
+
 	down_write(&current->mm->mmap_sem);
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
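The mlock.c hunks hoist lru_add_drain_all() out of __mlock_vma_pages_range(), which runs with mmap_sem held, up to the sys_mlock()/sys_mlockall() entry points, where it executes once per syscall before any locks are taken. A sketch of that hoisting, with hypothetical names:

#include <stdio.h>

static void drain_all_cpu_caches(void)
{
	puts("drain pagevecs (may block; must not run under locks)");
}

static long lock_range(unsigned long start, unsigned long len)
{
	/* per-range work only; no global drain inside the locked region */
	printf("mlock [%#lx, %#lx)\n", start, start + len);
	return 0;
}

static long sys_mlock_demo(unsigned long start, unsigned long len)
{
	drain_all_cpu_caches();	/* once per syscall, before taking locks */
	return lock_range(start, len);
}

int main(void)
{
	return (int)sys_mlock_demo(0x1000, 0x2000);
}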
diff --git a/mm/mmap.c b/mm/mmap.c
index de14ac21e5b5..d4855a682ab6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1704,7 +1704,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 	vma = find_vma_prev(mm, addr, &prev);
 	if (vma && (vma->vm_start <= addr))
 		return vma;
-	if (expand_stack(prev, addr))
+	if (!prev || expand_stack(prev, addr))
 		return NULL;
 	if (prev->vm_flags & VM_LOCKED) {
 		if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
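The mmap.c fix is a NULL guard: when the address precedes the first VMA, find_vma_prev() leaves prev == NULL, and the old code passed that straight into expand_stack() and then dereferenced it. Short-circuit evaluation makes the NULL test guard both uses. A small sketch of the pattern; expand_stack_demo() and the vma struct are simplified stand-ins:

#include <stdio.h>
#include <stddef.h>

struct vma { unsigned long vm_end; };

static int expand_stack_demo(struct vma *prev, unsigned long addr)
{
	return addr > prev->vm_end;	/* would crash if prev were NULL */
}

static struct vma *find_extend_demo(struct vma *prev, unsigned long addr)
{
	if (!prev || expand_stack_demo(prev, addr))	/* NULL checked first */
		return NULL;
	return prev;
}

int main(void)
{
	printf("%p\n", (void *)find_extend_demo(NULL, 0x1000));
	return 0;
}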
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54069e64e3a8..d8ac01474563 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1561,6 +1561,10 @@ nofail_alloc:
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	cpuset_update_task_memory_state();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f59d797dc5a9..0b3cbf090a67 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -21,7 +21,7 @@ static unsigned long total_usage;
 #if !defined(CONFIG_SPARSEMEM)
 
 
-void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 {
 	pgdat->node_page_cgroup = NULL;
 }
@@ -97,7 +97,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 	return section->page_cgroup + pfn;
 }
 
-int __meminit init_section_page_cgroup(unsigned long pfn)
+/* __alloc_bootmem...() is protected by !slab_available() */
+int __init_refok init_section_page_cgroup(unsigned long pfn)
 {
 	struct mem_section *section;
 	struct page_cgroup *base, *pc;
@@ -106,19 +107,29 @@ int __meminit init_section_page_cgroup(unsigned long pfn)
 
 	section = __pfn_to_section(pfn);
 
-	if (section->page_cgroup)
-		return 0;
-
-	nid = page_to_nid(pfn_to_page(pfn));
-
-	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-	if (slab_is_available()) {
-		base = kmalloc_node(table_size, GFP_KERNEL, nid);
-		if (!base)
-			base = vmalloc_node(table_size, nid);
-	} else {
-		base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
+	if (!section->page_cgroup) {
+		nid = page_to_nid(pfn_to_page(pfn));
+		table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+		if (slab_is_available()) {
+			base = kmalloc_node(table_size, GFP_KERNEL, nid);
+			if (!base)
+				base = vmalloc_node(table_size, nid);
+		} else {
+			base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+				table_size,
 				PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+		}
+	} else {
+		/*
+		 * We don't have to allocate page_cgroup again, but
+		 * address of memmap may be changed. So, we have to initialize
+		 * again.
+		 */
+		base = section->page_cgroup + pfn;
+		table_size = 0;
+		/* check address of memmap is changed or not. */
+		if (base->page == pfn_to_page(pfn))
+			return 0;
 	}
 
 	if (!base) {
@@ -158,14 +169,14 @@ void __free_page_cgroup(unsigned long pfn)
 	}
 }
 
-int online_page_cgroup(unsigned long start_pfn,
+int __meminit online_page_cgroup(unsigned long start_pfn,
 			unsigned long nr_pages,
 			int nid)
 {
 	unsigned long start, end, pfn;
 	int fail = 0;
 
-	start = start_pfn & (PAGES_PER_SECTION - 1);
+	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
@@ -183,12 +194,12 @@ int online_page_cgroup(unsigned long start_pfn,
 	return -ENOMEM;
 }
 
-int offline_page_cgroup(unsigned long start_pfn,
+int __meminit offline_page_cgroup(unsigned long start_pfn,
 		unsigned long nr_pages, int nid)
 {
 	unsigned long start, end, pfn;
 
-	start = start_pfn & (PAGES_PER_SECTION - 1);
+	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
@@ -197,7 +208,7 @@ int offline_page_cgroup(unsigned long start_pfn,
 
 }
 
-static int page_cgroup_callback(struct notifier_block *self,
+static int __meminit page_cgroup_callback(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
 	struct memory_notify *mn = arg;
@@ -207,18 +218,23 @@ static int page_cgroup_callback(struct notifier_block *self,
 		ret = online_page_cgroup(mn->start_pfn,
 				mn->nr_pages, mn->status_change_nid);
 		break;
-	case MEM_CANCEL_ONLINE:
 	case MEM_OFFLINE:
 		offline_page_cgroup(mn->start_pfn,
 				mn->nr_pages, mn->status_change_nid);
 		break;
+	case MEM_CANCEL_ONLINE:
 	case MEM_GOING_OFFLINE:
 		break;
 	case MEM_ONLINE:
 	case MEM_CANCEL_OFFLINE:
 		break;
 	}
-	ret = notifier_from_errno(ret);
+
+	if (ret)
+		ret = notifier_from_errno(ret);
+	else
+		ret = NOTIFY_OK;
+
 	return ret;
 }
 
@@ -248,7 +264,7 @@ void __init page_cgroup_init(void)
 	" want\n");
 }
 
-void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 {
 	return;
 }
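Besides the annotation changes, the key page_cgroup fix is the section mask: with a power-of-two PAGES_PER_SECTION, `pfn & (PAGES_PER_SECTION - 1)` yields the offset *inside* a section, while `pfn & ~(PAGES_PER_SECTION - 1)` rounds down to the section boundary that a loop stepping by PAGES_PER_SECTION must start at. A runnable demonstration of the difference:

#include <stdio.h>

#define PAGES_PER_SECTION 4096UL	/* power of two, as with SPARSEMEM */

int main(void)
{
	unsigned long start_pfn = 10000;

	printf("offset in section: %lu\n", start_pfn & (PAGES_PER_SECTION - 1));
	printf("section start:     %lu\n", start_pfn & ~(PAGES_PER_SECTION - 1));
	/* 10000 & 4095 = 1808 (wrong loop start); 10000 & ~4095 = 8192 (right) */
	return 0;
}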
diff --git a/mm/slub.c b/mm/slub.c
index 7ad489af9561..749588a50a5a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2931,8 +2931,10 @@ static int slab_memory_callback(struct notifier_block *self,
 	case MEM_CANCEL_OFFLINE:
 		break;
 	}
-
-	ret = notifier_from_errno(ret);
+	if (ret)
+		ret = notifier_from_errno(ret);
+	else
+		ret = NOTIFY_OK;
 	return ret;
 }
 
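The slub.c hunk applies the same notifier-return fix as page_cgroup.c: memory-hotplug callbacks must return NOTIFY_* codes, and notifier_from_errno(0) still sets NOTIFY_STOP_MASK, so a callback that succeeded would wrongly stop the notifier chain. Mapping 0 to NOTIFY_OK explicitly avoids that. A standalone C sketch; the constants are re-declared locally, and the notifier_from_errno() body below mirrors what the 2.6.28-era helper looked like (an assumption worth checking against the tree):

#include <stdio.h>

#define NOTIFY_OK	 0x0001
#define NOTIFY_STOP_MASK 0x8000

static int notifier_from_errno(int err)
{
	return NOTIFY_STOP_MASK | (NOTIFY_OK - err);	/* 2.6.28-era version */
}

static int callback_result(int err)
{
	return err ? notifier_from_errno(err) : NOTIFY_OK;
}

int main(void)
{
	printf("err=-12 -> %#x\n", callback_result(-12));	/* stop + status */
	printf("err=0   -> %#x\n", callback_result(0));		/* NOTIFY_OK */
	printf("naive 0 -> %#x\n", notifier_from_errno(0));	/* stop mask set! */
	return 0;
}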
diff --git a/mm/sparse.c b/mm/sparse.c
index 39db301b920d..083f5b63e7a8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -570,7 +570,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
  * set. If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 			   int nr_pages)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ba6b0f5f7fac..f3f6e0758562 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -77,7 +77,6 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
-	flush_cache_vunmap(addr, end);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -324,14 +323,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 
 	BUG_ON(size & ~PAGE_MASK);
 
-	addr = ALIGN(vstart, align);
-
 	va = kmalloc_node(sizeof(struct vmap_area),
 			gfp_mask & GFP_RECLAIM_MASK, node);
 	if (unlikely(!va))
 		return ERR_PTR(-ENOMEM);
 
 retry:
+	addr = ALIGN(vstart, align);
+
 	spin_lock(&vmap_area_lock);
 	/* XXX: could have a last_hole cache */
 	n = vmap_area_root.rb_node;
@@ -362,7 +361,7 @@ retry:
 			goto found;
 	}
 
-	while (addr + size >= first->va_start && addr + size <= vend) {
+	while (addr + size > first->va_start && addr + size <= vend) {
 		addr = ALIGN(first->va_end + PAGE_SIZE, align);
 
 		n = rb_next(&first->rb_node);
@@ -522,24 +521,45 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 }
 
 /*
+ * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
+ * is already purging.
+ */
+static void try_purge_vmap_area_lazy(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+
+	__purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+
+/*
  * Kick off a purge of the outstanding lazy areas.
  */
 static void purge_vmap_area_lazy(void)
 {
	unsigned long start = ULONG_MAX, end = 0;
 
-	__purge_vmap_area_lazy(&start, &end, 0, 0);
+	__purge_vmap_area_lazy(&start, &end, 1, 0);
 }
 
 /*
- * Free and unmap a vmap area
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
  */
-static void free_unmap_vmap_area(struct vmap_area *va)
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
 {
 	va->flags |= VM_LAZY_FREE;
 	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
 	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
-		purge_vmap_area_lazy();
+		try_purge_vmap_area_lazy();
+}
+
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+	flush_cache_vunmap(va->va_start, va->va_end);
+	free_unmap_vmap_area_noflush(va);
 }
 
 static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -723,7 +743,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	spin_unlock(&vmap_block_tree_lock);
 	BUG_ON(tmp != vb);
 
-	free_unmap_vmap_area(vb->va);
+	free_unmap_vmap_area_noflush(vb->va);
 	call_rcu(&vb->rcu_head, rcu_free_vb);
 }
 
@@ -785,6 +805,9 @@ static void vb_free(const void *addr, unsigned long size)
 
 	BUG_ON(size & ~PAGE_MASK);
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+
+	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+
 	order = get_order(size);
 
 	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
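The vmalloc.c changes split lazy purging into two entry points: callers on the allocation path that merely noticed the lazy list is full use a "try" variant that backs off if another CPU is already purging, while callers that need the space reclaimed right now wait for the lock. A simplified stand-in for that pattern using a pthread mutex (the real __purge_vmap_area_lazy() takes a sync flag and uses a trylock internally; everything below is illustrative, not the kernel's locking):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t purge_lock = PTHREAD_MUTEX_INITIALIZER;

static void __purge_lazy_areas(int sync)
{
	if (sync)
		pthread_mutex_lock(&purge_lock);	/* wait our turn */
	else if (pthread_mutex_trylock(&purge_lock))
		return;					/* someone else is purging */

	puts("purging lazy areas");
	pthread_mutex_unlock(&purge_lock);
}

static void try_purge(void) { __purge_lazy_areas(0); }
static void purge(void)     { __purge_lazy_areas(1); }

int main(void)
{
	try_purge();	/* opportunistic: from the free/allocation hot path */
	purge();	/* mandatory: when the address space must be reclaimed */
	return 0;
}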
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3b5860294bb6..62e7f62fb559 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -623,6 +623,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!(sc->gfp_mask & __GFP_IO))
+				goto keep_locked;
 			switch (try_to_munlock(page)) {
 			case SWAP_FAIL:		/* shouldn't happen */
 			case SWAP_AGAIN:
@@ -634,6 +636,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			}
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
+			may_enter_fs = 1;
 		}
 #endif /* CONFIG_SWAP */
 
@@ -1245,6 +1248,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		list_add(&page->lru, &l_inactive);
 	}
 
+	spin_lock_irq(&zone->lru_lock);
 	/*
 	 * Count referenced pages from currently used mappings as
 	 * rotated, even though they are moved to the inactive list.
@@ -1260,7 +1264,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 	pgmoved = 0;
 	lru = LRU_BASE + file * LRU_FILE;
-	spin_lock_irq(&zone->lru_lock);
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1386,9 +1389,9 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	file_prio = 200 - sc->swappiness;
 
 	/*
-	 *                  anon       recent_rotated[0]
-	 * %anon = 100 * ----------- / ----------------- * IO cost
-	 *               anon + file      rotate_sum
+	 * The amount of pressure on anon vs file pages is inversely
+	 * proportional to the fraction of recently scanned pages on
+	 * each list that were recently referenced and in active use.
 	 */
 	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
 	ap /= zone->recent_rotated[0] + 1;
@@ -2368,39 +2371,6 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 	return 1;
 }
 
-static void show_page_path(struct page *page)
-{
-	char buf[256];
-	if (page_is_file_cache(page)) {
-		struct address_space *mapping = page->mapping;
-		struct dentry *dentry;
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-		spin_lock(&mapping->i_mmap_lock);
-		dentry = d_find_alias(mapping->host);
-		printk(KERN_INFO "rescued: %s %lu\n",
-			dentry_path(dentry, buf, 256), pgoff);
-		spin_unlock(&mapping->i_mmap_lock);
-	} else {
-#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
-		struct anon_vma *anon_vma;
-		struct vm_area_struct *vma;
-
-		anon_vma = page_lock_anon_vma(page);
-		if (!anon_vma)
-			return;
-
-		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-			printk(KERN_INFO "rescued: anon %s\n",
-				vma->vm_mm->owner->comm);
-			break;
-		}
-		page_unlock_anon_vma(anon_vma);
-#endif
-	}
-}
-
-
 /**
  * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
  * @page: page to check evictability and move to appropriate lru list
@@ -2421,8 +2391,6 @@ retry:
 	if (page_evictable(page, NULL)) {
 		enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
 
-		show_page_path(page);
-
 		__dec_zone_state(zone, NR_UNEVICTABLE);
 		list_move(&page->lru, &zone->lru[l].list);
 		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
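The first vmscan.c hunks gate swap allocation on the caller's GFP flags: reclaim must not start swap I/O for an allocation that forbids it, and once a page is bound for swap the scanner may enter filesystem/swap code. A simplified sketch of that control flow; the flag value matches this era's __GFP_IO, but the helpers and return strings are hypothetical stand-ins:

#include <stdio.h>

#define __GFP_IO 0x40u	/* caller tolerates starting I/O */

static int add_to_swap_demo(int page) { (void)page; return 1; }

static const char *scan_anon_page(int page, unsigned int gfp_mask)
{
	int may_enter_fs = 0;

	if (!(gfp_mask & __GFP_IO))
		return "keep_locked";		/* caller can't tolerate I/O */
	if (!add_to_swap_demo(page))
		return "activate_locked";
	may_enter_fs = 1;			/* swap-backed now; fs ops OK */

	return may_enter_fs ? "pageout allowed" : "pageout deferred";
}

int main(void)
{
	printf("no-IO alloc: %s\n", scan_anon_page(1, 0));
	printf("IO-ok alloc: %s\n", scan_anon_page(1, __GFP_IO));
	return 0;
}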