Merge commit 'v2.6.33' into core/rcu

Merge reason: Update from -rc4 to -final.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2010-02-25 03:40:22 -0500
committer: Ingo Molnar <mingo@elte.hu> 2010-02-25 03:40:26 -0500
commit: 996de8c6fe95c5a9fc524241cc8f142ef0605d3d (patch)
tree: 0f637ab0d80d6d7e213707ac2d8c1cc16b69523c /mm
parent: 017c426138122c8e9b9f5057fbd0567c37b35247 (diff)
parent: 60b341b778cc2929df16c0a504c91621b3c6a4ad (diff)
11 files changed, 279 insertions, 143 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0eb6cb..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 static struct page *__read_cache_page(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *,struct page*),
-                                void *data)
+                                void *data,
+                                gfp_t gfp)
 {
        struct page *page;
        int err;
 repeat:
        page = find_get_page(mapping, index);
        if (!page) {
-                page = page_cache_alloc_cold(mapping);
+                page = __page_cache_alloc(gfp | __GFP_COLD);
                if (!page)
                        return ERR_PTR(-ENOMEM);
                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
        return page;
 }
-/**
+static struct page *do_read_cache_page(struct address_space *mapping,
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping:    the page's address_space
- * @index:      the page index
- * @filler:     function to perform the read
- * @data:       destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *,struct page*),
-                                void *data)
+                                void *data,
+                                gfp_t gfp)
 {
        struct page *page;
        int err;
 retry:
-        page = __read_cache_page(mapping, index, filler, data);
+        page = __read_cache_page(mapping, index, filler, data, gfp);
        if (IS_ERR(page))
                return page;
        if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
        mark_page_accessed(page);
        return page;
 }
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping:    the page's address_space
+ * @index:      the page index
+ * @filler:     function to perform the read
+ * @data:       destination for read data
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+                                pgoff_t index,
+                                int (*filler)(void *,struct page*),
+                                void *data)
+{
+        return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
 EXPORT_SYMBOL(read_cache_page_async);
+static struct page *wait_on_page_read(struct page *page)
+{
+        if (!IS_ERR(page)) {
+                wait_on_page_locked(page);
+                if (!PageUptodate(page)) {
+                        page_cache_release(page);
+                        page = ERR_PTR(-EIO);
+                }
+        }
+        return page;
+}
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping:    the page's address_space
+ * @index:      the page index
+ * @gfp:        the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+                                pgoff_t index,
+                                gfp_t gfp)
+{
+        filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+        return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
 /**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:    the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
                                int (*filler)(void *,struct page*),
                                void *data)
 {
-        struct page *page;
+        return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
-        page = read_cache_page_async(mapping, index, filler, data);
-        if (IS_ERR(page))
-                goto out;
-        wait_on_page_locked(page);
-        if (!PageUptodate(page)) {
-                page_cache_release(page);
-                page = ERR_PTR(-EIO);
-        }
- out:
-        return page;
 }
 EXPORT_SYMBOL(read_cache_page);
@@ -2196,6 +2232,9 @@ again:
                if (unlikely(status))
                        break;
+                if (mapping_writably_mapped(mapping))
+                        flush_dcache_page(page);
                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                pagefault_enable();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e91b81b63670..2d16fa6b8c2d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
        .attrs = hstate_attrs,
 };
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
-                                struct kobject *parent,
+                                    struct kobject **hstate_kobjs,
-                                struct kobject **hstate_kobjs,
+                                    struct attribute_group *hstate_attr_group)
-                                struct attribute_group *hstate_attr_group)
 {
        int retval;
        int hi = h - hstates;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 488b644e0e8e..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
        if (free_all)
                goto try_to_free;
 move_account:
-        while (mem->res.usage > 0) {
+        do {
                ret = -EBUSY;
                if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
                        goto out;
@@ -2614,8 +2614,8 @@ move_account:
                if (ret == -ENOMEM)
                        goto try_to_free;
                cond_resched();
-        }
+        /* "ret" should also be checked to ensure all lists are empty. */
-        ret = 0;
+        } while (mem->res.usage > 0 || ret);
 out:
        css_put(&mem->css);
        return ret;
@@ -2648,10 +2648,7 @@ try_to_free:
        }
        lru_add_drain();
        /* try move_account...there may be some *locked* pages. */
-        if (mem->res.usage)
+        goto move_account;
-                goto move_account;
-        ret = 0;
-        goto out;
 }
 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..880bd592d38e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
                                goto out_pm;
                        err = -ENODEV;
+                        if (node < 0 || node >= MAX_NUMNODES)
+                                goto out_pm;
                        if (!node_state(node, N_HIGH_MEMORY))
                                goto out_pm;
@@ -999,33 +1002,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
 #define DO_PAGES_STAT_CHUNK_NR 16
        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
-        unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
-        int err;
-        for (i = 0; i < nr_pages; i += chunk_nr) {
+        while (nr_pages) {
-                if (chunk_nr > nr_pages - i)
+                unsigned long chunk_nr;
-                        chunk_nr = nr_pages - i;
-                err = copy_from_user(chunk_pages, &pages[i],
+                chunk_nr = nr_pages;
-                                     chunk_nr * sizeof(*chunk_pages));
+                if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
-                if (err) {
+                        chunk_nr = DO_PAGES_STAT_CHUNK_NR;
-                        err = -EFAULT;
-                        goto out;
+                if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
-                }
+                        break;
                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
-                err = copy_to_user(&status[i], chunk_status,
+                if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
-                                   chunk_nr * sizeof(*chunk_status));
+                        break;
-                if (err) {
-                        err = -EFAULT;
-                        goto out;
-                }
-        }
-        err = 0;
-out:
+                pages += chunk_nr;
-        return err;
+                status += chunk_nr;
+                nr_pages -= chunk_nr;
+        }
+        return nr_pages ? -EFAULT : 0;
 }
 /*
diff --git a/mm/nommu.c b/mm/nommu.c
index 17773862619b..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -552,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
 static void __put_nommu_region(struct vm_region *region)
        __releases(nommu_region_sem)
 {
-        kenter("%p{%d}", region, atomic_read(&region->vm_usage));
+        kenter("%p{%d}", region, region->vm_usage);
        BUG_ON(!nommu_region_tree.rb_node);
-        if (atomic_dec_and_test(&region->vm_usage)) {
+        if (--region->vm_usage == 0) {
                if (region->vm_top > region->vm_start)
                        delete_nommu_region(region);
                up_write(&nommu_region_sem);
@@ -1205,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
        if (!vma)
                goto error_getting_vma;
-        atomic_set(&region->vm_usage, 1);
+        region->vm_usage = 1;
        region->vm_flags = vm_flags;
        region->vm_pgoff = pgoff;
@@ -1272,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
                        }
                        /* we've found a region we can share */
-                        atomic_inc(&pregion->vm_usage);
+                        pregion->vm_usage++;
                        vma->vm_region = pregion;
                        start = pregion->vm_start;
                        start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1289,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
                                        vma->vm_region = NULL;
                                        vma->vm_start = 0;
                                        vma->vm_end = 0;
-                                        atomic_dec(&pregion->vm_usage);
+                                        pregion->vm_usage--;
                                        pregion = NULL;
                                        goto error_just_free;
                                }
@@ -1441,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        kenter("");
-        /* we're only permitted to split anonymous regions that have a single
+        /* we're only permitted to split anonymous regions (these should have
-         * owner */
+         * only a single usage on the region) */
-        if (vma->vm_file ||
+        if (vma->vm_file)
-            atomic_read(&vma->vm_region->vm_usage) != 1)
                return -ENOMEM;
        if (mm->map_count >= sysctl_max_map_count)
@@ -1518,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
        /* cut the backing region down to size */
        region = vma->vm_region;
-        BUG_ON(atomic_read(&region->vm_usage) != 1);
+        BUG_ON(region->vm_usage != 1);
        down_write(&nommu_region_sem);
        delete_nommu_region(region);
@@ -1762,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL(unmap_mapping_range);
 /*
- * ask for an unmapped area at which to create a mapping on a file
- */
-unsigned long get_unmapped_area(struct file *file, unsigned long addr,
-                                unsigned long len, unsigned long pgoff,
-                                unsigned long flags)
-{
-        unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
-                                  unsigned long, unsigned long);
-        get_area = current->mm->get_unmapped_area;
-        if (file && file->f_op && file->f_op->get_unmapped_area)
-                get_area = file->f_op->get_unmapped_area;
-        if (!get_area)
-                return -ENOSYS;
-        return get_area(file, addr, len, pgoff, flags);
-}
-EXPORT_SYMBOL(get_unmapped_area);
-/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
@@ -1936,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
        mmput(mm);
        return len;
 }
+/**
+ * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
+ * @inode: The inode to check
+ * @size: The current filesize of the inode
+ * @newsize: The proposed filesize of the inode
+ *
+ * Check the shared mappings on an inode on behalf of a shrinking truncate to
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond that so that do_mmap_pgoff() doesn't
+ * automatically grant mappings that are too large.
+ */
+int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
+                                size_t newsize)
+{
+        struct vm_area_struct *vma;
+        struct prio_tree_iter iter;
+        struct vm_region *region;
+        pgoff_t low, high;
+        size_t r_size, r_top;
+        low = newsize >> PAGE_SHIFT;
+        high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        down_write(&nommu_region_sem);
+        /* search for VMAs that fall within the dead zone */
+        vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+                              low, high) {
+                /* found one - only interested if it's shared out of the page
+                 * cache */
+                if (vma->vm_flags & VM_SHARED) {
+                        up_write(&nommu_region_sem);
+                        return -ETXTBSY; /* not quite true, but near enough */
+                }
+        }
+        /* reduce any regions that overlap the dead zone - if in existence,
+         * these will be pointed to by VMAs that don't overlap the dead zone
+         *
+         * we don't check for any regions that start beyond the EOF as there
+         * shouldn't be any
+         */
+        vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+                              0, ULONG_MAX) {
+                if (!(vma->vm_flags & VM_SHARED))
+                        continue;
+                region = vma->vm_region;
+                r_size = region->vm_top - region->vm_start;
+                r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
+                if (r_top > newsize) {
+                        region->vm_top -= r_top - newsize;
+                        if (region->vm_end > region->vm_top)
+                                region->vm_end = region->vm_top;
+                }
+        }
+        up_write(&nommu_region_sem);
+        return 0;
+}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52481b1c1e5..237050478f28 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -459,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
        list_for_each_entry(c, &p->children, sibling) {
                if (c->mm == p->mm)
                        continue;
+                if (mem && !task_in_mem_cgroup(c, mem))
+                        continue;
                if (!oom_kill_task(c))
                        return 0;
        }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e9f5cc5fb59..8deb9d0fd5b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -556,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        page = list_entry(list->prev, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
-                        __free_one_page(page, zone, 0, migratetype);
+                        /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
-                        trace_mm_page_pcpu_drain(page, 0, migratetype);
+                        __free_one_page(page, zone, 0, page_private(page));
+                        trace_mm_page_pcpu_drain(page, 0, page_private(page));
                } while (--count && --batch_free && !list_empty(list));
        }
        spin_unlock(&zone->lock);
@@ -1222,10 +1223,10 @@ again:
                }
                spin_lock_irqsave(&zone->lock, flags);
                page = __rmqueue(zone, order, migratetype);
-                __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
+                __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
        }
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -3998,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
                }
                /* Merge backward if suitable */
-                if (start_pfn < early_node_map[i].end_pfn &&
+                if (start_pfn < early_node_map[i].start_pfn &&
                                end_pfn >= early_node_map[i].start_pfn) {
                        early_node_map[i].start_pfn = start_pfn;
                        return;
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee22684..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
 */
 void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
 {
-        if (new < old) {
+        struct address_space *mapping = inode->i_mapping;
-                struct address_space *mapping = inode->i_mapping;
+        /*
-                /*
+         * unmap_mapping_range is called twice, first simply for
-                 * unmap_mapping_range is called twice, first simply for
+         * efficiency so that truncate_inode_pages does fewer
-                 * efficiency so that truncate_inode_pages does fewer
+         * single-page unmaps.  However after this first call, and
-                 * single-page unmaps.  However after this first call, and
+         * before truncate_inode_pages finishes, it is possible for
-                 * before truncate_inode_pages finishes, it is possible for
+         * private pages to be COWed, which remain after
-                 * private pages to be COWed, which remain after
+         * truncate_inode_pages finishes, hence the second
-                 * truncate_inode_pages finishes, hence the second
+         * unmap_mapping_range call must be made for correctness.
-                 * unmap_mapping_range call must be made for correctness.
+         */
-                 */
+        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
-                unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+        truncate_inode_pages(mapping, new);
-                truncate_inode_pages(mapping, new);
+        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
-                unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
-        }
 }
 EXPORT_SYMBOL(truncate_pagecache);
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f927..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -220,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
 }
 EXPORT_SYMBOL(strndup_user);
-#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
        mm->mmap_base = TASK_UNMAPPED_BASE;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f250..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
 static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
 /*
 * Purges all lazily-freed vmap areas.
 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
        } else
                spin_lock(&purge_lock);
+        if (sync)
+                purge_fragmented_blocks_allcpus();
        rcu_read_lock();
        list_for_each_entry_rcu(va, &vmap_area_list, list) {
                if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
        }
        rcu_read_unlock();
-        if (nr) {
+        if (nr)
-                BUG_ON(nr > atomic_read(&vmap_lazy_nr));
                atomic_sub(nr, &vmap_lazy_nr);
-        }
        if (nr || force_flush)
                flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
 struct vmap_block_queue {
        spinlock_t lock;
        struct list_head free;
-        struct list_head dirty;
-        unsigned int nr_dirty;
 };
 struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
        unsigned long free, dirty;
        DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
        DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
-        union {
+        struct list_head free_list;
-                struct list_head free_list;
+        struct rcu_head rcu_head;
-                struct rcu_head rcu_head;
+        struct list_head purge;
-        };
 };
 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
        vbq = &get_cpu_var(vmap_block_queue);
        vb->vbq = vbq;
        spin_lock(&vbq->lock);
-        list_add(&vb->free_list, &vbq->free);
+        list_add_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
        put_cpu_var(vmap_block_queue);
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
        struct vmap_block *tmp;
        unsigned long vb_idx;
-        BUG_ON(!list_empty(&vb->free_list));
        vb_idx = addr_to_vb_idx(vb->va->va_start);
        spin_lock(&vmap_block_tree_lock);
        tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
        call_rcu(&vb->rcu_head, rcu_free_vb);
 }
+static void purge_fragmented_blocks(int cpu)
+{
+        LIST_HEAD(purge);
+        struct vmap_block *vb;
+        struct vmap_block *n_vb;
+        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+        rcu_read_lock();
+        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+                if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+                        continue;
+                spin_lock(&vb->lock);
+                if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+                        vb->free = 0; /* prevent further allocs after releasing lock */
+                        vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+                        bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
+                        bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+                        spin_lock(&vbq->lock);
+                        list_del_rcu(&vb->free_list);
+                        spin_unlock(&vbq->lock);
+                        spin_unlock(&vb->lock);
+                        list_add_tail(&vb->purge, &purge);
+                } else
+                        spin_unlock(&vb->lock);
+        }
+        rcu_read_unlock();
+        list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+                list_del(&vb->purge);
+                free_vmap_block(vb);
+        }
+}
+static void purge_fragmented_blocks_thiscpu(void)
+{
+        purge_fragmented_blocks(smp_processor_id());
+}
+static void purge_fragmented_blocks_allcpus(void)
+{
+        int cpu;
+        for_each_possible_cpu(cpu)
+                purge_fragmented_blocks(cpu);
+}
 static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 {
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        unsigned long addr = 0;
        unsigned int order;
+        int purge = 0;
        BUG_ON(size & ~PAGE_MASK);
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
                int i;
                spin_lock(&vb->lock);
+                if (vb->free < 1UL << order)
+                        goto next;
                i = bitmap_find_free_region(vb->alloc_map,
                                                VMAP_BBMAP_BITS, order);
-                if (i >= 0) {
+                if (i < 0) {
-                        addr = vb->va->va_start + (i << PAGE_SHIFT);
+                        if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
-                        BUG_ON(addr_to_vb_idx(addr) !=
+                                /* fragmented and no outstanding allocations */
-                                        addr_to_vb_idx(vb->va->va_start));
+                                BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
-                        vb->free -= 1UL << order;
+                                purge = 1;
-                        if (vb->free == 0) {
-                                spin_lock(&vbq->lock);
-                                list_del_init(&vb->free_list);
-                                spin_unlock(&vbq->lock);
                        }
-                        spin_unlock(&vb->lock);
+                        goto next;
-                        break;
+                }
+                addr = vb->va->va_start + (i << PAGE_SHIFT);
+                BUG_ON(addr_to_vb_idx(addr) !=
+                                addr_to_vb_idx(vb->va->va_start));
+                vb->free -= 1UL << order;
+                if (vb->free == 0) {
+                        spin_lock(&vbq->lock);
+                        list_del_rcu(&vb->free_list);
+                        spin_unlock(&vbq->lock);
                }
                spin_unlock(&vb->lock);
+                break;
+next:
+                spin_unlock(&vb->lock);
        }
+        if (purge)
+                purge_fragmented_blocks_thiscpu();
        put_cpu_var(vmap_block_queue);
        rcu_read_unlock();
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
        BUG_ON(!vb);
        spin_lock(&vb->lock);
-        bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+        BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
        vb->dirty += 1UL << order;
        if (vb->dirty == VMAP_BBMAP_BITS) {
-                BUG_ON(vb->free || !list_empty(&vb->free_list));
+                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
                vbq = &per_cpu(vmap_block_queue, i);
                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
-                INIT_LIST_HEAD(&vbq->dirty);
-                vbq->nr_dirty = 0;
        }
        /* Import existing vmlist entries. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b7..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
                if (!populated_zone(zone))
                        continue;
+                if (zone_is_all_unreclaimable(zone))
+                        continue;
                if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
                                                                0, 0))
                        return 1;
author	Ingo Molnar <mingo@elte.hu>	2010-02-25 03:40:22 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-02-25 03:40:26 -0500
commit	996de8c6fe95c5a9fc524241cc8f142ef0605d3d (patch)
tree	0f637ab0d80d6d7e213707ac2d8c1cc16b69523c /mm
parent	017c426138122c8e9b9f5057fbd0567c37b35247 (diff)
parent	60b341b778cc2929df16c0a504c91621b3c6a4ad (diff)