Merge branch 'x86/irq' into x86/apic

Merge reason: Conflicts in arch/x86/kernel/apic/io_apic.c

Resolved Conflicts:
	arch/x86/kernel/apic/io_apic.c

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
author: H. Peter Anvin <hpa@zytor.com> 2010-02-22 19:20:34 -0500
committer: H. Peter Anvin <hpa@zytor.com> 2010-02-22 19:20:34 -0500
commit: d02e30c31c57683a66ed68a1bcff900ca78f6d56 (patch)
tree: c3ce99a00061bcc1199b50fa838147d876c56717 /mm
parent: 0fdc7a8022c3eaff6b5ee27ffb9e913e5e58d8e9 (diff)
parent: aef55d4922e62a0d887e60d87319f3718aec6ced (diff)
14 files changed, 354 insertions, 177 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0eb6cb..698ea80f2102 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
 static struct page *__read_cache_page(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *,struct page*),
-                                void *data)
+                                void *data,
+                                gfp_t gfp)
 {
        struct page *page;
        int err;
 repeat:
        page = find_get_page(mapping, index);
        if (!page) {
-                page = page_cache_alloc_cold(mapping);
+                page = __page_cache_alloc(gfp | __GFP_COLD);
                if (!page)
                        return ERR_PTR(-ENOMEM);
                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1661,31 +1662,18 @@ repeat:
        return page;
 }
-/**
+static struct page *do_read_cache_page(struct address_space *mapping,
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping:    the page's address_space
- * @index:      the page index
- * @filler:     function to perform the read
- * @data:       destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *,struct page*),
-                                void *data)
+                                void *data,
+                                gfp_t gfp)
 {
        struct page *page;
        int err;
 retry:
-        page = __read_cache_page(mapping, index, filler, data);
+        page = __read_cache_page(mapping, index, filler, data, gfp);
        if (IS_ERR(page))
                return page;
        if (PageUptodate(page))
@@ -1710,8 +1698,67 @@ out:
        mark_page_accessed(page);
        return page;
 }
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping:    the page's address_space
+ * @index:      the page index
+ * @filler:     function to perform the read
+ * @data:       destination for read data
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+                                pgoff_t index,
+                                int (*filler)(void *,struct page*),
+                                void *data)
+{
+        return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
 EXPORT_SYMBOL(read_cache_page_async);
+static struct page *wait_on_page_read(struct page *page)
+{
+        if (!IS_ERR(page)) {
+                wait_on_page_locked(page);
+                if (!PageUptodate(page)) {
+                        page_cache_release(page);
+                        page = ERR_PTR(-EIO);
+                }
+        }
+        return page;
+}
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping:    the page's address_space
+ * @index:      the page index
+ * @gfp:        the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+                                pgoff_t index,
+                                gfp_t gfp)
+{
+        filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+        return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
 /**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping:    the page's address_space
@@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
                                int (*filler)(void *,struct page*),
                                void *data)
 {
-        struct page *page;
+        return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
-        page = read_cache_page_async(mapping, index, filler, data);
-        if (IS_ERR(page))
-                goto out;
-        wait_on_page_locked(page);
-        if (!PageUptodate(page)) {
-                page_cache_release(page);
-                page = ERR_PTR(-EIO);
-        }
- out:
-        return page;
 }
 EXPORT_SYMBOL(read_cache_page);
@@ -2196,6 +2232,9 @@ again:
                if (unlikely(status))
                        break;
+                if (mapping_writably_mapped(mapping))
+                        flush_dcache_page(page);
                pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
                pagefault_enable();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65f38c218207..2d16fa6b8c2d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -402,7 +402,7 @@ static void clear_huge_page(struct page *page,
 {
        int i;
-        if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+        if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
                clear_gigantic_page(page, addr, sz);
                return;
        }
@@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = {
        .attrs = hstate_attrs,
 };
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
-                                struct kobject *parent,
+                                    struct kobject **hstate_kobjs,
-                                struct kobject **hstate_kobjs,
+                                    struct attribute_group *hstate_attr_group)
-                                struct attribute_group *hstate_attr_group)
 {
        int retval;
        int hi = h - hstates;
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
 * Safely read from address @src to the buffer at @dst.  If a kernel fault
 * happens, handle that and return -EFAULT.
 */
-long probe_kernel_read(void *dst, void *src, size_t size)
+long __weak probe_kernel_read(void *dst, void *src, size_t size)
+    __attribute__((alias("__probe_kernel_read")));
+long __probe_kernel_read(void *dst, void *src, size_t size)
 {
        long ret;
        mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
 * Safely write to address @dst from the buffer at @src.  If a kernel fault
 * happens, handle that and return -EFAULT.
 */
-long notrace __weak probe_kernel_write(void *dst, void *src, size_t size)
+long __weak probe_kernel_write(void *dst, void *src, size_t size)
+    __attribute__((alias("__probe_kernel_write")));
+long __probe_kernel_write(void *dst, void *src, size_t size)
 {
        long ret;
        mm_segment_t old_fs = get_fs();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 488b644e0e8e..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2586,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
        if (free_all)
                goto try_to_free;
 move_account:
-        while (mem->res.usage > 0) {
+        do {
                ret = -EBUSY;
                if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
                        goto out;
@@ -2614,8 +2614,8 @@ move_account:
                if (ret == -ENOMEM)
                        goto try_to_free;
                cond_resched();
-        }
+        /* "ret" should also be checked to ensure all lists are empty. */
-        ret = 0;
+        } while (mem->res.usage > 0 || ret);
 out:
        css_put(&mem->css);
        return ret;
@@ -2648,10 +2648,7 @@ try_to_free:
        }
        lru_add_drain();
        /* try move_account...there may be some *locked* pages. */
-        if (mem->res.usage)
+        goto move_account;
-                goto move_account;
-        ret = 0;
-        goto out;
 }
 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
diff --git a/mm/migrate.c b/mm/migrate.c
index efddbf0926b2..9a0db5bbabe4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
                                goto out_pm;
                        err = -ENODEV;
+                        if (node < 0 || node >= MAX_NUMNODES)
+                                goto out_pm;
                        if (!node_state(node, N_HIGH_MEMORY))
                                goto out_pm;
diff --git a/mm/mmap.c b/mm/mmap.c
index d9c77b2dbe9d..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(do_mmap_pgoff);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+                unsigned long, prot, unsigned long, flags,
+                unsigned long, fd, unsigned long, pgoff)
+{
+        struct file *file = NULL;
+        unsigned long retval = -EBADF;
+        if (!(flags & MAP_ANONYMOUS)) {
+                if (unlikely(flags & MAP_HUGETLB))
+                        return -EINVAL;
+                file = fget(fd);
+                if (!file)
+                        goto out;
+        } else if (flags & MAP_HUGETLB) {
+                struct user_struct *user = NULL;
+                /*
+                 * VM_NORESERVE is used because the reservations will be
+                 * taken when vm_ops->mmap() is called
+                 * A dummy user value is used because we are not locking
+                 * memory so no accounting is necessary
+                 */
+                len = ALIGN(len, huge_page_size(&default_hstate));
+                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+                                                &user, HUGETLB_ANONHUGE_INODE);
+                if (IS_ERR(file))
+                        return PTR_ERR(file);
+        }
+        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+        down_write(&current->mm->mmap_sem);
+        retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+        up_write(&current->mm->mmap_sem);
+        if (file)
+                fput(file);
+out:
+        return retval;
+}
 /*
 * Some shared mappigns will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
diff --git a/mm/nommu.c b/mm/nommu.c
index 8687973462bb..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
        /*
         * Ok, looks good - let it rip.
         */
+        flush_icache_range(mm->brk, brk);
        return mm->brk = brk;
 }
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
 static void __put_nommu_region(struct vm_region *region)
        __releases(nommu_region_sem)
 {
-        kenter("%p{%d}", region, atomic_read(&region->vm_usage));
+        kenter("%p{%d}", region, region->vm_usage);
        BUG_ON(!nommu_region_tree.rb_node);
-        if (atomic_dec_and_test(&region->vm_usage)) {
+        if (--region->vm_usage == 0) {
                if (region->vm_top > region->vm_start)
                        delete_nommu_region(region);
                up_write(&nommu_region_sem);
@@ -1204,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
        if (!vma)
                goto error_getting_vma;
-        atomic_set(&region->vm_usage, 1);
+        region->vm_usage = 1;
        region->vm_flags = vm_flags;
        region->vm_pgoff = pgoff;
@@ -1271,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
                        }
                        /* we've found a region we can share */
-                        atomic_inc(&pregion->vm_usage);
+                        pregion->vm_usage++;
                        vma->vm_region = pregion;
                        start = pregion->vm_start;
                        start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1288,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
                                        vma->vm_region = NULL;
                                        vma->vm_start = 0;
                                        vma->vm_end = 0;
-                                        atomic_dec(&pregion->vm_usage);
+                                        pregion->vm_usage--;
                                        pregion = NULL;
                                        goto error_just_free;
                                }
@@ -1353,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
 share:
        add_vma_to_mm(current->mm, vma);
-        up_write(&nommu_region_sem);
+        /* we flush the region from the icache only when the first executable
+         * mapping of it is made  */
+        if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+                flush_icache_range(region->vm_start, region->vm_end);
+                region->vm_icache_flushed = true;
+        }
-        if (prot & PROT_EXEC)
+        up_write(&nommu_region_sem);
-                flush_icache_range(result, result + len);
        kleave(" = %lx", result);
        return result;
@@ -1398,6 +1403,31 @@ error_getting_region:
 }
 EXPORT_SYMBOL(do_mmap_pgoff);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+                unsigned long, prot, unsigned long, flags,
+                unsigned long, fd, unsigned long, pgoff)
+{
+        struct file *file = NULL;
+        unsigned long retval = -EBADF;
+        if (!(flags & MAP_ANONYMOUS)) {
+                file = fget(fd);
+                if (!file)
+                        goto out;
+        }
+        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+        down_write(&current->mm->mmap_sem);
+        retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+        up_write(&current->mm->mmap_sem);
+        if (file)
+                fput(file);
+out:
+        return retval;
+}
 /*
 * split a vma into two pieces at address 'addr', a new vma is allocated either
 * for the first part or the tail.
@@ -1411,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        kenter("");
-        /* we're only permitted to split anonymous regions that have a single
+        /* we're only permitted to split anonymous regions (these should have
-         * owner */
+         * only a single usage on the region) */
-        if (vma->vm_file ||
+        if (vma->vm_file)
-            atomic_read(&vma->vm_region->vm_usage) != 1)
                return -ENOMEM;
        if (mm->map_count >= sysctl_max_map_count)
@@ -1488,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
        /* cut the backing region down to size */
        region = vma->vm_region;
-        BUG_ON(atomic_read(&region->vm_usage) != 1);
+        BUG_ON(region->vm_usage != 1);
        down_write(&nommu_region_sem);
        delete_nommu_region(region);
@@ -1732,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL(unmap_mapping_range);
 /*
- * ask for an unmapped area at which to create a mapping on a file
- */
-unsigned long get_unmapped_area(struct file *file, unsigned long addr,
-                                unsigned long len, unsigned long pgoff,
-                                unsigned long flags)
-{
-        unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
-                                  unsigned long, unsigned long);
-        get_area = current->mm->get_unmapped_area;
-        if (file && file->f_op && file->f_op->get_unmapped_area)
-                get_area = file->f_op->get_unmapped_area;
-        if (!get_area)
-                return -ENOSYS;
-        return get_area(file, addr, len, pgoff, flags);
-}
-EXPORT_SYMBOL(get_unmapped_area);
-/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
@@ -1891,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
                /* only read or write mappings where it is permitted */
                if (write && vma->vm_flags & VM_MAYWRITE)
-                        len -= copy_to_user((void *) addr, buf, len);
+                        copy_to_user_page(vma, NULL, addr,
+                                         (void *) addr, buf, len);
                else if (!write && vma->vm_flags & VM_MAYREAD)
-                        len -= copy_from_user(buf, (void *) addr, len);
+                        copy_from_user_page(vma, NULL, addr,
+                                            buf, (void *) addr, len);
                else
                        len = 0;
        } else {
@@ -1904,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
        mmput(mm);
        return len;
 }
+/**
+ * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
+ * @inode: The inode to check
+ * @size: The current filesize of the inode
+ * @newsize: The proposed filesize of the inode
+ *
+ * Check the shared mappings on an inode on behalf of a shrinking truncate to
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond the new file size so that do_mmap_pgoff()
+ * doesn't automatically grant mappings that are too large.
+ */
+int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
+                                size_t newsize)
+{
+        struct vm_area_struct *vma;
+        struct prio_tree_iter iter;
+        struct vm_region *region;
+        pgoff_t low, high;
+        size_t r_size, r_top;
+        low = newsize >> PAGE_SHIFT;
+        high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        down_write(&nommu_region_sem);
+        /* search for VMAs that fall within the dead zone */
+        vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+                              low, high) {
+                /* found one - only interested if it's shared out of the page
+                 * cache */
+                if (vma->vm_flags & VM_SHARED) {
+                        up_write(&nommu_region_sem);
+                        return -ETXTBSY; /* not quite true, but near enough */
+                }
+        }
+        /* reduce any regions that overlap the dead zone - if in existence,
+         * these will be pointed to by VMAs that don't overlap the dead zone
+         *
+         * we don't check for any regions that start beyond the EOF as there
+         * shouldn't be any
+         */
+        vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+                              0, ULONG_MAX) {
+                if (!(vma->vm_flags & VM_SHARED))
+                        continue;
+                region = vma->vm_region;
+                r_size = region->vm_top - region->vm_start;
+                r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
+                if (r_top > newsize) {
+                        region->vm_top -= r_top - newsize;
+                        if (region->vm_end > region->vm_top)
+                                region->vm_end = region->vm_top;
+                }
+        }
+        up_write(&nommu_region_sem);
+        return 0;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e9f5cc5fb59..8deb9d0fd5b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -556,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        page = list_entry(list->prev, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
-                        __free_one_page(page, zone, 0, migratetype);
+                        /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
-                        trace_mm_page_pcpu_drain(page, 0, migratetype);
+                        __free_one_page(page, zone, 0, page_private(page));
+                        trace_mm_page_pcpu_drain(page, 0, page_private(page));
                } while (--count && --batch_free && !list_empty(list));
        }
        spin_unlock(&zone->lock);
@@ -1222,10 +1223,10 @@ again:
                }
                spin_lock_irqsave(&zone->lock, flags);
                page = __rmqueue(zone, order, migratetype);
-                __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
+                __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
        }
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -3998,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
                }
                /* Merge backward if suitable */
-                if (start_pfn < early_node_map[i].end_pfn &&
+                if (start_pfn < early_node_map[i].start_pfn &&
                                end_pfn >= early_node_map[i].start_pfn) {
                        early_node_map[i].start_pfn = start_pfn;
                        return;
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010cc91c6..083e7c91e5f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1271,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work)
 */
 void free_percpu(void *ptr)
 {
-        void *addr = __pcpu_ptr_to_addr(ptr);
+        void *addr;
        struct pcpu_chunk *chunk;
        unsigned long flags;
        int off;
@@ -1279,6 +1279,8 @@ void free_percpu(void *ptr)
        if (!ptr)
                return;
+        addr = __pcpu_ptr_to_addr(ptr);
        spin_lock_irqsave(&pcpu_lock, flags);
        chunk = pcpu_chunk_addr_search(addr);
diff --git a/mm/slab.c b/mm/slab.c
index 7d41f15b48d3..7451bdacaf18 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -654,7 +654,7 @@ static void init_node_lock_keys(int q)
                l3 = s->cs_cachep->nodelists[q];
                if (!l3 || OFF_SLAB(s->cs_cachep))
-                        return;
+                        continue;
                lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
                alc = l3->alien;
                /*
@@ -665,7 +665,7 @@ static void init_node_lock_keys(int q)
                 * for alloc_alien_cache,
                 */
                if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
-                        return;
+                        continue;
                for_each_node(r) {
                        if (alc[r])
                                lockdep_set_class(&alc[r]->lock,
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee22684..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -522,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
 */
 void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
 {
-        if (new < old) {
+        struct address_space *mapping = inode->i_mapping;
-                struct address_space *mapping = inode->i_mapping;
+        /*
-                /*
+         * unmap_mapping_range is called twice, first simply for
-                 * unmap_mapping_range is called twice, first simply for
+         * efficiency so that truncate_inode_pages does fewer
-                 * efficiency so that truncate_inode_pages does fewer
+         * single-page unmaps.  However after this first call, and
-                 * single-page unmaps.  However after this first call, and
+         * before truncate_inode_pages finishes, it is possible for
-                 * before truncate_inode_pages finishes, it is possible for
+         * private pages to be COWed, which remain after
-                 * private pages to be COWed, which remain after
+         * truncate_inode_pages finishes, hence the second
-                 * truncate_inode_pages finishes, hence the second
+         * unmap_mapping_range call must be made for correctness.
-                 * unmap_mapping_range call must be made for correctness.
+         */
-                 */
+        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
-                unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+        truncate_inode_pages(mapping, new);
-                truncate_inode_pages(mapping, new);
+        unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
-                unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
-        }
 }
 EXPORT_SYMBOL(truncate_pagecache);
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
-#include <linux/hugetlb.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
 #include <asm/uaccess.h>
 #define CREATE_TRACE_POINTS
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
 }
 EXPORT_SYMBOL(strndup_user);
-#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
        mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
-SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
-                unsigned long, prot, unsigned long, flags,
-                unsigned long, fd, unsigned long, pgoff)
-{
-        struct file * file = NULL;
-        unsigned long retval = -EBADF;
-        if (!(flags & MAP_ANONYMOUS)) {
-                if (unlikely(flags & MAP_HUGETLB))
-                        return -EINVAL;
-                file = fget(fd);
-                if (!file)
-                        goto out;
-        } else if (flags & MAP_HUGETLB) {
-                struct user_struct *user = NULL;
-                /*
-                 * VM_NORESERVE is used because the reservations will be
-                 * taken when vm_ops->mmap() is called
-                 * A dummy user value is used because we are not locking
-                 * memory so no accounting is necessary
-                 */
-                len = ALIGN(len, huge_page_size(&default_hstate));
-                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
-                                                &user, HUGETLB_ANONHUGE_INODE);
-                if (IS_ERR(file))
-                        return PTR_ERR(file);
-        }
-        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-        down_write(&current->mm->mmap_sem);
-        retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-        up_write(&current->mm->mmap_sem);
-        if (file)
-                fput(file);
-out:
-        return retval;
-}
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37e69295f250..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
 static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
 /*
 * Purges all lazily-freed vmap areas.
 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
        } else
                spin_lock(&purge_lock);
+        if (sync)
+                purge_fragmented_blocks_allcpus();
        rcu_read_lock();
        list_for_each_entry_rcu(va, &vmap_area_list, list) {
                if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
        }
        rcu_read_unlock();
-        if (nr) {
+        if (nr)
-                BUG_ON(nr > atomic_read(&vmap_lazy_nr));
                atomic_sub(nr, &vmap_lazy_nr);
-        }
        if (nr || force_flush)
                flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
 struct vmap_block_queue {
        spinlock_t lock;
        struct list_head free;
-        struct list_head dirty;
-        unsigned int nr_dirty;
 };
 struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
        unsigned long free, dirty;
        DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
        DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
-        union {
+        struct list_head free_list;
-                struct list_head free_list;
+        struct rcu_head rcu_head;
-                struct rcu_head rcu_head;
+        struct list_head purge;
-        };
 };
 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
        vbq = &get_cpu_var(vmap_block_queue);
        vb->vbq = vbq;
        spin_lock(&vbq->lock);
-        list_add(&vb->free_list, &vbq->free);
+        list_add_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
        put_cpu_var(vmap_block_queue);
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
        struct vmap_block *tmp;
        unsigned long vb_idx;
-        BUG_ON(!list_empty(&vb->free_list));
        vb_idx = addr_to_vb_idx(vb->va->va_start);
        spin_lock(&vmap_block_tree_lock);
        tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
        call_rcu(&vb->rcu_head, rcu_free_vb);
 }
+static void purge_fragmented_blocks(int cpu)
+{
+        LIST_HEAD(purge);
+        struct vmap_block *vb;
+        struct vmap_block *n_vb;
+        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+        rcu_read_lock();
+        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+                if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+                        continue;
+                spin_lock(&vb->lock);
+                if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+                        vb->free = 0; /* prevent further allocs after releasing lock */
+                        vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+                        bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
+                        bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+                        spin_lock(&vbq->lock);
+                        list_del_rcu(&vb->free_list);
+                        spin_unlock(&vbq->lock);
+                        spin_unlock(&vb->lock);
+                        list_add_tail(&vb->purge, &purge);
+                } else
+                        spin_unlock(&vb->lock);
+        }
+        rcu_read_unlock();
+        list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+                list_del(&vb->purge);
+                free_vmap_block(vb);
+        }
+}
+static void purge_fragmented_blocks_thiscpu(void)
+{
+        purge_fragmented_blocks(smp_processor_id());
+}
+static void purge_fragmented_blocks_allcpus(void)
+{
+        int cpu;
+        for_each_possible_cpu(cpu)
+                purge_fragmented_blocks(cpu);
+}
 static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 {
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        unsigned long addr = 0;
        unsigned int order;
+        int purge = 0;
        BUG_ON(size & ~PAGE_MASK);
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -808,24 +856,38 @@ again:
                int i;
                spin_lock(&vb->lock);
+                if (vb->free < 1UL << order)
+                        goto next;
                i = bitmap_find_free_region(vb->alloc_map,
                                                VMAP_BBMAP_BITS, order);
-                if (i >= 0) {
+                if (i < 0) {
-                        addr = vb->va->va_start + (i << PAGE_SHIFT);
+                        if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
-                        BUG_ON(addr_to_vb_idx(addr) !=
+                                /* fragmented and no outstanding allocations */
-                                        addr_to_vb_idx(vb->va->va_start));
+                                BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
-                        vb->free -= 1UL << order;
+                                purge = 1;
-                        if (vb->free == 0) {
-                                spin_lock(&vbq->lock);
-                                list_del_init(&vb->free_list);
-                                spin_unlock(&vbq->lock);
                        }
-                        spin_unlock(&vb->lock);
+                        goto next;
-                        break;
+                }
+                addr = vb->va->va_start + (i << PAGE_SHIFT);
+                BUG_ON(addr_to_vb_idx(addr) !=
+                                addr_to_vb_idx(vb->va->va_start));
+                vb->free -= 1UL << order;
+                if (vb->free == 0) {
+                        spin_lock(&vbq->lock);
+                        list_del_rcu(&vb->free_list);
+                        spin_unlock(&vbq->lock);
                }
                spin_unlock(&vb->lock);
+                break;
+next:
+                spin_unlock(&vb->lock);
        }
+        if (purge)
+                purge_fragmented_blocks_thiscpu();
        put_cpu_var(vmap_block_queue);
        rcu_read_unlock();
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
        BUG_ON(!vb);
        spin_lock(&vb->lock);
-        bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+        BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
        vb->dirty += 1UL << order;
        if (vb->dirty == VMAP_BBMAP_BITS) {
-                BUG_ON(vb->free || !list_empty(&vb->free_list));
+                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
                vbq = &per_cpu(vmap_block_queue, i);
                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
-                INIT_LIST_HEAD(&vbq->dirty);
-                vbq->nr_dirty = 0;
        }
        /* Import existing vmlist entries. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 885207a6b6b7..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1922,6 +1922,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
                if (!populated_zone(zone))
                        continue;
+                if (zone_is_all_unreclaimable(zone))
+                        continue;
                if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
                                                                0, 0))
                        return 1;
author	H. Peter Anvin <hpa@zytor.com>	2010-02-22 19:20:34 -0500
committer	H. Peter Anvin <hpa@zytor.com>	2010-02-22 19:20:34 -0500
commit	d02e30c31c57683a66ed68a1bcff900ca78f6d56 (patch)
tree	c3ce99a00061bcc1199b50fa838147d876c56717 /mm
parent	0fdc7a8022c3eaff6b5ee27ffb9e913e5e58d8e9 (diff)
parent	aef55d4922e62a0d887e60d87319f3718aec6ced (diff)