author		Felix Blyakher <felixb@sgi.com>	2009-04-01 17:58:39 -0400
committer	Felix Blyakher <felixb@sgi.com>	2009-04-01 17:58:39 -0400
commit		f36345ff9a4a77f2cc576a2777b6256d5c8798fa (patch)
tree		7ae4c607f6baae74060c2e385f744e171fbbf92b /mm
parent		1aacc064e029f0017384e463121b98f06d3a2cc3 (diff)
parent		8b53ef33d9d8fa5f771ae11cc6a6e7bc0182beec (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    9
-rw-r--r--  mm/Kconfig.debug      |   17
-rw-r--r--  mm/Makefile           |    1
-rw-r--r--  mm/debug-pagealloc.c  |  129
-rw-r--r--  mm/highmem.c          |   45
-rw-r--r--  mm/hugetlb.c          |    6
-rw-r--r--  mm/internal.h         |    8
-rw-r--r--  mm/memory.c           |   33
-rw-r--r--  mm/oom_kill.c         |   12
-rw-r--r--  mm/page-writeback.c   |   42
-rw-r--r--  mm/page_alloc.c       |   29
-rw-r--r--  mm/shmem.c            |    3
-rw-r--r--  mm/sparse.c           |    4
-rw-r--r--  mm/swap.c             |   23
-rw-r--r--  mm/util.c             |   30
-rw-r--r--  mm/vmalloc.c          |   19
-rw-r--r--  mm/vmscan.c           |  101
-rw-r--r--  mm/vmstat.c           |   11
18 files changed, 363 insertions(+), 159 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b77811fdf..b53427ad30a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -206,7 +206,6 @@ config VIRT_TO_BUS
 config UNEVICTABLE_LRU
 	bool "Add LRU list to track non-evictable pages"
 	default y
-	depends on MMU
 	help
 	  Keeps unevictable pages off of the active and inactive pageout
 	  lists, so kswapd will not waste CPU time or have its balancing
@@ -214,5 +213,13 @@ config UNEVICTABLE_LRU
 	  will use one page flag and increase the code size a little,
 	  say Y unless you know what you are doing.
 
+config HAVE_MLOCK
+	bool
+	default y if MMU=y
+
+config HAVE_MLOCKED_PAGE_BIT
+	bool
+	default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
+
 config MMU_NOTIFIER
 	bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 00000000000..c8d62d49a44
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,17 @@
+config WANT_PAGE_DEBUG_FLAGS
+	bool
+
+config PAGE_POISONING
+	bool "Debug page memory allocations"
+	depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	depends on !HIBERNATION
+	select DEBUG_PAGEALLOC
+	select WANT_PAGE_DEBUG_FLAGS
+	help
+	  Fill the pages with poison patterns after free_pages() and verify
+	  the patterns before alloc_pages(). This results in a large slowdown,
+	  but helps to find certain types of memory corruptions.
+
+	  This option cannot be enabled with hibernation. Otherwise, it will get
+	  wrong messages for memory corruption because the free pages are not
+	  saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 818569b68f4..ec73c68b601 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_FAILSLAB) += failslab.o
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 00000000000..a1e3324de2b
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,129 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-debug-flags.h>
+#include <linux/poison.h>
+
+static inline void set_page_poison(struct page *page)
+{
+	__set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline void clear_page_poison(struct page *page)
+{
+	__clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline bool page_poison(struct page *page)
+{
+	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static void poison_highpage(struct page *page)
+{
+	/*
+	 * Page poisoning for highmem pages is not implemented.
+	 *
+	 * This can be called from interrupt contexts.
+	 * So we need to create a new kmap_atomic slot for this
+	 * application and it will need interrupt protection.
+	 */
+}
+
+static void poison_page(struct page *page)
+{
+	void *addr;
+
+	if (PageHighMem(page)) {
+		poison_highpage(page);
+		return;
+	}
+	set_page_poison(page);
+	addr = page_address(page);
+	memset(addr, PAGE_POISON, PAGE_SIZE);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+	unsigned char error = a ^ b;
+
+	return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+	unsigned char *start;
+	unsigned char *end;
+
+	for (start = mem; start < mem + bytes; start++) {
+		if (*start != PAGE_POISON)
+			break;
+	}
+	if (start == mem + bytes)
+		return;
+
+	for (end = mem + bytes - 1; end > start; end--) {
+		if (*end != PAGE_POISON)
+			break;
+	}
+
+	if (!printk_ratelimit())
+		return;
+	else if (start == end && single_bit_flip(*start, PAGE_POISON))
+		printk(KERN_ERR "pagealloc: single bit error\n");
+	else
+		printk(KERN_ERR "pagealloc: memory corruption\n");
+
+	print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+			end - start + 1, 1);
+	dump_stack();
+}
+
+static void unpoison_highpage(struct page *page)
+{
+	/*
+	 * See comment in poison_highpage().
+	 * Highmem pages should not be poisoned for now
+	 */
+	BUG_ON(page_poison(page));
+}
+
+static void unpoison_page(struct page *page)
+{
+	if (PageHighMem(page)) {
+		unpoison_highpage(page);
+		return;
+	}
+	if (page_poison(page)) {
+		void *addr = page_address(page);
+
+		check_poison_mem(addr, PAGE_SIZE);
+		clear_page_poison(page);
+	}
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		unpoison_page(page + i);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (!debug_pagealloc_enabled)
+		return;
+
+	if (enable)
+		unpoison_pages(page, numpages);
+	else
+		poison_pages(page, numpages);
+}
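
The single_bit_flip() helper above distinguishes a one-bit error from broader corruption with the usual power-of-two test on the XOR of the two bytes. A small stand-alone sketch of the same check, hypothetical user-space code shown only for illustration, not part of the patch:

#include <stdbool.h>
#include <stdio.h>

/* Same test as single_bit_flip() above: true iff exactly one bit differs. */
static bool one_bit_differs(unsigned char a, unsigned char b)
{
	unsigned char error = a ^ b;

	/* a non-zero power of two shares no bits with itself minus one */
	return error && !(error & (error - 1));
}

int main(void)
{
	printf("%d\n", one_bit_differs(0xaa, 0xab));	/* 1: only bit 0 differs */
	printf("%d\n", one_bit_differs(0xaa, 0xa9));	/* 0: bits 0 and 1 differ */
	return 0;
}
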
diff --git a/mm/highmem.c b/mm/highmem.c
index 910198037bf..68eb1d9b63f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,3 +422,48 @@ void __init page_address_init(void)
 }
 
 #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+
+void debug_kmap_atomic(enum km_type type)
+{
+	static unsigned warn_count = 10;
+
+	if (unlikely(warn_count == 0))
+		return;
+
+	if (unlikely(in_interrupt())) {
+		if (in_irq()) {
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (!irqs_disabled()) {	/* softirq */
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+			    type != KM_SKB_SUNRPC_DATA &&
+			    type != KM_SKB_DATA_SOFTIRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		}
+	}
+
+	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+	    type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+		if (!irqs_disabled()) {
+			WARN_ON(1);
+			warn_count--;
+		}
+	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+		if (irq_count() == 0 && !irqs_disabled()) {
+			WARN_ON(1);
+			warn_count--;
+		}
+	}
+}
+
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d809a..28c655ba935 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h,
  * an instantiated the change should be committed via vma_commit_reservation.
  * No action is required on failure.
  */
-static int vma_needs_reservation(struct hstate *h,
+static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h,
 		return 1;
 
 	} else {
-		int err;
+		long err;
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
@@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned int chg;
+	long chg;
 
 	/*
 	 * Processes that did not create the mapping will have no reserves and
diff --git a/mm/internal.h b/mm/internal.h
index 478223b73a2..987bb03fbdd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+#ifdef CONFIG_HAVE_MLOCK
 extern long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end);
 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 {
 	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
 }
+#endif
 
 #ifdef CONFIG_UNEVICTABLE_LRU
 /*
@@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
 }
 #endif
 
-#ifdef CONFIG_UNEVICTABLE_LRU
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
  * Called only in fault path via page_evictable() for a new page
  * to determine if it's being mapped into a LOCKED vma.
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page)
 	}
 }
 
-#else /* CONFIG_UNEVICTABLE_LRU */
+#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
 static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 {
 	return 0;
@@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
 static inline void free_page_mlock(struct page *page) { }
 
-#endif /* CONFIG_UNEVICTABLE_LRU */
+#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
 
 /*
  * Return the mem_map entry representing the 'offset' subpage within
diff --git a/mm/memory.c b/mm/memory.c
index 2032ad2fc34..cf6873e91c6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
 			set_page_dirty(page);
+		/*
+		 * pte_mkyoung() would be more correct here, but atomic care
+		 * is needed to avoid losing the dirty bit: it is easier to use
+		 * mark_page_accessed().
+		 */
 		mark_page_accessed(page);
 	}
 unlock:
@@ -1940,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * get_user_pages(.write=1, .force=1).
 	 */
 	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+		struct vm_fault vmf;
+		int tmp;
+
+		vmf.virtual_address = (void __user *)(address &
+							PAGE_MASK);
+		vmf.pgoff = old_page->index;
+		vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+		vmf.page = old_page;
+
 		/*
 		 * Notify the address space that the page is about to
 		 * become writable so that it can prohibit this or wait
@@ -1951,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_cache_get(old_page);
 		pte_unmap_unlock(page_table, ptl);
 
-		if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+		tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+		if (unlikely(tmp &
+			     (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+			ret = tmp;
 			goto unwritable_page;
+		}
 
 		/*
 		 * Since we dropped the lock we need to revalidate
@@ -2101,7 +2119,7 @@ oom:
 
 unwritable_page:
 	page_cache_release(old_page);
-	return VM_FAULT_SIGBUS;
+	return ret;
 }
 
 /*
@@ -2435,8 +2453,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(PGMAJFAULT);
 	}
 
-	mark_page_accessed(page);
-
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
@@ -2645,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * to become writable
 	 */
 	if (vma->vm_ops->page_mkwrite) {
+		int tmp;
+
 		unlock_page(page);
-		if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
-			ret = VM_FAULT_SIGBUS;
+		vmf.flags |= FAULT_FLAG_MKWRITE;
+		tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+		if (unlikely(tmp &
+			     (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+			ret = tmp;
 			anon = 1; /* no anon but release vmf.page */
 			goto out_unlocked;
 		}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba05061a4..d3b9bac085b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock);
 
 unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
-	unsigned long points, cpu_time, run_time, s;
+	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
 	struct task_struct *child;
 
@@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	else
 		run_time = 0;
 
-	s = int_sqrt(cpu_time);
-	if (s)
-		points /= s;
-	s = int_sqrt(int_sqrt(run_time));
-	if (s)
-		points /= s;
+	if (cpu_time)
+		points /= int_sqrt(cpu_time);
+	if (run_time)
+		points /= int_sqrt(int_sqrt(run_time));
 
 	/*
 	 * Niced processes are most likely less important, so double
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 40ca7cdb653..30351f0063a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 20;
 unsigned long vm_dirty_bytes;
 
 /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
  */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
 
 /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
  */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
 
 	sync_supers();
 
-	oldest_jif = jiffies - dirty_expire_interval;
+	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
 	start_jif = jiffies;
-	next_jif = start_jif + dirty_writeback_interval;
+	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
@@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, file, buffer, length, ppos);
 	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+		mod_timer(&wb_timer, jiffies +
+			msecs_to_jiffies(dirty_writeback_interval * 10));
 	else
 		del_timer(&wb_timer);
 	return 0;
@@ -905,7 +906,8 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	mod_timer(&wb_timer,
+		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
@@ -1198,6 +1200,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 }
 
 /*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+	if (mapping_cap_account_dirty(mapping)) {
+		__inc_zone_page_state(page, NR_FILE_DIRTY);
+		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		task_dirty_inc(current);
+		task_io_account_write(PAGE_CACHE_SIZE);
+	}
+}
+
+/*
  * For address_spaces which do not use buffers. Just tag the page as dirty in
  * its radix tree.
  *
@@ -1226,13 +1242,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 		if (mapping2) { /* Race with truncate? */
 			BUG_ON(mapping2 != mapping);
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-			if (mapping_cap_account_dirty(mapping)) {
-				__inc_zone_page_state(page, NR_FILE_DIRTY);
-				__inc_bdi_stat(mapping->backing_dev_info,
-						BDI_RECLAIMABLE);
-				task_dirty_inc(current);
-				task_io_account_write(PAGE_CACHE_SIZE);
-			}
+			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
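
The two interval sysctls above now hold centiseconds instead of jiffies, so every consumer converts with msecs_to_jiffies(interval * 10). A quick worked check of that arithmetic for the default of 5 * 100 centiseconds, as a hypothetical user-space sketch with an assumed HZ of 1000 (illustration only, not part of the patch):

#include <stdio.h>

#define HZ 1000	/* assumed tick rate, for illustration only */

/* user-space stand-in for the kernel's msecs_to_jiffies() at HZ=1000 */
static unsigned long msecs_to_jiffies_example(unsigned int ms)
{
	return (unsigned long)ms * HZ / 1000;
}

int main(void)
{
	unsigned int dirty_writeback_interval = 5 * 100;	/* centiseconds */

	/* 500 cs -> 5000 ms -> the writeback timer still fires every 5 seconds */
	printf("%lu jiffies at HZ=%d\n",
	       msecs_to_jiffies_example(dirty_writeback_interval * 10), HZ);
	return 0;
}
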
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3803ea8c27..0284e528748 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu)
 	unsigned long flags;
 	struct zone *zone;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		if (!populated_zone(zone))
-			continue;
-
 		pset = zone_pcp(zone, cpu);
 
 		pcp = &pset->pcp;
@@ -1585,7 +1582,8 @@ nofail_alloc:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist, order,
+						gfp_mask, nodemask);
 
 	p->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
@@ -1879,10 +1877,7 @@ void show_free_areas(void)
 	int cpu;
 	struct zone *zone;
 
-	for_each_zone(zone) {
-		if (!populated_zone(zone))
-			continue;
-
+	for_each_populated_zone(zone) {
 		show_node(zone);
 		printk("%s per-cpu:\n", zone->name);
 
@@ -1922,12 +1917,9 @@ void show_free_areas(void)
 		global_page_state(NR_PAGETABLE),
 		global_page_state(NR_BOUNCE));
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		int i;
 
-		if (!populated_zone(zone))
-			continue;
-
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1967,12 +1959,9 @@ void show_free_areas(void)
 		printk("\n");
 	}
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
-		if (!populated_zone(zone))
-			continue;
-
 		show_node(zone);
 		printk("%s: ", zone->name);
 
@@ -2784,11 +2773,7 @@ static int __cpuinit process_zones(int cpu)
 
 	node_set_state(node, N_CPU);	/* this node has a cpu */
 
-	for_each_zone(zone) {
-
-		if (!populated_zone(zone))
-			continue;
-
+	for_each_populated_zone(zone) {
 		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
 					 GFP_KERNEL, node);
 		if (!zone_pcp(zone, cpu))
diff --git a/mm/shmem.c b/mm/shmem.c
index 7ec78e24a30..d94d2e9146b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		swap_duplicate(swap);
 		BUG_ON(page_mapped(page));
 		page_cache_release(page);	/* pagecache ref */
-		set_page_dirty(page);
-		unlock_page(page);
+		swap_writepage(page, wbc);
 		if (inode) {
 			mutex_lock(&shmem_swaplist_mutex);
 			/* move instead of add in case we're racing */
diff --git a/mm/sparse.c b/mm/sparse.c
index 083f5b63e7a..da432d9f0ae 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
 		WARN_ON_ONCE(1);
 		*start_pfn = max_sparsemem_pfn;
 		*end_pfn = max_sparsemem_pfn;
-	}
-
-	if (*end_pfn > max_sparsemem_pfn) {
+	} else if (*end_pfn > max_sparsemem_pfn) {
 		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
 			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
 			*start_pfn, *end_pfn, max_sparsemem_pfn);
diff --git a/mm/swap.c b/mm/swap.c
index 8adb9feb61e..6e83084c1f6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -457,29 +457,6 @@ void pagevec_strip(struct pagevec *pvec)
 }
 
 /**
- * pagevec_swap_free - try to free swap space from the pages in a pagevec
- * @pvec: pagevec with swapcache pages to free the swap space of
- *
- * The caller needs to hold an extra reference to each page and
- * not hold the page lock on the pages. This function uses a
- * trylock on the page lock so it may not always free the swap
- * space associated with a page.
- */
-void pagevec_swap_free(struct pagevec *pvec)
-{
-	int i;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct page *page = pvec->pages[i];
-
-		if (PageSwapCache(page) && trylock_page(page)) {
-			try_to_free_swap(page);
-			unlock_page(page);
-		}
-	}
-}
-
-/**
  * pagevec_lookup - gang pagecache lookup
  * @pvec: Where the resulting pages are placed
  * @mapping: The address_space to search
diff --git a/mm/util.c b/mm/util.c
index 37eaccdf305..7c122e49f76 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -70,6 +70,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
 EXPORT_SYMBOL(kmemdup);
 
 /**
+ * memdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Returns an ERR_PTR() on failure.
+ */
+void *memdup_user(const void __user *src, size_t len)
+{
+	void *p;
+
+	/*
+	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
+	 * cause pagefault, which makes it pointless to use GFP_NOFS
+	 * or GFP_ATOMIC.
+	 */
+	p = kmalloc_track_caller(len, GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, src, len)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return p;
+}
+EXPORT_SYMBOL(memdup_user);
+
+/**
  * __krealloc - like krealloc() but don't free @p.
  * @p: object to reallocate memory for.
  * @new_size: how many bytes of memory are required.
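
memdup_user() above wraps the common kmalloc() + copy_from_user() pattern and reports failure through ERR_PTR(). A minimal sketch of a caller follows; the handler name and parameter struct are hypothetical and not part of this patch, and the usual err.h/slab.h/uaccess.h includes are assumed:

/* hypothetical example: copy a variable-sized parameter block from user space */
static long example_set_params(void __user *arg, size_t len)
{
	struct example_params *p;	/* hypothetical structure, for illustration */

	p = memdup_user(arg, len);
	if (IS_ERR(p))
		return PTR_ERR(p);	/* -ENOMEM or -EFAULT, as set by memdup_user() */

	/* ... validate and apply the parameters ... */

	kfree(p);
	return 0;
}
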
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af58324c361..fab19876b4d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -671,10 +671,7 @@ struct vmap_block {
 	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
 	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
 	union {
-		struct {
-			struct list_head free_list;
-			struct list_head dirty_list;
-		};
+		struct list_head free_list;
 		struct rcu_head rcu_head;
 	};
 };
@@ -741,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
 	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
 	INIT_LIST_HEAD(&vb->free_list);
-	INIT_LIST_HEAD(&vb->dirty_list);
 
 	vb_idx = addr_to_vb_idx(va->va_start);
 	spin_lock(&vmap_block_tree_lock);
@@ -772,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	struct vmap_block *tmp;
 	unsigned long vb_idx;
 
-	spin_lock(&vb->vbq->lock);
-	if (!list_empty(&vb->free_list))
-		list_del(&vb->free_list);
-	if (!list_empty(&vb->dirty_list))
-		list_del(&vb->dirty_list);
-	spin_unlock(&vb->vbq->lock);
+	BUG_ON(!list_empty(&vb->free_list));
 
 	vb_idx = addr_to_vb_idx(vb->va->va_start);
 	spin_lock(&vmap_block_tree_lock);
@@ -862,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size)
 
 	spin_lock(&vb->lock);
 	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
-	if (!vb->dirty) {
-		spin_lock(&vb->vbq->lock);
-		list_add(&vb->dirty_list, &vb->vbq->dirty);
-		spin_unlock(&vb->vbq->lock);
-	}
+
 	vb->dirty += 1UL << order;
 	if (vb->dirty == VMAP_BBMAP_BITS) {
 		BUG_ON(vb->free || !list_empty(&vb->free_list));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 479e4671939..06e72693b45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -60,8 +60,8 @@ struct scan_control {
 
 	int may_writepage;
 
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
+	/* Can mapped pages be reclaimed? */
+	int may_unmap;
 
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -78,6 +78,12 @@ struct scan_control {
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
 
+	/*
+	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
+	 * are scanned.
+	 */
+	nodemask_t *nodemask;
+
 	/* Pluggable isolate pages callback */
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
@@ -214,8 +220,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 		do_div(delta, lru_pages + 1);
 		shrinker->nr += delta;
 		if (shrinker->nr < 0) {
-			printk(KERN_ERR "%s: nr=%ld\n",
-					__func__, shrinker->nr);
+			printk(KERN_ERR "shrink_slab: %pF negative objects to "
+			       "delete nr=%ld\n",
+			       shrinker->shrink, shrinker->nr);
 			shrinker->nr = max_pass;
 		}
 
@@ -606,7 +613,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (unlikely(!page_evictable(page, NULL)))
 			goto cull_mlocked;
 
-		if (!sc->may_swap && page_mapped(page))
+		if (!sc->may_unmap && page_mapped(page))
 			goto keep_locked;
 
 		/* Double the slab pressure for mapped and swapcache pages */
@@ -1298,17 +1305,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	pgdeactivate += pgmoved;
-	if (buffer_heads_over_limit) {
-		spin_unlock_irq(&zone->lru_lock);
-		pagevec_strip(&pvec);
-		spin_lock_irq(&zone->lru_lock);
-	}
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
-	if (vm_swap_full())
-		pagevec_swap_free(&pvec);
-
+	if (buffer_heads_over_limit)
+		pagevec_strip(&pvec);
 	pagevec_release(&pvec);
 }
 
@@ -1543,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 	struct zone *zone;
 
 	sc->all_unreclaimable = 1;
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+					sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 		/*
@@ -1688,17 +1690,18 @@ out:
 }
 
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-				gfp_t gfp_mask)
+				gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 		.isolate_pages = isolate_pages_global,
+		.nodemask = nodemask,
 	};
 
 	return do_try_to_free_pages(zonelist, &sc);
@@ -1713,17 +1716,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
+		.nodemask = NULL, /* we don't care the placement */
 	};
 	struct zonelist *zonelist;
 
 	if (noswap)
-		sc.may_swap = 0;
+		sc.may_unmap = 0;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1762,7 +1766,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
@@ -2050,22 +2054,19 @@ unsigned long global_lru_pages(void)
 #ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
- * from LRU lists system-wide, for given pass and priority, and returns the
- * number of reclaimed pages
+ * from LRU lists system-wide, for given pass and priority.
  *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+static void shrink_all_zones(unsigned long nr_pages, int prio,
 				      int pass, struct scan_control *sc)
 {
 	struct zone *zone;
-	unsigned long ret = 0;
+	unsigned long nr_reclaimed = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		enum lru_list l;
 
-		if (!populated_zone(zone))
-			continue;
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
@@ -2084,14 +2085,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 
 				zone->lru[l].nr_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
-				ret += shrink_list(l, nr_to_scan, zone,
+				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
-				if (ret >= nr_pages)
-					return ret;
+				if (nr_reclaimed >= nr_pages) {
+					sc->nr_reclaimed = nr_reclaimed;
+					return;
+				}
 			}
 		}
 	}
-	return ret;
+	sc->nr_reclaimed = nr_reclaimed;
 }
 
@@ -2105,13 +2108,11 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
 	unsigned long lru_pages, nr_slab;
-	unsigned long ret = 0;
 	int pass;
 	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.may_swap = 0,
-		.swap_cluster_max = nr_pages,
+		.may_unmap = 0,
 		.may_writepage = 1,
 		.isolate_pages = isolate_pages_global,
 	};
@@ -2127,8 +2128,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		if (!reclaim_state.reclaimed_slab)
 			break;
 
-		ret += reclaim_state.reclaimed_slab;
-		if (ret >= nr_pages)
+		sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+		if (sc.nr_reclaimed >= nr_pages)
 			goto out;
 
 		nr_slab -= reclaim_state.reclaimed_slab;
@@ -2147,21 +2148,22 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 		/* Force reclaiming mapped pages in the passes #3 and #4 */
 		if (pass > 2)
-			sc.may_swap = 1;
+			sc.may_unmap = 1;
 
 		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
-			unsigned long nr_to_scan = nr_pages - ret;
+			unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
 
 			sc.nr_scanned = 0;
-			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
-			if (ret >= nr_pages)
+			sc.swap_cluster_max = nr_to_scan;
+			shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (sc.nr_reclaimed >= nr_pages)
 				goto out;
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
 				global_lru_pages());
-			ret += reclaim_state.reclaimed_slab;
-			if (ret >= nr_pages)
+			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+			if (sc.nr_reclaimed >= nr_pages)
 				goto out;
 
 			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
@@ -2170,21 +2172,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	}
 
 	/*
-	 * If ret = 0, we could not shrink LRUs, but there may be something
-	 * in slab caches
+	 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
+	 * something in slab caches
 	 */
-	if (!ret) {
+	if (!sc.nr_reclaimed) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
-			ret += reclaim_state.reclaimed_slab;
-		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+		} while (sc.nr_reclaimed < nr_pages &&
+				reclaim_state.reclaimed_slab > 0);
 	}
 
+
 out:
 	current->reclaim_state = NULL;
 
-	return ret;
+	return sc.nr_reclaimed;
 }
 #endif
 
@@ -2290,11 +2294,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.order = order,
 		.isolate_pages = isolate_pages_global,
 	};
 	unsigned long slab_reclaimable;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8cd81ea1ddc..9826766f127 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void)
 	int cpu;
 	int threshold;
 
-	for_each_zone(zone) {
-
-		if (!zone->present_pages)
-			continue;
-
+	for_each_populated_zone(zone) {
 		threshold = calculate_threshold(zone);
 
 		for_each_online_cpu(cpu)
@@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu)
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *p;
 
-		if (!populated_zone(zone))
-			continue;
-
 		p = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)