Diffstat (limited to 'mm')
-rw-r--r--  mm/memory.c          5
-rw-r--r--  mm/memory_hotplug.c  1
-rw-r--r--  mm/mempolicy.c       30
-rw-r--r--  mm/page_alloc.c      17
-rw-r--r--  mm/rmap.c            3
-rw-r--r--  mm/slab.c            96
-rw-r--r--  mm/swap.c            28
-rw-r--r--  mm/vmscan.c          21
8 files changed, 154 insertions, 47 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 9abc6008544b..85e80a57db29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -623,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         (*zap_work)--;
                         continue;
                 }
+
+                (*zap_work) -= PAGE_SIZE;
+
                 if (pte_present(ptent)) {
                         struct page *page;
 
-                        (*zap_work) -= PAGE_SIZE;
-
                         page = vm_normal_page(vma, addr, ptent);
                         if (unlikely(details) && page) {
                                 /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
                 onlined_pages++;
         }
         zone->present_pages += onlined_pages;
+        zone->zone_pgdat->node_present_pages += onlined_pages;
 
         setup_per_zone_pages_min();
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d80fa7d8f720..b21869a39f0b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -330,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         int err;
         struct vm_area_struct *first, *vma, *prev;
 
-        /* Clear the LRU lists so pages can be isolated */
-        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+                /* Must have swap device for migration */
+                if (nr_swap_pages <= 0)
+                        return ERR_PTR(-ENODEV);
+
+                /*
+                 * Clear the LRU lists so pages can be isolated.
+                 * Note that pages may be moved off the LRU after we have
+                 * drained them. Those pages will fail to migrate like other
+                 * pages that may be busy.
+                 */
                 lru_add_drain_all();
+        }
 
         first = find_vma(mm, start);
         if (!first)
@@ -748,7 +758,7 @@ long do_mbind(unsigned long start, unsigned long len,
                         MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
             || mode > MPOL_MAX)
                 return -EINVAL;
-        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                 return -EPERM;
 
         if (start & ~PAGE_MASK)
@@ -942,20 +952,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
          */
         if ((current->euid != task->suid) && (current->euid != task->uid) &&
             (current->uid != task->suid) && (current->uid != task->uid) &&
-            !capable(CAP_SYS_ADMIN)) {
+            !capable(CAP_SYS_NICE)) {
                 err = -EPERM;
                 goto out;
         }
 
         task_nodes = cpuset_mems_allowed(task);
         /* Is the user allowed to access the target nodes? */
-        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
                 err = -EPERM;
                 goto out;
         }
 
         err = do_migrate_pages(mm, &old, &new,
-                capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
         mmput(mm);
         return err;
@@ -1789,6 +1799,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
         cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static void check_huge_range(struct vm_area_struct *vma,
                 unsigned long start, unsigned long end,
                 struct numa_maps *md)
@@ -1814,6 +1825,13 @@ static void check_huge_range(struct vm_area_struct *vma,
                 gather_stats(page, md, pte_dirty(*ptep));
         }
 }
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+                unsigned long start, unsigned long end,
+                struct numa_maps *md)
+{
+}
+#endif
 
 int show_numa_map(struct seq_file *m, void *v)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d7d3fa..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-        struct zone *zone;
-        int i;
+        int i, z;
         unsigned long flags;
 
         local_irq_save(flags);
-        for_each_zone(zone) {
+        for (z = 0; z < MAX_NR_ZONES; z++) {
+                struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
                 struct per_cpu_pageset *pset;
 
-                /* Do not drain local pagesets */
-                if (zone->zone_pgdat->node_id == numa_node_id())
-                        continue;
-
                 pset = zone_pcp(zone, smp_processor_id());
                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                         struct per_cpu_pages *pcp;
diff --git a/mm/rmap.c b/mm/rmap.c
index d8ce5ff61454..67f0e20b101f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -537,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-        BUG_ON(PageAnon(page));
-        BUG_ON(!pfn_valid(page_to_pfn(page)));
-
         if (atomic_inc_and_test(&page->_mapcount))
                 __inc_page_state(nr_mapped);
 }
diff --git a/mm/slab.c b/mm/slab.c
index f2e92dc1c9ce..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
         dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+        int node;
+
+        node = next_node(cpu_to_node(cpu), node_online_map);
+        if (node == MAX_NUMNODES)
+                node = 0;
+
+        __get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+        int node = __get_cpu_var(reap_node);
+
+        /*
+         * Also drain per cpu pages on remote zones
+         */
+        if (node != numa_node_id())
+                drain_node_pages(node);
+
+        node = next_node(node, node_online_map);
+        if (unlikely(node >= MAX_NUMNODES))
+                node = first_node(node_online_map);
+        __get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
  * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
          * at that time.
          */
         if (keventd_up() && reap_work->func == NULL) {
+                init_reap_node(cpu);
                 INIT_WORK(reap_work, cache_reap, NULL);
                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
         }
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
         }
 }
 
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+        int node = __get_cpu_var(reap_node);
+
+        if (l3->alien) {
+                struct array_cache *ac = l3->alien[node];
+                if (ac && ac->avail) {
+                        spin_lock_irq(&ac->lock);
+                        __drain_alien_cache(cachep, ac, node);
+                        spin_unlock_irq(&ac->lock);
+                }
+        }
+}
+
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
         int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 #else
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void)
         struct cache_sizes *sizes;
         struct cache_names *names;
         int i;
+        int order;
 
         for (i = 0; i < NUM_INIT_LISTS; i++) {
                 kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void)
 
         cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-        cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-                        &left_over, &cache_cache.num);
+        for (order = 0; order < MAX_ORDER; order++) {
+                cache_estimate(order, cache_cache.buffer_size,
+                        cache_line_size(), 0, &left_over, &cache_cache.num);
+                if (cache_cache.num)
+                        break;
+        }
         if (!cache_cache.num)
                 BUG();
-
+        cache_cache.gfporder = order;
         cache_cache.colour = left_over / cache_cache.colour_off;
         cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
                                       sizeof(struct slab), cache_line_size());
@@ -1648,6 +1713,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
                 left_over = remainder;
 
                 /*
+                 * A VFS-reclaimable slab tends to have most allocations
+                 * as GFP_NOFS and we really don't want to have to be allocating
+                 * higher-order pages when we are unable to shrink dcache.
+                 */
+                if (flags & SLAB_RECLAIM_ACCOUNT)
+                        break;
+
+                /*
                  * Large number of objects is good, but very large slabs are
                  * currently bad for the gfp()s.
                  */
@@ -1869,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
         size = ALIGN(size, align);
 
-        if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-                /*
-                 * A VFS-reclaimable slab tends to have most allocations
-                 * as GFP_NOFS and we really don't want to have to be allocating
-                 * higher-order pages when we are unable to shrink dcache.
-                 */
-                cachep->gfporder = 0;
-                cache_estimate(cachep->gfporder, size, align, flags,
-                               &left_over, &cachep->num);
-        } else
-                left_over = calculate_slab_order(cachep, size, align, flags);
+        left_over = calculate_slab_order(cachep, size, align, flags);
 
         if (!cachep->num) {
                 printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -3494,8 +3557,7 @@ static void cache_reap(void *unused)
                 check_irq_on();
 
                 l3 = searchp->nodelists[numa_node_id()];
-                if (l3->alien)
-                        drain_alien_cache(searchp, l3->alien);
+                reap_alien(searchp, l3);
                 spin_lock_irq(&l3->list_lock);
 
                 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3545,7 +3607,7 @@ static void cache_reap(void *unused)
         }
         check_irq_on();
         mutex_unlock(&cache_chain_mutex);
-        drain_remote_pages();
+        next_reap_node();
         /* Setup the next iteration */
         schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
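
Note on the slab changes above: init_reap_node()/next_reap_node() keep a per-CPU cursor into node_online_map and advance it by one node per cache_reap() tick, draining that node's alien caches and remote pcps instead of scanning every zone each time. The following stand-alone user-space sketch (not from the patch; the 8-node limit and the online bitmask are made-up stand-ins for MAX_NUMNODES and node_online_map) shows just the round-robin walk:

/* Illustrative sketch of the round-robin node walk, assumptions noted above. */
#include <stdio.h>

#define MAX_NODES 8

static unsigned online_mask = 0x2d;     /* assumed online nodes: 0, 2, 3, 5 */

static int next_online(int node)
{
        int i;

        /* like next_node(): first online node strictly after 'node' */
        for (i = node + 1; i < MAX_NODES; i++)
                if (online_mask & (1u << i))
                        return i;
        return MAX_NODES;               /* none left: caller wraps around */
}

static int first_online(void)
{
        return next_online(-1);
}

int main(void)
{
        int node = first_online();
        int step;

        /* each "cache_reap tick" advances one node, wrapping at the end */
        for (step = 0; step < 10; step++) {
                printf("reap node %d\n", node);
                node = next_online(node);
                if (node >= MAX_NODES)
                        node = first_online();
        }
        return 0;
}

Amortizing the drain this way is the design point: every tick still does a bounded amount of remote work, but no single cache_reap() pass has to touch all nodes.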
diff --git a/mm/swap.c b/mm/swap.c
index cce3dda59c59..b524ea90bddb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -393,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec)
                 struct page *page = pvec->pages[i];
 
                 if (PagePrivate(page) && !TestSetPageLocked(page)) {
-                        try_to_release_page(page, 0);
+                        if (PagePrivate(page))
+                                try_to_release_page(page, 0);
                         unlock_page(page);
                 }
         }
@@ -489,13 +490,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
         if (count >= FBC_BATCH || count <= -FBC_BATCH) {
                 spin_lock(&fbc->lock);
                 fbc->count += count;
+                *pcount = 0;
                 spin_unlock(&fbc->lock);
-                count = 0;
+        } else {
+                *pcount = count;
         }
-        *pcount = count;
         put_cpu();
 }
 EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result.  This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+        long ret;
+        int cpu;
+
+        spin_lock(&fbc->lock);
+        ret = fbc->count;
+        for_each_cpu(cpu) {
+                long *pcount = per_cpu_ptr(fbc->counters, cpu);
+                ret += *pcount;
+        }
+        spin_unlock(&fbc->lock);
+        return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
 #endif
 
 /*
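
Note on the percpu counter changes above: percpu_counter_mod() now zeroes the per-cpu delta inside the lock when it folds it into the shared count, and percpu_counter_sum() adds every per-cpu delta to the shared count for an exact reading. A minimal single-file sketch of that batching scheme, assuming no real per-cpu machinery and omitting locking (fixed slot count stands in for the per-cpu counters):

/* Minimal sketch of a batched counter; assumptions noted above. */
#include <stdio.h>

#define NR_SLOTS 4      /* stands in for the per-cpu counters */
#define BATCH 32        /* stands in for FBC_BATCH */

struct pc {
        long count;             /* shared total, like fbc->count */
        long delta[NR_SLOTS];   /* per-slot deltas, like fbc->counters */
};

static void pc_mod(struct pc *c, int slot, long amount)
{
        long d = c->delta[slot] + amount;

        if (d >= BATCH || d <= -BATCH) {
                c->count += d;          /* fold into the shared count ... */
                c->delta[slot] = 0;     /* ... and zero the slot, as the fix does */
        } else {
                c->delta[slot] = d;
        }
}

static long pc_sum(struct pc *c)
{
        long ret = c->count;
        int i;

        for (i = 0; i < NR_SLOTS; i++)  /* exact: add every outstanding delta */
                ret += c->delta[i];
        return ret < 0 ? 0 : ret;
}

int main(void)
{
        struct pc c = { 0 };
        int i;

        for (i = 0; i < 100; i++)
                pc_mod(&c, i % NR_SLOTS, 1);
        printf("approx %ld, exact %ld\n", c.count, pc_sum(&c));
        return 0;
}

The fast read stays cheap (it never crosses CPUs) while the slow sum trades a lock and a full scan for accuracy, which is why both interfaces exist.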
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0af7593d01e..4fe7e3aa02e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -700,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
          * the page.
          */
         if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-                return 1;
+                return -EAGAIN;
 
         /*
          * Establish swap ptes for anonymous pages or destroy pte
@@ -721,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
          * If the page was not migrated then the PageSwapCache bit
          * is still set and the operation may continue.
          */
-        try_to_unmap(page, 1);
+        if (try_to_unmap(page, 1) == SWAP_FAIL)
+                /* A vma has VM_LOCKED set -> Permanent failure */
+                return -EPERM;
 
         /*
          * Give up if we were unable to remove all mappings.
          */
         if (page_mapcount(page))
-                return 1;
+                return -EAGAIN;
 
         write_lock_irq(&mapping->tree_lock);
 
@@ -738,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
         if (!page_mapping(page) || page_count(page) != nr_refs ||
                         *radix_pointer != page) {
                 write_unlock_irq(&mapping->tree_lock);
-                return 1;
+                return -EAGAIN;
         }
 
         /*
@@ -813,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
  */
 int migrate_page(struct page *newpage, struct page *page)
 {
+        int rc;
+
         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 
-        if (migrate_page_remove_references(newpage, page, 2))
-                return -EAGAIN;
+        rc = migrate_page_remove_references(newpage, page, 2);
+
+        if (rc)
+                return rc;
 
         migrate_page_copy(newpage, page);
 
@@ -1883,7 +1889,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
         if (!(gfp_mask & __GFP_WAIT) ||
                 zone->all_unreclaimable ||
-                atomic_read(&zone->reclaim_in_progress) > 0)
+                atomic_read(&zone->reclaim_in_progress) > 0 ||
+                (p->flags & PF_MEMALLOC))
                         return 0;
 
         node_id = zone->zone_pgdat->node_id;
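
Note on the migration changes above: the bare "return 1" is replaced by distinct errnos so callers can tell a transient condition (-EAGAIN, page busy or references changed under us) from a permanent one (-EPERM, a VM_LOCKED vma). A hypothetical caller sketch (not part of the patch; fake_migrate() is a made-up stand-in for migrate_page()) showing why that distinction matters:

/* Hypothetical retry loop; assumptions noted above. */
#include <errno.h>
#include <stdio.h>

/* stand-in for migrate_page(): busy on the first two attempts, then succeeds */
static int fake_migrate(int attempt)
{
        return attempt < 2 ? -EAGAIN : 0;
}

int main(void)
{
        int pass, rc = -EAGAIN;

        /* -EAGAIN is worth retrying on a later pass; -EPERM never is */
        for (pass = 0; pass < 10 && rc == -EAGAIN; pass++)
                rc = fake_migrate(pass);

        if (rc == -EPERM)
                printf("permanent failure, leave the page alone\n");
        else if (rc)
                printf("still busy after retries (%d)\n", rc);
        else
                printf("migrated after %d passes\n", pass);
        return 0;
}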