Merge branch 'upstream-fixes'

author: Jeff Garzik <jgarzik@pobox.com> 2006-02-17 16:20:30 -0500
committer: Jeff Garzik <jgarzik@pobox.com> 2006-02-17 16:20:30 -0500
commit: b04a92e1601eb6df3a3b6599e7fb7ee021eef2cb (patch)
tree: f190980d5bffae81a67c0cea0d913ed2444ceb0c /mm
parent: 70c07e02625ec46d0ffbfce1acef42d660803528 (diff)
parent: f5e2a7b22e7d7dfda8794906d0fddeaaa09bb944 (diff)
8 files changed, 129 insertions, 64 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f2951666..508707704d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
        BUG_ON(page_count(page));
        INIT_LIST_HEAD(&page->lru);
-        page[1].mapping = NULL;
+        page[1].lru.next = NULL;                        /* reset dtor */
        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
        }
        spin_unlock(&hugetlb_lock);
        set_page_count(page, 1);
-        page[1].mapping = (void *)free_huge_page;
+        page[1].lru.next = (void *)free_huge_page;      /* set dtor */
        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
                clear_user_highpage(&page[i], addr);
        return page;
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea29..af3d573b01 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
        struct mm_struct * mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
-        int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+        int new_flags = vma->vm_flags;
        switch (behavior) {
+        case MADV_NORMAL:
+                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+                break;
        case MADV_SEQUENTIAL:
-                new_flags |= VM_SEQ_READ;
+                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
-                new_flags |= VM_RAND_READ;
+                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
-        default:
+        case MADV_DONTFORK:
+                new_flags |= VM_DONTCOPY;
+                break;
+        case MADV_DOFORK:
+                new_flags &= ~VM_DONTCOPY;
                break;
        }
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
        long error;
        switch (behavior) {
+        case MADV_DOFORK:
+                if (vma->vm_flags & VM_IO) {
+                        error = -EINVAL;
+                        break;
+                }
+        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa..9abc600854 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
+int randomize_va_space __read_mostly = 1;
+static int __init disable_randmaps(char *s)
+{
+        randomize_va_space = 0;
+        return 0;
+}
+__setup("norandmaps", disable_randmaps);
 /*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b..323fdcf128 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
        struct zonelist *zl;
-        int num, max, nd;
+        int num, max, nd, k;
        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
-        for_each_node_mask(nd, *nodes)
+        /* First put in the highest zones from all nodes, then all the next 
-                zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+           lower zones etc. Avoid empty zones because the memory allocator
+           doesn't like them. If you implement node hot removal you
+           have to fix that. */
+        for (k = policy_zone; k >= 0; k--) { 
+                for_each_node_mask(nd, *nodes) { 
+                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
+                        if (z->present_pages > 0) 
+                                zl->zones[num++] = z;
+                }
+        }
        zl->zones[num] = NULL;
        return zl;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be..62c1225285 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
 int percpu_pagelist_fraction;
 static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
 /*
 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
 * All pages have PG_compound set.  All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
- * The first tail page's ->mapping, if non-zero, holds the address of the
+ * The first tail page's ->lru.next holds the address of the compound page's
- * compound page's put_page() function.
+ * put_page() function.  Its ->lru.prev holds the order of allocation.
- *
+ * This usage means that zero-order pages may not be compound.
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
 */
+static void free_compound_page(struct page *page)
+{
+        __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
 static void prep_compound_page(struct page *page, unsigned long order)
 {
        int i;
        int nr_pages = 1 << order;
-        page[1].mapping = NULL;
+        page[1].lru.next = (void *)free_compound_page;  /* set dtor */
-        page[1].index = order;
+        page[1].lru.prev = (void *)order;
        for (i = 0; i < nr_pages; i++) {
                struct page *p = page + i;
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
        int i;
        int nr_pages = 1 << order;
-        if (unlikely(page[1].index != order))
+        if (unlikely((unsigned long)page[1].lru.prev != order))
                bad_page(page);
        for (i = 0; i < nr_pages; i++) {
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d97..add05d808a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                BUG();
        }
+        /*
+         * Prevent CPUs from coming and going.
+         * lock_cpu_hotplug() nests outside cache_chain_mutex
+         */
+        lock_cpu_hotplug();
        mutex_lock(&cache_chain_mutex);
        list_for_each(p, &cache_chain) {
@@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        cachep->dtor = dtor;
        cachep->name = name;
-        /* Don't let CPUs to come and go */
-        lock_cpu_hotplug();
        if (g_cpucache_up == FULL) {
                enable_cpucache(cachep);
@@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
-        unlock_cpu_hotplug();
      oops:
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                      name);
        mutex_unlock(&cache_chain_mutex);
+        unlock_cpu_hotplug();
        return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/swap.c b/mm/swap.c
index 76247424de..cce3dda59c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
        if (put_page_testzero(page)) {
                void (*dtor)(struct page *page);
-                dtor = (void (*)(struct page *))page[1].mapping;
+                dtor = (void (*)(struct page *))page[1].lru.next;
                (*dtor)(page);
        }
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd..1838c15ca4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
                BUG_ON(PageActive(page));
                sc->nr_scanned++;
+                if (!sc->may_swap && page_mapped(page))
+                        goto keep_locked;
                /* Double the slab pressure for mapped and swapcache pages */
                if (page_mapped(page) || PageSwapCache(page))
                        sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
        struct address_space *mapping = page_mapping(page);
        if (page_mapped(page) && mapping)
-                if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+                if (try_to_unmap(page, 1) != SWAP_SUCCESS)
                        goto unlock_retry;
        if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
 * pages are swapped out.
 *
 * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
 * or no retryable pages exist anymore.
 *
 * Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
                        goto unlock_both;
                if (mapping->a_ops->migratepage) {
+                        /*
+                         * Most pages have a mapping and most filesystems
+                         * should provide a migration function. Anonymous
+                         * pages are part of swap space which also has its
+                         * own migration function. This is the most common
+                         * path for page migration.
+                         */
                        rc = mapping->a_ops->migratepage(newpage, page);
                        goto unlock_both;
                }
                /*
-                 * Trigger writeout if page is dirty
+                 * Default handling if a filesystem does not provide
+                 * a migration function. We can only migrate clean
+                 * pages so try to write out any dirty pages first.
                 */
                if (PageDirty(page)) {
                        switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
                                ; /* try to migrate the page below */
                        }
                }
                /*
-                 * If we have no buffer or can release the buffer
+                 * Buffers are managed in a filesystem specific way.
-                 * then do a simple migration.
+                 * We must have no buffers or drop them.
                 */
                if (!page_has_buffers(page) ||
                    try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
                 * swap them out.
                 */
                if (pass > 4) {
+                        /*
+                         * Persistently unable to drop buffers..... As a
+                         * measure of last resort we fall back to
+                         * swap_page().
+                         */
                        unlock_page(newpage);
                        newpage = NULL;
                        rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        struct page *page;
        struct pagevec pvec;
        int reclaim_mapped = 0;
-        long mapped_ratio;
-        long distress;
+        if (unlikely(sc->may_swap)) {
-        long swap_tendency;
+                long mapped_ratio;
+                long distress;
+                long swap_tendency;
+                /*
+                 * `distress' is a measure of how much trouble we're having
+                 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+                 */
+                distress = 100 >> zone->prev_priority;
+                /*
+                 * The point of this algorithm is to decide when to start
+                 * reclaiming mapped memory instead of just pagecache.  Work out
+                 * how much memory
+                 * is mapped.
+                 */
+                mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+                /*
+                 * Now decide how much we really want to unmap some pages.  The
+                 * mapped ratio is downgraded - just because there's a lot of
+                 * mapped memory doesn't necessarily mean that page reclaim
+                 * isn't succeeding.
+                 *
+                 * The distress ratio is important - we don't want to start
+                 * going oom.
+                 *
+                 * A 100% value of vm_swappiness overrides this algorithm
+                 * altogether.
+                 */
+                swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+                /*
+                 * Now use this metric to decide whether to start moving mapped
+                 * memory onto the inactive list.
+                 */
+                if (swap_tendency >= 100)
+                        reclaim_mapped = 1;
+        }
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        zone->nr_active -= pgmoved;
        spin_unlock_irq(&zone->lru_lock);
-        /*
-         * `distress' is a measure of how much trouble we're having reclaiming
-         * pages.  0 -> no problems.  100 -> great trouble.
-         */
-        distress = 100 >> zone->prev_priority;
-        /*
-         * The point of this algorithm is to decide when to start reclaiming
-         * mapped memory instead of just pagecache.  Work out how much memory
-         * is mapped.
-         */
-        mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-        /*
-         * Now decide how much we really want to unmap some pages.  The mapped
-         * ratio is downgraded - just because there's a lot of mapped memory
-         * doesn't necessarily mean that page reclaim isn't succeeding.
-         *
-         * The distress ratio is important - we don't want to start going oom.
-         *
-         * A 100% value of vm_swappiness overrides this algorithm altogether.
-         */
-        swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-        /*
-         * Now use this metric to decide whether to start moving mapped memory
-         * onto the inactive list.
-         */
-        if (swap_tendency >= 100)
-                reclaim_mapped = 1;
        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
                        sc.nr_reclaimed = 0;
                        sc.priority = priority;
                        sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-                        atomic_inc(&zone->reclaim_in_progress);
                        shrink_zone(zone, &sc);
-                        atomic_dec(&zone->reclaim_in_progress);
                        reclaim_state->reclaimed_slab = 0;
                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
                                                lru_pages);
author	Jeff Garzik <jgarzik@pobox.com>	2006-02-17 16:20:30 -0500
committer	Jeff Garzik <jgarzik@pobox.com>	2006-02-17 16:20:30 -0500
commit	b04a92e1601eb6df3a3b6599e7fb7ee021eef2cb (patch)
tree	f190980d5bffae81a67c0cea0d913ed2444ceb0c /mm
parent	70c07e02625ec46d0ffbfce1acef42d660803528 (diff)
parent	f5e2a7b22e7d7dfda8794906d0fddeaaa09bb944 (diff)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67f2951666..508707704d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
85	BUG_ON(page_count(page));	85	BUG_ON(page_count(page));
86		86
87	INIT_LIST_HEAD(&page->lru);	87	INIT_LIST_HEAD(&page->lru);
88	page[1].mapping = NULL;	88	page[1].lru.next = NULL; /* reset dtor */
89		89
90	spin_lock(&hugetlb_lock);	90	spin_lock(&hugetlb_lock);
91	enqueue_huge_page(page);	91	enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
105	}	105	}
106	spin_unlock(&hugetlb_lock);	106	spin_unlock(&hugetlb_lock);
107	set_page_count(page, 1);	107	set_page_count(page, 1);
108	page[1].mapping = (void *)free_huge_page;	108	page[1].lru.next = (void *)free_huge_page; /* set dtor */
109	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)	109	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
110	clear_user_highpage(&page[i], addr);	110	clear_user_highpage(&page[i], addr);
111	return page;	111	return page;


diff --git a/mm/madvise.c b/mm/madvise.c index ae0ae3ea29..af3d573b01 100644 --- a/mm/madvise.c +++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
22	struct mm_struct * mm = vma->vm_mm;	22	struct mm_struct * mm = vma->vm_mm;
23	int error = 0;	23	int error = 0;
24	pgoff_t pgoff;	24	pgoff_t pgoff;
25	int new_flags = vma->vm_flags & ~VM_READHINTMASK;	25	int new_flags = vma->vm_flags;
26		26
27	switch (behavior) {	27	switch (behavior) {
		28	case MADV_NORMAL:
		29	new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		30	break;
28	case MADV_SEQUENTIAL:	31	case MADV_SEQUENTIAL:
29	new_flags |= VM_SEQ_READ;	32	new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
30	break;	33	break;
31	case MADV_RANDOM:	34	case MADV_RANDOM:
32	new_flags |= VM_RAND_READ;	35	new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
33	break;	36	break;
34	default:	37	case MADV_DONTFORK:
		38	new_flags |= VM_DONTCOPY;
		39	break;
		40	case MADV_DOFORK:
		41	new_flags &= ~VM_DONTCOPY;
35	break;	42	break;
36	}	43	}
37		44
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
177	long error;	184	long error;
178		185
179	switch (behavior) {	186	switch (behavior) {
		187	case MADV_DOFORK:
		188	if (vma->vm_flags & VM_IO) {
		189	error = -EINVAL;
		190	break;
		191	}
		192	case MADV_DONTFORK:
180	case MADV_NORMAL:	193	case MADV_NORMAL:
181	case MADV_SEQUENTIAL:	194	case MADV_SEQUENTIAL:
182	case MADV_RANDOM:	195	case MADV_RANDOM:


diff --git a/mm/memory.c b/mm/memory.c index 2bee1f21aa..9abc600854 100644 --- a/mm/memory.c +++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
82	EXPORT_SYMBOL(high_memory);	82	EXPORT_SYMBOL(high_memory);
83	EXPORT_SYMBOL(vmalloc_earlyreserve);	83	EXPORT_SYMBOL(vmalloc_earlyreserve);
84		84
		85	int randomize_va_space __read_mostly = 1;
		86
		87	static int __init disable_randmaps(char *s)
		88	{
		89	randomize_va_space = 0;
		90	return 0;
		91	}
		92	__setup("norandmaps", disable_randmaps);
		93
		94
85	/*	95	/*
86	* If a p?d_bad entry is found while walking page tables, report	96	* If a p?d_bad entry is found while walking page tables, report
87	* the error, before resetting entry to p?d_none. Usually (but	97	* the error, before resetting entry to p?d_none. Usually (but


diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3bd7fb7e4b..323fdcf128 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
132	}	132	}
133	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;	133	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
134	}	134	}
		135
135	/* Generate a custom zonelist for the BIND policy. */	136	/* Generate a custom zonelist for the BIND policy. */
136	static struct zonelist *bind_zonelist(nodemask_t *nodes)	137	static struct zonelist *bind_zonelist(nodemask_t *nodes)
137	{	138	{
138	struct zonelist *zl;	139	struct zonelist *zl;
139	int num, max, nd;	140	int num, max, nd, k;
140		141
141	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);	142	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
142	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);	143	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
143	if (!zl)	144	if (!zl)
144	return NULL;	145	return NULL;
145	num = 0;	146	num = 0;
146	for_each_node_mask(nd, *nodes)	147	/* First put in the highest zones from all nodes, then all the next
147	zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];	148	lower zones etc. Avoid empty zones because the memory allocator
		149	doesn't like them. If you implement node hot removal you
		150	have to fix that. */
		151	for (k = policy_zone; k >= 0; k--) {
		152	for_each_node_mask(nd, *nodes) {
		153	struct zone *z = &NODE_DATA(nd)->node_zones[k];
		154	if (z->present_pages > 0)
		155	zl->zones[num++] = z;
		156	}
		157	}
148	zl->zones[num] = NULL;	158	zl->zones[num] = NULL;
149	return zl;	159	return zl;
150	}	160	}


diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dde04ff4be..62c1225285 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
56	int percpu_pagelist_fraction;	56	int percpu_pagelist_fraction;
57		57
58	static void fastcall free_hot_cold_page(struct page *page, int cold);	58	static void fastcall free_hot_cold_page(struct page *page, int cold);
		59	static void __free_pages_ok(struct page *page, unsigned int order);
59		60
60	/*	61	/*
61	* results with 256, 32 in the lowmem_reserve sysctl:	62	* results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
169	* All pages have PG_compound set. All pages have their ->private pointing at	170	* All pages have PG_compound set. All pages have their ->private pointing at
170	* the head page (even the head page has this).	171	* the head page (even the head page has this).
171	*	172	*
172	* The first tail page's ->mapping, if non-zero, holds the address of the	173	* The first tail page's ->lru.next holds the address of the compound page's
173	* compound page's put_page() function.	174	* put_page() function. Its ->lru.prev holds the order of allocation.
174	*	175	* This usage means that zero-order pages may not be compound.
175	* The order of the allocation is stored in the first tail page's ->index
176	* This is only for debug at present. This usage means that zero-order pages
177	* may not be compound.
178	*/	176	*/
		177
		178	static void free_compound_page(struct page *page)
		179	{
		180	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
		181	}
		182
179	static void prep_compound_page(struct page *page, unsigned long order)	183	static void prep_compound_page(struct page *page, unsigned long order)
180	{	184	{
181	int i;	185	int i;
182	int nr_pages = 1 << order;	186	int nr_pages = 1 << order;
183		187
184	page[1].mapping = NULL;	188	page[1].lru.next = (void *)free_compound_page; /* set dtor */
185	page[1].index = order;	189	page[1].lru.prev = (void *)order;
186	for (i = 0; i < nr_pages; i++) {	190	for (i = 0; i < nr_pages; i++) {
187	struct page *p = page + i;	191	struct page *p = page + i;
188		192
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
196	int i;	200	int i;
197	int nr_pages = 1 << order;	201	int nr_pages = 1 << order;
198		202
199	if (unlikely(page[1].index != order))	203	if (unlikely((unsigned long)page[1].lru.prev != order))
200	bad_page(page);	204	bad_page(page);
201		205
202	for (i = 0; i < nr_pages; i++) {	206	for (i = 0; i < nr_pages; i++) {


diff --git a/mm/slab.c b/mm/slab.c index d66c2b0d97..add05d808a 100644 --- a/mm/slab.c +++ b/mm/slab.c
@@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1717	BUG();	1717	BUG();
1718	}	1718	}
1719		1719
		1720	/*
		1721	* Prevent CPUs from coming and going.
		1722	* lock_cpu_hotplug() nests outside cache_chain_mutex
		1723	*/
		1724	lock_cpu_hotplug();
		1725
1720	mutex_lock(&cache_chain_mutex);	1726	mutex_lock(&cache_chain_mutex);
1721		1727
1722	list_for_each(p, &cache_chain) {	1728	list_for_each(p, &cache_chain) {
@@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1918	cachep->dtor = dtor;	1924	cachep->dtor = dtor;
1919	cachep->name = name;	1925	cachep->name = name;
1920		1926
1921	/* Don't let CPUs to come and go */
1922	lock_cpu_hotplug();
1923		1927
1924	if (g_cpucache_up == FULL) {	1928	if (g_cpucache_up == FULL) {
1925	enable_cpucache(cachep);	1929	enable_cpucache(cachep);
@@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1978		1982
1979	/* cache setup completed, link it into the list */	1983	/* cache setup completed, link it into the list */
1980	list_add(&cachep->next, &cache_chain);	1984	list_add(&cachep->next, &cache_chain);
1981	unlock_cpu_hotplug();
1982	oops:	1985	oops:
1983	if (!cachep && (flags & SLAB_PANIC))	1986	if (!cachep && (flags & SLAB_PANIC))
1984	panic("kmem_cache_create(): failed to create slab `%s'\n",	1987	panic("kmem_cache_create(): failed to create slab `%s'\n",
1985	name);	1988	name);
1986	mutex_unlock(&cache_chain_mutex);	1989	mutex_unlock(&cache_chain_mutex);
		1990	unlock_cpu_hotplug();
1987	return cachep;	1991	return cachep;
1988	}	1992	}
1989	EXPORT_SYMBOL(kmem_cache_create);	1993	EXPORT_SYMBOL(kmem_cache_create);


diff --git a/mm/swap.c b/mm/swap.c index 76247424de..cce3dda59c 100644 --- a/mm/swap.c +++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
40	if (put_page_testzero(page)) {	40	if (put_page_testzero(page)) {
41	void (*dtor)(struct page *page);	41	void (*dtor)(struct page *page);
42		42
43	dtor = (void (*)(struct page *))page[1].mapping;	43	dtor = (void (*)(struct page *))page[1].lru.next;
44	(*dtor)(page);	44	(*dtor)(page);
45	}	45	}
46	}	46	}


diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a610804cd..1838c15ca4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
443	BUG_ON(PageActive(page));	443	BUG_ON(PageActive(page));
444		444
445	sc->nr_scanned++;	445	sc->nr_scanned++;
		446
		447	if (!sc->may_swap && page_mapped(page))
		448	goto keep_locked;
		449
446	/* Double the slab pressure for mapped and swapcache pages */	450	/* Double the slab pressure for mapped and swapcache pages */
447	if (page_mapped(page) || PageSwapCache(page))	451	if (page_mapped(page) || PageSwapCache(page))
448	sc->nr_scanned++;	452	sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
632	struct address_space *mapping = page_mapping(page);	636	struct address_space *mapping = page_mapping(page);
633		637
634	if (page_mapped(page) && mapping)	638	if (page_mapped(page) && mapping)
635	if (try_to_unmap(page, 0) != SWAP_SUCCESS)	639	if (try_to_unmap(page, 1) != SWAP_SUCCESS)
636	goto unlock_retry;	640	goto unlock_retry;
637		641
638	if (PageDirty(page)) {	642	if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
839	* pages are swapped out.	843	* pages are swapped out.
840	*	844	*
841	* The function returns after 10 attempts or if no pages	845	* The function returns after 10 attempts or if no pages
842	* are movable anymore because t has become empty	846	* are movable anymore because to has become empty
843	* or no retryable pages exist anymore.	847	* or no retryable pages exist anymore.
844	*	848	*
845	* Return: Number of pages not migrated when "to" ran empty.	849	* Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
928	goto unlock_both;	932	goto unlock_both;
929		933
930	if (mapping->a_ops->migratepage) {	934	if (mapping->a_ops->migratepage) {
		935	/*
		936	* Most pages have a mapping and most filesystems
		937	* should provide a migration function. Anonymous
		938	* pages are part of swap space which also has its
		939	* own migration function. This is the most common
		940	* path for page migration.
		941	*/
931	rc = mapping->a_ops->migratepage(newpage, page);	942	rc = mapping->a_ops->migratepage(newpage, page);
932	goto unlock_both;	943	goto unlock_both;
933	}	944	}
934		945
935	/*	946	/*
936	* Trigger writeout if page is dirty	947	* Default handling if a filesystem does not provide
		948	* a migration function. We can only migrate clean
		949	* pages so try to write out any dirty pages first.
937	*/	950	*/
938	if (PageDirty(page)) {	951	if (PageDirty(page)) {
939	switch (pageout(page, mapping)) {	952	switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
949	; /* try to migrate the page below */	962	; /* try to migrate the page below */
950	}	963	}
951	}	964	}
		965
952	/*	966	/*
953	* If we have no buffer or can release the buffer	967	* Buffers are managed in a filesystem specific way.
954	* then do a simple migration.	968	* We must have no buffers or drop them.
955	*/	969	*/
956	if (!page_has_buffers(page) ||	970	if (!page_has_buffers(page) ||
957	try_to_release_page(page, GFP_KERNEL)) {	971	try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
966	* swap them out.	980	* swap them out.
967	*/	981	*/
968	if (pass > 4) {	982	if (pass > 4) {
		983	/*
		984	* Persistently unable to drop buffers..... As a
		985	* measure of last resort we fall back to
		986	* swap_page().
		987	*/
969	unlock_page(newpage);	988	unlock_page(newpage);
970	newpage = NULL;	989	newpage = NULL;
971	rc = swap_page(page);	990	rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1176	struct page *page;	1195	struct page *page;
1177	struct pagevec pvec;	1196	struct pagevec pvec;
1178	int reclaim_mapped = 0;	1197	int reclaim_mapped = 0;
1179	long mapped_ratio;	1198
1180	long distress;	1199	if (unlikely(sc->may_swap)) {
1181	long swap_tendency;	1200	long mapped_ratio;
		1201	long distress;
		1202	long swap_tendency;
		1203
		1204	/*
		1205	* `distress' is a measure of how much trouble we're having
		1206	* reclaiming pages. 0 -> no problems. 100 -> great trouble.
		1207	*/
		1208	distress = 100 >> zone->prev_priority;
		1209
		1210	/*
		1211	* The point of this algorithm is to decide when to start
		1212	* reclaiming mapped memory instead of just pagecache. Work out
		1213	* how much memory
		1214	* is mapped.
		1215	*/
		1216	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
		1217
		1218	/*
		1219	* Now decide how much we really want to unmap some pages. The
		1220	* mapped ratio is downgraded - just because there's a lot of
		1221	* mapped memory doesn't necessarily mean that page reclaim
		1222	* isn't succeeding.
		1223	*
		1224	* The distress ratio is important - we don't want to start
		1225	* going oom.
		1226	*
		1227	* A 100% value of vm_swappiness overrides this algorithm
		1228	* altogether.
		1229	*/
		1230	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
		1231
		1232	/*
		1233	* Now use this metric to decide whether to start moving mapped
		1234	* memory onto the inactive list.
		1235	*/
		1236	if (swap_tendency >= 100)
		1237	reclaim_mapped = 1;
		1238	}
1182		1239
1183	lru_add_drain();	1240	lru_add_drain();
1184	spin_lock_irq(&zone->lru_lock);	1241	spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1188	zone->nr_active -= pgmoved;	1245	zone->nr_active -= pgmoved;
1189	spin_unlock_irq(&zone->lru_lock);	1246	spin_unlock_irq(&zone->lru_lock);
1190		1247
1191	/*
1192	* `distress' is a measure of how much trouble we're having reclaiming
1193	* pages. 0 -> no problems. 100 -> great trouble.
1194	*/
1195	distress = 100 >> zone->prev_priority;
1196
1197	/*
1198	* The point of this algorithm is to decide when to start reclaiming
1199	* mapped memory instead of just pagecache. Work out how much memory
1200	* is mapped.
1201	*/
1202	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
1203
1204	/*
1205	* Now decide how much we really want to unmap some pages. The mapped
1206	* ratio is downgraded - just because there's a lot of mapped memory
1207	* doesn't necessarily mean that page reclaim isn't succeeding.
1208	*
1209	* The distress ratio is important - we don't want to start going oom.
1210	*
1211	* A 100% value of vm_swappiness overrides this algorithm altogether.
1212	*/
1213	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
1214
1215	/*
1216	* Now use this metric to decide whether to start moving mapped memory
1217	* onto the inactive list.
1218	*/
1219	if (swap_tendency >= 100)
1220	reclaim_mapped = 1;
1221
1222	while (!list_empty(&l_hold)) {	1248	while (!list_empty(&l_hold)) {
1223	cond_resched();	1249	cond_resched();
1224	page = lru_to_page(&l_hold);	1250	page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
1595	sc.nr_reclaimed = 0;	1621	sc.nr_reclaimed = 0;
1596	sc.priority = priority;	1622	sc.priority = priority;
1597	sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;	1623	sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1598	atomic_inc(&zone->reclaim_in_progress);
1599	shrink_zone(zone, &sc);	1624	shrink_zone(zone, &sc);
1600	atomic_dec(&zone->reclaim_in_progress);
1601	reclaim_state->reclaimed_slab = 0;	1625	reclaim_state->reclaimed_slab = 0;
1602	nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,	1626	nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1603	lru_pages);	1627	lru_pages);