author     Steven Whitehouse <steve@men-an-tol.chygwyn.com>  2006-02-23 04:49:43 -0500
committer  Steven Whitehouse <swhiteho@redhat.com>           2006-02-23 04:49:43 -0500
commit     d35462b4bb847b68321c55e95c926aa485aecce2 (patch)
tree       b08e18bf6e672633402871ee763102fdb5e63229 /mm/vmscan.c
parent     91ffd7db71e7451f89941a8f428b4daa2a7c1e38 (diff)
parent     9e956c2dac9bec602ed1ba29181b45ba6d2b6448 (diff)
Merge branch 'master'
Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--  mm/vmscan.c  441
 1 file changed, 373 insertions, 68 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34b61a70c7..1838c15ca4fd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -477,7 +481,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page)) {
+			/*
+			 * No unmapping if we do not swap
+			 */
+			if (!sc->may_swap)
+				goto keep_locked;
+
+			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -492,7 +502,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 				goto keep_locked;
 			if (!may_enter_fs)
 				goto keep_locked;
-			if (laptop_mode && !sc->may_writepage)
+			if (!sc->may_writepage)
 				goto keep_locked;
 
 			/* Page is dirty, try to write it out here */
@@ -609,6 +619,15 @@ int putback_lru_pages(struct list_head *l)
 }
 
 /*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
  */
@@ -617,7 +636,7 @@ static int swap_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page) != SWAP_SUCCESS)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 			goto unlock_retry;
 
 	if (PageDirty(page)) {
@@ -653,6 +672,167 @@ unlock_retry:
 retry:
 	return -EAGAIN;
 }
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return 1;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+	 * processses from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	try_to_unmap(page, 1);
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return 1;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 1;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	if (migrate_page_remove_references(newpage, page, 2))
+		return -EAGAIN;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
 /*
  * migrate_pages
  *
@@ -663,14 +843,9 @@ retry:
  * pages are swapped out.
  *
  * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  *
- * SIMPLIFIED VERSION: This implementation of migrate_pages
- * is only swapping out pages and never touches the second
- * list. The direct migration patchset
- * extends this function to avoid the use of swap.
- *
  * Return: Number of pages not migrated when "to" ran empty.
  */
 int migrate_pages(struct list_head *from, struct list_head *to,
@@ -691,6 +866,9 @@ redo:
 	retry = 0;
 
 	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
 		cond_resched();
 
 		rc = 0;
@@ -698,6 +876,9 @@ redo:
 			/* page was freed from under us. So we are done. */
 			goto next;
 
+		if (to && list_empty(to))
+			break;
+
 		/*
 		 * Skip locked pages during the first two passes to give the
 		 * functions holding the lock time to release the page. Later we
@@ -734,12 +915,84 @@ redo:
 			}
 		}
 
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
 		/*
-		 * Page is properly locked and writeback is complete.
+		 * Pages are properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		rc = swap_page(page);
-		goto next;
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+		}
+
+		/*
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
+		 */
+		if (!page_has_buffers(page) ||
+				try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes with mapped pages simply
+		 * retry. There may be a lock held for some
+		 * buffers that may go away. Later
+		 * swap them out.
+		 */
+		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
 
 unlock_page:
 		unlock_page(page);
@@ -752,7 +1005,10 @@ next:
 			list_move(&page->lru, failed);
 			nr_failed++;
 		} else {
-			/* Success */
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
 			list_move(&page->lru, moved);
 		}
 	}
@@ -939,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
+
+	if (unlikely(sc->may_swap)) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+		 */
+		distress = 100 >> zone->prev_priority;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache. Work out
+		 * how much memory
+		 * is mapped.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how much we really want to unmap some pages. The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			reclaim_mapped = 1;
+	}
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
@@ -951,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages. 0 -> no problems. 100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache. Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages. The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1170,7 +1433,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int i;
 
 	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = 0;
+	sc.may_writepage = !laptop_mode;
 	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
@@ -1273,7 +1536,7 @@ loop_again:
 	total_scanned = 0;
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
-	sc.may_writepage = 0;
+	sc.may_writepage = !laptop_mode;
 	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
@@ -1358,9 +1621,7 @@ scan:
 			sc.nr_reclaimed = 0;
 			sc.priority = priority;
 			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-			atomic_inc(&zone->reclaim_in_progress);
 			shrink_zone(zone, &sc);
-			atomic_dec(&zone->reclaim_in_progress);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
@@ -1586,40 +1847,61 @@ module_init(kswapd_init)
  */
 int zone_reclaim_mode __read_mostly;
 
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
+#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
+#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
+
 /*
  * Mininum time between zone reclaim scans
  */
-#define ZONE_RECLAIM_INTERVAL HZ/2
+int zone_reclaim_interval __read_mostly = 30*HZ;
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	int nr_pages = 1 << order;
+	int nr_pages;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
-		.may_writepage = 0,
-		.may_swap = 0,
-		.nr_mapped = read_page_state(nr_mapped),
-		.nr_scanned = 0,
-		.nr_reclaimed = 0,
-		.priority = 0
-	};
+	struct scan_control sc;
+	cpumask_t mask;
+	int node_id;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+			return 0;
 
 	if (!(gfp_mask & __GFP_WAIT) ||
-		zone->zone_pgdat->node_id != numa_node_id() ||
 		zone->all_unreclaimable ||
 		atomic_read(&zone->reclaim_in_progress) > 0)
 			return 0;
 
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
-			return 0;
+	node_id = zone->zone_pgdat->node_id;
+	mask = node_to_cpumask(node_id);
+	if (!cpus_empty(mask) && node_id != numa_node_id())
+		return 0;
+
+	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
+	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	sc.priority = ZONE_RECLAIM_PRIORITY + 1;
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.gfp_mask = gfp_mask;
 
 	disable_swap_token();
 
+	nr_pages = 1 << order;
 	if (nr_pages > SWAP_CLUSTER_MAX)
 		sc.swap_cluster_max = nr_pages;
 	else
@@ -1629,14 +1911,37 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
-	shrink_zone(zone, &sc);
+
+	/*
+	 * Free memory by calling shrink zone with increasing priorities
+	 * until we have enough memory freed.
+	 */
+	do {
+		sc.priority--;
+		shrink_zone(zone, &sc);
+
+	} while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+
+	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+		/*
+		 * shrink_slab does not currently allow us to determine
+		 * how many pages were freed in the zone. So we just
+		 * shake the slab and then go offnode for a single allocation.
+		 *
+		 * shrink_slab will free memory on all zones and may take
+		 * a long time.
+		 */
+		shrink_slab(sc.nr_scanned, gfp_mask, order);
+		sc.nr_reclaimed = 1;	/* Avoid getting the off node timeout */
+	}
+
 	p->reclaim_state = NULL;
 	current->flags &= ~PF_MEMALLOC;
 
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	return sc.nr_reclaimed > nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 #endif
 
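The refill_inactive_zone() hunk above moves the swap-tendency heuristic under the new sc->may_swap check. As a rough standalone sketch of just that arithmetic (not part of the commit; the sample inputs below are purely illustrative), the decision reduces to:

#include <stdio.h>

/* Same formula as in the refill_inactive_zone() hunk; inputs are illustrative. */
static long swap_tendency(long nr_mapped, long total_memory,
			  int prev_priority, long vm_swappiness)
{
	/* `distress': 0 -> no reclaim trouble, 100 -> great trouble */
	long distress = 100 >> prev_priority;
	/* percentage of memory that is mapped into page tables */
	long mapped_ratio = (nr_mapped * 100) / total_memory;

	return mapped_ratio / 2 + distress + vm_swappiness;
}

int main(void)
{
	/* e.g. 40% of memory mapped, prev_priority 6, default vm_swappiness 60 */
	long t = swap_tendency(400, 1000, 6, 60);

	/* reclaim_mapped is set once the tendency reaches 100 */
	printf("swap_tendency = %ld -> reclaim_mapped = %d\n", t, t >= 100);
	return 0;
}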
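The zone_reclaim() hunks replace the fixed scan_control initializer with flags decoded from the zone_reclaim_mode sysctl using the new RECLAIM_* bits. A minimal userspace sketch of that decoding (the bit values are copied from the hunk above; the struct and mode value here are stand-ins, not kernel code):

#include <stdio.h>

#define RECLAIM_OFF   0
#define RECLAIM_ZONE  (1<<0)	/* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP  (1<<2)	/* Swap pages out during reclaim */
#define RECLAIM_SLAB  (1<<3)	/* Do a global slab shrink if the zone is out of memory */

/* Stand-in for the two scan_control fields zone_reclaim() now sets by hand */
struct mode_flags {
	int may_writepage;
	int may_swap;
};

int main(void)
{
	int zone_reclaim_mode = RECLAIM_ZONE | RECLAIM_WRITE;	/* sample sysctl value: 3 */
	struct mode_flags sc;

	/* !! collapses the masked bit to 0 or 1, exactly as in the hunk */
	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);

	printf("may_writepage=%d may_swap=%d\n", sc.may_writepage, sc.may_swap);
	return 0;
}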