Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	343
1 file changed, 312 insertions(+), 31 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34b61a70c7..5a610804cd06 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -477,7 +477,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page)) {
+			/*
+			 * No unmapping if we do not swap
+			 */
+			if (!sc->may_swap)
+				goto keep_locked;
+
+			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -492,7 +498,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 				goto keep_locked;
 			if (!may_enter_fs)
 				goto keep_locked;
-			if (laptop_mode && !sc->may_writepage)
+			if (!sc->may_writepage)
 				goto keep_locked;
 
 			/* Page is dirty, try to write it out here */
@@ -609,6 +615,15 @@ int putback_lru_pages(struct list_head *l)
 }
 
 /*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
  */
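
The new fail_migrate_page() stub gives an address space a way to refuse direct migration outright: every attempt is reported as a permanent failure (-EIO) and the page ends up on the caller's failed list. A minimal sketch of how a filesystem might wire it up follows; the example_aops structure and its elided methods are hypothetical, only fail_migrate_page() itself comes from this patch.

/*
 * Hypothetical sketch: an address space whose pages must never be moved
 * can point ->migratepage at the stub so that migrate_pages() reports
 * -EIO for them instead of attempting a direct migration.
 */
static struct address_space_operations example_aops = {
	/* ... the filesystem's usual readpage/writepage methods ... */
	.migratepage	= fail_migrate_page,
};
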
@@ -617,7 +632,7 @@ static int swap_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page) != SWAP_SUCCESS)
+		if (try_to_unmap(page, 0) != SWAP_SUCCESS)
 			goto unlock_retry;
 
 	if (PageDirty(page)) {
@@ -653,6 +668,167 @@ unlock_retry:
 retry:
 	return -EAGAIN;
 }
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return 1;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+	 * processes from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	try_to_unmap(page, 1);
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return 1;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 1;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	if (migrate_page_remove_references(newpage, page, 2))
+		return -EAGAIN;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
 /*
  * migrate_pages
  *
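
migrate_page_remove_references() and migrate_page_copy() are exported separately so a filesystem that needs extra work around the generic steps can build its own ->migratepage callback from them. The sketch below mirrors migrate_page() above; example_migratepage() and the fixup slot are hypothetical, everything else is taken from the helpers in this hunk.

/*
 * Hypothetical filesystem-specific migratepage built from the exported
 * helpers. The flow is the same as migrate_page() above; the slot for
 * fs-private fixups is the only addition.
 */
static int example_migratepage(struct page *newpage, struct page *page)
{
	BUG_ON(PageWriteback(page));	/* caller waits for writeback */

	if (migrate_page_remove_references(newpage, page, 2))
		return -EAGAIN;

	/* ... transfer fs-private metadata from page to newpage here ... */

	migrate_page_copy(newpage, page);
	remove_from_swap(newpage);
	return 0;
}
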
@@ -666,11 +842,6 @@ retry:
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  *
- * SIMPLIFIED VERSION: This implementation of migrate_pages
- * is only swapping out pages and never touches the second
- * list. The direct migration patchset
- * extends this function to avoid the use of swap.
- *
  * Return: Number of pages not migrated when "to" ran empty.
  */
 int migrate_pages(struct list_head *from, struct list_head *to,
@@ -691,6 +862,9 @@ redo:
 	retry = 0;
 
 	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
 		cond_resched();
 
 		rc = 0;
@@ -698,6 +872,9 @@ redo:
 			/* page was freed from under us. So we are done. */
 			goto next;
 
+		if (to && list_empty(to))
+			break;
+
 		/*
 		 * Skip locked pages during the first two passes to give the
 		 * functions holding the lock time to release the page. Later we
@@ -734,12 +911,69 @@ redo:
 			}
 		}
 
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
 		/*
-		 * Page is properly locked and writeback is complete.
+		 * Pages are properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		rc = swap_page(page);
-		goto next;
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * Trigger writeout if page is dirty
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+		}
+		/*
+		 * If we have no buffer or can release the buffer
+		 * then do a simple migration.
+		 */
+		if (!page_has_buffers(page) ||
+			try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes with mapped pages simply
+		 * retry. There may be a lock held for some
+		 * buffers that may go away. Later
+		 * swap them out.
+		 */
+		if (pass > 4) {
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
 
 unlock_page:
 		unlock_page(page);
@@ -752,7 +986,10 @@ next:
 			list_move(&page->lru, failed);
 			nr_failed++;
 		} else {
-			/* Success */
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
 			list_move(&page->lru, moved);
 		}
 	}
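
With this extension, migrate_pages() consumes one page from the "to" list per migration attempt and only falls back to swap_page() when no target list is given or buffers still cannot be released on a late pass. A usage sketch follows; it assumes the remaining parameters of migrate_pages(), which are not visible in this hunk, are the "moved" and "failed" result lists referenced above, and the allocation loop is illustrative only.

/*
 * Hypothetical caller: migrate a batch of pages already isolated from
 * the LRU (on "from") to freshly allocated pages on node "nid".
 * Assumes migrate_pages(from, to, moved, failed); disposal of the
 * result lists (e.g. via putback_lru_pages()) is left to the caller
 * and is not shown in this hunk.
 */
static int example_migrate_batch(struct list_head *from, int nr, int nid)
{
	LIST_HEAD(to);
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int i;

	for (i = 0; i < nr; i++) {
		struct page *newpage = alloc_pages_node(nid, GFP_HIGHUSER, 0);

		if (!newpage)
			break;
		list_add_tail(&newpage->lru, &to);
	}

	/* Pages that could not be migrated end up on "failed". */
	return migrate_pages(from, &to, &moved, &failed);
}
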
@@ -1170,7 +1407,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int i;
 
 	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = 0;
+	sc.may_writepage = !laptop_mode;
 	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
@@ -1273,7 +1510,7 @@ loop_again:
 	total_scanned = 0;
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
-	sc.may_writepage = 0;
+	sc.may_writepage = !laptop_mode;
 	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
@@ -1586,40 +1823,61 @@ module_init(kswapd_init)
  */
 int zone_reclaim_mode __read_mostly;
 
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
+#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
+#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
+
 /*
  * Minimum time between zone reclaim scans
  */
-#define ZONE_RECLAIM_INTERVAL HZ/2
+int zone_reclaim_interval __read_mostly = 30*HZ;
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	int nr_pages = 1 << order;
+	int nr_pages;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
-		.may_writepage = 0,
-		.may_swap = 0,
-		.nr_mapped = read_page_state(nr_mapped),
-		.nr_scanned = 0,
-		.nr_reclaimed = 0,
-		.priority = 0
-	};
+	struct scan_control sc;
+	cpumask_t mask;
+	int node_id;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+			return 0;
 
 	if (!(gfp_mask & __GFP_WAIT) ||
-		zone->zone_pgdat->node_id != numa_node_id() ||
 		zone->all_unreclaimable ||
 		atomic_read(&zone->reclaim_in_progress) > 0)
 			return 0;
 
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
-			return 0;
+	node_id = zone->zone_pgdat->node_id;
+	mask = node_to_cpumask(node_id);
+	if (!cpus_empty(mask) && node_id != numa_node_id())
+		return 0;
+
+	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
+	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	sc.priority = ZONE_RECLAIM_PRIORITY + 1;
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.gfp_mask = gfp_mask;
 
 	disable_swap_token();
 
+	nr_pages = 1 << order;
 	if (nr_pages > SWAP_CLUSTER_MAX)
 		sc.swap_cluster_max = nr_pages;
 	else
@@ -1629,14 +1887,37 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
-	shrink_zone(zone, &sc);
+
+	/*
+	 * Free memory by calling shrink zone with increasing priorities
+	 * until we have enough memory freed.
+	 */
+	do {
+		sc.priority--;
+		shrink_zone(zone, &sc);
+
+	} while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+
+	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+		/*
+		 * shrink_slab does not currently allow us to determine
+		 * how many pages were freed in the zone. So we just
+		 * shake the slab and then go offnode for a single allocation.
+		 *
+		 * shrink_slab will free memory on all zones and may take
+		 * a long time.
+		 */
+		shrink_slab(sc.nr_scanned, gfp_mask, order);
+		sc.nr_reclaimed = 1;	/* Avoid getting the off node timeout */
+	}
+
 	p->reclaim_state = NULL;
 	current->flags &= ~PF_MEMALLOC;
 
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	return sc.nr_reclaimed > nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 #endif
 
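
The RECLAIM_* bits map directly onto the scan_control knobs that zone_reclaim() sets up, so the mode value selects how aggressive local reclaim may be; for example, a zone_reclaim_mode of 7 (RECLAIM_ZONE | RECLAIM_WRITE | RECLAIM_SWAP) permits both writeout and swap before an allocation goes off node. A small sketch of that mapping follows; the /proc/sys/vm/zone_reclaim_mode path for setting the value is an assumption about the wider patchset, not something shown in this diff.

/*
 * Hypothetical helper showing how the mode bits drive the scan_control
 * used by zone_reclaim(). "mode" stands for the current value of
 * zone_reclaim_mode (commonly tuned via /proc/sys/vm/zone_reclaim_mode,
 * which is assumed here rather than shown in this diff).
 */
static void example_apply_reclaim_mode(struct scan_control *sc, int mode)
{
	sc->may_writepage = !!(mode & RECLAIM_WRITE);	/* allow pageout() of dirty pages */
	sc->may_swap = !!(mode & RECLAIM_SWAP);		/* allow unmapping and swap */
	/* RECLAIM_SLAB additionally triggers a global shrink_slab() pass */
}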