[PATCH] Swap Migration V5: LRU operations

This is the start of the `swap migration' patch series. Swap migration allows the moving of the physical location of pages between nodes in a NUMA system while the process is running. This means that the virtual addresses that the process sees do not change. However, the system rearranges the physical location of those pages. The main intent of page migration patches here is to reduce the latency of memory access by moving pages near to the processor where the process accessing that memory is running. The patchset allows a process to manually relocate the node on which its pages are located through the MF_MOVE and MF_MOVE_ALL options while setting a new memory policy. The pages of a process can also be relocated from another process using the sys_migrate_pages() function call. This requires CAP_SYS_ADMIN. The migrate_pages function call takes two sets of nodes and moves pages of a process that are located on the from nodes to the destination nodes. Manual migration is very useful if for example the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator can detect the situation and move the pages of the process nearer to the new processor. sys_migrate_pages() could be used on non-NUMA machines as well, to force all of a particular process's pages out to swap, if someone thinks that's useful. Larger installations usually partition the system using cpusets into sections of nodes. Paul has equipped cpusets with the ability to move pages when a task is moved to another cpuset. This allows automatic control over locality of a process. If a task is moved to a new cpuset then also all its pages are moved with it so that the performance of the process does not sink dramatically (as is the case today). Swap migration works by simply evicting the page. The pages must be faulted back in. The pages are then typically reallocated by the system near the node where the process is executing. 
For swap migration the destination of the move is controlled by the allocation policy. Cpusets set the allocation policy before calling sys_migrate_pages() in order to move the pages as intended. No allocation policy changes are performed for sys_migrate_pages(). This means that the pages may not be faulted in to the specified nodes if no allocation policy was set by other means. The pages will just end up near the node where the fault occurred. There's another patch series in the pipeline which implements "direct migration". The direct migration patchset extends the migration functionality to avoid going through swap. The destination node of the relocation is controllable during the actual moving of pages. The crutch of using the allocation policy to relocate is not necessary and the pages are moved directly to the target. It's also faster since swap is not used. And sys_migrate_pages() can then move pages directly to the specified node. Implement functions to isolate pages from the LRU and put them back later. This patch: An earlier implementation was provided by Hirokazu Takahashi <taka@valinux.co.jp> and IWAMOTO Toshihiro <iwamoto@valinux.co.jp> for the memory hotplug project. From: Magnus This breaks out isolate_lru_page() and putback_lru_pages(). Needed for swap migration. Signed-off-by: Magnus Damm <magnus.damm@gmail.com> Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Christoph Lameter <clameter@sgi.com> 2006-01-08 04:00:45 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-08 23:12:41 -0500
commit: 21eac81f252fe31c3cf64b805a1e8652192f3a3b (patch)
tree: 255662bda67f54ffde484046fd9ab9b0900ab409 /mm
parent: 15316ba81aee6775d6079fb46c66c801989e7d10 (diff)
1 files changed, 87 insertions, 13 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 428c5801d4b4..261a56ee11b6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -593,20 +593,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);
-                if (!TestClearPageLRU(page))
+                switch (__isolate_lru_page(page)) {
-                        BUG();
+                case 1:
-                list_del(&page->lru);
+                        /* Succeeded to isolate page */
-                if (get_page_testone(page)) {
+                        list_move(&page->lru, dst);
-                        /*
-                         * It is being freed elsewhere
-                         */
-                        __put_page(page);
-                        SetPageLRU(page);
-                        list_add(&page->lru, src);
-                        continue;
-                } else {
-                        list_add(&page->lru, dst);
                        nr_taken++;
+                        break;
+                case -ENOENT:
+                        /* Not possible to isolate */
+                        list_move(&page->lru, src);
+                        break;
+                default:
+                        BUG();
                }
        }
@@ -614,6 +612,48 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
        return nr_taken;
 }
+/*
+ * Adapter that gives lru_add_drain() the void (*)(void *) signature
+ * needed to pass it to schedule_on_each_cpu(). The argument is ignored.
+ */
+static void lru_add_drain_per_cpu(void *dummy)
+{
+        lru_add_drain();
+}
+/*
+ * Isolate one page from the LRU lists. Do necessary cache draining if
+ * the page is not on the LRU lists yet.
+ *
+ * NOTE(review): this function takes no destination list. On success the
+ * page is only removed from its zone's active or inactive list, so the
+ * caller is presumably expected to link it onto a list of its own.
+ *
+ * Result:
+ *  0 = page not on LRU list (also after the drain has been attempted)
+ *  1 = page removed from LRU list.
+ * -ENOENT = page is being freed elsewhere.
+ *  Any other non-zero value returned by schedule_on_each_cpu() is
+ *  passed back to the caller unchanged.
+ */
+int isolate_lru_page(struct page *page)
+{
+        int rc = 0;
+        struct zone *zone = page_zone(page);
+redo:
+        /* The isolation attempt and the list removal share one lock hold. */
+        spin_lock_irq(&zone->lru_lock);
+        rc = __isolate_lru_page(page);
+        if (rc == 1) {
+                /* Remove the page from whichever zone list it is on. */
+                if (PageActive(page))
+                        del_page_from_active_list(zone, page);
+                else
+                        del_page_from_inactive_list(zone, page);
+        }
+        spin_unlock_irq(&zone->lru_lock);
+        if (rc == 0) {
+                /*
+                 * Maybe this page is still waiting for a cpu to drain it
+                 * from one of the lru lists?
+                 */
+                rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+                /*
+                 * Retry only when the drain call returned 0 and
+                 * PageLRU() is now true; otherwise give up and return rc.
+                 */
+                if (rc == 0 && PageLRU(page))
+                        goto redo;
+        }
+        return rc;
+}
 /*
 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
 */
@@ -679,6 +719,40 @@ done:
        pagevec_release(&pvec);
 }
+/*
+ * Unlink one page from the list it is on (via page->lru) and hand it to
+ * lru_cache_add_active() or lru_cache_add(), chosen by PageActive().
+ * Ends with put_page() on the page.
+ */
+static inline void move_to_lru(struct page *page)
+{
+        list_del(&page->lru);
+        if (PageActive(page)) {
+                /*
+                 * lru_cache_add_active checks that
+                 * the PG_active bit is off.
+                 */
+                ClearPageActive(page);
+                lru_cache_add_active(page);
+        } else {
+                lru_cache_add(page);
+        }
+        /*
+         * NOTE(review): presumably drops the reference taken when the
+         * page was isolated; confirm against __isolate_lru_page().
+         */
+        put_page(page);
+}
+/*
+ * Add isolated pages on the list back to the LRU
+ *
+ * l: list head whose entries are pages linked through page->lru.
+ *    Every entry is passed to move_to_lru().
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+        struct page *page;
+        struct page *page2;
+        int count = 0;
+        /*
+         * The _safe iterator is required: move_to_lru() does a list_del()
+         * on the current entry, so the next entry is kept in page2.
+         */
+        list_for_each_entry_safe(page, page2, l, lru) {
+                move_to_lru(page);
+                count++;
+        }
+        return count;
+}
 /*
 * This moves pages from the active list to the inactive list.
 *
author	Christoph Lameter <clameter@sgi.com>	2006-01-08 04:00:45 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-08 23:12:41 -0500
commit	21eac81f252fe31c3cf64b805a1e8652192f3a3b (patch)
tree	255662bda67f54ffde484046fd9ab9b0900ab409 /mm
parent	15316ba81aee6775d6079fb46c66c801989e7d10 (diff)

diff --git a/mm/vmscan.c b/mm/vmscan.c index 428c5801d4b4..261a56ee11b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c
@@ -593,20 +593,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
593	page = lru_to_page(src);	593	page = lru_to_page(src);
594	prefetchw_prev_lru_page(page, src, flags);	594	prefetchw_prev_lru_page(page, src, flags);
595		595
596	if (!TestClearPageLRU(page))	596	switch (__isolate_lru_page(page)) {
597	BUG();	597	case 1:
598	list_del(&page->lru);	598	/* Succeeded to isolate page */
599	if (get_page_testone(page)) {	599	list_move(&page->lru, dst);
600	/*
601	* It is being freed elsewhere
602	*/
603	__put_page(page);
604	SetPageLRU(page);
605	list_add(&page->lru, src);
606	continue;
607	} else {
608	list_add(&page->lru, dst);
609	nr_taken++;	600	nr_taken++;
		601	break;
		602	case -ENOENT:
		603	/* Not possible to isolate */
		604	list_move(&page->lru, src);
		605	break;
		606	default:
		607	BUG();
610	}	608	}
611	}	609	}
612		610
@@ -614,6 +612,48 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
614	return nr_taken;	612	return nr_taken;
615	}	613	}
616		614
		615	static void lru_add_drain_per_cpu(void *dummy)
		616	{
		617	lru_add_drain();
		618	}
		619
		620	/*
		621	* Isolate one page from the LRU lists and put it on the
		622	* indicated list. Do necessary cache draining if the
		623	* page is not on the LRU lists yet.
		624	*
		625	* Result:
		626	* 0 = page not on LRU list
		627	* 1 = page removed from LRU list and added to the specified list.
		628	* -ENOENT = page is being freed elsewhere.
		629	*/
		630	int isolate_lru_page(struct page *page)
		631	{
		632	int rc = 0;
		633	struct zone *zone = page_zone(page);
		634
		635	redo:
		636	spin_lock_irq(&zone->lru_lock);
		637	rc = __isolate_lru_page(page);
		638	if (rc == 1) {
		639	if (PageActive(page))
		640	del_page_from_active_list(zone, page);
		641	else
		642	del_page_from_inactive_list(zone, page);
		643	}
		644	spin_unlock_irq(&zone->lru_lock);
		645	if (rc == 0) {
		646	/*
		647	* Maybe this page is still waiting for a cpu to drain it
		648	* from one of the lru lists?
		649	*/
		650	rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
		651	if (rc == 0 && PageLRU(page))
		652	goto redo;
		653	}
		654	return rc;
		655	}
		656
617	/*	657	/*
618	* shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed	658	* shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
619	*/	659	*/
@@ -679,6 +719,40 @@ done:
679	pagevec_release(&pvec);	719	pagevec_release(&pvec);
680	}	720	}
681		721
		722	static inline void move_to_lru(struct page *page)
		723	{
		724	list_del(&page->lru);
		725	if (PageActive(page)) {
		726	/*
		727	* lru_cache_add_active checks that
		728	* the PG_active bit is off.
		729	*/
		730	ClearPageActive(page);
		731	lru_cache_add_active(page);
		732	} else {
		733	lru_cache_add(page);
		734	}
		735	put_page(page);
		736	}
		737
		738	/*
		739	* Add isolated pages on the list back to the LRU
		740	*
		741	* returns the number of pages put back.
		742	*/
		743	int putback_lru_pages(struct list_head *l)
		744	{
		745	struct page *page;
		746	struct page *page2;
		747	int count = 0;
		748
		749	list_for_each_entry_safe(page, page2, l, lru) {
		750	move_to_lru(page);
		751	count++;
		752	}
		753	return count;
		754	}
		755
682	/*	756	/*
683	* This moves pages from the active list to the inactive list.	757	* This moves pages from the active list to the inactive list.
684	*	758	*