Diffstat (limited to 'mm/vmscan.c')
 mm/vmscan.c | 146
 1 file changed, 105 insertions(+), 41 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198f..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -71,6 +71,9 @@ struct scan_control {
 
 	int may_writepage;
 
+	/* Can pages be swapped as part of reclaim? */
+	int may_swap;
+
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!sc->may_swap)
+				goto keep_locked;
			if (!add_to_swap(page, GFP_ATOMIC))
				goto activate_locked;
		}
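
The new sc->may_swap flag lets a reclaim context opt out of swap-backed reclaim: when may_swap is zero, an anonymous page that is not yet in the swap cache takes the keep_locked path instead of having swap space allocated for it via add_to_swap(). Below is a minimal standalone sketch of just that gate; page_model, scan_control_model and main() are illustrative stand-ins, not kernel code.

#include <stdbool.h>
#include <stdio.h>

struct page_model {                 /* models PageAnon()/PageSwapCache() only */
	bool anon;
	bool swap_cache;
};

struct scan_control_model {         /* mirrors just sc->may_swap */
	int may_swap;
};

/* Returns true if reclaim may proceed to add_to_swap(), false if the page
 * should stay locked on the LRU (the "goto keep_locked" path above). */
static bool may_enter_swap_path(const struct page_model *page,
				const struct scan_control_model *sc)
{
	if (page->anon && !page->swap_cache && !sc->may_swap)
		return false;
	return true;
}

int main(void)
{
	struct page_model anon_page = { .anon = true, .swap_cache = false };
	struct scan_control_model zone_reclaim_sc = { .may_swap = 0 };
	struct scan_control_model direct_reclaim_sc = { .may_swap = 1 };

	printf("zone reclaim:   %s\n",
	       may_enter_swap_path(&anon_page, &zone_reclaim_sc) ? "swap" : "keep");
	printf("direct reclaim: %s\n",
	       may_enter_swap_path(&anon_page, &direct_reclaim_sc) ? "swap" : "keep");
	return 0;
}

This mirrors the rest of the patch: try_to_free_pages() and balance_pgdat() set may_swap = 1, while zone_reclaim() sets it to 0.
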
@@ -586,7 +591,7 @@ static inline void move_to_lru(struct page *page)
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +765,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
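
The rewritten isolate_lru_page() no longer drains per-cpu pagevecs or retries; it only succeeds if the page is visibly on an LRU list, clearing the LRU bit and taking a page reference under the zone's lru_lock so the caller owns the page once it is off the list. A userspace sketch of that protocol (zone_model, lru_page and the helper name are made up; a mutex stands in for the spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	pthread_mutex_t lru_lock;          /* stands in for zone->lru_lock */
};

struct lru_page {
	struct zone_model *zone;
	bool on_lru;                       /* stands in for the PG_lru bit */
	int refcount;                      /* stands in for the page count */
};

/* Mirrors the rewritten isolate_lru_page(): 1 = isolated with a reference
 * held, 0 = page was not on an LRU list. */
static int isolate_lru_page_model(struct lru_page *page)
{
	int ret = 0;

	if (page->on_lru) {
		struct zone_model *zone = page->zone;

		pthread_mutex_lock(&zone->lru_lock);
		if (page->on_lru) {        /* TestClearPageLRU() equivalent */
			page->on_lru = false;
			ret = 1;
			page->refcount++;  /* get_page(): caller now owns a ref */
			/* the real code also unlinks the page from the
			 * zone's active or inactive list here */
		}
		pthread_mutex_unlock(&zone->lru_lock);
	}
	return ret;
}

int main(void)
{
	struct zone_model zone;
	struct lru_page page = { .zone = &zone, .on_lru = true, .refcount = 1 };
	int ret;

	pthread_mutex_init(&zone.lru_lock, NULL);
	ret = isolate_lru_page_model(&page);
	printf("first isolate:  %d (refcount now %d)\n", ret, page.refcount);
	ret = isolate_lru_page_model(&page);
	printf("second isolate: %d (refcount now %d)\n", ret, page.refcount);
	return 0;
}

The second call returns 0, matching the new contract that a page not on the LRU is simply reported as such rather than retried.
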
@@ -831,18 +823,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
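
In isolate_lru_pages(), clearing the LRU bit is now expected to succeed (hence the BUG() otherwise), and get_page_testone() both takes a reference and reports whether the page's count had already dropped to zero, meaning it is being freed elsewhere; such a page is handed back to the source list untouched. A rough analogue of that check, using a plain C11 atomic counter (the kernel of this era stores _count biased by one, which is what lets a single inc-and-test detect the zero case):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Takes a reference and returns true if the object was already at refcount
 * zero, i.e. it is being freed and must be put back (the __put_page() /
 * list_add(..., src) branch in the hunk above). */
static bool get_ref_testzero(atomic_int *refcount)
{
	return atomic_fetch_add(refcount, 1) == 0;
}

int main(void)
{
	atomic_int live_page = 2;   /* still mapped somewhere: safe to isolate */
	atomic_int dying_page = 0;  /* last reference already dropped */

	printf("live page being freed?  %s\n",
	       get_ref_testzero(&live_page) ? "yes" : "no");
	printf("dying page being freed? %s\n",
	       get_ref_testzero(&dying_page) ? "yes" : "no");
	return 0;
}
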
@@ -1177,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = 0;
+	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
 
@@ -1279,6 +1274,7 @@ loop_again:
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
 	sc.may_writepage = 0;
+	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1576,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Minimum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = 0,
+		.may_swap = 0,
+		.nr_mapped = read_page_state(nr_mapped),
+		.nr_scanned = 0,
+		.nr_reclaimed = 0,
+		.priority = 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
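
For reference, zone_reclaim() bails out early for allocations that cannot wait (!__GFP_WAIT), for zones on remote nodes, for zones marked all_unreclaimable or already under reclaim, and for ZONE_RECLAIM_INTERVAL (half a second) after an attempt that reclaimed nothing, as recorded in zone->last_unsuccessful_zone_reclaim. The standalone sketch below models only that back-off test, using the same wraparound-safe comparison that time_before() performs; MODEL_HZ and struct zone_state are made-up names:

#include <stdbool.h>
#include <stdio.h>

#define MODEL_HZ               250              /* assumed tick rate */
#define MODEL_RECLAIM_INTERVAL (MODEL_HZ / 2)   /* mirrors ZONE_RECLAIM_INTERVAL */

/* Wraparound-safe "a happened before b", as time_before() does. */
static bool jiffies_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

struct zone_state {                              /* illustrative only */
	unsigned long last_unsuccessful_zone_reclaim;
};

/* Returns true if zone reclaim should be skipped for now. */
static bool zone_reclaim_backoff(const struct zone_state *zone,
				 unsigned long jiffies_now)
{
	return jiffies_before(jiffies_now,
			      zone->last_unsuccessful_zone_reclaim +
			      MODEL_RECLAIM_INTERVAL);
}

int main(void)
{
	struct zone_state zone = { .last_unsuccessful_zone_reclaim = 1000 };

	printf("at jiffy 1050: skip=%d\n", zone_reclaim_backoff(&zone, 1050));
	printf("at jiffy 1200: skip=%d\n", zone_reclaim_backoff(&zone, 1200));
	return 0;
}

With MODEL_HZ = 250 the interval is 125 ticks, so the attempt at jiffy 1050 is skipped and the one at 1200 is allowed, just as a real zone would be retried once half a second has passed since the last unsuccessful reclaim.
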