Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  140
1 file changed, 100 insertions, 40 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d02254d..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
@@ -62,6 +63,8 @@ struct scan_control {
         int swap_cluster_max;
 
         int swappiness;
+
+        int all_unreclaimable;
 };
 
 /*
@@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
                         /* synchronous write or broken a_ops? */
                         ClearPageReclaim(page);
                 }
-
+                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                 return PAGE_SUCCESS;
         }
 
@@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-        if (!mapping)
-                return 0;                /* truncate got there first */
+        BUG_ON(!PageLocked(page));
+        BUG_ON(mapping != page_mapping(page));
 
         write_lock_irq(&mapping->tree_lock);
-
         /*
-         * The non-racy check for busy page. It is critical to check
-         * PageDirty _after_ making sure that the page is freeable and
-         * not in use by anybody. (pagecache + us == 2)
+         * The non racy check for a busy page.
+         *
+         * Must be careful with the order of the tests. When someone has
+         * a ref to the page, it may be possible that they dirty it then
+         * drop the reference. So if PageDirty is tested before page_count
+         * here, then the following race may occur:
+         *
+         * get_user_pages(&page);
+         * [user mapping goes away]
+         * write_to(page);
+         *                              !PageDirty(page)    [good]
+         * SetPageDirty(page);
+         * put_page(page);
+         *                              !page_count(page)   [good, discard it]
+         *
+         * [oops, our write_to data is lost]
+         *
+         * Reversing the order of the tests ensures such a situation cannot
+         * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+         * load is not satisfied before that of page->_count.
+         *
+         * Note that if SetPageDirty is always performed via set_page_dirty,
+         * and thus under tree_lock, then this ordering is not required.
          */
         if (unlikely(page_count(page) != 2))
                 goto cannot_free;
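
The ordering argument in the comment above can be illustrated with a small stand-alone model. This is only a user-space sketch of the check order (refcount before dirty bit); struct fake_page, its fields, and try_discard() are invented for illustration and are not kernel code.

/*
 * Minimal user-space sketch of the check ordering described in the new
 * remove_mapping() comment; "struct fake_page" is a hypothetical stand-in,
 * not the kernel's struct page.
 */
#include <stdio.h>
#include <stdbool.h>

struct fake_page {
        int count;      /* pagecache ref + our ref == 2 when freeable */
        bool dirty;
};

/* Mirrors the order used under tree_lock: refcount first, dirty second. */
static bool try_discard(struct fake_page *page)
{
        if (page->count != 2)
                return false;   /* someone else still holds a reference */
        if (page->dirty)
                return false;   /* must be written back, not discarded */
        return true;            /* safe to drop from the cache */
}

int main(void)
{
        /* A third reference (e.g. from get_user_pages) blocks the discard
         * outright, so a later dirtying by that holder cannot be missed. */
        struct fake_page p = { .count = 3, .dirty = false };

        printf("discard: %s\n", try_discard(&p) ? "yes" : "no");
        return 0;
}
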
@@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 if (TestSetPageLocked(page))
                         goto keep;
 
-                BUG_ON(PageActive(page));
+                VM_BUG_ON(PageActive(page));
 
                 sc->nr_scanned++;
 
@@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         goto free_it;
                 }
 
-                if (!remove_mapping(mapping, page))
+                if (!mapping || !remove_mapping(mapping, page))
                         goto keep_locked;
 
 free_it:
@@ -564,7 +586,7 @@ keep_locked:
                 unlock_page(page);
 keep:
                 list_add(&page->lru, &ret_pages);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
         }
         list_splice(&ret_pages, page_list);
         if (pagevec_count(&freed_pvec))
@@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 page = lru_to_page(src);
                 prefetchw_prev_lru_page(page, src, flags);
 
-                BUG_ON(!PageLRU(page));
+                VM_BUG_ON(!PageLRU(page));
 
                 list_del(&page->lru);
                 target = src;
@@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
          */
         while (!list_empty(&page_list)) {
                 page = lru_to_page(&page_list);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
                 SetPageLRU(page);
                 list_del(&page->lru);
                 if (PageActive(page))
@@ -695,6 +717,11 @@ done:
         return nr_reclaimed;
 }
 
+static inline int zone_is_near_oom(struct zone *zone)
+{
+        return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 long distress;
                 long swap_tendency;
 
+                if (zone_is_near_oom(zone))
+                        goto force_reclaim_mapped;
+
                 /*
                  * `distress' is a measure of how much trouble we're having
                  * reclaiming pages. 0 -> no problems. 100 -> great trouble.
@@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                  * memory onto the inactive list.
                  */
                 if (swap_tendency >= 100)
+force_reclaim_mapped:
                         reclaim_mapped = 1;
         }
 
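For reference, a minimal user-space sketch of the zone_is_near_oom() shortcut introduced above: once a zone has been scanned three times over, the swap_tendency arithmetic is skipped and mapped pages are reclaimed unconditionally. The struct and sample numbers below are made up; only the threshold test mirrors the patch.

#include <stdio.h>

struct fake_zone {
        unsigned long pages_scanned;
        unsigned long nr_active;
        unsigned long nr_inactive;
};

/* Same test as the new zone_is_near_oom() helper. */
static int near_oom(const struct fake_zone *z)
{
        return z->pages_scanned >= (z->nr_active + z->nr_inactive) * 3;
}

int main(void)
{
        struct fake_zone z = { .pages_scanned = 7000,
                               .nr_active = 1000, .nr_inactive = 1000 };

        /* 7000 >= (1000 + 1000) * 3, so mapped pages would be reclaimed
         * without consulting distress/swappiness at all. */
        printf("force reclaim_mapped: %d\n", near_oom(&z));
        return 0;
}
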
@@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         while (!list_empty(&l_inactive)) {
                 page = lru_to_page(&l_inactive);
                 prefetchw_prev_lru_page(page, &l_inactive, flags);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
                 SetPageLRU(page);
-                BUG_ON(!PageActive(page));
+                VM_BUG_ON(!PageActive(page));
                 ClearPageActive(page);
 
                 list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         while (!list_empty(&l_active)) {
                 page = lru_to_page(&l_active);
                 prefetchw_prev_lru_page(page, &l_active, flags);
-                BUG_ON(PageLRU(page));
+                VM_BUG_ON(PageLRU(page));
                 SetPageLRU(page);
-                BUG_ON(!PageActive(page));
+                VM_BUG_ON(!PageActive(page));
                 list_move(&page->lru, &zone->active_list);
                 pgmoved++;
                 if (!pagevec_add(&pvec, page)) {
@@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
         unsigned long nr_reclaimed = 0;
         int i;
 
+        sc->all_unreclaimable = 1;
         for (i = 0; zones[i] != NULL; i++) {
                 struct zone *zone = zones[i];
 
@@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
                 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                         continue;  /* Let kswapd poll it */
 
+                sc->all_unreclaimable = 0;
+
                 nr_reclaimed += shrink_zone(priority, zone, sc);
         }
         return nr_reclaimed;
@@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
                         blk_congestion_wait(WRITE, HZ/10);
         }
+        /* top priority shrink_caches still had more to do? don't OOM, then */
+        if (!sc.all_unreclaimable)
+                ret = 1;
 out:
         for (i = 0; zones[i] != 0; i++) {
                 struct zone *zone = zones[i];
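
A rough user-space model of the new all_unreclaimable bookkeeping spread across shrink_zones() and try_to_free_pages(): the flag starts at 1, is cleared for every zone that is actually scanned, and progress is reported afterwards so the caller does not declare OOM too early. The helper below compresses both functions into one simplified loop; all names are illustrative.

#include <stdio.h>

struct fake_zone { int all_unreclaimable; };

static int shrink_zones_model(struct fake_zone *zones, int nr,
                              int *sc_all_unreclaimable)
{
        int i, scanned = 0;

        *sc_all_unreclaimable = 1;
        for (i = 0; i < nr; i++) {
                if (zones[i].all_unreclaimable)
                        continue;               /* leave it to kswapd */
                *sc_all_unreclaimable = 0;      /* we did look at a zone */
                scanned++;
        }
        return scanned;
}

int main(void)
{
        struct fake_zone zones[] = { { 1 }, { 0 }, { 1 } };
        int all_unreclaimable;

        shrink_zones_model(zones, 3, &all_unreclaimable);
        /* One zone was still scannable, so "don't OOM yet" is reported. */
        printf("report progress: %d\n", !all_unreclaimable);
        return 0;
}
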
@@ -1153,7 +1190,7 @@ scan:
                         if (zone->all_unreclaimable)
                                 continue;
                         if (nr_slab == 0 && zone->pages_scanned >=
-                                    (zone->nr_active + zone->nr_inactive) * 4)
+                                    (zone->nr_active + zone->nr_inactive) * 6)
                                 zone->all_unreclaimable = 1;
                         /*
                          * If we've done a decent amount of scanning and
@@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
         for_each_zone(zone)
                 lru_pages += zone->nr_active + zone->nr_inactive;
 
-        nr_slab = global_page_state(NR_SLAB);
+        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
         /* If slab caches are huge, it's better to hit them first */
         while (nr_slab >= lru_pages) {
                 reclaim_state.reclaimed_slab = 0;
@@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)   /* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)  /* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)   /* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)   /* Do a global slab shrink if the zone is out of memory */
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly;
 int sysctl_min_unmapped_ratio = 1;
 
 /*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 .gfp_mask = gfp_mask,
                 .swappiness = vm_swappiness,
         };
+        unsigned long slab_reclaimable;
 
         disable_swap_token();
         cond_resched();
@@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
 
-        /*
-         * Free memory by calling shrink zone with increasing priorities
-         * until we have enough memory freed.
-         */
-        priority = ZONE_RECLAIM_PRIORITY;
-        do {
-                nr_reclaimed += shrink_zone(priority, zone, &sc);
-                priority--;
-        } while (priority >= 0 && nr_reclaimed < nr_pages);
+        if (zone_page_state(zone, NR_FILE_PAGES) -
+                zone_page_state(zone, NR_FILE_MAPPED) >
+                zone->min_unmapped_pages) {
+                /*
+                 * Free memory by calling shrink zone with increasing
+                 * priorities until we have enough memory freed.
+                 */
+                priority = ZONE_RECLAIM_PRIORITY;
+                do {
+                        nr_reclaimed += shrink_zone(priority, zone, &sc);
+                        priority--;
+                } while (priority >= 0 && nr_reclaimed < nr_pages);
+        }
 
-        if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+        slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+        if (slab_reclaimable > zone->min_slab_pages) {
                 /*
                  * shrink_slab() does not currently allow us to determine how
-                 * many pages were freed in this zone. So we just shake the slab
-                 * a bit and then go off node for this particular allocation
-                 * despite possibly having freed enough memory to allocate in
-                 * this zone. If we freed local memory then the next
-                 * allocations will be local again.
+                 * many pages were freed in this zone. So we take the current
+                 * number of slab pages and shake the slab until it is reduced
+                 * by the same nr_pages that we used for reclaiming unmapped
+                 * pages.
                  *
-                 * shrink_slab will free memory on all zones and may take
-                 * a long time.
+                 * Note that shrink_slab will free memory on all zones and may
+                 * take a long time.
+                 */
+                while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+                        zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+                                slab_reclaimable - nr_pages)
+                        ;
+
+                /*
+                 * Update nr_reclaimed by the number of slab pages we
+                 * reclaimed from this zone.
                  */
-                shrink_slab(sc.nr_scanned, gfp_mask, order);
+                nr_reclaimed += slab_reclaimable -
+                        zone_page_state(zone, NR_SLAB_RECLAIMABLE);
         }
 
         p->reclaim_state = NULL;
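
The slab pass added to __zone_reclaim() can be sketched outside the kernel as follows: note the starting number of reclaimable slab pages, shake the shrinker until it reports no progress or the count has dropped by nr_pages, then credit the actual drop to nr_reclaimed. fake_shrink_slab() is a stand-in for shrink_slab(), which really takes scan counts and a gfp mask and works across all zones, so treat this purely as a model of the loop-and-credit logic.

#include <stdio.h>

static unsigned long slab_pages = 1000;        /* models NR_SLAB_RECLAIMABLE */

static unsigned long fake_shrink_slab(void)
{
        unsigned long freed = slab_pages >= 50 ? 50 : slab_pages;

        slab_pages -= freed;
        return freed;                   /* 0 means "no progress" */
}

int main(void)
{
        const unsigned long nr_pages = 128;     /* reclaim target */
        unsigned long slab_reclaimable = slab_pages;
        unsigned long nr_reclaimed = 0;

        /* Keep shrinking while progress is made and the target drop
         * has not yet been reached. */
        while (fake_shrink_slab() &&
               slab_pages > slab_reclaimable - nr_pages)
                ;

        /* Credit only what actually disappeared from this "zone". */
        nr_reclaimed += slab_reclaimable - slab_pages;
        printf("slab pages reclaimed: %lu\n", nr_reclaimed);
        return 0;
}
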
@@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         int node_id;
 
         /*
-         * Zone reclaim reclaims unmapped file backed pages.
+         * Zone reclaim reclaims unmapped file backed pages and
+         * slab pages if we are over the defined limits.
          *
          * A small portion of unmapped file backed pages is needed for
          * file I/O otherwise pages read by file I/O will be immediately
@@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
          * unmapped file backed pages.
          */
         if (zone_page_state(zone, NR_FILE_PAGES) -
-            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+                && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+                        <= zone->min_slab_pages)
                 return 0;
 
         /*
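
A tiny illustration of the widened entry test in zone_reclaim(): reclaim is now attempted if either unmapped file pages or reclaimable slab exceed their per-zone thresholds. The numbers below are arbitrary samples, not real zone counters.

#include <stdio.h>

int main(void)
{
        unsigned long file_pages = 4000, file_mapped = 3900;
        unsigned long slab_reclaimable = 900;
        unsigned long min_unmapped_pages = 200, min_slab_pages = 500;

        int skip = (file_pages - file_mapped) <= min_unmapped_pages &&
                   slab_reclaimable <= min_slab_pages;

        /* Only 100 unmapped file pages (under the limit), but slab is over
         * its limit, so zone reclaim still runs. */
        printf("run zone_reclaim: %d\n", !skip);
        return 0;
}
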
@@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
          * over remote processors and spread off node memory allocations
          * as wide as possible.
          */
-        node_id = zone->zone_pgdat->node_id;
+        node_id = zone_to_nid(zone);
         mask = node_to_cpumask(node_id);
         if (!cpus_empty(mask) && node_id != numa_node_id())
                 return 0;