author     Jiri Kosina <jkosina@suse.cz>  2011-07-11 08:15:48 -0400
committer  Jiri Kosina <jkosina@suse.cz>  2011-07-11 08:15:55 -0400
commit     b7e9c223be8ce335e30f2cf6ba588e6a4092275c (patch)
tree       2d1e3b75606abc18df7ad65e51ac3f90cd68b38d /mm/memcontrol.c
parent     c172d82500a6cf3c32d1e650722a1055d72ce858 (diff)
parent     e3bbfa78bab125f58b831b5f7f45b5a305091d72 (diff)
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply pending patches that are based on newer code already present upstream.
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 222
1 file changed, 158 insertions(+), 64 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t	scan_nodes;
-	unsigned long	next_scan_node_update;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
 				MEM_CGROUP_TARGET_SOFTLIMIT);
 		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
+		}
+#endif
 	}
 }
 
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
 
-	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory and
+			 * charges will not give any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
 	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
 	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
 	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
 	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging. It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * getcpu()/putcpu().
+	 */
+	curcpu = raw_smp_processor_id();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
 	}
 	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 	/* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
 	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
 	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "numa_stat",
 		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
 	},
 #endif
 };
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else /* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,