path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	473
1 files changed, 290 insertions, 183 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..b2ee6df0e9bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
-	MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
  */
 struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
-	unsigned long		count[NR_LRU_LISTS];
+	unsigned long		lru_size[NR_LRU_LISTS];
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	bool			on_tree;
-	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
+	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
-/* Macro for accessing counter */
-#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
 
 struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -300,6 +297,12 @@ struct mem_cgroup {
 	 */
 	unsigned long	move_charge_at_immigrate;
 	/*
+	 * set > 0 if pages under this cgroup are moving to other cgroup.
+	 */
+	atomic_t	moving_account;
+	/* taken only while moving_account > 0 */
+	spinlock_t	move_lock;
+	/*
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
@@ -612,9 +615,9 @@ retry:
 		 * we will to add it back at the end of reclaim to its correct
 		 * position in the tree.
 		 */
-		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
-		if (!res_counter_soft_limit_excess(&mz->mem->res) ||
-			!css_tryget(&mz->mem->css))
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+			!css_tryget(&mz->memcg->css))
 			goto retry;
 done:
 	return mz;
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
-					 bool file, int nr_pages)
+					 bool anon, int nr_pages)
 {
 	preempt_disable();
 
-	if (file)
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+	/*
+	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
+	 * counted as CACHE even if it's on ANON LRU.
+	 */
+	if (anon)
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 				nr_pages);
 	else
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 				nr_pages);
 
 	/* pagein of a big page is an event. So, ignore page size */
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 			unsigned int lru_mask)
 {
 	struct mem_cgroup_per_zone *mz;
-	enum lru_list l;
+	enum lru_list lru;
 	unsigned long ret = 0;
 
 	mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
-	for_each_lru(l) {
-		if (BIT(l) & lru_mask)
-			ret += MEM_CGROUP_ZSTAT(mz, l);
+	for_each_lru(lru) {
+		if (BIT(lru) & lru_mask)
+			ret += mz->lru_size[lru];
 	}
 	return ret;
 }
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 
 	mz = page_cgroup_zoneinfo(memcg, page);
 	/* compound_order() is stabilized through lru_lock */
-	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
+	mz->lru_size[lru] += 1 << compound_order(page);
 	return &mz->lruvec;
 }
 
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
 	VM_BUG_ON(!memcg);
 	mz = page_cgroup_zoneinfo(memcg, page);
 	/* huge page split is done under lru_lock. so, we have no races. */
-	VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
+	VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
+	mz->lru_size[lru] -= 1 << compound_order(page);
 }
 
 void mem_cgroup_lru_del(struct page *page)
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 	return memcg->swappiness;
 }
 
-static void mem_cgroup_start_move(struct mem_cgroup *memcg)
-{
-	int cpu;
+/*
+ * memcg->moving_account is used for checking possibility that some thread is
+ * calling move_account(). When a thread on CPU-A starts moving pages under
+ * a memcg, other threads should check memcg->moving_account under
+ * rcu_read_lock(), like this:
+ *
+ *         CPU-A                         CPU-B
+ *                                   rcu_read_lock()
+ *         memcg->moving_account+1   if (memcg->mocing_account)
+ *                                       take heavy locks.
+ *         synchronize_rcu()         update something.
+ *                                   rcu_read_unlock()
+ *         start move here.
+ */
 
-	get_online_cpus();
-	spin_lock(&memcg->pcp_counter_lock);
-	for_each_online_cpu(cpu)
-		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
-	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
-	spin_unlock(&memcg->pcp_counter_lock);
-	put_online_cpus();
+/* for quick checking without looking up memcg */
+atomic_t memcg_moving __read_mostly;
 
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
+{
+	atomic_inc(&memcg_moving);
+	atomic_inc(&memcg->moving_account);
 	synchronize_rcu();
 }
 
 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 {
-	int cpu;
-
-	if (!memcg)
-		return;
-	get_online_cpus();
-	spin_lock(&memcg->pcp_counter_lock);
-	for_each_online_cpu(cpu)
-		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
-	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
-	spin_unlock(&memcg->pcp_counter_lock);
-	put_online_cpus();
+	/*
+	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
+	 * We check NULL in callee rather than caller.
+	 */
+	if (memcg) {
+		atomic_dec(&memcg_moving);
+		atomic_dec(&memcg->moving_account);
+	}
 }
+
 /*
  * 2 routines for checking "mem" is under move_account() or not.
  *
- * mem_cgroup_stealed() -  checking a cgroup is mc.from or not. This is used
- *			   for avoiding race in accounting. If true,
+ * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
+ *			  is used for avoiding races in accounting. If true,
  *			  pc->mem_cgroup may be overwritten.
  *
  * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
  *			  waiting at hith-memory prressure caused by "move".
  */
 
-static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
+static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
 {
 	VM_BUG_ON(!rcu_read_lock_held());
-	return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+	return atomic_read(&memcg->moving_account) > 0;
 }
 
 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
+/*
+ * Take this lock when
+ * - a code tries to modify page's memcg while it's USED.
+ * - a code tries to modify page state accounting in a memcg.
+ *   see mem_cgroup_stolen(), too.
+ */
+static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
+				  unsigned long *flags)
+{
+	spin_lock_irqsave(&memcg->move_lock, *flags);
+}
+
+static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
+				  unsigned long *flags)
+{
+	spin_unlock_irqrestore(&memcg->move_lock, *flags);
+}
+
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	if (!memcg || !p)
 		return;
 
-
 	rcu_read_lock();
 
 	mem_cgrp = memcg->css.cgroup;
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 	wait_queue_t	wait;
 };
 
 static int memcg_oom_wake_function(wait_queue_t *wait,
 	unsigned mode, int sync, void *arg)
 {
-	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
-			  *oom_wait_memcg;
+	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+	struct mem_cgroup *oom_wait_memcg;
 	struct oom_wait_info *oom_wait_info;
 
 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
-	oom_wait_memcg = oom_wait_info->mem;
+	oom_wait_memcg = oom_wait_info->memcg;
 
 	/*
-	 * Both of oom_wait_info->mem and wake_mem are stable under us.
+	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
 	 * Then we can use css_is_ancestor without taking care of RCU.
 	 */
 	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
 	struct oom_wait_info owait;
 	bool locked, need_to_kill;
 
-	owait.mem = memcg;
+	owait.memcg = memcg;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
 
 	if (need_to_kill) {
 		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, mask);
+		mem_cgroup_out_of_memory(memcg, mask, order);
 	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
  * by flags.
  *
  * Considering "move", this is an only case we see a race. To make the race
- * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
- * possibility of race condition. If there is, we take a lock.
+ * small, we check mm->moving_account and detect there are possibility of race
+ * If there is, we take a lock.
  */
 
+void __mem_cgroup_begin_update_page_stat(struct page *page,
+				bool *locked, unsigned long *flags)
+{
+	struct mem_cgroup *memcg;
+	struct page_cgroup *pc;
+
+	pc = lookup_page_cgroup(page);
+again:
+	memcg = pc->mem_cgroup;
+	if (unlikely(!memcg || !PageCgroupUsed(pc)))
+		return;
+	/*
+	 * If this memory cgroup is not under account moving, we don't
+	 * need to take move_lock_page_cgroup(). Because we already hold
+	 * rcu_read_lock(), any calls to move_account will be delayed until
+	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
+	 */
+	if (!mem_cgroup_stolen(memcg))
+		return;
+
+	move_lock_mem_cgroup(memcg, flags);
+	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
+		move_unlock_mem_cgroup(memcg, flags);
+		goto again;
+	}
+	*locked = true;
+}
+
+void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+{
+	struct page_cgroup *pc = lookup_page_cgroup(page);
+
+	/*
+	 * It's guaranteed that pc->mem_cgroup never changes while
+	 * lock is held because a routine modifies pc->mem_cgroup
+	 * should take move_lock_page_cgroup().
+	 */
+	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+}
+
 void mem_cgroup_update_page_stat(struct page *page,
 				 enum mem_cgroup_page_stat_item idx, int val)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
-	bool need_unlock = false;
 	unsigned long uninitialized_var(flags);
 
 	if (mem_cgroup_disabled())
 		return;
 
-	rcu_read_lock();
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		goto out;
-	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
-		/* take a lock against to access pc->mem_cgroup */
-		move_lock_page_cgroup(pc, &flags);
-		need_unlock = true;
-		memcg = pc->mem_cgroup;
-		if (!memcg || !PageCgroupUsed(pc))
-			goto out;
-	}
+		return;
 
 	switch (idx) {
 	case MEMCG_NR_FILE_MAPPED:
-		if (val > 0)
-			SetPageCgroupFileMapped(pc);
-		else if (!page_mapped(page))
-			ClearPageCgroupFileMapped(pc);
 		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
 	default:
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 	}
 
 	this_cpu_add(memcg->stat->count[idx], val);
-
-out:
-	if (unlikely(need_unlock))
-		move_unlock_page_cgroup(pc, &flags);
-	rcu_read_unlock();
-	return;
 }
-EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 		per_cpu(memcg->stat->events[i], cpu) = 0;
 		memcg->nocpu_base.events[i] += x;
 	}
-	/* need to clear ON_MOVE value, works as a kind of lock. */
-	per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
-	spin_unlock(&memcg->pcp_counter_lock);
-}
-
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
-{
-	int idx = MEM_CGROUP_ON_MOVE;
-
-	spin_lock(&memcg->pcp_counter_lock);
-	per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 }
 
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	struct memcg_stock_pcp *stock;
 	struct mem_cgroup *iter;
 
-	if ((action == CPU_ONLINE)) {
-		for_each_mem_cgroup(iter)
-			synchronize_mem_cgroup_on_move(iter, cpu);
+	if (action == CPU_ONLINE)
 		return NOTIFY_OK;
-	}
 
 	if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (!oom_check)
 		return CHARGE_NOMEM;
 	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
 		return CHARGE_OOM_DIE;
 
 	return CHARGE_RETRY;
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 {
 	struct zone *uninitialized_var(zone);
 	bool was_on_lru = false;
+	bool anon;
 
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	 * See mem_cgroup_add_lru_list(), etc.
 	 */
 	smp_wmb();
-	switch (ctype) {
-	case MEM_CGROUP_CHARGE_TYPE_CACHE:
-	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
-		SetPageCgroupCache(pc);
-		SetPageCgroupUsed(pc);
-		break;
-	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
-		ClearPageCgroupCache(pc);
-		SetPageCgroupUsed(pc);
-		break;
-	default:
-		break;
-	}
+	SetPageCgroupUsed(pc);
 
 	if (lrucare) {
 		if (was_on_lru) {
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 		spin_unlock_irq(&zone->lru_lock);
 	}
 
-	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
+	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+		anon = true;
+	else
+		anon = false;
+
+	mem_cgroup_charge_statistics(memcg, anon, nr_pages);
 	unlock_page_cgroup(pc);
 
 	/*
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
-			(1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
+	bool anon = PageAnon(page);
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(page));
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
 		goto unlock;
 
-	move_lock_page_cgroup(pc, &flags);
+	move_lock_mem_cgroup(from, &flags);
 
-	if (PageCgroupFileMapped(pc)) {
+	if (!anon && page_mapped(page)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
+	mem_cgroup_charge_statistics(from, anon, -nr_pages);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
 		__mem_cgroup_cancel_charge(from, nr_pages);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
+	mem_cgroup_charge_statistics(to, anon, nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
 	 * guaranteed that "to" is never removed. So, we don't check rmdir
 	 * status here.
 	 */
-	move_unlock_page_cgroup(pc, &flags);
+	move_unlock_mem_cgroup(from, &flags);
 	ret = 0;
 unlock:
 	unlock_page_cgroup(pc);
@@ -2914,7 +2944,6 @@ direct_uncharge:
 		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
 	if (unlikely(batch->memcg != memcg))
 		memcg_oom_recover(memcg);
-	return;
 }
 
 /*
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct mem_cgroup *memcg = NULL;
 	unsigned int nr_pages = 1;
 	struct page_cgroup *pc;
+	bool anon;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
 
+	anon = PageAnon(page);
+
 	switch (ctype) {
 	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+		/*
+		 * Generally PageAnon tells if it's the anon statistics to be
+		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
+		 * used before page reached the stage of being marked PageAnon.
+		 */
+		anon = true;
+		/* fallthrough */
 	case MEM_CGROUP_CHARGE_TYPE_DROP:
 		/* See mem_cgroup_prepare_migration() */
 		if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
+	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 {
 	struct page *used, *unused;
 	struct page_cgroup *pc;
+	bool anon;
 
 	if (!memcg)
 		return;
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	lock_page_cgroup(pc);
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
-
-	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
+	anon = PageAnon(used);
+	__mem_cgroup_uncharge_common(unused,
+		anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
+		     : MEM_CGROUP_CHARGE_TYPE_CACHE);
 
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
 	 * check. (see prepare_charge() also)
 	 */
-	if (PageAnon(used))
+	if (anon)
 		mem_cgroup_uncharge_page(used);
 	/*
 	 * At migration, we may charge account against cgroup which has no
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
 	/* fix accounting on old pages */
 	lock_page_cgroup(pc);
 	memcg = pc->mem_cgroup;
-	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+	mem_cgroup_charge_statistics(memcg, false, -1);
 	ClearPageCgroupUsed(pc);
 	unlock_page_cgroup(pc);
 
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 			break;
 
 		nr_scanned = 0;
-		reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
 						    gfp_mask, &nr_scanned);
 		nr_reclaimed += reclaimed;
 		*total_scanned += nr_scanned;
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 				next_mz =
 				__mem_cgroup_largest_soft_limit_node(mctz);
 				if (next_mz == mz)
-					css_put(&next_mz->mem->css);
+					css_put(&next_mz->memcg->css);
 				else /* next_mz == NULL or other memcg */
 					break;
 			} while (1);
 		}
-		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
-		excess = res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
 		/*
 		 * One school of thought says that we should not add
 		 * back the node to the tree if reclaim returns 0.
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 		 * term TODO.
 		 */
 		/* If excess == 0, no tree ops */
-		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
 		spin_unlock(&mctz->lock);
-		css_put(&mz->mem->css);
+		css_put(&mz->memcg->css);
 		loop++;
 		/*
 		 * Could not reclaim anything and there are no more
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 			break;
 	} while (!nr_reclaimed);
 	if (next_mz)
-		css_put(&next_mz->mem->css);
+		css_put(&next_mz->memcg->css);
 	return nr_reclaimed;
 }
 
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 	mz = mem_cgroup_zoneinfo(memcg, node, zid);
 	list = &mz->lruvec.lists[lru];
 
-	loop = MEM_CGROUP_ZSTAT(mz, lru);
+	loop = mz->lru_size[lru];
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
@@ -3703,10 +3745,10 @@ move_account:
 	mem_cgroup_start_move(memcg);
 	for_each_node_state(node, N_HIGH_MEMORY) {
 		for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
-			enum lru_list l;
-			for_each_lru(l) {
+			enum lru_list lru;
+			for_each_lru(lru) {
 				ret = mem_cgroup_force_empty_list(memcg,
-							node, zid, l);
+							node, zid, lru);
 				if (ret)
 					break;
 			}
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		break;
 	default:
 		BUG();
-		break;
 	}
 	return val;
 }
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
 out:
 	*mem_limit = min_limit;
 	*memsw_limit = min_memsw_limit;
-	return;
 }
 
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
 	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
 	unsigned long node_nr;
 	struct cgroup *cont = m->private;
-	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
+	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
 	seq_printf(m, "total=%lu", total_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
+		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
 
-	file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
+	file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
 	seq_printf(m, "file=%lu", file_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
 						LRU_ALL_FILE);
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
 
-	anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
+	anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
 	seq_printf(m, "anon=%lu", anon_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
 						LRU_ALL_ANON);
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
 
-	unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
+	unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
 	seq_printf(m, "unevictable=%lu", unevictable_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
 						BIT(LRU_UNEVICTABLE));
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 				 struct cgroup_map_cb *cb)
 {
-	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct mcs_total_stat mystat;
 	int i;
 
 	memset(&mystat, 0, sizeof(mystat));
-	mem_cgroup_get_local_stat(mem_cont, &mystat);
+	mem_cgroup_get_local_stat(memcg, &mystat);
 
 
 	for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	/* Hierarchical information */
 	{
 		unsigned long long limit, memsw_limit;
-		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
+		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
 		cb->fill(cb, "hierarchical_memory_limit", limit);
 		if (do_swap_account)
 			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
 	}
 
 	memset(&mystat, 0, sizeof(mystat));
-	mem_cgroup_get_total_stat(mem_cont, &mystat);
+	mem_cgroup_get_total_stat(memcg, &mystat);
 	for (i = 0; i < NR_MCS_STAT; i++) {
 		if (i == MCS_SWAP && !do_swap_account)
 			continue;
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 
 		for_each_online_node(nid)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
 				recent_rotated[0] +=
 					mz->reclaim_stat.recent_rotated[0];
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 	else
 		BUG();
 
-	/*
-	 * Something went wrong if we trying to unregister a threshold
-	 * if we don't have thresholds
-	 */
-	BUG_ON(!thresholds);
-
 	if (!thresholds->primary)
 		goto unlock;
 
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
-	enum lru_list l;
+	enum lru_list lru;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		for_each_lru(l)
-			INIT_LIST_HEAD(&mz->lruvec.lists[l]);
+		for_each_lru(lru)
+			INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
-		mz->mem = memcg;
+		mz->memcg = memcg;
 	}
 	memcg->info.nodeinfo[node] = pn;
 	return 0;
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 	int size = sizeof(struct mem_cgroup);
 
 	/* Can be very big if MAX_NUMNODES is very big */
 	if (size < PAGE_SIZE)
-		mem = kzalloc(size, GFP_KERNEL);
+		memcg = kzalloc(size, GFP_KERNEL);
 	else
-		mem = vzalloc(size);
+		memcg = vzalloc(size);
 
-	if (!mem)
+	if (!memcg)
 		return NULL;
 
-	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!mem->stat)
+	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg->stat)
 		goto out_free;
-	spin_lock_init(&mem->pcp_counter_lock);
-	return mem;
+	spin_lock_init(&memcg->pcp_counter_lock);
+	return memcg;
 
 out_free:
 	if (size < PAGE_SIZE)
-		kfree(mem);
+		kfree(memcg);
 	else
-		vfree(mem);
+		vfree(memcg);
 	return NULL;
 }
 
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont)
 	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
+	spin_lock_init(&memcg->move_lock);
 	return &memcg->css;
 free_out:
 	__mem_cgroup_free(memcg);
@@ -5075,7 +5110,7 @@ one_by_one:
 }
 
 /**
- * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * get_mctgt_type - get target type of moving charge
  * @vma: the vma the pte to be checked belongs
  * @addr: the address corresponding to the pte to be checked
  * @ptent: the pte to be checked
@@ -5098,7 +5133,7 @@ union mc_target {
 };
 
 enum mc_target_type {
-	MC_TARGET_NONE,	/* not used */
+	MC_TARGET_NONE = 0,
 	MC_TARGET_PAGE,
 	MC_TARGET_SWAP,
 };
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 	return page;
 }
 
-static int is_target_pte_for_mc(struct vm_area_struct *vma,
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
 	struct page *page = NULL;
 	struct page_cgroup *pc;
-	int ret = 0;
+	enum mc_target_type ret = MC_TARGET_NONE;
 	swp_entry_t ent = { .val = 0 };
 
 	if (pte_present(ptent))
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
 		page = mc_handle_file_pte(vma, addr, ptent, &ent);
 
 	if (!page && !ent.val)
-		return 0;
+		return ret;
 	if (page) {
 		pc = lookup_page_cgroup(page);
 		/*
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
 	return ret;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+	struct page *page = NULL;
+	struct page_cgroup *pc;
+	enum mc_target_type ret = MC_TARGET_NONE;
+
+	page = pmd_page(pmd);
+	VM_BUG_ON(!page || !PageHead(page));
+	if (!move_anon())
+		return ret;
+	pc = lookup_page_cgroup(page);
+	if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+		ret = MC_TARGET_PAGE;
+		if (target) {
+			get_page(page);
+			target->page = page;
+		}
+	}
+	return ret;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+	return MC_TARGET_NONE;
+}
+#endif
+
 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 					unsigned long addr, unsigned long end,
 					struct mm_walk *walk)
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+			mc.precharge += HPAGE_PMD_NR;
+		spin_unlock(&vma->vm_mm->page_table_lock);
+		return 0;
+	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
-		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+		if (get_mctgt_type(vma, addr, *pte, NULL))
 			mc.precharge++;	/* increment precharge temporarily */
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	struct vm_area_struct *vma = walk->private;
 	pte_t *pte;
 	spinlock_t *ptl;
+	enum mc_target_type target_type;
+	union mc_target target;
+	struct page *page;
+	struct page_cgroup *pc;
+
+	/*
+	 * We don't take compound_lock() here but no race with splitting thp
+	 * happens because:
+	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
+	 *    under splitting, which means there's no concurrent thp split,
+	 *  - if another thread runs into split_huge_page() just after we
+	 *    entered this if-block, the thread must wait for page table lock
+	 *    to be unlocked in __split_huge_page_splitting(), where the main
+	 *    part of thp split is not executed yet.
+	 */
+	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+		if (!mc.precharge) {
+			spin_unlock(&vma->vm_mm->page_table_lock);
+			return 0;
+		}
+		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+		if (target_type == MC_TARGET_PAGE) {
+			page = target.page;
+			if (!isolate_lru_page(page)) {
+				pc = lookup_page_cgroup(page);
+				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+							pc, mc.from, mc.to,
+							false)) {
+					mc.precharge -= HPAGE_PMD_NR;
+					mc.moved_charge += HPAGE_PMD_NR;
+				}
+				putback_lru_page(page);
+			}
+			put_page(page);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+		return 0;
+	}
 
-	split_huge_page_pmd(walk->mm, pmd);
 retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
-		union mc_target target;
-		int type;
-		struct page *page;
-		struct page_cgroup *pc;
 		swp_entry_t ent;
 
 		if (!mc.precharge)
 			break;
 
-		type = is_target_pte_for_mc(vma, addr, ptent, &target);
-		switch (type) {
+		switch (get_mctgt_type(vma, addr, ptent, &target)) {
 		case MC_TARGET_PAGE:
 			page = target.page;
 			if (isolate_lru_page(page))
@@ -5417,7 +5524,7 @@ retry:
 				mc.moved_charge++;
 			}
 			putback_lru_page(page);
-put:			/* is_target_pte_for_mc() gets the page */
+put:			/* get_mctgt_type() gets the page */
 			put_page(page);
 			break;
 		case MC_TARGET_SWAP: