Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 431 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 294 insertions(+), 137 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2efa8ea07ff7..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* for remember boot option*/
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
 #else
 #define do_swap_account		(0)
 #endif
@@ -278,7 +285,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t	lock; /* for from, to, moving_task */
+	spinlock_t	lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -593,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 bool charge)
+					 bool file, int nr_pages)
 {
-	int val = (charge) ? 1 : -1;
-
 	preempt_disable();
 
-	if (PageCgroupCache(pc))
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+	if (file)
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
 	else
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
 
-	if (charge)
+	/* pagein of a big page is an event. So, ignore page size */
+	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
-	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+		nr_pages = -nr_pages; /* for event */
+	}
+
+	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
 	preempt_enable();
 }
@@ -808,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
-	return;
 }
 
 void mem_cgroup_del_lru(struct page *page)
@@ -830,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 		return;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	/* unused or root page is not rotated. */
-	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+	if (!PageCgroupUsed(pc))
+		return;
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
+	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -851,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	pc = lookup_page_cgroup(page);
 	VM_BUG_ON(PageCgroupAcctLRU(pc));
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
 	SetPageCgroupAcctLRU(pc);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
@@ -1024,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return NULL;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
 	if (!mz)
 		return NULL;
@@ -1079,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		case 0:
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
-			nr_taken++;
+			nr_taken += hpage_nr_pages(page);
 			break;
 		case -EBUSY:
 			/* we don't affect global LRU but rotate in our LRU */
@@ -1113,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1304,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 	u64 limit;
 	u64 memsw;
 
-	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
-			total_swap_pages;
+	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+	limit += total_swap_pages << PAGE_SHIFT;
+
 	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 	/*
 	 * If memsw is finite and limits the amount of swap space available
@@ -1592,11 +1610,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  * possibility of race condition. If there is, we take a lock.
  */
 
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+				 enum mem_cgroup_page_stat_item idx, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	bool need_unlock = false;
+	unsigned long uninitialized_var(flags);
 
 	if (unlikely(!pc))
 		return;
@@ -1606,39 +1626,36 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 	if (unlikely(!mem || !PageCgroupUsed(pc)))
 		goto out;
 	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(mem))) {
+	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
 		/* take a lock against to access pc->mem_cgroup */
-		lock_page_cgroup(pc);
+		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
 		mem = pc->mem_cgroup;
 		if (!mem || !PageCgroupUsed(pc))
 			goto out;
 	}
 
-	this_cpu_add(mem->stat->count[idx], val);
-
 	switch (idx) {
-	case MEM_CGROUP_STAT_FILE_MAPPED:
+	case MEMCG_NR_FILE_MAPPED:
 		if (val > 0)
 			SetPageCgroupFileMapped(pc);
 		else if (!page_mapped(page))
 			ClearPageCgroupFileMapped(pc);
+		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
 	default:
 		BUG();
 	}
 
+	this_cpu_add(mem->stat->count[idx], val);
+
 out:
 	if (unlikely(need_unlock))
-		unlock_page_cgroup(pc);
+		move_unlock_page_cgroup(pc, &flags);
 	rcu_read_unlock();
 	return;
 }
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
-	mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1834,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages. Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -1879,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+				   gfp_t gfp_mask,
+				   struct mem_cgroup **memcg, bool oom,
+				   int page_size)
 {
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem = NULL;
 	int ret;
-	int csize = CHARGE_SIZE;
+	int csize = max(CHARGE_SIZE, (unsigned long) page_size);
 
 	/*
 	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1909,7 +1940,7 @@ again:
 		VM_BUG_ON(css_is_removed(&mem->css));
 		if (mem_cgroup_is_root(mem))
 			goto done;
-		if (consume_stock(mem))
+		if (page_size == PAGE_SIZE && consume_stock(mem))
 			goto done;
 		css_get(&mem->css);
 	} else {
@@ -1917,23 +1948,22 @@ again:
 
 		rcu_read_lock();
 		p = rcu_dereference(mm->owner);
-		VM_BUG_ON(!p);
 		/*
-		 * because we don't have task_lock(), "p" can exit while
-		 * we're here. In that case, "mem" can point to root
-		 * cgroup but never be NULL. (and task_struct itself is freed
-		 * by RCU, cgroup itself is RCU safe.) Then, we have small
-		 * risk here to get wrong cgroup. But such kind of mis-account
-		 * by race always happens because we don't have cgroup_mutex().
-		 * It's overkill and we allow that small race, here.
+		 * Because we don't have task_lock(), "p" can exit.
+		 * In that case, "mem" can point to root or p can be NULL with
+		 * race with swapoff. Then, we have small risk of mis-accouning.
+		 * But such kind of mis-account by race always happens because
+		 * we don't have cgroup_mutex(). It's overkill and we allo that
+		 * small race, here.
+		 * (*) swapoff at el will charge against mm-struct not against
+		 * task-struct. So, mm->owner can be NULL.
 		 */
 		mem = mem_cgroup_from_task(p);
-		VM_BUG_ON(!mem);
-		if (mem_cgroup_is_root(mem)) {
+		if (!mem || mem_cgroup_is_root(mem)) {
 			rcu_read_unlock();
 			goto done;
 		}
-		if (consume_stock(mem)) {
+		if (page_size == PAGE_SIZE && consume_stock(mem)) {
 			/*
 			 * It seems dagerous to access memcg without css_get().
 			 * But considering how consume_stok works, it's not
@@ -1974,7 +2004,7 @@ again:
 		case CHARGE_OK:
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
-			csize = PAGE_SIZE;
+			csize = page_size;
 			css_put(&mem->css);
 			mem = NULL;
 			goto again;
@@ -1995,8 +2025,8 @@ again:
 		}
 	} while (ret != CHARGE_OK);
 
-	if (csize > PAGE_SIZE)
-		refill_stock(mem, csize - PAGE_SIZE);
+	if (csize > page_size)
+		refill_stock(mem, csize - page_size);
 	css_put(&mem->css);
 done:
 	*memcg = mem;
@@ -2024,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 	}
 }
 
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+				     int page_size)
 {
-	__mem_cgroup_cancel_charge(mem, 1);
+	__mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
 }
 
 /*
@@ -2076,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-
 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 				       struct page_cgroup *pc,
-				       enum charge_type ctype)
+				       enum charge_type ctype,
+				       int page_size)
 {
+	int nr_pages = page_size >> PAGE_SHIFT;
+
 	/* try_charge() can return NULL to *memcg, taking care of it. */
 	if (!mem)
 		return;
@@ -2092,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		mem_cgroup_cancel_charge(mem);
+		mem_cgroup_cancel_charge(mem, page_size);
 		return;
 	}
-
+	/*
+	 * we don't need page_cgroup_lock about tail pages, becase they are not
+	 * accessed by any other context at this point.
+	 */
 	pc->mem_cgroup = mem;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2119,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, true);
-
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
 	unlock_page_cgroup(pc);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
@@ -2130,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	memcg_check_events(mem, pc->page);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+	struct page_cgroup *head_pc = lookup_page_cgroup(head);
+	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+	unsigned long flags;
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * We have no races with charge/uncharge but will have races with
+	 * page state accounting.
+	 */
+	move_lock_page_cgroup(head_pc, &flags);
+
+	tail_pc->mem_cgroup = head_pc->mem_cgroup;
+	smp_wmb(); /* see __commit_charge() */
+	if (PageCgroupAcctLRU(head_pc)) {
+		enum lru_list lru;
+		struct mem_cgroup_per_zone *mz;
+
+		/*
+		 * LRU flags cannot be copied because we need to add tail
+		 *.page to LRU by generic call and our hook will be called.
+		 * We hold lru_lock, then, reduce counter directly.
+		 */
+		lru = page_lru(head);
+		mz = page_cgroup_zoneinfo(head_pc);
+		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	}
+	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+	move_unlock_page_cgroup(head_pc, &flags);
+}
+#endif
+
 /**
  * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
@@ -2148,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+	int charge_size)
 {
+	int nr_pages = charge_size >> PAGE_SHIFT;
+
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!page_is_cgroup_locked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);
 
@@ -2163,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, pc, false);
+	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
-		mem_cgroup_cancel_charge(from);
+		mem_cgroup_cancel_charge(from, charge_size);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, pc, true);
+	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -2185,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+		struct mem_cgroup *from, struct mem_cgroup *to,
+		bool uncharge, int charge_size)
 {
 	int ret = -EINVAL;
+	unsigned long flags;
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
+	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+		return -EBUSY;
+
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
-		__mem_cgroup_move_account(pc, from, to, uncharge);
+		move_lock_page_cgroup(pc, &flags);
+		__mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
+		move_unlock_page_cgroup(pc, &flags);
 		ret = 0;
 	}
 	unlock_page_cgroup(pc);
@@ -2214,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
+	int page_size = PAGE_SIZE;
+	unsigned long flags;
 	int ret;
 
 	/* Is ROOT ? */
@@ -2226,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (isolate_lru_page(page))
 		goto put;
 
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
+
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				&parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	ret = mem_cgroup_move_account(pc, child, parent, true);
+	if (page_size > PAGE_SIZE)
+		flags = compound_lock_irqsave(page);
+
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent);
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
+		compound_unlock_irqrestore(page, flags);
 put_back:
 	putback_lru_page(page);
 put:
@@ -2252,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
 
+	if (PageTransHuge(page)) {
+		page_size <<= compound_order(page);
+		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page. The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
+	}
+
 	pc = lookup_page_cgroup(page);
 	/* can happen at boot */
 	if (unlikely(!pc))
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
-	__mem_cgroup_commit_charge(mem, pc, ctype);
+	__mem_cgroup_commit_charge(mem, pc, ctype, page_size);
 	return 0;
 }
 
@@ -2274,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page,
 {
 	if (mem_cgroup_disabled())
 		return 0;
-	if (PageCompound(page))
-		return 0;
 	/*
 	 * If already mapped, we don't have to account.
 	 * If page cache, page->mapping has address_space.
@@ -2381,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
 		mm = &init_mm;
-	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+	return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
 }
 
 static void
@@ -2403,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 	cgroup_exclude_rmdir(&ptr->css);
 	pc = lookup_page_cgroup(page);
 	mem_cgroup_lru_del_before_commit_swapcache(page);
-	__mem_cgroup_commit_charge(ptr, pc, ctype);
+	__mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
 	mem_cgroup_lru_add_after_commit_swapcache(page);
 	/*
 	 * Now swap is on-memory. This means this page may be
@@ -2452,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	mem_cgroup_cancel_charge(mem);
+	mem_cgroup_cancel_charge(mem, PAGE_SIZE);
 }
 
 static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+	      int page_size)
 {
 	struct memcg_batch_info *batch = NULL;
 	bool uncharge_memsw = true;
@@ -2483,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
 		goto direct_uncharge;
 
+	if (page_size != PAGE_SIZE)
+		goto direct_uncharge;
+
 	/*
 	 * In typical case, batch->memcg == mem. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
@@ -2496,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 		batch->memsw_bytes += PAGE_SIZE;
 	return;
 direct_uncharge:
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	res_counter_uncharge(&mem->res, page_size);
 	if (uncharge_memsw)
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		res_counter_uncharge(&mem->memsw, page_size);
 	if (unlikely(batch->memcg != mem))
 		memcg_oom_recover(mem);
 	return;
@@ -2510,8 +2625,10 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
+	int count;
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2519,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (PageSwapCache(page))
 		return NULL;
 
+	if (PageTransHuge(page)) {
+		page_size <<= compound_order(page);
+		VM_BUG_ON(!PageTransHuge(page));
+	}
+
+	count = page_size >> PAGE_SHIFT;
 	/*
 	 * Check if our page_cgroup is valid
 	 */
@@ -2551,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, false);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -2572,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		mem_cgroup_get(mem);
 	}
 	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
+		__do_uncharge(mem, ctype, page_size);
 
 	return mem;
 
@@ -2767,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page,
 	enum charge_type ctype;
 	int ret = 0;
 
+	VM_BUG_ON(PageTransHuge(page));
 	if (mem_cgroup_disabled())
 		return 0;
 
@@ -2816,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page,
 		return 0;
 
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
 	css_put(&mem->css);/* drop extra refcnt */
 	if (ret || *ptr == NULL) {
 		if (PageAnon(page)) {
@@ -2843,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page,
 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-	__mem_cgroup_commit_charge(mem, pc, ctype);
+	__mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
 	return ret;
 }
 
 /* remove redundant charge if migration failed*/
 void mem_cgroup_end_migration(struct mem_cgroup *mem,
-	struct page *oldpage, struct page *newpage)
+	struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 	struct page *used, *unused;
 	struct page_cgroup *pc;
@@ -2858,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 		return;
 	/* blocks rmdir() */
 	cgroup_exclude_rmdir(&mem->css);
-	/* at migration success, oldpage->mapping is NULL. */
-	if (oldpage->mapping) {
+	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
 	} else {
@@ -4169,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 	 */
 	if (!node_state(node, N_NORMAL_MEMORY))
 		tmp = -1;
-	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
 	if (!pn)
 		return 1;
 
 	mem->info.nodeinfo[node] = pn;
-	memset(pn, 0, sizeof(*pn));
-
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
@@ -4199,14 +4320,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 
 	/* Can be very big if MAX_NUMNODES is very big */
 	if (size < PAGE_SIZE)
-		mem = kmalloc(size, GFP_KERNEL);
+		mem = kzalloc(size, GFP_KERNEL);
 	else
-		mem = vmalloc(size);
+		mem = vzalloc(size);
 
 	if (!mem)
 		return NULL;
 
-	memset(mem, 0, size);
 	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!mem->stat)
 		goto out_free;
@@ -4454,7 +4574,8 @@ one_by_one:
 			batch_count = PRECHARGE_COUNT_AT_ONCE;
 			cond_resched();
 		}
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+					      PAGE_SIZE);
 		if (ret || !mem)
 			/* mem_cgroup_clear_mc() will do uncharge later */
 			return -ENOMEM;
@@ -4616,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
 		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4653,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+	unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+	VM_BUG_ON(mc.moving_task);
+	mc.moving_task = current;
+	return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
@@ -4691,18 +4818,28 @@ static void mem_cgroup_clear_mc(void)
 						PAGE_SIZE * mc.moved_swap);
 	}
 	/* we've already done mem_cgroup_get(mc.to) */
-
 		mc.moved_swap = 0;
 	}
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
+	wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+	struct mem_cgroup *from = mc.from;
+
+	/*
+	 * we must clear moving_task before waking up waiters at the end of
+	 * task migration.
+	 */
+	mc.moving_task = NULL;
+	__mem_cgroup_clear_mc();
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
 	mem_cgroup_end_move(from);
-	memcg_oom_recover(from);
-	memcg_oom_recover(to);
-	wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4729,16 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
-			VM_BUG_ON(mc.moving_task);
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
-			mc.precharge = 0;
-			mc.moved_charge = 0;
-			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			/* We set mc.moving_task later */
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
@@ -4767,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 retry:
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
@@ -4787,7 +4921,7 @@ retry:
 				goto put;
 			pc = lookup_page_cgroup(page);
 			if (!mem_cgroup_move_account(pc,
-						mc.from, mc.to, false)) {
+					mc.from, mc.to, false, PAGE_SIZE)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
@@ -4832,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+retry:
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+		/*
+		 * Someone who are holding the mmap_sem might be waiting in
+		 * waitq. So we cancel all extra charges, wake up all waiters,
+		 * and retry. Because we cancel precharges, we might not be able
+		 * to move enough charges, but moving charge is a best-effort
+		 * feature anyway, so it wouldn't be a big problem.
+		 */
+		__mem_cgroup_clear_mc();
+		cond_resched();
+		goto retry;
+	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4911,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = {
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static int __init enable_swap_account(char *s)
+{
+	/* consider enabled if no parameter or 1 is given */
+	if (!(*s) || !strcmp(s, "=1"))
+		really_do_swap_account = 1;
+	else if (!strcmp(s, "=0"))
+		really_do_swap_account = 0;
+	return 1;
+}
+__setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	really_do_swap_account = 0;
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);