Diffstat (limited to 'mm/memcontrol.c')
 -rw-r--r--   mm/memcontrol.c   431
 1 files changed, 294 insertions, 137 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2efa8ea07ff7..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
-static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+
+/* for remember boot option*/
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata = 0;
+#endif
+
 #else
 #define do_swap_account (0)
 #endif
@@ -278,7 +285,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t lock; /* for from, to, moving_task */
+	spinlock_t lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -593,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 bool charge)
+					 bool file, int nr_pages)
 {
-	int val = (charge) ? 1 : -1;
-
 	preempt_disable();
 
-	if (PageCgroupCache(pc))
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+	if (file)
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
 	else
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
 
-	if (charge)
+	/* pagein of a big page is an event. So, ignore page size */
+	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
-	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+		nr_pages = -nr_pages; /* for event */
+	}
+
+	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
 	preempt_enable();
 }
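
Editor's note: the rewritten mem_cgroup_charge_statistics() folds the old charge/uncharge boolean into the sign of nr_pages: positive for a charge (a pagein event), negative for an uncharge (a pageout event), with the event counter always advancing by the number of base pages. A minimal userspace sketch of that sign convention, using plain counters as illustrative stand-ins for the kernel's per-cpu statistics (not the kernel code itself):

#include <stdio.h>

/* Illustrative counters standing in for the per-cpu memcg statistics. */
static long rss, pgpgin, pgpgout, events;

/* Mirrors the sign convention: nr_pages > 0 charges, < 0 uncharges. */
static void charge_statistics(int nr_pages)
{
	rss += nr_pages;
	if (nr_pages > 0)
		pgpgin++;
	else {
		pgpgout++;
		nr_pages = -nr_pages;	/* event counter uses the magnitude */
	}
	events += nr_pages;
}

int main(void)
{
	charge_statistics(512);		/* charge one 2MB huge page (4KB base pages) */
	charge_statistics(-512);	/* uncharge it again */
	printf("rss=%ld pgpgin=%ld pgpgout=%ld events=%ld\n",
	       rss, pgpgin, pgpgout, events);
	return 0;
}
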
@@ -808,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
-	return;
 }
 
 void mem_cgroup_del_lru(struct page *page)
@@ -830,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 		return;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	/* unused or root page is not rotated. */
-	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+	if (!PageCgroupUsed(pc))
+		return;
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
+	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -851,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	pc = lookup_page_cgroup(page);
 	VM_BUG_ON(PageCgroupAcctLRU(pc));
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
 	SetPageCgroupAcctLRU(pc);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
@@ -1024,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return NULL;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
 	if (!mz)
 		return NULL;
@@ -1079,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		case 0:
 			list_move(&page->lru, dst);
 			mem_cgroup_del_lru(page);
-			nr_taken++;
+			nr_taken += hpage_nr_pages(page);
 			break;
 		case -EBUSY:
 			/* we don't affect global LRU but rotate in our LRU */
@@ -1113,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
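
Editor's note: mem_cgroup_check_margin() asks the resource counters whether the intended charge still fits under both the memory and (when swap accounting is enabled) the mem+swap limits; res_counter_check_margin() is assumed to come from the accompanying res_counter change in this series. A hedged, userspace-only sketch of the comparison, with simplified stand-in structs rather than the kernel's struct res_counter:

#include <stdbool.h>

/* Simplified stand-in for a resource counter (usage/limit in bytes). */
struct counter {
	unsigned long long usage;
	unsigned long long limit;
};

/* True if charging "bytes" more would still stay within the limit. */
static bool counter_check_margin(const struct counter *c, unsigned long bytes)
{
	return c->usage <= c->limit && c->limit - c->usage >= bytes;
}

/* Mirrors mem_cgroup_check_margin(): both res and memsw must have room. */
static bool check_margin(const struct counter *res, const struct counter *memsw,
			 bool do_swap_account, unsigned long bytes)
{
	if (!counter_check_margin(res, bytes))
		return false;
	if (do_swap_account && !counter_check_margin(memsw, bytes))
		return false;
	return true;
}
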
@@ -1304,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 	u64 limit;
 	u64 memsw;
 
-	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
-			total_swap_pages;
+	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+	limit += total_swap_pages << PAGE_SHIFT;
+
 	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 	/*
 	 * If memsw is finite and limits the amount of swap space available
@@ -1592,11 +1610,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  * possibility of race condition. If there is, we take a lock.
  */
 
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+				 enum mem_cgroup_page_stat_item idx, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	bool need_unlock = false;
+	unsigned long uninitialized_var(flags);
 
 	if (unlikely(!pc))
 		return;
@@ -1606,39 +1626,36 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 	if (unlikely(!mem || !PageCgroupUsed(pc)))
 		goto out;
 	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(mem))) {
+	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
 		/* take a lock against to access pc->mem_cgroup */
-		lock_page_cgroup(pc);
+		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
 		mem = pc->mem_cgroup;
 		if (!mem || !PageCgroupUsed(pc))
 			goto out;
 	}
 
-	this_cpu_add(mem->stat->count[idx], val);
-
 	switch (idx) {
-	case MEM_CGROUP_STAT_FILE_MAPPED:
+	case MEMCG_NR_FILE_MAPPED:
 		if (val > 0)
 			SetPageCgroupFileMapped(pc);
 		else if (!page_mapped(page))
 			ClearPageCgroupFileMapped(pc);
+		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
 	default:
 		BUG();
 	}
 
+	this_cpu_add(mem->stat->count[idx], val);
+
 out:
 	if (unlikely(need_unlock))
-		unlock_page_cgroup(pc);
+		move_unlock_page_cgroup(pc, &flags);
 	rcu_read_unlock();
 	return;
 }
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
-	mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1834,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages. Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
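
Editor's note: the failure path above now treats the three possible charge sizes named in the new comment differently. A compact, hedged restatement of that policy as a standalone C decision function (the byte values and names are illustrative stand-ins, not the kernel's constants):

/* Illustrative summary of the retry policy after a failed charge. */
enum charge_result { RETRY, WOULDBLOCK, GIVE_UP };

static enum charge_result after_charge_failure(unsigned long csize,
					       unsigned long batch_size,
					       int gfp_can_wait,
					       int reclaim_made_progress,
					       int margin_now_available)
{
	if (csize == batch_size)	/* batching is optional: retry with one page */
		return RETRY;
	if (!gfp_can_wait)
		return WOULDBLOCK;
	/* reclaim has run at this point */
	if (margin_now_available)
		return RETRY;
	if (csize == 4096 && reclaim_made_progress)	/* single regular page only */
		return RETRY;
	return GIVE_UP;			/* falls through to the OOM handling */
}
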
@@ -1879,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
-		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+				   gfp_t gfp_mask,
+				   struct mem_cgroup **memcg, bool oom,
+				   int page_size)
 {
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem = NULL;
 	int ret;
-	int csize = CHARGE_SIZE;
+	int csize = max(CHARGE_SIZE, (unsigned long) page_size);
 
 	/*
 	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1909,7 +1940,7 @@ again:
 		VM_BUG_ON(css_is_removed(&mem->css));
 		if (mem_cgroup_is_root(mem))
 			goto done;
-		if (consume_stock(mem))
+		if (page_size == PAGE_SIZE && consume_stock(mem))
 			goto done;
 		css_get(&mem->css);
 	} else {
@@ -1917,23 +1948,22 @@ again:
 
 		rcu_read_lock();
 		p = rcu_dereference(mm->owner);
-		VM_BUG_ON(!p);
 		/*
-		 * because we don't have task_lock(), "p" can exit while
-		 * we're here. In that case, "mem" can point to root
-		 * cgroup but never be NULL. (and task_struct itself is freed
-		 * by RCU, cgroup itself is RCU safe.) Then, we have small
-		 * risk here to get wrong cgroup. But such kind of mis-account
-		 * by race always happens because we don't have cgroup_mutex().
-		 * It's overkill and we allow that small race, here.
+		 * Because we don't have task_lock(), "p" can exit.
+		 * In that case, "mem" can point to root or p can be NULL with
+		 * race with swapoff. Then, we have small risk of mis-accouning.
+		 * But such kind of mis-account by race always happens because
+		 * we don't have cgroup_mutex(). It's overkill and we allo that
+		 * small race, here.
+		 * (*) swapoff at el will charge against mm-struct not against
+		 * task-struct. So, mm->owner can be NULL.
 		 */
 		mem = mem_cgroup_from_task(p);
-		VM_BUG_ON(!mem);
-		if (mem_cgroup_is_root(mem)) {
+		if (!mem || mem_cgroup_is_root(mem)) {
 			rcu_read_unlock();
 			goto done;
 		}
-		if (consume_stock(mem)) {
+		if (page_size == PAGE_SIZE && consume_stock(mem)) {
 			/*
 			 * It seems dagerous to access memcg without css_get().
 			 * But considering how consume_stok works, it's not
@@ -1974,7 +2004,7 @@ again:
 		case CHARGE_OK:
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
-			csize = PAGE_SIZE;
+			csize = page_size;
 			css_put(&mem->css);
 			mem = NULL;
 			goto again;
@@ -1995,8 +2025,8 @@ again:
 		}
 	} while (ret != CHARGE_OK);
 
-	if (csize > PAGE_SIZE)
-		refill_stock(mem, csize - PAGE_SIZE);
+	if (csize > page_size)
+		refill_stock(mem, csize - page_size);
 	css_put(&mem->css);
 done:
 	*memcg = mem;
@@ -2024,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 	}
 }
 
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+				     int page_size)
 {
-	__mem_cgroup_cancel_charge(mem, 1);
+	__mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
 }
 
 /*
@@ -2076,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-
 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 				       struct page_cgroup *pc,
-				       enum charge_type ctype)
+				       enum charge_type ctype,
+				       int page_size)
 {
+	int nr_pages = page_size >> PAGE_SHIFT;
+
 	/* try_charge() can return NULL to *memcg, taking care of it. */
 	if (!mem)
 		return;
@@ -2092,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		mem_cgroup_cancel_charge(mem);
+		mem_cgroup_cancel_charge(mem, page_size);
 		return;
 	}
-
+	/*
+	 * we don't need page_cgroup_lock about tail pages, becase they are not
+	 * accessed by any other context at this point.
+	 */
 	pc->mem_cgroup = mem;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2119,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, true);
-
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
 	unlock_page_cgroup(pc);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
@@ -2130,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	memcg_check_events(mem, pc->page);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+	struct page_cgroup *head_pc = lookup_page_cgroup(head);
+	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+	unsigned long flags;
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * We have no races with charge/uncharge but will have races with
+	 * page state accounting.
+	 */
+	move_lock_page_cgroup(head_pc, &flags);
+
+	tail_pc->mem_cgroup = head_pc->mem_cgroup;
+	smp_wmb(); /* see __commit_charge() */
+	if (PageCgroupAcctLRU(head_pc)) {
+		enum lru_list lru;
+		struct mem_cgroup_per_zone *mz;
+
+		/*
+		 * LRU flags cannot be copied because we need to add tail
+		 *.page to LRU by generic call and our hook will be called.
+		 * We hold lru_lock, then, reduce counter directly.
+		 */
+		lru = page_lru(head);
+		mz = page_cgroup_zoneinfo(head_pc);
+		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	}
+	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+	move_unlock_page_cgroup(head_pc, &flags);
+}
+#endif
+
 /**
  * __mem_cgroup_move_account - move account of the page
  * @pc: page_cgroup of the page.
@@ -2148,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+	int charge_size)
 {
+	int nr_pages = charge_size >> PAGE_SHIFT;
+
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!page_is_cgroup_locked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(pc->mem_cgroup != from);
 
@@ -2163,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, pc, false);
+	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
-		mem_cgroup_cancel_charge(from);
+		mem_cgroup_cancel_charge(from, charge_size);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, pc, true);
+	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -2185,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+		struct mem_cgroup *from, struct mem_cgroup *to,
+		bool uncharge, int charge_size)
 {
 	int ret = -EINVAL;
+	unsigned long flags;
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
+	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+		return -EBUSY;
+
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
-		__mem_cgroup_move_account(pc, from, to, uncharge);
+		move_lock_page_cgroup(pc, &flags);
+		__mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
+		move_unlock_page_cgroup(pc, &flags);
 		ret = 0;
 	}
 	unlock_page_cgroup(pc);
@@ -2214,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
+	int page_size = PAGE_SIZE;
+	unsigned long flags;
 	int ret;
 
 	/* Is ROOT ? */
@@ -2226,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (isolate_lru_page(page))
 		goto put;
 
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
+
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				      &parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	ret = mem_cgroup_move_account(pc, child, parent, true);
+	if (page_size > PAGE_SIZE)
+		flags = compound_lock_irqsave(page);
+
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent);
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
+		compound_unlock_irqrestore(page, flags);
 put_back:
 	putback_lru_page(page);
 put:
@@ -2252,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
 
+	if (PageTransHuge(page)) {
+		page_size <<= compound_order(page);
+		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page. The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
+	}
+
 	pc = lookup_page_cgroup(page);
 	/* can happen at boot */
 	if (unlikely(!pc))
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
-	__mem_cgroup_commit_charge(mem, pc, ctype);
+	__mem_cgroup_commit_charge(mem, pc, ctype, page_size);
 	return 0;
 }
 
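
Editor's note: both the charge and uncharge paths now derive the charge size from the page itself: page_size starts at PAGE_SIZE and is shifted left by compound_order(page), which for a transparent huge page on x86-64 (order 9) yields HPAGE_SIZE. A quick, hedged arithmetic check in ordinary C, with the constants hard-coded for illustration only:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;	/* PAGE_SIZE on x86-64 */
	int order = 9;			/* THP compound order on x86-64 */

	page_size <<= order;
	printf("%lu bytes (%lu KiB), %d base pages\n",
	       page_size, page_size >> 10, 1 << order);
	/* prints: 2097152 bytes (2048 KiB), 512 base pages */
	return 0;
}
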
@@ -2274,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page,
 {
 	if (mem_cgroup_disabled())
 		return 0;
-	if (PageCompound(page))
-		return 0;
 	/*
 	 * If already mapped, we don't have to account.
 	 * If page cache, page->mapping has address_space.
@@ -2381,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
 		mm = &init_mm;
-	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+	return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
 }
 
 static void
@@ -2403,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 	cgroup_exclude_rmdir(&ptr->css);
 	pc = lookup_page_cgroup(page);
 	mem_cgroup_lru_del_before_commit_swapcache(page);
-	__mem_cgroup_commit_charge(ptr, pc, ctype);
+	__mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
 	mem_cgroup_lru_add_after_commit_swapcache(page);
 	/*
 	 * Now swap is on-memory. This means this page may be
@@ -2452,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	mem_cgroup_cancel_charge(mem);
+	mem_cgroup_cancel_charge(mem, PAGE_SIZE);
 }
 
 static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
+	      int page_size)
 {
 	struct memcg_batch_info *batch = NULL;
 	bool uncharge_memsw = true;
@@ -2483,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
 		goto direct_uncharge;
 
+	if (page_size != PAGE_SIZE)
+		goto direct_uncharge;
+
 	/*
 	 * In typical case, batch->memcg == mem. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
@@ -2496,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 		batch->memsw_bytes += PAGE_SIZE;
 	return;
 direct_uncharge:
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	res_counter_uncharge(&mem->res, page_size);
 	if (uncharge_memsw)
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		res_counter_uncharge(&mem->memsw, page_size);
 	if (unlikely(batch->memcg != mem))
 		memcg_oom_recover(mem);
 	return;
@@ -2510,8 +2625,10 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
+	int count;
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2519,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	if (PageSwapCache(page))
 		return NULL;
 
+	if (PageTransHuge(page)) {
+		page_size <<= compound_order(page);
+		VM_BUG_ON(!PageTransHuge(page));
+	}
+
+	count = page_size >> PAGE_SHIFT;
 	/*
 	 * Check if our page_cgroup is valid
 	 */
@@ -2551,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, false);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -2572,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		mem_cgroup_get(mem);
 	}
 	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
+		__do_uncharge(mem, ctype, page_size);
 
 	return mem;
 
@@ -2767,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page,
 	enum charge_type ctype;
 	int ret = 0;
 
+	VM_BUG_ON(PageTransHuge(page));
 	if (mem_cgroup_disabled())
 		return 0;
 
@@ -2816,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page,
 		return 0;
 
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
 	css_put(&mem->css);/* drop extra refcnt */
 	if (ret || *ptr == NULL) {
 		if (PageAnon(page)) {
@@ -2843,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page,
 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-	__mem_cgroup_commit_charge(mem, pc, ctype);
+	__mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
 	return ret;
 }
 
 /* remove redundant charge if migration failed*/
 void mem_cgroup_end_migration(struct mem_cgroup *mem,
-	struct page *oldpage, struct page *newpage)
+	struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 	struct page *used, *unused;
 	struct page_cgroup *pc;
@@ -2858,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 		return;
 	/* blocks rmdir() */
 	cgroup_exclude_rmdir(&mem->css);
-	/* at migration success, oldpage->mapping is NULL. */
-	if (oldpage->mapping) {
+	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
 	} else {
@@ -4169,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 	 */
 	if (!node_state(node, N_NORMAL_MEMORY))
 		tmp = -1;
-	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
 	if (!pn)
 		return 1;
 
 	mem->info.nodeinfo[node] = pn;
-	memset(pn, 0, sizeof(*pn));
-
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
@@ -4199,14 +4320,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 
 	/* Can be very big if MAX_NUMNODES is very big */
 	if (size < PAGE_SIZE)
-		mem = kmalloc(size, GFP_KERNEL);
+		mem = kzalloc(size, GFP_KERNEL);
 	else
-		mem = vmalloc(size);
+		mem = vzalloc(size);
 
 	if (!mem)
 		return NULL;
 
-	memset(mem, 0, size);
 	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!mem->stat)
 		goto out_free;
@@ -4454,7 +4574,8 @@ one_by_one:
 			batch_count = PRECHARGE_COUNT_AT_ONCE;
 			cond_resched();
 		}
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+					      PAGE_SIZE);
 		if (ret || !mem)
 			/* mem_cgroup_clear_mc() will do uncharge later */
 			return -ENOMEM;
@@ -4616,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
 		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4653,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+	unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+	VM_BUG_ON(mc.moving_task);
+	mc.moving_task = current;
+	return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
@@ -4691,18 +4818,28 @@ static void mem_cgroup_clear_mc(void)
 						PAGE_SIZE * mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
-
 		mc.moved_swap = 0;
 	}
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
+	wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+	struct mem_cgroup *from = mc.from;
+
+	/*
+	 * we must clear moving_task before waking up waiters at the end of
+	 * task migration.
+	 */
+	mc.moving_task = NULL;
+	__mem_cgroup_clear_mc();
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
 	mem_cgroup_end_move(from);
-	memcg_oom_recover(from);
-	memcg_oom_recover(to);
-	wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4729,16 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
-			VM_BUG_ON(mc.moving_task);
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
-			mc.precharge = 0;
-			mc.moved_charge = 0;
-			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			/* We set mc.moving_task later */
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
@@ -4767,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 retry:
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
@@ -4787,7 +4921,7 @@ retry:
 				goto put;
 			pc = lookup_page_cgroup(page);
 			if (!mem_cgroup_move_account(pc,
-					mc.from, mc.to, false)) {
+					mc.from, mc.to, false, PAGE_SIZE)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
@@ -4832,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+retry:
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+		/*
+		 * Someone who are holding the mmap_sem might be waiting in
+		 * waitq. So we cancel all extra charges, wake up all waiters,
+		 * and retry. Because we cancel precharges, we might not be able
+		 * to move enough charges, but moving charge is a best-effort
+		 * feature anyway, so it wouldn't be a big problem.
+		 */
+		__mem_cgroup_clear_mc();
+		cond_resched();
+		goto retry;
+	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4911,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = {
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static int __init enable_swap_account(char *s)
+{
+	/* consider enabled if no parameter or 1 is given */
+	if (!(*s) || !strcmp(s, "=1"))
+		really_do_swap_account = 1;
+	else if (!strcmp(s, "=0"))
+		really_do_swap_account = 0;
+	return 1;
+}
+__setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	really_do_swap_account = 0;
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
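
Editor's note: taken together with the first hunk, the swap-accounting default now comes from CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED and can be overridden on the kernel command line. Based on the parsing in enable_swap_account() above, "swapaccount=1" (or a bare "swapaccount") enables it, "swapaccount=0" disables it, and the old "noswapaccount" parameter is kept as a deprecated alias for "swapaccount=0". An illustrative boot line (the root= value is only a placeholder, not from this patch) would end in something like:

	... root=/dev/sda1 swapaccount=0
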