author     Jiri Kosina <jkosina@suse.cz>  2011-07-11 08:15:48 -0400
committer  Jiri Kosina <jkosina@suse.cz>  2011-07-11 08:15:55 -0400
commit     b7e9c223be8ce335e30f2cf6ba588e6a4092275c (patch)
tree       2d1e3b75606abc18df7ad65e51ac3f90cd68b38d /mm/memcontrol.c
parent     c172d82500a6cf3c32d1e650722a1055d72ce858 (diff)
parent     e3bbfa78bab125f58b831b5f7f45b5a305091d72 (diff)
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply pending patches that are based on newer code already present upstream.
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 222
1 file changed, 158 insertions(+), 64 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
 	long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t	scan_nodes;
-	unsigned long	next_scan_node_update;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 					 bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = val + SOFTLIMIT_EVENTS_TARGET;
 		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
 	default:
 		return;
 	}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 		mem_cgroup_threshold(mem);
 		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
 		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
 			mem_cgroup_update_tree(mem, page);
 			__mem_cgroup_target_update(mem,
 				MEM_CGROUP_TARGET_SOFTLIMIT);
 		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
+		}
+#endif
 	}
 }
 
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 							int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
 	u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants flle only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+
+}
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
 	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
 		return;
 
-	mem->next_scan_node_update = jiffies + 10*HZ;
 	/* make a nodemask where this memcg uses memory from */
 	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
 	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
 	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory and
+			 * charges will not give any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 				}
 			}
 		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
 	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
 	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
 	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
 	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging. It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * getcpu()/putcpu().
+	 */
+	curcpu = raw_smp_processor_id();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
 	}
 	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 	/* We don't wait for flush_work */
 }
 
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
 	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
 	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "numa_stat",
 		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
 	},
 #endif
 };
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else /* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,