path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	226
-rw-r--r--	mm/vmscan.c	45
2 files changed, 256 insertions(+), 15 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90f0b13e1c3c..011aba6cad70 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -139,6 +139,8 @@ struct mem_cgroup_per_zone {
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded */
 	bool			on_tree;
+	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
+						/* use container_of	   */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -228,6 +230,13 @@ struct mem_cgroup {
 	struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -259,6 +268,8 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
+#define MEM_CGROUP_RECLAIM_SOFT	(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -299,7 +310,7 @@ soft_limit_tree_from_page(struct page *page)
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -311,7 +322,6 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 		return;
 
 	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	spin_lock(&mctz->lock);
 	while (*p) {
 		parent = *p;
 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -328,6 +338,26 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 	rb_link_node(&mz->tree_node, parent, p);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_insert_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -337,8 +367,7 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
+	__mem_cgroup_remove_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -408,6 +437,47 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
 	}
 }
 
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back;
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+		!css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -1037,6 +1107,7 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, to avoid freeing too much, this returns immediately.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+						struct zone *zone,
 						gfp_t gfp_mask,
 						unsigned long reclaim_options)
 {
@@ -1045,23 +1116,53 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	int loop = 0;
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	unsigned long excess = mem_cgroup_get_excess(root_mem);
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
 	if (root_mem->memsw_is_minimum)
 		noswap = true;
 
-	while (loop < 2) {
+	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem)
+		if (victim == root_mem) {
 			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy.
+				 */
+				if (!check_soft || !total) {
+					css_put(&victim->css);
+					break;
+				}
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not too excessive, so we do
+				 * not reclaim too much, nor so little that we
+				 * keep coming back to reclaim from this cgroup.
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+					css_put(&victim->css);
+					break;
+				}
+			}
+		}
 		if (!mem_cgroup_local_usage(&victim->stat)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						   get_swappiness(victim));
+		if (check_soft)
+			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+				noswap, get_swappiness(victim), zone,
+				zone->zone_pgdat->node_id);
+		else
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+						noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -1071,7 +1172,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		if (shrink)
 			return ret;
 		total += ret;
-		if (mem_cgroup_check_under_limit(root_mem))
+		if (check_soft) {
+			if (res_counter_check_under_soft_limit(&root_mem->res))
+				return total;
+		} else if (mem_cgroup_check_under_limit(root_mem))
 			return 1 + total;
 	}
 	return total;
@@ -1206,8 +1310,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-							flags);
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+						      gfp_mask, flags);
 		if (ret)
 			continue;
 
@@ -2018,8 +2122,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-						MEM_CGROUP_RECLAIM_SHRINK);
+		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+						GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -2071,7 +2176,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
 						MEM_CGROUP_RECLAIM_NOSWAP |
 						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -2084,6 +2189,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(nid, zid);
+	/*
+	 * This loop can run for a while, especially if mem_cgroups
+	 * continuously keep exceeding their soft limit and putting the
+	 * system under pressure.
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT);
+		nr_reclaimed += reclaimed;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup.
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have added the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree.
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz) {
+					css_put(&next_mz->mem->css);
+					next_mz = NULL;
+				} else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		mz->usage_in_excess =
+			res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because, due
+		 * to priority, we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		if (mz->usage_in_excess)
+			__mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->mem->css);
+	return nr_reclaimed;
+}
+
 /*
  * This routine traverses page_cgroup in given list and drops them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2686,6 +2882,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
 		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->mem = mem;
 	}
 	return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 613e89f471d9..2423782214ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+						gfp_t gfp_mask, bool noswap,
+						unsigned int swappiness,
+						struct zone *zone, int nid)
+{
+	struct scan_control sc = {
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = !noswap,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = swappiness,
+		.order = 0,
+		.mem_cgroup = mem,
+		.isolate_pages = mem_cgroup_isolate_pages,
+	};
+	nodemask_t nm = nodemask_of_node(nid);
+
+	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+	sc.nodemask = &nm;
+	sc.nr_reclaimed = 0;
+	sc.nr_scanned = 0;
+	/*
+	 * NOTE: Although we can get the priority field, using it
+	 * here is not a good idea, since it limits the pages we can scan.
+	 * If we don't reclaim here, the shrink_zone from balance_pgdat
+	 * will pick up pages from other mem cgroups as well. We hack
+	 * the priority and make it zero.
+	 */
+	shrink_zone(0, zone, &sc);
+	return sc.nr_reclaimed;
+}
+
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
 					   bool noswap,
 					   unsigned int swappiness)
 {
+	struct zonelist *zonelist;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
 		.nodemask = NULL, /* we don't care the placement */
 	};
-	struct zonelist *zonelist;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1974,6 +2007,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
+			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -1988,6 +2022,15 @@ loop_again:
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
 			note_zone_scanning_priority(zone, priority);
+
+			nid = pgdat->node_id;
+			zid = zone_idx(zone);
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 * For now we ignore the return value.
+			 */
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
+							nid, zid);
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
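
Taken together, the two files cooperate as follows: memcontrol.c keeps, per zone, an rb-tree of memory cgroups ordered by how far they exceed their soft limit, and kswapd's balance_pgdat() calls mem_cgroup_soft_limit_reclaim() on each zone, which repeatedly picks the worst offender and runs targeted reclaim against it through mem_cgroup_shrink_node_zone(). The short user-space C sketch below is not part of the patch; it only illustrates that victim-selection policy under simplified assumptions: a plain array scan stands in for the kernel's rb-tree, a fake reclaim_some() helper stands in for page reclaim, and every identifier in it is hypothetical. It keeps the patch's bail-out idea of stopping after MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS fruitless rounds.

/*
 * Stand-alone illustration of the soft-limit victim-selection policy.
 * NOT kernel code: a linear scan replaces the per-zone rb-tree and
 * reclaim_some() fakes page reclaim. All identifiers are hypothetical.
 */
#include <stdio.h>

#define MAX_SOFT_LIMIT_RECLAIM_LOOPS 2	/* mirrors MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS */

struct fake_memcg {
	const char *name;
	unsigned long usage;		/* pages charged */
	unsigned long soft_limit;	/* pages allowed before soft pressure */
};

/* Excess over the soft limit, 0 if under it (like res_counter_soft_limit_excess). */
static unsigned long excess(const struct fake_memcg *m)
{
	return m->usage > m->soft_limit ? m->usage - m->soft_limit : 0;
}

/* Pick the group with the largest excess; the kernel uses rb_last() on an rb-tree. */
static struct fake_memcg *largest_excess(struct fake_memcg *v, int n)
{
	struct fake_memcg *best = NULL;
	for (int i = 0; i < n; i++)
		if (excess(&v[i]) > (best ? excess(best) : 0))
			best = &v[i];
	return best;
}

/* Pretend to reclaim up to 'want' pages; return how many were freed. */
static unsigned long reclaim_some(struct fake_memcg *m, unsigned long want)
{
	unsigned long got = want < m->usage ? want : m->usage;
	m->usage -= got;
	return got;
}

int main(void)
{
	struct fake_memcg groups[] = {
		{ "A", 900, 512 },
		{ "B", 600, 512 },
		{ "C", 100, 512 },
	};
	unsigned long nr_reclaimed = 0;
	int loop = 0;

	do {
		struct fake_memcg *mz = largest_excess(groups, 3);
		if (!mz)
			break;		/* nothing exceeds its soft limit */
		nr_reclaimed += reclaim_some(mz, 64);
		printf("reclaimed from %s, usage now %lu\n", mz->name, mz->usage);
		loop++;
		/* Give up if nothing was freed and we have looped too often. */
		if (!nr_reclaimed && loop > MAX_SOFT_LIMIT_RECLAIM_LOOPS)
			break;
	} while (!nr_reclaimed);

	printf("total reclaimed: %lu pages\n", nr_reclaimed);
	return 0;
}

Because the linear scan recomputes each group's excess on every pass, the sketch does not need the remove/re-insert step the kernel performs on the rb-tree under mctz->lock.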