path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  188
1 file changed, 164 insertions(+), 24 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3c15bb07cce..2156ef775d04 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 	}
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-	return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *memcg;
+		bool flush = false;
 
+		rcu_read_lock();
 		memcg = stock->cached;
-		if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
-			continue;
-		if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
-			css_put(&memcg->css);
-			continue;
-		}
-		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+		if (memcg && stock->nr_pages &&
+		    mem_cgroup_is_descendant(memcg, root_memcg))
+			flush = true;
+		rcu_read_unlock();
+
+		if (flush &&
+		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
 			if (cpu == curcpu)
 				drain_local_stock(&stock->work);
 			else
 				schedule_work_on(cpu, &stock->work);
 		}
-		css_put(&memcg->css);
 	}
 	put_cpu();
 	mutex_unlock(&percpu_charge_mutex);
@@ -2359,11 +2354,67 @@ static void high_work_func(struct work_struct *work)
 }
 
 /*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation
+ * to maintain precision and scale to a reasonable number of jiffies (see the
+ * table below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ *   overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the proposed
+ *   penalty in order to reduce it to a reasonable number of jiffies, and to
+ *   produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ *  +-------+------------------------+
+ *  | usage | time to allocate in ms |
+ *  +-------+------------------------+
+ *  | 100M  |                      0 |
+ *  | 101M  |                      6 |
+ *  | 102M  |                     25 |
+ *  | 103M  |                     57 |
+ *  | 104M  |                    102 |
+ *  | 105M  |                    159 |
+ *  | 106M  |                    230 |
+ *  | 107M  |                    313 |
+ *  | 108M  |                    409 |
+ *  | 109M  |                    518 |
+ *  | 110M  |                    639 |
+ *  | 111M  |                    774 |
+ *  | 112M  |                    921 |
+ *  | 113M  |                   1081 |
+ *  | 114M  |                   1254 |
+ *  | 115M  |                   1439 |
+ *  | 116M  |                   1638 |
+ *  | 117M  |                   1849 |
+ *  | 118M  |                   2000 |
+ *  | 119M  |                   2000 |
+ *  | 120M  |                   2000 |
+ *  +-------+------------------------+
+ */
+#define MEMCG_DELAY_PRECISION_SHIFT 20
+#define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
 void mem_cgroup_handle_over_high(void)
 {
+	unsigned long usage, high, clamped_high;
+	unsigned long pflags;
+	unsigned long penalty_jiffies, overage;
 	unsigned int nr_pages = current->memcg_nr_pages_over_high;
 	struct mem_cgroup *memcg;
 
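A quick check of the table above (illustrative only, not part of the patch, and assuming 4 KiB pages): at 104M of usage against a 100M high, the overage is (26624 - 25600) / 25600 = 0.04 of the limit; the precision shift turns that into 0.04 * 2^20 = 41943, and squaring then scaling back gives 41943^2 * HZ >> (20 + 14), roughly 0.102 * HZ jiffies, i.e. about 102 ms for a nominal MEMCG_CHARGE_BATCH-sized charge, matching the 104M row.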
@@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void)
 
 	memcg = get_mem_cgroup_from_mm(current->mm);
 	reclaim_high(memcg, nr_pages, GFP_KERNEL);
-	css_put(&memcg->css);
 	current->memcg_nr_pages_over_high = 0;
+
+	/*
+	 * memory.high is breached and reclaim is unable to keep up. Throttle
+	 * allocators proactively to slow down excessive growth.
+	 *
+	 * We use overage compared to memory.high to calculate the number of
+	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
+	 * fairly lenient on small overages, and increasingly harsh when the
+	 * memcg in question makes it clear that it has no intention of stopping
+	 * its crazy behaviour, so we exponentially increase the delay based on
+	 * overage amount.
+	 */
+
+	usage = page_counter_read(&memcg->memory);
+	high = READ_ONCE(memcg->high);
+
+	if (usage <= high)
+		goto out;
+
+	/*
+	 * Prevent division by 0 in overage calculation by acting as if it was a
+	 * threshold of 1 page
+	 */
+	clamped_high = max(high, 1UL);
+
+	overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+			  clamped_high);
+
+	penalty_jiffies = ((u64)overage * overage * HZ)
+		>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+	/*
+	 * Factor in the task's own contribution to the overage, such that four
+	 * N-sized allocations are throttled approximately the same as one
+	 * 4N-sized allocation.
+	 *
+	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+	 * larger the current charge batch is than that.
+	 */
+	penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+	/*
+	 * Clamp the max delay per usermode return so as to still keep the
+	 * application moving forwards and also permit diagnostics, albeit
+	 * extremely slowly.
+	 */
+	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+	/*
+	 * Don't sleep if the amount of jiffies this memcg owes us is so low
+	 * that it's not even worth doing, in an attempt to be nice to those who
+	 * go only a small amount over their memory.high value and maybe haven't
+	 * been aggressively reclaimed enough yet.
+	 */
+	if (penalty_jiffies <= HZ / 100)
+		goto out;
+
+	/*
+	 * If we exit early, we're guaranteed to die (since
+	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+	 * need to account for any ill-begotten jiffies to pay them off later.
+	 */
+	psi_memstall_enter(&pflags);
+	schedule_timeout_killable(penalty_jiffies);
+	psi_memstall_leave(&pflags);
+
+out:
+	css_put(&memcg->css);
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
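The penalty calculation above is easy to reproduce outside the kernel. The following is a minimal user-space sketch (illustrative only, not part of the patch) that regenerates the delay table from the comment; it assumes 4 KiB pages, HZ == 1000 (one jiffy per millisecond), MEMCG_CHARGE_BATCH == 32 pages, and substitutes plain C for the kernel's div_u64(), min() and max() helpers.

/*
 * User-space sketch of the memory.high over-limit penalty curve.
 * Assumptions: 4 KiB pages, HZ == 1000, MEMCG_CHARGE_BATCH == 32.
 */
#include <stdio.h>
#include <stdint.h>

#define HZ				1000UL
#define MEMCG_CHARGE_BATCH		32UL
#define MEMCG_MAX_HIGH_DELAY_JIFFIES	(2UL * HZ)
#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14

static unsigned long penalty_jiffies(unsigned long usage, unsigned long high,
				     unsigned long nr_pages)
{
	unsigned long clamped_high = high ? high : 1;	/* avoid div by 0 */
	uint64_t overage, penalty;

	if (usage <= high)
		return 0;

	/* overage ratio (usage - high) / high in 2^20 fixed point */
	overage = ((uint64_t)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) /
		  clamped_high;

	/* square the ratio, then scale back down to a jiffies-sized delay */
	penalty = (overage * overage * HZ) >>
		  (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);

	/* weight by how large this charge batch is relative to the nominal one */
	penalty = penalty * nr_pages / MEMCG_CHARGE_BATCH;

	/* clamp to the 2 second maximum delay per return to userland */
	return penalty < MEMCG_MAX_HIGH_DELAY_JIFFIES ?
	       penalty : MEMCG_MAX_HIGH_DELAY_JIFFIES;
}

int main(void)
{
	unsigned long high = 100UL << 8;	/* 100M expressed in 4 KiB pages */
	unsigned long mb;

	/* walk the "usage -> time to allocate" range used in the comment */
	for (mb = 100; mb <= 120; mb++)
		printf("%luM -> %lu ms\n", mb,
		       penalty_jiffies(mb << 8, high, MEMCG_CHARGE_BATCH) *
		       1000 / HZ);
	return 0;
}

Compiled with any C compiler, this should print the same 0, 6, 25, 57, 102, ..., 2000 ms progression shown in the delay table.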
@@ -3512,6 +3630,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 		ret = mem_cgroup_resize_max(memcg, nr_pages, true);
 		break;
 	case _KMEM:
+		pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+			     "Please report your usecase to linux-mm@kvack.org if you "
+			     "depend on this functionality.\n");
 		ret = memcg_update_kmem_max(memcg, nr_pages);
 		break;
 	case _TCP:
@@ -4805,11 +4926,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 	}
 }
 
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
-	mem_cgroup_id_get_many(memcg, 1);
-}
-
 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
 	mem_cgroup_id_put_many(memcg, 1);
@@ -4955,6 +5071,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		memcg->cgwb_frn[i].done =
 			__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+	INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+	memcg->deferred_split_queue.split_queue_len = 0;
+#endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	return memcg;
 fail:
@@ -5333,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page,
 		__mod_memcg_state(to, NR_WRITEBACK, nr_pages);
 	}
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (compound && !list_empty(page_deferred_list(page))) {
+		spin_lock(&from->deferred_split_queue.split_queue_lock);
+		list_del_init(page_deferred_list(page));
+		from->deferred_split_queue.split_queue_len--;
+		spin_unlock(&from->deferred_split_queue.split_queue_lock);
+	}
+#endif
 	/*
 	 * It is safe to change page->mem_cgroup here because the page
 	 * is referenced, charged, and isolated - we can't race with
@@ -5341,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page,
 
 	/* caller should have done css_get */
 	page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	if (compound && list_empty(page_deferred_list(page))) {
+		spin_lock(&to->deferred_split_queue.split_queue_lock);
+		list_add_tail(page_deferred_list(page),
+			      &to->deferred_split_queue.split_queue);
+		to->deferred_split_queue.split_queue_len++;
+		spin_unlock(&to->deferred_split_queue.split_queue_lock);
+	}
+#endif
+
 	spin_unlock_irqrestore(&from->move_lock, flags);
 
 	ret = 0;
@@ -6511,7 +6651,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 		unsigned int nr_pages = 1;
 
 		if (PageTransHuge(page)) {
-			nr_pages <<= compound_order(page);
+			nr_pages = compound_nr(page);
 			ug->nr_huge += nr_pages;
 		}
 		if (PageAnon(page))
@@ -6523,7 +6663,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 		}
 		ug->pgpgout++;
 	} else {
-		ug->nr_kmem += 1 << compound_order(page);
+		ug->nr_kmem += compound_nr(page);
 		__ClearPageKmemcg(page);
 	}
 