Diffstat (limited to 'mm/memcontrol.c')

 -rw-r--r--   mm/memcontrol.c | 188
 1 file changed, 164 insertions(+), 24 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3c15bb07cce..2156ef775d04 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
         }
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-        return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
         for_each_online_cpu(cpu) {
                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                 struct mem_cgroup *memcg;
+                bool flush = false;
 
+                rcu_read_lock();
                 memcg = stock->cached;
-                if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
-                        continue;
-                if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
-                        css_put(&memcg->css);
-                        continue;
-                }
-                if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+                if (memcg && stock->nr_pages &&
+                    mem_cgroup_is_descendant(memcg, root_memcg))
+                        flush = true;
+                rcu_read_unlock();
+
+                if (flush &&
+                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
                         if (cpu == curcpu)
                                 drain_local_stock(&stock->work);
                         else
                                 schedule_work_on(cpu, &stock->work);
                 }
-                css_put(&memcg->css);
         }
         put_cpu();
         mutex_unlock(&percpu_charge_mutex);
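The rework above drops the css_tryget()/css_put() pair: the cached stock is only inspected, never pinned, so the check can run under rcu_read_lock() and merely record a flush decision that is acted on after the read section ends. Below is a minimal single-threaded userspace model of that decision logic; the types and names are hypothetical stand-ins, and RCU plus the per-CPU machinery are elided, so this sketches only the control flow, not the lifetime guarantees.

/*
 * Simplified model of the new flush decision in drain_all_stock().
 * Build with: cc -std=c99 -Wall stock.c
 */
#include <stdbool.h>
#include <stdio.h>

struct memcg { struct memcg *parent; const char *name; };
struct stock { struct memcg *cached; unsigned long nr_pages; };

/* Walk up the hierarchy, as mem_cgroup_is_descendant() does. */
static bool is_descendant(struct memcg *memcg, struct memcg *root)
{
        for (; memcg; memcg = memcg->parent)
                if (memcg == root)
                        return true;
        return false;
}

int main(void)
{
        struct memcg root = { NULL, "root" };
        struct memcg a = { &root, "a" }, b = { &root, "b" };
        struct memcg a1 = { &a, "a/1" };
        struct stock stocks[] = {
                { &a1, 32 },    /* descendant of "a": flush */
                { &b,  16 },    /* not under "a": skip */
                { &a,   0 },    /* nothing stocked: skip */
        };

        for (unsigned int cpu = 0; cpu < 3; cpu++) {
                struct stock *st = &stocks[cpu];
                bool flush = false;

                /* In the kernel this inspection runs under rcu_read_lock(). */
                if (st->cached && st->nr_pages &&
                    is_descendant(st->cached, &a))
                        flush = true;

                printf("cpu%u (%s, %lu pages): %s\n", cpu,
                       st->cached ? st->cached->name : "none",
                       st->nr_pages, flush ? "flush" : "skip");
        }
        return 0;
}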
@@ -2359,11 +2354,67 @@ static void high_work_func(struct work_struct *work)
 }
 
 /*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ *   overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ *   proposed penalty in order to reduce to a reasonable number of jiffies, and
+ *   to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ *  +-------+------------------------+
+ *  | usage | time to allocate in ms |
+ *  +-------+------------------------+
+ *  | 100M  |                      0 |
+ *  | 101M  |                      6 |
+ *  | 102M  |                     25 |
+ *  | 103M  |                     57 |
+ *  | 104M  |                    102 |
+ *  | 105M  |                    159 |
+ *  | 106M  |                    230 |
+ *  | 107M  |                    313 |
+ *  | 108M  |                    409 |
+ *  | 109M  |                    518 |
+ *  | 110M  |                    639 |
+ *  | 111M  |                    774 |
+ *  | 112M  |                    921 |
+ *  | 113M  |                   1081 |
+ *  | 114M  |                   1254 |
+ *  | 115M  |                   1439 |
+ *  | 116M  |                   1638 |
+ *  | 117M  |                   1849 |
+ *  | 118M  |                   2000 |
+ *  | 119M  |                   2000 |
+ *  | 120M  |                   2000 |
+ *  +-------+------------------------+
+ */
+#define MEMCG_DELAY_PRECISION_SHIFT 20
+#define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
 void mem_cgroup_handle_over_high(void)
 {
+        unsigned long usage, high, clamped_high;
+        unsigned long pflags;
+        unsigned long penalty_jiffies, overage;
         unsigned int nr_pages = current->memcg_nr_pages_over_high;
         struct mem_cgroup *memcg;
 
@@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void)
 
         memcg = get_mem_cgroup_from_mm(current->mm);
         reclaim_high(memcg, nr_pages, GFP_KERNEL);
-        css_put(&memcg->css);
         current->memcg_nr_pages_over_high = 0;
+
+        /*
+         * memory.high is breached and reclaim is unable to keep up. Throttle
+         * allocators proactively to slow down excessive growth.
+         *
+         * We use overage compared to memory.high to calculate the number of
+         * jiffies to sleep (penalty_jiffies). Ideally this value should be
+         * fairly lenient on small overages, and increasingly harsh when the
+         * memcg in question makes it clear that it has no intention of stopping
+         * its crazy behaviour, so we exponentially increase the delay based on
+         * overage amount.
+         */
+
+        usage = page_counter_read(&memcg->memory);
+        high = READ_ONCE(memcg->high);
+
+        if (usage <= high)
+                goto out;
+
+        /*
+         * Prevent division by 0 in overage calculation by acting as if it were
+         * a threshold of 1 page.
+         */
+        clamped_high = max(high, 1UL);
+
+        overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+                          clamped_high);
+
+        penalty_jiffies = ((u64)overage * overage * HZ)
+                >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+        /*
+         * Factor in the task's own contribution to the overage, such that four
+         * N-sized allocations are throttled approximately the same as one
+         * 4N-sized allocation.
+         *
+         * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+         * larger the current charge batch is than that.
+         */
+        penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+        /*
+         * Clamp the max delay per usermode return so as to still keep the
+         * application moving forwards and also permit diagnostics, albeit
+         * extremely slowly.
+         */
+        penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+        /*
+         * Don't sleep if the amount of jiffies this memcg owes us is so low
+         * that it's not even worth doing, in an attempt to be nice to those who
+         * go only a small amount over their memory.high value and maybe haven't
+         * been aggressively reclaimed enough yet.
+         */
+        if (penalty_jiffies <= HZ / 100)
+                goto out;
+
+        /*
+         * If we exit early, we're guaranteed to die (since
+         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+         * need to account for any ill-begotten jiffies to pay them off later.
+         */
+        psi_memstall_enter(&pflags);
+        schedule_timeout_killable(penalty_jiffies);
+        psi_memstall_leave(&pflags);
+
+out:
+        css_put(&memcg->css);
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
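The arithmetic in this hunk can be checked outside the kernel. The sketch below reproduces the penalty calculation in userspace, assuming HZ=1000 (so one jiffy is one millisecond) and a 4K page size, and regenerates the delay table from the comment block above. The kernel additionally scales the result by nr_pages/MEMCG_CHARGE_BATCH, which is 1 for a full charge batch, so that step is omitted here.

/*
 * Userspace reproduction of the memory.high penalty curve.
 * Build with: cc -std=c99 -Wall penalty.c
 */
#include <stdio.h>
#include <stdint.h>

#define HZ                              1000UL  /* assumed; config-dependent */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES    (2UL * HZ)
#define MEMCG_DELAY_PRECISION_SHIFT     20
#define MEMCG_DELAY_SCALING_SHIFT       14

static unsigned long penalty_jiffies(unsigned long usage, unsigned long high)
{
        unsigned long clamped_high, overage, penalty;

        if (usage <= high)
                return 0;

        clamped_high = high ? high : 1; /* avoid division by zero */
        overage = ((uint64_t)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) /
                  clamped_high;
        penalty = ((uint64_t)overage * overage * HZ) >>
                  (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);

        return penalty < MEMCG_MAX_HIGH_DELAY_JIFFIES ?
               penalty : MEMCG_MAX_HIGH_DELAY_JIFFIES;
}

int main(void)
{
        unsigned long high = 100UL << 20 >> 12; /* 100M as 4K pages */

        for (unsigned long mb = 100; mb <= 120; mb++) {
                unsigned long usage = mb << 20 >> 12;

                /* with HZ=1000, one jiffy is one millisecond */
                printf("| %3luM | %4lu |\n", mb, penalty_jiffies(usage, high));
        }
        return 0;
}

Running this prints 0, 6, 25, 57, ... 1849, 2000, 2000, 2000, matching the table in the comment, which confirms the table was generated with a full MEMCG_CHARGE_BATCH and a 1000Hz tick.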
@@ -3512,6 +3630,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
                 break;
         case _KMEM:
+                pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+                             "Please report your usecase to linux-mm@kvack.org if you "
+                             "depend on this functionality.\n");
                 ret = memcg_update_kmem_max(memcg, nr_pages);
                 break;
         case _TCP:
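pr_warn_once() ensures the deprecation message fires at most once per boot, however many times the limit file is written. A minimal userspace analogue of that behaviour, for illustration only (warn_once is a hypothetical name, not the kernel macro):

#include <stdio.h>

#define warn_once(...)                                          \
        do {                                                    \
                static int warned;                              \
                if (!warned) {                                  \
                        warned = 1;                             \
                        fprintf(stderr, __VA_ARGS__);           \
                }                                               \
        } while (0)

int main(void)
{
        for (int i = 0; i < 3; i++)
                warn_once("kmem.limit_in_bytes is deprecated and will be removed.\n");
        return 0;       /* the warning prints exactly once */
}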
@@ -4805,11 +4926,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
         }
 }
 
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
-        mem_cgroup_id_get_many(memcg, 1);
-}
-
 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
         mem_cgroup_id_put_many(memcg, 1);
@@ -4955,6 +5071,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                 memcg->cgwb_frn[i].done =
                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+        INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+        memcg->deferred_split_queue.split_queue_len = 0;
+#endif
         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
         return memcg;
 fail:
@@ -5333,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page,
                         __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
         }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (compound && !list_empty(page_deferred_list(page))) {
+                spin_lock(&from->deferred_split_queue.split_queue_lock);
+                list_del_init(page_deferred_list(page));
+                from->deferred_split_queue.split_queue_len--;
+                spin_unlock(&from->deferred_split_queue.split_queue_lock);
+        }
+#endif
         /*
          * It is safe to change page->mem_cgroup here because the page
          * is referenced, charged, and isolated - we can't race with
@@ -5341,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page,
 
         /* caller should have done css_get */
         page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (compound && list_empty(page_deferred_list(page))) {
+                spin_lock(&to->deferred_split_queue.split_queue_lock);
+                list_add_tail(page_deferred_list(page),
+                              &to->deferred_split_queue.split_queue);
+                to->deferred_split_queue.split_queue_len++;
+                spin_unlock(&to->deferred_split_queue.split_queue_lock);
+        }
+#endif
+
         spin_unlock_irqrestore(&from->move_lock, flags);
 
         ret = 0;
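Taken together, the two THP hunks above migrate a compound page's deferred-split entry from the old memcg's queue to the new one, keeping each queue's split_queue_len in step with the list itself. A simplified userspace model of that bookkeeping follows, with minimal stand-ins for the kernel's list helpers; the locking is elided and the names are hypothetical, so this shows only the invariant, not the concurrency.

/*
 * Model of the deferred-split queue migration in mem_cgroup_move_account().
 * Build with: cc -std=c99 -Wall split_move.c
 */
#include <assert.h>
#include <stdbool.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
}

static void list_del_init(struct list_head *n)
{
        n->prev->next = n->next; n->next->prev = n->prev;
        INIT_LIST_HEAD(n);
}

struct split_queue { struct list_head queue; unsigned long len; };

int main(void)
{
        struct split_queue from, to;
        struct list_head page;  /* stands in for page_deferred_list(page) */

        INIT_LIST_HEAD(&from.queue); from.len = 0;
        INIT_LIST_HEAD(&to.queue);   to.len = 0;

        /* the page starts out queued for deferred split on "from" */
        INIT_LIST_HEAD(&page);
        list_add_tail(&page, &from.queue); from.len++;

        /* first hunk: unhook from the source queue under its lock... */
        if (!list_empty(&page)) {
                list_del_init(&page);
                from.len--;
        }
        /* ...second hunk: requeue on the destination under its lock */
        if (list_empty(&page)) {
                list_add_tail(&page, &to.queue);
                to.len++;
        }

        assert(from.len == 0 && to.len == 1);
        return 0;
}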
@@ -6511,7 +6651,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                 unsigned int nr_pages = 1;
 
                 if (PageTransHuge(page)) {
-                        nr_pages <<= compound_order(page);
+                        nr_pages = compound_nr(page);
                         ug->nr_huge += nr_pages;
                 }
                 if (PageAnon(page))
@@ -6523,7 +6663,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                 }
                 ug->pgpgout++;
         } else {
-                ug->nr_kmem += 1 << compound_order(page);
+                ug->nr_kmem += compound_nr(page);
                 __ClearPageKmemcg(page);
         }
 
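Both compound_nr() conversions are behaviour-preserving: at the time of this patch, compound_nr(page) is defined in include/linux/mm.h as 1UL << compound_order(page), so the change is about readability, not semantics. A standalone check of the equivalence, using a stub in place of struct page:

/*
 * Equivalence check for the compound_nr() conversion above.
 * Build with: cc -std=c99 -Wall compound_nr.c
 */
#include <assert.h>

struct page { unsigned char order; };  /* stub: order of a compound page */

static unsigned int compound_order(const struct page *page)
{
        return page->order;
}

/* mirrors the 5.4-era include/linux/mm.h definition */
static unsigned long compound_nr(const struct page *page)
{
        return 1UL << compound_order(page);
}

int main(void)
{
        for (unsigned char order = 0; order <= 10; order++) {
                struct page page = { order };
                unsigned int nr_pages = 1;

                nr_pages <<= compound_order(&page);      /* old idiom */
                assert(nr_pages == compound_nr(&page));  /* new helper */
        }
        return 0;
}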