-rw-r--r--   Documentation/cgroups/memory.txt    19
-rw-r--r--   mm/memcontrol.c                     309
2 files changed, 327 insertions, 1 deletion
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 1f59a1a38bd9..268ab08222dd 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -468,7 +468,24 @@ Note: More type of pages(e.g. file cache, shmem,) will be supported by other
 - All of moving charge operations are done under cgroup_mutex. It's not good
   behavior to hold the mutex too long, so we may need some trick.
 
-9. TODO
+9. Memory thresholds
+
+The memory controller implements memory thresholds using the cgroups
+notification API (see cgroups.txt). It allows registering multiple memory
+and memsw thresholds and delivers notifications when a threshold is crossed.
+
+To register a threshold, an application needs to:
+ - create an eventfd using eventfd(2);
+ - open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+ - write a string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>"
+   to cgroup.event_control.
+
+The application will be notified through the eventfd when memory usage
+crosses the threshold in either direction.
+
+This works for both root and non-root cgroups.
+
+10. TODO
 
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
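
For illustration, a minimal user-space sketch of the registration sequence described in the hunk above. The /cgroup mount point and the 4M threshold are assumptions, and error handling is omitted.

/*
 * Sketch: register a memory threshold and wait for it to be crossed.
 * Assumes the memory cgroup is mounted at /cgroup (hypothetical path).
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[64];
	uint64_t hits;

	int efd = eventfd(0, 0);                                    /* 1. create an eventfd */
	int ufd = open("/cgroup/memory.usage_in_bytes", O_RDONLY);  /* 2. open the usage file */
	int cfd = open("/cgroup/cgroup.event_control", O_WRONLY);   /* 3. control file */

	/* "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
	int len = snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 4096ULL * 1024);
	write(cfd, buf, len);

	/* Blocks until usage crosses the 4M threshold in either direction. */
	read(efd, &hits, sizeof(hits));
	printf("threshold crossed %llu time(s)\n", (unsigned long long)hits);
	return 0;
}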
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5a41d93c7077..649df435b8e2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@
  * Copyright 2007 OpenVZ SWsoft Inc
  * Author: Pavel Emelianov <xemul@openvz.org>
  *
+ * Memory thresholds
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -35,6 +39,8 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
+#include <linux/eventfd.h>
+#include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
@@ -58,6 +64,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 #define SOFTLIMIT_EVENTS_THRESH (1000)
+#define THRESHOLDS_EVENTS_THRESH (100)
 
 /*
  * Statistics for memory cgroup.
@@ -74,6 +81,8 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
 					used by soft limit implementation */
+	MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
+					used by threshold implementation */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -177,6 +186,23 @@ struct mem_cgroup_tree {
 
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 
+struct mem_cgroup_threshold {
+	struct eventfd_ctx *eventfd;
+	u64 threshold;
+};
+
+struct mem_cgroup_threshold_ary {
+	/* An array index points to threshold just below usage. */
+	atomic_t current_threshold;
+	/* Size of entries[] */
+	unsigned int size;
+	/* Array of thresholds */
+	struct mem_cgroup_threshold entries[0];
+};
+
+static bool mem_cgroup_threshold_check(struct mem_cgroup *mem);
+static void mem_cgroup_threshold(struct mem_cgroup *mem);
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
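
A purely illustrative user-space model (not the kernel structures themselves) of how the layout above fits together: one allocation holds the header plus a sorted flexible array of entries, and current_threshold ends up indexing the last threshold below the current usage.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct threshold { uint64_t value; };

struct threshold_ary {
	int current_threshold;      /* index of threshold just below usage */
	unsigned int size;          /* number of entries[] */
	struct threshold entries[]; /* C99 flexible array member (entries[0] in the patch) */
};

static int cmp(const void *a, const void *b)
{
	const struct threshold *ta = a, *tb = b;
	return (ta->value > tb->value) - (ta->value < tb->value);
}

int main(void)
{
	uint64_t vals[] = { 8 << 20, 4 << 20, 16 << 20 };
	unsigned int n = 3;
	uint64_t usage = 5 << 20;

	/* One allocation for header + entries, as kmalloc() does in the patch. */
	struct threshold_ary *t = malloc(sizeof(*t) + n * sizeof(t->entries[0]));
	t->size = n;
	for (unsigned int i = 0; i < n; i++)
		t->entries[i].value = vals[i];
	qsort(t->entries, n, sizeof(t->entries[0]), cmp);

	t->current_threshold = -1;
	for (unsigned int i = 0; i < n; i++)
		if (t->entries[i].value < usage)
			t->current_threshold++;

	printf("current_threshold = %d\n", t->current_threshold); /* 0: only 4M is below 5M */
	free(t);
	return 0;
}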
@@ -228,6 +254,15 @@ struct mem_cgroup {
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
+	/* protect arrays of thresholds */
+	struct mutex thresholds_lock;
+
+	/* thresholds for memory usage. RCU-protected */
+	struct mem_cgroup_threshold_ary *thresholds;
+
+	/* thresholds for mem+swap usage. RCU-protected */
+	struct mem_cgroup_threshold_ary *memsw_thresholds;
+
 	/*
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
@@ -549,6 +584,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
+
 	put_cpu();
 }
 
@@ -1576,6 +1613,8 @@ charged:
 	if (page && mem_cgroup_soft_limit_check(mem))
 		mem_cgroup_update_tree(mem, page);
 done:
+	if (mem_cgroup_threshold_check(mem))
+		mem_cgroup_threshold(mem);
 	return 0;
 nomem:
 	css_put(&mem->css);
@@ -2148,6 +2187,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 
 	if (mem_cgroup_soft_limit_check(mem))
 		mem_cgroup_update_tree(mem, page);
+	if (mem_cgroup_threshold_check(mem))
+		mem_cgroup_threshold(mem);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		css_put(&mem->css);
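
The two hunks above hook the charge and uncharge paths, but the relatively expensive threshold walk is ratelimited: the per-cpu MEM_CGROUP_STAT_THRESHOLDS counter is decremented on every page in/out, and mem_cgroup_threshold_check() only fires once it goes negative, after which it is rearmed to THRESHOLDS_EVENTS_THRESH. A simplified single-threaded model of that pattern (illustration only, not the kernel per-cpu code):

#include <stdbool.h>
#include <stdio.h>

#define THRESHOLDS_EVENTS_THRESH 100

static long events = THRESHOLDS_EVENTS_THRESH;

static bool threshold_check(void)
{
	events--;                                    /* done per page in/out in the patch */
	if (events < 0) {
		events = THRESHOLDS_EVENTS_THRESH;   /* rearm */
		return true;                         /* time to walk the threshold array */
	}
	return false;
}

int main(void)
{
	int walks = 0;
	for (int i = 0; i < 1000; i++)               /* simulate 1000 page events */
		if (threshold_check())
			walks++;
	printf("threshold walks: %d of 1000 events\n", walks); /* roughly one per 101 events */
	return 0;
}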
@@ -3232,12 +3273,277 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
+{
+	bool ret = false;
+	int cpu;
+	s64 val;
+	struct mem_cgroup_stat_cpu *cpustat;
+
+	cpu = get_cpu();
+	cpustat = &mem->stat.cpustat[cpu];
+	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
+	if (unlikely(val < 0)) {
+		__mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
+				THRESHOLDS_EVENTS_THRESH);
+		ret = true;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+{
+	struct mem_cgroup_threshold_ary *t;
+	u64 usage;
+	int i;
+
+	rcu_read_lock();
+	if (!swap)
+		t = rcu_dereference(memcg->thresholds);
+	else
+		t = rcu_dereference(memcg->memsw_thresholds);
+
+	if (!t)
+		goto unlock;
+
+	usage = mem_cgroup_usage(memcg, swap);
+
+	/*
+	 * current_threshold points to threshold just below usage.
+	 * If it's not true, a threshold was crossed after last
+	 * call of __mem_cgroup_threshold().
+	 */
+	i = atomic_read(&t->current_threshold);
+
+	/*
+	 * Iterate backward over array of thresholds starting from
+	 * current_threshold and check if a threshold is crossed.
+	 * If none of thresholds below usage is crossed, we read
+	 * only one element of the array here.
+	 */
+	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+		eventfd_signal(t->entries[i].eventfd, 1);
+
+	/* i = current_threshold + 1 */
+	i++;
+
+	/*
+	 * Iterate forward over array of thresholds starting from
+	 * current_threshold+1 and check if a threshold is crossed.
+	 * If none of thresholds above usage is crossed, we read
+	 * only one element of the array here.
+	 */
+	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+		eventfd_signal(t->entries[i].eventfd, 1);
+
+	/* Update current_threshold */
+	atomic_set(&t->current_threshold, i - 1);
+unlock:
+	rcu_read_unlock();
+}
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+{
+	__mem_cgroup_threshold(memcg, false);
+	if (do_swap_account)
+		__mem_cgroup_threshold(memcg, true);
+}
+
+static int compare_thresholds(const void *a, const void *b)
+{
+	const struct mem_cgroup_threshold *_a = a;
+	const struct mem_cgroup_threshold *_b = b;
+
+	return _a->threshold - _b->threshold;
+}
+
+static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
+		struct eventfd_ctx *eventfd, const char *args)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
+	int type = MEMFILE_TYPE(cft->private);
+	u64 threshold, usage;
+	int size;
+	int i, ret;
+
+	ret = res_counter_memparse_write_strategy(args, &threshold);
+	if (ret)
+		return ret;
+
+	mutex_lock(&memcg->thresholds_lock);
+	if (type == _MEM)
+		thresholds = memcg->thresholds;
+	else if (type == _MEMSWAP)
+		thresholds = memcg->memsw_thresholds;
+	else
+		BUG();
+
+	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+
+	/* Check if a threshold crossed before adding a new one */
+	if (thresholds)
+		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+	if (thresholds)
+		size = thresholds->size + 1;
+	else
+		size = 1;
+
+	/* Allocate memory for new array of thresholds */
+	thresholds_new = kmalloc(sizeof(*thresholds_new) +
+			size * sizeof(struct mem_cgroup_threshold),
+			GFP_KERNEL);
+	if (!thresholds_new) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	thresholds_new->size = size;
+
+	/* Copy thresholds (if any) to new array */
+	if (thresholds)
+		memcpy(thresholds_new->entries, thresholds->entries,
+				thresholds->size *
+				sizeof(struct mem_cgroup_threshold));
+	/* Add new threshold */
+	thresholds_new->entries[size - 1].eventfd = eventfd;
+	thresholds_new->entries[size - 1].threshold = threshold;
+
+	/* Sort thresholds. Registering of new threshold isn't time-critical */
+	sort(thresholds_new->entries, size,
+			sizeof(struct mem_cgroup_threshold),
+			compare_thresholds, NULL);
+
+	/* Find current threshold */
+	atomic_set(&thresholds_new->current_threshold, -1);
+	for (i = 0; i < size; i++) {
+		if (thresholds_new->entries[i].threshold < usage) {
+			/*
+			 * thresholds_new->current_threshold will not be used
+			 * until rcu_assign_pointer(), so it's safe to increment
+			 * it here.
+			 */
+			atomic_inc(&thresholds_new->current_threshold);
+		}
+	}
+
+	/*
+	 * We need to increment refcnt to be sure that all thresholds
+	 * will be unregistered before calling __mem_cgroup_free()
+	 */
+	mem_cgroup_get(memcg);
+
+	if (type == _MEM)
+		rcu_assign_pointer(memcg->thresholds, thresholds_new);
+	else
+		rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
+
+	/* To be sure that nobody uses thresholds before freeing it */
+	synchronize_rcu();
+
+	kfree(thresholds);
+unlock:
+	mutex_unlock(&memcg->thresholds_lock);
+
+	return ret;
+}
+
+static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
+		struct eventfd_ctx *eventfd)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
+	int type = MEMFILE_TYPE(cft->private);
+	u64 usage;
+	int size = 0;
+	int i, j, ret;
+
+	mutex_lock(&memcg->thresholds_lock);
+	if (type == _MEM)
+		thresholds = memcg->thresholds;
+	else if (type == _MEMSWAP)
+		thresholds = memcg->memsw_thresholds;
+	else
+		BUG();
+
+	/*
+	 * Something went wrong if we are trying to unregister a threshold
+	 * when we don't have any thresholds
+	 */
+	BUG_ON(!thresholds);
+
+	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+
+	/* Check if a threshold crossed before removing */
+	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+	/* Calculate the new number of thresholds */
+	for (i = 0; i < thresholds->size; i++) {
+		if (thresholds->entries[i].eventfd != eventfd)
+			size++;
+	}
+
+	/* Set thresholds array to NULL if we don't have thresholds */
+	if (!size) {
+		thresholds_new = NULL;
+		goto assign;
+	}
+
+	/* Allocate memory for new array of thresholds */
+	thresholds_new = kmalloc(sizeof(*thresholds_new) +
+			size * sizeof(struct mem_cgroup_threshold),
+			GFP_KERNEL);
+	if (!thresholds_new) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	thresholds_new->size = size;
+
+	/* Copy thresholds and find current threshold */
+	atomic_set(&thresholds_new->current_threshold, -1);
+	for (i = 0, j = 0; i < thresholds->size; i++) {
+		if (thresholds->entries[i].eventfd == eventfd)
+			continue;
+
+		thresholds_new->entries[j] = thresholds->entries[i];
+		if (thresholds_new->entries[j].threshold < usage) {
+			/*
+			 * thresholds_new->current_threshold will not be used
+			 * until rcu_assign_pointer(), so it's safe to increment
+			 * it here.
+			 */
+			atomic_inc(&thresholds_new->current_threshold);
+		}
+		j++;
+	}
+
+assign:
+	if (type == _MEM)
+		rcu_assign_pointer(memcg->thresholds, thresholds_new);
+	else
+		rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
+
+	/* To be sure that nobody uses thresholds before freeing it */
+	synchronize_rcu();
+
+	for (i = 0; i < thresholds->size - size; i++)
+		mem_cgroup_put(memcg);
+
+	kfree(thresholds);
+unlock:
+	mutex_unlock(&memcg->thresholds_lock);
+
+	return ret;
+}
 
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
 		.read_u64 = mem_cgroup_read,
+		.register_event = mem_cgroup_register_event,
+		.unregister_event = mem_cgroup_unregister_event,
 	},
 	{
 		.name = "max_usage_in_bytes",
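
To make the crossing logic in __mem_cgroup_threshold() above concrete, here is a small self-contained model (illustration only, plain integers instead of eventfds, RCU and the unlikely() hints) that reproduces the two loops and the current_threshold update:

#include <stdio.h>
#include <stdint.h>

#define MB (1024ULL * 1024)

static const uint64_t thresholds[] = { 4 * MB, 8 * MB, 16 * MB };
static const int nr = 3;
static int current_threshold = 0;   /* usage was 5M: index of the 4M entry */

static void threshold_walk(uint64_t usage)
{
	int i = current_threshold;

	/* Usage dropped: signal thresholds now above usage. */
	for (; i >= 0 && thresholds[i] > usage; i--)
		printf("signal eventfd for %llu MB\n", (unsigned long long)(thresholds[i] / MB));

	/* Usage grew: signal thresholds now at or below usage. */
	for (i++; i < nr && thresholds[i] <= usage; i++)
		printf("signal eventfd for %llu MB\n", (unsigned long long)(thresholds[i] / MB));

	current_threshold = i - 1;
}

int main(void)
{
	threshold_walk(9 * MB);   /* crosses 8M upward: one signal, index becomes 1 */
	threshold_walk(3 * MB);   /* crosses 8M and 4M downward: two signals, index becomes -1 */
	return 0;
}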
@@ -3294,6 +3600,8 @@ static struct cftype memsw_cgroup_files[] = {
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
 		.read_u64 = mem_cgroup_read,
+		.register_event = mem_cgroup_register_event,
+		.unregister_event = mem_cgroup_unregister_event,
 	},
 	{
 		.name = "memsw.max_usage_in_bytes",
@@ -3538,6 +3846,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	mem->swappiness = get_swappiness(parent);
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
+	mutex_init(&mem->thresholds_lock);
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);