commit:    b1a21367314f36a819c0676e0999f34db12ee6ed
tree:      5c2d2986a110c9b999e36467e85908b58e8d14fa /kernel/cgroup.c
parent:    b9f3cecaba592d4e98cd155c91b1414323ed51e8
author:    Tejun Heo <tj@kernel.org>  2013-11-29 10:42:58 -0500
committer: Tejun Heo <tj@kernel.org>  2013-11-29 10:42:58 -0500
cgroup: implement delayed destruction for cgroup_pidlist
Currently, pidlists are reference counted from the file open and release
methods. This means that holding onto an open file may waste memory and
reads may return data which is very stale. Neither is critical because
pidlists are keyed and shared per namespace and, well, the user isn't
supposed to have a large delay between open and reads.

cgroup is planned to be converted to use kernfs and it'd be best if we
can stick to just the seq_file operations - start, next, stop and show.
This can be achieved by loading the pidlist on demand from start and
releasing it with a time delay from stop, so that consecutive reads
don't end up reloading the pidlist on each iteration. This would remove
the need for hooking into open and release while also avoiding issues
with holding onto a pidlist for too long.

This patch implements delayed release of pidlists. As pidlists could be
lingering on cgroup removal waiting for the timer to expire, the cgroup
free path needs to queue the destruction work items immediately and
flush them. As those work items are self-destroying, they can't be
flushed directly; a new workqueue - cgroup_pidlist_destroy_wq - is added
to serve as the flush domain.

Note that this patch just adds delayed release on top of the current
implementation and doesn't change where the pidlist is loaded and
released. Following patches will make those changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
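The mechanism the patch builds is a small, reusable pattern: the last put
on a refcounted object arms a delayed work item instead of freeing
immediately, a user arriving before the timer fires just re-takes a
reference, and teardown expires every timer at once and flushes a
dedicated workqueue, since self-freeing work items can't be flushed one
by one. Below is a minimal sketch of that pattern as a hypothetical
kernel module; every demo_* name is invented for illustration, while the
workqueue calls (alloc_workqueue(), mod_delayed_work(),
flush_workqueue(), delayed_work_pending()) are the same ones the patch
uses.

/* Minimal sketch of the delayed-destruction pattern; all demo_*
 * identifiers are hypothetical and not part of the patch. */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_destroy_wq;  /* flush domain */
static LIST_HEAD(demo_objects);
static DEFINE_MUTEX(demo_mutex);  /* guards the list and use_count */

struct demo_obj {
        struct list_head links;
        int use_count;
        struct delayed_work destroy_dwork;
};

static void demo_destroy_work_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct demo_obj *obj = container_of(dwork, struct demo_obj,
                                            destroy_dwork);
        struct demo_obj *tofree = NULL;

        mutex_lock(&demo_mutex);
        /* free iff no new user showed up and nobody requeued the work */
        if (!obj->use_count && !delayed_work_pending(dwork)) {
                list_del(&obj->links);
                tofree = obj;
        }
        mutex_unlock(&demo_mutex);
        kfree(tofree);  /* self-destroying: the work item frees itself */
}

/* drop a reference; the last put arms delayed destruction */
static void demo_put(struct demo_obj *obj)
{
        mutex_lock(&demo_mutex);
        if (!--obj->use_count)
                mod_delayed_work(demo_destroy_wq, &obj->destroy_dwork, HZ);
        mutex_unlock(&demo_mutex);
}

/* teardown: expire all lingering timers now and wait for the workers */
static void demo_destroy_all(void)
{
        struct demo_obj *obj, *tmp;

        mutex_lock(&demo_mutex);
        list_for_each_entry_safe(obj, tmp, &demo_objects, links)
                mod_delayed_work(demo_destroy_wq, &obj->destroy_dwork, 0);
        mutex_unlock(&demo_mutex);

        /*
         * Each work item may free its own struct, so it can't be
         * flushed individually; flushing the dedicated workqueue waits
         * for all of them, like cgroup_pidlist_destroy_all() below.
         */
        flush_workqueue(demo_destroy_wq);
}

static int __init demo_init(void)
{
        struct demo_obj *obj;

        demo_destroy_wq = alloc_workqueue("demo_destroy", 0, 1);
        if (!demo_destroy_wq)
                return -ENOMEM;

        obj = kzalloc(sizeof(*obj), GFP_KERNEL);
        if (!obj) {
                destroy_workqueue(demo_destroy_wq);
                return -ENOMEM;
        }
        INIT_DELAYED_WORK(&obj->destroy_dwork, demo_destroy_work_fn);
        obj->use_count = 1;
        mutex_lock(&demo_mutex);
        list_add(&obj->links, &demo_objects);
        mutex_unlock(&demo_mutex);

        demo_put(obj);  /* arms a one-second delayed destruction */
        return 0;
}

static void __exit demo_exit(void)
{
        demo_destroy_all();
        destroy_workqueue(demo_destroy_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

As in the patch, the use_count and delayed_work_pending() checks happen
under the same lock every queuing path holds, so a racing re-reference
or requeue can never be missed.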
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 102
1 file changed, 77 insertions(+), 25 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b59e3453eae7..acdcddf8ab82 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -62,6 +62,14 @@
 #include <linux/atomic.h>
 
 /*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
+/*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
@@ -95,6 +103,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
 static struct workqueue_struct *cgroup_destroy_wq;
 
 /*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
  * registered after that. The mutable section of this array is protected by
@@ -167,6 +181,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                               bool is_add);
 static int cgroup_file_release(struct inode *inode, struct file *file);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -830,11 +845,7 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	deactivate_super(cgrp->root->sb);
 
-	/*
-	 * if we're getting rid of the cgroup, refcount should ensure
-	 * that there are no pidlists left.
-	 */
-	BUG_ON(!list_empty(&cgrp->pidlists));
+	cgroup_pidlist_destroy_all(cgrp);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
@@ -2449,13 +2460,12 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
 {
 	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cgroup_subsys_state *css = cfe->css;
-	int ret = 0;
 
 	if (css->ss)
 		css_put(css);
 	if (file->f_op == &cgroup_seqfile_operations)
 		single_release(inode, file);
-	return ret;
+	return 0;
 }
 
 /*
@@ -3454,6 +3464,8 @@ struct cgroup_pidlist {
 	struct cgroup *owner;
 	/* protects the other fields */
 	struct rw_semaphore rwsem;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
 };
 
 /*
@@ -3469,6 +3481,7 @@ static void *pidlist_allocate(int count)
 	else
 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
 }
+
 static void pidlist_free(void *p)
 {
 	if (is_vmalloc_addr(p))
@@ -3478,6 +3491,49 @@ static void pidlist_free(void *p)
 }
 
 /*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+{
+	struct cgroup_pidlist *l, *tmp_l;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
+	down_write(&l->rwsem);
+
+	/*
+	 * Destroy iff we didn't race with a new user or get queued again.
+	 * Queued state won't change as it can only be queued while locked.
+	 */
+	if (!l->use_count && !delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
+
+	up_write(&l->rwsem);
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
+}
+
+/*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
  * Returns the number of unique elements.
  */
@@ -3547,6 +3603,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 		return l;
 	}
 	init_rwsem(&l->rwsem);
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
 	down_write(&l->rwsem);
 	l->key.type = type;
 	l->key.ns = get_pid_ns(ns);
@@ -3752,26 +3809,12 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 
 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
-	/*
-	 * the case where we're the last user of this particular pidlist will
-	 * have us remove it from the cgroup's list, which entails taking the
-	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
-	 * pidlist_mutex, we have to take pidlist_mutex first.
-	 */
-	mutex_lock(&l->owner->pidlist_mutex);
 	down_write(&l->rwsem);
 	BUG_ON(!l->use_count);
-	if (!--l->use_count) {
-		/* we're the last user if refcount is 0; remove and free */
-		list_del(&l->links);
-		mutex_unlock(&l->owner->pidlist_mutex);
-		pidlist_free(l->list);
-		put_pid_ns(l->key.ns);
-		up_write(&l->rwsem);
-		kfree(l);
-		return;
-	}
-	mutex_unlock(&l->owner->pidlist_mutex);
+	/* if the last user, arm the destroy work */
+	if (!--l->use_count)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
 	up_write(&l->rwsem);
 }
 
@@ -4813,6 +4856,15 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
+
+	/*
+	 * Used to destroy pidlists and separate to serve as flush domain.
+	 * Cap @max_active to 1 too.
+	 */
+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+						    0, 1);
+	BUG_ON(!cgroup_pidlist_destroy_wq);
+
 	return 0;
 }
 core_initcall(cgroup_wq_init);