aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c325
1 files changed, 179 insertions, 146 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
52#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/delayacct.h> 53#include <linux/delayacct.h>
54#include <linux/cgroupstats.h> 54#include <linux/cgroupstats.h>
55#include <linux/hash.h> 55#include <linux/hashtable.h>
56#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
58#include <linux/idr.h> 58#include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
376 * account cgroups in empty hierarchies. 376 * account cgroups in empty hierarchies.
377 */ 377 */
378#define CSS_SET_HASH_BITS 7 378#define CSS_SET_HASH_BITS 7
379#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 379static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
380static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
381 380
382static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 381static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383{ 382{
384 int i; 383 int i;
385 int index; 384 unsigned long key = 0UL;
386 unsigned long tmp = 0UL;
387 385
388 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
389 tmp += (unsigned long)css[i]; 387 key += (unsigned long)css[i];
390 tmp = (tmp >> 16) ^ tmp; 388 key = (key >> 16) ^ key;
391 389
392 index = hash_long(tmp, CSS_SET_HASH_BITS); 390 return key;
393
394 return &css_set_table[index];
395} 391}
396 392
397/* We don't maintain the lists running through each css_set to its 393/* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
418 } 414 }
419 415
420 /* This css_set is dead. unlink it and release cgroup refcounts */ 416 /* This css_set is dead. unlink it and release cgroup refcounts */
421 hlist_del(&cg->hlist); 417 hash_del(&cg->hlist);
422 css_set_count--; 418 css_set_count--;
423 419
424 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 420 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
426 struct cgroup *cgrp = link->cgrp; 422 struct cgroup *cgrp = link->cgrp;
427 list_del(&link->cg_link_list); 423 list_del(&link->cg_link_list);
428 list_del(&link->cgrp_link_list); 424 list_del(&link->cgrp_link_list);
425
426 /*
427 * We may not be holding cgroup_mutex, and if cgrp->count is
428 * dropped to 0 the cgroup can be destroyed at any time, hence
429 * rcu_read_lock is used to keep it alive.
430 */
431 rcu_read_lock();
429 if (atomic_dec_and_test(&cgrp->count) && 432 if (atomic_dec_and_test(&cgrp->count) &&
430 notify_on_release(cgrp)) { 433 notify_on_release(cgrp)) {
431 if (taskexit) 434 if (taskexit)
432 set_bit(CGRP_RELEASABLE, &cgrp->flags); 435 set_bit(CGRP_RELEASABLE, &cgrp->flags);
433 check_for_release(cgrp); 436 check_for_release(cgrp);
434 } 437 }
438 rcu_read_unlock();
435 439
436 kfree(link); 440 kfree(link);
437 } 441 }
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set(
550{ 554{
551 int i; 555 int i;
552 struct cgroupfs_root *root = cgrp->root; 556 struct cgroupfs_root *root = cgrp->root;
553 struct hlist_head *hhead;
554 struct hlist_node *node;
555 struct css_set *cg; 557 struct css_set *cg;
558 unsigned long key;
556 559
557 /* 560 /*
558 * Build the set of subsystem state objects that we want to see in the 561 * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set(
572 } 575 }
573 } 576 }
574 577
575 hhead = css_set_hash(template); 578 key = css_set_hash(template);
576 hlist_for_each_entry(cg, node, hhead, hlist) { 579 hash_for_each_possible(css_set_table, cg, hlist, key) {
577 if (!compare_css_sets(cg, oldcg, cgrp, template)) 580 if (!compare_css_sets(cg, oldcg, cgrp, template))
578 continue; 581 continue;
579 582
@@ -657,8 +660,8 @@ static struct css_set *find_css_set(
657 660
658 struct list_head tmp_cg_links; 661 struct list_head tmp_cg_links;
659 662
660 struct hlist_head *hhead;
661 struct cg_cgroup_link *link; 663 struct cg_cgroup_link *link;
664 unsigned long key;
662 665
663 /* First see if we already have a cgroup group that matches 666 /* First see if we already have a cgroup group that matches
664 * the desired set */ 667 * the desired set */
@@ -704,8 +707,8 @@ static struct css_set *find_css_set(
704 css_set_count++; 707 css_set_count++;
705 708
706 /* Add this cgroup group to the hash table */ 709 /* Add this cgroup group to the hash table */
707 hhead = css_set_hash(res->subsys); 710 key = css_set_hash(res->subsys);
708 hlist_add_head(&res->hlist, hhead); 711 hash_add(css_set_table, &res->hlist, key);
709 712
710 write_unlock(&css_set_lock); 713 write_unlock(&css_set_lock);
711 714
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
856 return inode; 859 return inode;
857} 860}
858 861
859static void cgroup_diput(struct dentry *dentry, struct inode *inode) 862static void cgroup_free_fn(struct work_struct *work)
860{ 863{
861 /* is dentry a directory ? if so, kfree() associated cgroup */ 864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
862 if (S_ISDIR(inode->i_mode)) { 865 struct cgroup_subsys *ss;
863 struct cgroup *cgrp = dentry->d_fsdata;
864 struct cgroup_subsys *ss;
865 BUG_ON(!(cgroup_is_removed(cgrp)));
866 /* It's possible for external users to be holding css
867 * reference counts on a cgroup; css_put() needs to
868 * be able to access the cgroup after decrementing
869 * the reference count in order to know if it needs to
870 * queue the cgroup to be handled by the release
871 * agent */
872 synchronize_rcu();
873 866
874 mutex_lock(&cgroup_mutex); 867 mutex_lock(&cgroup_mutex);
875 /* 868 /*
876 * Release the subsystem state objects. 869 * Release the subsystem state objects.
877 */ 870 */
878 for_each_subsys(cgrp->root, ss) 871 for_each_subsys(cgrp->root, ss)
879 ss->css_free(cgrp); 872 ss->css_free(cgrp);
880 873
881 cgrp->root->number_of_cgroups--; 874 cgrp->root->number_of_cgroups--;
882 mutex_unlock(&cgroup_mutex); 875 mutex_unlock(&cgroup_mutex);
883 876
884 /* 877 /*
885 * Drop the active superblock reference that we took when we 878 * Drop the active superblock reference that we took when we
886 * created the cgroup 879 * created the cgroup
887 */ 880 */
888 deactivate_super(cgrp->root->sb); 881 deactivate_super(cgrp->root->sb);
889 882
890 /* 883 /*
891 * if we're getting rid of the cgroup, refcount should ensure 884 * if we're getting rid of the cgroup, refcount should ensure
892 * that there are no pidlists left. 885 * that there are no pidlists left.
893 */ 886 */
894 BUG_ON(!list_empty(&cgrp->pidlists)); 887 BUG_ON(!list_empty(&cgrp->pidlists));
888
889 simple_xattrs_free(&cgrp->xattrs);
890
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
892 kfree(cgrp);
893}
895 894
896 simple_xattrs_free(&cgrp->xattrs); 895static void cgroup_free_rcu(struct rcu_head *head)
896{
897 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
898
899 schedule_work(&cgrp->free_work);
900}
901
902static void cgroup_diput(struct dentry *dentry, struct inode *inode)
903{
904 /* is dentry a directory ? if so, kfree() associated cgroup */
905 if (S_ISDIR(inode->i_mode)) {
906 struct cgroup *cgrp = dentry->d_fsdata;
897 907
898 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 908 BUG_ON(!(cgroup_is_removed(cgrp)));
899 kfree_rcu(cgrp, rcu_head); 909 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
900 } else { 910 } else {
901 struct cfent *cfe = __d_cfe(dentry); 911 struct cfent *cfe = __d_cfe(dentry);
902 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d)
925 dput(parent); 935 dput(parent);
926} 936}
927 937
928static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 938static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
929{ 939{
930 struct cfent *cfe; 940 struct cfent *cfe;
931 941
932 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 942 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
933 lockdep_assert_held(&cgroup_mutex); 943 lockdep_assert_held(&cgroup_mutex);
934 944
945 /*
946 * If we're doing cleanup due to failure of cgroup_create(),
947 * the corresponding @cfe may not exist.
948 */
935 list_for_each_entry(cfe, &cgrp->files, node) { 949 list_for_each_entry(cfe, &cgrp->files, node) {
936 struct dentry *d = cfe->dentry; 950 struct dentry *d = cfe->dentry;
937 951
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
944 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
945 dput(d); 959 dput(d);
946 960
947 return 0; 961 break;
948 } 962 }
949 return -ENOENT;
950} 963}
951 964
952/** 965/**
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1083 } 1096 }
1084 } 1097 }
1085 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1098 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1086 synchronize_rcu();
1087 1099
1088 return 0; 1100 return 0;
1089} 1101}
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1393 INIT_LIST_HEAD(&cgrp->allcg_node); 1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1394 INIT_LIST_HEAD(&cgrp->release_list); 1406 INIT_LIST_HEAD(&cgrp->release_list);
1395 INIT_LIST_HEAD(&cgrp->pidlists); 1407 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1396 mutex_init(&cgrp->pidlist_mutex); 1409 mutex_init(&cgrp->pidlist_mutex);
1397 INIT_LIST_HEAD(&cgrp->event_list); 1410 INIT_LIST_HEAD(&cgrp->event_list);
1398 spin_lock_init(&cgrp->event_list_lock); 1411 spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 struct cgroupfs_root *existing_root; 1610 struct cgroupfs_root *existing_root;
1598 const struct cred *cred; 1611 const struct cred *cred;
1599 int i; 1612 int i;
1613 struct css_set *cg;
1600 1614
1601 BUG_ON(sb->s_root != NULL); 1615 BUG_ON(sb->s_root != NULL);
1602 1616
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1650 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1651 * the css_set objects */ 1665 * the css_set objects */
1652 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
1653 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1667 hash_for_each(css_set_table, i, cg, hlist)
1654 struct hlist_head *hhead = &css_set_table[i]; 1668 link_css_set(&tmp_cg_links, cg, root_cgrp);
1655 struct hlist_node *node;
1656 struct css_set *cg;
1657
1658 hlist_for_each_entry(cg, node, hhead, hlist)
1659 link_css_set(&tmp_cg_links, cg, root_cgrp);
1660 }
1661 write_unlock(&css_set_lock); 1669 write_unlock(&css_set_lock);
1662 1670
1663 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1773 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1774 "cgroup_path() called without proper locking"); 1782 "cgroup_path() called without proper locking");
1775 1783
1776 if (!dentry || cgrp == dummytop) { 1784 if (cgrp == dummytop) {
1777 /* 1785 /*
1778 * Inactive subsystems have no dentry for their root 1786 * Inactive subsystems have no dentry for their root
1779 * cgroup 1787 * cgroup
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1982 ss->attach(cgrp, &tset); 1990 ss->attach(cgrp, &tset);
1983 } 1991 }
1984 1992
1985 synchronize_rcu();
1986out: 1993out:
1987 if (retval) { 1994 if (retval) {
1988 for_each_subsys(root, ss) { 1995 for_each_subsys(root, ss) {
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2151 /* 2158 /*
2152 * step 5: success! and cleanup 2159 * step 5: success! and cleanup
2153 */ 2160 */
2154 synchronize_rcu();
2155 retval = 0; 2161 retval = 0;
2156out_put_css_set_refs: 2162out_put_css_set_refs:
2157 if (retval) { 2163 if (retval) {
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
2637 */ 2643 */
2638static inline struct cftype *__file_cft(struct file *file) 2644static inline struct cftype *__file_cft(struct file *file)
2639{ 2645{
2640 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2646 if (file_inode(file)->i_fop != &cgroup_file_operations)
2641 return ERR_PTR(-EINVAL); 2647 return ERR_PTR(-EINVAL);
2642 return __d_cft(file->f_dentry); 2648 return __d_cft(file->f_dentry);
2643} 2649}
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2769 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2770 continue; 2776 continue;
2771 2777
2772 if (is_add) 2778 if (is_add) {
2773 err = cgroup_add_file(cgrp, subsys, cft); 2779 err = cgroup_add_file(cgrp, subsys, cft);
2774 else 2780 if (err)
2775 err = cgroup_rm_file(cgrp, cft); 2781 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2776 if (err) { 2782 cft->name, err);
2777 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2778 is_add ? "add" : "remove", cft->name, err);
2779 ret = err; 2783 ret = err;
2784 } else {
2785 cgroup_rm_file(cgrp, cft);
2780 } 2786 }
2781 } 2787 }
2782 return ret; 2788 return ret;
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
3017} 3023}
3018EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3024EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3019 3025
3026/**
3027 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
3028 * @pos: cgroup of interest
3029 *
3030 * Return the rightmost descendant of @pos. If there's no descendant,
3031 * @pos is returned. This can be used during pre-order traversal to skip
3032 * subtree of @pos.
3033 */
3034struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3035{
3036 struct cgroup *last, *tmp;
3037
3038 WARN_ON_ONCE(!rcu_read_lock_held());
3039
3040 do {
3041 last = pos;
3042 /* ->prev isn't RCU safe, walk ->next till the end */
3043 pos = NULL;
3044 list_for_each_entry_rcu(tmp, &last->children, sibling)
3045 pos = tmp;
3046 } while (pos);
3047
3048 return last;
3049}
3050EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
3051
3020static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3052static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3021{ 3053{
3022 struct cgroup *last; 3054 struct cgroup *last;
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
3752 remove); 3784 remove);
3753 struct cgroup *cgrp = event->cgrp; 3785 struct cgroup *cgrp = event->cgrp;
3754 3786
3787 remove_wait_queue(event->wqh, &event->wait);
3788
3755 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3789 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3756 3790
3791 /* Notify userspace the event is going away. */
3792 eventfd_signal(event->eventfd, 1);
3793
3757 eventfd_ctx_put(event->eventfd); 3794 eventfd_ctx_put(event->eventfd);
3758 kfree(event); 3795 kfree(event);
3759 dput(cgrp->dentry); 3796 dput(cgrp->dentry);
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3773 unsigned long flags = (unsigned long)key; 3810 unsigned long flags = (unsigned long)key;
3774 3811
3775 if (flags & POLLHUP) { 3812 if (flags & POLLHUP) {
3776 __remove_wait_queue(event->wqh, &event->wait);
3777 spin_lock(&cgrp->event_list_lock);
3778 list_del_init(&event->list);
3779 spin_unlock(&cgrp->event_list_lock);
3780 /* 3813 /*
3781 * We are in atomic context, but cgroup_event_remove() may 3814 * If the event has been detached at cgroup removal, we
3782 * sleep, so we have to call it in workqueue. 3815 * can simply return knowing the other side will cleanup
3816 * for us.
3817 *
3818 * We can't race against event freeing since the other
3819 * side will require wqh->lock via remove_wait_queue(),
3820 * which we hold.
3783 */ 3821 */
3784 schedule_work(&event->remove); 3822 spin_lock(&cgrp->event_list_lock);
3823 if (!list_empty(&event->list)) {
3824 list_del_init(&event->list);
3825 /*
3826 * We are in atomic context, but cgroup_event_remove()
3827 * may sleep, so we have to call it in workqueue.
3828 */
3829 schedule_work(&event->remove);
3830 }
3831 spin_unlock(&cgrp->event_list_lock);
3785 } 3832 }
3786 3833
3787 return 0; 3834 return 0;
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3807 const char *buffer) 3854 const char *buffer)
3808{ 3855{
3809 struct cgroup_event *event = NULL; 3856 struct cgroup_event *event = NULL;
3857 struct cgroup *cgrp_cfile;
3810 unsigned int efd, cfd; 3858 unsigned int efd, cfd;
3811 struct file *efile = NULL; 3859 struct file *efile = NULL;
3812 struct file *cfile = NULL; 3860 struct file *cfile = NULL;
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3852 3900
3853 /* the process need read permission on control file */ 3901 /* the process need read permission on control file */
3854 /* AV: shouldn't we check that it's been opened for read instead? */ 3902 /* AV: shouldn't we check that it's been opened for read instead? */
3855 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); 3903 ret = inode_permission(file_inode(cfile), MAY_READ);
3856 if (ret < 0) 3904 if (ret < 0)
3857 goto fail; 3905 goto fail;
3858 3906
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3862 goto fail; 3910 goto fail;
3863 } 3911 }
3864 3912
3913 /*
3914 * The file to be monitored must be in the same cgroup as
3915 * cgroup.event_control is.
3916 */
3917 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
3918 if (cgrp_cfile != cgrp) {
3919 ret = -EINVAL;
3920 goto fail;
3921 }
3922
3865 if (!event->cft->register_event || !event->cft->unregister_event) { 3923 if (!event->cft->register_event || !event->cft->unregister_event) {
3866 ret = -EINVAL; 3924 ret = -EINVAL;
3867 goto fail; 3925 goto fail;
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4135 4193
4136 init_cgroup_housekeeping(cgrp); 4194 init_cgroup_housekeeping(cgrp);
4137 4195
4196 dentry->d_fsdata = cgrp;
4197 cgrp->dentry = dentry;
4198
4138 cgrp->parent = parent; 4199 cgrp->parent = parent;
4139 cgrp->root = parent->root; 4200 cgrp->root = parent->root;
4140 cgrp->top_cgroup = parent->top_cgroup; 4201 cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4172 lockdep_assert_held(&dentry->d_inode->i_mutex); 4233 lockdep_assert_held(&dentry->d_inode->i_mutex);
4173 4234
4174 /* allocation complete, commit to creation */ 4235 /* allocation complete, commit to creation */
4175 dentry->d_fsdata = cgrp;
4176 cgrp->dentry = dentry;
4177 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4236 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4178 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4179 root->number_of_cgroups++; 4238 root->number_of_cgroups++;
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 /* 4399 /*
4341 * Unregister events and notify userspace. 4400 * Unregister events and notify userspace.
4342 * Notify userspace about cgroup removing only after rmdir of cgroup 4401 * Notify userspace about cgroup removing only after rmdir of cgroup
4343 * directory to avoid race between userspace and kernelspace. Use 4402 * directory to avoid race between userspace and kernelspace.
4344 * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
4345 * cgroup_event_wake() is called with the wait queue head locked,
4346 * remove_wait_queue() cannot be called while holding event_list_lock.
4347 */ 4403 */
4348 spin_lock(&cgrp->event_list_lock); 4404 spin_lock(&cgrp->event_list_lock);
4349 list_splice_init(&cgrp->event_list, &tmp_list); 4405 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4350 spin_unlock(&cgrp->event_list_lock);
4351 list_for_each_entry_safe(event, tmp, &tmp_list, list) {
4352 list_del_init(&event->list); 4406 list_del_init(&event->list);
4353 remove_wait_queue(event->wqh, &event->wait);
4354 eventfd_signal(event->eventfd, 1);
4355 schedule_work(&event->remove); 4407 schedule_work(&event->remove);
4356 } 4408 }
4409 spin_unlock(&cgrp->event_list_lock);
4357 4410
4358 return 0; 4411 return 0;
4359} 4412}
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4438{ 4491{
4439 struct cgroup_subsys_state *css; 4492 struct cgroup_subsys_state *css;
4440 int i, ret; 4493 int i, ret;
4494 struct hlist_node *tmp;
4495 struct css_set *cg;
4496 unsigned long key;
4441 4497
4442 /* check name and function validity */ 4498 /* check name and function validity */
4443 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4499 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4503 * this is all done under the css_set_lock. 4559 * this is all done under the css_set_lock.
4504 */ 4560 */
4505 write_lock(&css_set_lock); 4561 write_lock(&css_set_lock);
4506 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4562 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
4507 struct css_set *cg; 4563 /* skip entries that we already rehashed */
4508 struct hlist_node *node, *tmp; 4564 if (cg->subsys[ss->subsys_id])
4509 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4565 continue;
4510 4566 /* remove existing entry */
4511 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4567 hash_del(&cg->hlist);
4512 /* skip entries that we already rehashed */ 4568 /* set new value */
4513 if (cg->subsys[ss->subsys_id]) 4569 cg->subsys[ss->subsys_id] = css;
4514 continue; 4570 /* recompute hash and restore entry */
4515 /* remove existing entry */ 4571 key = css_set_hash(cg->subsys);
4516 hlist_del(&cg->hlist); 4572 hash_add(css_set_table, &cg->hlist, key);
4517 /* set new value */
4518 cg->subsys[ss->subsys_id] = css;
4519 /* recompute hash and restore entry */
4520 new_bucket = css_set_hash(cg->subsys);
4521 hlist_add_head(&cg->hlist, new_bucket);
4522 }
4523 } 4573 }
4524 write_unlock(&css_set_lock); 4574 write_unlock(&css_set_lock);
4525 4575
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4551void cgroup_unload_subsys(struct cgroup_subsys *ss) 4601void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552{ 4602{
4553 struct cg_cgroup_link *link; 4603 struct cg_cgroup_link *link;
4554 struct hlist_head *hhead;
4555 4604
4556 BUG_ON(ss->module == NULL); 4605 BUG_ON(ss->module == NULL);
4557 4606
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4567 offline_css(ss, dummytop); 4616 offline_css(ss, dummytop);
4568 ss->active = 0; 4617 ss->active = 0;
4569 4618
4570 if (ss->use_id) { 4619 if (ss->use_id)
4571 idr_remove_all(&ss->idr);
4572 idr_destroy(&ss->idr); 4620 idr_destroy(&ss->idr);
4573 }
4574 4621
4575 /* deassign the subsys_id */ 4622 /* deassign the subsys_id */
4576 subsys[ss->subsys_id] = NULL; 4623 subsys[ss->subsys_id] = NULL;
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4585 write_lock(&css_set_lock); 4632 write_lock(&css_set_lock);
4586 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4633 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4587 struct css_set *cg = link->cg; 4634 struct css_set *cg = link->cg;
4635 unsigned long key;
4588 4636
4589 hlist_del(&cg->hlist); 4637 hash_del(&cg->hlist);
4590 cg->subsys[ss->subsys_id] = NULL; 4638 cg->subsys[ss->subsys_id] = NULL;
4591 hhead = css_set_hash(cg->subsys); 4639 key = css_set_hash(cg->subsys);
4592 hlist_add_head(&cg->hlist, hhead); 4640 hash_add(css_set_table, &cg->hlist, key);
4593 } 4641 }
4594 write_unlock(&css_set_lock); 4642 write_unlock(&css_set_lock);
4595 4643
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void)
4631 list_add(&init_css_set_link.cg_link_list, 4679 list_add(&init_css_set_link.cg_link_list,
4632 &init_css_set.cg_links); 4680 &init_css_set.cg_links);
4633 4681
4634 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4635 INIT_HLIST_HEAD(&css_set_table[i]);
4636
4637 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4682 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4638 struct cgroup_subsys *ss = subsys[i]; 4683 struct cgroup_subsys *ss = subsys[i];
4639 4684
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void)
4667{ 4712{
4668 int err; 4713 int err;
4669 int i; 4714 int i;
4670 struct hlist_head *hhead; 4715 unsigned long key;
4671 4716
4672 err = bdi_init(&cgroup_backing_dev_info); 4717 err = bdi_init(&cgroup_backing_dev_info);
4673 if (err) 4718 if (err)
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void)
4686 } 4731 }
4687 4732
4688 /* Add init_css_set to the hash table */ 4733 /* Add init_css_set to the hash table */
4689 hhead = css_set_hash(init_css_set.subsys); 4734 key = css_set_hash(init_css_set.subsys);
4690 hlist_add_head(&init_css_set.hlist, hhead); 4735 hash_add(css_set_table, &init_css_set.hlist, key);
4691 BUG_ON(!init_root_id(&rootnode)); 4736 BUG_ON(!init_root_id(&rootnode));
4692 4737
4693 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4738 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4982 } 5027 }
4983 task_unlock(tsk); 5028 task_unlock(tsk);
4984 5029
4985 if (cg) 5030 put_css_set_taskexit(cg);
4986 put_css_set_taskexit(cg);
4987} 5031}
4988 5032
4989/** 5033/**
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
5274static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5318static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5275{ 5319{
5276 struct css_id *newid; 5320 struct css_id *newid;
5277 int myid, error, size; 5321 int ret, size;
5278 5322
5279 BUG_ON(!ss->use_id); 5323 BUG_ON(!ss->use_id);
5280 5324
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5282 newid = kzalloc(size, GFP_KERNEL); 5326 newid = kzalloc(size, GFP_KERNEL);
5283 if (!newid) 5327 if (!newid)
5284 return ERR_PTR(-ENOMEM); 5328 return ERR_PTR(-ENOMEM);
5285 /* get id */ 5329
5286 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 5330 idr_preload(GFP_KERNEL);
5287 error = -ENOMEM;
5288 goto err_out;
5289 }
5290 spin_lock(&ss->id_lock); 5331 spin_lock(&ss->id_lock);
5291 /* Don't use 0. allocates an ID of 1-65535 */ 5332 /* Don't use 0. allocates an ID of 1-65535 */
5292 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5333 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5293 spin_unlock(&ss->id_lock); 5334 spin_unlock(&ss->id_lock);
5335 idr_preload_end();
5294 5336
5295 /* Returns error when there are no free spaces for new ID.*/ 5337 /* Returns error when there are no free spaces for new ID.*/
5296 if (error) { 5338 if (ret < 0)
5297 error = -ENOSPC;
5298 goto err_out; 5339 goto err_out;
5299 }
5300 if (myid > CSS_ID_MAX)
5301 goto remove_idr;
5302 5340
5303 newid->id = myid; 5341 newid->id = ret;
5304 newid->depth = depth; 5342 newid->depth = depth;
5305 return newid; 5343 return newid;
5306remove_idr:
5307 error = -ENOSPC;
5308 spin_lock(&ss->id_lock);
5309 idr_remove(&ss->idr, myid);
5310 spin_unlock(&ss->id_lock);
5311err_out: 5344err_out:
5312 kfree(newid); 5345 kfree(newid);
5313 return ERR_PTR(error); 5346 return ERR_PTR(ret);
5314 5347
5315} 5348}
5316 5349
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5441 struct inode *inode; 5474 struct inode *inode;
5442 struct cgroup_subsys_state *css; 5475 struct cgroup_subsys_state *css;
5443 5476
5444 inode = f->f_dentry->d_inode; 5477 inode = file_inode(f);
5445 /* check in cgroup filesystem dir */ 5478 /* check in cgroup filesystem dir */
5446 if (inode->i_op != &cgroup_dir_inode_operations) 5479 if (inode->i_op != &cgroup_dir_inode_operations)
5447 return ERR_PTR(-EBADF); 5480 return ERR_PTR(-EBADF);