diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 325 |
1 files changed, 179 insertions, 146 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4855892798fd..a32f9432666c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -52,7 +52,7 @@ | |||
52 | #include <linux/module.h> | 52 | #include <linux/module.h> |
53 | #include <linux/delayacct.h> | 53 | #include <linux/delayacct.h> |
54 | #include <linux/cgroupstats.h> | 54 | #include <linux/cgroupstats.h> |
55 | #include <linux/hash.h> | 55 | #include <linux/hashtable.h> |
56 | #include <linux/namei.h> | 56 | #include <linux/namei.h> |
57 | #include <linux/pid_namespace.h> | 57 | #include <linux/pid_namespace.h> |
58 | #include <linux/idr.h> | 58 | #include <linux/idr.h> |
@@ -376,22 +376,18 @@ static int css_set_count; | |||
376 | * account cgroups in empty hierarchies. | 376 | * account cgroups in empty hierarchies. |
377 | */ | 377 | */ |
378 | #define CSS_SET_HASH_BITS 7 | 378 | #define CSS_SET_HASH_BITS 7 |
379 | #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) | 379 | static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); |
380 | static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; | ||
381 | 380 | ||
382 | static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | 381 | static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) |
383 | { | 382 | { |
384 | int i; | 383 | int i; |
385 | int index; | 384 | unsigned long key = 0UL; |
386 | unsigned long tmp = 0UL; | ||
387 | 385 | ||
388 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) | 386 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) |
389 | tmp += (unsigned long)css[i]; | 387 | key += (unsigned long)css[i]; |
390 | tmp = (tmp >> 16) ^ tmp; | 388 | key = (key >> 16) ^ key; |
391 | 389 | ||
392 | index = hash_long(tmp, CSS_SET_HASH_BITS); | 390 | return key; |
393 | |||
394 | return &css_set_table[index]; | ||
395 | } | 391 | } |
396 | 392 | ||
397 | /* We don't maintain the lists running through each css_set to its | 393 | /* We don't maintain the lists running through each css_set to its |
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
418 | } | 414 | } |
419 | 415 | ||
420 | /* This css_set is dead. unlink it and release cgroup refcounts */ | 416 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
421 | hlist_del(&cg->hlist); | 417 | hash_del(&cg->hlist); |
422 | css_set_count--; | 418 | css_set_count--; |
423 | 419 | ||
424 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, | 420 | list_for_each_entry_safe(link, saved_link, &cg->cg_links, |
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
426 | struct cgroup *cgrp = link->cgrp; | 422 | struct cgroup *cgrp = link->cgrp; |
427 | list_del(&link->cg_link_list); | 423 | list_del(&link->cg_link_list); |
428 | list_del(&link->cgrp_link_list); | 424 | list_del(&link->cgrp_link_list); |
425 | |||
426 | /* | ||
427 | * We may not be holding cgroup_mutex, and if cgrp->count is | ||
428 | * dropped to 0 the cgroup can be destroyed at any time, hence | ||
429 | * rcu_read_lock is used to keep it alive. | ||
430 | */ | ||
431 | rcu_read_lock(); | ||
429 | if (atomic_dec_and_test(&cgrp->count) && | 432 | if (atomic_dec_and_test(&cgrp->count) && |
430 | notify_on_release(cgrp)) { | 433 | notify_on_release(cgrp)) { |
431 | if (taskexit) | 434 | if (taskexit) |
432 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 435 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
433 | check_for_release(cgrp); | 436 | check_for_release(cgrp); |
434 | } | 437 | } |
438 | rcu_read_unlock(); | ||
435 | 439 | ||
436 | kfree(link); | 440 | kfree(link); |
437 | } | 441 | } |
@@ -550,9 +554,8 @@ static struct css_set *find_existing_css_set( | |||
550 | { | 554 | { |
551 | int i; | 555 | int i; |
552 | struct cgroupfs_root *root = cgrp->root; | 556 | struct cgroupfs_root *root = cgrp->root; |
553 | struct hlist_head *hhead; | ||
554 | struct hlist_node *node; | ||
555 | struct css_set *cg; | 557 | struct css_set *cg; |
558 | unsigned long key; | ||
556 | 559 | ||
557 | /* | 560 | /* |
558 | * Build the set of subsystem state objects that we want to see in the | 561 | * Build the set of subsystem state objects that we want to see in the |
@@ -572,8 +575,8 @@ static struct css_set *find_existing_css_set( | |||
572 | } | 575 | } |
573 | } | 576 | } |
574 | 577 | ||
575 | hhead = css_set_hash(template); | 578 | key = css_set_hash(template); |
576 | hlist_for_each_entry(cg, node, hhead, hlist) { | 579 | hash_for_each_possible(css_set_table, cg, hlist, key) { |
577 | if (!compare_css_sets(cg, oldcg, cgrp, template)) | 580 | if (!compare_css_sets(cg, oldcg, cgrp, template)) |
578 | continue; | 581 | continue; |
579 | 582 | ||
@@ -657,8 +660,8 @@ static struct css_set *find_css_set( | |||
657 | 660 | ||
658 | struct list_head tmp_cg_links; | 661 | struct list_head tmp_cg_links; |
659 | 662 | ||
660 | struct hlist_head *hhead; | ||
661 | struct cg_cgroup_link *link; | 663 | struct cg_cgroup_link *link; |
664 | unsigned long key; | ||
662 | 665 | ||
663 | /* First see if we already have a cgroup group that matches | 666 | /* First see if we already have a cgroup group that matches |
664 | * the desired set */ | 667 | * the desired set */ |
@@ -704,8 +707,8 @@ static struct css_set *find_css_set( | |||
704 | css_set_count++; | 707 | css_set_count++; |
705 | 708 | ||
706 | /* Add this cgroup group to the hash table */ | 709 | /* Add this cgroup group to the hash table */ |
707 | hhead = css_set_hash(res->subsys); | 710 | key = css_set_hash(res->subsys); |
708 | hlist_add_head(&res->hlist, hhead); | 711 | hash_add(css_set_table, &res->hlist, key); |
709 | 712 | ||
710 | write_unlock(&css_set_lock); | 713 | write_unlock(&css_set_lock); |
711 | 714 | ||
@@ -856,47 +859,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
856 | return inode; | 859 | return inode; |
857 | } | 860 | } |
858 | 861 | ||
859 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 862 | static void cgroup_free_fn(struct work_struct *work) |
860 | { | 863 | { |
861 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 864 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); |
862 | if (S_ISDIR(inode->i_mode)) { | 865 | struct cgroup_subsys *ss; |
863 | struct cgroup *cgrp = dentry->d_fsdata; | ||
864 | struct cgroup_subsys *ss; | ||
865 | BUG_ON(!(cgroup_is_removed(cgrp))); | ||
866 | /* It's possible for external users to be holding css | ||
867 | * reference counts on a cgroup; css_put() needs to | ||
868 | * be able to access the cgroup after decrementing | ||
869 | * the reference count in order to know if it needs to | ||
870 | * queue the cgroup to be handled by the release | ||
871 | * agent */ | ||
872 | synchronize_rcu(); | ||
873 | 866 | ||
874 | mutex_lock(&cgroup_mutex); | 867 | mutex_lock(&cgroup_mutex); |
875 | /* | 868 | /* |
876 | * Release the subsystem state objects. | 869 | * Release the subsystem state objects. |
877 | */ | 870 | */ |
878 | for_each_subsys(cgrp->root, ss) | 871 | for_each_subsys(cgrp->root, ss) |
879 | ss->css_free(cgrp); | 872 | ss->css_free(cgrp); |
880 | 873 | ||
881 | cgrp->root->number_of_cgroups--; | 874 | cgrp->root->number_of_cgroups--; |
882 | mutex_unlock(&cgroup_mutex); | 875 | mutex_unlock(&cgroup_mutex); |
883 | 876 | ||
884 | /* | 877 | /* |
885 | * Drop the active superblock reference that we took when we | 878 | * Drop the active superblock reference that we took when we |
886 | * created the cgroup | 879 | * created the cgroup |
887 | */ | 880 | */ |
888 | deactivate_super(cgrp->root->sb); | 881 | deactivate_super(cgrp->root->sb); |
889 | 882 | ||
890 | /* | 883 | /* |
891 | * if we're getting rid of the cgroup, refcount should ensure | 884 | * if we're getting rid of the cgroup, refcount should ensure |
892 | * that there are no pidlists left. | 885 | * that there are no pidlists left. |
893 | */ | 886 | */ |
894 | BUG_ON(!list_empty(&cgrp->pidlists)); | 887 | BUG_ON(!list_empty(&cgrp->pidlists)); |
888 | |||
889 | simple_xattrs_free(&cgrp->xattrs); | ||
890 | |||
891 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
892 | kfree(cgrp); | ||
893 | } | ||
895 | 894 | ||
896 | simple_xattrs_free(&cgrp->xattrs); | 895 | static void cgroup_free_rcu(struct rcu_head *head) |
896 | { | ||
897 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | ||
898 | |||
899 | schedule_work(&cgrp->free_work); | ||
900 | } | ||
901 | |||
902 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | ||
903 | { | ||
904 | /* is dentry a directory ? if so, kfree() associated cgroup */ | ||
905 | if (S_ISDIR(inode->i_mode)) { | ||
906 | struct cgroup *cgrp = dentry->d_fsdata; | ||
897 | 907 | ||
898 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | 908 | BUG_ON(!(cgroup_is_removed(cgrp))); |
899 | kfree_rcu(cgrp, rcu_head); | 909 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); |
900 | } else { | 910 | } else { |
901 | struct cfent *cfe = __d_cfe(dentry); | 911 | struct cfent *cfe = __d_cfe(dentry); |
902 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | 912 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; |
@@ -925,13 +935,17 @@ static void remove_dir(struct dentry *d) | |||
925 | dput(parent); | 935 | dput(parent); |
926 | } | 936 | } |
927 | 937 | ||
928 | static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | 938 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
929 | { | 939 | { |
930 | struct cfent *cfe; | 940 | struct cfent *cfe; |
931 | 941 | ||
932 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | 942 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); |
933 | lockdep_assert_held(&cgroup_mutex); | 943 | lockdep_assert_held(&cgroup_mutex); |
934 | 944 | ||
945 | /* | ||
946 | * If we're doing cleanup due to failure of cgroup_create(), | ||
947 | * the corresponding @cfe may not exist. | ||
948 | */ | ||
935 | list_for_each_entry(cfe, &cgrp->files, node) { | 949 | list_for_each_entry(cfe, &cgrp->files, node) { |
936 | struct dentry *d = cfe->dentry; | 950 | struct dentry *d = cfe->dentry; |
937 | 951 | ||
@@ -944,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
944 | list_del_init(&cfe->node); | 958 | list_del_init(&cfe->node); |
945 | dput(d); | 959 | dput(d); |
946 | 960 | ||
947 | return 0; | 961 | break; |
948 | } | 962 | } |
949 | return -ENOENT; | ||
950 | } | 963 | } |
951 | 964 | ||
952 | /** | 965 | /** |
@@ -1083,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1083 | } | 1096 | } |
1084 | } | 1097 | } |
1085 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; | 1098 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; |
1086 | synchronize_rcu(); | ||
1087 | 1099 | ||
1088 | return 0; | 1100 | return 0; |
1089 | } | 1101 | } |
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1393 | INIT_LIST_HEAD(&cgrp->allcg_node); | 1405 | INIT_LIST_HEAD(&cgrp->allcg_node); |
1394 | INIT_LIST_HEAD(&cgrp->release_list); | 1406 | INIT_LIST_HEAD(&cgrp->release_list); |
1395 | INIT_LIST_HEAD(&cgrp->pidlists); | 1407 | INIT_LIST_HEAD(&cgrp->pidlists); |
1408 | INIT_WORK(&cgrp->free_work, cgroup_free_fn); | ||
1396 | mutex_init(&cgrp->pidlist_mutex); | 1409 | mutex_init(&cgrp->pidlist_mutex); |
1397 | INIT_LIST_HEAD(&cgrp->event_list); | 1410 | INIT_LIST_HEAD(&cgrp->event_list); |
1398 | spin_lock_init(&cgrp->event_list_lock); | 1411 | spin_lock_init(&cgrp->event_list_lock); |
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1597 | struct cgroupfs_root *existing_root; | 1610 | struct cgroupfs_root *existing_root; |
1598 | const struct cred *cred; | 1611 | const struct cred *cred; |
1599 | int i; | 1612 | int i; |
1613 | struct css_set *cg; | ||
1600 | 1614 | ||
1601 | BUG_ON(sb->s_root != NULL); | 1615 | BUG_ON(sb->s_root != NULL); |
1602 | 1616 | ||
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1650 | /* Link the top cgroup in this hierarchy into all | 1664 | /* Link the top cgroup in this hierarchy into all |
1651 | * the css_set objects */ | 1665 | * the css_set objects */ |
1652 | write_lock(&css_set_lock); | 1666 | write_lock(&css_set_lock); |
1653 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 1667 | hash_for_each(css_set_table, i, cg, hlist) |
1654 | struct hlist_head *hhead = &css_set_table[i]; | 1668 | link_css_set(&tmp_cg_links, cg, root_cgrp); |
1655 | struct hlist_node *node; | ||
1656 | struct css_set *cg; | ||
1657 | |||
1658 | hlist_for_each_entry(cg, node, hhead, hlist) | ||
1659 | link_css_set(&tmp_cg_links, cg, root_cgrp); | ||
1660 | } | ||
1661 | write_unlock(&css_set_lock); | 1669 | write_unlock(&css_set_lock); |
1662 | 1670 | ||
1663 | free_cg_links(&tmp_cg_links); | 1671 | free_cg_links(&tmp_cg_links); |
@@ -1773,7 +1781,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1773 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), | 1781 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |
1774 | "cgroup_path() called without proper locking"); | 1782 | "cgroup_path() called without proper locking"); |
1775 | 1783 | ||
1776 | if (!dentry || cgrp == dummytop) { | 1784 | if (cgrp == dummytop) { |
1777 | /* | 1785 | /* |
1778 | * Inactive subsystems have no dentry for their root | 1786 | * Inactive subsystems have no dentry for their root |
1779 | * cgroup | 1787 | * cgroup |
@@ -1982,7 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1982 | ss->attach(cgrp, &tset); | 1990 | ss->attach(cgrp, &tset); |
1983 | } | 1991 | } |
1984 | 1992 | ||
1985 | synchronize_rcu(); | ||
1986 | out: | 1993 | out: |
1987 | if (retval) { | 1994 | if (retval) { |
1988 | for_each_subsys(root, ss) { | 1995 | for_each_subsys(root, ss) { |
@@ -2151,7 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2151 | /* | 2158 | /* |
2152 | * step 5: success! and cleanup | 2159 | * step 5: success! and cleanup |
2153 | */ | 2160 | */ |
2154 | synchronize_rcu(); | ||
2155 | retval = 0; | 2161 | retval = 0; |
2156 | out_put_css_set_refs: | 2162 | out_put_css_set_refs: |
2157 | if (retval) { | 2163 | if (retval) { |
@@ -2637,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un | |||
2637 | */ | 2643 | */ |
2638 | static inline struct cftype *__file_cft(struct file *file) | 2644 | static inline struct cftype *__file_cft(struct file *file) |
2639 | { | 2645 | { |
2640 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) | 2646 | if (file_inode(file)->i_fop != &cgroup_file_operations) |
2641 | return ERR_PTR(-EINVAL); | 2647 | return ERR_PTR(-EINVAL); |
2642 | return __d_cft(file->f_dentry); | 2648 | return __d_cft(file->f_dentry); |
2643 | } | 2649 | } |
@@ -2769,14 +2775,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2769 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2775 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) |
2770 | continue; | 2776 | continue; |
2771 | 2777 | ||
2772 | if (is_add) | 2778 | if (is_add) { |
2773 | err = cgroup_add_file(cgrp, subsys, cft); | 2779 | err = cgroup_add_file(cgrp, subsys, cft); |
2774 | else | 2780 | if (err) |
2775 | err = cgroup_rm_file(cgrp, cft); | 2781 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
2776 | if (err) { | 2782 | cft->name, err); |
2777 | pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", | ||
2778 | is_add ? "add" : "remove", cft->name, err); | ||
2779 | ret = err; | 2783 | ret = err; |
2784 | } else { | ||
2785 | cgroup_rm_file(cgrp, cft); | ||
2780 | } | 2786 | } |
2781 | } | 2787 | } |
2782 | return ret; | 2788 | return ret; |
@@ -3017,6 +3023,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
3017 | } | 3023 | } |
3018 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3024 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); |
3019 | 3025 | ||
3026 | /** | ||
3027 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | ||
3028 | * @pos: cgroup of interest | ||
3029 | * | ||
3030 | * Return the rightmost descendant of @pos. If there's no descendant, | ||
3031 | * @pos is returned. This can be used during pre-order traversal to skip | ||
3032 | * subtree of @pos. | ||
3033 | */ | ||
3034 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | ||
3035 | { | ||
3036 | struct cgroup *last, *tmp; | ||
3037 | |||
3038 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3039 | |||
3040 | do { | ||
3041 | last = pos; | ||
3042 | /* ->prev isn't RCU safe, walk ->next till the end */ | ||
3043 | pos = NULL; | ||
3044 | list_for_each_entry_rcu(tmp, &last->children, sibling) | ||
3045 | pos = tmp; | ||
3046 | } while (pos); | ||
3047 | |||
3048 | return last; | ||
3049 | } | ||
3050 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | ||
3051 | |||
3020 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3052 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) |
3021 | { | 3053 | { |
3022 | struct cgroup *last; | 3054 | struct cgroup *last; |
@@ -3752,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3752 | remove); | 3784 | remove); |
3753 | struct cgroup *cgrp = event->cgrp; | 3785 | struct cgroup *cgrp = event->cgrp; |
3754 | 3786 | ||
3787 | remove_wait_queue(event->wqh, &event->wait); | ||
3788 | |||
3755 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3789 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); |
3756 | 3790 | ||
3791 | /* Notify userspace the event is going away. */ | ||
3792 | eventfd_signal(event->eventfd, 1); | ||
3793 | |||
3757 | eventfd_ctx_put(event->eventfd); | 3794 | eventfd_ctx_put(event->eventfd); |
3758 | kfree(event); | 3795 | kfree(event); |
3759 | dput(cgrp->dentry); | 3796 | dput(cgrp->dentry); |
@@ -3773,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3773 | unsigned long flags = (unsigned long)key; | 3810 | unsigned long flags = (unsigned long)key; |
3774 | 3811 | ||
3775 | if (flags & POLLHUP) { | 3812 | if (flags & POLLHUP) { |
3776 | __remove_wait_queue(event->wqh, &event->wait); | ||
3777 | spin_lock(&cgrp->event_list_lock); | ||
3778 | list_del_init(&event->list); | ||
3779 | spin_unlock(&cgrp->event_list_lock); | ||
3780 | /* | 3813 | /* |
3781 | * We are in atomic context, but cgroup_event_remove() may | 3814 | * If the event has been detached at cgroup removal, we |
3782 | * sleep, so we have to call it in workqueue. | 3815 | * can simply return knowing the other side will cleanup |
3816 | * for us. | ||
3817 | * | ||
3818 | * We can't race against event freeing since the other | ||
3819 | * side will require wqh->lock via remove_wait_queue(), | ||
3820 | * which we hold. | ||
3783 | */ | 3821 | */ |
3784 | schedule_work(&event->remove); | 3822 | spin_lock(&cgrp->event_list_lock); |
3823 | if (!list_empty(&event->list)) { | ||
3824 | list_del_init(&event->list); | ||
3825 | /* | ||
3826 | * We are in atomic context, but cgroup_event_remove() | ||
3827 | * may sleep, so we have to call it in workqueue. | ||
3828 | */ | ||
3829 | schedule_work(&event->remove); | ||
3830 | } | ||
3831 | spin_unlock(&cgrp->event_list_lock); | ||
3785 | } | 3832 | } |
3786 | 3833 | ||
3787 | return 0; | 3834 | return 0; |
@@ -3807,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3807 | const char *buffer) | 3854 | const char *buffer) |
3808 | { | 3855 | { |
3809 | struct cgroup_event *event = NULL; | 3856 | struct cgroup_event *event = NULL; |
3857 | struct cgroup *cgrp_cfile; | ||
3810 | unsigned int efd, cfd; | 3858 | unsigned int efd, cfd; |
3811 | struct file *efile = NULL; | 3859 | struct file *efile = NULL; |
3812 | struct file *cfile = NULL; | 3860 | struct file *cfile = NULL; |
@@ -3852,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3852 | 3900 | ||
3853 | /* the process need read permission on control file */ | 3901 | /* the process need read permission on control file */ |
3854 | /* AV: shouldn't we check that it's been opened for read instead? */ | 3902 | /* AV: shouldn't we check that it's been opened for read instead? */ |
3855 | ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); | 3903 | ret = inode_permission(file_inode(cfile), MAY_READ); |
3856 | if (ret < 0) | 3904 | if (ret < 0) |
3857 | goto fail; | 3905 | goto fail; |
3858 | 3906 | ||
@@ -3862,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3862 | goto fail; | 3910 | goto fail; |
3863 | } | 3911 | } |
3864 | 3912 | ||
3913 | /* | ||
3914 | * The file to be monitored must be in the same cgroup as | ||
3915 | * cgroup.event_control is. | ||
3916 | */ | ||
3917 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | ||
3918 | if (cgrp_cfile != cgrp) { | ||
3919 | ret = -EINVAL; | ||
3920 | goto fail; | ||
3921 | } | ||
3922 | |||
3865 | if (!event->cft->register_event || !event->cft->unregister_event) { | 3923 | if (!event->cft->register_event || !event->cft->unregister_event) { |
3866 | ret = -EINVAL; | 3924 | ret = -EINVAL; |
3867 | goto fail; | 3925 | goto fail; |
@@ -4135,6 +4193,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4135 | 4193 | ||
4136 | init_cgroup_housekeeping(cgrp); | 4194 | init_cgroup_housekeeping(cgrp); |
4137 | 4195 | ||
4196 | dentry->d_fsdata = cgrp; | ||
4197 | cgrp->dentry = dentry; | ||
4198 | |||
4138 | cgrp->parent = parent; | 4199 | cgrp->parent = parent; |
4139 | cgrp->root = parent->root; | 4200 | cgrp->root = parent->root; |
4140 | cgrp->top_cgroup = parent->top_cgroup; | 4201 | cgrp->top_cgroup = parent->top_cgroup; |
@@ -4172,8 +4233,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4172 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4233 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
4173 | 4234 | ||
4174 | /* allocation complete, commit to creation */ | 4235 | /* allocation complete, commit to creation */ |
4175 | dentry->d_fsdata = cgrp; | ||
4176 | cgrp->dentry = dentry; | ||
4177 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 4236 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); |
4178 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4237 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4179 | root->number_of_cgroups++; | 4238 | root->number_of_cgroups++; |
@@ -4340,20 +4399,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4340 | /* | 4399 | /* |
4341 | * Unregister events and notify userspace. | 4400 | * Unregister events and notify userspace. |
4342 | * Notify userspace about cgroup removing only after rmdir of cgroup | 4401 | * Notify userspace about cgroup removing only after rmdir of cgroup |
4343 | * directory to avoid race between userspace and kernelspace. Use | 4402 | * directory to avoid race between userspace and kernelspace. |
4344 | * a temporary list to avoid a deadlock with cgroup_event_wake(). Since | ||
4345 | * cgroup_event_wake() is called with the wait queue head locked, | ||
4346 | * remove_wait_queue() cannot be called while holding event_list_lock. | ||
4347 | */ | 4403 | */ |
4348 | spin_lock(&cgrp->event_list_lock); | 4404 | spin_lock(&cgrp->event_list_lock); |
4349 | list_splice_init(&cgrp->event_list, &tmp_list); | 4405 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { |
4350 | spin_unlock(&cgrp->event_list_lock); | ||
4351 | list_for_each_entry_safe(event, tmp, &tmp_list, list) { | ||
4352 | list_del_init(&event->list); | 4406 | list_del_init(&event->list); |
4353 | remove_wait_queue(event->wqh, &event->wait); | ||
4354 | eventfd_signal(event->eventfd, 1); | ||
4355 | schedule_work(&event->remove); | 4407 | schedule_work(&event->remove); |
4356 | } | 4408 | } |
4409 | spin_unlock(&cgrp->event_list_lock); | ||
4357 | 4410 | ||
4358 | return 0; | 4411 | return 0; |
4359 | } | 4412 | } |
@@ -4438,6 +4491,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4438 | { | 4491 | { |
4439 | struct cgroup_subsys_state *css; | 4492 | struct cgroup_subsys_state *css; |
4440 | int i, ret; | 4493 | int i, ret; |
4494 | struct hlist_node *tmp; | ||
4495 | struct css_set *cg; | ||
4496 | unsigned long key; | ||
4441 | 4497 | ||
4442 | /* check name and function validity */ | 4498 | /* check name and function validity */ |
4443 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | 4499 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || |
@@ -4503,23 +4559,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4503 | * this is all done under the css_set_lock. | 4559 | * this is all done under the css_set_lock. |
4504 | */ | 4560 | */ |
4505 | write_lock(&css_set_lock); | 4561 | write_lock(&css_set_lock); |
4506 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | 4562 | hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { |
4507 | struct css_set *cg; | 4563 | /* skip entries that we already rehashed */ |
4508 | struct hlist_node *node, *tmp; | 4564 | if (cg->subsys[ss->subsys_id]) |
4509 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; | 4565 | continue; |
4510 | 4566 | /* remove existing entry */ | |
4511 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { | 4567 | hash_del(&cg->hlist); |
4512 | /* skip entries that we already rehashed */ | 4568 | /* set new value */ |
4513 | if (cg->subsys[ss->subsys_id]) | 4569 | cg->subsys[ss->subsys_id] = css; |
4514 | continue; | 4570 | /* recompute hash and restore entry */ |
4515 | /* remove existing entry */ | 4571 | key = css_set_hash(cg->subsys); |
4516 | hlist_del(&cg->hlist); | 4572 | hash_add(css_set_table, &cg->hlist, key); |
4517 | /* set new value */ | ||
4518 | cg->subsys[ss->subsys_id] = css; | ||
4519 | /* recompute hash and restore entry */ | ||
4520 | new_bucket = css_set_hash(cg->subsys); | ||
4521 | hlist_add_head(&cg->hlist, new_bucket); | ||
4522 | } | ||
4523 | } | 4573 | } |
4524 | write_unlock(&css_set_lock); | 4574 | write_unlock(&css_set_lock); |
4525 | 4575 | ||
@@ -4551,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
4551 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4601 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4552 | { | 4602 | { |
4553 | struct cg_cgroup_link *link; | 4603 | struct cg_cgroup_link *link; |
4554 | struct hlist_head *hhead; | ||
4555 | 4604 | ||
4556 | BUG_ON(ss->module == NULL); | 4605 | BUG_ON(ss->module == NULL); |
4557 | 4606 | ||
@@ -4567,10 +4616,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4567 | offline_css(ss, dummytop); | 4616 | offline_css(ss, dummytop); |
4568 | ss->active = 0; | 4617 | ss->active = 0; |
4569 | 4618 | ||
4570 | if (ss->use_id) { | 4619 | if (ss->use_id) |
4571 | idr_remove_all(&ss->idr); | ||
4572 | idr_destroy(&ss->idr); | 4620 | idr_destroy(&ss->idr); |
4573 | } | ||
4574 | 4621 | ||
4575 | /* deassign the subsys_id */ | 4622 | /* deassign the subsys_id */ |
4576 | subsys[ss->subsys_id] = NULL; | 4623 | subsys[ss->subsys_id] = NULL; |
@@ -4585,11 +4632,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4585 | write_lock(&css_set_lock); | 4632 | write_lock(&css_set_lock); |
4586 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | 4633 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { |
4587 | struct css_set *cg = link->cg; | 4634 | struct css_set *cg = link->cg; |
4635 | unsigned long key; | ||
4588 | 4636 | ||
4589 | hlist_del(&cg->hlist); | 4637 | hash_del(&cg->hlist); |
4590 | cg->subsys[ss->subsys_id] = NULL; | 4638 | cg->subsys[ss->subsys_id] = NULL; |
4591 | hhead = css_set_hash(cg->subsys); | 4639 | key = css_set_hash(cg->subsys); |
4592 | hlist_add_head(&cg->hlist, hhead); | 4640 | hash_add(css_set_table, &cg->hlist, key); |
4593 | } | 4641 | } |
4594 | write_unlock(&css_set_lock); | 4642 | write_unlock(&css_set_lock); |
4595 | 4643 | ||
@@ -4631,9 +4679,6 @@ int __init cgroup_init_early(void) | |||
4631 | list_add(&init_css_set_link.cg_link_list, | 4679 | list_add(&init_css_set_link.cg_link_list, |
4632 | &init_css_set.cg_links); | 4680 | &init_css_set.cg_links); |
4633 | 4681 | ||
4634 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | ||
4635 | INIT_HLIST_HEAD(&css_set_table[i]); | ||
4636 | |||
4637 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4682 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
4638 | struct cgroup_subsys *ss = subsys[i]; | 4683 | struct cgroup_subsys *ss = subsys[i]; |
4639 | 4684 | ||
@@ -4667,7 +4712,7 @@ int __init cgroup_init(void) | |||
4667 | { | 4712 | { |
4668 | int err; | 4713 | int err; |
4669 | int i; | 4714 | int i; |
4670 | struct hlist_head *hhead; | 4715 | unsigned long key; |
4671 | 4716 | ||
4672 | err = bdi_init(&cgroup_backing_dev_info); | 4717 | err = bdi_init(&cgroup_backing_dev_info); |
4673 | if (err) | 4718 | if (err) |
@@ -4686,8 +4731,8 @@ int __init cgroup_init(void) | |||
4686 | } | 4731 | } |
4687 | 4732 | ||
4688 | /* Add init_css_set to the hash table */ | 4733 | /* Add init_css_set to the hash table */ |
4689 | hhead = css_set_hash(init_css_set.subsys); | 4734 | key = css_set_hash(init_css_set.subsys); |
4690 | hlist_add_head(&init_css_set.hlist, hhead); | 4735 | hash_add(css_set_table, &init_css_set.hlist, key); |
4691 | BUG_ON(!init_root_id(&rootnode)); | 4736 | BUG_ON(!init_root_id(&rootnode)); |
4692 | 4737 | ||
4693 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4738 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
@@ -4982,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4982 | } | 5027 | } |
4983 | task_unlock(tsk); | 5028 | task_unlock(tsk); |
4984 | 5029 | ||
4985 | if (cg) | 5030 | put_css_set_taskexit(cg); |
4986 | put_css_set_taskexit(cg); | ||
4987 | } | 5031 | } |
4988 | 5032 | ||
4989 | /** | 5033 | /** |
@@ -5274,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id); | |||
5274 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | 5318 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) |
5275 | { | 5319 | { |
5276 | struct css_id *newid; | 5320 | struct css_id *newid; |
5277 | int myid, error, size; | 5321 | int ret, size; |
5278 | 5322 | ||
5279 | BUG_ON(!ss->use_id); | 5323 | BUG_ON(!ss->use_id); |
5280 | 5324 | ||
@@ -5282,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
5282 | newid = kzalloc(size, GFP_KERNEL); | 5326 | newid = kzalloc(size, GFP_KERNEL); |
5283 | if (!newid) | 5327 | if (!newid) |
5284 | return ERR_PTR(-ENOMEM); | 5328 | return ERR_PTR(-ENOMEM); |
5285 | /* get id */ | 5329 | |
5286 | if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { | 5330 | idr_preload(GFP_KERNEL); |
5287 | error = -ENOMEM; | ||
5288 | goto err_out; | ||
5289 | } | ||
5290 | spin_lock(&ss->id_lock); | 5331 | spin_lock(&ss->id_lock); |
5291 | /* Don't use 0. allocates an ID of 1-65535 */ | 5332 | /* Don't use 0. allocates an ID of 1-65535 */ |
5292 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 5333 | ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); |
5293 | spin_unlock(&ss->id_lock); | 5334 | spin_unlock(&ss->id_lock); |
5335 | idr_preload_end(); | ||
5294 | 5336 | ||
5295 | /* Returns error when there are no free spaces for new ID.*/ | 5337 | /* Returns error when there are no free spaces for new ID.*/ |
5296 | if (error) { | 5338 | if (ret < 0) |
5297 | error = -ENOSPC; | ||
5298 | goto err_out; | 5339 | goto err_out; |
5299 | } | ||
5300 | if (myid > CSS_ID_MAX) | ||
5301 | goto remove_idr; | ||
5302 | 5340 | ||
5303 | newid->id = myid; | 5341 | newid->id = ret; |
5304 | newid->depth = depth; | 5342 | newid->depth = depth; |
5305 | return newid; | 5343 | return newid; |
5306 | remove_idr: | ||
5307 | error = -ENOSPC; | ||
5308 | spin_lock(&ss->id_lock); | ||
5309 | idr_remove(&ss->idr, myid); | ||
5310 | spin_unlock(&ss->id_lock); | ||
5311 | err_out: | 5344 | err_out: |
5312 | kfree(newid); | 5345 | kfree(newid); |
5313 | return ERR_PTR(error); | 5346 | return ERR_PTR(ret); |
5314 | 5347 | ||
5315 | } | 5348 | } |
5316 | 5349 | ||
@@ -5441,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5441 | struct inode *inode; | 5474 | struct inode *inode; |
5442 | struct cgroup_subsys_state *css; | 5475 | struct cgroup_subsys_state *css; |
5443 | 5476 | ||
5444 | inode = f->f_dentry->d_inode; | 5477 | inode = file_inode(f); |
5445 | /* check in cgroup filesystem dir */ | 5478 | /* check in cgroup filesystem dir */ |
5446 | if (inode->i_op != &cgroup_dir_inode_operations) | 5479 | if (inode->i_op != &cgroup_dir_inode_operations) |
5447 | return ERR_PTR(-EBADF); | 5480 | return ERR_PTR(-EBADF); |