diff options
author | Li Zefan <lizefan@huawei.com> | 2013-03-01 02:01:56 -0500 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-03-04 12:50:08 -0500 |
commit | 65dff759d2948cf18e2029fc5c0c595b8b7da3a5 (patch) | |
tree | 2904e81e44bb939413f406bbb2056e88df008f72 | |
parent | 6dbe51c251a327e012439c4772097a13df43c5b8 (diff) |
cgroup: fix cgroup_path() vs rename() race
rename() will change dentry->d_name. The result of this race can
be worse than seeing partially rewritten name, but we might access
a stale pointer because rename() will re-allocate memory to hold
a longer name.
As accessing dentry->name must be protected by dentry->d_lock or
parent inode's i_mutex, while on the other hand cgroup-path() can
be called with some irq-safe spinlocks held, we can't generate
cgroup path using dentry->d_name.
Alternatively we make a copy of dentry->d_name and save it in
cgrp->name when a cgroup is created, and update cgrp->name at
rename().
v5: use flexible array instead of zero-size array.
v4: - allocate root_cgroup_name and all root_cgroup->name points to it.
- add cgroup_name() wrapper.
v3: use kfree_rcu() instead of synchronize_rcu() in user-visible path.
v2: make cgrp->name RCU safe.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
-rw-r--r-- | block/blk-cgroup.h | 2 | ||||
-rw-r--r-- | include/linux/cgroup.h | 24 | ||||
-rw-r--r-- | kernel/cgroup.c | 106 |
3 files changed, 100 insertions, 32 deletions
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index f2b292925ccd..4e595ee8c915 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) | |||
247 | { | 247 | { |
248 | int ret; | 248 | int ret; |
249 | 249 | ||
250 | rcu_read_lock(); | ||
251 | ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); | 250 | ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); |
252 | rcu_read_unlock(); | ||
253 | if (ret) | 251 | if (ret) |
254 | strncpy(buf, "<unavailable>", buflen); | 252 | strncpy(buf, "<unavailable>", buflen); |
255 | return ret; | 253 | return ret; |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f55..75c6ec1ba1ba 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -150,6 +150,11 @@ enum { | |||
150 | CGRP_CPUSET_CLONE_CHILDREN, | 150 | CGRP_CPUSET_CLONE_CHILDREN, |
151 | }; | 151 | }; |
152 | 152 | ||
153 | struct cgroup_name { | ||
154 | struct rcu_head rcu_head; | ||
155 | char name[]; | ||
156 | }; | ||
157 | |||
153 | struct cgroup { | 158 | struct cgroup { |
154 | unsigned long flags; /* "unsigned long" so bitops work */ | 159 | unsigned long flags; /* "unsigned long" so bitops work */ |
155 | 160 | ||
@@ -172,6 +177,19 @@ struct cgroup { | |||
172 | struct cgroup *parent; /* my parent */ | 177 | struct cgroup *parent; /* my parent */ |
173 | struct dentry *dentry; /* cgroup fs entry, RCU protected */ | 178 | struct dentry *dentry; /* cgroup fs entry, RCU protected */ |
174 | 179 | ||
180 | /* | ||
181 | * This is a copy of dentry->d_name, and it's needed because | ||
182 | * we can't use dentry->d_name in cgroup_path(). | ||
183 | * | ||
184 | * You must acquire rcu_read_lock() to access cgrp->name, and | ||
185 | * the only place that can change it is rename(), which is | ||
186 | * protected by parent dir's i_mutex. | ||
187 | * | ||
188 | * Normally you should use cgroup_name() wrapper rather than | ||
189 | * access it directly. | ||
190 | */ | ||
191 | struct cgroup_name __rcu *name; | ||
192 | |||
175 | /* Private pointers for each registered subsystem */ | 193 | /* Private pointers for each registered subsystem */ |
176 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | 194 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; |
177 | 195 | ||
@@ -404,6 +422,12 @@ struct cgroup_scanner { | |||
404 | void *data; | 422 | void *data; |
405 | }; | 423 | }; |
406 | 424 | ||
425 | /* Caller should hold rcu_read_lock() */ | ||
426 | static inline const char *cgroup_name(const struct cgroup *cgrp) | ||
427 | { | ||
428 | return rcu_dereference(cgrp->name)->name; | ||
429 | } | ||
430 | |||
407 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | 431 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); |
408 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | 432 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); |
409 | 433 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..50682168abc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | |||
238 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | 238 | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ |
239 | #define dummytop (&rootnode.top_cgroup) | 239 | #define dummytop (&rootnode.top_cgroup) |
240 | 240 | ||
241 | static struct cgroup_name root_cgroup_name = { .name = "/" }; | ||
242 | |||
241 | /* This flag indicates whether tasks in the fork and exit paths should | 243 | /* This flag indicates whether tasks in the fork and exit paths should |
242 | * check for fork/exit handlers to call. This avoids us having to do | 244 | * check for fork/exit handlers to call. This avoids us having to do |
243 | * extra work in the fork/exit path if none of the subsystems need to | 245 | * extra work in the fork/exit path if none of the subsystems need to |
@@ -859,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | |||
859 | return inode; | 861 | return inode; |
860 | } | 862 | } |
861 | 863 | ||
864 | static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) | ||
865 | { | ||
866 | struct cgroup_name *name; | ||
867 | |||
868 | name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); | ||
869 | if (!name) | ||
870 | return NULL; | ||
871 | strcpy(name->name, dentry->d_name.name); | ||
872 | return name; | ||
873 | } | ||
874 | |||
862 | static void cgroup_free_fn(struct work_struct *work) | 875 | static void cgroup_free_fn(struct work_struct *work) |
863 | { | 876 | { |
864 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); | 877 | struct cgroup *cgrp = container_of(work, struct cgroup, free_work); |
@@ -889,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) | |||
889 | simple_xattrs_free(&cgrp->xattrs); | 902 | simple_xattrs_free(&cgrp->xattrs); |
890 | 903 | ||
891 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | 904 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); |
905 | kfree(rcu_dereference_raw(cgrp->name)); | ||
892 | kfree(cgrp); | 906 | kfree(cgrp); |
893 | } | 907 | } |
894 | 908 | ||
@@ -1421,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1421 | INIT_LIST_HEAD(&root->allcg_list); | 1435 | INIT_LIST_HEAD(&root->allcg_list); |
1422 | root->number_of_cgroups = 1; | 1436 | root->number_of_cgroups = 1; |
1423 | cgrp->root = root; | 1437 | cgrp->root = root; |
1438 | cgrp->name = &root_cgroup_name; | ||
1424 | cgrp->top_cgroup = cgrp; | 1439 | cgrp->top_cgroup = cgrp; |
1425 | init_cgroup_housekeeping(cgrp); | 1440 | init_cgroup_housekeeping(cgrp); |
1426 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 1441 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); |
@@ -1769,49 +1784,45 @@ static struct kobject *cgroup_kobj; | |||
1769 | * @buf: the buffer to write the path into | 1784 | * @buf: the buffer to write the path into |
1770 | * @buflen: the length of the buffer | 1785 | * @buflen: the length of the buffer |
1771 | * | 1786 | * |
1772 | * Called with cgroup_mutex held or else with an RCU-protected cgroup | 1787 | * Writes path of cgroup into buf. Returns 0 on success, -errno on error. |
1773 | * reference. Writes path of cgroup into buf. Returns 0 on success, | 1788 | * |
1774 | * -errno on error. | 1789 | * We can't generate cgroup path using dentry->d_name, as accessing |
1790 | * dentry->name must be protected by irq-unsafe dentry->d_lock or parent | ||
1791 | * inode's i_mutex, while on the other hand cgroup_path() can be called | ||
1792 | * with some irq-safe spinlocks held. | ||
1775 | */ | 1793 | */ |
1776 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | 1794 | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) |
1777 | { | 1795 | { |
1778 | struct dentry *dentry = cgrp->dentry; | 1796 | int ret = -ENAMETOOLONG; |
1779 | char *start; | 1797 | char *start; |
1780 | 1798 | ||
1781 | rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), | ||
1782 | "cgroup_path() called without proper locking"); | ||
1783 | |||
1784 | if (cgrp == dummytop) { | ||
1785 | /* | ||
1786 | * Inactive subsystems have no dentry for their root | ||
1787 | * cgroup | ||
1788 | */ | ||
1789 | strcpy(buf, "/"); | ||
1790 | return 0; | ||
1791 | } | ||
1792 | |||
1793 | start = buf + buflen - 1; | 1799 | start = buf + buflen - 1; |
1794 | |||
1795 | *start = '\0'; | 1800 | *start = '\0'; |
1796 | for (;;) { | ||
1797 | int len = dentry->d_name.len; | ||
1798 | 1801 | ||
1802 | rcu_read_lock(); | ||
1803 | while (cgrp) { | ||
1804 | const char *name = cgroup_name(cgrp); | ||
1805 | int len; | ||
1806 | |||
1807 | len = strlen(name); | ||
1799 | if ((start -= len) < buf) | 1808 | if ((start -= len) < buf) |
1800 | return -ENAMETOOLONG; | 1809 | goto out; |
1801 | memcpy(start, dentry->d_name.name, len); | 1810 | memcpy(start, name, len); |
1802 | cgrp = cgrp->parent; | ||
1803 | if (!cgrp) | ||
1804 | break; | ||
1805 | 1811 | ||
1806 | dentry = cgrp->dentry; | ||
1807 | if (!cgrp->parent) | 1812 | if (!cgrp->parent) |
1808 | continue; | 1813 | break; |
1814 | |||
1809 | if (--start < buf) | 1815 | if (--start < buf) |
1810 | return -ENAMETOOLONG; | 1816 | goto out; |
1811 | *start = '/'; | 1817 | *start = '/'; |
1818 | |||
1819 | cgrp = cgrp->parent; | ||
1812 | } | 1820 | } |
1821 | ret = 0; | ||
1813 | memmove(buf, start, buf + buflen - start); | 1822 | memmove(buf, start, buf + buflen - start); |
1814 | return 0; | 1823 | out: |
1824 | rcu_read_unlock(); | ||
1825 | return ret; | ||
1815 | } | 1826 | } |
1816 | EXPORT_SYMBOL_GPL(cgroup_path); | 1827 | EXPORT_SYMBOL_GPL(cgroup_path); |
1817 | 1828 | ||
@@ -2537,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) | |||
2537 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | 2548 | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, |
2538 | struct inode *new_dir, struct dentry *new_dentry) | 2549 | struct inode *new_dir, struct dentry *new_dentry) |
2539 | { | 2550 | { |
2551 | int ret; | ||
2552 | struct cgroup_name *name, *old_name; | ||
2553 | struct cgroup *cgrp; | ||
2554 | |||
2555 | /* | ||
2556 | * It's convinient to use parent dir's i_mutex to protected | ||
2557 | * cgrp->name. | ||
2558 | */ | ||
2559 | lockdep_assert_held(&old_dir->i_mutex); | ||
2560 | |||
2540 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | 2561 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) |
2541 | return -ENOTDIR; | 2562 | return -ENOTDIR; |
2542 | if (new_dentry->d_inode) | 2563 | if (new_dentry->d_inode) |
2543 | return -EEXIST; | 2564 | return -EEXIST; |
2544 | if (old_dir != new_dir) | 2565 | if (old_dir != new_dir) |
2545 | return -EIO; | 2566 | return -EIO; |
2546 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 2567 | |
2568 | cgrp = __d_cgrp(old_dentry); | ||
2569 | |||
2570 | name = cgroup_alloc_name(new_dentry); | ||
2571 | if (!name) | ||
2572 | return -ENOMEM; | ||
2573 | |||
2574 | ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
2575 | if (ret) { | ||
2576 | kfree(name); | ||
2577 | return ret; | ||
2578 | } | ||
2579 | |||
2580 | old_name = cgrp->name; | ||
2581 | rcu_assign_pointer(cgrp->name, name); | ||
2582 | |||
2583 | kfree_rcu(old_name, rcu_head); | ||
2584 | return 0; | ||
2547 | } | 2585 | } |
2548 | 2586 | ||
2549 | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | 2587 | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) |
@@ -4158,6 +4196,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4158 | umode_t mode) | 4196 | umode_t mode) |
4159 | { | 4197 | { |
4160 | struct cgroup *cgrp; | 4198 | struct cgroup *cgrp; |
4199 | struct cgroup_name *name; | ||
4161 | struct cgroupfs_root *root = parent->root; | 4200 | struct cgroupfs_root *root = parent->root; |
4162 | int err = 0; | 4201 | int err = 0; |
4163 | struct cgroup_subsys *ss; | 4202 | struct cgroup_subsys *ss; |
@@ -4168,9 +4207,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4168 | if (!cgrp) | 4207 | if (!cgrp) |
4169 | return -ENOMEM; | 4208 | return -ENOMEM; |
4170 | 4209 | ||
4210 | name = cgroup_alloc_name(dentry); | ||
4211 | if (!name) | ||
4212 | goto err_free_cgrp; | ||
4213 | rcu_assign_pointer(cgrp->name, name); | ||
4214 | |||
4171 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 4215 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); |
4172 | if (cgrp->id < 0) | 4216 | if (cgrp->id < 0) |
4173 | goto err_free_cgrp; | 4217 | goto err_free_name; |
4174 | 4218 | ||
4175 | /* | 4219 | /* |
4176 | * Only live parents can have children. Note that the liveliness | 4220 | * Only live parents can have children. Note that the liveliness |
@@ -4276,6 +4320,8 @@ err_free_all: | |||
4276 | deactivate_super(sb); | 4320 | deactivate_super(sb); |
4277 | err_free_id: | 4321 | err_free_id: |
4278 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | 4322 | ida_simple_remove(&root->cgroup_ida, cgrp->id); |
4323 | err_free_name: | ||
4324 | kfree(rcu_dereference_raw(cgrp->name)); | ||
4279 | err_free_cgrp: | 4325 | err_free_cgrp: |
4280 | kfree(cgrp); | 4326 | kfree(cgrp); |
4281 | return err; | 4327 | return err; |