Diffstat (limited to 'kernel/cgroup.c')
| -rw-r--r-- | kernel/cgroup.c | 318 |
1 file changed, 254 insertions, 64 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a3c23936d43..4766bb65e4d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -141,7 +141,7 @@ enum { | |||
| 141 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | 141 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ |
| 142 | }; | 142 | }; |
| 143 | 143 | ||
| 144 | inline int cgroup_is_releasable(const struct cgroup *cgrp) | 144 | static int cgroup_is_releasable(const struct cgroup *cgrp) |
| 145 | { | 145 | { |
| 146 | const int bits = | 146 | const int bits = |
| 147 | (1 << CGRP_RELEASABLE) | | 147 | (1 << CGRP_RELEASABLE) | |
| @@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp) | |||
| 149 | return (cgrp->flags & bits) == bits; | 149 | return (cgrp->flags & bits) == bits; |
| 150 | } | 150 | } |
| 151 | 151 | ||
| 152 | inline int notify_on_release(const struct cgroup *cgrp) | 152 | static int notify_on_release(const struct cgroup *cgrp) |
| 153 | { | 153 | { |
| 154 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 154 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| 155 | } | 155 | } |
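
The two helpers above are file-local policy checks: a cgroup is treated as releasable only when both the CGRP_RELEASABLE and CGRP_NOTIFY_ON_RELEASE bits are set in cgrp->flags. A tiny stand-alone model of that combined-bit test, with invented flag values and struct names (the kernel's own enum and struct cgroup differ):

#include <stdio.h>

/* Invented stand-ins; only the bit-test logic mirrors cgroup_is_releasable(). */
enum { FAKE_REMOVED, FAKE_RELEASABLE, FAKE_NOTIFY_ON_RELEASE };

struct fake_cgroup { unsigned long flags; };

static int is_releasable(const struct fake_cgroup *cgrp)
{
	const unsigned long bits =
		(1UL << FAKE_RELEASABLE) | (1UL << FAKE_NOTIFY_ON_RELEASE);
	return (cgrp->flags & bits) == bits;	/* both bits must be set */
}

int main(void)
{
	struct fake_cgroup c = { .flags = 1UL << FAKE_RELEASABLE };
	printf("%d\n", is_releasable(&c));	/* 0: notify_on_release unset */
	c.flags |= 1UL << FAKE_NOTIFY_ON_RELEASE;
	printf("%d\n", is_releasable(&c));	/* 1: both bits set */
	return 0;
}
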
| @@ -489,7 +489,7 @@ static struct css_set *find_css_set( | |||
| 489 | * Any task can increment and decrement the count field without lock. | 489 | * Any task can increment and decrement the count field without lock. |
| 490 | * So in general, code holding cgroup_mutex can't rely on the count | 490 | * So in general, code holding cgroup_mutex can't rely on the count |
| 491 | * field not changing. However, if the count goes to zero, then only | 491 | * field not changing. However, if the count goes to zero, then only |
| 492 | * attach_task() can increment it again. Because a count of zero | 492 | * cgroup_attach_task() can increment it again. Because a count of zero |
| 493 | * means that no tasks are currently attached, therefore there is no | 493 | * means that no tasks are currently attached, therefore there is no |
| 494 | * way a task attached to that cgroup can fork (the other way to | 494 | * way a task attached to that cgroup can fork (the other way to |
| 495 | * increment the count). So code holding cgroup_mutex can safely | 495 | * increment the count). So code holding cgroup_mutex can safely |
| @@ -520,17 +520,17 @@ static struct css_set *find_css_set( | |||
| 520 | * The task_lock() exception | 520 | * The task_lock() exception |
| 521 | * | 521 | * |
| 522 | * The need for this exception arises from the action of | 522 | * The need for this exception arises from the action of |
| 523 | * attach_task(), which overwrites one task's cgroup pointer with | 523 | * cgroup_attach_task(), which overwrites one task's cgroup pointer with |
| 524 | * another. It does so using cgroup_mutex, however there are | 524 | * another. It does so using cgroup_mutex, however there are |
| 525 | * several performance critical places that need to reference | 525 | * several performance critical places that need to reference |
| 526 | * task->cgroup without the expense of grabbing a system global | 526 | * task->cgroup without the expense of grabbing a system global |
| 527 | * mutex. Therefore except as noted below, when dereferencing or, as | 527 | * mutex. Therefore except as noted below, when dereferencing or, as |
| 528 | * in attach_task(), modifying a task's cgroup pointer we use | 528 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
| 529 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 529 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
| 530 | * the task_struct routinely used for such matters. | 530 | * the task_struct routinely used for such matters. |
| 531 | * | 531 | * |
| 532 | * P.S. One more locking exception. RCU is used to guard the | 532 | * P.S. One more locking exception. RCU is used to guard the |
| 533 | * update of a task's cgroup pointer by attach_task() | 533 | * update of a task's cgroup pointer by cgroup_attach_task() |
| 534 | */ | 534 | */ |
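
The locking rules above reduce to one access pattern: hot-path readers dereference the task's cgroup pointer under RCU, and the single sanctioned writer, cgroup_attach_task(), runs with cgroup_mutex held and wraps the pointer update in task_lock(). An illustrative fragment of that pattern (not literal kernel code; error handling and the surrounding function bodies are omitted):

	/* Reader on a performance-critical path: RCU only. */
	rcu_read_lock();
	cg = rcu_dereference(tsk->cgroups);
	/* ... use cg, without sleeping or caching it past the unlock ... */
	rcu_read_unlock();

	/* Writer (cgroup_attach_task() only): cgroup_mutex held for the call,
	 * task_lock() taken around the actual pointer update. */
	task_lock(tsk);
	rcu_assign_pointer(tsk->cgroups, newcg);
	task_unlock(tsk);
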
| 535 | 535 | ||
| 536 | /** | 536 | /** |
| @@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
| 586 | return inode; | 586 | return inode; |
| 587 | } | 587 | } |
| 588 | 588 | ||
| 589 | /* | ||
| 590 | * Call subsys's pre_destroy handler. | ||
| 591 | * This is called before the css refcnt check. | ||
| 592 | */ | ||
| 593 | |||
| 594 | static void cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
| 595 | { | ||
| 596 | struct cgroup_subsys *ss; | ||
| 597 | for_each_subsys(cgrp->root, ss) | ||
| 598 | if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) | ||
| 599 | ss->pre_destroy(ss, cgrp); | ||
| 600 | return; | ||
| 601 | } | ||
| 602 | |||
| 603 | |||
| 589 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 604 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
| 590 | { | 605 | { |
| 591 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 606 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
| 592 | if (S_ISDIR(inode->i_mode)) { | 607 | if (S_ISDIR(inode->i_mode)) { |
| 593 | struct cgroup *cgrp = dentry->d_fsdata; | 608 | struct cgroup *cgrp = dentry->d_fsdata; |
| 609 | struct cgroup_subsys *ss; | ||
| 594 | BUG_ON(!(cgroup_is_removed(cgrp))); | 610 | BUG_ON(!(cgroup_is_removed(cgrp))); |
| 595 | /* It's possible for external users to be holding css | 611 | /* It's possible for external users to be holding css |
| 596 | * reference counts on a cgroup; css_put() needs to | 612 | * reference counts on a cgroup; css_put() needs to |
| @@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
| 599 | * queue the cgroup to be handled by the release | 615 | * queue the cgroup to be handled by the release |
| 600 | * agent */ | 616 | * agent */ |
| 601 | synchronize_rcu(); | 617 | synchronize_rcu(); |
| 618 | |||
| 619 | mutex_lock(&cgroup_mutex); | ||
| 620 | /* | ||
| 621 | * Release the subsystem state objects. | ||
| 622 | */ | ||
| 623 | for_each_subsys(cgrp->root, ss) { | ||
| 624 | if (cgrp->subsys[ss->subsys_id]) | ||
| 625 | ss->destroy(ss, cgrp); | ||
| 626 | } | ||
| 627 | |||
| 628 | cgrp->root->number_of_cgroups--; | ||
| 629 | mutex_unlock(&cgroup_mutex); | ||
| 630 | |||
| 631 | /* Drop the active superblock reference that we took when we | ||
| 632 | * created the cgroup */ | ||
| 633 | deactivate_super(cgrp->root->sb); | ||
| 634 | |||
| 602 | kfree(cgrp); | 635 | kfree(cgrp); |
| 603 | } | 636 | } |
| 604 | iput(inode); | 637 | iput(inode); |
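
The new cgroup_call_pre_destroy() plus the additions to cgroup_diput() split teardown in two: pre_destroy() runs at rmdir() time so a subsystem can drop whatever still pins its css, while destroy() is now deferred until the cgroup's dentry is finally released. A hedged sketch of a subsystem wiring up both callbacks (the subsystem itself, its id and its internals are hypothetical; only the callback signatures match the code above):

/* Hypothetical controller illustrating the pre_destroy()/destroy() split. */
static void example_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/*
	 * Invoked from cgroup_rmdir() before the css refcount check:
	 * release or reparent anything still holding a css reference
	 * so that the refcount can drop to zero.
	 */
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/*
	 * Invoked later, from cgroup_diput(), once the dentry goes away:
	 * free the per-cgroup state that create() allocated, e.g. by
	 * container_of()-ing cgrp->subsys[example_subsys_id] back to it.
	 */
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.subsys_id	= example_subsys_id,	/* hypothetical id */
	.pre_destroy	= example_pre_destroy,
	.destroy	= example_destroy,
	/* .create, .can_attach, .attach, ... as required */
};
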
| @@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp, | |||
| 1161 | * Call holding cgroup_mutex. May take task_lock of | 1194 | * Call holding cgroup_mutex. May take task_lock of |
| 1162 | * the task 'pid' during call. | 1195 | * the task 'pid' during call. |
| 1163 | */ | 1196 | */ |
| 1164 | static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1197 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
| 1165 | { | 1198 | { |
| 1166 | int retval = 0; | 1199 | int retval = 0; |
| 1167 | struct cgroup_subsys *ss; | 1200 | struct cgroup_subsys *ss; |
| @@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1181 | for_each_subsys(root, ss) { | 1214 | for_each_subsys(root, ss) { |
| 1182 | if (ss->can_attach) { | 1215 | if (ss->can_attach) { |
| 1183 | retval = ss->can_attach(ss, cgrp, tsk); | 1216 | retval = ss->can_attach(ss, cgrp, tsk); |
| 1184 | if (retval) { | 1217 | if (retval) |
| 1185 | return retval; | 1218 | return retval; |
| 1186 | } | ||
| 1187 | } | 1219 | } |
| 1188 | } | 1220 | } |
| 1189 | 1221 | ||
| @@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1192 | * based on its final set of cgroups | 1224 | * based on its final set of cgroups |
| 1193 | */ | 1225 | */ |
| 1194 | newcg = find_css_set(cg, cgrp); | 1226 | newcg = find_css_set(cg, cgrp); |
| 1195 | if (!newcg) { | 1227 | if (!newcg) |
| 1196 | return -ENOMEM; | 1228 | return -ENOMEM; |
| 1197 | } | ||
| 1198 | 1229 | ||
| 1199 | task_lock(tsk); | 1230 | task_lock(tsk); |
| 1200 | if (tsk->flags & PF_EXITING) { | 1231 | if (tsk->flags & PF_EXITING) { |
| @@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1214 | write_unlock(&css_set_lock); | 1245 | write_unlock(&css_set_lock); |
| 1215 | 1246 | ||
| 1216 | for_each_subsys(root, ss) { | 1247 | for_each_subsys(root, ss) { |
| 1217 | if (ss->attach) { | 1248 | if (ss->attach) |
| 1218 | ss->attach(ss, cgrp, oldcgrp, tsk); | 1249 | ss->attach(ss, cgrp, oldcgrp, tsk); |
| 1219 | } | ||
| 1220 | } | 1250 | } |
| 1221 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1251 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); |
| 1222 | synchronize_rcu(); | 1252 | synchronize_rcu(); |
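
With the static qualifier dropped, cgroup_attach_task() becomes callable from elsewhere in the kernel; cgroup_clone() at the end of this patch already uses it that way. A minimal sketch of such a call site, mirroring the locking in attach_task_by_pid() and cgroup_clone() (the wrapper name is invented):

/* Hypothetical caller: cgroup_mutex must be held around the attach. */
static int example_move_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = cgroup_attach_task(cgrp, tsk);
	mutex_unlock(&cgroup_mutex);
	return ret;
}
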
| @@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) | |||
| 1239 | 1269 | ||
| 1240 | if (pid) { | 1270 | if (pid) { |
| 1241 | rcu_read_lock(); | 1271 | rcu_read_lock(); |
| 1242 | tsk = find_task_by_pid(pid); | 1272 | tsk = find_task_by_vpid(pid); |
| 1243 | if (!tsk || tsk->flags & PF_EXITING) { | 1273 | if (!tsk || tsk->flags & PF_EXITING) { |
| 1244 | rcu_read_unlock(); | 1274 | rcu_read_unlock(); |
| 1245 | return -ESRCH; | 1275 | return -ESRCH; |
| @@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) | |||
| 1257 | get_task_struct(tsk); | 1287 | get_task_struct(tsk); |
| 1258 | } | 1288 | } |
| 1259 | 1289 | ||
| 1260 | ret = attach_task(cgrp, tsk); | 1290 | ret = cgroup_attach_task(cgrp, tsk); |
| 1261 | put_task_struct(tsk); | 1291 | put_task_struct(tsk); |
| 1262 | return ret; | 1292 | return ret; |
| 1263 | } | 1293 | } |
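
Switching to find_task_by_vpid() makes a pid written to the 'tasks' file be interpreted in the writer's pid namespace. The surrounding lookup is the usual RCU pattern, condensed here from the function above purely for illustration:

	rcu_read_lock();
	tsk = find_task_by_vpid(pid);		/* pid as the caller sees it */
	if (!tsk || tsk->flags & PF_EXITING) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(tsk);			/* pin before leaving RCU */
	rcu_read_unlock();

	ret = cgroup_attach_task(cgrp, tsk);
	put_task_struct(tsk);
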
| @@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, | |||
| 1329 | goto out1; | 1359 | goto out1; |
| 1330 | } | 1360 | } |
| 1331 | buffer[nbytes] = 0; /* nul-terminate */ | 1361 | buffer[nbytes] = 0; /* nul-terminate */ |
| 1362 | strstrip(buffer); /* strip -just- trailing whitespace */ | ||
| 1332 | 1363 | ||
| 1333 | mutex_lock(&cgroup_mutex); | 1364 | mutex_lock(&cgroup_mutex); |
| 1334 | 1365 | ||
| 1366 | /* | ||
| 1367 | * This was already checked for in cgroup_file_write(), but | ||
| 1368 | * check again now we're holding cgroup_mutex. | ||
| 1369 | */ | ||
| 1335 | if (cgroup_is_removed(cgrp)) { | 1370 | if (cgroup_is_removed(cgrp)) { |
| 1336 | retval = -ENODEV; | 1371 | retval = -ENODEV; |
| 1337 | goto out2; | 1372 | goto out2; |
| @@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, | |||
| 1349 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 1384 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| 1350 | break; | 1385 | break; |
| 1351 | case FILE_RELEASE_AGENT: | 1386 | case FILE_RELEASE_AGENT: |
| 1352 | { | 1387 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
| 1353 | struct cgroupfs_root *root = cgrp->root; | 1388 | strcpy(cgrp->root->release_agent_path, buffer); |
| 1354 | /* Strip trailing newline */ | ||
| 1355 | if (nbytes && (buffer[nbytes-1] == '\n')) { | ||
| 1356 | buffer[nbytes-1] = 0; | ||
| 1357 | } | ||
| 1358 | if (nbytes < sizeof(root->release_agent_path)) { | ||
| 1359 | /* We never write anything other than '\0' | ||
| 1360 | * into the last char of release_agent_path, | ||
| 1361 | * so it always remains a NUL-terminated | ||
| 1362 | * string */ | ||
| 1363 | strncpy(root->release_agent_path, buffer, nbytes); | ||
| 1364 | root->release_agent_path[nbytes] = 0; | ||
| 1365 | } else { | ||
| 1366 | retval = -ENOSPC; | ||
| 1367 | } | ||
| 1368 | break; | 1389 | break; |
| 1369 | } | ||
| 1370 | default: | 1390 | default: |
| 1371 | retval = -EINVAL; | 1391 | retval = -EINVAL; |
| 1372 | goto out2; | 1392 | goto out2; |
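
The rewritten FILE_RELEASE_AGENT case leans on two facts: strstrip() has already removed the trailing newline, and the destination is guaranteed at compile time (via BUILD_BUG_ON()) to be at least PATH_MAX bytes, while the incoming buffer is bounded by the length checks earlier in this function (outside the hunk shown), so a plain strcpy() cannot overflow. A small user-space model of the same idea, with all names invented:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define FAKE_PATH_MAX 4096

static char fake_agent_path[FAKE_PATH_MAX];

/* Rough stand-in for the kernel's strstrip(): drop trailing whitespace. */
static void strip_trailing(char *s)
{
	size_t n = strlen(s);

	while (n && isspace((unsigned char)s[n - 1]))
		s[--n] = '\0';
}

static void set_agent_path(char *buf)	/* buf is known to be < FAKE_PATH_MAX */
{
	/* compile-time check, standing in for BUILD_BUG_ON() */
	_Static_assert(sizeof(fake_agent_path) >= FAKE_PATH_MAX,
		       "destination must hold a full path");
	strip_trailing(buf);
	strcpy(fake_agent_path, buf);
}

int main(void)
{
	char buf[64] = "/sbin/my_release_agent\n";

	set_agent_path(buf);
	printf("'%s'\n", fake_agent_path);	/* newline already stripped */
	return 0;
}
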
| @@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | |||
| 1387 | struct cftype *cft = __d_cft(file->f_dentry); | 1407 | struct cftype *cft = __d_cft(file->f_dentry); |
| 1388 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 1408 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
| 1389 | 1409 | ||
| 1390 | if (!cft) | 1410 | if (!cft || cgroup_is_removed(cgrp)) |
| 1391 | return -ENODEV; | 1411 | return -ENODEV; |
| 1392 | if (cft->write) | 1412 | if (cft->write) |
| 1393 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 1413 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
| @@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
| 1457 | struct cftype *cft = __d_cft(file->f_dentry); | 1477 | struct cftype *cft = __d_cft(file->f_dentry); |
| 1458 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 1478 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
| 1459 | 1479 | ||
| 1460 | if (!cft) | 1480 | if (!cft || cgroup_is_removed(cgrp)) |
| 1461 | return -ENODEV; | 1481 | return -ENODEV; |
| 1462 | 1482 | ||
| 1463 | if (cft->read) | 1483 | if (cft->read) |
| @@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp, | |||
| 1675 | it->task = cg->tasks.next; | 1695 | it->task = cg->tasks.next; |
| 1676 | } | 1696 | } |
| 1677 | 1697 | ||
| 1698 | /* | ||
| 1699 | * To reduce the fork() overhead for systems that are not actually | ||
| 1700 | * using their cgroups capability, we don't maintain the lists running | ||
| 1701 | * through each css_set to its tasks until we see the list actually | ||
| 1702 | * used - in other words after the first call to cgroup_iter_start(). | ||
| 1703 | * | ||
| 1704 | * The tasklist_lock is not held here, as do_each_thread() and | ||
| 1705 | * while_each_thread() are protected by RCU. | ||
| 1706 | */ | ||
| 1707 | void cgroup_enable_task_cg_lists(void) | ||
| 1708 | { | ||
| 1709 | struct task_struct *p, *g; | ||
| 1710 | write_lock(&css_set_lock); | ||
| 1711 | use_task_css_set_links = 1; | ||
| 1712 | do_each_thread(g, p) { | ||
| 1713 | task_lock(p); | ||
| 1714 | if (list_empty(&p->cg_list)) | ||
| 1715 | list_add(&p->cg_list, &p->cgroups->tasks); | ||
| 1716 | task_unlock(p); | ||
| 1717 | } while_each_thread(g, p); | ||
| 1718 | write_unlock(&css_set_lock); | ||
| 1719 | } | ||
| 1720 | |||
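
The comment above describes a lazy scheme: the per-css_set task lists are not maintained at fork() time until someone actually iterates a cgroup, at which point every pre-existing task is linked exactly once under the write lock. A toy user-space model of that flip-the-flag-then-backfill idea (locks and task lists reduced to plain variables; nothing here is kernel API):

#include <stdio.h>

#define NTASKS 4

static int use_links;			/* use_task_css_set_links analogue */
static int linked[NTASKS];		/* !list_empty(&p->cg_list) analogue */

static void enable_links(void)		/* cgroup_enable_task_cg_lists() analogue */
{
	/* the kernel takes write_lock(&css_set_lock) here */
	use_links = 1;
	for (int i = 0; i < NTASKS; i++)	/* do_each_thread() analogue */
		if (!linked[i])
			linked[i] = 1;
}

static void iter_start(void)		/* mirrors cgroup_iter_start() below */
{
	if (!use_links)
		enable_links();		/* first iteration pays the backfill cost */
}

int main(void)
{
	iter_start();
	iter_start();			/* subsequent iterations are free */
	printf("use_links=%d linked[0]=%d\n", use_links, linked[0]);
	return 0;
}
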
| 1678 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 1721 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
| 1679 | { | 1722 | { |
| 1680 | /* | 1723 | /* |
| @@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | |||
| 1682 | * we need to enable the list linking each css_set to its | 1725 | * we need to enable the list linking each css_set to its |
| 1683 | * tasks, and fix up all existing tasks. | 1726 | * tasks, and fix up all existing tasks. |
| 1684 | */ | 1727 | */ |
| 1685 | if (!use_task_css_set_links) { | 1728 | if (!use_task_css_set_links) |
| 1686 | struct task_struct *p, *g; | 1729 | cgroup_enable_task_cg_lists(); |
| 1687 | write_lock(&css_set_lock); | 1730 | |
| 1688 | use_task_css_set_links = 1; | ||
| 1689 | do_each_thread(g, p) { | ||
| 1690 | task_lock(p); | ||
| 1691 | if (list_empty(&p->cg_list)) | ||
| 1692 | list_add(&p->cg_list, &p->cgroups->tasks); | ||
| 1693 | task_unlock(p); | ||
| 1694 | } while_each_thread(g, p); | ||
| 1695 | write_unlock(&css_set_lock); | ||
| 1696 | } | ||
| 1697 | read_lock(&css_set_lock); | 1731 | read_lock(&css_set_lock); |
| 1698 | it->cg_link = &cgrp->css_sets; | 1732 | it->cg_link = &cgrp->css_sets; |
| 1699 | cgroup_advance_iter(cgrp, it); | 1733 | cgroup_advance_iter(cgrp, it); |
| @@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | |||
| 1726 | read_unlock(&css_set_lock); | 1760 | read_unlock(&css_set_lock); |
| 1727 | } | 1761 | } |
| 1728 | 1762 | ||
| 1763 | static inline int started_after_time(struct task_struct *t1, | ||
| 1764 | struct timespec *time, | ||
| 1765 | struct task_struct *t2) | ||
| 1766 | { | ||
| 1767 | int start_diff = timespec_compare(&t1->start_time, time); | ||
| 1768 | if (start_diff > 0) { | ||
| 1769 | return 1; | ||
| 1770 | } else if (start_diff < 0) { | ||
| 1771 | return 0; | ||
| 1772 | } else { | ||
| 1773 | /* | ||
| 1774 | * Arbitrarily, if two processes started at the same | ||
| 1775 | * time, we'll say that the lower pointer value | ||
| 1776 | * started first. Note that t2 may have exited by now | ||
| 1777 | * so this may not be a valid pointer any longer, but | ||
| 1778 | * that's fine - it still serves to distinguish | ||
| 1779 | * between two tasks started (effectively) simultaneously. | ||
| 1780 | */ | ||
| 1781 | return t1 > t2; | ||
| 1782 | } | ||
| 1783 | } | ||
| 1784 | |||
| 1785 | /* | ||
| 1786 | * This function is a callback from heap_insert() and is used to order | ||
| 1787 | * the heap. | ||
| 1788 | * In this case we order the heap in descending task start time. | ||
| 1789 | */ | ||
| 1790 | static inline int started_after(void *p1, void *p2) | ||
| 1791 | { | ||
| 1792 | struct task_struct *t1 = p1; | ||
| 1793 | struct task_struct *t2 = p2; | ||
| 1794 | return started_after_time(t1, &t2->start_time, t2); | ||
| 1795 | } | ||
| 1796 | |||
| 1797 | /** | ||
| 1798 | * cgroup_scan_tasks - iterate through all the tasks in a cgroup | ||
| 1799 | * @scan: struct cgroup_scanner containing arguments for the scan | ||
| 1800 | * | ||
| 1801 | * Arguments include pointers to callback functions test_task() and | ||
| 1802 | * process_task(). | ||
| 1803 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | ||
| 1804 | * and if it returns true, call process_task() for it also. | ||
| 1805 | * The test_task pointer may be NULL, meaning always true (select all tasks). | ||
| 1806 | * Effectively duplicates cgroup_iter_{start,next,end}() | ||
| 1807 | * but does not lock css_set_lock for the call to process_task(). | ||
| 1808 | * The struct cgroup_scanner may be embedded in any structure of the caller's | ||
| 1809 | * creation. | ||
| 1810 | * It is guaranteed that process_task() will act on every task that | ||
| 1811 | * is a member of the cgroup for the duration of this call. This | ||
| 1812 | * function may or may not call process_task() for tasks that exit | ||
| 1813 | * or move to a different cgroup during the call, or are forked or | ||
| 1814 | * move into the cgroup during the call. | ||
| 1815 | * | ||
| 1816 | * Note that test_task() may be called with locks held, and may in some | ||
| 1817 | * situations be called multiple times for the same task, so it should | ||
| 1818 | * be cheap. | ||
| 1819 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | ||
| 1820 | * pre-allocated and will be used for heap operations (and its "gt" member will | ||
| 1821 | * be overwritten), else a temporary heap will be used (allocation of which | ||
| 1822 | * may cause this function to fail). | ||
| 1823 | */ | ||
| 1824 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | ||
| 1825 | { | ||
| 1826 | int retval, i; | ||
| 1827 | struct cgroup_iter it; | ||
| 1828 | struct task_struct *p, *dropped; | ||
| 1829 | /* Never dereference latest_task, since it's not refcounted */ | ||
| 1830 | struct task_struct *latest_task = NULL; | ||
| 1831 | struct ptr_heap tmp_heap; | ||
| 1832 | struct ptr_heap *heap; | ||
| 1833 | struct timespec latest_time = { 0, 0 }; | ||
| 1834 | |||
| 1835 | if (scan->heap) { | ||
| 1836 | /* The caller supplied our heap and pre-allocated its memory */ | ||
| 1837 | heap = scan->heap; | ||
| 1838 | heap->gt = &started_after; | ||
| 1839 | } else { | ||
| 1840 | /* We need to allocate our own heap memory */ | ||
| 1841 | heap = &tmp_heap; | ||
| 1842 | retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); | ||
| 1843 | if (retval) | ||
| 1844 | /* cannot allocate the heap */ | ||
| 1845 | return retval; | ||
| 1846 | } | ||
| 1847 | |||
| 1848 | again: | ||
| 1849 | /* | ||
| 1850 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | ||
| 1851 | * to determine which are of interest, and using the scanner's | ||
| 1852 | * "process_task" callback to process any of them that need an update. | ||
| 1853 | * Since we don't want to hold any locks during the task updates, | ||
| 1854 | * gather tasks to be processed in a heap structure. | ||
| 1855 | * The heap is sorted by descending task start time. | ||
| 1856 | * If the statically-sized heap fills up, we overflow tasks that | ||
| 1857 | * started later, and in future iterations only consider tasks that | ||
| 1858 | * started after the latest task in the previous pass. This | ||
| 1859 | * guarantees forward progress and that we don't miss any tasks. | ||
| 1860 | */ | ||
| 1861 | heap->size = 0; | ||
| 1862 | cgroup_iter_start(scan->cg, &it); | ||
| 1863 | while ((p = cgroup_iter_next(scan->cg, &it))) { | ||
| 1864 | /* | ||
| 1865 | * Only affect tasks that qualify per the caller's callback, | ||
| 1866 | * if the caller provided one | ||
| 1867 | */ | ||
| 1868 | if (scan->test_task && !scan->test_task(p, scan)) | ||
| 1869 | continue; | ||
| 1870 | /* | ||
| 1871 | * Only process tasks that started after the last task | ||
| 1872 | * we processed | ||
| 1873 | */ | ||
| 1874 | if (!started_after_time(p, &latest_time, latest_task)) | ||
| 1875 | continue; | ||
| 1876 | dropped = heap_insert(heap, p); | ||
| 1877 | if (dropped == NULL) { | ||
| 1878 | /* | ||
| 1879 | * The new task was inserted; the heap wasn't | ||
| 1880 | * previously full | ||
| 1881 | */ | ||
| 1882 | get_task_struct(p); | ||
| 1883 | } else if (dropped != p) { | ||
| 1884 | /* | ||
| 1885 | * The new task was inserted, and pushed out a | ||
| 1886 | * different task | ||
| 1887 | */ | ||
| 1888 | get_task_struct(p); | ||
| 1889 | put_task_struct(dropped); | ||
| 1890 | } | ||
| 1891 | /* | ||
| 1892 | * Else the new task was newer than anything already in | ||
| 1893 | * the heap and wasn't inserted | ||
| 1894 | */ | ||
| 1895 | } | ||
| 1896 | cgroup_iter_end(scan->cg, &it); | ||
| 1897 | |||
| 1898 | if (heap->size) { | ||
| 1899 | for (i = 0; i < heap->size; i++) { | ||
| 1900 | struct task_struct *p = heap->ptrs[i]; | ||
| 1901 | if (i == 0) { | ||
| 1902 | latest_time = p->start_time; | ||
| 1903 | latest_task = p; | ||
| 1904 | } | ||
| 1905 | /* Process the task per the caller's callback */ | ||
| 1906 | scan->process_task(p, scan); | ||
| 1907 | put_task_struct(p); | ||
| 1908 | } | ||
| 1909 | /* | ||
| 1910 | * If we had to process any tasks at all, scan again | ||
| 1911 | * in case some of them were in the middle of forking | ||
| 1912 | * children that didn't get processed. | ||
| 1913 | * Not the most efficient way to do it, but it avoids | ||
| 1914 | * having to take callback_mutex in the fork path | ||
| 1915 | */ | ||
| 1916 | goto again; | ||
| 1917 | } | ||
| 1918 | if (heap == &tmp_heap) | ||
| 1919 | heap_free(&tmp_heap); | ||
| 1920 | return 0; | ||
| 1921 | } | ||
| 1922 | |||
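
The kerneldoc above fixes the contract for cgroup_scan_tasks(): an optional test_task() filter, a mandatory process_task() worker called without css_set_lock held, and an optional pre-allocated heap. A hedged sketch of a caller; the cgroup_scanner fields used (cg, test_task, process_task, heap) appear in the code above, but the callbacks and the wrapper are invented:

/* Hypothetical user of cgroup_scan_tasks(). */
static int only_userspace(struct task_struct *p, struct cgroup_scanner *scan)
{
	return p->mm != NULL;			/* skip kernel threads */
}

static void poke_task(struct task_struct *p, struct cgroup_scanner *scan)
{
	/* runs without css_set_lock held, so heavier work is acceptable */
	printk(KERN_DEBUG "scanning pid %d\n", p->pid);
}

static int scan_example(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= only_userspace,	/* NULL would mean "all tasks" */
		.process_task	= poke_task,
		.heap		= NULL,			/* let a temporary heap be used */
	};

	return cgroup_scan_tasks(&scan);
}
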
| 1729 | /* | 1923 | /* |
| 1730 | * Stuff for reading the 'tasks' file. | 1924 | * Stuff for reading the 'tasks' file. |
| 1731 | * | 1925 | * |
| @@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) | |||
| 1761 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 1955 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
| 1762 | if (unlikely(n == npids)) | 1956 | if (unlikely(n == npids)) |
| 1763 | break; | 1957 | break; |
| 1764 | pidarray[n++] = task_pid_nr(tsk); | 1958 | pidarray[n++] = task_pid_vnr(tsk); |
| 1765 | } | 1959 | } |
| 1766 | cgroup_iter_end(cgrp, &it); | 1960 | cgroup_iter_end(cgrp, &it); |
| 1767 | return n; | 1961 | return n; |
| @@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp) | |||
| 2126 | * matter, since it can only happen if the cgroup | 2320 | * matter, since it can only happen if the cgroup |
| 2127 | * has been deleted and hence no longer needs the | 2321 | * has been deleted and hence no longer needs the |
| 2128 | * release agent to be called anyway. */ | 2322 | * release agent to be called anyway. */ |
| 2129 | if (css && atomic_read(&css->refcnt)) { | 2323 | if (css && atomic_read(&css->refcnt)) |
| 2130 | return 1; | 2324 | return 1; |
| 2131 | } | ||
| 2132 | } | 2325 | } |
| 2133 | return 0; | 2326 | return 0; |
| 2134 | } | 2327 | } |
| @@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2138 | struct cgroup *cgrp = dentry->d_fsdata; | 2331 | struct cgroup *cgrp = dentry->d_fsdata; |
| 2139 | struct dentry *d; | 2332 | struct dentry *d; |
| 2140 | struct cgroup *parent; | 2333 | struct cgroup *parent; |
| 2141 | struct cgroup_subsys *ss; | ||
| 2142 | struct super_block *sb; | 2334 | struct super_block *sb; |
| 2143 | struct cgroupfs_root *root; | 2335 | struct cgroupfs_root *root; |
| 2144 | 2336 | ||
| @@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2157 | parent = cgrp->parent; | 2349 | parent = cgrp->parent; |
| 2158 | root = cgrp->root; | 2350 | root = cgrp->root; |
| 2159 | sb = root->sb; | 2351 | sb = root->sb; |
| 2352 | /* | ||
| 2353 | * Call the pre_destroy() handlers of the subsystems | ||
| 2354 | */ | ||
| 2355 | cgroup_call_pre_destroy(cgrp); | ||
| 2356 | /* | ||
| 2357 | * Notify subsystems that the rmdir() request has arrived. | ||
| 2358 | */ | ||
| 2160 | 2359 | ||
| 2161 | if (cgroup_has_css_refs(cgrp)) { | 2360 | if (cgroup_has_css_refs(cgrp)) { |
| 2162 | mutex_unlock(&cgroup_mutex); | 2361 | mutex_unlock(&cgroup_mutex); |
| 2163 | return -EBUSY; | 2362 | return -EBUSY; |
| 2164 | } | 2363 | } |
| 2165 | 2364 | ||
| 2166 | for_each_subsys(root, ss) { | ||
| 2167 | if (cgrp->subsys[ss->subsys_id]) | ||
| 2168 | ss->destroy(ss, cgrp); | ||
| 2169 | } | ||
| 2170 | |||
| 2171 | spin_lock(&release_list_lock); | 2365 | spin_lock(&release_list_lock); |
| 2172 | set_bit(CGRP_REMOVED, &cgrp->flags); | 2366 | set_bit(CGRP_REMOVED, &cgrp->flags); |
| 2173 | if (!list_empty(&cgrp->release_list)) | 2367 | if (!list_empty(&cgrp->release_list)) |
| @@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2182 | 2376 | ||
| 2183 | cgroup_d_remove_dir(d); | 2377 | cgroup_d_remove_dir(d); |
| 2184 | dput(d); | 2378 | dput(d); |
| 2185 | root->number_of_cgroups--; | ||
| 2186 | 2379 | ||
| 2187 | set_bit(CGRP_RELEASABLE, &parent->flags); | 2380 | set_bit(CGRP_RELEASABLE, &parent->flags); |
| 2188 | check_for_release(parent); | 2381 | check_for_release(parent); |
| 2189 | 2382 | ||
| 2190 | mutex_unlock(&cgroup_mutex); | 2383 | mutex_unlock(&cgroup_mutex); |
| 2191 | /* Drop the active superblock reference that we took when we | ||
| 2192 | * created the cgroup */ | ||
| 2193 | deactivate_super(sb); | ||
| 2194 | return 0; | 2384 | return 0; |
| 2195 | } | 2385 | } |
| 2196 | 2386 | ||
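
After this reordering, cgroup_rmdir() only gates removal: pre_destroy() lets subsystems drop stray references before the cgroup_has_css_refs() check, while the destructive work has moved to cgroup_diput() earlier in this patch. A condensed, non-literal sketch of the resulting split (the two function names are invented; every call in their bodies appears in the hunks above):

static int rmdir_side_sketch(struct cgroup *cgrp, struct dentry *d)
{
	cgroup_call_pre_destroy(cgrp);		/* subsystems drop stray refs  */
	if (cgroup_has_css_refs(cgrp))
		return -EBUSY;			/* still pinned: rmdir fails   */
	set_bit(CGRP_REMOVED, &cgrp->flags);
	cgroup_d_remove_dir(d);
	dput(d);				/* may drop the last dentry ref */
	return 0;
}

static void diput_side_sketch(struct cgroup *cgrp)
{
	/* runs once the dentry is finally released:
	 * for_each_subsys(cgrp->root, ss) ss->destroy(ss, cgrp); */
	cgrp->root->number_of_cgroups--;
	deactivate_super(cgrp->root->sb);
	kfree(cgrp);
}
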
| @@ -2324,7 +2514,7 @@ out: | |||
| 2324 | * - Used for /proc/<pid>/cgroup. | 2514 | * - Used for /proc/<pid>/cgroup. |
| 2325 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it | 2515 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it |
| 2326 | * doesn't really matter if tsk->cgroup changes after we read it, | 2516 | * doesn't really matter if tsk->cgroup changes after we read it, |
| 2327 | * and we take cgroup_mutex, keeping attach_task() from changing it | 2517 | * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it |
| 2328 | * anyway. No need to check that tsk->cgroup != NULL, thanks to | 2518 | * anyway. No need to check that tsk->cgroup != NULL, thanks to |
| 2329 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks | 2519 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks |
| 2330 | * cgroup to top_cgroup. | 2520 | * cgroup to top_cgroup. |
| @@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = { | |||
| 2435 | * A pointer to the shared css_set was automatically copied in | 2625 | * A pointer to the shared css_set was automatically copied in |
| 2436 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 2626 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
| 2437 | * it was not made under the protection of RCU or cgroup_mutex, so | 2627 | * it was not made under the protection of RCU or cgroup_mutex, so |
| 2438 | * might no longer be a valid cgroup pointer. attach_task() might | 2628 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might |
| 2439 | * have already changed current->cgroups, allowing the previously | 2629 | * have already changed current->cgroups, allowing the previously |
| 2440 | * referenced cgroup group to be removed and freed. | 2630 | * referenced cgroup group to be removed and freed. |
| 2441 | * | 2631 | * |
| @@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 2514 | * attach us to a different cgroup, decrementing the count on | 2704 | * attach us to a different cgroup, decrementing the count on |
| 2515 | * the first cgroup that we never incremented. But in this case, | 2705 | * the first cgroup that we never incremented. But in this case, |
| 2516 | * top_cgroup isn't going away, and either task has PF_EXITING set, | 2706 | * top_cgroup isn't going away, and either task has PF_EXITING set, |
| 2517 | * which wards off any attach_task() attempts, or task is a failed | 2707 | * which wards off any cgroup_attach_task() attempts, or task is a failed |
| 2518 | * fork, never visible to attach_task. | 2708 | * fork, never visible to cgroup_attach_task. |
| 2519 | * | 2709 | * |
| 2520 | */ | 2710 | */ |
| 2521 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 2711 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
| @@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) | |||
| 2655 | } | 2845 | } |
| 2656 | 2846 | ||
| 2657 | /* All seems fine. Finish by moving the task into the new cgroup */ | 2847 | /* All seems fine. Finish by moving the task into the new cgroup */ |
| 2658 | ret = attach_task(child, tsk); | 2848 | ret = cgroup_attach_task(child, tsk); |
| 2659 | mutex_unlock(&cgroup_mutex); | 2849 | mutex_unlock(&cgroup_mutex); |
| 2660 | 2850 | ||
| 2661 | out_release: | 2851 | out_release: |
