aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Paul Menage <menage@google.com> 2007-10-19 02:39:32 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:36 -0400
commit: bbcb81d09104f0d440974b994c1fc508ccbe9503 (patch)
tree: 6d9ef3e2c611bb0a8f63519196f7bd7725b7ea1a
parent: ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (diff)
Task Control Groups: add tasks file interface
Add the per-directory "tasks" file for cgroupfs mounts; this allows the user to determine which tasks are members of a cgroup by reading the cgroup's "tasks" file, and to move a task into a cgroup by writing the task's pid to that cgroup's "tasks" file.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/cgroup.h10
-rw-r--r--kernel/cgroup.c359
2 files changed, 368 insertions, 1 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 60735dcf427..e2dd44f68f9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -144,6 +144,16 @@ int cgroup_is_removed(const struct cgroup *cont);
144 144
145int cgroup_path(const struct cgroup *cont, char *buf, int buflen); 145int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
146 146
147int __cgroup_task_count(const struct cgroup *cont);
/*
 * Return the number of tasks currently attached to 'cont'.
 *
 * Wraps __cgroup_task_count() in an RCU read-side critical section so
 * that callers do not have to enter one themselves.
 */
static inline int cgroup_task_count(const struct cgroup *cont)
{
	int nr_tasks;

	rcu_read_lock();
	nr_tasks = __cgroup_task_count(cont);
	rcu_read_unlock();

	return nr_tasks;
}
156
147/* Return true if the cgroup is a descendant of the current cgroup */ 157/* Return true if the cgroup is a descendant of the current cgroup */
148int cgroup_is_descendant(const struct cgroup *cont); 158int cgroup_is_descendant(const struct cgroup *cont);
149 159
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6ba857bec71..356c40d5d20 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,7 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/spinlock.h> 41#include <linux/spinlock.h>
42#include <linux/string.h> 42#include <linux/string.h>
43 43#include <linux/sort.h>
44#include <asm/atomic.h> 44#include <asm/atomic.h>
45 45
46/* Generate an array of cgroup subsystem pointers */ 46/* Generate an array of cgroup subsystem pointers */
@@ -700,6 +700,127 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen)
700 return 0; 700 return 0;
701} 701}
702 702
703/*
704 * Return the first subsystem attached to a cgroup's hierarchy, and
705 * its subsystem id.
706 */
707
708static void get_first_subsys(const struct cgroup *cont,
709 struct cgroup_subsys_state **css, int *subsys_id)
710{
711 const struct cgroupfs_root *root = cont->root;
712 const struct cgroup_subsys *test_ss;
713 BUG_ON(list_empty(&root->subsys_list));
714 test_ss = list_entry(root->subsys_list.next,
715 struct cgroup_subsys, sibling);
716 if (css) {
717 *css = cont->subsys[test_ss->subsys_id];
718 BUG_ON(!*css);
719 }
720 if (subsys_id)
721 *subsys_id = test_ss->subsys_id;
722}
723
724/*
725 * Attach task 'tsk' to cgroup 'cont'
726 *
727 * Call holding cgroup_mutex. May take task_lock of
728 * the task 'pid' during call.
729 */
730static int attach_task(struct cgroup *cont, struct task_struct *tsk)
731{
732 int retval = 0;
733 struct cgroup_subsys *ss;
734 struct cgroup *oldcont;
735 struct css_set *cg = &tsk->cgroups;
736 struct cgroupfs_root *root = cont->root;
737 int i;
738 int subsys_id;
739
740 get_first_subsys(cont, NULL, &subsys_id);
741
742 /* Nothing to do if the task is already in that cgroup */
743 oldcont = task_cgroup(tsk, subsys_id);
744 if (cont == oldcont)
745 return 0;
746
747 for_each_subsys(root, ss) {
748 if (ss->can_attach) {
749 retval = ss->can_attach(ss, cont, tsk);
750 if (retval) {
751 return retval;
752 }
753 }
754 }
755
756 task_lock(tsk);
757 if (tsk->flags & PF_EXITING) {
758 task_unlock(tsk);
759 return -ESRCH;
760 }
761 /* Update the css_set pointers for the subsystems in this
762 * hierarchy */
763 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
764 if (root->subsys_bits & (1ull << i)) {
765 /* Subsystem is in this hierarchy. So we want
766 * the subsystem state from the new
767 * cgroup. Transfer the refcount from the
768 * old to the new */
769 atomic_inc(&cont->count);
770 atomic_dec(&cg->subsys[i]->cgroup->count);
771 rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
772 }
773 }
774 task_unlock(tsk);
775
776 for_each_subsys(root, ss) {
777 if (ss->attach) {
778 ss->attach(ss, cont, oldcont, tsk);
779 }
780 }
781
782 synchronize_rcu();
783 return 0;
784}
785
786/*
787 * Attach task with pid 'pid' to cgroup 'cont'. Call with
788 * cgroup_mutex, may take task_lock of task
789 */
790static int attach_task_by_pid(struct cgroup *cont, char *pidbuf)
791{
792 pid_t pid;
793 struct task_struct *tsk;
794 int ret;
795
796 if (sscanf(pidbuf, "%d", &pid) != 1)
797 return -EIO;
798
799 if (pid) {
800 rcu_read_lock();
801 tsk = find_task_by_pid(pid);
802 if (!tsk || tsk->flags & PF_EXITING) {
803 rcu_read_unlock();
804 return -ESRCH;
805 }
806 get_task_struct(tsk);
807 rcu_read_unlock();
808
809 if ((current->euid) && (current->euid != tsk->uid)
810 && (current->euid != tsk->suid)) {
811 put_task_struct(tsk);
812 return -EACCES;
813 }
814 } else {
815 tsk = current;
816 get_task_struct(tsk);
817 }
818
819 ret = attach_task(cont, tsk);
820 put_task_struct(tsk);
821 return ret;
822}
823
703/* The various types of files and directories in a cgroup file system */ 824/* The various types of files and directories in a cgroup file system */
704 825
705enum cgroup_filetype { 826enum cgroup_filetype {
@@ -708,6 +829,55 @@ enum cgroup_filetype {
708 FILE_TASKLIST, 829 FILE_TASKLIST,
709}; 830};
710 831
832static ssize_t cgroup_common_file_write(struct cgroup *cont,
833 struct cftype *cft,
834 struct file *file,
835 const char __user *userbuf,
836 size_t nbytes, loff_t *unused_ppos)
837{
838 enum cgroup_filetype type = cft->private;
839 char *buffer;
840 int retval = 0;
841
842 if (nbytes >= PATH_MAX)
843 return -E2BIG;
844
845 /* +1 for nul-terminator */
846 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
847 if (buffer == NULL)
848 return -ENOMEM;
849
850 if (copy_from_user(buffer, userbuf, nbytes)) {
851 retval = -EFAULT;
852 goto out1;
853 }
854 buffer[nbytes] = 0; /* nul-terminate */
855
856 mutex_lock(&cgroup_mutex);
857
858 if (cgroup_is_removed(cont)) {
859 retval = -ENODEV;
860 goto out2;
861 }
862
863 switch (type) {
864 case FILE_TASKLIST:
865 retval = attach_task_by_pid(cont, buffer);
866 break;
867 default:
868 retval = -EINVAL;
869 goto out2;
870 }
871
872 if (retval == 0)
873 retval = nbytes;
874out2:
875 mutex_unlock(&cgroup_mutex);
876out1:
877 kfree(buffer);
878 return retval;
879}
880
711static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 881static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
712 size_t nbytes, loff_t *ppos) 882 size_t nbytes, loff_t *ppos)
713{ 883{
@@ -914,6 +1084,189 @@ int cgroup_add_files(struct cgroup *cont,
914 return 0; 1084 return 0;
915} 1085}
916 1086
1087/* Count the number of tasks in a cgroup. Could be made more
1088 * time-efficient but less space-efficient with more linked lists
1089 * running through each cgroup and the css_set structures that
1090 * referenced it. Must be called with tasklist_lock held for read or
1091 * write or in an rcu critical section.
1092 */
1093int __cgroup_task_count(const struct cgroup *cont)
1094{
1095 int count = 0;
1096 struct task_struct *g, *p;
1097 struct cgroup_subsys_state *css;
1098 int subsys_id;
1099
1100 get_first_subsys(cont, &css, &subsys_id);
1101 do_each_thread(g, p) {
1102 if (task_subsys_state(p, subsys_id) == css)
1103 count ++;
1104 } while_each_thread(g, p);
1105 return count;
1106}
1107
/*
 * Support for reading the 'tasks' file.
 *
 * A cgroup with *lots* of attached tasks can produce more data than a
 * single read() returns, yet the listing is only guaranteed correct
 * if it is generated atomically.  So the listing is generated once,
 * when the file is opened for reading: open() allocates a struct
 * ctr_struct plus a text buffer, formats the pids into that buffer,
 * and stores the struct ctr_struct pointer in file->private_data.
 * read() then just copies out of the buffer, and release() frees both
 * allocations when the file is closed.
 */
struct ctr_struct {
	char *buf;	/* formatted pid listing; NULL when the cgroup was empty */
	int bufsz;	/* number of bytes of 'buf' that read() may return */
};
1126
1127/*
1128 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
1129 * 'cont'. Return actual number of pids loaded. No need to
1130 * task_lock(p) when reading out p->cgroup, since we're in an RCU
1131 * read section, so the css_set can't go away, and is
1132 * immutable after creation.
1133 */
1134static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont)
1135{
1136 int n = 0;
1137 struct task_struct *g, *p;
1138 struct cgroup_subsys_state *css;
1139 int subsys_id;
1140
1141 get_first_subsys(cont, &css, &subsys_id);
1142 rcu_read_lock();
1143 do_each_thread(g, p) {
1144 if (task_subsys_state(p, subsys_id) == css) {
1145 pidarray[n++] = pid_nr(task_pid(p));
1146 if (unlikely(n == npids))
1147 goto array_full;
1148 }
1149 } while_each_thread(g, p);
1150
1151array_full:
1152 rcu_read_unlock();
1153 return n;
1154}
1155
/*
 * sort() comparison callback ordering pid_t values ascending.
 *
 * Returns a negative, zero or positive value when *a is less than,
 * equal to or greater than *b.  The values are compared rather than
 * subtracted so that the result cannot overflow for any pair of
 * pid_t values.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t lhs = *(const pid_t *)a;
	const pid_t rhs = *(const pid_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1160
/*
 * Format the 'npids' entries of 'a' into 'buf' as decimal pids, each
 * followed by a newline.
 *
 * No more than 'sz' chars are written to 'buf', but the return value
 * is the number of chars that would have been produced had 'buf' been
 * large enough, so a caller can first call this with a tiny buffer to
 * learn the size it needs.
 */
static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
{
	int total = 0;
	int i;

	for (i = 0; i < npids; i++) {
		/* Space still left in buf, never negative. */
		int room = sz - total;

		if (room < 0)
			room = 0;
		total += snprintf(buf + total, room, "%d\n", a[i]);
	}
	return total;
}
1175
1176/*
1177 * Handle an open on 'tasks' file. Prepare a buffer listing the
1178 * process id's of tasks currently attached to the cgroup being opened.
1179 *
1180 * Does not require any specific cgroup mutexes, and does not take any.
1181 */
1182static int cgroup_tasks_open(struct inode *unused, struct file *file)
1183{
1184 struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
1185 struct ctr_struct *ctr;
1186 pid_t *pidarray;
1187 int npids;
1188 char c;
1189
1190 if (!(file->f_mode & FMODE_READ))
1191 return 0;
1192
1193 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
1194 if (!ctr)
1195 goto err0;
1196
1197 /*
1198 * If cgroup gets more users after we read count, we won't have
1199 * enough space - tough. This race is indistinguishable to the
1200 * caller from the case that the additional cgroup users didn't
1201 * show up until sometime later on.
1202 */
1203 npids = cgroup_task_count(cont);
1204 if (npids) {
1205 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
1206 if (!pidarray)
1207 goto err1;
1208
1209 npids = pid_array_load(pidarray, npids, cont);
1210 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
1211
1212 /* Call pid_array_to_buf() twice, first just to get bufsz */
1213 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
1214 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
1215 if (!ctr->buf)
1216 goto err2;
1217 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
1218
1219 kfree(pidarray);
1220 } else {
1221 ctr->buf = 0;
1222 ctr->bufsz = 0;
1223 }
1224 file->private_data = ctr;
1225 return 0;
1226
1227err2:
1228 kfree(pidarray);
1229err1:
1230 kfree(ctr);
1231err0:
1232 return -ENOMEM;
1233}
1234
1235static ssize_t cgroup_tasks_read(struct cgroup *cont,
1236 struct cftype *cft,
1237 struct file *file, char __user *buf,
1238 size_t nbytes, loff_t *ppos)
1239{
1240 struct ctr_struct *ctr = file->private_data;
1241
1242 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1243}
1244
1245static int cgroup_tasks_release(struct inode *unused_inode,
1246 struct file *file)
1247{
1248 struct ctr_struct *ctr;
1249
1250 if (file->f_mode & FMODE_READ) {
1251 ctr = file->private_data;
1252 kfree(ctr->buf);
1253 kfree(ctr);
1254 }
1255 return 0;
1256}
1257
1258/*
1259 * for the common functions, 'private' gives the type of file
1260 */
1261static struct cftype cft_tasks = {
1262 .name = "tasks",
1263 .open = cgroup_tasks_open,
1264 .read = cgroup_tasks_read,
1265 .write = cgroup_common_file_write,
1266 .release = cgroup_tasks_release,
1267 .private = FILE_TASKLIST,
1268};
1269
917static int cgroup_populate_dir(struct cgroup *cont) 1270static int cgroup_populate_dir(struct cgroup *cont)
918{ 1271{
919 int err; 1272 int err;
@@ -922,6 +1275,10 @@ static int cgroup_populate_dir(struct cgroup *cont)
922 /* First clear out any existing files */ 1275 /* First clear out any existing files */
923 cgroup_clear_directory(cont->dentry); 1276 cgroup_clear_directory(cont->dentry);
924 1277
1278 err = cgroup_add_file(cont, NULL, &cft_tasks);
1279 if (err < 0)
1280 return err;
1281
925 for_each_subsys(cont->root, ss) { 1282 for_each_subsys(cont->root, ss) {
926 if (ss->populate && (err = ss->populate(ss, cont)) < 0) 1283 if (ss->populate && (err = ss->populate(ss, cont)) < 0)
927 return err; 1284 return err;