diff options
author | Paul Menage <menage@google.com> | 2007-10-19 02:39:32 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-19 14:53:36 -0400 |
commit | bbcb81d09104f0d440974b994c1fc508ccbe9503 (patch) | |
tree | 6d9ef3e2c611bb0a8f63519196f7bd7725b7ea1a | |
parent | ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (diff) |
Task Control Groups: add tasks file interface
Add the per-directory "tasks" file for cgroupfs mounts. This allows the
user to determine which tasks are members of a cgroup by reading that
cgroup's "tasks" file, and to move a task into a cgroup by writing the
task's pid to that cgroup's "tasks" file.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/cgroup.h | 10 | ||||
-rw-r--r-- | kernel/cgroup.c | 359 |
2 files changed, 368 insertions, 1 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 60735dcf427..e2dd44f68f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -144,6 +144,16 @@ int cgroup_is_removed(const struct cgroup *cont); | |||
144 | 144 | ||
145 | int cgroup_path(const struct cgroup *cont, char *buf, int buflen); | 145 | int cgroup_path(const struct cgroup *cont, char *buf, int buflen); |
146 | 146 | ||
/*
 * Unlocked task counter.  Per its definition in kernel/cgroup.c the
 * caller must hold tasklist_lock or be inside an RCU read-side
 * critical section.
 */
int __cgroup_task_count(const struct cgroup *cont);

/*
 * Return the number of tasks attached to 'cont'.  Wraps
 * __cgroup_task_count() in its own rcu_read_lock()/rcu_read_unlock()
 * pair, so no locking is needed from the caller for the walk itself.
 */
static inline int cgroup_task_count(const struct cgroup *cont)
{
	int nr_tasks;

	rcu_read_lock();
	nr_tasks = __cgroup_task_count(cont);
	rcu_read_unlock();

	return nr_tasks;
}
156 | |||
147 | /* Return true if the cgroup is a descendant of the current cgroup */ | 157 | /* Return true if the cgroup is a descendant of the current cgroup */ |
148 | int cgroup_is_descendant(const struct cgroup *cont); | 158 | int cgroup_is_descendant(const struct cgroup *cont); |
149 | 159 | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ba857bec71..356c40d5d20 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -40,7 +40,7 @@ | |||
40 | #include <linux/magic.h> | 40 | #include <linux/magic.h> |
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/string.h> | 42 | #include <linux/string.h> |
43 | | 43 | #include <linux/sort.h> |
44 | #include <asm/atomic.h> | 44 | #include <asm/atomic.h> |
45 | 45 | ||
46 | /* Generate an array of cgroup subsystem pointers */ | 46 | /* Generate an array of cgroup subsystem pointers */ |
@@ -700,6 +700,127 @@ int cgroup_path(const struct cgroup *cont, char *buf, int buflen) | |||
700 | return 0; | 700 | return 0; |
701 | } | 701 | } |
702 | 702 | ||
703 | /* | ||
704 | * Return the first subsystem attached to a cgroup's hierarchy, and | ||
705 | * its subsystem id. | ||
706 | */ | ||
707 | |||
708 | static void get_first_subsys(const struct cgroup *cont, | ||
709 | struct cgroup_subsys_state **css, int *subsys_id) | ||
710 | { | ||
711 | const struct cgroupfs_root *root = cont->root; | ||
712 | const struct cgroup_subsys *test_ss; | ||
713 | BUG_ON(list_empty(&root->subsys_list)); | ||
714 | test_ss = list_entry(root->subsys_list.next, | ||
715 | struct cgroup_subsys, sibling); | ||
716 | if (css) { | ||
717 | *css = cont->subsys[test_ss->subsys_id]; | ||
718 | BUG_ON(!*css); | ||
719 | } | ||
720 | if (subsys_id) | ||
721 | *subsys_id = test_ss->subsys_id; | ||
722 | } | ||
723 | |||
724 | /* | ||
725 | * Attach task 'tsk' to cgroup 'cont' | ||
726 | * | ||
727 | * Call holding cgroup_mutex. May take task_lock of | ||
728 | * the task 'pid' during call. | ||
729 | */ | ||
730 | static int attach_task(struct cgroup *cont, struct task_struct *tsk) | ||
731 | { | ||
732 | int retval = 0; | ||
733 | struct cgroup_subsys *ss; | ||
734 | struct cgroup *oldcont; | ||
735 | struct css_set *cg = &tsk->cgroups; | ||
736 | struct cgroupfs_root *root = cont->root; | ||
737 | int i; | ||
738 | int subsys_id; | ||
739 | |||
740 | get_first_subsys(cont, NULL, &subsys_id); | ||
741 | |||
742 | /* Nothing to do if the task is already in that cgroup */ | ||
743 | oldcont = task_cgroup(tsk, subsys_id); | ||
744 | if (cont == oldcont) | ||
745 | return 0; | ||
746 | |||
747 | for_each_subsys(root, ss) { | ||
748 | if (ss->can_attach) { | ||
749 | retval = ss->can_attach(ss, cont, tsk); | ||
750 | if (retval) { | ||
751 | return retval; | ||
752 | } | ||
753 | } | ||
754 | } | ||
755 | |||
756 | task_lock(tsk); | ||
757 | if (tsk->flags & PF_EXITING) { | ||
758 | task_unlock(tsk); | ||
759 | return -ESRCH; | ||
760 | } | ||
761 | /* Update the css_set pointers for the subsystems in this | ||
762 | * hierarchy */ | ||
763 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
764 | if (root->subsys_bits & (1ull << i)) { | ||
765 | /* Subsystem is in this hierarchy. So we want | ||
766 | * the subsystem state from the new | ||
767 | * cgroup. Transfer the refcount from the | ||
768 | * old to the new */ | ||
769 | atomic_inc(&cont->count); | ||
770 | atomic_dec(&cg->subsys[i]->cgroup->count); | ||
771 | rcu_assign_pointer(cg->subsys[i], cont->subsys[i]); | ||
772 | } | ||
773 | } | ||
774 | task_unlock(tsk); | ||
775 | |||
776 | for_each_subsys(root, ss) { | ||
777 | if (ss->attach) { | ||
778 | ss->attach(ss, cont, oldcont, tsk); | ||
779 | } | ||
780 | } | ||
781 | |||
782 | synchronize_rcu(); | ||
783 | return 0; | ||
784 | } | ||
785 | |||
786 | /* | ||
787 | * Attach task with pid 'pid' to cgroup 'cont'. Call with | ||
788 | * cgroup_mutex, may take task_lock of task | ||
789 | */ | ||
790 | static int attach_task_by_pid(struct cgroup *cont, char *pidbuf) | ||
791 | { | ||
792 | pid_t pid; | ||
793 | struct task_struct *tsk; | ||
794 | int ret; | ||
795 | |||
796 | if (sscanf(pidbuf, "%d", &pid) != 1) | ||
797 | return -EIO; | ||
798 | |||
799 | if (pid) { | ||
800 | rcu_read_lock(); | ||
801 | tsk = find_task_by_pid(pid); | ||
802 | if (!tsk || tsk->flags & PF_EXITING) { | ||
803 | rcu_read_unlock(); | ||
804 | return -ESRCH; | ||
805 | } | ||
806 | get_task_struct(tsk); | ||
807 | rcu_read_unlock(); | ||
808 | |||
809 | if ((current->euid) && (current->euid != tsk->uid) | ||
810 | && (current->euid != tsk->suid)) { | ||
811 | put_task_struct(tsk); | ||
812 | return -EACCES; | ||
813 | } | ||
814 | } else { | ||
815 | tsk = current; | ||
816 | get_task_struct(tsk); | ||
817 | } | ||
818 | |||
819 | ret = attach_task(cont, tsk); | ||
820 | put_task_struct(tsk); | ||
821 | return ret; | ||
822 | } | ||
823 | |||
703 | /* The various types of files and directories in a cgroup file system */ | 824 | /* The various types of files and directories in a cgroup file system */ |
704 | 825 | ||
705 | enum cgroup_filetype { | 826 | enum cgroup_filetype { |
@@ -708,6 +829,55 @@ enum cgroup_filetype { | |||
708 | FILE_TASKLIST, | 829 | FILE_TASKLIST, |
709 | }; | 830 | }; |
710 | 831 | ||
832 | static ssize_t cgroup_common_file_write(struct cgroup *cont, | ||
833 | struct cftype *cft, | ||
834 | struct file *file, | ||
835 | const char __user *userbuf, | ||
836 | size_t nbytes, loff_t *unused_ppos) | ||
837 | { | ||
838 | enum cgroup_filetype type = cft->private; | ||
839 | char *buffer; | ||
840 | int retval = 0; | ||
841 | |||
842 | if (nbytes >= PATH_MAX) | ||
843 | return -E2BIG; | ||
844 | |||
845 | /* +1 for nul-terminator */ | ||
846 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); | ||
847 | if (buffer == NULL) | ||
848 | return -ENOMEM; | ||
849 | |||
850 | if (copy_from_user(buffer, userbuf, nbytes)) { | ||
851 | retval = -EFAULT; | ||
852 | goto out1; | ||
853 | } | ||
854 | buffer[nbytes] = 0; /* nul-terminate */ | ||
855 | |||
856 | mutex_lock(&cgroup_mutex); | ||
857 | |||
858 | if (cgroup_is_removed(cont)) { | ||
859 | retval = -ENODEV; | ||
860 | goto out2; | ||
861 | } | ||
862 | |||
863 | switch (type) { | ||
864 | case FILE_TASKLIST: | ||
865 | retval = attach_task_by_pid(cont, buffer); | ||
866 | break; | ||
867 | default: | ||
868 | retval = -EINVAL; | ||
869 | goto out2; | ||
870 | } | ||
871 | |||
872 | if (retval == 0) | ||
873 | retval = nbytes; | ||
874 | out2: | ||
875 | mutex_unlock(&cgroup_mutex); | ||
876 | out1: | ||
877 | kfree(buffer); | ||
878 | return retval; | ||
879 | } | ||
880 | |||
711 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 881 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
712 | size_t nbytes, loff_t *ppos) | 882 | size_t nbytes, loff_t *ppos) |
713 | { | 883 | { |
@@ -914,6 +1084,189 @@ int cgroup_add_files(struct cgroup *cont, | |||
914 | return 0; | 1084 | return 0; |
915 | } | 1085 | } |
916 | 1086 | ||
1087 | /* Count the number of tasks in a cgroup. Could be made more | ||
1088 | * time-efficient but less space-efficient with more linked lists | ||
1089 | * running through each cgroup and the css_set structures that | ||
1090 | * referenced it. Must be called with tasklist_lock held for read or | ||
1091 | * write or in an rcu critical section. | ||
1092 | */ | ||
1093 | int __cgroup_task_count(const struct cgroup *cont) | ||
1094 | { | ||
1095 | int count = 0; | ||
1096 | struct task_struct *g, *p; | ||
1097 | struct cgroup_subsys_state *css; | ||
1098 | int subsys_id; | ||
1099 | |||
1100 | get_first_subsys(cont, &css, &subsys_id); | ||
1101 | do_each_thread(g, p) { | ||
1102 | if (task_subsys_state(p, subsys_id) == css) | ||
1103 | count ++; | ||
1104 | } while_each_thread(g, p); | ||
1105 | return count; | ||
1106 | } | ||
1107 | |||
/*
 * Support for reading the 'tasks' file.
 *
 * A cgroup with *lots* of attached tasks can produce more data than
 * one read() returns, yet the listing is only guaranteed correct if it
 * is generated atomically.  The whole listing is therefore built once,
 * at open() time, and later read() calls just copy pieces of it out.
 *
 * open() allocates a struct ctr_struct together with the text buffer
 * it points to and stores the struct pointer in file->private_data;
 * release() frees both when the file is closed.
 */
struct ctr_struct {
	char *buf;	/* pid listing text; NULL when the cgroup has no tasks */
	int bufsz;	/* number of bytes of 'buf' handed out by read() */
};
1126 | |||
1127 | /* | ||
1128 | * Load into 'pidarray' up to 'npids' of the tasks using cgroup | ||
1129 | * 'cont'. Return actual number of pids loaded. No need to | ||
1130 | * task_lock(p) when reading out p->cgroup, since we're in an RCU | ||
1131 | * read section, so the css_set can't go away, and is | ||
1132 | * immutable after creation. | ||
1133 | */ | ||
1134 | static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cont) | ||
1135 | { | ||
1136 | int n = 0; | ||
1137 | struct task_struct *g, *p; | ||
1138 | struct cgroup_subsys_state *css; | ||
1139 | int subsys_id; | ||
1140 | |||
1141 | get_first_subsys(cont, &css, &subsys_id); | ||
1142 | rcu_read_lock(); | ||
1143 | do_each_thread(g, p) { | ||
1144 | if (task_subsys_state(p, subsys_id) == css) { | ||
1145 | pidarray[n++] = pid_nr(task_pid(p)); | ||
1146 | if (unlikely(n == npids)) | ||
1147 | goto array_full; | ||
1148 | } | ||
1149 | } while_each_thread(g, p); | ||
1150 | |||
1151 | array_full: | ||
1152 | rcu_read_unlock(); | ||
1153 | return n; | ||
1154 | } | ||
1155 | |||
/*
 * sort() comparison callback that orders pid_t values ascending.
 *
 * Returns a negative, zero or positive value when *a is less than,
 * equal to or greater than *b.  The values are compared rather than
 * subtracted so the result cannot overflow, and the const qualifier
 * of the arguments is preserved.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t lhs = *(const pid_t *)a;
	const pid_t rhs = *(const pid_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1160 | |||
/*
 * Format the 'npids' entries of array 'a' into 'buf' as decimal pids,
 * one per line.  At most 'sz' chars are written, but the return value
 * is the number of chars that would be written if 'buf' were large
 * enough, so a first call with a tiny buffer can be used to size the
 * real one.
 */
static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
{
	int total = 0;
	int idx;

	for (idx = 0; idx < npids; idx++) {
		/* Space still available; never negative once buf is full. */
		int room = (sz > total) ? sz - total : 0;

		total += snprintf(buf + total, room, "%d\n", a[idx]);
	}

	return total;
}
1175 | |||
1176 | /* | ||
1177 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
1178 | * process id's of tasks currently attached to the cgroup being opened. | ||
1179 | * | ||
1180 | * Does not require any specific cgroup mutexes, and does not take any. | ||
1181 | */ | ||
1182 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | ||
1183 | { | ||
1184 | struct cgroup *cont = __d_cont(file->f_dentry->d_parent); | ||
1185 | struct ctr_struct *ctr; | ||
1186 | pid_t *pidarray; | ||
1187 | int npids; | ||
1188 | char c; | ||
1189 | |||
1190 | if (!(file->f_mode & FMODE_READ)) | ||
1191 | return 0; | ||
1192 | |||
1193 | ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); | ||
1194 | if (!ctr) | ||
1195 | goto err0; | ||
1196 | |||
1197 | /* | ||
1198 | * If cgroup gets more users after we read count, we won't have | ||
1199 | * enough space - tough. This race is indistinguishable to the | ||
1200 | * caller from the case that the additional cgroup users didn't | ||
1201 | * show up until sometime later on. | ||
1202 | */ | ||
1203 | npids = cgroup_task_count(cont); | ||
1204 | if (npids) { | ||
1205 | pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); | ||
1206 | if (!pidarray) | ||
1207 | goto err1; | ||
1208 | |||
1209 | npids = pid_array_load(pidarray, npids, cont); | ||
1210 | sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); | ||
1211 | |||
1212 | /* Call pid_array_to_buf() twice, first just to get bufsz */ | ||
1213 | ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; | ||
1214 | ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); | ||
1215 | if (!ctr->buf) | ||
1216 | goto err2; | ||
1217 | ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); | ||
1218 | |||
1219 | kfree(pidarray); | ||
1220 | } else { | ||
1221 | ctr->buf = 0; | ||
1222 | ctr->bufsz = 0; | ||
1223 | } | ||
1224 | file->private_data = ctr; | ||
1225 | return 0; | ||
1226 | |||
1227 | err2: | ||
1228 | kfree(pidarray); | ||
1229 | err1: | ||
1230 | kfree(ctr); | ||
1231 | err0: | ||
1232 | return -ENOMEM; | ||
1233 | } | ||
1234 | |||
1235 | static ssize_t cgroup_tasks_read(struct cgroup *cont, | ||
1236 | struct cftype *cft, | ||
1237 | struct file *file, char __user *buf, | ||
1238 | size_t nbytes, loff_t *ppos) | ||
1239 | { | ||
1240 | struct ctr_struct *ctr = file->private_data; | ||
1241 | |||
1242 | return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); | ||
1243 | } | ||
1244 | |||
1245 | static int cgroup_tasks_release(struct inode *unused_inode, | ||
1246 | struct file *file) | ||
1247 | { | ||
1248 | struct ctr_struct *ctr; | ||
1249 | |||
1250 | if (file->f_mode & FMODE_READ) { | ||
1251 | ctr = file->private_data; | ||
1252 | kfree(ctr->buf); | ||
1253 | kfree(ctr); | ||
1254 | } | ||
1255 | return 0; | ||
1256 | } | ||
1257 | |||
1258 | /* | ||
1259 | * for the common functions, 'private' gives the type of file | ||
1260 | */ | ||
1261 | static struct cftype cft_tasks = { | ||
1262 | .name = "tasks", | ||
1263 | .open = cgroup_tasks_open, | ||
1264 | .read = cgroup_tasks_read, | ||
1265 | .write = cgroup_common_file_write, | ||
1266 | .release = cgroup_tasks_release, | ||
1267 | .private = FILE_TASKLIST, | ||
1268 | }; | ||
1269 | |||
917 | static int cgroup_populate_dir(struct cgroup *cont) | 1270 | static int cgroup_populate_dir(struct cgroup *cont) |
918 | { | 1271 | { |
919 | int err; | 1272 | int err; |
@@ -922,6 +1275,10 @@ static int cgroup_populate_dir(struct cgroup *cont) | |||
922 | /* First clear out any existing files */ | 1275 | /* First clear out any existing files */ |
923 | cgroup_clear_directory(cont->dentry); | 1276 | cgroup_clear_directory(cont->dentry); |
924 | 1277 | ||
1278 | err = cgroup_add_file(cont, NULL, &cft_tasks); | ||
1279 | if (err < 0) | ||
1280 | return err; | ||
1281 | |||
925 | for_each_subsys(cont->root, ss) { | 1282 | for_each_subsys(cont->root, ss) { |
926 | if (ss->populate && (err = ss->populate(ss, cont)) < 0) | 1283 | if (ss->populate && (err = ss->populate(ss, cont)) < 0) |
927 | return err; | 1284 | return err; |