author     Paul Menage <menage@google.com>    2008-10-18 23:28:04 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2008-10-20 11:52:38 -0400
commit     cc31edceee04a7b87f2be48f9489ebb72d264844 (patch)
tree       5d37791218c420281e509899645d89aee7902f2b
parent     146aa1bd0511f88ddb4e92fafa2b8aad4f2f65f3 (diff)
cgroups: convert tasks file to use a seq_file with shared pid array
Rather than pre-generating the entire text for the "tasks" file each time the
file is opened, we instead just generate/update the array of process ids and
use a seq_file to report these to userspace.  All open file handles on the
same "tasks" file can share a pid array, which may be updated any time that
no thread is actively reading the array.  By sharing the array, the potential
for userspace to DoS the system by opening many handles on the same "tasks"
file is removed.

[Based on a patch by Lai Jiangshan, extended to use seq_file]

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  include/linux/cgroup.h |  10
-rw-r--r--  kernel/cgroup.c        | 222
2 files changed, 149 insertions(+), 83 deletions(-)
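
As the commit message explains, the "tasks" file now reports a shared, sorted
pid array through seq_file as a newline-separated list of decimal pids. For
orientation only, a minimal userspace reader might look like the sketch below;
the mount point path is an assumption (cgroup hierarchies can be mounted
anywhere) and is not defined by this patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

int main(void)
{
	/* Assumed mount point; substitute wherever the cgroup fs is mounted. */
	FILE *f = fopen("/dev/cgroup/tasks", "r");
	int pid;

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* The tasks file is a newline-separated list of decimal pids. */
	while (fscanf(f, "%d", &pid) == 1)
		printf("member pid: %d\n", pid);
	fclose(f);
	return 0;
}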
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7166023e07d2..8ab91880a0ad 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
+#include <linux/rwsem.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -136,6 +137,15 @@ struct cgroup {
 	 * release_list_lock
 	 */
 	struct list_head release_list;
+
+	/* pids_mutex protects the fields below */
+	struct rw_semaphore pids_mutex;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the current tasks_pids array */
+	int pids_use_count;
+	/* Length of the current tasks_pids array */
+	int pids_length;
 };
 
 /* A css_set is a structure holding pointers to a set of
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1e49218457e0..046c1609606b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -868,6 +868,14 @@ static struct super_operations cgroup_ops = {
 	.remount_fs = cgroup_remount,
 };
 
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+	INIT_LIST_HEAD(&cgrp->sibling);
+	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->release_list);
+	init_rwsem(&cgrp->pids_mutex);
+}
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
@@ -876,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1995,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  * but we cannot guarantee that the information we produce is correct
  * unless we produce it entirely atomically.
  *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here). The struct
- * ctr_struct * is stored in file->private_data. Its resources will
- * be freed by release() when the file is closed. The array is used
- * to sprintf the PIDs and then used by read().
  */
-struct ctr_struct {
-	char *buf;
-	int bufsz;
-};
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2086,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
  */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 {
-	int cnt = 0;
-	int i;
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct cgroup *cgrp = s->private;
+	int index = 0, pid = *pos;
+	int *iter;
 
-	for (i = 0; i < npids; i++)
-		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
-	return cnt;
+	down_read(&cgrp->pids_mutex);
+	if (pid) {
+		int end = cgrp->pids_length;
+		int i;
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (cgrp->tasks_pids[mid] == pid) {
+				index = mid;
+				break;
+			} else if (cgrp->tasks_pids[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= cgrp->pids_length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = cgrp->tasks_pids + index;
+	*pos = *iter;
+	return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+	struct cgroup *cgrp = s->private;
+	up_read(&cgrp->pids_mutex);
 }
 
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct cgroup *cgrp = s->private;
+	int *p = v;
+	int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+	return seq_printf(s, "%d\n", *(int *)v);
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+	.start = cgroup_tasks_start,
+	.stop = cgroup_tasks_stop,
+	.next = cgroup_tasks_next,
+	.show = cgroup_tasks_show,
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+	down_write(&cgrp->pids_mutex);
+	BUG_ON(!cgrp->pids_use_count);
+	if (!--cgrp->pids_use_count) {
+		kfree(cgrp->tasks_pids);
+		cgrp->tasks_pids = NULL;
+		cgrp->pids_length = 0;
+	}
+	up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	release_cgroup_pid_array(cgrp);
+	return seq_release(inode, file);
+}
+
+static struct file_operations cgroup_tasks_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.write = cgroup_file_write,
+	.release = cgroup_tasks_release,
+};
+
 /*
- * Handle an open on 'tasks' file. Prepare a buffer listing the
+ * Handle an open on 'tasks' file. Prepare an array containing the
  * process id's of tasks currently attached to the cgroup being opened.
- *
- * Does not require any specific cgroup mutexes, and does not take any.
  */
+
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct ctr_struct *ctr;
 	pid_t *pidarray;
 	int npids;
-	char c;
+	int retval;
 
+	/* Nothing to do for write-only files */
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
-	if (!ctr)
-		goto err0;
-
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough. This race is indistinguishable to the
@@ -2129,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * show up until sometime later on.
 	 */
 	npids = cgroup_task_count(cgrp);
-	if (npids) {
-		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-		if (!pidarray)
-			goto err1;
-
-		npids = pid_array_load(pidarray, npids, cgrp);
-		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-		/* Call pid_array_to_buf() twice, first just to get bufsz */
-		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
-		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
-		if (!ctr->buf)
-			goto err2;
-		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
-		kfree(pidarray);
-	} else {
-		ctr->buf = NULL;
-		ctr->bufsz = 0;
-	}
-	file->private_data = ctr;
-	return 0;
-
-err2:
-	kfree(pidarray);
-err1:
-	kfree(ctr);
-err0:
-	return -ENOMEM;
-}
-
-static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
-				 struct cftype *cft,
-				 struct file *file, char __user *buf,
-				 size_t nbytes, loff_t *ppos)
-{
-	struct ctr_struct *ctr = file->private_data;
+	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	if (!pidarray)
+		return -ENOMEM;
+	npids = pid_array_load(pidarray, npids, cgrp);
+	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
 
-	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
-}
+	/*
+	 * Store the array in the cgroup, freeing the old
+	 * array if necessary
+	 */
+	down_write(&cgrp->pids_mutex);
+	kfree(cgrp->tasks_pids);
+	cgrp->tasks_pids = pidarray;
+	cgrp->pids_length = npids;
+	cgrp->pids_use_count++;
+	up_write(&cgrp->pids_mutex);
 
-static int cgroup_tasks_release(struct inode *unused_inode,
-				struct file *file)
-{
-	struct ctr_struct *ctr;
+	file->f_op = &cgroup_tasks_operations;
 
-	if (file->f_mode & FMODE_READ) {
-		ctr = file->private_data;
-		kfree(ctr->buf);
-		kfree(ctr);
+	retval = seq_open(file, &cgroup_tasks_seq_operations);
+	if (retval) {
+		release_cgroup_pid_array(cgrp);
+		return retval;
 	}
+	((struct seq_file *)file->private_data)->private = cgrp;
 	return 0;
 }
 
@@ -2208,7 +2268,6 @@ static struct cftype files[] = {
 	{
 		.name = "tasks",
 		.open = cgroup_tasks_open,
-		.read = cgroup_tasks_read,
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
@@ -2298,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	mutex_lock(&cgroup_mutex);
 
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
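
For readers tracing the new seq_file iterator: cgroup_tasks_start() above
resumes iteration by binary-searching the sorted pid array for the first
entry not smaller than the saved seq_file position. A standalone restatement
of that search, for illustration only; the helper name and sample data below
are made up and not part of the patch.

#include <stdio.h>

/* Hypothetical helper mirroring the resume logic in cgroup_tasks_start(). */
static int find_resume_index(const int *pids, int length, int pos)
{
	int index = 0, end = length;

	if (!pos)
		return 0;		/* first call: start at the beginning */
	while (index < end) {
		int mid = (index + end) / 2;

		if (pids[mid] == pos) {
			index = mid;	/* exact match: this pid is shown next */
			break;
		} else if (pids[mid] <= pos)
			index = mid + 1;
		else
			end = mid;
	}
	return index;			/* index == length means iteration is done */
}

int main(void)
{
	int pids[] = { 3, 17, 42, 108 };	/* sorted, as after sort() in open() */
	int idx = find_resume_index(pids, 4, 18);

	if (idx < 4)
		printf("resume at pid %d\n", pids[idx]);	/* prints 42 */
	return 0;
}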