aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Ben Blum <bblum@google.com> 2009-09-23 18:56:26 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-09-24 10:20:58 -0400
commit 102a775e3647628727ae83a9a6abf0564c3ca7cb (patch)
tree 77a3d9717daa0f1dceccc0dcdf821aa12e684e07 /kernel
parent 8f3ff20862cfcb85500a2bb55ee64622bd59fd0c (diff)
cgroups: add a read-only "procs" file similar to "tasks" that shows only unique tgids
struct cgroup used to have a bunch of fields for keeping track of the pidlist for the tasks file. Those are now separated into a new struct cgroup_pidlist, of which two are had, one for procs and one for tasks. The way the seq_file operations are set up is changed so that just the pidlist struct gets passed around as the private data.

Interface example: Suppose a multithreaded process has pid 1000 and other threads with ids 1001, 1002, 1003:

    $ cat tasks
    1000
    1001
    1002
    1003
    $ cat cgroup.procs
    1000
    $

Signed-off-by: Ben Blum <bblum@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cgroup.c 278
1 files changed, 172 insertions, 106 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 22db0a7cf1fa..a9433f50e53d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1121,7 +1121,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1121 INIT_LIST_HEAD(&cgrp->children); 1121 INIT_LIST_HEAD(&cgrp->children);
1122 INIT_LIST_HEAD(&cgrp->css_sets); 1122 INIT_LIST_HEAD(&cgrp->css_sets);
1123 INIT_LIST_HEAD(&cgrp->release_list); 1123 INIT_LIST_HEAD(&cgrp->release_list);
1124 init_rwsem(&cgrp->pids_mutex); 1124 init_rwsem(&(cgrp->tasks.mutex));
1125 init_rwsem(&(cgrp->procs.mutex));
1125} 1126}
1126 1127
1127static void init_cgroup_root(struct cgroupfs_root *root) 1128static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1637,15 +1638,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1637 return ret; 1638 return ret;
1638} 1639}
1639 1640
1640/* The various types of files and directories in a cgroup file system */
1641enum cgroup_filetype {
1642 FILE_ROOT,
1643 FILE_DIR,
1644 FILE_TASKLIST,
1645 FILE_NOTIFY_ON_RELEASE,
1646 FILE_RELEASE_AGENT,
1647};
1648
1649/** 1641/**
1650 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1642 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1651 * @cgrp: the cgroup to be checked for liveness 1643 * @cgrp: the cgroup to be checked for liveness
@@ -2343,7 +2335,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2343} 2335}
2344 2336
2345/* 2337/*
2346 * Stuff for reading the 'tasks' file. 2338 * Stuff for reading the 'tasks'/'procs' files.
2347 * 2339 *
2348 * Reading this file can return large amounts of data if a cgroup has 2340 * Reading this file can return large amounts of data if a cgroup has
2349 * *lots* of attached tasks. So it may need several calls to read(), 2341 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2353,27 +2345,106 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2353 */ 2345 */
2354 2346
2355/* 2347/*
2356 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2348 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2357 * 'cgrp'. Return actual number of pids loaded. No need to 2349 * If the new stripped list is sufficiently smaller and there's enough memory
2358 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2350 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2359 * read section, so the css_set can't go away, and is 2351 * number of unique elements.
2360 * immutable after creation.
2361 */ 2352 */
2362static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2353/* is the size difference enough that we should re-allocate the array? */
2354#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2355static int pidlist_uniq(pid_t **p, int length)
2363{ 2356{
2364 int n = 0, pid; 2357 int src, dest = 1;
2358 pid_t *list = *p;
2359 pid_t *newlist;
2360
2361 /*
2362 * we presume the 0th element is unique, so i starts at 1. trivial
2363 * edge cases first; no work needs to be done for either
2364 */
2365 if (length == 0 || length == 1)
2366 return length;
2367 /* src and dest walk down the list; dest counts unique elements */
2368 for (src = 1; src < length; src++) {
2369 /* find next unique element */
2370 while (list[src] == list[src-1]) {
2371 src++;
2372 if (src == length)
2373 goto after;
2374 }
2375 /* dest always points to where the next unique element goes */
2376 list[dest] = list[src];
2377 dest++;
2378 }
2379after:
2380 /*
2381 * if the length difference is large enough, we want to allocate a
2382 * smaller buffer to save memory. if this fails due to out of memory,
2383 * we'll just stay with what we've got.
2384 */
2385 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2386 newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL);
2387 if (newlist)
2388 *p = newlist;
2389 }
2390 return dest;
2391}
2392
2393static int cmppid(const void *a, const void *b)
2394{
2395 return *(pid_t *)a - *(pid_t *)b;
2396}
2397
2398/*
2399 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2400 */
2401static int pidlist_array_load(struct cgroup *cgrp, bool procs)
2402{
2403 pid_t *array;
2404 int length;
2405 int pid, n = 0; /* used for populating the array */
2365 struct cgroup_iter it; 2406 struct cgroup_iter it;
2366 struct task_struct *tsk; 2407 struct task_struct *tsk;
2408 struct cgroup_pidlist *l;
2409
2410 /*
2411 * If cgroup gets more users after we read count, we won't have
2412 * enough space - tough. This race is indistinguishable to the
2413 * caller from the case that the additional cgroup users didn't
2414 * show up until sometime later on.
2415 */
2416 length = cgroup_task_count(cgrp);
2417 array = kmalloc(length * sizeof(pid_t), GFP_KERNEL);
2418 if (!array)
2419 return -ENOMEM;
2420 /* now, populate the array */
2367 cgroup_iter_start(cgrp, &it); 2421 cgroup_iter_start(cgrp, &it);
2368 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2422 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2369 if (unlikely(n == npids)) 2423 if (unlikely(n == length))
2370 break; 2424 break;
2371 pid = task_pid_vnr(tsk); 2425 /* get tgid or pid for procs or tasks file respectively */
2372 if (pid > 0) 2426 pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk));
2373 pidarray[n++] = pid; 2427 if (pid > 0) /* make sure to only use valid results */
2428 array[n++] = pid;
2374 } 2429 }
2375 cgroup_iter_end(cgrp, &it); 2430 cgroup_iter_end(cgrp, &it);
2376 return n; 2431 length = n;
2432 /* now sort & (if procs) strip out duplicates */
2433 sort(array, length, sizeof(pid_t), cmppid, NULL);
2434 if (procs) {
2435 length = pidlist_uniq(&array, length);
2436 l = &(cgrp->procs);
2437 } else {
2438 l = &(cgrp->tasks);
2439 }
2440 /* store array in cgroup, freeing old if necessary */
2441 down_write(&l->mutex);
2442 kfree(l->list);
2443 l->list = array;
2444 l->length = length;
2445 l->use_count++;
2446 up_write(&l->mutex);
2447 return 0;
2377} 2448}
2378 2449
2379/** 2450/**
@@ -2430,19 +2501,14 @@ err:
2430 return ret; 2501 return ret;
2431} 2502}
2432 2503
2433static int cmppid(const void *a, const void *b)
2434{
2435 return *(pid_t *)a - *(pid_t *)b;
2436}
2437
2438 2504
2439/* 2505/*
2440 * seq_file methods for the "tasks" file. The seq_file position is the 2506 * seq_file methods for the tasks/procs files. The seq_file position is the
2441 * next pid to display; the seq_file iterator is a pointer to the pid 2507 * next pid to display; the seq_file iterator is a pointer to the pid
2442 * in the cgroup->tasks_pids array. 2508 * in the cgroup->l->list array.
2443 */ 2509 */
2444 2510
2445static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2511static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2446{ 2512{
2447 /* 2513 /*
2448 * Initially we receive a position value that corresponds to 2514 * Initially we receive a position value that corresponds to
@@ -2450,46 +2516,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2450 * after a seek to the start). Use a binary-search to find the 2516 * after a seek to the start). Use a binary-search to find the
2451 * next pid to display, if any 2517 * next pid to display, if any
2452 */ 2518 */
2453 struct cgroup *cgrp = s->private; 2519 struct cgroup_pidlist *l = s->private;
2454 int index = 0, pid = *pos; 2520 int index = 0, pid = *pos;
2455 int *iter; 2521 int *iter;
2456 2522
2457 down_read(&cgrp->pids_mutex); 2523 down_read(&l->mutex);
2458 if (pid) { 2524 if (pid) {
2459 int end = cgrp->pids_length; 2525 int end = l->length;
2460 2526
2461 while (index < end) { 2527 while (index < end) {
2462 int mid = (index + end) / 2; 2528 int mid = (index + end) / 2;
2463 if (cgrp->tasks_pids[mid] == pid) { 2529 if (l->list[mid] == pid) {
2464 index = mid; 2530 index = mid;
2465 break; 2531 break;
2466 } else if (cgrp->tasks_pids[mid] <= pid) 2532 } else if (l->list[mid] <= pid)
2467 index = mid + 1; 2533 index = mid + 1;
2468 else 2534 else
2469 end = mid; 2535 end = mid;
2470 } 2536 }
2471 } 2537 }
2472 /* If we're off the end of the array, we're done */ 2538 /* If we're off the end of the array, we're done */
2473 if (index >= cgrp->pids_length) 2539 if (index >= l->length)
2474 return NULL; 2540 return NULL;
2475 /* Update the abstract position to be the actual pid that we found */ 2541 /* Update the abstract position to be the actual pid that we found */
2476 iter = cgrp->tasks_pids + index; 2542 iter = l->list + index;
2477 *pos = *iter; 2543 *pos = *iter;
2478 return iter; 2544 return iter;
2479} 2545}
2480 2546
2481static void cgroup_tasks_stop(struct seq_file *s, void *v) 2547static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2482{ 2548{
2483 struct cgroup *cgrp = s->private; 2549 struct cgroup_pidlist *l = s->private;
2484 up_read(&cgrp->pids_mutex); 2550 up_read(&l->mutex);
2485} 2551}
2486 2552
2487static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2553static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2488{ 2554{
2489 struct cgroup *cgrp = s->private; 2555 struct cgroup_pidlist *l = s->private;
2490 int *p = v; 2556 pid_t *p = v;
2491 int *end = cgrp->tasks_pids + cgrp->pids_length; 2557 pid_t *end = l->list + l->length;
2492
2493 /* 2558 /*
2494 * Advance to the next pid in the array. If this goes off the 2559 * Advance to the next pid in the array. If this goes off the
2495 * end, we're done 2560 * end, we're done
@@ -2503,98 +2568,94 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2503 } 2568 }
2504} 2569}
2505 2570
2506static int cgroup_tasks_show(struct seq_file *s, void *v) 2571static int cgroup_pidlist_show(struct seq_file *s, void *v)
2507{ 2572{
2508 return seq_printf(s, "%d\n", *(int *)v); 2573 return seq_printf(s, "%d\n", *(int *)v);
2509} 2574}
2510 2575
2511static const struct seq_operations cgroup_tasks_seq_operations = { 2576/*
2512 .start = cgroup_tasks_start, 2577 * seq_operations functions for iterating on pidlists through seq_file -
2513 .stop = cgroup_tasks_stop, 2578 * independent of whether it's tasks or procs
2514 .next = cgroup_tasks_next, 2579 */
2515 .show = cgroup_tasks_show, 2580static const struct seq_operations cgroup_pidlist_seq_operations = {
2581 .start = cgroup_pidlist_start,
2582 .stop = cgroup_pidlist_stop,
2583 .next = cgroup_pidlist_next,
2584 .show = cgroup_pidlist_show,
2516}; 2585};
2517 2586
2518static void release_cgroup_pid_array(struct cgroup *cgrp) 2587static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2519{ 2588{
2520 down_write(&cgrp->pids_mutex); 2589 down_write(&l->mutex);
2521 BUG_ON(!cgrp->pids_use_count); 2590 BUG_ON(!l->use_count);
2522 if (!--cgrp->pids_use_count) { 2591 if (!--l->use_count) {
2523 kfree(cgrp->tasks_pids); 2592 kfree(l->list);
2524 cgrp->tasks_pids = NULL; 2593 l->list = NULL;
2525 cgrp->pids_length = 0; 2594 l->length = 0;
2526 } 2595 }
2527 up_write(&cgrp->pids_mutex); 2596 up_write(&l->mutex);
2528} 2597}
2529 2598
2530static int cgroup_tasks_release(struct inode *inode, struct file *file) 2599static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2531{ 2600{
2532 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2601 struct cgroup_pidlist *l;
2533
2534 if (!(file->f_mode & FMODE_READ)) 2602 if (!(file->f_mode & FMODE_READ))
2535 return 0; 2603 return 0;
2536 2604 /*
2537 release_cgroup_pid_array(cgrp); 2605 * the seq_file will only be initialized if the file was opened for
2606 * reading; hence we check if it's not null only in that case.
2607 */
2608 l = ((struct seq_file *)file->private_data)->private;
2609 cgroup_release_pid_array(l);
2538 return seq_release(inode, file); 2610 return seq_release(inode, file);
2539} 2611}
2540 2612
2541static struct file_operations cgroup_tasks_operations = { 2613static const struct file_operations cgroup_pidlist_operations = {
2542 .read = seq_read, 2614 .read = seq_read,
2543 .llseek = seq_lseek, 2615 .llseek = seq_lseek,
2544 .write = cgroup_file_write, 2616 .write = cgroup_file_write,
2545 .release = cgroup_tasks_release, 2617 .release = cgroup_pidlist_release,
2546}; 2618};
2547 2619
2548/* 2620/*
2549 * Handle an open on 'tasks' file. Prepare an array containing the 2621 * The following functions handle opens on a file that displays a pidlist
2550 * process id's of tasks currently attached to the cgroup being opened. 2622 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2623 * in the cgroup.
2551 */ 2624 */
2552 2625/* helper function for the two below it */
2553static int cgroup_tasks_open(struct inode *unused, struct file *file) 2626static int cgroup_pidlist_open(struct file *file, bool procs)
2554{ 2627{
2555 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2628 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2556 pid_t *pidarray; 2629 struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks);
2557 int npids;
2558 int retval; 2630 int retval;
2559 2631
2560 /* Nothing to do for write-only files */ 2632 /* Nothing to do for write-only files */
2561 if (!(file->f_mode & FMODE_READ)) 2633 if (!(file->f_mode & FMODE_READ))
2562 return 0; 2634 return 0;
2563 2635
2564 /* 2636 /* have the array populated */
2565 * If cgroup gets more users after we read count, we won't have 2637 retval = pidlist_array_load(cgrp, procs);
2566 * enough space - tough. This race is indistinguishable to the 2638 if (retval)
2567 * caller from the case that the additional cgroup users didn't 2639 return retval;
2568 * show up until sometime later on. 2640 /* configure file information */
2569 */ 2641 file->f_op = &cgroup_pidlist_operations;
2570 npids = cgroup_task_count(cgrp);
2571 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2572 if (!pidarray)
2573 return -ENOMEM;
2574 npids = pid_array_load(pidarray, npids, cgrp);
2575 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2576
2577 /*
2578 * Store the array in the cgroup, freeing the old
2579 * array if necessary
2580 */
2581 down_write(&cgrp->pids_mutex);
2582 kfree(cgrp->tasks_pids);
2583 cgrp->tasks_pids = pidarray;
2584 cgrp->pids_length = npids;
2585 cgrp->pids_use_count++;
2586 up_write(&cgrp->pids_mutex);
2587
2588 file->f_op = &cgroup_tasks_operations;
2589 2642
2590 retval = seq_open(file, &cgroup_tasks_seq_operations); 2643 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2591 if (retval) { 2644 if (retval) {
2592 release_cgroup_pid_array(cgrp); 2645 cgroup_release_pid_array(l);
2593 return retval; 2646 return retval;
2594 } 2647 }
2595 ((struct seq_file *)file->private_data)->private = cgrp; 2648 ((struct seq_file *)file->private_data)->private = l;
2596 return 0; 2649 return 0;
2597} 2650}
2651static int cgroup_tasks_open(struct inode *unused, struct file *file)
2652{
2653 return cgroup_pidlist_open(file, false);
2654}
2655static int cgroup_procs_open(struct inode *unused, struct file *file)
2656{
2657 return cgroup_pidlist_open(file, true);
2658}
2598 2659
2599static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2660static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2600 struct cftype *cft) 2661 struct cftype *cft)
@@ -2617,21 +2678,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2617/* 2678/*
2618 * for the common functions, 'private' gives the type of file 2679 * for the common functions, 'private' gives the type of file
2619 */ 2680 */
2681/* for hysterical raisins, we can't put this on the older files */
2682#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2620static struct cftype files[] = { 2683static struct cftype files[] = {
2621 { 2684 {
2622 .name = "tasks", 2685 .name = "tasks",
2623 .open = cgroup_tasks_open, 2686 .open = cgroup_tasks_open,
2624 .write_u64 = cgroup_tasks_write, 2687 .write_u64 = cgroup_tasks_write,
2625 .release = cgroup_tasks_release, 2688 .release = cgroup_pidlist_release,
2626 .private = FILE_TASKLIST,
2627 .mode = S_IRUGO | S_IWUSR, 2689 .mode = S_IRUGO | S_IWUSR,
2628 }, 2690 },
2629 2691 {
2692 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2693 .open = cgroup_procs_open,
2694 /* .write_u64 = cgroup_procs_write, TODO */
2695 .release = cgroup_pidlist_release,
2696 .mode = S_IRUGO,
2697 },
2630 { 2698 {
2631 .name = "notify_on_release", 2699 .name = "notify_on_release",
2632 .read_u64 = cgroup_read_notify_on_release, 2700 .read_u64 = cgroup_read_notify_on_release,
2633 .write_u64 = cgroup_write_notify_on_release, 2701 .write_u64 = cgroup_write_notify_on_release,
2634 .private = FILE_NOTIFY_ON_RELEASE,
2635 }, 2702 },
2636}; 2703};
2637 2704
@@ -2640,7 +2707,6 @@ static struct cftype cft_release_agent = {
2640 .read_seq_string = cgroup_release_agent_show, 2707 .read_seq_string = cgroup_release_agent_show,
2641 .write_string = cgroup_release_agent_write, 2708 .write_string = cgroup_release_agent_write,
2642 .max_write_len = PATH_MAX, 2709 .max_write_len = PATH_MAX,
2643 .private = FILE_RELEASE_AGENT,
2644}; 2710};
2645 2711
2646static int cgroup_populate_dir(struct cgroup *cgrp) 2712static int cgroup_populate_dir(struct cgroup *cgrp)