diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 278 |
1 files changed, 172 insertions, 106 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 22db0a7cf1fa..a9433f50e53d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -1121,7 +1121,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1121 | INIT_LIST_HEAD(&cgrp->children); | 1121 | INIT_LIST_HEAD(&cgrp->children); |
1122 | INIT_LIST_HEAD(&cgrp->css_sets); | 1122 | INIT_LIST_HEAD(&cgrp->css_sets); |
1123 | INIT_LIST_HEAD(&cgrp->release_list); | 1123 | INIT_LIST_HEAD(&cgrp->release_list); |
1124 | init_rwsem(&cgrp->pids_mutex); | 1124 | init_rwsem(&(cgrp->tasks.mutex)); |
1125 | init_rwsem(&(cgrp->procs.mutex)); | ||
1125 | } | 1126 | } |
1126 | 1127 | ||
1127 | static void init_cgroup_root(struct cgroupfs_root *root) | 1128 | static void init_cgroup_root(struct cgroupfs_root *root) |
@@ -1637,15 +1638,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | |||
1637 | return ret; | 1638 | return ret; |
1638 | } | 1639 | } |
1639 | 1640 | ||
1640 | /* The various types of files and directories in a cgroup file system */ | ||
1641 | enum cgroup_filetype { | ||
1642 | FILE_ROOT, | ||
1643 | FILE_DIR, | ||
1644 | FILE_TASKLIST, | ||
1645 | FILE_NOTIFY_ON_RELEASE, | ||
1646 | FILE_RELEASE_AGENT, | ||
1647 | }; | ||
1648 | |||
1649 | /** | 1641 | /** |
1650 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. | 1642 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. |
1651 | * @cgrp: the cgroup to be checked for liveness | 1643 | * @cgrp: the cgroup to be checked for liveness |
@@ -2343,7 +2335,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
2343 | } | 2335 | } |
2344 | 2336 | ||
2345 | /* | 2337 | /* |
2346 | * Stuff for reading the 'tasks' file. | 2338 | * Stuff for reading the 'tasks'/'procs' files. |
2347 | * | 2339 | * |
2348 | * Reading this file can return large amounts of data if a cgroup has | 2340 | * Reading this file can return large amounts of data if a cgroup has |
2349 | * *lots* of attached tasks. So it may need several calls to read(), | 2341 | * *lots* of attached tasks. So it may need several calls to read(), |
@@ -2353,27 +2345,106 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
2353 | */ | 2345 | */ |
2354 | 2346 | ||
2355 | /* | 2347 | /* |
2356 | * Load into 'pidarray' up to 'npids' of the tasks using cgroup | 2348 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries |
2357 | * 'cgrp'. Return actual number of pids loaded. No need to | 2349 | * If the new stripped list is sufficiently smaller and there's enough memory |
2358 | * task_lock(p) when reading out p->cgroup, since we're in an RCU | 2350 | * to allocate a new buffer, will let go of the unneeded memory. Returns the |
2359 | * read section, so the css_set can't go away, and is | 2351 | * number of unique elements. |
2360 | * immutable after creation. | ||
2361 | */ | 2352 | */ |
2362 | static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) | 2353 | /* is the size difference enough that we should re-allocate the array? */ |
2354 | #define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) | ||
2355 | static int pidlist_uniq(pid_t **p, int length) | ||
2363 | { | 2356 | { |
2364 | int n = 0, pid; | 2357 | int src, dest = 1; |
2358 | pid_t *list = *p; | ||
2359 | pid_t *newlist; | ||
2360 | |||
2361 | /* | ||
2362 | * we presume the 0th element is unique, so i starts at 1. trivial | ||
2363 | * edge cases first; no work needs to be done for either | ||
2364 | */ | ||
2365 | if (length == 0 || length == 1) | ||
2366 | return length; | ||
2367 | /* src and dest walk down the list; dest counts unique elements */ | ||
2368 | for (src = 1; src < length; src++) { | ||
2369 | /* find next unique element */ | ||
2370 | while (list[src] == list[src-1]) { | ||
2371 | src++; | ||
2372 | if (src == length) | ||
2373 | goto after; | ||
2374 | } | ||
2375 | /* dest always points to where the next unique element goes */ | ||
2376 | list[dest] = list[src]; | ||
2377 | dest++; | ||
2378 | } | ||
2379 | after: | ||
2380 | /* | ||
2381 | * if the length difference is large enough, we want to allocate a | ||
2382 | * smaller buffer to save memory. if this fails due to out of memory, | ||
2383 | * we'll just stay with what we've got. | ||
2384 | */ | ||
2385 | if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { | ||
2386 | newlist = krealloc(list, dest * sizeof(pid_t), GFP_KERNEL); | ||
2387 | if (newlist) | ||
2388 | *p = newlist; | ||
2389 | } | ||
2390 | return dest; | ||
2391 | } | ||
2392 | |||
/*
 * cmppid - comparison callback that orders two pid_t values ascending
 * @a: pointer to the first pid_t
 * @b: pointer to the second pid_t
 *
 * Returns a negative value, zero or a positive value when *a is less than,
 * equal to or greater than *b.  The values are compared instead of being
 * subtracted so the result can never overflow, and the pointers keep their
 * const qualification.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t *lhs = a;
	const pid_t *rhs = b;

	return (*lhs > *rhs) - (*lhs < *rhs);
}
2397 | |||
2398 | /* | ||
2399 | * Load a cgroup's pidarray with either procs' tgids or tasks' pids | ||
2400 | */ | ||
2401 | static int pidlist_array_load(struct cgroup *cgrp, bool procs) | ||
2402 | { | ||
2403 | pid_t *array; | ||
2404 | int length; | ||
2405 | int pid, n = 0; /* used for populating the array */ | ||
2365 | struct cgroup_iter it; | 2406 | struct cgroup_iter it; |
2366 | struct task_struct *tsk; | 2407 | struct task_struct *tsk; |
2408 | struct cgroup_pidlist *l; | ||
2409 | |||
2410 | /* | ||
2411 | * If cgroup gets more users after we read count, we won't have | ||
2412 | * enough space - tough. This race is indistinguishable to the | ||
2413 | * caller from the case that the additional cgroup users didn't | ||
2414 | * show up until sometime later on. | ||
2415 | */ | ||
2416 | length = cgroup_task_count(cgrp); | ||
2417 | array = kmalloc(length * sizeof(pid_t), GFP_KERNEL); | ||
2418 | if (!array) | ||
2419 | return -ENOMEM; | ||
2420 | /* now, populate the array */ | ||
2367 | cgroup_iter_start(cgrp, &it); | 2421 | cgroup_iter_start(cgrp, &it); |
2368 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 2422 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
2369 | if (unlikely(n == npids)) | 2423 | if (unlikely(n == length)) |
2370 | break; | 2424 | break; |
2371 | pid = task_pid_vnr(tsk); | 2425 | /* get tgid or pid for procs or tasks file respectively */ |
2372 | if (pid > 0) | 2426 | pid = (procs ? task_tgid_vnr(tsk) : task_pid_vnr(tsk)); |
2373 | pidarray[n++] = pid; | 2427 | if (pid > 0) /* make sure to only use valid results */ |
2428 | array[n++] = pid; | ||
2374 | } | 2429 | } |
2375 | cgroup_iter_end(cgrp, &it); | 2430 | cgroup_iter_end(cgrp, &it); |
2376 | return n; | 2431 | length = n; |
2432 | /* now sort & (if procs) strip out duplicates */ | ||
2433 | sort(array, length, sizeof(pid_t), cmppid, NULL); | ||
2434 | if (procs) { | ||
2435 | length = pidlist_uniq(&array, length); | ||
2436 | l = &(cgrp->procs); | ||
2437 | } else { | ||
2438 | l = &(cgrp->tasks); | ||
2439 | } | ||
2440 | /* store array in cgroup, freeing old if necessary */ | ||
2441 | down_write(&l->mutex); | ||
2442 | kfree(l->list); | ||
2443 | l->list = array; | ||
2444 | l->length = length; | ||
2445 | l->use_count++; | ||
2446 | up_write(&l->mutex); | ||
2447 | return 0; | ||
2377 | } | 2448 | } |
2378 | 2449 | ||
2379 | /** | 2450 | /** |
@@ -2430,19 +2501,14 @@ err: | |||
2430 | return ret; | 2501 | return ret; |
2431 | } | 2502 | } |
2432 | 2503 | ||
2433 | static int cmppid(const void *a, const void *b) | ||
2434 | { | ||
2435 | return *(pid_t *)a - *(pid_t *)b; | ||
2436 | } | ||
2437 | |||
2438 | 2504 | ||
2439 | /* | 2505 | /* |
2440 | * seq_file methods for the "tasks" file. The seq_file position is the | 2506 | * seq_file methods for the tasks/procs files. The seq_file position is the |
2441 | * next pid to display; the seq_file iterator is a pointer to the pid | 2507 | * next pid to display; the seq_file iterator is a pointer to the pid |
2442 | * in the cgroup->tasks_pids array. | 2508 | * in the pidlist's ->list array (cgroup->tasks.list or cgroup->procs.list). |
2443 | */ | 2509 | */ |
2444 | 2510 | ||
2445 | static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) | 2511 | static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) |
2446 | { | 2512 | { |
2447 | /* | 2513 | /* |
2448 | * Initially we receive a position value that corresponds to | 2514 | * Initially we receive a position value that corresponds to |
@@ -2450,46 +2516,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) | |||
2450 | * after a seek to the start). Use a binary-search to find the | 2516 | * after a seek to the start). Use a binary-search to find the |
2451 | * next pid to display, if any | 2517 | * next pid to display, if any |
2452 | */ | 2518 | */ |
2453 | struct cgroup *cgrp = s->private; | 2519 | struct cgroup_pidlist *l = s->private; |
2454 | int index = 0, pid = *pos; | 2520 | int index = 0, pid = *pos; |
2455 | int *iter; | 2521 | int *iter; |
2456 | 2522 | ||
2457 | down_read(&cgrp->pids_mutex); | 2523 | down_read(&l->mutex); |
2458 | if (pid) { | 2524 | if (pid) { |
2459 | int end = cgrp->pids_length; | 2525 | int end = l->length; |
2460 | 2526 | ||
2461 | while (index < end) { | 2527 | while (index < end) { |
2462 | int mid = (index + end) / 2; | 2528 | int mid = (index + end) / 2; |
2463 | if (cgrp->tasks_pids[mid] == pid) { | 2529 | if (l->list[mid] == pid) { |
2464 | index = mid; | 2530 | index = mid; |
2465 | break; | 2531 | break; |
2466 | } else if (cgrp->tasks_pids[mid] <= pid) | 2532 | } else if (l->list[mid] <= pid) |
2467 | index = mid + 1; | 2533 | index = mid + 1; |
2468 | else | 2534 | else |
2469 | end = mid; | 2535 | end = mid; |
2470 | } | 2536 | } |
2471 | } | 2537 | } |
2472 | /* If we're off the end of the array, we're done */ | 2538 | /* If we're off the end of the array, we're done */ |
2473 | if (index >= cgrp->pids_length) | 2539 | if (index >= l->length) |
2474 | return NULL; | 2540 | return NULL; |
2475 | /* Update the abstract position to be the actual pid that we found */ | 2541 | /* Update the abstract position to be the actual pid that we found */ |
2476 | iter = cgrp->tasks_pids + index; | 2542 | iter = l->list + index; |
2477 | *pos = *iter; | 2543 | *pos = *iter; |
2478 | return iter; | 2544 | return iter; |
2479 | } | 2545 | } |
2480 | 2546 | ||
2481 | static void cgroup_tasks_stop(struct seq_file *s, void *v) | 2547 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
2482 | { | 2548 | { |
2483 | struct cgroup *cgrp = s->private; | 2549 | struct cgroup_pidlist *l = s->private; |
2484 | up_read(&cgrp->pids_mutex); | 2550 | up_read(&l->mutex); |
2485 | } | 2551 | } |
2486 | 2552 | ||
2487 | static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) | 2553 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
2488 | { | 2554 | { |
2489 | struct cgroup *cgrp = s->private; | 2555 | struct cgroup_pidlist *l = s->private; |
2490 | int *p = v; | 2556 | pid_t *p = v; |
2491 | int *end = cgrp->tasks_pids + cgrp->pids_length; | 2557 | pid_t *end = l->list + l->length; |
2492 | |||
2493 | /* | 2558 | /* |
2494 | * Advance to the next pid in the array. If this goes off the | 2559 | * Advance to the next pid in the array. If this goes off the |
2495 | * end, we're done | 2560 | * end, we're done |
@@ -2503,98 +2568,94 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) | |||
2503 | } | 2568 | } |
2504 | } | 2569 | } |
2505 | 2570 | ||
/*
 * seq_file ->show callback for the pidlist files: @v points at the current
 * array entry, which is printed in decimal followed by a newline.
 */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	int *entry = v;

	return seq_printf(s, "%d\n", *entry);
}
2510 | 2575 | ||
2511 | static const struct seq_operations cgroup_tasks_seq_operations = { | 2576 | /* |
2512 | .start = cgroup_tasks_start, | 2577 | * seq_operations functions for iterating on pidlists through seq_file - |
2513 | .stop = cgroup_tasks_stop, | 2578 | * independent of whether it's tasks or procs |
2514 | .next = cgroup_tasks_next, | 2579 | */ |
2515 | .show = cgroup_tasks_show, | 2580 | static const struct seq_operations cgroup_pidlist_seq_operations = { |
2581 | .start = cgroup_pidlist_start, | ||
2582 | .stop = cgroup_pidlist_stop, | ||
2583 | .next = cgroup_pidlist_next, | ||
2584 | .show = cgroup_pidlist_show, | ||
2516 | }; | 2585 | }; |
2517 | 2586 | ||
2518 | static void release_cgroup_pid_array(struct cgroup *cgrp) | 2587 | static void cgroup_release_pid_array(struct cgroup_pidlist *l) |
2519 | { | 2588 | { |
2520 | down_write(&cgrp->pids_mutex); | 2589 | down_write(&l->mutex); |
2521 | BUG_ON(!cgrp->pids_use_count); | 2590 | BUG_ON(!l->use_count); |
2522 | if (!--cgrp->pids_use_count) { | 2591 | if (!--l->use_count) { |
2523 | kfree(cgrp->tasks_pids); | 2592 | kfree(l->list); |
2524 | cgrp->tasks_pids = NULL; | 2593 | l->list = NULL; |
2525 | cgrp->pids_length = 0; | 2594 | l->length = 0; |
2526 | } | 2595 | } |
2527 | up_write(&cgrp->pids_mutex); | 2596 | up_write(&l->mutex); |
2528 | } | 2597 | } |
2529 | 2598 | ||
2530 | static int cgroup_tasks_release(struct inode *inode, struct file *file) | 2599 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) |
2531 | { | 2600 | { |
2532 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2601 | struct cgroup_pidlist *l; |
2533 | |||
2534 | if (!(file->f_mode & FMODE_READ)) | 2602 | if (!(file->f_mode & FMODE_READ)) |
2535 | return 0; | 2603 | return 0; |
2536 | 2604 | /* | |
2537 | release_cgroup_pid_array(cgrp); | 2605 | * the seq_file will only be initialized if the file was opened for |
2606 | * reading; hence we check if it's not null only in that case. | ||
2607 | */ | ||
2608 | l = ((struct seq_file *)file->private_data)->private; | ||
2609 | cgroup_release_pid_array(l); | ||
2538 | return seq_release(inode, file); | 2610 | return seq_release(inode, file); |
2539 | } | 2611 | } |
2540 | 2612 | ||
2541 | static struct file_operations cgroup_tasks_operations = { | 2613 | static const struct file_operations cgroup_pidlist_operations = { |
2542 | .read = seq_read, | 2614 | .read = seq_read, |
2543 | .llseek = seq_lseek, | 2615 | .llseek = seq_lseek, |
2544 | .write = cgroup_file_write, | 2616 | .write = cgroup_file_write, |
2545 | .release = cgroup_tasks_release, | 2617 | .release = cgroup_pidlist_release, |
2546 | }; | 2618 | }; |
2547 | 2619 | ||
2548 | /* | 2620 | /* |
2549 | * Handle an open on 'tasks' file. Prepare an array containing the | 2621 | * The following functions handle opens on a file that displays a pidlist |
2550 | * process id's of tasks currently attached to the cgroup being opened. | 2622 | * (tasks or procs). Prepare an array of the process/thread IDs of whoever's |
2623 | * in the cgroup. | ||
2551 | */ | 2624 | */ |
2552 | 2625 | /* helper function for the two below it */ | |
2553 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | 2626 | static int cgroup_pidlist_open(struct file *file, bool procs) |
2554 | { | 2627 | { |
2555 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2628 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2556 | pid_t *pidarray; | 2629 | struct cgroup_pidlist *l = (procs ? &cgrp->procs : &cgrp->tasks); |
2557 | int npids; | ||
2558 | int retval; | 2630 | int retval; |
2559 | 2631 | ||
2560 | /* Nothing to do for write-only files */ | 2632 | /* Nothing to do for write-only files */ |
2561 | if (!(file->f_mode & FMODE_READ)) | 2633 | if (!(file->f_mode & FMODE_READ)) |
2562 | return 0; | 2634 | return 0; |
2563 | 2635 | ||
2564 | /* | 2636 | /* have the array populated */ |
2565 | * If cgroup gets more users after we read count, we won't have | 2637 | retval = pidlist_array_load(cgrp, procs); |
2566 | * enough space - tough. This race is indistinguishable to the | 2638 | if (retval) |
2567 | * caller from the case that the additional cgroup users didn't | 2639 | return retval; |
2568 | * show up until sometime later on. | 2640 | /* configure file information */ |
2569 | */ | 2641 | file->f_op = &cgroup_pidlist_operations; |
2570 | npids = cgroup_task_count(cgrp); | ||
2571 | pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); | ||
2572 | if (!pidarray) | ||
2573 | return -ENOMEM; | ||
2574 | npids = pid_array_load(pidarray, npids, cgrp); | ||
2575 | sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); | ||
2576 | |||
2577 | /* | ||
2578 | * Store the array in the cgroup, freeing the old | ||
2579 | * array if necessary | ||
2580 | */ | ||
2581 | down_write(&cgrp->pids_mutex); | ||
2582 | kfree(cgrp->tasks_pids); | ||
2583 | cgrp->tasks_pids = pidarray; | ||
2584 | cgrp->pids_length = npids; | ||
2585 | cgrp->pids_use_count++; | ||
2586 | up_write(&cgrp->pids_mutex); | ||
2587 | |||
2588 | file->f_op = &cgroup_tasks_operations; | ||
2589 | 2642 | ||
2590 | retval = seq_open(file, &cgroup_tasks_seq_operations); | 2643 | retval = seq_open(file, &cgroup_pidlist_seq_operations); |
2591 | if (retval) { | 2644 | if (retval) { |
2592 | release_cgroup_pid_array(cgrp); | 2645 | cgroup_release_pid_array(l); |
2593 | return retval; | 2646 | return retval; |
2594 | } | 2647 | } |
2595 | ((struct seq_file *)file->private_data)->private = cgrp; | 2648 | ((struct seq_file *)file->private_data)->private = l; |
2596 | return 0; | 2649 | return 0; |
2597 | } | 2650 | } |
2651 | static int cgroup_tasks_open(struct inode *unused, struct file *file) | ||
2652 | { | ||
2653 | return cgroup_pidlist_open(file, false); | ||
2654 | } | ||
2655 | static int cgroup_procs_open(struct inode *unused, struct file *file) | ||
2656 | { | ||
2657 | return cgroup_pidlist_open(file, true); | ||
2658 | } | ||
2598 | 2659 | ||
2599 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | 2660 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, |
2600 | struct cftype *cft) | 2661 | struct cftype *cft) |
@@ -2617,21 +2678,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, | |||
2617 | /* | 2678 | /* |
2618 | * for the common functions, 'private' gives the type of file | 2679 | * for the common functions, 'private' gives the type of file |
2619 | */ | 2680 | */ |
2681 | /* for hysterical raisins, we can't put this on the older files */ | ||
2682 | #define CGROUP_FILE_GENERIC_PREFIX "cgroup." | ||
2620 | static struct cftype files[] = { | 2683 | static struct cftype files[] = { |
2621 | { | 2684 | { |
2622 | .name = "tasks", | 2685 | .name = "tasks", |
2623 | .open = cgroup_tasks_open, | 2686 | .open = cgroup_tasks_open, |
2624 | .write_u64 = cgroup_tasks_write, | 2687 | .write_u64 = cgroup_tasks_write, |
2625 | .release = cgroup_tasks_release, | 2688 | .release = cgroup_pidlist_release, |
2626 | .private = FILE_TASKLIST, | ||
2627 | .mode = S_IRUGO | S_IWUSR, | 2689 | .mode = S_IRUGO | S_IWUSR, |
2628 | }, | 2690 | }, |
2629 | 2691 | { | |
2692 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", | ||
2693 | .open = cgroup_procs_open, | ||
2694 | /* .write_u64 = cgroup_procs_write, TODO */ | ||
2695 | .release = cgroup_pidlist_release, | ||
2696 | .mode = S_IRUGO, | ||
2697 | }, | ||
2630 | { | 2698 | { |
2631 | .name = "notify_on_release", | 2699 | .name = "notify_on_release", |
2632 | .read_u64 = cgroup_read_notify_on_release, | 2700 | .read_u64 = cgroup_read_notify_on_release, |
2633 | .write_u64 = cgroup_write_notify_on_release, | 2701 | .write_u64 = cgroup_write_notify_on_release, |
2634 | .private = FILE_NOTIFY_ON_RELEASE, | ||
2635 | }, | 2702 | }, |
2636 | }; | 2703 | }; |
2637 | 2704 | ||
@@ -2640,7 +2707,6 @@ static struct cftype cft_release_agent = { | |||
2640 | .read_seq_string = cgroup_release_agent_show, | 2707 | .read_seq_string = cgroup_release_agent_show, |
2641 | .write_string = cgroup_release_agent_write, | 2708 | .write_string = cgroup_release_agent_write, |
2642 | .max_write_len = PATH_MAX, | 2709 | .max_write_len = PATH_MAX, |
2643 | .private = FILE_RELEASE_AGENT, | ||
2644 | }; | 2710 | }; |
2645 | 2711 | ||
2646 | static int cgroup_populate_dir(struct cgroup *cgrp) | 2712 | static int cgroup_populate_dir(struct cgroup *cgrp) |