aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2014-02-25 10:04:01 -0500
committerTejun Heo <tj@kernel.org>2014-02-25 10:04:01 -0500
commitc75611282cf1bf717c1866e7a7eb4d0743815187 (patch)
tree4579606847463892cc65977a55bee386b2b5cf2a
parentf153ad11bca27996a5e8e1782557e36e80b03a8c (diff)
cgroup: add css_set->mg_tasks
Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple of issues. * Flex array has a size limit. On 64-bit, struct task_and_cgroup is 24 bytes, making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is impossible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, we're going to use task->cg_list during migration too. Instead of building a separate array, target tasks will be linked into a dedicated migration list_head on the owning css_set. Tasks on the migration list are treated the same as tasks on the usual tasks list; however, being on a separate list allows the cgroup migration code path to keep track of the target tasks by simply keeping the list of css_sets with tasks being migrated, making unpredictable dynamic allocation unnecessary. In preparation for such migration path update, this patch introduces css_set->mg_tasks list and updates css_set task iterations so that they walk both css_set->tasks and ->mg_tasks. Note that ->mg_tasks isn't used yet. 
Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com>
-rw-r--r--include/linux/cgroup.h8
-rw-r--r--kernel/cgroup.c56
2 files changed, 43 insertions, 21 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8c283a910b91..528e2aed36c3 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -324,10 +324,14 @@ struct css_set {
324 struct hlist_node hlist; 324 struct hlist_node hlist;
325 325
326 /* 326 /*
327 * List running through all tasks using this cgroup 327 * Lists running through all tasks using this cgroup group.
328 * group. Protected by css_set_lock 328 * mg_tasks lists tasks which belong to this cset but are in the
329 * process of being migrated out or in. Protected by
330 * css_set_rwsem, but, during migration, once tasks are moved to
331 * mg_tasks, it can be read safely while holding cgroup_mutex.
329 */ 332 */
330 struct list_head tasks; 333 struct list_head tasks;
334 struct list_head mg_tasks;
331 335
332 /* 336 /*
333 * List of cgrp_cset_links pointing at cgroups referenced from this 337 * List of cgrp_cset_links pointing at cgroups referenced from this
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8ab800c7bac0..b80c611ff836 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
644 atomic_set(&cset->refcount, 1); 644 atomic_set(&cset->refcount, 1);
645 INIT_LIST_HEAD(&cset->cgrp_links); 645 INIT_LIST_HEAD(&cset->cgrp_links);
646 INIT_LIST_HEAD(&cset->tasks); 646 INIT_LIST_HEAD(&cset->tasks);
647 INIT_LIST_HEAD(&cset->mg_tasks);
647 INIT_HLIST_NODE(&cset->hlist); 648 INIT_HLIST_NODE(&cset->hlist);
648 649
649 /* Copy the set of subsystem state objects generated in 650 /* Copy the set of subsystem state objects generated in
@@ -2590,9 +2591,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
2590 } 2591 }
2591 link = list_entry(l, struct cgrp_cset_link, cset_link); 2592 link = list_entry(l, struct cgrp_cset_link, cset_link);
2592 cset = link->cset; 2593 cset = link->cset;
2593 } while (list_empty(&cset->tasks)); 2594 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2595
2594 it->cset_link = l; 2596 it->cset_link = l;
2595 it->task = cset->tasks.next; 2597
2598 if (!list_empty(&cset->tasks))
2599 it->task = cset->tasks.next;
2600 else
2601 it->task = cset->mg_tasks.next;
2596} 2602}
2597 2603
2598/** 2604/**
@@ -2636,24 +2642,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2636{ 2642{
2637 struct task_struct *res; 2643 struct task_struct *res;
2638 struct list_head *l = it->task; 2644 struct list_head *l = it->task;
2639 struct cgrp_cset_link *link; 2645 struct cgrp_cset_link *link = list_entry(it->cset_link,
2646 struct cgrp_cset_link, cset_link);
2640 2647
2641 /* If the iterator cg is NULL, we have no tasks */ 2648 /* If the iterator cg is NULL, we have no tasks */
2642 if (!it->cset_link) 2649 if (!it->cset_link)
2643 return NULL; 2650 return NULL;
2644 res = list_entry(l, struct task_struct, cg_list); 2651 res = list_entry(l, struct task_struct, cg_list);
2645 /* Advance iterator to find next entry */ 2652
2653 /*
2654 * Advance iterator to find next entry. cset->tasks is consumed
2655 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2656 * next cset.
2657 */
2646 l = l->next; 2658 l = l->next;
2647 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2659
2648 if (l == &link->cset->tasks) { 2660 if (l == &link->cset->tasks)
2649 /* 2661 l = link->cset->mg_tasks.next;
2650 * We reached the end of this task list - move on to the 2662
2651 * next cgrp_cset_link. 2663 if (l == &link->cset->mg_tasks)
2652 */
2653 css_advance_task_iter(it); 2664 css_advance_task_iter(it);
2654 } else { 2665 else
2655 it->task = l; 2666 it->task = l;
2656 } 2667
2657 return res; 2668 return res;
2658} 2669}
2659 2670
@@ -4502,16 +4513,23 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
4502 struct css_set *cset = link->cset; 4513 struct css_set *cset = link->cset;
4503 struct task_struct *task; 4514 struct task_struct *task;
4504 int count = 0; 4515 int count = 0;
4516
4505 seq_printf(seq, "css_set %p\n", cset); 4517 seq_printf(seq, "css_set %p\n", cset);
4518
4506 list_for_each_entry(task, &cset->tasks, cg_list) { 4519 list_for_each_entry(task, &cset->tasks, cg_list) {
4507 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 4520 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4508 seq_puts(seq, " ...\n"); 4521 goto overflow;
4509 break; 4522 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4510 } else { 4523 }
4511 seq_printf(seq, " task %d\n", 4524
4512 task_pid_vnr(task)); 4525 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
4513 } 4526 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4527 goto overflow;
4528 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4514 } 4529 }
4530 continue;
4531 overflow:
4532 seq_puts(seq, " ...\n");
4515 } 4533 }
4516 up_read(&css_set_rwsem); 4534 up_read(&css_set_rwsem);
4517 return 0; 4535 return 0;