author		Li Zefan <lizf@cn.fujitsu.com>				2009-04-02 19:57:51 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>		2009-04-02 22:04:57 -0400
commit		3b6766fe668b83c8a03c6ed01bcc2ac77cbae848 (patch)
tree		8b109576301d849406f080c61f4ce1809556ad0b /kernel/cpuset.c
parent		bd1a8ab73edd449fecda633449cc277b856ad4f5 (diff)
cpuset: rewrite update_tasks_nodemask()
This patch uses cgroup_scan_tasks() to rebind the tasks' vmas to the cpuset's
new mems_allowed.

Not only does this simplify the code considerably, it also avoids allocating
an array to hold the mm pointers of all the tasks in the cpuset. That array
can be large (size > PAGE_SIZE) when the cpuset contains many tasks, so the
allocation may fail under memory stress.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	109
1 file changed, 39 insertions(+), 70 deletions(-)
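The whole change boils down to the cgroup_scanner pattern sketched here. This
is a condensed sketch of what the diff below does (per-task callback plus
caller-side setup), not a drop-in replacement, and the comments on the NULL
fields reflect my reading of the cgroup_scan_tasks() API in this tree:

	/* Per-task callback, invoked by cgroup_scan_tasks() for every task in
	 * the cgroup: rebind that task's vma mempolicies right away instead of
	 * first collecting its mm pointer into an array. */
	static void cpuset_change_nodemask(struct task_struct *p,
					   struct cgroup_scanner *scan)
	{
		struct mm_struct *mm = get_task_mm(p);

		if (!mm)
			return;
		mpol_rebind_mm(mm, &cgroup_cs(scan->cg)->mems_allowed);
		mmput(mm);
	}

	/* Caller side: fill in a cgroup_scanner and let the cgroup core walk
	 * the tasks.  No mmarray[] is allocated, so there is nothing left to
	 * fail under memory stress. */
	struct cgroup_scanner scan = {
		.cg		= cs->css.cgroup,
		.test_task	= NULL,		/* no filter: visit every task */
		.process_task	= cpuset_change_nodemask,
		.heap		= NULL,		/* no pre-allocated heap supplied */
		.data		= (nodemask_t *)oldmem,	/* old mems, for migration */
	};

	retval = cgroup_scan_tasks(&scan);

The actual patch additionally checks is_memory_migrate(cs) in the callback and
calls cpuset_migrate_mm() when the cpuset has memory_migrate set; see the full
diff below.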
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 31737957cb62..dca455e0482e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 
+/*
+ * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
+ * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+				   struct cgroup_scanner *scan)
+{
+	struct mm_struct *mm;
+	struct cpuset *cs;
+	int migrate;
+	const nodemask_t *oldmem = scan->data;
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return;
+
+	cs = cgroup_cs(scan->cg);
+	migrate = is_memory_migrate(cs);
+
+	mpol_rebind_mm(mm, &cs->mems_allowed);
+	if (migrate)
+		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+	mmput(mm);
+}
+
 static void *cpuset_being_rebound;
 
 /**
@@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
  */
 static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-	struct task_struct *p;
-	struct mm_struct **mmarray;
-	int i, n, ntasks;
-	int migrate;
-	int fudge;
-	struct cgroup_iter it;
 	int retval;
+	struct cgroup_scanner scan;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
-	fudge = 10;				/* spare mmarray[] slots */
-	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
-	retval = -ENOMEM;
-
-	/*
-	 * Allocate mmarray[] to hold mm reference for each task
-	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
-	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
-	 * few more lines of code, we can retry until we get a big
-	 * enough mmarray[] w/o using GFP_ATOMIC.
-	 */
-	while (1) {
-		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
-		ntasks += fudge;
-		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
-		if (!mmarray)
-			goto done;
-		read_lock(&tasklist_lock);		/* block fork */
-		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
-			break;				/* got enough */
-		read_unlock(&tasklist_lock);		/* try again */
-		kfree(mmarray);
-	}
-
-	n = 0;
-
-	/* Load up mmarray[] with mm reference for each task in cpuset. */
-	cgroup_iter_start(cs->css.cgroup, &it);
-	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
-		struct mm_struct *mm;
-
-		if (n >= ntasks) {
-			printk(KERN_WARNING
-				"Cpuset mempolicy rebind incomplete.\n");
-			break;
-		}
-		mm = get_task_mm(p);
-		if (!mm)
-			continue;
-		mmarray[n++] = mm;
-	}
-	cgroup_iter_end(cs->css.cgroup, &it);
-	read_unlock(&tasklist_lock);
+	scan.cg = cs->css.cgroup;
+	scan.test_task = NULL;
+	scan.process_task = cpuset_change_nodemask;
+	scan.heap = NULL;
+	scan.data = (nodemask_t *)oldmem;
 
 	/*
-	 * Now that we've dropped the tasklist spinlock, we can
-	 * rebind the vma mempolicies of each mm in mmarray[] to their
-	 * new cpuset, and release that mm.  The mpol_rebind_mm()
-	 * call takes mmap_sem, which we couldn't take while holding
-	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
-	 * cpuset_being_rebound check will catch such forks, and rebind
-	 * their vma mempolicies too.  Because we still hold the global
-	 * cgroup_mutex, we know that no other rebind effort will
-	 * be contending for the global variable cpuset_being_rebound.
+	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+	 * take while holding tasklist_lock.  Forks can happen - the
+	 * mpol_dup() cpuset_being_rebound check will catch such forks,
+	 * and rebind their vma mempolicies too.  Because we still hold
+	 * the global cgroup_mutex, we know that no other rebind effort
+	 * will be contending for the global variable cpuset_being_rebound.
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	migrate = is_memory_migrate(cs);
-	for (i = 0; i < n; i++) {
-		struct mm_struct *mm = mmarray[i];
-
-		mpol_rebind_mm(mm, &cs->mems_allowed);
-		if (migrate)
-			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
-		mmput(mm);
-	}
+	retval = cgroup_scan_tasks(&scan);
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
-	kfree(mmarray);
 	cpuset_being_rebound = NULL;
-	retval = 0;
-done:
+
 	return retval;
 }
 