aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMiao Xie <miaox@cn.fujitsu.com>2010-05-24 17:32:07 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-25 11:06:57 -0400
commit708c1bbc9d0c3e57f40501794d9b0eed29d10fce (patch)
tree4b19caf68c420b32abdf05af3d413b1320f04fbe
parent971ada0f6659488c3f36aed4c6f7670ff5ce4368 (diff)
mempolicy: restructure rebinding-mempolicy functions
Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/mempolicy.h15
-rw-r--r--kernel/cpuset.c4
-rw-r--r--mm/mempolicy.c124
3 files changed, 119 insertions, 24 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1cc966cd3e5f..7b9ef6bf45aa 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -23,6 +23,13 @@ enum {
23 MPOL_MAX, /* always last member of enum */ 23 MPOL_MAX, /* always last member of enum */
24}; 24};
25 25
26enum mpol_rebind_step {
27 MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
28 MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
29 MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
30 MPOL_REBIND_NSTEP,
31};
32
26/* Flags for set_mempolicy */ 33/* Flags for set_mempolicy */
27#define MPOL_F_STATIC_NODES (1 << 15) 34#define MPOL_F_STATIC_NODES (1 << 15)
28#define MPOL_F_RELATIVE_NODES (1 << 14) 35#define MPOL_F_RELATIVE_NODES (1 << 14)
@@ -51,6 +58,7 @@ enum {
51 */ 58 */
52#define MPOL_F_SHARED (1 << 0) /* identify shared policies */ 59#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
53#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ 60#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
61#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
54 62
55#ifdef __KERNEL__ 63#ifdef __KERNEL__
56 64
@@ -193,8 +201,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
193 201
194extern void numa_default_policy(void); 202extern void numa_default_policy(void);
195extern void numa_policy_init(void); 203extern void numa_policy_init(void);
196extern void mpol_rebind_task(struct task_struct *tsk, 204extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
197 const nodemask_t *new); 205 enum mpol_rebind_step step);
198extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); 206extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
199extern void mpol_fix_fork_child_flag(struct task_struct *p); 207extern void mpol_fix_fork_child_flag(struct task_struct *p);
200 208
@@ -308,7 +316,8 @@ static inline void numa_default_policy(void)
308} 316}
309 317
310static inline void mpol_rebind_task(struct task_struct *tsk, 318static inline void mpol_rebind_task(struct task_struct *tsk,
311 const nodemask_t *new) 319 const nodemask_t *new,
320 enum mpol_rebind_step step)
312{ 321{
313} 322}
314 323
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a50c5f6e727..db0990ac3fac 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -953,8 +953,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
953 nodemask_t *newmems) 953 nodemask_t *newmems)
954{ 954{
955 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 955 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
956 mpol_rebind_task(tsk, &tsk->mems_allowed); 956 mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE);
957 mpol_rebind_task(tsk, newmems); 957 mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE);
958 tsk->mems_allowed = *newmems; 958 tsk->mems_allowed = *newmems;
959} 959}
960 960
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0c73c8b814cd..8a993db88029 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -119,7 +119,22 @@ struct mempolicy default_policy = {
119 119
120static const struct mempolicy_operations { 120static const struct mempolicy_operations {
121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 121 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); 122 /*
123 * If read-side task has no lock to protect task->mempolicy, write-side
124 * task will rebind the task->mempolicy by two step. The first step is
125 * setting all the newly nodes, and the second step is cleaning all the
126 * disallowed nodes. In this way, we can avoid finding no node to alloc
127 * page.
128 * If we have a lock to protect task->mempolicy in read-side, we do
129 * rebind directly.
130 *
131 * step:
132 * MPOL_REBIND_ONCE - do rebind work at once
133 * MPOL_REBIND_STEP1 - set all the newly nodes
134 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
135 */
136 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137 enum mpol_rebind_step step);
123} mpol_ops[MPOL_MAX]; 138} mpol_ops[MPOL_MAX];
124 139
125/* Check that the nodemask contains at least one populated zone */ 140/* Check that the nodemask contains at least one populated zone */
@@ -274,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
274 kmem_cache_free(policy_cache, p); 289 kmem_cache_free(policy_cache, p);
275} 290}
276 291
277static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) 292static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293 enum mpol_rebind_step step)
278{ 294{
279} 295}
280 296
281static void mpol_rebind_nodemask(struct mempolicy *pol, 297/*
282 const nodemask_t *nodes) 298 * step:
299 * MPOL_REBIND_ONCE - do rebind work at once
300 * MPOL_REBIND_STEP1 - set all the newly nodes
301 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
302 */
303static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304 enum mpol_rebind_step step)
283{ 305{
284 nodemask_t tmp; 306 nodemask_t tmp;
285 307
@@ -288,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
288 else if (pol->flags & MPOL_F_RELATIVE_NODES) 310 else if (pol->flags & MPOL_F_RELATIVE_NODES)
289 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 311 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
290 else { 312 else {
291 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, 313 /*
292 *nodes); 314 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
293 pol->w.cpuset_mems_allowed = *nodes; 315 * result
316 */
317 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318 nodes_remap(tmp, pol->v.nodes,
319 pol->w.cpuset_mems_allowed, *nodes);
320 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321 } else if (step == MPOL_REBIND_STEP2) {
322 tmp = pol->w.cpuset_mems_allowed;
323 pol->w.cpuset_mems_allowed = *nodes;
324 } else
325 BUG();
294 } 326 }
295 327
296 pol->v.nodes = tmp; 328 if (nodes_empty(tmp))
329 tmp = *nodes;
330
331 if (step == MPOL_REBIND_STEP1)
332 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334 pol->v.nodes = tmp;
335 else
336 BUG();
337
297 if (!node_isset(current->il_next, tmp)) { 338 if (!node_isset(current->il_next, tmp)) {
298 current->il_next = next_node(current->il_next, tmp); 339 current->il_next = next_node(current->il_next, tmp);
299 if (current->il_next >= MAX_NUMNODES) 340 if (current->il_next >= MAX_NUMNODES)
@@ -304,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
304} 345}
305 346
306static void mpol_rebind_preferred(struct mempolicy *pol, 347static void mpol_rebind_preferred(struct mempolicy *pol,
307 const nodemask_t *nodes) 348 const nodemask_t *nodes,
349 enum mpol_rebind_step step)
308{ 350{
309 nodemask_t tmp; 351 nodemask_t tmp;
310 352
@@ -327,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
327 } 369 }
328} 370}
329 371
330/* Migrate a policy to a different set of nodes */ 372/*
331static void mpol_rebind_policy(struct mempolicy *pol, 373 * mpol_rebind_policy - Migrate a policy to a different set of nodes
332 const nodemask_t *newmask) 374 *
375 * If read-side task has no lock to protect task->mempolicy, write-side
376 * task will rebind the task->mempolicy by two step. The first step is
377 * setting all the newly nodes, and the second step is cleaning all the
378 * disallowed nodes. In this way, we can avoid finding no node to alloc
379 * page.
380 * If we have a lock to protect task->mempolicy in read-side, we do
381 * rebind directly.
382 *
383 * step:
384 * MPOL_REBIND_ONCE - do rebind work at once
385 * MPOL_REBIND_STEP1 - set all the newly nodes
386 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
387 */
388static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389 enum mpol_rebind_step step)
333{ 390{
334 if (!pol) 391 if (!pol)
335 return; 392 return;
336 if (!mpol_store_user_nodemask(pol) && 393 if (!mpol_store_user_nodemask(pol) && step == 0 &&
337 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
338 return; 395 return;
339 mpol_ops[pol->mode].rebind(pol, newmask); 396
397 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398 return;
399
400 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401 BUG();
402
403 if (step == MPOL_REBIND_STEP1)
404 pol->flags |= MPOL_F_REBINDING;
405 else if (step == MPOL_REBIND_STEP2)
406 pol->flags &= ~MPOL_F_REBINDING;
407 else if (step >= MPOL_REBIND_NSTEP)
408 BUG();
409
410 mpol_ops[pol->mode].rebind(pol, newmask, step);
340} 411}
341 412
342/* 413/*
@@ -346,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
346 * Called with task's alloc_lock held. 417 * Called with task's alloc_lock held.
347 */ 418 */
348 419
349void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 420void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421 enum mpol_rebind_step step)
350{ 422{
351 mpol_rebind_policy(tsk->mempolicy, new); 423 mpol_rebind_policy(tsk->mempolicy, new, step);
352} 424}
353 425
354/* 426/*
@@ -363,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
363 435
364 down_write(&mm->mmap_sem); 436 down_write(&mm->mmap_sem);
365 for (vma = mm->mmap; vma; vma = vma->vm_next) 437 for (vma = mm->mmap; vma; vma = vma->vm_next)
366 mpol_rebind_policy(vma->vm_policy, new); 438 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
367 up_write(&mm->mmap_sem); 439 up_write(&mm->mmap_sem);
368} 440}
369 441
@@ -1745,6 +1817,9 @@ EXPORT_SYMBOL(alloc_pages_current);
1745 * with the mems_allowed returned by cpuset_mems_allowed(). This 1817 * with the mems_allowed returned by cpuset_mems_allowed(). This
1746 * keeps mempolicies cpuset relative after its cpuset moves. See 1818 * keeps mempolicies cpuset relative after its cpuset moves. See
1747 * further kernel/cpuset.c update_nodemask(). 1819 * further kernel/cpuset.c update_nodemask().
1820 *
1821 * current's mempolicy may be rebinded by the other task(the task that changes
1822 * cpuset's mems), so we needn't do rebind work for current task.
1748 */ 1823 */
1749 1824
1750/* Slow path of a mempolicy duplicate */ 1825/* Slow path of a mempolicy duplicate */
@@ -1754,13 +1829,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1754 1829
1755 if (!new) 1830 if (!new)
1756 return ERR_PTR(-ENOMEM); 1831 return ERR_PTR(-ENOMEM);
1832
1833 /* task's mempolicy is protected by alloc_lock */
1834 if (old == current->mempolicy) {
1835 task_lock(current);
1836 *new = *old;
1837 task_unlock(current);
1838 } else
1839 *new = *old;
1840
1757 rcu_read_lock(); 1841 rcu_read_lock();
1758 if (current_cpuset_is_being_rebound()) { 1842 if (current_cpuset_is_being_rebound()) {
1759 nodemask_t mems = cpuset_mems_allowed(current); 1843 nodemask_t mems = cpuset_mems_allowed(current);
1760 mpol_rebind_policy(old, &mems); 1844 if (new->flags & MPOL_F_REBINDING)
1845 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1846 else
1847 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1761 } 1848 }
1762 rcu_read_unlock(); 1849 rcu_read_unlock();
1763 *new = *old;
1764 atomic_set(&new->refcnt, 1); 1850 atomic_set(&new->refcnt, 1);
1765 return new; 1851 return new;
1766} 1852}