 include/linux/mempolicy.h |  15
 kernel/cpuset.c           |   4
 mm/mempolicy.c            | 124
 3 files changed, 119 insertions(+), 24 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1cc966cd3e5f..7b9ef6bf45aa 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -23,6 +23,13 @@ enum {
 	MPOL_MAX,	/* always last member of enum */
 };
 
+enum mpol_rebind_step {
+	MPOL_REBIND_ONCE,	/* do the rebind work at once (not in two steps) */
+	MPOL_REBIND_STEP1,	/* first step: set all the newly allowed nodes */
+	MPOL_REBIND_STEP2,	/* second step: clean all the disallowed nodes */
+	MPOL_REBIND_NSTEP,
+};
+
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
 #define MPOL_F_RELATIVE_NODES	(1 << 14)
@@ -51,6 +58,7 @@ enum {
  */
 #define MPOL_F_SHARED	(1 << 0)	/* identify shared policies */
 #define MPOL_F_LOCAL	(1 << 1)	/* preferred local allocation */
+#define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
 
 #ifdef __KERNEL__
 
@@ -193,8 +201,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
-extern void mpol_rebind_task(struct task_struct *tsk,
-					const nodemask_t *new);
+extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+				enum mpol_rebind_step step);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
@@ -308,7 +316,8 @@ static inline void numa_default_policy(void)
 }
 
 static inline void mpol_rebind_task(struct task_struct *tsk,
-				const nodemask_t *new)
+				const nodemask_t *new,
+				enum mpol_rebind_step step)
 {
 }
 
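The new enum is the heart of the patch: when a reader can dereference task->mempolicy without any lock, the writer splits the rebind into a widening step (STEP1) and a narrowing step (STEP2), so a racing allocation never observes an empty nodemask. Below is a minimal userspace sketch of that invariant; the nodemask typedef and the rebind_step1()/rebind_step2() helpers are illustrative, not kernel API.

    /*
     * Minimal userspace model of the two-step rebind (illustrative, not
     * kernel code): step 1 may only add nodes and step 2 may only remove
     * them, so the mask is never empty in between.
     */
    #include <assert.h>
    #include <stdio.h>

    typedef unsigned long nodemask;	/* stand-in for the kernel's nodemask_t */

    static nodemask rebind_step1(nodemask cur, nodemask new)
    {
            return cur | new;	/* STEP1: set the newly allowed nodes */
    }

    static nodemask rebind_step2(nodemask cur, nodemask new)
    {
            return cur & new;	/* STEP2: clean the disallowed nodes */
    }

    int main(void)
    {
            nodemask pol = 0x3;	/* policy bound to nodes {0,1} */
            nodemask newmems = 0xc;	/* cpuset moves to nodes {2,3} */

            pol = rebind_step1(pol, newmems);	/* {0,1,2,3}: never empty */
            assert((pol & newmems) != 0);
            pol = rebind_step2(pol, newmems);	/* {2,3}: final mask */
            printf("final mask: %#lx\n", pol);
            return 0;
    }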
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a50c5f6e727..db0990ac3fac 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -953,8 +953,8 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, &tsk->mems_allowed);
-	mpol_rebind_task(tsk, newmems);
+	mpol_rebind_task(tsk, &tsk->mems_allowed, MPOL_REBIND_ONCE);
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_ONCE);
 	tsk->mems_allowed = *newmems;
 }
 
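cpuset_change_task_nodemask() applies the same grow-then-shrink ordering to tsk->mems_allowed itself: it ORs the new nodes in before rebinding and installs the final mask only afterwards. A loose analogue, reusing the helpers from the sketch above (illustrative; the real code issues two MPOL_REBIND_ONCE rebinds, first against the widened mask and then against the target mask):

    /* Loose analogue of cpuset_change_task_nodemask() (illustrative). */
    static void change_task_nodemask(nodemask *mems_allowed, nodemask *policy,
                                     nodemask newmems)
    {
            *mems_allowed |= newmems;	/* nodes_or(): widen first */
            *policy = rebind_step1(*policy, *mems_allowed);	/* never empties */
            *policy = rebind_step2(*policy, newmems);	/* drop the old nodes */
            *mems_allowed = newmems;	/* install the final mask last */
    }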
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0c73c8b814cd..8a993db88029 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -119,7 +119,22 @@ struct mempolicy default_policy = {
 
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
-	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+	/*
+	 * If the read-side task has no lock to protect task->mempolicy,
+	 * the write-side task will rebind task->mempolicy in two steps.
+	 * The first step sets all the newly allowed nodes, and the second
+	 * step cleans all the disallowed nodes. This way we never find
+	 * ourselves with no node from which to allocate a page.
+	 * If the read side holds a lock protecting task->mempolicy, we
+	 * rebind directly in one step.
+	 *
+	 * step:
+	 *	MPOL_REBIND_ONCE  - do the rebind work at once
+	 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
+	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+	 */
+	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
+			enum mpol_rebind_step step);
 } mpol_ops[MPOL_MAX];
 
 /* Check that the nodemask contains at least one populated zone */
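Pushing the step argument into the per-mode rebind callback keeps the caller mode-agnostic: each policy type supplies its own rebind behavior through the ops table, and the dispatch itself appears in the rebind_policy() sketch further down. A skeletal model of that shape, continuing the userspace sketch above (all types and names here are illustrative):

    enum rebind_step { REBIND_ONCE, REBIND_STEP1, REBIND_STEP2, REBIND_NSTEP };

    struct policy {
            int mode;	/* index into the ops table, like pol->mode */
            int flags;	/* holds F_REBINDING below, like pol->flags */
            nodemask nodes;	/* like pol->v.nodes */
    };

    struct policy_ops {
            void (*rebind)(struct policy *pol, nodemask newmems,
                           enum rebind_step step);
    };

    static void rebind_noop(struct policy *pol, nodemask newmems,
                            enum rebind_step step)
    {
            /* MPOL_DEFAULT-style policies have nothing to rebind */
    }

    static const struct policy_ops ops_table[] = {
            { .rebind = rebind_noop },	/* one entry per policy mode */
    };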
@@ -274,12 +289,19 @@ void __mpol_put(struct mempolicy *p)
 	kmem_cache_free(policy_cache, p);
 }
 
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
+				enum mpol_rebind_step step)
 {
 }
 
-static void mpol_rebind_nodemask(struct mempolicy *pol,
-				 const nodemask_t *nodes)
+/*
+ * step:
+ *	MPOL_REBIND_ONCE  - do the rebind work at once
+ *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
+				 enum mpol_rebind_step step)
 {
 	nodemask_t tmp;
 
@@ -288,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 	else {
-		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
-			    *nodes);
-		pol->w.cpuset_mems_allowed = *nodes;
+		/*
+		 * If step is MPOL_REBIND_STEP1, use ->w.cpuset_mems_allowed
+		 * to cache the remapped result for MPOL_REBIND_STEP2.
+		 */
+		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
+			nodes_remap(tmp, pol->v.nodes,
+					pol->w.cpuset_mems_allowed, *nodes);
+			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
+		} else if (step == MPOL_REBIND_STEP2) {
+			tmp = pol->w.cpuset_mems_allowed;
+			pol->w.cpuset_mems_allowed = *nodes;
+		} else
+			BUG();
 	}
 
-	pol->v.nodes = tmp;
+	if (nodes_empty(tmp))
+		tmp = *nodes;
+
+	if (step == MPOL_REBIND_STEP1)
+		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
+	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
+		pol->v.nodes = tmp;
+	else
+		BUG();
+
 	if (!node_isset(current->il_next, tmp)) {
 		current->il_next = next_node(current->il_next, tmp);
 		if (current->il_next >= MAX_NUMNODES)
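The branchy core above preserves the invariant per step: MPOL_REBIND_STEP1 caches the remapped mask in pol->w.cpuset_mems_allowed and only ORs nodes in; MPOL_REBIND_STEP2 retrieves that cache and installs it, removing the old nodes. A compact model of this logic, continuing the sketch; remap_mask() is a simplified stand-in for nodes_remap() that drops bits outside the old mask and omits the modulo wrap:

    /* Map the n-th set bit of 'from' onto the n-th set bit of 'to'
     * (simplified nodes_remap(): no wrap, unmapped bits are dropped). */
    static nodemask remap_mask(nodemask mask, nodemask from, nodemask to)
    {
            nodemask out = 0;
            int rank = -1;

            for (int i = 0; i < 64; i++) {
                    if (!(from & (1UL << i)))
                            continue;
                    rank++;
                    if (!(mask & (1UL << i)))
                            continue;
                    int r = rank, j;
                    for (j = 0; j < 64; j++)	/* find the rank-th bit of 'to' */
                            if ((to & (1UL << j)) && r-- == 0)
                                    break;
                    if (j < 64)
                            out |= 1UL << j;
            }
            return out;
    }

    /* Model of the step logic in mpol_rebind_nodemask(); 'w_cache' plays
     * the role of pol->w.cpuset_mems_allowed (illustrative). */
    static void rebind_nodemask(nodemask *v_nodes, nodemask *w_cache,
                                nodemask newmems, enum rebind_step step)
    {
            nodemask tmp;

            if (step == REBIND_ONCE || step == REBIND_STEP1) {
                    tmp = remap_mask(*v_nodes, *w_cache, newmems);
                    /* STEP1 caches the remapped result for STEP2 */
                    *w_cache = (step == REBIND_STEP1) ? tmp : newmems;
            } else {	/* REBIND_STEP2 */
                    tmp = *w_cache;	/* result cached by STEP1 */
                    *w_cache = newmems;
            }

            if (tmp == 0)	/* nodes_empty() fallback */
                    tmp = newmems;

            if (step == REBIND_STEP1)
                    *v_nodes |= tmp;	/* widen only: never empties */
            else
                    *v_nodes = tmp;	/* ONCE/STEP2: install the final mask */
    }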
@@ -304,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
-						const nodemask_t *nodes)
+				  const nodemask_t *nodes,
+				  enum mpol_rebind_step step)
 {
 	nodemask_t tmp;
 
@@ -327,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
 	}
 }
 
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
-			       const nodemask_t *newmask)
+/*
+ * mpol_rebind_policy - migrate a policy to a different set of nodes
+ *
+ * If the read-side task has no lock to protect task->mempolicy,
+ * the write-side task will rebind task->mempolicy in two steps.
+ * The first step sets all the newly allowed nodes, and the second
+ * step cleans all the disallowed nodes. This way we never find
+ * ourselves with no node from which to allocate a page.
+ * If the read side holds a lock protecting task->mempolicy, we
+ * rebind directly in one step.
+ *
+ * step:
+ *	MPOL_REBIND_ONCE  - do the rebind work at once
+ *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
+				enum mpol_rebind_step step)
 {
 	if (!pol)
 		return;
-	if (!mpol_store_user_nodemask(pol) &&
+	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
-	mpol_ops[pol->mode].rebind(pol, newmask);
+
+	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
+		return;
+
+	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
+		BUG();
+
+	if (step == MPOL_REBIND_STEP1)
+		pol->flags |= MPOL_F_REBINDING;
+	else if (step == MPOL_REBIND_STEP2)
+		pol->flags &= ~MPOL_F_REBINDING;
+	else if (step >= MPOL_REBIND_NSTEP)
+		BUG();
+
+	mpol_ops[pol->mode].rebind(pol, newmask, step);
 }
 
 /*
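mpol_rebind_policy() turns the step argument into a small state machine guarded by MPOL_F_REBINDING: STEP1 on a policy already mid-rebind is silently skipped, and STEP2 without a preceding STEP1 is a bug. Continuing the userspace sketch, with assert() standing in for BUG() (illustrative):

    #define F_REBINDING (1 << 2)	/* models MPOL_F_REBINDING */

    static void rebind_policy(struct policy *pol, nodemask newmems,
                              enum rebind_step step)
    {
            if (!pol)
                    return;

            /* STEP1 already applied: don't widen the mask twice */
            if (step == REBIND_STEP1 && (pol->flags & F_REBINDING))
                    return;

            /* STEP2 without a preceding STEP1 is a protocol violation */
            assert(step != REBIND_STEP2 || (pol->flags & F_REBINDING));

            if (step == REBIND_STEP1)
                    pol->flags |= F_REBINDING;	/* enter the rebinding state */
            else if (step == REBIND_STEP2)
                    pol->flags &= ~F_REBINDING;	/* leave the rebinding state */

            ops_table[pol->mode].rebind(pol, newmems, step);	/* per-mode dispatch */
    }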
@@ -346,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
  * Called with task's alloc_lock held.
  */
 
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+			enum mpol_rebind_step step)
 {
-	mpol_rebind_policy(tsk->mempolicy, new);
+	mpol_rebind_policy(tsk->mempolicy, new, step);
 }
 
 /*
@@ -363,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 
 	down_write(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next)
-		mpol_rebind_policy(vma->vm_policy, new);
+		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 	up_write(&mm->mmap_sem);
 }
 
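VMA policies, by contrast, are only reached under mmap_sem, so the writer can rebind them in a single pass; the two-step dance is needed only where readers are lockless. Continuing the sketch (illustrative):

    /* With a lock serializing readers and the rebinder (as mmap_sem does
     * for VMA policies), a one-step rebind is safe. */
    static void rebind_mm(struct policy **vma_policies, int nr_vmas,
                          nodemask newmems)
    {
            /* caller holds the write lock: no reader sees a partial mask */
            for (int i = 0; i < nr_vmas; i++)
                    rebind_policy(vma_policies[i], newmems, REBIND_ONCE);
    }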
@@ -1745,6 +1817,9 @@ EXPORT_SYMBOL(alloc_pages_current);
  * with the mems_allowed returned by cpuset_mems_allowed().  This
  * keeps mempolicies cpuset relative after its cpuset moves.  See
  * further kernel/cpuset.c update_nodemask().
+ *
+ * current's mempolicy may be rebound by another task (the task that
+ * changes the cpuset's mems), so we don't need to do any rebind work
+ * for the current task here.
  */
 
 /* Slow path of a mempolicy duplicate */
@@ -1754,13 +1829,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+
+	/* task's mempolicy is protected by alloc_lock */
+	if (old == current->mempolicy) {
+		task_lock(current);
+		*new = *old;
+		task_unlock(current);
+	} else
+		*new = *old;
+
 	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
-		mpol_rebind_policy(old, &mems);
+		if (new->flags & MPOL_F_REBINDING)
+			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
+		else
+			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
 	}
 	rcu_read_unlock();
-	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
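__mpol_dup() now snapshots the policy under the owner's alloc_lock and, if the snapshot was taken mid-rebind (MPOL_F_REBINDING set, i.e. STEP1 done but STEP2 pending), finishes the rebind on the private copy. A sketch of that shape, continuing the model above; the cpuset-is-being-rebound check and refcounting are omitted, and the pthread mutex merely models task_lock():

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t owner_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct policy *dup_policy(const struct policy *old, nodemask mems)
    {
            struct policy *new = malloc(sizeof(*new));

            if (!new)
                    return NULL;

            pthread_mutex_lock(&owner_lock);	/* models task_lock(current) */
            *new = *old;	/* snapshot a consistent policy */
            pthread_mutex_unlock(&owner_lock);

            if (new->flags & F_REBINDING)	/* STEP1 done, STEP2 pending */
                    rebind_policy(new, mems, REBIND_STEP2);
            else
                    rebind_policy(new, mems, REBIND_ONCE);
            return new;
    }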