author     Lee Schermerhorn <lee.schermerhorn@hp.com>     2008-04-28 05:13:16 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org> 2008-04-28 11:58:24 -0400
commit     52cd3b074050dd664380b5e8cfc85d4a6ed8ad48 (patch)
tree       fcfcf55c0e81376ea34919fab26e29bedd7f3b88 /mm/mempolicy.c
parent     a6020ed759404372e8be2b276e85e51735472cc9 (diff)
mempolicy: rework mempolicy Reference Counting [yet again]
After further discussion with Christoph Lameter, it has become clear that my
earlier attempts to clean up the mempolicy reference counting were a bit of
overkill in some areas, resulting in superfluous ref/unref in what are usually
fast paths.  In other areas, further inspection reveals that I botched the
unref for interleave policies.

A separate patch, suitable for upstream/stable trees, fixes up the known
errors in the previous attempt to fix reference counting.

This patch reworks the memory policy reference counting and, one hopes,
simplifies the code.  Maybe I'll get it right this time.

See the update to the numa_memory_policy.txt document for a discussion of
memory policy reference counting that motivates this patch.

Summary:

Lookup of mempolicy, based on (vma, address), need only add a reference for a
shared policy, and we need only unref the policy when finished for shared
policies.  So, this patch backs out all of the unneeded extra reference
counting added by my previous attempt.  It then unrefs only shared policies
when we're finished with them, using the mpol_cond_put() [conditional put]
helper function introduced by this patch.

Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma
containing just the policy.  read_swap_cache_async() can call alloc_page_vma()
multiple times, so we can't let alloc_page_vma() unref the shared policy in
this case.  To avoid this, we make a copy of any non-null shared policy and
remove the MPOL_F_SHARED flag from the copy.  This copy occurs before reading
a page [or multiple pages] from swap, so the overhead should not be an issue
here.

I introduced a new static inline function "mpol_cond_copy()" to copy the
shared policy to an on-stack policy and remove the flags that would require a
conditional free.  The current implementation of mpol_cond_copy() assumes that
the struct mempolicy contains no pointers to dynamically allocated structures
that must be duplicated or reference counted during copy.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
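The conditional-reference helpers named above (mpol_needs_cond_ref(), mpol_cond_put(),
mpol_cond_copy()) are introduced on the header side of this series rather than in
mm/mempolicy.c, so they do not appear in the diff below.  The following is a minimal
sketch of the shape they would take in include/linux/mempolicy.h, inferred from the
commit message and the call sites in the diff; the exact upstream definitions may differ:

/*
 * Sketch (not part of this diff): only policies flagged MPOL_F_SHARED
 * carry an extra reference taken at lookup time, so only those pay the
 * atomic unref cost on the way out.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
	return pol && (pol->flags & MPOL_F_SHARED);
}

/* Conditionally drop the extra reference held on a shared policy. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
	if (mpol_needs_cond_ref(pol))
		__mpol_put(pol);
}

/*
 * Conditionally copy a shared policy to an on-stack copy with
 * MPOL_F_SHARED cleared, dropping the extra reference on the original,
 * so a caller such as shmem_swapin() can reuse one lookup across
 * several allocations.
 */
static inline struct mempolicy *mpol_cond_copy(struct mempolicy *to,
						struct mempolicy *from)
{
	if (mpol_needs_cond_ref(from))
		return __mpol_cond_copy(to, from);
	return from;
}

With this split, task and system-default policies stay on the ref-free fast path, and
only shared [shmem/tmpfs] policies ever touch the reference count.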
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--    mm/mempolicy.c    146
1 file changed, 79 insertions(+), 67 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78b18a60b9b2..a237295f8190 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -241,6 +241,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	return policy;
 }
 
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+	if (!atomic_dec_and_test(&p->refcnt))
+		return;
+	p->mode = MPOL_DEFAULT;
+	kmem_cache_free(policy_cache, p);
+}
+
 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 {
 }
@@ -719,6 +728,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		get_zonemask(pol, nmask);
 
  out:
+	mpol_cond_put(pol);
 	if (vma)
 		up_read(&current->mm->mmap_sem);
 	return err;
@@ -1257,16 +1267,18 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to @task or system default policy, as necessary.
- * Returned policy has extra reference count if shared, vma,
- * or some other task's policy [show_numa_maps() can pass
- * @task != current].  It is the caller's responsibility to
- * free the reference in these cases.
+ * Current or other task's task mempolicy and non-shared vma policies
+ * are protected by the task's mmap_sem, which must be held for read by
+ * the caller.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task.  It is the caller's responsibility to free the
+ * extra reference for shared policies.
  */
 static struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
-	int shared_pol = 0;
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1274,20 +1286,20 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
 						addr);
 			if (vpol)
 				pol = vpol;
-			shared_pol = 1;	/* if pol non-NULL, add ref below */
 		} else if (vma->vm_policy &&
 				vma->vm_policy->mode != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
-	else if (!shared_pol && pol != current->mempolicy)
-		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
 
-/* Return a nodemask representing a mempolicy */
-static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1298,8 +1310,8 @@ static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
 	return NULL;
 }
 
-/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+/* Return a zonelist indicated by gfp for node representing a mempolicy */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
 {
 	int nd;
 
@@ -1311,10 +1323,10 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 		break;
 	case MPOL_BIND:
 		/*
-		 * Normally, MPOL_BIND allocations node-local are node-local
-		 * within the allowed nodemask. However, if __GFP_THISNODE is
-		 * set and the current node is part of the mask, we use the
-		 * the zonelist for the first node in the mask instead.
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
+		 * current node is part of the mask, we use the zonelist for
+		 * the first node in the mask instead.
 		 */
 		nd = numa_node_id();
 		if (unlikely(gfp & __GFP_THISNODE) &&
@@ -1350,6 +1362,10 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
+ * @policy must be protected from freeing by the caller.  If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change its policy.  The system default policy requires no
+ * such protection.
  */
 unsigned slab_node(struct mempolicy *policy)
 {
@@ -1435,43 +1451,27 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
- * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to local node's zonelist,
- * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
- * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after the allocation.
- * In that case, return policy via @mpol so hugetlb allocation can drop
- * the reference.  For non-'BIND referenced policies, we can/do drop the
- * reference here, so the caller doesn't need to know about the special case
- * for default and current task policy.
+ * Returns a zonelist suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
  */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 				gfp_t gfp_flags, struct mempolicy **mpol,
 				nodemask_t **nodemask)
 {
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	*mpol = NULL;		/* probably no unref needed */
+	*mpol = get_vma_policy(current, vma, addr);
 	*nodemask = NULL;	/* assume !MPOL_BIND */
-	if (pol->mode == MPOL_BIND) {
-		*nodemask = &pol->v.nodes;
-	} else if (pol->mode == MPOL_INTERLEAVE) {
-		unsigned nid;
-
-		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
-		if (unlikely(pol != &default_policy &&
-				pol != current->mempolicy))
-			__mpol_put(pol);	/* finished with pol */
-		return node_zonelist(nid, gfp_flags);
-	}
 
-	zl = zonelist_policy(GFP_HIGHUSER, pol);
-	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
-		if (pol->mode != MPOL_BIND)
-			__mpol_put(pol);	/* finished with pol */
-		else
-			*mpol = pol;	/* unref needed after allocation */
+	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+						HPAGE_SHIFT), gfp_flags);
+	} else {
+		zl = policy_zonelist(gfp_flags, *mpol);
+		if ((*mpol)->mode == MPOL_BIND)
+			*nodemask = &(*mpol)->v.nodes;
 	}
 	return zl;
 }
@@ -1526,25 +1526,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
-		if (unlikely(pol != &default_policy &&
-				pol != current->mempolicy))
-			__mpol_put(pol);	/* finished with pol */
+		mpol_cond_put(pol);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	zl = zonelist_policy(gfp, pol);
-	if (pol != &default_policy && pol != current->mempolicy) {
+	zl = policy_zonelist(gfp, pol);
+	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
-		 * slow path: ref counted policy -- shared or vma
+		 * slow path: ref counted shared policy
 		 */
 		struct page *page = __alloc_pages_nodemask(gfp, 0,
-						zl, nodemask_policy(gfp, pol));
+						zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		return page;
 	}
 	/*
 	 * fast path:  default or task policy
 	 */
-	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
 }
 
 /**
@@ -1574,10 +1572,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
+
+	/*
+	 * No reference counting needed for current->mempolicy
+	 * nor system default_policy
+	 */
 	if (pol->mode == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	return __alloc_pages_nodemask(gfp, order,
-			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
+			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1605,6 +1608,28 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	return new;
 }
 
+/*
+ * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
+ * eliminate the MPOL_F_* flags that require conditional ref and
+ * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
+ * after return.  Use the returned value.
+ *
+ * Allows use of a mempolicy for, e.g., multiple allocations with a single
+ * policy lookup, even if the policy needs/has extra ref on lookup.
+ * shmem_readahead needs this.
+ */
+struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
+						struct mempolicy *frompol)
+{
+	if (!mpol_needs_cond_ref(frompol))
+		return frompol;
+
+	*tompol = *frompol;
+	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
+	__mpol_put(frompol);
+	return tompol;
+}
+
 static int mpol_match_intent(const struct mempolicy *a,
 			const struct mempolicy *b)
 {
@@ -1639,15 +1664,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	}
 }
 
-/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
-{
-	if (!atomic_dec_and_test(&p->refcnt))
-		return;
-	p->mode = MPOL_DEFAULT;
-	kmem_cache_free(policy_cache, p);
-}
-
 /*
  * Shared memory backing store policy support.
  *
@@ -2081,11 +2097,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol);
-	/*
-	 * unref shared or other task's mempolicy
-	 */
-	if (pol != &default_policy && pol != current->mempolicy)
-		__mpol_put(pol);
+	mpol_cond_put(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
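
For completeness, the shmem_swapin() scenario described in the commit message, where a
possibly-shared policy is copied to an on-stack mempolicy before the swap readahead path
[which may call alloc_page_vma() several times] so the shared reference is dropped exactly
once, would look roughly like the sketch below.  This is illustrative only: the real change
lives in a companion mm/shmem.c patch, and the pseudo-vma setup shown here is an assumption
based on the commit message rather than part of this diff.

/*
 * Sketch of the caller side (mm/shmem.c context assumed):
 * mpol_cond_copy() turns a shared policy into an on-stack copy with
 * MPOL_F_SHARED cleared, so alloc_page_vma() in the readahead loop
 * never tries to unref the shared policy.
 */
static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct mempolicy mpol, *spol;
	struct vm_area_struct pvma;

	/* one lookup, conditional copy + unref of any shared policy */
	spol = mpol_cond_copy(&mpol,
			mpol_shared_policy_lookup(&info->policy, idx));

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = spol;

	/* readahead ends up in read_swap_cache_async()/alloc_page_vma() */
	return swapin_readahead(entry, gfp, &pvma, 0);
}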