diff options
author | Lee Schermerhorn <lee.schermerhorn@hp.com> | 2008-04-28 05:13:16 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-28 11:58:24 -0400 |
commit | 52cd3b074050dd664380b5e8cfc85d4a6ed8ad48 (patch) | |
tree | fcfcf55c0e81376ea34919fab26e29bedd7f3b88 /mm/mempolicy.c | |
parent | a6020ed759404372e8be2b276e85e51735472cc9 (diff) |
mempolicy: rework mempolicy Reference Counting [yet again]
After further discussion with Christoph Lameter, it has become clear that my
earlier attempts to clean up the mempolicy reference counting were a bit of
overkill in some areas, resulting in superfluous ref/unref in what are usually
fast paths. In other areas, further inspection reveals that I botched the
unref for interleave policies.
A separate patch, suitable for upstream/stable trees, fixes up the known
errors in the previous attempt to fix reference counting.
This patch reworks the memory policy reference counting and, one hopes,
simplifies the code. Maybe I'll get it right this time.
See the update to the numa_memory_policy.txt document for a discussion of
memory policy reference counting that motivates this patch.
Summary:
Lookup of mempolicy, based on (vma, address) need only add a reference for
shared policy, and we need only unref the policy when finished for shared
policies. So, this patch backs out all of the unneeded extra reference
counting added by my previous attempt. It then unrefs only shared policies
when we're finished with them, using the mpol_cond_put() [conditional put]
helper function introduced by this patch.
Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma
containing just the policy. read_swap_cache_async() can call alloc_page_vma()
multiple times, so we can't let alloc_page_vma() unref the shared policy in
this case. To avoid this, we make a copy of any non-null shared policy and
remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading
a page [or multiple pages] from swap, so the overhead should not be an issue
here.
I introduced a new static inline function "mpol_cond_copy()" to copy the
shared policy to an on-stack policy and remove the flags that would require a
conditional free. The current implementation of mpol_cond_copy() assumes that
the struct mempolicy contains no pointers to dynamically allocated structures
that must be duplicated or reference counted during copy.
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 146 |
1 files changed, 79 insertions, 67 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78b18a60b9b2..a237295f8190 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -241,6 +241,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
241 | return policy; | 241 | return policy; |
242 | } | 242 | } |
243 | 243 | ||
244 | /* Slow path of a mpol destructor. */ | ||
245 | void __mpol_put(struct mempolicy *p) | ||
246 | { | ||
247 | if (!atomic_dec_and_test(&p->refcnt)) | ||
248 | return; | ||
249 | p->mode = MPOL_DEFAULT; | ||
250 | kmem_cache_free(policy_cache, p); | ||
251 | } | ||
252 | |||
244 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) | 253 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) |
245 | { | 254 | { |
246 | } | 255 | } |
@@ -719,6 +728,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
719 | get_zonemask(pol, nmask); | 728 | get_zonemask(pol, nmask); |
720 | 729 | ||
721 | out: | 730 | out: |
731 | mpol_cond_put(pol); | ||
722 | if (vma) | 732 | if (vma) |
723 | up_read(&current->mm->mmap_sem); | 733 | up_read(&current->mm->mmap_sem); |
724 | return err; | 734 | return err; |
@@ -1257,16 +1267,18 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1257 | * | 1267 | * |
1258 | * Returns effective policy for a VMA at specified address. | 1268 | * Returns effective policy for a VMA at specified address. |
1259 | * Falls back to @task or system default policy, as necessary. | 1269 | * Falls back to @task or system default policy, as necessary. |
1260 | * Returned policy has extra reference count if shared, vma, | 1270 | * Current or other task's task mempolicy and non-shared vma policies |
1261 | * or some other task's policy [show_numa_maps() can pass | 1271 | * are protected by the task's mmap_sem, which must be held for read by |
1262 | * @task != current]. It is the caller's responsibility to | 1272 | * the caller. |
1263 | * free the reference in these cases. | 1273 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference |
1274 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1275 | * freeing by another task. It is the caller's responsibility to free the | ||
1276 | * extra reference for shared policies. | ||
1264 | */ | 1277 | */ |
1265 | static struct mempolicy *get_vma_policy(struct task_struct *task, | 1278 | static struct mempolicy *get_vma_policy(struct task_struct *task, |
1266 | struct vm_area_struct *vma, unsigned long addr) | 1279 | struct vm_area_struct *vma, unsigned long addr) |
1267 | { | 1280 | { |
1268 | struct mempolicy *pol = task->mempolicy; | 1281 | struct mempolicy *pol = task->mempolicy; |
1269 | int shared_pol = 0; | ||
1270 | 1282 | ||
1271 | if (vma) { | 1283 | if (vma) { |
1272 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1284 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1274,20 +1286,20 @@ static struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1274 | addr); | 1286 | addr); |
1275 | if (vpol) | 1287 | if (vpol) |
1276 | pol = vpol; | 1288 | pol = vpol; |
1277 | shared_pol = 1; /* if pol non-NULL, add ref below */ | ||
1278 | } else if (vma->vm_policy && | 1289 | } else if (vma->vm_policy && |
1279 | vma->vm_policy->mode != MPOL_DEFAULT) | 1290 | vma->vm_policy->mode != MPOL_DEFAULT) |
1280 | pol = vma->vm_policy; | 1291 | pol = vma->vm_policy; |
1281 | } | 1292 | } |
1282 | if (!pol) | 1293 | if (!pol) |
1283 | pol = &default_policy; | 1294 | pol = &default_policy; |
1284 | else if (!shared_pol && pol != current->mempolicy) | ||
1285 | mpol_get(pol); /* vma or other task's policy */ | ||
1286 | return pol; | 1295 | return pol; |
1287 | } | 1296 | } |
1288 | 1297 | ||
1289 | /* Return a nodemask representing a mempolicy */ | 1298 | /* |
1290 | static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) | 1299 | * Return a nodemask representing a mempolicy for filtering nodes for |
1300 | * page allocation | ||
1301 | */ | ||
1302 | static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | ||
1291 | { | 1303 | { |
1292 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ | 1304 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
1293 | if (unlikely(policy->mode == MPOL_BIND) && | 1305 | if (unlikely(policy->mode == MPOL_BIND) && |
@@ -1298,8 +1310,8 @@ static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) | |||
1298 | return NULL; | 1310 | return NULL; |
1299 | } | 1311 | } |
1300 | 1312 | ||
1301 | /* Return a zonelist representing a mempolicy */ | 1313 | /* Return a zonelist indicated by gfp for node representing a mempolicy */ |
1302 | static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) | 1314 | static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) |
1303 | { | 1315 | { |
1304 | int nd; | 1316 | int nd; |
1305 | 1317 | ||
@@ -1311,10 +1323,10 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) | |||
1311 | break; | 1323 | break; |
1312 | case MPOL_BIND: | 1324 | case MPOL_BIND: |
1313 | /* | 1325 | /* |
1314 | * Normally, MPOL_BIND allocations node-local are node-local | 1326 | * Normally, MPOL_BIND allocations are node-local within the |
1315 | * within the allowed nodemask. However, if __GFP_THISNODE is | 1327 | * allowed nodemask. However, if __GFP_THISNODE is set and the |
1316 | * set and the current node is part of the mask, we use the | 1328 | * current node is part of the mask, we use the zonelist for |
1317 | * the zonelist for the first node in the mask instead. | 1329 | * the first node in the mask instead. |
1318 | */ | 1330 | */ |
1319 | nd = numa_node_id(); | 1331 | nd = numa_node_id(); |
1320 | if (unlikely(gfp & __GFP_THISNODE) && | 1332 | if (unlikely(gfp & __GFP_THISNODE) && |
@@ -1350,6 +1362,10 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1350 | /* | 1362 | /* |
1351 | * Depending on the memory policy provide a node from which to allocate the | 1363 | * Depending on the memory policy provide a node from which to allocate the |
1352 | * next slab entry. | 1364 | * next slab entry. |
1365 | * @policy must be protected by freeing by the caller. If @policy is | ||
1366 | * the current task's mempolicy, this protection is implicit, as only the | ||
1367 | * task can change it's policy. The system default policy requires no | ||
1368 | * such protection. | ||
1353 | */ | 1369 | */ |
1354 | unsigned slab_node(struct mempolicy *policy) | 1370 | unsigned slab_node(struct mempolicy *policy) |
1355 | { | 1371 | { |
@@ -1435,43 +1451,27 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1435 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy | 1451 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy |
1436 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | 1452 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask |
1437 | * | 1453 | * |
1438 | * Returns a zonelist suitable for a huge page allocation. | 1454 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1439 | * If the effective policy is 'BIND, returns pointer to local node's zonelist, | 1455 | * to the struct mempolicy for conditional unref after allocation. |
1440 | * and a pointer to the mempolicy's @nodemask for filtering the zonelist. | 1456 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's |
1441 | * If it is also a policy for which get_vma_policy() returns an extra | 1457 | * @nodemask for filtering the zonelist. |
1442 | * reference, we must hold that reference until after the allocation. | ||
1443 | * In that case, return policy via @mpol so hugetlb allocation can drop | ||
1444 | * the reference. For non-'BIND referenced policies, we can/do drop the | ||
1445 | * reference here, so the caller doesn't need to know about the special case | ||
1446 | * for default and current task policy. | ||
1447 | */ | 1458 | */ |
1448 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | 1459 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
1449 | gfp_t gfp_flags, struct mempolicy **mpol, | 1460 | gfp_t gfp_flags, struct mempolicy **mpol, |
1450 | nodemask_t **nodemask) | 1461 | nodemask_t **nodemask) |
1451 | { | 1462 | { |
1452 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
1453 | struct zonelist *zl; | 1463 | struct zonelist *zl; |
1454 | 1464 | ||
1455 | *mpol = NULL; /* probably no unref needed */ | 1465 | *mpol = get_vma_policy(current, vma, addr); |
1456 | *nodemask = NULL; /* assume !MPOL_BIND */ | 1466 | *nodemask = NULL; /* assume !MPOL_BIND */ |
1457 | if (pol->mode == MPOL_BIND) { | ||
1458 | *nodemask = &pol->v.nodes; | ||
1459 | } else if (pol->mode == MPOL_INTERLEAVE) { | ||
1460 | unsigned nid; | ||
1461 | |||
1462 | nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | ||
1463 | if (unlikely(pol != &default_policy && | ||
1464 | pol != current->mempolicy)) | ||
1465 | __mpol_put(pol); /* finished with pol */ | ||
1466 | return node_zonelist(nid, gfp_flags); | ||
1467 | } | ||
1468 | 1467 | ||
1469 | zl = zonelist_policy(GFP_HIGHUSER, pol); | 1468 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
1470 | if (unlikely(pol != &default_policy && pol != current->mempolicy)) { | 1469 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, |
1471 | if (pol->mode != MPOL_BIND) | 1470 | HPAGE_SHIFT), gfp_flags); |
1472 | __mpol_put(pol); /* finished with pol */ | 1471 | } else { |
1473 | else | 1472 | zl = policy_zonelist(gfp_flags, *mpol); |
1474 | *mpol = pol; /* unref needed after allocation */ | 1473 | if ((*mpol)->mode == MPOL_BIND) |
1474 | *nodemask = &(*mpol)->v.nodes; | ||
1475 | } | 1475 | } |
1476 | return zl; | 1476 | return zl; |
1477 | } | 1477 | } |
@@ -1526,25 +1526,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1526 | unsigned nid; | 1526 | unsigned nid; |
1527 | 1527 | ||
1528 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1528 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
1529 | if (unlikely(pol != &default_policy && | 1529 | mpol_cond_put(pol); |
1530 | pol != current->mempolicy)) | ||
1531 | __mpol_put(pol); /* finished with pol */ | ||
1532 | return alloc_page_interleave(gfp, 0, nid); | 1530 | return alloc_page_interleave(gfp, 0, nid); |
1533 | } | 1531 | } |
1534 | zl = zonelist_policy(gfp, pol); | 1532 | zl = policy_zonelist(gfp, pol); |
1535 | if (pol != &default_policy && pol != current->mempolicy) { | 1533 | if (unlikely(mpol_needs_cond_ref(pol))) { |
1536 | /* | 1534 | /* |
1537 | * slow path: ref counted policy -- shared or vma | 1535 | * slow path: ref counted shared policy |
1538 | */ | 1536 | */ |
1539 | struct page *page = __alloc_pages_nodemask(gfp, 0, | 1537 | struct page *page = __alloc_pages_nodemask(gfp, 0, |
1540 | zl, nodemask_policy(gfp, pol)); | 1538 | zl, policy_nodemask(gfp, pol)); |
1541 | __mpol_put(pol); | 1539 | __mpol_put(pol); |
1542 | return page; | 1540 | return page; |
1543 | } | 1541 | } |
1544 | /* | 1542 | /* |
1545 | * fast path: default or task policy | 1543 | * fast path: default or task policy |
1546 | */ | 1544 | */ |
1547 | return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); | 1545 | return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); |
1548 | } | 1546 | } |
1549 | 1547 | ||
1550 | /** | 1548 | /** |
@@ -1574,10 +1572,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1574 | cpuset_update_task_memory_state(); | 1572 | cpuset_update_task_memory_state(); |
1575 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1573 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1576 | pol = &default_policy; | 1574 | pol = &default_policy; |
1575 | |||
1576 | /* | ||
1577 | * No reference counting needed for current->mempolicy | ||
1578 | * nor system default_policy | ||
1579 | */ | ||
1577 | if (pol->mode == MPOL_INTERLEAVE) | 1580 | if (pol->mode == MPOL_INTERLEAVE) |
1578 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 1581 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); |
1579 | return __alloc_pages_nodemask(gfp, order, | 1582 | return __alloc_pages_nodemask(gfp, order, |
1580 | zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); | 1583 | policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); |
1581 | } | 1584 | } |
1582 | EXPORT_SYMBOL(alloc_pages_current); | 1585 | EXPORT_SYMBOL(alloc_pages_current); |
1583 | 1586 | ||
@@ -1605,6 +1608,28 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
1605 | return new; | 1608 | return new; |
1606 | } | 1609 | } |
1607 | 1610 | ||
1611 | /* | ||
1612 | * If *frompol needs [has] an extra ref, copy *frompol to *tompol , | ||
1613 | * eliminate the * MPOL_F_* flags that require conditional ref and | ||
1614 | * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly | ||
1615 | * after return. Use the returned value. | ||
1616 | * | ||
1617 | * Allows use of a mempolicy for, e.g., multiple allocations with a single | ||
1618 | * policy lookup, even if the policy needs/has extra ref on lookup. | ||
1619 | * shmem_readahead needs this. | ||
1620 | */ | ||
1621 | struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | ||
1622 | struct mempolicy *frompol) | ||
1623 | { | ||
1624 | if (!mpol_needs_cond_ref(frompol)) | ||
1625 | return frompol; | ||
1626 | |||
1627 | *tompol = *frompol; | ||
1628 | tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ | ||
1629 | __mpol_put(frompol); | ||
1630 | return tompol; | ||
1631 | } | ||
1632 | |||
1608 | static int mpol_match_intent(const struct mempolicy *a, | 1633 | static int mpol_match_intent(const struct mempolicy *a, |
1609 | const struct mempolicy *b) | 1634 | const struct mempolicy *b) |
1610 | { | 1635 | { |
@@ -1639,15 +1664,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
1639 | } | 1664 | } |
1640 | } | 1665 | } |
1641 | 1666 | ||
1642 | /* Slow path of a mpol destructor. */ | ||
1643 | void __mpol_put(struct mempolicy *p) | ||
1644 | { | ||
1645 | if (!atomic_dec_and_test(&p->refcnt)) | ||
1646 | return; | ||
1647 | p->mode = MPOL_DEFAULT; | ||
1648 | kmem_cache_free(policy_cache, p); | ||
1649 | } | ||
1650 | |||
1651 | /* | 1667 | /* |
1652 | * Shared memory backing store policy support. | 1668 | * Shared memory backing store policy support. |
1653 | * | 1669 | * |
@@ -2081,11 +2097,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
2081 | 2097 | ||
2082 | pol = get_vma_policy(priv->task, vma, vma->vm_start); | 2098 | pol = get_vma_policy(priv->task, vma, vma->vm_start); |
2083 | mpol_to_str(buffer, sizeof(buffer), pol); | 2099 | mpol_to_str(buffer, sizeof(buffer), pol); |
2084 | /* | 2100 | mpol_cond_put(pol); |
2085 | * unref shared or other task's mempolicy | ||
2086 | */ | ||
2087 | if (pol != &default_policy && pol != current->mempolicy) | ||
2088 | __mpol_put(pol); | ||
2089 | 2101 | ||
2090 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 2102 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
2091 | 2103 | ||