Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	146
1 file changed, 79 insertions(+), 67 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78b18a60b9b2..a237295f8190 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -241,6 +241,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	return policy;
 }
 
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+	if (!atomic_dec_and_test(&p->refcnt))
+		return;
+	p->mode = MPOL_DEFAULT;
+	kmem_cache_free(policy_cache, p);
+}
+
 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 {
 }
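The rest of this patch calls mpol_cond_put() and mpol_needs_cond_ref(), which are not defined in this file. Presumably they are inline helpers added alongside this change in include/linux/mempolicy.h; a minimal sketch of what they would look like, assuming MPOL_F_SHARED is the flag that marks shared (conditionally referenced) policies:

/* Sketch only -- assumed to live in include/linux/mempolicy.h, not in this diff. */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
	/* only shared policies carry the extra, conditional reference */
	return (pol && (pol->flags & MPOL_F_SHARED));
}

static inline void mpol_cond_put(struct mempolicy *pol)
{
	if (mpol_needs_cond_ref(pol))
		__mpol_put(pol);	/* the slow-path destructor moved above */
}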
@@ -719,6 +728,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		get_zonemask(pol, nmask);
 
 out:
+	mpol_cond_put(pol);
 	if (vma)
 		up_read(&current->mm->mmap_sem);
 	return err;
@@ -1257,16 +1267,18 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to @task or system default policy, as necessary.
- * Returned policy has extra reference count if shared, vma,
- * or some other task's policy [show_numa_maps() can pass
- * @task != current].  It is the caller's responsibility to
- * free the reference in these cases.
+ * Current or other task's task mempolicy and non-shared vma policies
+ * are protected by the task's mmap_sem, which must be held for read by
+ * the caller.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task.  It is the caller's responsibility to free the
+ * extra reference for shared policies.
  */
 static struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
-	int shared_pol = 0;
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1274,20 +1286,20 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
 							addr);
 			if (vpol)
 				pol = vpol;
-			shared_pol = 1;	/* if pol non-NULL, add ref below */
 		} else if (vma->vm_policy &&
 				vma->vm_policy->mode != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
-	else if (!shared_pol && pol != current->mempolicy)
-		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
 
-/* Return a nodemask representing a mempolicy */
-static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
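With the conditional scheme, a caller of get_vma_policy() no longer needs the old `pol != &default_policy && pol != current->mempolicy` tests: it holds mmap_sem for read and finishes with a single mpol_cond_put(), which only drops the reference a shared policy picked up in the get_policy() vm_op. A sketch of the pattern, mirroring the show_numa_map() and do_get_mempolicy() hunks elsewhere in this diff (the helper itself is hypothetical, purely illustrative):

/* Hypothetical reader of a task's effective policy at @addr -- sketch only. */
static void example_read_policy(struct task_struct *task, unsigned long addr)
{
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma;
	struct mempolicy *pol;

	down_read(&mm->mmap_sem);		/* required by get_vma_policy() */
	vma = find_vma(mm, addr);
	pol = get_vma_policy(task, vma, addr);	/* shared policy => extra ref held */
	/* ... inspect pol->mode, pol->v.nodes, format it, etc. ... */
	mpol_cond_put(pol);	/* no-op unless the policy is MPOL_F_SHARED */
	up_read(&mm->mmap_sem);
}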
@@ -1298,8 +1310,8 @@ static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
 	return NULL;
 }
 
-/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+/* Return a zonelist indicated by gfp for node representing a mempolicy */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
 {
 	int nd;
 
@@ -1311,10 +1323,10 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 		break;
 	case MPOL_BIND:
 		/*
-		 * Normally, MPOL_BIND allocations node-local are node-local
-		 * within the allowed nodemask.  However, if __GFP_THISNODE is
-		 * set and the current node is part of the mask, we use the
-		 * the zonelist for the first node in the mask instead.
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
+		 * current node is part of the mask, we use the zonelist for
+		 * the first node in the mask instead.
		 */
 		nd = numa_node_id();
 		if (unlikely(gfp & __GFP_THISNODE) &&
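The hunk ends mid-statement; only the comment changes here. For context, the MPOL_BIND/__GFP_THISNODE special case it describes presumably continues roughly as follows (a sketch of code outside the hunk, not part of this diff):

		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);	/* fall back to a node in the mask */
		break;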
@@ -1350,6 +1362,10 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
+ * @policy must be protected from freeing by the caller.  If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change its policy.  The system default policy requires no
+ * such protection.
  */
 unsigned slab_node(struct mempolicy *policy)
 {
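The new comment spells out the lifetime rule; the common caller passes the current task's policy, which needs no extra protection. Roughly (an illustrative fragment, not taken from this diff):

	/* illustrative: picking a slab node under the caller's own policy */
	int nid;

	if (current->mempolicy)
		nid = slab_node(current->mempolicy);	/* implicitly protected */
	else
		nid = numa_node_id();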
@@ -1435,43 +1451,27 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
- * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to local node's zonelist,
- * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
- * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after the allocation.
- * In that case, return policy via @mpol so hugetlb allocation can drop
- * the reference.  For non-'BIND referenced policies, we can/do drop the
- * reference here, so the caller doesn't need to know about the special case
- * for default and current task policy.
+ * Returns a zonelist suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
  */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 				gfp_t gfp_flags, struct mempolicy **mpol,
 				nodemask_t **nodemask)
 {
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	*mpol = NULL;		/* probably no unref needed */
+	*mpol = get_vma_policy(current, vma, addr);
 	*nodemask = NULL;	/* assume !MPOL_BIND */
-	if (pol->mode == MPOL_BIND) {
-		*nodemask = &pol->v.nodes;
-	} else if (pol->mode == MPOL_INTERLEAVE) {
-		unsigned nid;
-
-		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
-		if (unlikely(pol != &default_policy &&
-				pol != current->mempolicy))
-			__mpol_put(pol);	/* finished with pol */
-		return node_zonelist(nid, gfp_flags);
-	}
 
-	zl = zonelist_policy(GFP_HIGHUSER, pol);
-	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
-		if (pol->mode != MPOL_BIND)
-			__mpol_put(pol);	/* finished with pol */
-		else
-			*mpol = pol;	/* unref needed after allocation */
+	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+				HPAGE_SHIFT), gfp_flags);
+	} else {
+		zl = policy_zonelist(gfp_flags, *mpol);
+		if ((*mpol)->mode == MPOL_BIND)
+			*nodemask = &(*mpol)->v.nodes;
 	}
 	return zl;
 }
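huge_zonelist() now always hands the effective policy back through @mpol, so the hugetlb side can drop any conditional reference after the allocation with a single mpol_cond_put() instead of special-casing 'BIND. A sketch of the expected caller shape (names other than huge_zonelist() and mpol_cond_put() are assumptions, and the zonelist walk is elided):

	/* illustrative hugetlb-side caller -- sketch, not part of this diff */
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist;

	zonelist = huge_zonelist(vma, address, htlb_alloc_mask,
					&mpol, &nodemask);
	/* ... walk zonelist, skipping zones whose node is not in *nodemask ... */
	mpol_cond_put(mpol);	/* unref only if the effective policy was shared */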
@@ -1526,25 +1526,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
-		if (unlikely(pol != &default_policy &&
-				pol != current->mempolicy))
-			__mpol_put(pol);	/* finished with pol */
+		mpol_cond_put(pol);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	zl = zonelist_policy(gfp, pol);
-	if (pol != &default_policy && pol != current->mempolicy) {
+	zl = policy_zonelist(gfp, pol);
+	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
-		 * slow path: ref counted policy -- shared or vma
+		 * slow path: ref counted shared policy
 		 */
 		struct page *page = __alloc_pages_nodemask(gfp, 0,
-						zl, nodemask_policy(gfp, pol));
+						zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		return page;
 	}
 	/*
 	 * fast path: default or task policy
 	 */
-	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
 }
 
 /**
@@ -1574,10 +1572,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
+
+	/*
+	 * No reference counting needed for current->mempolicy
+	 * nor system default_policy
+	 */
 	if (pol->mode == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	return __alloc_pages_nodemask(gfp, order,
-			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
+			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1605,6 +1608,28 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	return new;
 }
 
+/*
+ * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
+ * eliminate the MPOL_F_* flags that require conditional ref and
+ * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
+ * after return.  Use the returned value.
+ *
+ * Allows use of a mempolicy for, e.g., multiple allocations with a single
+ * policy lookup, even if the policy needs/has extra ref on lookup.
+ * shmem_readahead needs this.
+ */
+struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
+					struct mempolicy *frompol)
+{
+	if (!mpol_needs_cond_ref(frompol))
+		return frompol;
+
+	*tompol = *frompol;
+	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
+	__mpol_put(frompol);
+	return tompol;
+}
+
 static int mpol_match_intent(const struct mempolicy *a,
 			const struct mempolicy *b)
 {
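The comment's shmem_readahead case would reach __mpol_cond_copy() through a small wrapper, presumably also added to include/linux/mempolicy.h, so that a possibly-shared lookup result can be copied into an on-stack mempolicy and reused for several allocations with no further reference counting. A hedged sketch; the wrapper name and the shmem identifiers are assumptions, not part of this diff:

/* Sketch only: NULL-tolerant wrapper around __mpol_cond_copy(). */
static inline struct mempolicy *mpol_cond_copy(struct mempolicy *to,
						struct mempolicy *from)
{
	if (!from)
		return from;
	return __mpol_cond_copy(to, from);
}

/*
 * Hypothetical shmem-style caller:
 *
 *	struct mempolicy mpol, *spol;
 *
 *	spol = mpol_cond_copy(&mpol,
 *			mpol_shared_policy_lookup(&info->policy, idx));
 *	// spol is now stable for several page allocations; no unref needed
 */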
@@ -1639,15 +1664,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	}
 }
 
-/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
-{
-	if (!atomic_dec_and_test(&p->refcnt))
-		return;
-	p->mode = MPOL_DEFAULT;
-	kmem_cache_free(policy_cache, p);
-}
-
 /*
  * Shared memory backing store policy support.
  *
@@ -2081,11 +2097,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol);
-	/*
-	 * unref shared or other task's mempolicy
-	 */
-	if (pol != &default_policy && pol != current->mempolicy)
-		__mpol_put(pol);
+	mpol_cond_put(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 