Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	146
1 file changed, 79 insertions(+), 67 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78b18a60b9b2..a237295f8190 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -241,6 +241,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	return policy;
 }
 
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+	if (!atomic_dec_and_test(&p->refcnt))
+		return;
+	p->mode = MPOL_DEFAULT;
+	kmem_cache_free(policy_cache, p);
+}
+
 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 {
 }
@@ -719,6 +728,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		get_zonemask(pol, nmask);
 
  out:
+	mpol_cond_put(pol);
 	if (vma)
 		up_read(&current->mm->mmap_sem);
 	return err;
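Note: the mpol_cond_put() call added above, and the mpol_needs_cond_ref() test used further down, are not defined in mm/mempolicy.c, so their definitions do not appear in this diff (which is limited to 'mm/mempolicy.c'). A minimal sketch of what these helpers presumably look like, inferred from how they are used in this patch (exact names and placement, likely include/linux/mempolicy.h, are assumptions):

/*
 * Sketch only -- inferred from usage in this patch, not taken from it.
 * A policy needs a conditional unref only if it is marked MPOL_F_SHARED,
 * i.e. the get_policy() vm_op took an extra reference on lookup.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
	return pol && (pol->flags & MPOL_F_SHARED);
}

static inline void mpol_cond_put(struct mempolicy *pol)
{
	if (mpol_needs_cond_ref(pol))
		__mpol_put(pol);	/* drop the ref added by get_policy() */
}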
@@ -1257,16 +1267,18 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to @task or system default policy, as necessary.
- * Returned policy has extra reference count if shared, vma,
- * or some other task's policy [show_numa_maps() can pass
- * @task != current].  It is the caller's responsibility to
- * free the reference in these cases.
+ * Current or other task's task mempolicy and non-shared vma policies
+ * are protected by the task's mmap_sem, which must be held for read by
+ * the caller.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task.  It is the caller's responsibility to free the
+ * extra reference for shared policies.
  */
 static struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
-	int shared_pol = 0;
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1274,20 +1286,20 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
 									addr);
 			if (vpol)
 				pol = vpol;
-			shared_pol = 1;	/* if pol non-NULL, add ref below */
 		} else if (vma->vm_policy &&
 				vma->vm_policy->mode != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
-	else if (!shared_pol && pol != current->mempolicy)
-		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
 
-/* Return a nodemask representing a mempolicy */
-static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1298,8 +1310,8 @@ static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
 	return NULL;
 }
 
-/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+/* Return a zonelist indicated by gfp for node representing a mempolicy */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
 {
 	int nd;
 
@@ -1311,10 +1323,10 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 		break;
 	case MPOL_BIND:
 		/*
-		 * Normally, MPOL_BIND allocations node-local are node-local
-		 * within the allowed nodemask. However, if __GFP_THISNODE is
-		 * set and the current node is part of the mask, we use the
-		 * the zonelist for the first node in the mask instead.
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask. However, if __GFP_THISNODE is set and the
+		 * current node is part of the mask, we use the zonelist for
+		 * the first node in the mask instead.
 		 */
 		nd = numa_node_id();
 		if (unlikely(gfp & __GFP_THISNODE) &&
@@ -1350,6 +1362,10 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
  * Depending on the memory policy provide a node from which to allocate the
  * next slab entry.
+ * @policy must be protected from freeing by the caller.  If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change its policy.  The system default policy requires no
+ * such protection.
  */
 unsigned slab_node(struct mempolicy *policy)
 {
@@ -1435,43 +1451,27 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
- * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to local node's zonelist,
- * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
- * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after the allocation.
- * In that case, return policy via @mpol so hugetlb allocation can drop
- * the reference. For non-'BIND referenced policies, we can/do drop the
- * reference here, so the caller doesn't need to know about the special case
- * for default and current task policy.
+ * Returns a zonelist suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
  */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 				gfp_t gfp_flags, struct mempolicy **mpol,
 				nodemask_t **nodemask)
 {
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	*mpol = NULL;		/* probably no unref needed */
+	*mpol = get_vma_policy(current, vma, addr);
 	*nodemask = NULL;	/* assume !MPOL_BIND */
-	if (pol->mode == MPOL_BIND) {
-		*nodemask = &pol->v.nodes;
-	} else if (pol->mode == MPOL_INTERLEAVE) {
-		unsigned nid;
-
-		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
-		if (unlikely(pol != &default_policy &&
-				pol != current->mempolicy))
-			__mpol_put(pol);	/* finished with pol */
-		return node_zonelist(nid, gfp_flags);
-	}
 
-	zl = zonelist_policy(GFP_HIGHUSER, pol);
-	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
-		if (pol->mode != MPOL_BIND)
-			__mpol_put(pol);	/* finished with pol */
-		else
-			*mpol = pol;	/* unref needed after allocation */
+	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+						HPAGE_SHIFT), gfp_flags);
+	} else {
+		zl = policy_zonelist(gfp_flags, *mpol);
+		if ((*mpol)->mode == MPOL_BIND)
+			*nodemask = &(*mpol)->v.nodes;
 	}
 	return zl;
 }
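The rewritten huge_zonelist() above shifts the unref decision entirely to the caller: the effective policy is always returned via @mpol, and the caller drops the conditional reference after the allocation. A hypothetical caller sketch under that contract (example_huge_alloc() and its parameters are illustrative, not part of this patch or of hugetlb):

/* Illustrative only -- shows the unconditional mpol_cond_put() pattern. */
static struct page *example_huge_alloc(struct vm_area_struct *vma,
					unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;
	struct page *page;

	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
	page = __alloc_pages_nodemask(gfp, HUGETLB_PAGE_ORDER, zl, nodemask);
	mpol_cond_put(mpol);	/* no-op unless the policy was MPOL_F_SHARED */
	return page;
}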
@@ -1526,25 +1526,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
-		if (unlikely(pol != &default_policy &&
-				pol != current->mempolicy))
-			__mpol_put(pol);	/* finished with pol */
+		mpol_cond_put(pol);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	zl = zonelist_policy(gfp, pol);
-	if (pol != &default_policy && pol != current->mempolicy) {
+	zl = policy_zonelist(gfp, pol);
+	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
-		 * slow path: ref counted policy -- shared or vma
+		 * slow path: ref counted shared policy
 		 */
 		struct page *page = __alloc_pages_nodemask(gfp, 0,
-						zl, nodemask_policy(gfp, pol));
+						zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		return page;
 	}
 	/*
 	 * fast path: default or task policy
 	 */
-	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
 }
 
 /**
@@ -1574,10 +1572,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 	cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
+
+	/*
+	 * No reference counting needed for current->mempolicy
+	 * nor system default_policy
+	 */
 	if (pol->mode == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	return __alloc_pages_nodemask(gfp, order,
-			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
+			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1605,6 +1608,28 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	return new;
 }
 
+/*
+ * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
+ * eliminate the MPOL_F_* flags that require conditional ref and
+ * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
+ * after return.  Use the returned value.
+ *
+ * Allows use of a mempolicy for, e.g., multiple allocations with a single
+ * policy lookup, even if the policy needs/has extra ref on lookup.
+ * shmem_readahead needs this.
+ */
+struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
+						struct mempolicy *frompol)
+{
+	if (!mpol_needs_cond_ref(frompol))
+		return frompol;
+
+	*tompol = *frompol;
+	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
+	__mpol_put(frompol);
+	return tompol;
+}
+
 static int mpol_match_intent(const struct mempolicy *a,
 			const struct mempolicy *b)
 {
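The new __mpol_cond_copy() lets a caller stabilize a possibly shared policy once and reuse it across several allocations without tracking the conditional reference. A hypothetical sketch of that pattern (example_readahead() is illustrative and assumed; mpol_shared_policy_lookup() is the existing lookup helper in this file):

/* Illustrative only -- on-stack copy pattern intended for shmem readahead. */
static void example_readahead(struct shared_policy *sp, unsigned long idx,
				int nr_pages)
{
	struct mempolicy mpol;		/* on-stack copy, never kfree'd */
	struct mempolicy *pol;
	int i;

	/* lookup may return a shared policy carrying an extra reference */
	pol = __mpol_cond_copy(&mpol, mpol_shared_policy_lookup(sp, idx));

	for (i = 0; i < nr_pages; i++) {
		/* ... allocate one page according to 'pol' (NULL = default) ... */
	}
	/* no mpol_cond_put() here: the copy had MPOL_F_SHARED cleared */
}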
@@ -1639,15 +1664,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	}
 }
 
-/* Slow path of a mpol destructor. */
-void __mpol_put(struct mempolicy *p)
-{
-	if (!atomic_dec_and_test(&p->refcnt))
-		return;
-	p->mode = MPOL_DEFAULT;
-	kmem_cache_free(policy_cache, p);
-}
-
 /*
  * Shared memory backing store policy support.
  *
@@ -2081,11 +2097,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	pol = get_vma_policy(priv->task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol);
-	/*
-	 * unref shared or other task's mempolicy
-	 */
-	if (pol != &default_policy && pol != current->mempolicy)
-		__mpol_put(pol);
+	mpol_cond_put(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 