-rw-r--r--   Documentation/vm/numa_memory_policy.txt |  68
-rw-r--r--   include/linux/mempolicy.h               |  35
-rw-r--r--   ipc/shm.c                               |   5
-rw-r--r--   mm/hugetlb.c                            |   2
-rw-r--r--   mm/mempolicy.c                          | 146
-rw-r--r--   mm/shmem.c                              |  16
6 files changed, 195 insertions, 77 deletions
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 27b9507a3769..6719d642653f 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -311,6 +311,74 @@ Components of Memory Policies | |||
311 | MPOL_PREFERRED policies that were created with an empty nodemask | 311 | MPOL_PREFERRED policies that were created with an empty nodemask |
312 | (local allocation). | 312 | (local allocation). |
313 | 313 | ||
314 | MEMORY POLICY REFERENCE COUNTING | ||
315 | |||
316 | To resolve use/free races, struct mempolicy contains an atomic reference | ||
317 | count field. Internal interfaces, mpol_get()/mpol_put(), increment and | ||
318 | decrement this reference count, respectively. mpol_put() will only free | ||
319 | the structure back to the mempolicy kmem cache when the reference count | ||
320 | goes to zero. | ||
321 | |||
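For reference, the get/put pair described above looks roughly like this (a sketch of the include/linux/mempolicy.h helpers of this era, not part of this hunk):

    static inline void mpol_get(struct mempolicy *pol)
    {
        if (pol)
            atomic_inc(&pol->refcnt);   /* take a reference */
    }

    static inline void mpol_put(struct mempolicy *pol)
    {
        if (pol)
            __mpol_put(pol);    /* frees pol when refcnt reaches zero */
    }
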
322 | When a new memory policy is allocated, its reference count is initialized | ||
323 | to '1', representing the reference held by the task that is installing the | ||
324 | new policy. When a pointer to a memory policy structure is stored in another | ||
325 | structure, another reference is added, as the task's reference will be dropped | ||
326 | on completion of the policy installation. | ||
327 | |||
328 | During run-time "usage" of the policy, we attempt to minimize atomic operations | ||
329 | on the reference count, as this can lead to cache lines bouncing between cpus | ||
330 | and NUMA nodes. "Usage" here means one of the following: | ||
331 | |||
332 | 1) querying of the policy, either by the task itself [using the get_mempolicy() | ||
333 | API discussed below] or by another task using the /proc/<pid>/numa_maps | ||
334 | interface. | ||
335 | |||
336 | 2) examination of the policy to determine the policy mode and associated node | ||
337 | or node lists, if any, for page allocation. This is considered a "hot | ||
338 | path". Note that for MPOL_BIND, the "usage" extends across the entire | ||
339 | allocation process, which may sleep during page reclamation, because the | ||
340 | BIND policy nodemask is used, by reference, to filter ineligible nodes; see the sketch below. | ||
341 | |||
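A rough sketch of the MPOL_BIND hot path referred to in item 2, using the policy_zonelist()/policy_nodemask() helpers introduced later in this patch; the nodemask is handed to the page allocator by reference, so the policy must stay valid for the whole allocation:

    struct zonelist *zl = policy_zonelist(gfp, pol);
    nodemask_t *nm = policy_nodemask(gfp, pol); /* &pol->v.nodes for MPOL_BIND */
    struct page *page;

    page = __alloc_pages_nodemask(gfp, order, zl, nm);
    /* pol and its nodemask must not be freed before this call returns */
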
342 | We can avoid taking an extra reference during the usages listed above as | ||
343 | follows: | ||
344 | |||
345 | 1) we never need to get/free the system default policy as this is never | ||
346 | changed nor freed, once the system is up and running. | ||
347 | |||
348 | 2) for querying the policy, we do not need to take an extra reference on the | ||
349 | target task's task policy nor vma policies because we always acquire the | ||
350 | task's mm's mmap_sem for read during the query. The set_mempolicy() and | ||
351 | mbind() APIs [see below] always acquire the mmap_sem for write when | ||
352 | installing or replacing task or vma policies. Thus, there is no possibility | ||
353 | of a task or thread freeing a policy while another task or thread is | ||
354 | querying it. | ||
355 | |||
356 | 3) Page allocation usage of task or vma policy occurs in the fault path where | ||
357 | we hold the mmap_sem for read. Again, because replacing the task or vma | ||
358 | policy requires that the mmap_sem be held for write, the policy can't be | ||
359 | freed out from under us while we're using it for page allocation. | ||
360 | |||
361 | 4) Shared policies require special consideration. One task can replace a | ||
362 | shared memory policy while another task, with a distinct mmap_sem, is | ||
363 | querying or allocating a page based on the policy. To resolve this | ||
364 | potential race, the shared policy infrastructure adds an extra reference | ||
365 | to the shared policy during lookup while holding a spin lock on the shared | ||
366 | policy management structure. This requires that we drop this extra | ||
367 | reference when we're finished "using" the policy. We must drop the | ||
368 | extra reference on shared policies in the same query/allocation paths | ||
369 | used for non-shared policies. For this reason, shared policies are marked | ||
370 | as such, and the extra reference is dropped "conditionally"--i.e., only | ||
371 | for shared policies. | ||
372 | |||
373 | Because of this extra reference counting, and because we must look up | ||
374 | shared policies in a tree structure under spinlock, shared policies are | ||
375 | more expensive to use in the page allocation path. This is especially | ||
376 | true for shared policies on shared memory regions shared by tasks running | ||
377 | on different NUMA nodes. This extra overhead can be avoided by always | ||
378 | falling back to task or system default policy for shared memory regions, | ||
379 | or by prefaulting the entire shared memory region into memory and locking | ||
380 | it down. However, this might not be appropriate for all applications. | ||
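The shared policy lookup mentioned in item 4 is not part of this diff; for context, it looks approximately like the mm/mempolicy.c helper below, which takes the "extra" reference under the shared policy's spinlock (sp_lookup() is the red-black tree range lookup):

    struct mempolicy *
    mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
    {
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
            return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx + 1);
        if (sn) {
            mpol_get(sn->policy);   /* the extra ref, dropped via mpol_cond_put() */
            pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
    }
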
381 | |||
314 | MEMORY POLICY APIs | 382 | MEMORY POLICY APIs |
315 | 383 | ||
316 | Linux supports 3 system calls for controlling memory policy. These APIS | 384 | Linux supports 3 system calls for controlling memory policy. These APIS |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 017def89e568..172b9c6acb91 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -112,6 +112,31 @@ static inline void mpol_put(struct mempolicy *pol) | |||
112 | __mpol_put(pol); | 112 | __mpol_put(pol); |
113 | } | 113 | } |
114 | 114 | ||
115 | /* | ||
116 | * Does mempolicy pol need explicit unref after use? | ||
117 | * Currently only needed for shared policies. | ||
118 | */ | ||
119 | static inline int mpol_needs_cond_ref(struct mempolicy *pol) | ||
120 | { | ||
121 | return (pol && (pol->flags & MPOL_F_SHARED)); | ||
122 | } | ||
123 | |||
124 | static inline void mpol_cond_put(struct mempolicy *pol) | ||
125 | { | ||
126 | if (mpol_needs_cond_ref(pol)) | ||
127 | __mpol_put(pol); | ||
128 | } | ||
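Typical usage after this patch: look the policy up, use it, then drop the extra reference only if one was taken (a minimal sketch):

    struct mempolicy *pol = get_vma_policy(current, vma, addr);

    /* ... allocate a page or report the policy ... */

    mpol_cond_put(pol); /* no-op unless pol has MPOL_F_SHARED set */
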
129 | |||
130 | extern struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | ||
131 | struct mempolicy *frompol); | ||
132 | static inline struct mempolicy *mpol_cond_copy(struct mempolicy *tompol, | ||
133 | struct mempolicy *frompol) | ||
134 | { | ||
135 | if (!frompol) | ||
136 | return frompol; | ||
137 | return __mpol_cond_copy(tompol, frompol); | ||
138 | } | ||
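mpol_cond_copy() lets a caller stabilize a possibly-shared policy into an on-stack copy and drop the extra reference up front, so the copy can be used for several allocations without further ref counting; this mirrors the mm/shmem.c hunk later in this patch:

    struct mempolicy mpol, *spol;

    spol = mpol_cond_copy(&mpol,
            mpol_shared_policy_lookup(&info->policy, idx));
    /* spol needs no mpol_cond_put(); any shared ref was already dropped */
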
139 | |||
115 | extern struct mempolicy *__mpol_dup(struct mempolicy *pol); | 140 | extern struct mempolicy *__mpol_dup(struct mempolicy *pol); |
116 | static inline struct mempolicy *mpol_dup(struct mempolicy *pol) | 141 | static inline struct mempolicy *mpol_dup(struct mempolicy *pol) |
117 | { | 142 | { |
@@ -201,6 +226,16 @@ static inline void mpol_put(struct mempolicy *p) | |||
201 | { | 226 | { |
202 | } | 227 | } |
203 | 228 | ||
229 | static inline void mpol_cond_put(struct mempolicy *pol) | ||
230 | { | ||
231 | } | ||
232 | |||
233 | static inline struct mempolicy *mpol_cond_copy(struct mempolicy *to, | ||
234 | struct mempolicy *from) | ||
235 | { | ||
236 | return from; | ||
237 | } | ||
238 | |||
204 | static inline void mpol_get(struct mempolicy *pol) | 239 | static inline void mpol_get(struct mempolicy *pol) |
205 | { | 240 | { |
206 | } | 241 | } |
diff --git a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -271,10 +271,9 @@ static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
271 | 271 | ||
272 | if (sfd->vm_ops->get_policy) | 272 | if (sfd->vm_ops->get_policy) |
273 | pol = sfd->vm_ops->get_policy(vma, addr); | 273 | pol = sfd->vm_ops->get_policy(vma, addr); |
274 | else if (vma->vm_policy) { | 274 | else if (vma->vm_policy) |
275 | pol = vma->vm_policy; | 275 | pol = vma->vm_policy; |
276 | mpol_get(pol); /* get_vma_policy() expects this */ | 276 | |
277 | } | ||
278 | return pol; | 277 | return pol; |
279 | } | 278 | } |
280 | #endif | 279 | #endif |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 53afa8c76ada..d36e1f11a5f2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -116,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
116 | break; | 116 | break; |
117 | } | 117 | } |
118 | } | 118 | } |
119 | mpol_put(mpol); /* unref if mpol !NULL */ | 119 | mpol_cond_put(mpol); |
120 | return page; | 120 | return page; |
121 | } | 121 | } |
122 | 122 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78b18a60b9b2..a237295f8190 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -241,6 +241,15 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
241 | return policy; | 241 | return policy; |
242 | } | 242 | } |
243 | 243 | ||
244 | /* Slow path of a mpol destructor. */ | ||
245 | void __mpol_put(struct mempolicy *p) | ||
246 | { | ||
247 | if (!atomic_dec_and_test(&p->refcnt)) | ||
248 | return; | ||
249 | p->mode = MPOL_DEFAULT; | ||
250 | kmem_cache_free(policy_cache, p); | ||
251 | } | ||
252 | |||
244 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) | 253 | static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) |
245 | { | 254 | { |
246 | } | 255 | } |
@@ -719,6 +728,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
719 | get_zonemask(pol, nmask); | 728 | get_zonemask(pol, nmask); |
720 | 729 | ||
721 | out: | 730 | out: |
731 | mpol_cond_put(pol); | ||
722 | if (vma) | 732 | if (vma) |
723 | up_read(¤t->mm->mmap_sem); | 733 | up_read(¤t->mm->mmap_sem); |
724 | return err; | 734 | return err; |
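For context, the mpol_cond_put() added at the out: label covers the MPOL_F_ADDR case, where the get_policy() vm_op may have returned a shared policy with an extra reference. A condensed sketch of do_get_mempolicy(), with error handling and the copy-out elided:

    struct vm_area_struct *vma = NULL;
    struct mempolicy *pol = current->mempolicy;

    if (flags & MPOL_F_ADDR) {
        down_read(&mm->mmap_sem);
        vma = find_vma_intersection(mm, addr, addr + 1);
        if (vma->vm_ops && vma->vm_ops->get_policy)
            pol = vma->vm_ops->get_policy(vma, addr); /* shared: extra ref */
        else
            pol = vma->vm_policy;
    }
    /* ... report the policy mode and nodemask ... */
 out:
    mpol_cond_put(pol); /* drops the extra ref only for shared policies */
    if (vma)
        up_read(&current->mm->mmap_sem);
    return err;
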
@@ -1257,16 +1267,18 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1257 | * | 1267 | * |
1258 | * Returns effective policy for a VMA at specified address. | 1268 | * Returns effective policy for a VMA at specified address. |
1259 | * Falls back to @task or system default policy, as necessary. | 1269 | * Falls back to @task or system default policy, as necessary. |
1260 | * Returned policy has extra reference count if shared, vma, | 1270 | * Current or other task's task mempolicy and non-shared vma policies |
1261 | * or some other task's policy [show_numa_maps() can pass | 1271 | * are protected by the task's mmap_sem, which must be held for read by |
1262 | * @task != current]. It is the caller's responsibility to | 1272 | * the caller. |
1263 | * free the reference in these cases. | 1273 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference |
1274 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1275 | * freeing by another task. It is the caller's responsibility to free the | ||
1276 | * extra reference for shared policies. | ||
1264 | */ | 1277 | */ |
1265 | static struct mempolicy *get_vma_policy(struct task_struct *task, | 1278 | static struct mempolicy *get_vma_policy(struct task_struct *task, |
1266 | struct vm_area_struct *vma, unsigned long addr) | 1279 | struct vm_area_struct *vma, unsigned long addr) |
1267 | { | 1280 | { |
1268 | struct mempolicy *pol = task->mempolicy; | 1281 | struct mempolicy *pol = task->mempolicy; |
1269 | int shared_pol = 0; | ||
1270 | 1282 | ||
1271 | if (vma) { | 1283 | if (vma) { |
1272 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1284 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1274,20 +1286,20 @@ static struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1274 | addr); | 1286 | addr); |
1275 | if (vpol) | 1287 | if (vpol) |
1276 | pol = vpol; | 1288 | pol = vpol; |
1277 | shared_pol = 1; /* if pol non-NULL, add ref below */ | ||
1278 | } else if (vma->vm_policy && | 1289 | } else if (vma->vm_policy && |
1279 | vma->vm_policy->mode != MPOL_DEFAULT) | 1290 | vma->vm_policy->mode != MPOL_DEFAULT) |
1280 | pol = vma->vm_policy; | 1291 | pol = vma->vm_policy; |
1281 | } | 1292 | } |
1282 | if (!pol) | 1293 | if (!pol) |
1283 | pol = &default_policy; | 1294 | pol = &default_policy; |
1284 | else if (!shared_pol && pol != current->mempolicy) | ||
1285 | mpol_get(pol); /* vma or other task's policy */ | ||
1286 | return pol; | 1295 | return pol; |
1287 | } | 1296 | } |
1288 | 1297 | ||
1289 | /* Return a nodemask representing a mempolicy */ | 1298 | /* |
1290 | static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) | 1299 | * Return a nodemask representing a mempolicy for filtering nodes for |
1300 | * page allocation | ||
1301 | */ | ||
1302 | static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | ||
1291 | { | 1303 | { |
1292 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ | 1304 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
1293 | if (unlikely(policy->mode == MPOL_BIND) && | 1305 | if (unlikely(policy->mode == MPOL_BIND) && |
@@ -1298,8 +1310,8 @@ static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) | |||
1298 | return NULL; | 1310 | return NULL; |
1299 | } | 1311 | } |
1300 | 1312 | ||
1301 | /* Return a zonelist representing a mempolicy */ | 1313 | /* Return a zonelist indicated by gfp for node representing a mempolicy */ |
1302 | static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) | 1314 | static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) |
1303 | { | 1315 | { |
1304 | int nd; | 1316 | int nd; |
1305 | 1317 | ||
@@ -1311,10 +1323,10 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) | |||
1311 | break; | 1323 | break; |
1312 | case MPOL_BIND: | 1324 | case MPOL_BIND: |
1313 | /* | 1325 | /* |
1314 | * Normally, MPOL_BIND allocations node-local are node-local | 1326 | * Normally, MPOL_BIND allocations are node-local within the |
1315 | * within the allowed nodemask. However, if __GFP_THISNODE is | 1327 | * allowed nodemask. However, if __GFP_THISNODE is set and the |
1316 | * set and the current node is part of the mask, we use the | 1328 | * current node is part of the mask, we use the zonelist for |
1317 | * the zonelist for the first node in the mask instead. | 1329 | * the first node in the mask instead. |
1318 | */ | 1330 | */ |
1319 | nd = numa_node_id(); | 1331 | nd = numa_node_id(); |
1320 | if (unlikely(gfp & __GFP_THISNODE) && | 1332 | if (unlikely(gfp & __GFP_THISNODE) && |
@@ -1350,6 +1362,10 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1350 | /* | 1362 | /* |
1351 | * Depending on the memory policy provide a node from which to allocate the | 1363 | * Depending on the memory policy provide a node from which to allocate the |
1352 | * next slab entry. | 1364 | * next slab entry. |
1365 | * @policy must be protected from freeing by the caller. If @policy is | ||
1366 | * the current task's mempolicy, this protection is implicit, as only the | ||
1367 | * task can change its policy. The system default policy requires no | ||
1368 | * such protection. | ||
1353 | */ | 1369 | */ |
1354 | unsigned slab_node(struct mempolicy *policy) | 1370 | unsigned slab_node(struct mempolicy *policy) |
1355 | { | 1371 | { |
@@ -1435,43 +1451,27 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1435 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy | 1451 | * @mpol = pointer to mempolicy pointer for reference counted mempolicy |
1436 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask | 1452 | * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask |
1437 | * | 1453 | * |
1438 | * Returns a zonelist suitable for a huge page allocation. | 1454 | * Returns a zonelist suitable for a huge page allocation and a pointer |
1439 | * If the effective policy is 'BIND, returns pointer to local node's zonelist, | 1455 | * to the struct mempolicy for conditional unref after allocation. |
1440 | * and a pointer to the mempolicy's @nodemask for filtering the zonelist. | 1456 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's |
1441 | * If it is also a policy for which get_vma_policy() returns an extra | 1457 | * @nodemask for filtering the zonelist. |
1442 | * reference, we must hold that reference until after the allocation. | ||
1443 | * In that case, return policy via @mpol so hugetlb allocation can drop | ||
1444 | * the reference. For non-'BIND referenced policies, we can/do drop the | ||
1445 | * reference here, so the caller doesn't need to know about the special case | ||
1446 | * for default and current task policy. | ||
1447 | */ | 1458 | */ |
1448 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | 1459 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
1449 | gfp_t gfp_flags, struct mempolicy **mpol, | 1460 | gfp_t gfp_flags, struct mempolicy **mpol, |
1450 | nodemask_t **nodemask) | 1461 | nodemask_t **nodemask) |
1451 | { | 1462 | { |
1452 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
1453 | struct zonelist *zl; | 1463 | struct zonelist *zl; |
1454 | 1464 | ||
1455 | *mpol = NULL; /* probably no unref needed */ | 1465 | *mpol = get_vma_policy(current, vma, addr); |
1456 | *nodemask = NULL; /* assume !MPOL_BIND */ | 1466 | *nodemask = NULL; /* assume !MPOL_BIND */ |
1457 | if (pol->mode == MPOL_BIND) { | ||
1458 | *nodemask = &pol->v.nodes; | ||
1459 | } else if (pol->mode == MPOL_INTERLEAVE) { | ||
1460 | unsigned nid; | ||
1461 | |||
1462 | nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | ||
1463 | if (unlikely(pol != &default_policy && | ||
1464 | pol != current->mempolicy)) | ||
1465 | __mpol_put(pol); /* finished with pol */ | ||
1466 | return node_zonelist(nid, gfp_flags); | ||
1467 | } | ||
1468 | 1467 | ||
1469 | zl = zonelist_policy(GFP_HIGHUSER, pol); | 1468 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
1470 | if (unlikely(pol != &default_policy && pol != current->mempolicy)) { | 1469 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, |
1471 | if (pol->mode != MPOL_BIND) | 1470 | HPAGE_SHIFT), gfp_flags); |
1472 | __mpol_put(pol); /* finished with pol */ | 1471 | } else { |
1473 | else | 1472 | zl = policy_zonelist(gfp_flags, *mpol); |
1474 | *mpol = pol; /* unref needed after allocation */ | 1473 | if ((*mpol)->mode == MPOL_BIND) |
1474 | *nodemask = &(*mpol)->v.nodes; | ||
1475 | } | 1475 | } |
1476 | return zl; | 1476 | return zl; |
1477 | } | 1477 | } |
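The hugetlb caller (see the mm/hugetlb.c hunk above) pairs this roughly as follows; htlb_alloc_mask and the zonelist walk belong to mm/hugetlb.c of this era and are not shown in this diff:

    struct mempolicy *mpol;
    nodemask_t *nodemask;
    struct zonelist *zonelist = huge_zonelist(vma, address,
                    htlb_alloc_mask, &mpol, &nodemask);

    /* ... walk zonelist for a free huge page, skipping nodes not in nodemask ... */

    mpol_cond_put(mpol);    /* drop the shared-policy reference, if any */
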
@@ -1526,25 +1526,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
1526 | unsigned nid; | 1526 | unsigned nid; |
1527 | 1527 | ||
1528 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | 1528 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
1529 | if (unlikely(pol != &default_policy && | 1529 | mpol_cond_put(pol); |
1530 | pol != current->mempolicy)) | ||
1531 | __mpol_put(pol); /* finished with pol */ | ||
1532 | return alloc_page_interleave(gfp, 0, nid); | 1530 | return alloc_page_interleave(gfp, 0, nid); |
1533 | } | 1531 | } |
1534 | zl = zonelist_policy(gfp, pol); | 1532 | zl = policy_zonelist(gfp, pol); |
1535 | if (pol != &default_policy && pol != current->mempolicy) { | 1533 | if (unlikely(mpol_needs_cond_ref(pol))) { |
1536 | /* | 1534 | /* |
1537 | * slow path: ref counted policy -- shared or vma | 1535 | * slow path: ref counted shared policy |
1538 | */ | 1536 | */ |
1539 | struct page *page = __alloc_pages_nodemask(gfp, 0, | 1537 | struct page *page = __alloc_pages_nodemask(gfp, 0, |
1540 | zl, nodemask_policy(gfp, pol)); | 1538 | zl, policy_nodemask(gfp, pol)); |
1541 | __mpol_put(pol); | 1539 | __mpol_put(pol); |
1542 | return page; | 1540 | return page; |
1543 | } | 1541 | } |
1544 | /* | 1542 | /* |
1545 | * fast path: default or task policy | 1543 | * fast path: default or task policy |
1546 | */ | 1544 | */ |
1547 | return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); | 1545 | return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); |
1548 | } | 1546 | } |
1549 | 1547 | ||
1550 | /** | 1548 | /** |
@@ -1574,10 +1572,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1574 | cpuset_update_task_memory_state(); | 1572 | cpuset_update_task_memory_state(); |
1575 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1573 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1576 | pol = &default_policy; | 1574 | pol = &default_policy; |
1575 | |||
1576 | /* | ||
1577 | * No reference counting needed for current->mempolicy | ||
1578 | * nor system default_policy | ||
1579 | */ | ||
1577 | if (pol->mode == MPOL_INTERLEAVE) | 1580 | if (pol->mode == MPOL_INTERLEAVE) |
1578 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); | 1581 | return alloc_page_interleave(gfp, order, interleave_nodes(pol)); |
1579 | return __alloc_pages_nodemask(gfp, order, | 1582 | return __alloc_pages_nodemask(gfp, order, |
1580 | zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); | 1583 | policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); |
1581 | } | 1584 | } |
1582 | EXPORT_SYMBOL(alloc_pages_current); | 1585 | EXPORT_SYMBOL(alloc_pages_current); |
1583 | 1586 | ||
@@ -1605,6 +1608,28 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
1605 | return new; | 1608 | return new; |
1606 | } | 1609 | } |
1607 | 1610 | ||
1611 | /* | ||
1612 | * If *frompol needs [has] an extra ref, copy *frompol to *tompol, | ||
1613 | * eliminate the MPOL_F_* flags that require a conditional ref and | ||
1614 | * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly | ||
1615 | * after return. Use the returned value. | ||
1616 | * | ||
1617 | * Allows use of a mempolicy for, e.g., multiple allocations with a single | ||
1618 | * policy lookup, even if the policy needs/has extra ref on lookup. | ||
1619 | * shmem_readahead needs this. | ||
1620 | */ | ||
1621 | struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | ||
1622 | struct mempolicy *frompol) | ||
1623 | { | ||
1624 | if (!mpol_needs_cond_ref(frompol)) | ||
1625 | return frompol; | ||
1626 | |||
1627 | *tompol = *frompol; | ||
1628 | tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ | ||
1629 | __mpol_put(frompol); | ||
1630 | return tompol; | ||
1631 | } | ||
1632 | |||
1608 | static int mpol_match_intent(const struct mempolicy *a, | 1633 | static int mpol_match_intent(const struct mempolicy *a, |
1609 | const struct mempolicy *b) | 1634 | const struct mempolicy *b) |
1610 | { | 1635 | { |
@@ -1639,15 +1664,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
1639 | } | 1664 | } |
1640 | } | 1665 | } |
1641 | 1666 | ||
1642 | /* Slow path of a mpol destructor. */ | ||
1643 | void __mpol_put(struct mempolicy *p) | ||
1644 | { | ||
1645 | if (!atomic_dec_and_test(&p->refcnt)) | ||
1646 | return; | ||
1647 | p->mode = MPOL_DEFAULT; | ||
1648 | kmem_cache_free(policy_cache, p); | ||
1649 | } | ||
1650 | |||
1651 | /* | 1667 | /* |
1652 | * Shared memory backing store policy support. | 1668 | * Shared memory backing store policy support. |
1653 | * | 1669 | * |
@@ -2081,11 +2097,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
2081 | 2097 | ||
2082 | pol = get_vma_policy(priv->task, vma, vma->vm_start); | 2098 | pol = get_vma_policy(priv->task, vma, vma->vm_start); |
2083 | mpol_to_str(buffer, sizeof(buffer), pol); | 2099 | mpol_to_str(buffer, sizeof(buffer), pol); |
2084 | /* | 2100 | mpol_cond_put(pol); |
2085 | * unref shared or other task's mempolicy | ||
2086 | */ | ||
2087 | if (pol != &default_policy && pol != current->mempolicy) | ||
2088 | __mpol_put(pol); | ||
2089 | 2101 | ||
2090 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 2102 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
2091 | 2103 | ||
diff --git a/mm/shmem.c b/mm/shmem.c
index 5326876d814d..0b591c669b2d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1187,16 +1187,19 @@ static void shmem_show_mpol(struct seq_file *seq, unsigned short policy, | |||
1187 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 1187 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, |
1188 | struct shmem_inode_info *info, unsigned long idx) | 1188 | struct shmem_inode_info *info, unsigned long idx) |
1189 | { | 1189 | { |
1190 | struct mempolicy mpol, *spol; | ||
1190 | struct vm_area_struct pvma; | 1191 | struct vm_area_struct pvma; |
1191 | struct page *page; | 1192 | struct page *page; |
1192 | 1193 | ||
1194 | spol = mpol_cond_copy(&mpol, | ||
1195 | mpol_shared_policy_lookup(&info->policy, idx)); | ||
1196 | |||
1193 | /* Create a pseudo vma that just contains the policy */ | 1197 | /* Create a pseudo vma that just contains the policy */ |
1194 | pvma.vm_start = 0; | 1198 | pvma.vm_start = 0; |
1195 | pvma.vm_pgoff = idx; | 1199 | pvma.vm_pgoff = idx; |
1196 | pvma.vm_ops = NULL; | 1200 | pvma.vm_ops = NULL; |
1197 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 1201 | pvma.vm_policy = spol; |
1198 | page = swapin_readahead(entry, gfp, &pvma, 0); | 1202 | page = swapin_readahead(entry, gfp, &pvma, 0); |
1199 | mpol_put(pvma.vm_policy); | ||
1200 | return page; | 1203 | return page; |
1201 | } | 1204 | } |
1202 | 1205 | ||
@@ -1204,16 +1207,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1204 | struct shmem_inode_info *info, unsigned long idx) | 1207 | struct shmem_inode_info *info, unsigned long idx) |
1205 | { | 1208 | { |
1206 | struct vm_area_struct pvma; | 1209 | struct vm_area_struct pvma; |
1207 | struct page *page; | ||
1208 | 1210 | ||
1209 | /* Create a pseudo vma that just contains the policy */ | 1211 | /* Create a pseudo vma that just contains the policy */ |
1210 | pvma.vm_start = 0; | 1212 | pvma.vm_start = 0; |
1211 | pvma.vm_pgoff = idx; | 1213 | pvma.vm_pgoff = idx; |
1212 | pvma.vm_ops = NULL; | 1214 | pvma.vm_ops = NULL; |
1213 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 1215 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); |
1214 | page = alloc_page_vma(gfp, &pvma, 0); | 1216 | |
1215 | mpol_put(pvma.vm_policy); | 1217 | /* |
1216 | return page; | 1218 | * alloc_page_vma() will drop the shared policy reference |
1219 | */ | ||
1220 | return alloc_page_vma(gfp, &pvma, 0); | ||
1217 | } | 1221 | } |
1218 | #else /* !CONFIG_NUMA */ | 1222 | #else /* !CONFIG_NUMA */ |
1219 | #ifdef CONFIG_TMPFS | 1223 | #ifdef CONFIG_TMPFS |