author	Lee Schermerhorn <Lee.Schermerhorn@hp.com>	2007-09-19 01:46:47 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-09-19 14:24:18 -0400
commit	480eccf9ae1073b87bb4fe118971fbf134a5bc61 (patch)
tree	b66cd85cd6ad9dc7c141d34837a848111d036584 /mm
parent	28f300d23674fa01ae747c66ce861d4ee6aebe8c (diff)
Fix NUMA Memory Policy Reference Counting
This patch proposes fixes to the reference counting of memory policy in the
page allocation paths and in show_numa_map().  Extracted from my "Memory
Policy Cleanups and Enhancements" series as stand-alone.

Shared policy lookup [shmem] has always added a reference to the policy,
but this was never unrefed after page allocation or after formatting the
numa map data.

Default system policy should not require additional ref counting, nor
should the current task's task policy.  However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.

This patch adds a reference count to a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy.  Again, shared policy is already reference counted on lookup.
A matching "unref" [__mpol_free()] is performed in alloc_page_vma() for
shared and vma policies, and in show_numa_map() for shared and another
task's mempolicy.  We can call __mpol_free() directly, saving an admittedly
inexpensive inline NULL test, because we know we have a non-NULL policy.

Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zone list that might come from a shared or vma
'BIND policy.  In this case, we should hold the reference until after the
huge page allocation in dequeue_huge_page().  The patch modifies
huge_zonelist() to return a pointer to the mempolicy if it needs to be
unref'd after allocation.

Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:

		w/o patch		w/ refcount patch
		Avg	Std Devn	Avg	Std Devn
Real:		100.59	0.38		100.63	0.43
User:		1209.60	0.37		1209.91	0.31
System:		81.52	0.42		81.64	0.34

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
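[Editor's illustration] The caller-side discipline the patch establishes can be
sketched in a minimal, self-contained userspace C program.  All names here
(get_policy(), task_policy) are illustrative stand-ins, and a plain int stands
in for the kernel's atomic refcount; this is not kernel code:

	#include <stdio.h>
	#include <stdlib.h>

	/* Toy stand-in for struct mempolicy: just a reference count. */
	struct mempolicy {
		int refcnt;
	};

	static struct mempolicy default_policy = { .refcnt = 1 };	/* static, never freed */
	static struct mempolicy *task_policy;	/* models current->mempolicy */

	static void mpol_get(struct mempolicy *pol)
	{
		pol->refcnt++;
	}

	static void mpol_free(struct mempolicy *pol)	/* models __mpol_free() */
	{
		if (--pol->refcnt == 0)
			free(pol);
	}

	/*
	 * Models get_vma_policy(): return the effective policy, taking an
	 * extra reference unless it is the system default or the current
	 * task's own policy (those never go away under the caller).
	 */
	static struct mempolicy *get_policy(struct mempolicy *vma_pol)
	{
		struct mempolicy *pol = vma_pol ? vma_pol : task_policy;

		if (!pol)
			pol = &default_policy;
		else if (pol != task_policy)
			mpol_get(pol);	/* vma or other task's policy */
		return pol;
	}

	int main(void)
	{
		struct mempolicy *vma_pol = malloc(sizeof(*vma_pol));

		vma_pol->refcnt = 1;

		/* lookup takes an extra reference on the vma policy ... */
		struct mempolicy *pol = get_policy(vma_pol);

		/* ... allocate pages under pol ... */

		/* ... and the caller drops it afterward, exactly as the patch does */
		if (pol != &default_policy && pol != task_policy)
			mpol_free(pol);

		printf("vma policy refcnt after unref: %d\n", vma_pol->refcnt);	/* 1 */
		mpol_free(vma_pol);	/* final unref frees it */
		return 0;
	}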
Diffstat (limited to 'mm')
 mm/hugetlb.c    |  4
 mm/mempolicy.c  | 79
 2 files changed, 73 insertions(+), 10 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index de4cf458d6e1..84c795ee2d65 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -71,8 +71,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 {
 	int nid;
 	struct page *page = NULL;
+	struct mempolicy *mpol;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask);
+					htlb_alloc_mask, &mpol);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
@@ -87,6 +88,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 			break;
 		}
 	}
+	mpol_free(mpol);	/* unref if mpol !NULL */
 	return page;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bb54b88c3d5a..3d6ac9505d07 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1077,21 +1077,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 
 #endif
 
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma  - virtual memory area whose policy is sought
+ * @addr - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_maps() can pass
+ * @task != current].  It is the caller's responsibility to
+ * free the reference in these cases.
+ */
 static struct mempolicy * get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
+	int shared_pol = 0;
 
 	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy)
+		if (vma->vm_ops && vma->vm_ops->get_policy) {
 			pol = vma->vm_ops->get_policy(vma, addr);
-		else if (vma->vm_policy &&
+			shared_pol = 1;	/* if pol non-NULL, add ref below */
+		} else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
 		pol = &default_policy;
+	else if (!shared_pol && pol != current->mempolicy)
+		mpol_get(pol);	/* vma or other task's policy */
 	return pol;
 }
 
@@ -1207,19 +1223,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 }
 
 #ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference.  For non-'BIND referenced policies, we can/do drop the
+ * reference here, so the caller doesn't need to know about the special case
+ * for default and current task policy.
+ */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-				gfp_t gfp_flags)
+				gfp_t gfp_flags, struct mempolicy **mpol)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
+	*mpol = NULL;		/* probably no unref needed */
 	if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		__mpol_free(pol);		/* finished with pol */
 		return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
 	}
-	return zonelist_policy(GFP_HIGHUSER, pol);
+
+	zl = zonelist_policy(GFP_HIGHUSER, pol);
+	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+		if (pol->policy != MPOL_BIND)
+			__mpol_free(pol);	/* finished with pol */
+		else
+			*mpol = pol;	/* unref needed after allocation */
+	}
+	return zl;
 }
 #endif
 
@@ -1264,6 +1306,7 @@ struct page *
 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct zonelist *zl;
 
 	cpuset_update_task_memory_state();
 
@@ -1273,7 +1316,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
-	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+	zl = zonelist_policy(gfp, pol);
+	if (pol != &default_policy && pol != current->mempolicy) {
+		/*
+		 * slow path: ref counted policy -- shared or vma
+		 */
+		struct page *page = __alloc_pages(gfp, 0, zl);
+		__mpol_free(pol);
+		return page;
+	}
+	/*
+	 * fast path: default or task policy
+	 */
+	return __alloc_pages(gfp, 0, zl);
 }
 
 /**
@@ -1872,6 +1927,7 @@ int show_numa_map(struct seq_file *m, void *v)
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
+	struct mempolicy *pol;
 	int n;
 	char buffer[50];
 
@@ -1882,8 +1938,13 @@ int show_numa_map(struct seq_file *m, void *v)
 	if (!md)
 		return 0;
 
-	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(priv->task, vma, vma->vm_start));
+	pol = get_vma_policy(priv->task, vma, vma->vm_start);
+	mpol_to_str(buffer, sizeof(buffer), pol);
+	/*
+	 * unref shared or other task's mempolicy
+	 */
+	if (pol != &default_policy && pol != current->mempolicy)
+		__mpol_free(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 