Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  393
1 file changed, 212 insertions(+), 181 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d5c64df1653..2076b1542b8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
  *                offset into the backing object or offset into the mapping
  *                for anonymous memory. For process policy an process counter
  *                is used.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
+ *                FIXME: memory is allocated starting with the first node
+ *                to the last. It would be better if bind would truly restrict
+ *                the allocation to memory nodes instead
+ *
  * preferred      Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *                in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
 	.policy = MPOL_DEFAULT,
 };
 
-/* Check if all specified nodes are online */
-static int nodes_online(unsigned long *nodes)
-{
-	DECLARE_BITMAP(online2, MAX_NUMNODES);
-
-	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
-	if (bitmap_empty(online2, MAX_NUMNODES))
-		set_bit(0, online2);
-	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
-		return -EINVAL;
-	return 0;
-}
-
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, unsigned long *nodes)
+static int mpol_check_policy(int mode, nodemask_t *nodes)
 {
-	int empty = bitmap_empty(nodes, MAX_NUMNODES);
+	int empty = nodes_empty(*nodes);
 
 	switch (mode) {
 	case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
 		return -EINVAL;
 		break;
 	}
-	return nodes_online(nodes);
-}
-
-/* Copy a node mask from user space. */
-static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
-		     unsigned long maxnode, int mode)
-{
-	unsigned long k;
-	unsigned long nlongs;
-	unsigned long endmask;
-
-	--maxnode;
-	bitmap_zero(nodes, MAX_NUMNODES);
-	if (maxnode == 0 || !nmask)
-		return 0;
-
-	nlongs = BITS_TO_LONGS(maxnode);
-	if ((maxnode % BITS_PER_LONG) == 0)
-		endmask = ~0UL;
-	else
-		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-	/* When the user specified more nodes than supported just check
-	   if the non supported part is all zero. */
-	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-		if (nlongs > PAGE_SIZE/sizeof(long))
-			return -EINVAL;
-		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-			unsigned long t;
-			if (get_user(t, nmask + k))
-				return -EFAULT;
-			if (k == nlongs - 1) {
-				if (t & endmask)
-					return -EINVAL;
-			} else if (t)
-				return -EINVAL;
-		}
-		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-		endmask = ~0UL;
-	}
-
-	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
-		return -EFAULT;
-	nodes[nlongs-1] &= endmask;
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes);
-	return mpol_check_policy(mode, nodes);
+	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-
 /* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(unsigned long *nodes)
+static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
 	int num, max, nd;
 
-	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for (nd = find_first_bit(nodes, MAX_NUMNODES);
-	     nd < MAX_NUMNODES;
-	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+	for_each_node_mask(nd, *nodes) {
 		int k;
 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
 				policy_zone = k;
 		}
 	}
-	BUG_ON(num >= max);
 	zl->zones[num] = NULL;
 	return zl;
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
+		policy->v.nodes = *nodes;
 		break;
 	case MPOL_PREFERRED:
-		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+		policy->v.preferred_node = first_node(*nodes);
 		if (policy->v.preferred_node >= MAX_NUMNODES)
 			policy->v.preferred_node = -1;
 		break;
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	spin_lock(&mm->page_table_lock);
-	orig_pte = pte = pte_offset_map(pmd, addr);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		unsigned long pfn;
 		unsigned int nid;
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		if (!pte_present(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (!pfn_valid(pfn)) {
+			print_bad_pte(vma, *pte, addr);
 			continue;
+		}
 		nid = pfn_to_nid(pfn);
-		if (!test_bit(nid, nodes))
+		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(orig_pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(mm, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(mm, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pgd_range(struct mm_struct *mm,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pgd_range(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(mm, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    unsigned long *nodes, unsigned long flags)
+	    nodemask_t *nodes, unsigned long flags)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
+	if (first->vm_flags & VM_RESERVED)
+		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 			endvma = end;
 		if (vma->vm_start > start)
 			start = vma->vm_start;
-		err = check_pgd_range(vma->vm_mm,
-				start, endvma, nodes);
+		err = check_pgd_range(vma, start, endvma, nodes);
 		if (err) {
 			first = ERR_PTR(err);
 			break;
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 	return err;
 }
 
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-			unsigned long mode,
-			unsigned long __user *nmask, unsigned long maxnode,
-			unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+	if (!nodes)
+		return 0;
+
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes->bits);
+	return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
-	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 	int err;
 
 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-
-	err = get_nodes(nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
-
-	new = mpol_new(mode, nodes);
+	if (mpol_check_policy(mode, nmask))
+		return -EINVAL;
+	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-		 mode,nodes[0]);
+		 mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nodes, flags);
+	vma = check_range(mm, start, end, nmask, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-		unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
-	int err;
 	struct mempolicy *new;
-	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (mode < 0 || mode > MPOL_MAX)
+	if (contextualize_policy(mode, nodes))
 		return -EINVAL;
-	err = get_nodes(nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
 	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	if (new && new->policy == MPOL_INTERLEAVE)
-		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+		current->il_next = first_node(new->v.nodes);
 	return 0;
 }
 
 /* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
 	int i;
 
-	bitmap_zero(nodes, MAX_NUMNODES);
+	nodes_clear(*nodes);
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+				*nodes);
 		break;
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+		*nodes = p->v.nodes;
 		break;
 	case MPOL_PREFERRED:
 		/* or use current node instead of online map? */
 		if (p->v.preferred_node < 0)
-			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+			*nodes = node_online_map;
 		else
-			__set_bit(p->v.preferred_node, nodes);
+			node_set(p->v.preferred_node, *nodes);
 		break;
 	default:
 		BUG();
@@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
 	return err;
 }
 
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-			      void *nodes, unsigned nbytes)
-{
-	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-
-	if (copy > nbytes) {
-		if (copy > PAGE_SIZE)
-			return -EINVAL;
-		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-			return -EFAULT;
-		copy = nbytes;
-	}
-	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
-}
-
 /* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-			unsigned long __user *nmask,
-			unsigned long maxnode,
-			unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+			unsigned long addr, unsigned long flags)
 {
-	int err, pval;
+	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
-		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
-			pval = err;
+			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
-			pval = current->il_next;
+			*policy = current->il_next;
 		} else {
 			err = -EINVAL;
 			goto out;
 		}
 	} else
-		pval = pol->policy;
+		*policy = pol->policy;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
 		vma = NULL;
 	}
 
-	if (policy && put_user(pval, policy))
-		return -EFAULT;
-
 	err = 0;
-	if (nmask) {
-		DECLARE_BITMAP(nodes, MAX_NUMNODES);
-		get_zonemask(pol, nodes);
-		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
-	}
+	if (nmask)
+		get_zonemask(pol, nmask);
 
  out:
 	if (vma)
@@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 	return err;
 }
 
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	nodes_clear(*nodes);
+	if (maxnode == 0 || !nmask)
+		return 0;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		if (nlongs > PAGE_SIZE/sizeof(long))
+			return -EINVAL;
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+			if (get_user(t, nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	nodes_addr(*nodes)[nlongs-1] &= endmask;
+	return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      nodemask_t *nodes)
+{
+	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+	if (copy > nbytes) {
+		if (copy > PAGE_SIZE)
+			return -EINVAL;
+		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+			return -EFAULT;
+		copy = nbytes;
+	}
+	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			unsigned long mode,
+			unsigned long __user *nmask, unsigned long maxnode,
+			unsigned flags)
+{
+	nodemask_t nodes;
+	int err;
+
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+		unsigned long maxnode)
+{
+	int err;
+	nodemask_t nodes;
+
+	if (mode < 0 || mode > MPOL_MAX)
+		return -EINVAL;
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+		unsigned long __user *nmask,
+		unsigned long maxnode,
+		unsigned long addr, unsigned long flags)
+{
+	int err, pval;
+	nodemask_t nodes;
+
+	if (nmask != NULL && maxnode < MAX_NUMNODES)
+		return -EINVAL;
+
+	err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+	if (err)
+		return err;
+
+	if (policy && put_user(pval, policy))
+		return -EFAULT;
+
+	if (nmask)
+		err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+	return err;
+}
+
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 	long err = 0;
 	unsigned long __user *nm = NULL;
 	unsigned long nr_bits, alloc_size;
-	DECLARE_BITMAP(bm, MAX_NUMNODES);
+	nodemask_t bm;
 
 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask) {
-		err = compat_get_bitmap(bm, nmask, nr_bits);
+		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 		nm = compat_alloc_user_space(alloc_size);
-		err |= copy_to_user(nm, bm, alloc_size);
+		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 	}
 
 	if (err)
@@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
-			pol = vma->vm_ops->get_policy(vma, addr);
+			pol = vma->vm_ops->get_policy(vma, addr);
 		else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
@@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	BUG_ON(nid >= MAX_NUMNODES);
-	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
-		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+		next = first_node(policy->v.nodes);
 	me->il_next = next;
 	return nid;
 }
@@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
 {
-	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target = (unsigned)off % nnodes;
 	int c;
 	int nid = -1;
 
 	c = 0;
 	do {
-		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+		nid = next_node(nid, pol->v.nodes);
 		c++;
 	} while (c <= target);
-	BUG_ON(nid >= MAX_NUMNODES);
-	BUG_ON(!test_bit(nid, pol->v.nodes));
 	return nid;
 }
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+					unsigned nid)
 {
 	struct zonelist *zl;
 	struct page *page;
 
-	BUG_ON(!node_online(nid));
 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
@@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 		if (vma) {
 			unsigned long off;
-			BUG_ON(addr >= vma->vm_end);
-			BUG_ON(addr < vma->vm_start);
 			off = vma->vm_pgoff;
 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
 			nid = offset_il_node(pol, vma, off);
@@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_DEFAULT:
 		return 1;
 	case MPOL_INTERLEAVE:
-		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
 	case MPOL_BIND: {
@@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
-		 npol ? npol->v.nodes[0] : -1);
+		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1197,12 @@ void __init numa_policy_init(void)
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-			MAX_NUMNODES) < 0)
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+	do_set_mempolicy(MPOL_DEFAULT, NULL);
 }
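
Editor's note, not part of the patch: the diff above splits each NUMA-policy syscall into a thin sys_* wrapper that parses the user-supplied, variable-sized node bitmap (get_nodes()/copy_nodes_to_user()) and a do_* helper that operates on a kernel nodemask_t. For orientation, here is a hypothetical user-space sketch of the ABI those wrappers parse. It assumes libnuma's <numaif.h> is installed (declaring mbind(), set_mempolicy(), get_mempolicy() and the MPOL_* constants) and that 512 node bits are at least MAX_NUMNODES on the running kernel; both are assumptions, not something the patch guarantees.

#define _GNU_SOURCE
#include <numaif.h>		/* mbind, set_mempolicy, get_mempolicy, MPOL_* */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 512 bits of node mask; maxnode tells the kernel how many bits to
	 * read.  get_nodes() tolerates masks longer than MAX_NUMNODES as
	 * long as the excess bits are zero; enlarge the array on kernels
	 * configured with more nodes. */
	unsigned long nodes[512 / (8 * sizeof(unsigned long))] = { 0 };
	unsigned long maxnode = sizeof(nodes) * 8;
	size_t len = 16 * 4096;
	int mode = -1;
	void *buf;

	/* Process-wide interleaving over nodes 0 and 1. */
	nodes[0] = 0x3;
	if (set_mempolicy(MPOL_INTERLEAVE, nodes, maxnode) != 0)
		perror("set_mempolicy");

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Bind this one mapping to node 0; MPOL_MF_STRICT asks the kernel
	 * to fail (EIO) if already-present pages violate the policy, which
	 * is what check_range()/check_pte_range() above walk and verify. */
	nodes[0] = 0x1;
	if (mbind(buf, len, MPOL_BIND, nodes, maxnode, MPOL_MF_STRICT) != 0)
		perror("mbind");

	/* Read the VMA policy back; the kernel fills the mask through
	 * copy_nodes_to_user(). */
	memset(nodes, 0, sizeof(nodes));
	if (get_mempolicy(&mode, nodes, maxnode, buf, MPOL_F_ADDR) != 0)
		perror("get_mempolicy");
	printf("vma policy mode=%d nodes[0]=%#lx\n", mode, nodes[0]);
	return 0;
}

The maxnode convention in the sketch mirrors the kernel side of the patch: get_nodes() only rejects oversized masks whose unsupported bits are set, and copy_nodes_to_user() zero-fills whatever part of the caller's buffer lies beyond MAX_NUMNODES.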