Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--   mm/mempolicy.c   393
1 file changed, 212 insertions, 181 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d5c64df1653..2076b1542b8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -2,6 +2,7 @@ | |||
| 2 | * Simple NUMA memory policy for the Linux kernel. | 2 | * Simple NUMA memory policy for the Linux kernel. |
| 3 | * | 3 | * |
| 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. | 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. |
| 5 | * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. | ||
| 5 | * Subject to the GNU Public License, version 2. | 6 | * Subject to the GNU Public License, version 2. |
| 6 | * | 7 | * |
| 7 | * NUMA policy allows the user to give hints in which node(s) memory should | 8 | * NUMA policy allows the user to give hints in which node(s) memory should |
| @@ -17,13 +18,19 @@ | |||
| 17 | * offset into the backing object or offset into the mapping | 18 | * offset into the backing object or offset into the mapping |
| 18 | * for anonymous memory. For process policy an process counter | 19 | * for anonymous memory. For process policy an process counter |
| 19 | * is used. | 20 | * is used. |
| 21 | * | ||
| 20 | * bind Only allocate memory on a specific set of nodes, | 22 | * bind Only allocate memory on a specific set of nodes, |
| 21 | * no fallback. | 23 | * no fallback. |
| 24 | * FIXME: memory is allocated starting with the first node | ||
| 25 | * to the last. It would be better if bind would truly restrict | ||
| 26 | * the allocation to memory nodes instead | ||
| 27 | * | ||
| 22 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
| 23 | * As a special case node -1 here means do the allocation | 29 | * As a special case node -1 here means do the allocation |
| 24 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
| 25 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
| 26 | * process policy. | 32 | * process policy. |
| 33 | * | ||
| 27 | * default Allocate on the local node first, or when on a VMA | 34 | * default Allocate on the local node first, or when on a VMA |
| 28 | * use the process policy. This is what Linux always did | 35 | * use the process policy. This is what Linux always did |
| 29 | * in a NUMA aware kernel and still does by, ahem, default. | 36 | * in a NUMA aware kernel and still does by, ahem, default. |
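The three policy modes documented above are what user space selects through the mbind(2) and set_mempolicy(2) calls handled later in this file. A minimal user-space sketch, assuming the libnuma <numaif.h> wrappers (link with -lnuma) and a single-long node mask; it is an illustration, not part of this patch:

#include <numaif.h>      /* set_mempolicy(), mbind(), MPOL_* */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        unsigned long nodemask = 1UL << 0;            /* node 0 only */
        unsigned long maxnode = 8 * sizeof(nodemask);
        size_t len = 1 << 20;
        void *buf;

        /* Process policy: prefer node 0, fall back elsewhere if it is full. */
        if (set_mempolicy(MPOL_PREFERRED, &nodemask, maxnode))
                perror("set_mempolicy");

        /* VMA policy: bind one mapping strictly to node 0. */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;
        if (mbind(buf, len, MPOL_BIND, &nodemask, maxnode, MPOL_MF_STRICT))
                perror("mbind");
        memset(buf, 0, len);    /* fault the pages in under the bound policy */
        return 0;
}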
| @@ -93,23 +100,10 @@ struct mempolicy default_policy = { | |||
| 93 | .policy = MPOL_DEFAULT, | 100 | .policy = MPOL_DEFAULT, |
| 94 | }; | 101 | }; |
| 95 | 102 | ||
| 96 | /* Check if all specified nodes are online */ | ||
| 97 | static int nodes_online(unsigned long *nodes) | ||
| 98 | { | ||
| 99 | DECLARE_BITMAP(online2, MAX_NUMNODES); | ||
| 100 | |||
| 101 | bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); | ||
| 102 | if (bitmap_empty(online2, MAX_NUMNODES)) | ||
| 103 | set_bit(0, online2); | ||
| 104 | if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) | ||
| 105 | return -EINVAL; | ||
| 106 | return 0; | ||
| 107 | } | ||
| 108 | |||
| 109 | /* Do sanity checking on a policy */ | 103 | /* Do sanity checking on a policy */ |
| 110 | static int mpol_check_policy(int mode, unsigned long *nodes) | 104 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
| 111 | { | 105 | { |
| 112 | int empty = bitmap_empty(nodes, MAX_NUMNODES); | 106 | int empty = nodes_empty(*nodes); |
| 113 | 107 | ||
| 114 | switch (mode) { | 108 | switch (mode) { |
| 115 | case MPOL_DEFAULT: | 109 | case MPOL_DEFAULT: |
| @@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes) | |||
| 124 | return -EINVAL; | 118 | return -EINVAL; |
| 125 | break; | 119 | break; |
| 126 | } | 120 | } |
| 127 | return nodes_online(nodes); | 121 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; |
| 128 | } | ||
| 129 | |||
| 130 | /* Copy a node mask from user space. */ | ||
| 131 | static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, | ||
| 132 | unsigned long maxnode, int mode) | ||
| 133 | { | ||
| 134 | unsigned long k; | ||
| 135 | unsigned long nlongs; | ||
| 136 | unsigned long endmask; | ||
| 137 | |||
| 138 | --maxnode; | ||
| 139 | bitmap_zero(nodes, MAX_NUMNODES); | ||
| 140 | if (maxnode == 0 || !nmask) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | nlongs = BITS_TO_LONGS(maxnode); | ||
| 144 | if ((maxnode % BITS_PER_LONG) == 0) | ||
| 145 | endmask = ~0UL; | ||
| 146 | else | ||
| 147 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
| 148 | |||
| 149 | /* When the user specified more nodes than supported just check | ||
| 150 | if the non supported part is all zero. */ | ||
| 151 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
| 152 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
| 153 | return -EINVAL; | ||
| 154 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
| 155 | unsigned long t; | ||
| 156 | if (get_user(t, nmask + k)) | ||
| 157 | return -EFAULT; | ||
| 158 | if (k == nlongs - 1) { | ||
| 159 | if (t & endmask) | ||
| 160 | return -EINVAL; | ||
| 161 | } else if (t) | ||
| 162 | return -EINVAL; | ||
| 163 | } | ||
| 164 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
| 165 | endmask = ~0UL; | ||
| 166 | } | ||
| 167 | |||
| 168 | if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) | ||
| 169 | return -EFAULT; | ||
| 170 | nodes[nlongs-1] &= endmask; | ||
| 171 | /* Update current mems_allowed */ | ||
| 172 | cpuset_update_current_mems_allowed(); | ||
| 173 | /* Ignore nodes not set in current->mems_allowed */ | ||
| 174 | cpuset_restrict_to_mems_allowed(nodes); | ||
| 175 | return mpol_check_policy(mode, nodes); | ||
| 176 | } | 122 | } |
| 177 | |||
| 178 | /* Generate a custom zonelist for the BIND policy. */ | 123 | /* Generate a custom zonelist for the BIND policy. */ |
| 179 | static struct zonelist *bind_zonelist(unsigned long *nodes) | 124 | static struct zonelist *bind_zonelist(nodemask_t *nodes) |
| 180 | { | 125 | { |
| 181 | struct zonelist *zl; | 126 | struct zonelist *zl; |
| 182 | int num, max, nd; | 127 | int num, max, nd; |
| 183 | 128 | ||
| 184 | max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); | 129 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
| 185 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); | 130 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); |
| 186 | if (!zl) | 131 | if (!zl) |
| 187 | return NULL; | 132 | return NULL; |
| 188 | num = 0; | 133 | num = 0; |
| 189 | for (nd = find_first_bit(nodes, MAX_NUMNODES); | 134 | for_each_node_mask(nd, *nodes) { |
| 190 | nd < MAX_NUMNODES; | ||
| 191 | nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { | ||
| 192 | int k; | 135 | int k; |
| 193 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | 136 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { |
| 194 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 137 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; |
| @@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes) | |||
| 199 | policy_zone = k; | 142 | policy_zone = k; |
| 200 | } | 143 | } |
| 201 | } | 144 | } |
| 202 | BUG_ON(num >= max); | ||
| 203 | zl->zones[num] = NULL; | 145 | zl->zones[num] = NULL; |
| 204 | return zl; | 146 | return zl; |
| 205 | } | 147 | } |
| 206 | 148 | ||
| 207 | /* Create a new policy */ | 149 | /* Create a new policy */ |
| 208 | static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | 150 | static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) |
| 209 | { | 151 | { |
| 210 | struct mempolicy *policy; | 152 | struct mempolicy *policy; |
| 211 | 153 | ||
| 212 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); | 154 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); |
| 213 | if (mode == MPOL_DEFAULT) | 155 | if (mode == MPOL_DEFAULT) |
| 214 | return NULL; | 156 | return NULL; |
| 215 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 157 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
| @@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
| 218 | atomic_set(&policy->refcnt, 1); | 160 | atomic_set(&policy->refcnt, 1); |
| 219 | switch (mode) { | 161 | switch (mode) { |
| 220 | case MPOL_INTERLEAVE: | 162 | case MPOL_INTERLEAVE: |
| 221 | bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); | 163 | policy->v.nodes = *nodes; |
| 222 | break; | 164 | break; |
| 223 | case MPOL_PREFERRED: | 165 | case MPOL_PREFERRED: |
| 224 | policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); | 166 | policy->v.preferred_node = first_node(*nodes); |
| 225 | if (policy->v.preferred_node >= MAX_NUMNODES) | 167 | if (policy->v.preferred_node >= MAX_NUMNODES) |
| 226 | policy->v.preferred_node = -1; | 168 | policy->v.preferred_node = -1; |
| 227 | break; | 169 | break; |
| @@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
| 238 | } | 180 | } |
| 239 | 181 | ||
| 240 | /* Ensure all existing pages follow the policy. */ | 182 | /* Ensure all existing pages follow the policy. */ |
| 241 | static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, | 183 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
| 242 | unsigned long addr, unsigned long end, unsigned long *nodes) | 184 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 243 | { | 185 | { |
| 244 | pte_t *orig_pte; | 186 | pte_t *orig_pte; |
| 245 | pte_t *pte; | 187 | pte_t *pte; |
| 188 | spinlock_t *ptl; | ||
| 246 | 189 | ||
| 247 | spin_lock(&mm->page_table_lock); | 190 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 248 | orig_pte = pte = pte_offset_map(pmd, addr); | ||
| 249 | do { | 191 | do { |
| 250 | unsigned long pfn; | 192 | unsigned long pfn; |
| 251 | unsigned int nid; | 193 | unsigned int nid; |
| @@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 253 | if (!pte_present(*pte)) | 195 | if (!pte_present(*pte)) |
| 254 | continue; | 196 | continue; |
| 255 | pfn = pte_pfn(*pte); | 197 | pfn = pte_pfn(*pte); |
| 256 | if (!pfn_valid(pfn)) | 198 | if (!pfn_valid(pfn)) { |
| 199 | print_bad_pte(vma, *pte, addr); | ||
| 257 | continue; | 200 | continue; |
| 201 | } | ||
| 258 | nid = pfn_to_nid(pfn); | 202 | nid = pfn_to_nid(pfn); |
| 259 | if (!test_bit(nid, nodes)) | 203 | if (!node_isset(nid, *nodes)) |
| 260 | break; | 204 | break; |
| 261 | } while (pte++, addr += PAGE_SIZE, addr != end); | 205 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 262 | pte_unmap(orig_pte); | 206 | pte_unmap_unlock(orig_pte, ptl); |
| 263 | spin_unlock(&mm->page_table_lock); | ||
| 264 | return addr != end; | 207 | return addr != end; |
| 265 | } | 208 | } |
| 266 | 209 | ||
| 267 | static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | 210 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| 268 | unsigned long addr, unsigned long end, unsigned long *nodes) | 211 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 269 | { | 212 | { |
| 270 | pmd_t *pmd; | 213 | pmd_t *pmd; |
| 271 | unsigned long next; | 214 | unsigned long next; |
| @@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
| 275 | next = pmd_addr_end(addr, end); | 218 | next = pmd_addr_end(addr, end); |
| 276 | if (pmd_none_or_clear_bad(pmd)) | 219 | if (pmd_none_or_clear_bad(pmd)) |
| 277 | continue; | 220 | continue; |
| 278 | if (check_pte_range(mm, pmd, addr, next, nodes)) | 221 | if (check_pte_range(vma, pmd, addr, next, nodes)) |
| 279 | return -EIO; | 222 | return -EIO; |
| 280 | } while (pmd++, addr = next, addr != end); | 223 | } while (pmd++, addr = next, addr != end); |
| 281 | return 0; | 224 | return 0; |
| 282 | } | 225 | } |
| 283 | 226 | ||
| 284 | static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, | 227 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
| 285 | unsigned long addr, unsigned long end, unsigned long *nodes) | 228 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 286 | { | 229 | { |
| 287 | pud_t *pud; | 230 | pud_t *pud; |
| 288 | unsigned long next; | 231 | unsigned long next; |
| @@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
| 292 | next = pud_addr_end(addr, end); | 235 | next = pud_addr_end(addr, end); |
| 293 | if (pud_none_or_clear_bad(pud)) | 236 | if (pud_none_or_clear_bad(pud)) |
| 294 | continue; | 237 | continue; |
| 295 | if (check_pmd_range(mm, pud, addr, next, nodes)) | 238 | if (check_pmd_range(vma, pud, addr, next, nodes)) |
| 296 | return -EIO; | 239 | return -EIO; |
| 297 | } while (pud++, addr = next, addr != end); | 240 | } while (pud++, addr = next, addr != end); |
| 298 | return 0; | 241 | return 0; |
| 299 | } | 242 | } |
| 300 | 243 | ||
| 301 | static inline int check_pgd_range(struct mm_struct *mm, | 244 | static inline int check_pgd_range(struct vm_area_struct *vma, |
| 302 | unsigned long addr, unsigned long end, unsigned long *nodes) | 245 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 303 | { | 246 | { |
| 304 | pgd_t *pgd; | 247 | pgd_t *pgd; |
| 305 | unsigned long next; | 248 | unsigned long next; |
| 306 | 249 | ||
| 307 | pgd = pgd_offset(mm, addr); | 250 | pgd = pgd_offset(vma->vm_mm, addr); |
| 308 | do { | 251 | do { |
| 309 | next = pgd_addr_end(addr, end); | 252 | next = pgd_addr_end(addr, end); |
| 310 | if (pgd_none_or_clear_bad(pgd)) | 253 | if (pgd_none_or_clear_bad(pgd)) |
| 311 | continue; | 254 | continue; |
| 312 | if (check_pud_range(mm, pgd, addr, next, nodes)) | 255 | if (check_pud_range(vma, pgd, addr, next, nodes)) |
| 313 | return -EIO; | 256 | return -EIO; |
| 314 | } while (pgd++, addr = next, addr != end); | 257 | } while (pgd++, addr = next, addr != end); |
| 315 | return 0; | 258 | return 0; |
| @@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm, | |||
| 318 | /* Step 1: check the range */ | 261 | /* Step 1: check the range */ |
| 319 | static struct vm_area_struct * | 262 | static struct vm_area_struct * |
| 320 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 263 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
| 321 | unsigned long *nodes, unsigned long flags) | 264 | nodemask_t *nodes, unsigned long flags) |
| 322 | { | 265 | { |
| 323 | int err; | 266 | int err; |
| 324 | struct vm_area_struct *first, *vma, *prev; | 267 | struct vm_area_struct *first, *vma, *prev; |
| @@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 326 | first = find_vma(mm, start); | 269 | first = find_vma(mm, start); |
| 327 | if (!first) | 270 | if (!first) |
| 328 | return ERR_PTR(-EFAULT); | 271 | return ERR_PTR(-EFAULT); |
| 272 | if (first->vm_flags & VM_RESERVED) | ||
| 273 | return ERR_PTR(-EACCES); | ||
| 329 | prev = NULL; | 274 | prev = NULL; |
| 330 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 275 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
| 331 | if (!vma->vm_next && vma->vm_end < end) | 276 | if (!vma->vm_next && vma->vm_end < end) |
| @@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 338 | endvma = end; | 283 | endvma = end; |
| 339 | if (vma->vm_start > start) | 284 | if (vma->vm_start > start) |
| 340 | start = vma->vm_start; | 285 | start = vma->vm_start; |
| 341 | err = check_pgd_range(vma->vm_mm, | 286 | err = check_pgd_range(vma, start, endvma, nodes); |
| 342 | start, endvma, nodes); | ||
| 343 | if (err) { | 287 | if (err) { |
| 344 | first = ERR_PTR(err); | 288 | first = ERR_PTR(err); |
| 345 | break; | 289 | break; |
| @@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, | |||
| 393 | return err; | 337 | return err; |
| 394 | } | 338 | } |
| 395 | 339 | ||
| 396 | /* Change policy for a memory range */ | 340 | static int contextualize_policy(int mode, nodemask_t *nodes) |
| 397 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | 341 | { |
| 398 | unsigned long mode, | 342 | if (!nodes) |
| 399 | unsigned long __user *nmask, unsigned long maxnode, | 343 | return 0; |
| 400 | unsigned flags) | 344 | |
| 345 | /* Update current mems_allowed */ | ||
| 346 | cpuset_update_current_mems_allowed(); | ||
| 347 | /* Ignore nodes not set in current->mems_allowed */ | ||
| 348 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
| 349 | return mpol_check_policy(mode, nodes); | ||
| 350 | } | ||
| 351 | |||
| 352 | long do_mbind(unsigned long start, unsigned long len, | ||
| 353 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
| 401 | { | 354 | { |
| 402 | struct vm_area_struct *vma; | 355 | struct vm_area_struct *vma; |
| 403 | struct mm_struct *mm = current->mm; | 356 | struct mm_struct *mm = current->mm; |
| 404 | struct mempolicy *new; | 357 | struct mempolicy *new; |
| 405 | unsigned long end; | 358 | unsigned long end; |
| 406 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 407 | int err; | 359 | int err; |
| 408 | 360 | ||
| 409 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | 361 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) |
| @@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
| 418 | return -EINVAL; | 370 | return -EINVAL; |
| 419 | if (end == start) | 371 | if (end == start) |
| 420 | return 0; | 372 | return 0; |
| 421 | 373 | if (mpol_check_policy(mode, nmask)) | |
| 422 | err = get_nodes(nodes, nmask, maxnode, mode); | 374 | return -EINVAL; |
| 423 | if (err) | 375 | new = mpol_new(mode, nmask); |
| 424 | return err; | ||
| 425 | |||
| 426 | new = mpol_new(mode, nodes); | ||
| 427 | if (IS_ERR(new)) | 376 | if (IS_ERR(new)) |
| 428 | return PTR_ERR(new); | 377 | return PTR_ERR(new); |
| 429 | 378 | ||
| 430 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | 379 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, |
| 431 | mode,nodes[0]); | 380 | mode,nodes_addr(nodes)[0]); |
| 432 | 381 | ||
| 433 | down_write(&mm->mmap_sem); | 382 | down_write(&mm->mmap_sem); |
| 434 | vma = check_range(mm, start, end, nodes, flags); | 383 | vma = check_range(mm, start, end, nmask, flags); |
| 435 | err = PTR_ERR(vma); | 384 | err = PTR_ERR(vma); |
| 436 | if (!IS_ERR(vma)) | 385 | if (!IS_ERR(vma)) |
| 437 | err = mbind_range(vma, start, end, new); | 386 | err = mbind_range(vma, start, end, new); |
| @@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
| 441 | } | 390 | } |
| 442 | 391 | ||
| 443 | /* Set the process memory policy */ | 392 | /* Set the process memory policy */ |
| 444 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | 393 | long do_set_mempolicy(int mode, nodemask_t *nodes) |
| 445 | unsigned long maxnode) | ||
| 446 | { | 394 | { |
| 447 | int err; | ||
| 448 | struct mempolicy *new; | 395 | struct mempolicy *new; |
| 449 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 450 | 396 | ||
| 451 | if (mode < 0 || mode > MPOL_MAX) | 397 | if (contextualize_policy(mode, nodes)) |
| 452 | return -EINVAL; | 398 | return -EINVAL; |
| 453 | err = get_nodes(nodes, nmask, maxnode, mode); | ||
| 454 | if (err) | ||
| 455 | return err; | ||
| 456 | new = mpol_new(mode, nodes); | 399 | new = mpol_new(mode, nodes); |
| 457 | if (IS_ERR(new)) | 400 | if (IS_ERR(new)) |
| 458 | return PTR_ERR(new); | 401 | return PTR_ERR(new); |
| 459 | mpol_free(current->mempolicy); | 402 | mpol_free(current->mempolicy); |
| 460 | current->mempolicy = new; | 403 | current->mempolicy = new; |
| 461 | if (new && new->policy == MPOL_INTERLEAVE) | 404 | if (new && new->policy == MPOL_INTERLEAVE) |
| 462 | current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); | 405 | current->il_next = first_node(new->v.nodes); |
| 463 | return 0; | 406 | return 0; |
| 464 | } | 407 | } |
| 465 | 408 | ||
| 466 | /* Fill a zone bitmap for a policy */ | 409 | /* Fill a zone bitmap for a policy */ |
| 467 | static void get_zonemask(struct mempolicy *p, unsigned long *nodes) | 410 | static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) |
| 468 | { | 411 | { |
| 469 | int i; | 412 | int i; |
| 470 | 413 | ||
| 471 | bitmap_zero(nodes, MAX_NUMNODES); | 414 | nodes_clear(*nodes); |
| 472 | switch (p->policy) { | 415 | switch (p->policy) { |
| 473 | case MPOL_BIND: | 416 | case MPOL_BIND: |
| 474 | for (i = 0; p->v.zonelist->zones[i]; i++) | 417 | for (i = 0; p->v.zonelist->zones[i]; i++) |
| 475 | __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); | 418 | node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, |
| 419 | *nodes); | ||
| 476 | break; | 420 | break; |
| 477 | case MPOL_DEFAULT: | 421 | case MPOL_DEFAULT: |
| 478 | break; | 422 | break; |
| 479 | case MPOL_INTERLEAVE: | 423 | case MPOL_INTERLEAVE: |
| 480 | bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); | 424 | *nodes = p->v.nodes; |
| 481 | break; | 425 | break; |
| 482 | case MPOL_PREFERRED: | 426 | case MPOL_PREFERRED: |
| 483 | /* or use current node instead of online map? */ | 427 | /* or use current node instead of online map? */ |
| 484 | if (p->v.preferred_node < 0) | 428 | if (p->v.preferred_node < 0) |
| 485 | bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); | 429 | *nodes = node_online_map; |
| 486 | else | 430 | else |
| 487 | __set_bit(p->v.preferred_node, nodes); | 431 | node_set(p->v.preferred_node, *nodes); |
| 488 | break; | 432 | break; |
| 489 | default: | 433 | default: |
| 490 | BUG(); | 434 | BUG(); |
| @@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
| 504 | return err; | 448 | return err; |
| 505 | } | 449 | } |
| 506 | 450 | ||
| 507 | /* Copy a kernel node mask to user space */ | ||
| 508 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
| 509 | void *nodes, unsigned nbytes) | ||
| 510 | { | ||
| 511 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
| 512 | |||
| 513 | if (copy > nbytes) { | ||
| 514 | if (copy > PAGE_SIZE) | ||
| 515 | return -EINVAL; | ||
| 516 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
| 517 | return -EFAULT; | ||
| 518 | copy = nbytes; | ||
| 519 | } | ||
| 520 | return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; | ||
| 521 | } | ||
| 522 | |||
| 523 | /* Retrieve NUMA policy */ | 451 | /* Retrieve NUMA policy */ |
| 524 | asmlinkage long sys_get_mempolicy(int __user *policy, | 452 | long do_get_mempolicy(int *policy, nodemask_t *nmask, |
| 525 | unsigned long __user *nmask, | 453 | unsigned long addr, unsigned long flags) |
| 526 | unsigned long maxnode, | ||
| 527 | unsigned long addr, unsigned long flags) | ||
| 528 | { | 454 | { |
| 529 | int err, pval; | 455 | int err; |
| 530 | struct mm_struct *mm = current->mm; | 456 | struct mm_struct *mm = current->mm; |
| 531 | struct vm_area_struct *vma = NULL; | 457 | struct vm_area_struct *vma = NULL; |
| 532 | struct mempolicy *pol = current->mempolicy; | 458 | struct mempolicy *pol = current->mempolicy; |
| 533 | 459 | ||
| 534 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 460 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
| 535 | return -EINVAL; | 461 | return -EINVAL; |
| 536 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
| 537 | return -EINVAL; | ||
| 538 | if (flags & MPOL_F_ADDR) { | 462 | if (flags & MPOL_F_ADDR) { |
| 539 | down_read(&mm->mmap_sem); | 463 | down_read(&mm->mmap_sem); |
| 540 | vma = find_vma_intersection(mm, addr, addr+1); | 464 | vma = find_vma_intersection(mm, addr, addr+1); |
| @@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
| 557 | err = lookup_node(mm, addr); | 481 | err = lookup_node(mm, addr); |
| 558 | if (err < 0) | 482 | if (err < 0) |
| 559 | goto out; | 483 | goto out; |
| 560 | pval = err; | 484 | *policy = err; |
| 561 | } else if (pol == current->mempolicy && | 485 | } else if (pol == current->mempolicy && |
| 562 | pol->policy == MPOL_INTERLEAVE) { | 486 | pol->policy == MPOL_INTERLEAVE) { |
| 563 | pval = current->il_next; | 487 | *policy = current->il_next; |
| 564 | } else { | 488 | } else { |
| 565 | err = -EINVAL; | 489 | err = -EINVAL; |
| 566 | goto out; | 490 | goto out; |
| 567 | } | 491 | } |
| 568 | } else | 492 | } else |
| 569 | pval = pol->policy; | 493 | *policy = pol->policy; |
| 570 | 494 | ||
| 571 | if (vma) { | 495 | if (vma) { |
| 572 | up_read(¤t->mm->mmap_sem); | 496 | up_read(¤t->mm->mmap_sem); |
| 573 | vma = NULL; | 497 | vma = NULL; |
| 574 | } | 498 | } |
| 575 | 499 | ||
| 576 | if (policy && put_user(pval, policy)) | ||
| 577 | return -EFAULT; | ||
| 578 | |||
| 579 | err = 0; | 500 | err = 0; |
| 580 | if (nmask) { | 501 | if (nmask) |
| 581 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | 502 | get_zonemask(pol, nmask); |
| 582 | get_zonemask(pol, nodes); | ||
| 583 | err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); | ||
| 584 | } | ||
| 585 | 503 | ||
| 586 | out: | 504 | out: |
| 587 | if (vma) | 505 | if (vma) |
| @@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
| 589 | return err; | 507 | return err; |
| 590 | } | 508 | } |
| 591 | 509 | ||
| 510 | /* | ||
| 511 | * User space interface with variable sized bitmaps for nodelists. | ||
| 512 | */ | ||
| 513 | |||
| 514 | /* Copy a node mask from user space. */ | ||
| 515 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | ||
| 516 | unsigned long maxnode) | ||
| 517 | { | ||
| 518 | unsigned long k; | ||
| 519 | unsigned long nlongs; | ||
| 520 | unsigned long endmask; | ||
| 521 | |||
| 522 | --maxnode; | ||
| 523 | nodes_clear(*nodes); | ||
| 524 | if (maxnode == 0 || !nmask) | ||
| 525 | return 0; | ||
| 526 | |||
| 527 | nlongs = BITS_TO_LONGS(maxnode); | ||
| 528 | if ((maxnode % BITS_PER_LONG) == 0) | ||
| 529 | endmask = ~0UL; | ||
| 530 | else | ||
| 531 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
| 532 | |||
| 533 | /* When the user specified more nodes than supported just check | ||
| 534 | if the non supported part is all zero. */ | ||
| 535 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
| 536 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
| 537 | return -EINVAL; | ||
| 538 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
| 539 | unsigned long t; | ||
| 540 | if (get_user(t, nmask + k)) | ||
| 541 | return -EFAULT; | ||
| 542 | if (k == nlongs - 1) { | ||
| 543 | if (t & endmask) | ||
| 544 | return -EINVAL; | ||
| 545 | } else if (t) | ||
| 546 | return -EINVAL; | ||
| 547 | } | ||
| 548 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
| 549 | endmask = ~0UL; | ||
| 550 | } | ||
| 551 | |||
| 552 | if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) | ||
| 553 | return -EFAULT; | ||
| 554 | nodes_addr(*nodes)[nlongs-1] &= endmask; | ||
| 555 | return 0; | ||
| 556 | } | ||
| 557 | |||
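get_nodes() now lives next to the other user-space interface helpers; the tricky part is still the nlongs/endmask arithmetic that trims the caller's variable-sized bitmap down to MAX_NUMNODES bits. A stand-alone sketch of that arithmetic, with an assumed MAX_NUMNODES of 64 purely for illustration:

#include <stdio.h>

#define BITS_PER_LONG    (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define MAX_NUMNODES     64                      /* assumed example value */

int main(void)
{
        unsigned long maxnode = 129;     /* caller's mask covers bits 0..127 */
        unsigned long nlongs, endmask;

        --maxnode;
        nlongs = BITS_TO_LONGS(maxnode);
        endmask = (maxnode % BITS_PER_LONG) == 0 ?
                  ~0UL : (1UL << (maxnode % BITS_PER_LONG)) - 1;

        printf("copy %lu longs, mask the last one with %#lx\n", nlongs, endmask);
        /* Longs beyond the kernel's own mask must be all zero or -EINVAL. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES))
                printf("the trailing %lu longs must be zero\n",
                       (unsigned long)(nlongs - BITS_TO_LONGS(MAX_NUMNODES)));
        return 0;
}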
| 558 | /* Copy a kernel node mask to user space */ | ||
| 559 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
| 560 | nodemask_t *nodes) | ||
| 561 | { | ||
| 562 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
| 563 | const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); | ||
| 564 | |||
| 565 | if (copy > nbytes) { | ||
| 566 | if (copy > PAGE_SIZE) | ||
| 567 | return -EINVAL; | ||
| 568 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
| 569 | return -EFAULT; | ||
| 570 | copy = nbytes; | ||
| 571 | } | ||
| 572 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | ||
| 573 | } | ||
| 574 | |||
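copy_nodes_to_user() sizes the transfer from the user's maxnode rather than from the kernel mask, so an oversized user buffer gets its tail cleared before the real mask is copied. A tiny sketch of that size computation (the 64-node kernel mask is an assumed value, not a particular config):

#include <stdio.h>

#define ALIGN(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
        unsigned long maxnode = 1024;                     /* user buffer bits */
        unsigned long copy = ALIGN(maxnode - 1, 64) / 8;  /* bytes requested */
        unsigned long nbytes = 64 / 8;                    /* kernel mask bytes */

        if (copy > nbytes)
                printf("clear %lu tail bytes, then copy %lu\n",
                       copy - nbytes, nbytes);
        return 0;
}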
| 575 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | ||
| 576 | unsigned long mode, | ||
| 577 | unsigned long __user *nmask, unsigned long maxnode, | ||
| 578 | unsigned flags) | ||
| 579 | { | ||
| 580 | nodemask_t nodes; | ||
| 581 | int err; | ||
| 582 | |||
| 583 | err = get_nodes(&nodes, nmask, maxnode); | ||
| 584 | if (err) | ||
| 585 | return err; | ||
| 586 | return do_mbind(start, len, mode, &nodes, flags); | ||
| 587 | } | ||
| 588 | |||
| 589 | /* Set the process memory policy */ | ||
| 590 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | ||
| 591 | unsigned long maxnode) | ||
| 592 | { | ||
| 593 | int err; | ||
| 594 | nodemask_t nodes; | ||
| 595 | |||
| 596 | if (mode < 0 || mode > MPOL_MAX) | ||
| 597 | return -EINVAL; | ||
| 598 | err = get_nodes(&nodes, nmask, maxnode); | ||
| 599 | if (err) | ||
| 600 | return err; | ||
| 601 | return do_set_mempolicy(mode, &nodes); | ||
| 602 | } | ||
| 603 | |||
| 604 | /* Retrieve NUMA policy */ | ||
| 605 | asmlinkage long sys_get_mempolicy(int __user *policy, | ||
| 606 | unsigned long __user *nmask, | ||
| 607 | unsigned long maxnode, | ||
| 608 | unsigned long addr, unsigned long flags) | ||
| 609 | { | ||
| 610 | int err, pval; | ||
| 611 | nodemask_t nodes; | ||
| 612 | |||
| 613 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
| 614 | return -EINVAL; | ||
| 615 | |||
| 616 | err = do_get_mempolicy(&pval, &nodes, addr, flags); | ||
| 617 | |||
| 618 | if (err) | ||
| 619 | return err; | ||
| 620 | |||
| 621 | if (policy && put_user(pval, policy)) | ||
| 622 | return -EFAULT; | ||
| 623 | |||
| 624 | if (nmask) | ||
| 625 | err = copy_nodes_to_user(nmask, maxnode, &nodes); | ||
| 626 | |||
| 627 | return err; | ||
| 628 | } | ||
| 629 | |||
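From user space the new wrappers behave exactly like the old syscalls; reading a policy back still requires a buffer that covers at least MAX_NUMNODES bits. A hedged sketch using the libnuma <numaif.h> declaration of get_mempolicy() (not part of this patch); on kernels configured with more than 64 possible nodes the single-long mask below would have to grow:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
        int mode;
        unsigned long nodemask = 0;
        unsigned long maxnode = 8 * sizeof(nodemask) + 1;

        /* Query the calling task's policy (addr and flags unused here). */
        if (get_mempolicy(&mode, &nodemask, maxnode, 0, 0))
                perror("get_mempolicy");
        else
                printf("mode %d nodemask %#lx\n", mode, nodemask);
        return 0;
}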
| 592 | #ifdef CONFIG_COMPAT | 630 | #ifdef CONFIG_COMPAT |
| 593 | 631 | ||
| 594 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, | 632 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, |
| @@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
| 649 | long err = 0; | 687 | long err = 0; |
| 650 | unsigned long __user *nm = NULL; | 688 | unsigned long __user *nm = NULL; |
| 651 | unsigned long nr_bits, alloc_size; | 689 | unsigned long nr_bits, alloc_size; |
| 652 | DECLARE_BITMAP(bm, MAX_NUMNODES); | 690 | nodemask_t bm; |
| 653 | 691 | ||
| 654 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | 692 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); |
| 655 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | 693 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; |
| 656 | 694 | ||
| 657 | if (nmask) { | 695 | if (nmask) { |
| 658 | err = compat_get_bitmap(bm, nmask, nr_bits); | 696 | err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); |
| 659 | nm = compat_alloc_user_space(alloc_size); | 697 | nm = compat_alloc_user_space(alloc_size); |
| 660 | err |= copy_to_user(nm, bm, alloc_size); | 698 | err |= copy_to_user(nm, nodes_addr(bm), alloc_size); |
| 661 | } | 699 | } |
| 662 | 700 | ||
| 663 | if (err) | 701 | if (err) |
| @@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo | |||
| 676 | 714 | ||
| 677 | if (vma) { | 715 | if (vma) { |
| 678 | if (vma->vm_ops && vma->vm_ops->get_policy) | 716 | if (vma->vm_ops && vma->vm_ops->get_policy) |
| 679 | pol = vma->vm_ops->get_policy(vma, addr); | 717 | pol = vma->vm_ops->get_policy(vma, addr); |
| 680 | else if (vma->vm_policy && | 718 | else if (vma->vm_policy && |
| 681 | vma->vm_policy->policy != MPOL_DEFAULT) | 719 | vma->vm_policy->policy != MPOL_DEFAULT) |
| 682 | pol = vma->vm_policy; | 720 | pol = vma->vm_policy; |
| @@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
| 722 | struct task_struct *me = current; | 760 | struct task_struct *me = current; |
| 723 | 761 | ||
| 724 | nid = me->il_next; | 762 | nid = me->il_next; |
| 725 | BUG_ON(nid >= MAX_NUMNODES); | 763 | next = next_node(nid, policy->v.nodes); |
| 726 | next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); | ||
| 727 | if (next >= MAX_NUMNODES) | 764 | if (next >= MAX_NUMNODES) |
| 728 | next = find_first_bit(policy->v.nodes, MAX_NUMNODES); | 765 | next = first_node(policy->v.nodes); |
| 729 | me->il_next = next; | 766 | me->il_next = next; |
| 730 | return nid; | 767 | return nid; |
| 731 | } | 768 | } |
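interleave_nodes() is now a straight next_node()/first_node() round robin over the policy mask. A user-space toy with a hand-rolled bit scan shows the same wrap-around behaviour (the mask and width are made-up values, not kernel API):

#include <stdio.h>

static int next_set_bit(unsigned long mask, int after, int nbits)
{
        for (int b = after + 1; b < nbits; b++)
                if (mask & (1UL << b))
                        return b;
        return nbits;                   /* no further node, like MAX_NUMNODES */
}

int main(void)
{
        unsigned long nodes = 0xb;      /* interleave over nodes 0, 1 and 3 */
        int nbits = 4, il_next = 0;

        for (int i = 0; i < 8; i++) {
                int nid = il_next;      /* allocate this page on node nid */
                int next = next_set_bit(nodes, nid, nbits);
                if (next >= nbits)      /* ran off the end: wrap to first node */
                        next = next_set_bit(nodes, -1, nbits);
                il_next = next;
                printf("%d ", nid);     /* prints 0 1 3 0 1 3 0 1 */
        }
        printf("\n");
        return 0;
}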
| @@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
| 734 | static unsigned offset_il_node(struct mempolicy *pol, | 771 | static unsigned offset_il_node(struct mempolicy *pol, |
| 735 | struct vm_area_struct *vma, unsigned long off) | 772 | struct vm_area_struct *vma, unsigned long off) |
| 736 | { | 773 | { |
| 737 | unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); | 774 | unsigned nnodes = nodes_weight(pol->v.nodes); |
| 738 | unsigned target = (unsigned)off % nnodes; | 775 | unsigned target = (unsigned)off % nnodes; |
| 739 | int c; | 776 | int c; |
| 740 | int nid = -1; | 777 | int nid = -1; |
| 741 | 778 | ||
| 742 | c = 0; | 779 | c = 0; |
| 743 | do { | 780 | do { |
| 744 | nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); | 781 | nid = next_node(nid, pol->v.nodes); |
| 745 | c++; | 782 | c++; |
| 746 | } while (c <= target); | 783 | } while (c <= target); |
| 747 | BUG_ON(nid >= MAX_NUMNODES); | ||
| 748 | BUG_ON(!test_bit(nid, pol->v.nodes)); | ||
| 749 | return nid; | 784 | return nid; |
| 750 | } | 785 | } |
| 751 | 786 | ||
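offset_il_node() maps a page offset within the VMA onto the target-th set node, so a given offset always lands on the same node no matter when it is faulted in. A stand-alone illustration of that calculation (mask and offset are assumptions; __builtin_popcountl is a GCC/Clang builtin standing in for nodes_weight()):

#include <stdio.h>

int main(void)
{
        unsigned long nodes = 0x16;     /* nodes 1, 2 and 4 set */
        unsigned nnodes = __builtin_popcountl(nodes);
        unsigned long off = 7;          /* vm_pgoff + page index in the VMA */
        unsigned target = off % nnodes; /* 7 % 3 == 1: want the 2nd set node */
        int nid = -1, c = 0;

        do {                            /* advance to the next set bit ... */
                do
                        nid++;
                while (!(nodes & (1UL << nid)));
                c++;                    /* ... until target + 1 have been seen */
        } while (c <= target);

        printf("offset %lu -> node %d\n", off, nid);    /* node 2 */
        return 0;
}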
| 752 | /* Allocate a page in interleaved policy. | 787 | /* Allocate a page in interleaved policy. |
| 753 | Own path because it needs to do special accounting. */ | 788 | Own path because it needs to do special accounting. */ |
| 754 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) | 789 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
| 790 | unsigned nid) | ||
| 755 | { | 791 | { |
| 756 | struct zonelist *zl; | 792 | struct zonelist *zl; |
| 757 | struct page *page; | 793 | struct page *page; |
| 758 | 794 | ||
| 759 | BUG_ON(!node_online(nid)); | ||
| 760 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 795 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); |
| 761 | page = __alloc_pages(gfp, order, zl); | 796 | page = __alloc_pages(gfp, order, zl); |
| 762 | if (page && page_zone(page) == zl->zones[0]) { | 797 | if (page && page_zone(page) == zl->zones[0]) { |
| @@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 799 | unsigned nid; | 834 | unsigned nid; |
| 800 | if (vma) { | 835 | if (vma) { |
| 801 | unsigned long off; | 836 | unsigned long off; |
| 802 | BUG_ON(addr >= vma->vm_end); | ||
| 803 | BUG_ON(addr < vma->vm_start); | ||
| 804 | off = vma->vm_pgoff; | 837 | off = vma->vm_pgoff; |
| 805 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | 838 | off += (addr - vma->vm_start) >> PAGE_SHIFT; |
| 806 | nid = offset_il_node(pol, vma, off); | 839 | nid = offset_il_node(pol, vma, off); |
| @@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
| 878 | case MPOL_DEFAULT: | 911 | case MPOL_DEFAULT: |
| 879 | return 1; | 912 | return 1; |
| 880 | case MPOL_INTERLEAVE: | 913 | case MPOL_INTERLEAVE: |
| 881 | return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); | 914 | return nodes_equal(a->v.nodes, b->v.nodes); |
| 882 | case MPOL_PREFERRED: | 915 | case MPOL_PREFERRED: |
| 883 | return a->v.preferred_node == b->v.preferred_node; | 916 | return a->v.preferred_node == b->v.preferred_node; |
| 884 | case MPOL_BIND: { | 917 | case MPOL_BIND: { |
| @@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
| 1117 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", | 1150 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", |
| 1118 | vma->vm_pgoff, | 1151 | vma->vm_pgoff, |
| 1119 | sz, npol? npol->policy : -1, | 1152 | sz, npol? npol->policy : -1, |
| 1120 | npol ? npol->v.nodes[0] : -1); | 1153 | npol ? nodes_addr(npol->v.nodes)[0] : -1); |
| 1121 | 1154 | ||
| 1122 | if (npol) { | 1155 | if (npol) { |
| 1123 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 1156 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
| @@ -1164,14 +1197,12 @@ void __init numa_policy_init(void) | |||
| 1164 | /* Set interleaving policy for system init. This way not all | 1197 | /* Set interleaving policy for system init. This way not all |
| 1165 | the data structures allocated at system boot end up in node zero. */ | 1198 | the data structures allocated at system boot end up in node zero. */ |
| 1166 | 1199 | ||
| 1167 | if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), | 1200 | if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) |
| 1168 | MAX_NUMNODES) < 0) | ||
| 1169 | printk("numa_policy_init: interleaving failed\n"); | 1201 | printk("numa_policy_init: interleaving failed\n"); |
| 1170 | } | 1202 | } |
| 1171 | 1203 | ||
| 1172 | /* Reset policy of current process to default. | 1204 | /* Reset policy of current process to default */ |
| 1173 | * Assumes fs == KERNEL_DS */ | ||
| 1174 | void numa_default_policy(void) | 1205 | void numa_default_policy(void) |
| 1175 | { | 1206 | { |
| 1176 | sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); | 1207 | do_set_mempolicy(MPOL_DEFAULT, NULL); |
| 1177 | } | 1208 | } |
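numa_policy_init() uses the same do_set_mempolicy() path to interleave early boot allocations across every online node, and numa_default_policy() later drops back to MPOL_DEFAULT without the old "assumes fs == KERNEL_DS" requirement, since no user-space copy is involved any more. The user-space counterpart of that pair, assuming libnuma's v2 API (numa_all_nodes_ptr, link with -lnuma) and shown only as an illustration:

#include <numa.h>
#include <stdio.h>

int main(void)
{
        if (numa_available() < 0) {
                fprintf(stderr, "no NUMA support on this system\n");
                return 1;
        }
        /* Spread future allocations of this task across all allowed nodes. */
        numa_set_interleave_mask(numa_all_nodes_ptr);
        /* ... allocate and touch memory here ... */
        numa_set_localalloc();          /* back to the default local policy */
        return 0;
}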
