Diffstat (limited to 'mm')
| -rw-r--r-- | mm/memory.c | 117 |
| -rw-r--r-- | mm/mempolicy.c | 4 |
| -rw-r--r-- | mm/nommu.c | 245 |
| -rw-r--r-- | mm/page_alloc.c | 749 |
| -rw-r--r-- | mm/shmem.c | 4 |
| -rw-r--r-- | mm/slab.c | 126 |
| -rw-r--r-- | mm/slob.c | 3 |
| -rw-r--r-- | mm/truncate.c | 25 |
| -rw-r--r-- | mm/vmalloc.c | 30 |
| -rw-r--r-- | mm/vmscan.c | 30 |
| -rw-r--r-- | mm/vmstat.c | 3 |
11 files changed, 1136 insertions, 200 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 92a3ebd8d795..601159a46ab6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -2256,6 +2256,54 @@ oom: | |||
| 2256 | } | 2256 | } |
| 2257 | 2257 | ||
| 2258 | /* | 2258 | /* |
| 2259 | * do_no_pfn() tries to create a new page mapping for a page without | ||
| 2260 | * a struct page backing it | ||
| 2261 | * | ||
| 2262 | * As this is called only for pages that do not currently exist, we | ||
| 2263 | * do not need to flush old virtual caches or the TLB. | ||
| 2264 | * | ||
| 2265 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2266 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
| 2267 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2268 | * | ||
| 2269 | * It is expected that the ->nopfn handler always returns the same pfn | ||
| 2270 | * for a given virtual mapping. | ||
| 2271 | * | ||
| 2272 | * Mark this `noinline' to prevent it from bloating the main pagefault code. | ||
| 2273 | */ | ||
| 2274 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2275 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2276 | int write_access) | ||
| 2277 | { | ||
| 2278 | spinlock_t *ptl; | ||
| 2279 | pte_t entry; | ||
| 2280 | unsigned long pfn; | ||
| 2281 | int ret = VM_FAULT_MINOR; | ||
| 2282 | |||
| 2283 | pte_unmap(page_table); | ||
| 2284 | BUG_ON(!(vma->vm_flags & VM_PFNMAP)); | ||
| 2285 | BUG_ON(is_cow_mapping(vma->vm_flags)); | ||
| 2286 | |||
| 2287 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | ||
| 2288 | if (pfn == NOPFN_OOM) | ||
| 2289 | return VM_FAULT_OOM; | ||
| 2290 | if (pfn == NOPFN_SIGBUS) | ||
| 2291 | return VM_FAULT_SIGBUS; | ||
| 2292 | |||
| 2293 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2294 | |||
| 2295 | /* Only go through if we didn't race with anybody else... */ | ||
| 2296 | if (pte_none(*page_table)) { | ||
| 2297 | entry = pfn_pte(pfn, vma->vm_page_prot); | ||
| 2298 | if (write_access) | ||
| 2299 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2300 | set_pte_at(mm, address, page_table, entry); | ||
| 2301 | } | ||
| 2302 | pte_unmap_unlock(page_table, ptl); | ||
| 2303 | return ret; | ||
| 2304 | } | ||
| 2305 | |||
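
The ->nopfn hook that do_no_pfn() calls is meant to be supplied by drivers that map memory with no struct page behind it (VM_PFNMAP). As a hedged illustration only, not code from this patch, a hypothetical driver's handler could look like the sketch below; the foo_* names and device fields are invented, and the NOPFN_* return codes and the ->nopfn member of vm_operations_struct are assumed to come from the matching include/linux/mm.h change, which is outside this mm/ diff.

#include <linux/mm.h>

/* Hypothetical device state; not part of this patch */
struct foo_dev {
	unsigned long aperture_pfn;	/* first pfn of a device aperture */
	unsigned long aperture_pages;	/* aperture length in pages */
};

/* Return the pfn backing 'address'; must be stable for a given address */
static unsigned long foo_nopfn(struct vm_area_struct *vma,
			       unsigned long address)
{
	struct foo_dev *dev = vma->vm_private_data;
	unsigned long pgoff;

	pgoff = vma->vm_pgoff + ((address - vma->vm_start) >> PAGE_SHIFT);
	if (pgoff >= dev->aperture_pages)
		return NOPFN_SIGBUS;	/* fault outside the aperture */

	return dev->aperture_pfn + pgoff;
}

static struct vm_operations_struct foo_vm_ops = {
	.nopfn	= foo_nopfn,
};

/* foo_mmap() would set vma->vm_flags |= VM_PFNMAP and vma->vm_ops = &foo_vm_ops */

do_no_pfn() relies on the returned pfn being stable for a given address, which is why the handler derives it purely from vm_pgoff and the faulting offset.
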
| 2306 | /* | ||
| 2259 | * Fault of a previously existing named mapping. Repopulate the pte | 2307 | * Fault of a previously existing named mapping. Repopulate the pte |
| 2260 | * from the encoded file_pte if possible. This enables swappable | 2308 | * from the encoded file_pte if possible. This enables swappable |
| 2261 | * nonlinear vmas. | 2309 | * nonlinear vmas. |
| @@ -2317,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2317 | old_entry = entry = *pte; | 2365 | old_entry = entry = *pte; |
| 2318 | if (!pte_present(entry)) { | 2366 | if (!pte_present(entry)) { |
| 2319 | if (pte_none(entry)) { | 2367 | if (pte_none(entry)) { |
| 2320 | if (!vma->vm_ops || !vma->vm_ops->nopage) | 2368 | if (vma->vm_ops) { |
| 2321 | return do_anonymous_page(mm, vma, address, | 2369 | if (vma->vm_ops->nopage) |
| 2322 | pte, pmd, write_access); | 2370 | return do_no_page(mm, vma, address, |
| 2323 | return do_no_page(mm, vma, address, | 2371 | pte, pmd, |
| 2324 | pte, pmd, write_access); | 2372 | write_access); |
| 2373 | if (unlikely(vma->vm_ops->nopfn)) | ||
| 2374 | return do_no_pfn(mm, vma, address, pte, | ||
| 2375 | pmd, write_access); | ||
| 2376 | } | ||
| 2377 | return do_anonymous_page(mm, vma, address, | ||
| 2378 | pte, pmd, write_access); | ||
| 2325 | } | 2379 | } |
| 2326 | if (pte_file(entry)) | 2380 | if (pte_file(entry)) |
| 2327 | return do_file_page(mm, vma, address, | 2381 | return do_file_page(mm, vma, address, |
| @@ -2550,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr) | |||
| 2550 | } | 2604 | } |
| 2551 | 2605 | ||
| 2552 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2606 | #endif /* __HAVE_ARCH_GATE_AREA */ |
| 2607 | |||
| 2608 | /* | ||
| 2609 | * Access another process' address space. | ||
| 2610 | * Source/target buffer must be kernel space, | ||
| 2611 | * Do not walk the page table directly, use get_user_pages | ||
| 2612 | */ | ||
| 2613 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
| 2614 | { | ||
| 2615 | struct mm_struct *mm; | ||
| 2616 | struct vm_area_struct *vma; | ||
| 2617 | struct page *page; | ||
| 2618 | void *old_buf = buf; | ||
| 2619 | |||
| 2620 | mm = get_task_mm(tsk); | ||
| 2621 | if (!mm) | ||
| 2622 | return 0; | ||
| 2623 | |||
| 2624 | down_read(&mm->mmap_sem); | ||
| 2625 | /* ignore errors, just check how much was sucessfully transfered */ | ||
| 2626 | while (len) { | ||
| 2627 | int bytes, ret, offset; | ||
| 2628 | void *maddr; | ||
| 2629 | |||
| 2630 | ret = get_user_pages(tsk, mm, addr, 1, | ||
| 2631 | write, 1, &page, &vma); | ||
| 2632 | if (ret <= 0) | ||
| 2633 | break; | ||
| 2634 | |||
| 2635 | bytes = len; | ||
| 2636 | offset = addr & (PAGE_SIZE-1); | ||
| 2637 | if (bytes > PAGE_SIZE-offset) | ||
| 2638 | bytes = PAGE_SIZE-offset; | ||
| 2639 | |||
| 2640 | maddr = kmap(page); | ||
| 2641 | if (write) { | ||
| 2642 | copy_to_user_page(vma, page, addr, | ||
| 2643 | maddr + offset, buf, bytes); | ||
| 2644 | set_page_dirty_lock(page); | ||
| 2645 | } else { | ||
| 2646 | copy_from_user_page(vma, page, addr, | ||
| 2647 | buf, maddr + offset, bytes); | ||
| 2648 | } | ||
| 2649 | kunmap(page); | ||
| 2650 | page_cache_release(page); | ||
| 2651 | len -= bytes; | ||
| 2652 | buf += bytes; | ||
| 2653 | addr += bytes; | ||
| 2654 | } | ||
| 2655 | up_read(&mm->mmap_sem); | ||
| 2656 | mmput(mm); | ||
| 2657 | |||
| 2658 | return buf - old_buf; | ||
| 2659 | } | ||
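
access_process_vm() is the kernel-side primitive behind ptrace-style access to another task's memory. As a rough userspace illustration (an assumption about how it is consumed, not part of this patch), a tool can read a stopped, ptrace-attached target through /proc/<pid>/mem; such reads end up pinning the target's pages with get_user_pages() much as the function above does.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64], buf[64];

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-address>\n", argv[0]);
		return 1;
	}

	unsigned long addr = strtoul(argv[2], NULL, 16);

	snprintf(path, sizeof(path), "/proc/%s/mem", argv[1]);
	int fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* read 64 bytes of the target's address space at 'addr' */
	ssize_t n = pread(fd, buf, sizeof(buf), (off_t)addr);
	if (n < 0)
		perror("pread");
	else
		printf("read %zd bytes at %#lx\n", n, addr);

	close(fd);
	return 0;
}
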
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38f89650bc84..cf18f0942553 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -1136,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
| 1136 | */ | 1136 | */ |
| 1137 | unsigned slab_node(struct mempolicy *policy) | 1137 | unsigned slab_node(struct mempolicy *policy) |
| 1138 | { | 1138 | { |
| 1139 | switch (policy->policy) { | 1139 | int pol = policy ? policy->policy : MPOL_DEFAULT; |
| 1140 | |||
| 1141 | switch (pol) { | ||
| 1140 | case MPOL_INTERLEAVE: | 1142 | case MPOL_INTERLEAVE: |
| 1141 | return interleave_nodes(policy); | 1143 | return interleave_nodes(policy); |
| 1142 | 1144 | ||
diff --git a/mm/nommu.c b/mm/nommu.c
index d99dea31e443..564540662192 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
| @@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp) | |||
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | /* | 124 | /* |
| 125 | * The nommu dodgy version :-) | 125 | * get a list of pages in an address range belonging to the specified process |
| 126 | * and indicate the VMA that covers each page | ||
| 127 | * - this is potentially dodgy as we may end up incrementing the page count of a | ||
| 128 | * slab page or a secondary page from a compound page | ||
| 129 | * - don't permit access to VMAs that don't support it, such as I/O mappings | ||
| 126 | */ | 130 | */ |
| 127 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 131 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 128 | unsigned long start, int len, int write, int force, | 132 | unsigned long start, int len, int write, int force, |
| 129 | struct page **pages, struct vm_area_struct **vmas) | 133 | struct page **pages, struct vm_area_struct **vmas) |
| 130 | { | 134 | { |
| 135 | struct vm_area_struct *vma; | ||
| 136 | unsigned long vm_flags; | ||
| 131 | int i; | 137 | int i; |
| 132 | static struct vm_area_struct dummy_vma; | 138 | |
| 139 | /* calculate required read or write permissions. | ||
| 140 | * - if 'force' is set, we only require the "MAY" flags. | ||
| 141 | */ | ||
| 142 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | ||
| 143 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | ||
| 133 | 144 | ||
| 134 | for (i = 0; i < len; i++) { | 145 | for (i = 0; i < len; i++) { |
| 146 | vma = find_vma(mm, start); | ||
| 147 | if (!vma) | ||
| 148 | goto finish_or_fault; | ||
| 149 | |||
| 150 | /* protect what we can, including chardevs */ | ||
| 151 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | ||
| 152 | !(vm_flags & vma->vm_flags)) | ||
| 153 | goto finish_or_fault; | ||
| 154 | |||
| 135 | if (pages) { | 155 | if (pages) { |
| 136 | pages[i] = virt_to_page(start); | 156 | pages[i] = virt_to_page(start); |
| 137 | if (pages[i]) | 157 | if (pages[i]) |
| 138 | page_cache_get(pages[i]); | 158 | page_cache_get(pages[i]); |
| 139 | } | 159 | } |
| 140 | if (vmas) | 160 | if (vmas) |
| 141 | vmas[i] = &dummy_vma; | 161 | vmas[i] = vma; |
| 142 | start += PAGE_SIZE; | 162 | start += PAGE_SIZE; |
| 143 | } | 163 | } |
| 144 | return(i); | 164 | |
| 165 | return i; | ||
| 166 | |||
| 167 | finish_or_fault: | ||
| 168 | return i ? : -EFAULT; | ||
| 145 | } | 169 | } |
| 146 | 170 | ||
| 147 | EXPORT_SYMBOL(get_user_pages); | 171 | EXPORT_SYMBOL(get_user_pages); |
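
The permission mask at the top of the rewritten get_user_pages() is built in two steps: first the flags the caller needs (READ/MAYREAD or WRITE/MAYWRITE), then, if 'force' is set, the VM_READ/VM_WRITE requirement is dropped so only the MAY flags must be present in the VMA. A userspace model of just that calculation, with stand-in flag values chosen for illustration:

#include <stdio.h>

#define VM_READ     0x0001
#define VM_WRITE    0x0002
#define VM_MAYREAD  0x0010
#define VM_MAYWRITE 0x0020

static unsigned long gup_required_flags(int write, int force)
{
	unsigned long vm_flags;

	/* required read or write permission */
	vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	/* with 'force', only the "MAY" half of the pair is required */
	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	return vm_flags;
}

int main(void)
{
	for (int write = 0; write <= 1; write++)
		for (int force = 0; force <= 1; force++)
			printf("write=%d force=%d -> required 0x%02lx\n",
			       write, force, gup_required_flags(write, force));
	return 0;
}

Running it shows, for example, that write=0, force=1 only requires VM_MAYREAD in the target VMA.
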
| @@ -286,6 +310,77 @@ static void show_process_blocks(void) | |||
| 286 | } | 310 | } |
| 287 | #endif /* DEBUG */ | 311 | #endif /* DEBUG */ |
| 288 | 312 | ||
| 313 | /* | ||
| 314 | * add a VMA into a process's mm_struct in the appropriate place in the list | ||
| 315 | * - should be called with mm->mmap_sem held writelocked | ||
| 316 | */ | ||
| 317 | static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) | ||
| 318 | { | ||
| 319 | struct vm_list_struct **ppv; | ||
| 320 | |||
| 321 | for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) | ||
| 322 | if ((*ppv)->vma->vm_start > vml->vma->vm_start) | ||
| 323 | break; | ||
| 324 | |||
| 325 | vml->next = *ppv; | ||
| 326 | *ppv = vml; | ||
| 327 | } | ||
| 328 | |||
| 329 | /* | ||
| 330 | * look up the first VMA in which addr resides, NULL if none | ||
| 331 | * - should be called with mm->mmap_sem at least held readlocked | ||
| 332 | */ | ||
| 333 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
| 334 | { | ||
| 335 | struct vm_list_struct *loop, *vml; | ||
| 336 | |||
| 337 | /* search the vm_start ordered list */ | ||
| 338 | vml = NULL; | ||
| 339 | for (loop = mm->context.vmlist; loop; loop = loop->next) { | ||
| 340 | if (loop->vma->vm_start > addr) | ||
| 341 | break; | ||
| 342 | vml = loop; | ||
| 343 | } | ||
| 344 | |||
| 345 | if (vml && vml->vma->vm_end > addr) | ||
| 346 | return vml->vma; | ||
| 347 | |||
| 348 | return NULL; | ||
| 349 | } | ||
| 350 | EXPORT_SYMBOL(find_vma); | ||
| 351 | |||
| 352 | /* | ||
| 353 | * find a VMA | ||
| 354 | * - we don't extend stack VMAs under NOMMU conditions | ||
| 355 | */ | ||
| 356 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
| 357 | { | ||
| 358 | return find_vma(mm, addr); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * look up the first VMA that exactly matches addr | ||
| 363 | * - should be called with mm->mmap_sem at least held readlocked | ||
| 364 | */ | ||
| 365 | static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | ||
| 366 | unsigned long addr) | ||
| 367 | { | ||
| 368 | struct vm_list_struct *vml; | ||
| 369 | |||
| 370 | /* search the vm_start ordered list */ | ||
| 371 | for (vml = mm->context.vmlist; vml; vml = vml->next) { | ||
| 372 | if (vml->vma->vm_start == addr) | ||
| 373 | return vml->vma; | ||
| 374 | if (vml->vma->vm_start > addr) | ||
| 375 | break; | ||
| 376 | } | ||
| 377 | |||
| 378 | return NULL; | ||
| 379 | } | ||
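
add_vma_to_mm(), find_vma() and find_vma_exact() above all operate on a singly linked list of vm_list_struct entries kept sorted by vm_start, which is what lets each walk stop as soon as vm_start passes the target address. A compact userspace model of the same ordered insert and covering lookup (the types and names are illustrative stand-ins, not the kernel structures):

#include <stdio.h>

struct region {			/* stand-in for vm_area_struct */
	unsigned long start, end;
};

struct node {			/* stand-in for vm_list_struct */
	struct region *r;
	struct node *next;
};

/* insert keeping the list sorted by start address */
static void add_region(struct node **head, struct node *n)
{
	struct node **pp;

	for (pp = head; *pp; pp = &(*pp)->next)
		if ((*pp)->r->start > n->r->start)
			break;
	n->next = *pp;
	*pp = n;
}

/* return the region covering addr, or NULL; stops once start > addr */
static struct region *find_region(struct node *head, unsigned long addr)
{
	struct node *best = NULL;

	for (struct node *p = head; p; p = p->next) {
		if (p->r->start > addr)
			break;
		best = p;
	}
	return (best && best->r->end > addr) ? best->r : NULL;
}

int main(void)
{
	struct region a = { 0x1000, 0x3000 }, b = { 0x8000, 0x9000 };
	struct node na = { &a, NULL }, nb = { &b, NULL }, *head = NULL;

	add_region(&head, &nb);
	add_region(&head, &na);	/* inserted before nb to keep the order */
	printf("0x2000 covered: %s\n", find_region(head, 0x2000) ? "yes" : "no");
	printf("0x4000 covered: %s\n", find_region(head, 0x4000) ? "yes" : "no");
	return 0;
}
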
| 380 | |||
| 381 | /* | ||
| 382 | * find a VMA in the global tree | ||
| 383 | */ | ||
| 289 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | 384 | static inline struct vm_area_struct *find_nommu_vma(unsigned long start) |
| 290 | { | 385 | { |
| 291 | struct vm_area_struct *vma; | 386 | struct vm_area_struct *vma; |
| @@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) | |||
| 305 | return NULL; | 400 | return NULL; |
| 306 | } | 401 | } |
| 307 | 402 | ||
| 403 | /* | ||
| 404 | * add a VMA in the global tree | ||
| 405 | */ | ||
| 308 | static void add_nommu_vma(struct vm_area_struct *vma) | 406 | static void add_nommu_vma(struct vm_area_struct *vma) |
| 309 | { | 407 | { |
| 310 | struct vm_area_struct *pvma; | 408 | struct vm_area_struct *pvma; |
| @@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma) | |||
| 351 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); | 449 | rb_insert_color(&vma->vm_rb, &nommu_vma_tree); |
| 352 | } | 450 | } |
| 353 | 451 | ||
| 452 | /* | ||
| 453 | * delete a VMA from the global list | ||
| 454 | */ | ||
| 354 | static void delete_nommu_vma(struct vm_area_struct *vma) | 455 | static void delete_nommu_vma(struct vm_area_struct *vma) |
| 355 | { | 456 | { |
| 356 | struct address_space *mapping; | 457 | struct address_space *mapping; |
| @@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 828 | realalloc += kobjsize(vml); | 929 | realalloc += kobjsize(vml); |
| 829 | askedalloc += sizeof(*vml); | 930 | askedalloc += sizeof(*vml); |
| 830 | 931 | ||
| 831 | vml->next = current->mm->context.vmlist; | 932 | add_vma_to_mm(current->mm, vml); |
| 832 | current->mm->context.vmlist = vml; | ||
| 833 | 933 | ||
| 834 | up_write(&nommu_vma_sem); | 934 | up_write(&nommu_vma_sem); |
| 835 | 935 | ||
| @@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma) | |||
| 908 | } | 1008 | } |
| 909 | } | 1009 | } |
| 910 | 1010 | ||
| 1011 | /* | ||
| 1012 | * release a mapping | ||
| 1013 | * - under NOMMU conditions the parameters must exactly match the mapping to | ||
| 1014 | * be removed | ||
| 1015 | */ | ||
| 911 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | 1016 | int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) |
| 912 | { | 1017 | { |
| 913 | struct vm_list_struct *vml, **parent; | 1018 | struct vm_list_struct *vml, **parent; |
| @@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
| 917 | printk("do_munmap:\n"); | 1022 | printk("do_munmap:\n"); |
| 918 | #endif | 1023 | #endif |
| 919 | 1024 | ||
| 920 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) | 1025 | for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { |
| 1026 | if ((*parent)->vma->vm_start > addr) | ||
| 1027 | break; | ||
| 921 | if ((*parent)->vma->vm_start == addr && | 1028 | if ((*parent)->vma->vm_start == addr && |
| 922 | ((len == 0) || ((*parent)->vma->vm_end == end))) | 1029 | ((len == 0) || ((*parent)->vma->vm_end == end))) |
| 923 | goto found; | 1030 | goto found; |
| 1031 | } | ||
| 924 | 1032 | ||
| 925 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", | 1033 | printk("munmap of non-mmaped memory by process %d (%s): %p\n", |
| 926 | current->pid, current->comm, (void *) addr); | 1034 | current->pid, current->comm, (void *) addr); |
| @@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
| 946 | return 0; | 1054 | return 0; |
| 947 | } | 1055 | } |
| 948 | 1056 | ||
| 949 | /* Release all mmaps. */ | 1057 | asmlinkage long sys_munmap(unsigned long addr, size_t len) |
| 1058 | { | ||
| 1059 | int ret; | ||
| 1060 | struct mm_struct *mm = current->mm; | ||
| 1061 | |||
| 1062 | down_write(&mm->mmap_sem); | ||
| 1063 | ret = do_munmap(mm, addr, len); | ||
| 1064 | up_write(&mm->mmap_sem); | ||
| 1065 | return ret; | ||
| 1066 | } | ||
| 1067 | |||
| 1068 | /* | ||
| 1069 | * Release all mappings | ||
| 1070 | */ | ||
| 950 | void exit_mmap(struct mm_struct * mm) | 1071 | void exit_mmap(struct mm_struct * mm) |
| 951 | { | 1072 | { |
| 952 | struct vm_list_struct *tmp; | 1073 | struct vm_list_struct *tmp; |
| @@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm) | |||
| 973 | } | 1094 | } |
| 974 | } | 1095 | } |
| 975 | 1096 | ||
| 976 | asmlinkage long sys_munmap(unsigned long addr, size_t len) | ||
| 977 | { | ||
| 978 | int ret; | ||
| 979 | struct mm_struct *mm = current->mm; | ||
| 980 | |||
| 981 | down_write(&mm->mmap_sem); | ||
| 982 | ret = do_munmap(mm, addr, len); | ||
| 983 | up_write(&mm->mmap_sem); | ||
| 984 | return ret; | ||
| 985 | } | ||
| 986 | |||
| 987 | unsigned long do_brk(unsigned long addr, unsigned long len) | 1097 | unsigned long do_brk(unsigned long addr, unsigned long len) |
| 988 | { | 1098 | { |
| 989 | return -ENOMEM; | 1099 | return -ENOMEM; |
| 990 | } | 1100 | } |
| 991 | 1101 | ||
| 992 | /* | 1102 | /* |
| 993 | * Expand (or shrink) an existing mapping, potentially moving it at the | 1103 | * expand (or shrink) an existing mapping, potentially moving it at the same |
| 994 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 1104 | * time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
| 995 | * | 1105 | * |
| 996 | * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise | 1106 | * under NOMMU conditions, we only permit changing a mapping's size, and only |
| 997 | * This option implies MREMAP_MAYMOVE. | 1107 | * as long as it stays within the hole allocated by the kmalloc() call in |
| 1108 | * do_mmap_pgoff() and the block is not shareable | ||
| 998 | * | 1109 | * |
| 999 | * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the | 1110 | * MREMAP_FIXED is not supported under NOMMU conditions |
| 1000 | * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable | ||
| 1001 | */ | 1111 | */ |
| 1002 | unsigned long do_mremap(unsigned long addr, | 1112 | unsigned long do_mremap(unsigned long addr, |
| 1003 | unsigned long old_len, unsigned long new_len, | 1113 | unsigned long old_len, unsigned long new_len, |
| 1004 | unsigned long flags, unsigned long new_addr) | 1114 | unsigned long flags, unsigned long new_addr) |
| 1005 | { | 1115 | { |
| 1006 | struct vm_list_struct *vml = NULL; | 1116 | struct vm_area_struct *vma; |
| 1007 | 1117 | ||
| 1008 | /* insanity checks first */ | 1118 | /* insanity checks first */ |
| 1009 | if (new_len == 0) | 1119 | if (new_len == 0) |
| @@ -1012,58 +1122,46 @@ unsigned long do_mremap(unsigned long addr, | |||
| 1012 | if (flags & MREMAP_FIXED && new_addr != addr) | 1122 | if (flags & MREMAP_FIXED && new_addr != addr) |
| 1013 | return (unsigned long) -EINVAL; | 1123 | return (unsigned long) -EINVAL; |
| 1014 | 1124 | ||
| 1015 | for (vml = current->mm->context.vmlist; vml; vml = vml->next) | 1125 | vma = find_vma_exact(current->mm, addr); |
| 1016 | if (vml->vma->vm_start == addr) | 1126 | if (!vma) |
| 1017 | goto found; | 1127 | return (unsigned long) -EINVAL; |
| 1018 | |||
| 1019 | return (unsigned long) -EINVAL; | ||
| 1020 | 1128 | ||
| 1021 | found: | 1129 | if (vma->vm_end != vma->vm_start + old_len) |
| 1022 | if (vml->vma->vm_end != vml->vma->vm_start + old_len) | ||
| 1023 | return (unsigned long) -EFAULT; | 1130 | return (unsigned long) -EFAULT; |
| 1024 | 1131 | ||
| 1025 | if (vml->vma->vm_flags & VM_MAYSHARE) | 1132 | if (vma->vm_flags & VM_MAYSHARE) |
| 1026 | return (unsigned long) -EPERM; | 1133 | return (unsigned long) -EPERM; |
| 1027 | 1134 | ||
| 1028 | if (new_len > kobjsize((void *) addr)) | 1135 | if (new_len > kobjsize((void *) addr)) |
| 1029 | return (unsigned long) -ENOMEM; | 1136 | return (unsigned long) -ENOMEM; |
| 1030 | 1137 | ||
| 1031 | /* all checks complete - do it */ | 1138 | /* all checks complete - do it */ |
| 1032 | vml->vma->vm_end = vml->vma->vm_start + new_len; | 1139 | vma->vm_end = vma->vm_start + new_len; |
| 1033 | 1140 | ||
| 1034 | askedalloc -= old_len; | 1141 | askedalloc -= old_len; |
| 1035 | askedalloc += new_len; | 1142 | askedalloc += new_len; |
| 1036 | 1143 | ||
| 1037 | return vml->vma->vm_start; | 1144 | return vma->vm_start; |
| 1038 | } | 1145 | } |
| 1039 | 1146 | ||
| 1040 | /* | 1147 | asmlinkage unsigned long sys_mremap(unsigned long addr, |
| 1041 | * Look up the first VMA which satisfies addr < vm_end, NULL if none | 1148 | unsigned long old_len, unsigned long new_len, |
| 1042 | */ | 1149 | unsigned long flags, unsigned long new_addr) |
| 1043 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | ||
| 1044 | { | 1150 | { |
| 1045 | struct vm_list_struct *vml; | 1151 | unsigned long ret; |
| 1046 | |||
| 1047 | for (vml = mm->context.vmlist; vml; vml = vml->next) | ||
| 1048 | if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) | ||
| 1049 | return vml->vma; | ||
| 1050 | 1152 | ||
| 1051 | return NULL; | 1153 | down_write(¤t->mm->mmap_sem); |
| 1154 | ret = do_mremap(addr, old_len, new_len, flags, new_addr); | ||
| 1155 | up_write(¤t->mm->mmap_sem); | ||
| 1156 | return ret; | ||
| 1052 | } | 1157 | } |
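
Under NOMMU, the do_mremap() above only ever resizes a mapping in place: the address must exactly match an existing mapping, the mapping must not be shareable, MREMAP_FIXED and moving are refused, and the new length has to fit inside the kmalloc() block backing the mapping. A small userspace illustration of the one permitted form of the call (it also runs on MMU kernels, where mremap() is more permissive):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4 * 4096, new_len = 2 * 4096;
	void *p, *q;

	/* private anonymous mapping; under NOMMU this memory comes from the
	 * kmalloc()-backed allocator used by do_mmap_pgoff() */
	p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* in-place resize: no MREMAP_MAYMOVE, no MREMAP_FIXED, which is the
	 * only form do_mremap() accepts under NOMMU */
	q = mremap(p, old_len, new_len, 0);
	if (q == MAP_FAILED) {
		perror("mremap");
		munmap(p, old_len);
		return 1;
	}

	printf("resized mapping in place at %p\n", q);
	munmap(q, new_len);
	return 0;
}
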
| 1053 | 1158 | ||
| 1054 | EXPORT_SYMBOL(find_vma); | ||
| 1055 | |||
| 1056 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1159 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, |
| 1057 | unsigned int foll_flags) | 1160 | unsigned int foll_flags) |
| 1058 | { | 1161 | { |
| 1059 | return NULL; | 1162 | return NULL; |
| 1060 | } | 1163 | } |
| 1061 | 1164 | ||
| 1062 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | ||
| 1063 | { | ||
| 1064 | return NULL; | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | 1165 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, |
| 1068 | unsigned long to, unsigned long size, pgprot_t prot) | 1166 | unsigned long to, unsigned long size, pgprot_t prot) |
| 1069 | { | 1167 | { |
| @@ -1206,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area, | |||
| 1206 | BUG(); | 1304 | BUG(); |
| 1207 | return NULL; | 1305 | return NULL; |
| 1208 | } | 1306 | } |
| 1307 | |||
| 1308 | /* | ||
| 1309 | * Access another process' address space. | ||
| 1310 | * - source/target buffer must be kernel space | ||
| 1311 | */ | ||
| 1312 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
| 1313 | { | ||
| 1314 | struct vm_area_struct *vma; | ||
| 1315 | struct mm_struct *mm; | ||
| 1316 | |||
| 1317 | if (addr + len < addr) | ||
| 1318 | return 0; | ||
| 1319 | |||
| 1320 | mm = get_task_mm(tsk); | ||
| 1321 | if (!mm) | ||
| 1322 | return 0; | ||
| 1323 | |||
| 1324 | down_read(&mm->mmap_sem); | ||
| 1325 | |||
| 1326 | /* the access must start within one of the target process's mappings */ | ||
| 1327 | vma = find_vma(mm, addr); | ||
| 1328 | if (vma) { | ||
| 1329 | /* don't overrun this mapping */ | ||
| 1330 | if (addr + len >= vma->vm_end) | ||
| 1331 | len = vma->vm_end - addr; | ||
| 1332 | |||
| 1333 | /* only read or write mappings where it is permitted */ | ||
| 1334 | if (write && vma->vm_flags & VM_MAYWRITE) | ||
| 1335 | len -= copy_to_user((void *) addr, buf, len); | ||
| 1336 | else if (!write && vma->vm_flags & VM_MAYREAD) | ||
| 1337 | len -= copy_from_user(buf, (void *) addr, len); | ||
| 1338 | else | ||
| 1339 | len = 0; | ||
| 1340 | } else { | ||
| 1341 | len = 0; | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | up_read(&mm->mmap_sem); | ||
| 1345 | mmput(mm); | ||
| 1346 | return len; | ||
| 1347 | } | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..4f59d90b81e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -37,6 +37,8 @@ | |||
| 37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
| 38 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
| 39 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> |
| 40 | #include <linux/sort.h> | ||
| 41 | #include <linux/pfn.h> | ||
| 40 | 42 | ||
| 41 | #include <asm/tlbflush.h> | 43 | #include <asm/tlbflush.h> |
| 42 | #include <asm/div64.h> | 44 | #include <asm/div64.h> |
| @@ -102,6 +104,38 @@ int min_free_kbytes = 1024; | |||
| 102 | 104 | ||
| 103 | unsigned long __meminitdata nr_kernel_pages; | 105 | unsigned long __meminitdata nr_kernel_pages; |
| 104 | unsigned long __meminitdata nr_all_pages; | 106 | unsigned long __meminitdata nr_all_pages; |
| 107 | static unsigned long __initdata dma_reserve; | ||
| 108 | |||
| 109 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 110 | /* | ||
| 111 | * MAX_ACTIVE_REGIONS determines the maximum number of distinct | ||
| 112 | * ranges of memory (RAM) that may be registered with add_active_range(). | ||
| 113 | * Ranges passed to add_active_range() will be merged if possible | ||
| 114 | * so the number of times add_active_range() can be called is | ||
| 115 | * related to the number of nodes and the number of holes | ||
| 116 | */ | ||
| 117 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | ||
| 118 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | ||
| 119 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | ||
| 120 | #else | ||
| 121 | #if MAX_NUMNODES >= 32 | ||
| 122 | /* If there can be many nodes, allow up to 50 holes per node */ | ||
| 123 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | ||
| 124 | #else | ||
| 125 | /* By default, allow up to 256 distinct regions */ | ||
| 126 | #define MAX_ACTIVE_REGIONS 256 | ||
| 127 | #endif | ||
| 128 | #endif | ||
| 129 | |||
| 130 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | ||
| 131 | int __initdata nr_nodemap_entries; | ||
| 132 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | ||
| 133 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | ||
| 134 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 135 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
| 136 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
| 137 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
| 138 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
| 105 | 139 | ||
| 106 | #ifdef CONFIG_DEBUG_VM | 140 | #ifdef CONFIG_DEBUG_VM |
| 107 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 141 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| @@ -908,7 +942,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
| 908 | */ | 942 | */ |
| 909 | do { | 943 | do { |
| 910 | zone = *z; | 944 | zone = *z; |
| 911 | if (unlikely((gfp_mask & __GFP_THISNODE) && | 945 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
| 912 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 946 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
| 913 | break; | 947 | break; |
| 914 | if ((alloc_flags & ALLOC_CPUSET) && | 948 | if ((alloc_flags & ALLOC_CPUSET) && |
| @@ -1222,14 +1256,12 @@ unsigned int nr_free_pagecache_pages(void) | |||
| 1222 | { | 1256 | { |
| 1223 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1257 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); |
| 1224 | } | 1258 | } |
| 1225 | #ifdef CONFIG_NUMA | 1259 | |
| 1226 | static void show_node(struct zone *zone) | 1260 | static inline void show_node(struct zone *zone) |
| 1227 | { | 1261 | { |
| 1228 | printk("Node %ld ", zone_to_nid(zone)); | 1262 | if (NUMA_BUILD) |
| 1263 | printk("Node %ld ", zone_to_nid(zone)); | ||
| 1229 | } | 1264 | } |
| 1230 | #else | ||
| 1231 | #define show_node(zone) do { } while (0) | ||
| 1232 | #endif | ||
| 1233 | 1265 | ||
| 1234 | void si_meminfo(struct sysinfo *val) | 1266 | void si_meminfo(struct sysinfo *val) |
| 1235 | { | 1267 | { |
| @@ -1271,34 +1303,30 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
| 1271 | */ | 1303 | */ |
| 1272 | void show_free_areas(void) | 1304 | void show_free_areas(void) |
| 1273 | { | 1305 | { |
| 1274 | int cpu, temperature; | 1306 | int cpu; |
| 1275 | unsigned long active; | 1307 | unsigned long active; |
| 1276 | unsigned long inactive; | 1308 | unsigned long inactive; |
| 1277 | unsigned long free; | 1309 | unsigned long free; |
| 1278 | struct zone *zone; | 1310 | struct zone *zone; |
| 1279 | 1311 | ||
| 1280 | for_each_zone(zone) { | 1312 | for_each_zone(zone) { |
| 1281 | show_node(zone); | 1313 | if (!populated_zone(zone)) |
| 1282 | printk("%s per-cpu:", zone->name); | ||
| 1283 | |||
| 1284 | if (!populated_zone(zone)) { | ||
| 1285 | printk(" empty\n"); | ||
| 1286 | continue; | 1314 | continue; |
| 1287 | } else | 1315 | |
| 1288 | printk("\n"); | 1316 | show_node(zone); |
| 1317 | printk("%s per-cpu:\n", zone->name); | ||
| 1289 | 1318 | ||
| 1290 | for_each_online_cpu(cpu) { | 1319 | for_each_online_cpu(cpu) { |
| 1291 | struct per_cpu_pageset *pageset; | 1320 | struct per_cpu_pageset *pageset; |
| 1292 | 1321 | ||
| 1293 | pageset = zone_pcp(zone, cpu); | 1322 | pageset = zone_pcp(zone, cpu); |
| 1294 | 1323 | ||
| 1295 | for (temperature = 0; temperature < 2; temperature++) | 1324 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " |
| 1296 | printk("cpu %d %s: high %d, batch %d used:%d\n", | 1325 | "Cold: hi:%5d, btch:%4d usd:%4d\n", |
| 1297 | cpu, | 1326 | cpu, pageset->pcp[0].high, |
| 1298 | temperature ? "cold" : "hot", | 1327 | pageset->pcp[0].batch, pageset->pcp[0].count, |
| 1299 | pageset->pcp[temperature].high, | 1328 | pageset->pcp[1].high, pageset->pcp[1].batch, |
| 1300 | pageset->pcp[temperature].batch, | 1329 | pageset->pcp[1].count); |
| 1301 | pageset->pcp[temperature].count); | ||
| 1302 | } | 1330 | } |
| 1303 | } | 1331 | } |
| 1304 | 1332 | ||
| @@ -1320,6 +1348,9 @@ void show_free_areas(void) | |||
| 1320 | for_each_zone(zone) { | 1348 | for_each_zone(zone) { |
| 1321 | int i; | 1349 | int i; |
| 1322 | 1350 | ||
| 1351 | if (!populated_zone(zone)) | ||
| 1352 | continue; | ||
| 1353 | |||
| 1323 | show_node(zone); | 1354 | show_node(zone); |
| 1324 | printk("%s" | 1355 | printk("%s" |
| 1325 | " free:%lukB" | 1356 | " free:%lukB" |
| @@ -1352,12 +1383,11 @@ void show_free_areas(void) | |||
| 1352 | for_each_zone(zone) { | 1383 | for_each_zone(zone) { |
| 1353 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1384 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
| 1354 | 1385 | ||
| 1386 | if (!populated_zone(zone)) | ||
| 1387 | continue; | ||
| 1388 | |||
| 1355 | show_node(zone); | 1389 | show_node(zone); |
| 1356 | printk("%s: ", zone->name); | 1390 | printk("%s: ", zone->name); |
| 1357 | if (!populated_zone(zone)) { | ||
| 1358 | printk("empty\n"); | ||
| 1359 | continue; | ||
| 1360 | } | ||
| 1361 | 1391 | ||
| 1362 | spin_lock_irqsave(&zone->lock, flags); | 1392 | spin_lock_irqsave(&zone->lock, flags); |
| 1363 | for (order = 0; order < MAX_ORDER; order++) { | 1393 | for (order = 0; order < MAX_ORDER; order++) { |
| @@ -1561,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy) | |||
| 1561 | void __meminit build_all_zonelists(void) | 1591 | void __meminit build_all_zonelists(void) |
| 1562 | { | 1592 | { |
| 1563 | if (system_state == SYSTEM_BOOTING) { | 1593 | if (system_state == SYSTEM_BOOTING) { |
| 1564 | __build_all_zonelists(0); | 1594 | __build_all_zonelists(NULL); |
| 1565 | cpuset_init_current_mems_allowed(); | 1595 | cpuset_init_current_mems_allowed(); |
| 1566 | } else { | 1596 | } else { |
| 1567 | /* we have to stop all cpus to guarantee there is no user | 1597 | /* we have to stop all cpus to guarantee there is no user |
| @@ -1642,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
| 1642 | 1672 | ||
| 1643 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1673 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
| 1644 | 1674 | ||
| 1645 | static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | ||
| 1646 | unsigned long *zones_size, unsigned long *zholes_size) | ||
| 1647 | { | ||
| 1648 | unsigned long realtotalpages, totalpages = 0; | ||
| 1649 | enum zone_type i; | ||
| 1650 | |||
| 1651 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1652 | totalpages += zones_size[i]; | ||
| 1653 | pgdat->node_spanned_pages = totalpages; | ||
| 1654 | |||
| 1655 | realtotalpages = totalpages; | ||
| 1656 | if (zholes_size) | ||
| 1657 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 1658 | realtotalpages -= zholes_size[i]; | ||
| 1659 | pgdat->node_present_pages = realtotalpages; | ||
| 1660 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); | ||
| 1661 | } | ||
| 1662 | |||
| 1663 | |||
| 1664 | /* | 1675 | /* |
| 1665 | * Initially all pages are reserved - free ones are freed | 1676 | * Initially all pages are reserved - free ones are freed |
| 1666 | * up by free_all_bootmem() once the early boot process is | 1677 | * up by free_all_bootmem() once the early boot process is |
| @@ -1818,6 +1829,9 @@ static int __cpuinit process_zones(int cpu) | |||
| 1818 | 1829 | ||
| 1819 | for_each_zone(zone) { | 1830 | for_each_zone(zone) { |
| 1820 | 1831 | ||
| 1832 | if (!populated_zone(zone)) | ||
| 1833 | continue; | ||
| 1834 | |||
| 1821 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 1835 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
| 1822 | GFP_KERNEL, cpu_to_node(cpu)); | 1836 | GFP_KERNEL, cpu_to_node(cpu)); |
| 1823 | if (!zone_pcp(zone, cpu)) | 1837 | if (!zone_pcp(zone, cpu)) |
| @@ -1977,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
| 1977 | return 0; | 1991 | return 0; |
| 1978 | } | 1992 | } |
| 1979 | 1993 | ||
| 1994 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 1995 | /* | ||
| 1996 | * Basic iterator support. Return the first range of PFNs for a node | ||
| 1997 | * Note: nid == MAX_NUMNODES returns first region regardless of node | ||
| 1998 | */ | ||
| 1999 | static int __init first_active_region_index_in_nid(int nid) | ||
| 2000 | { | ||
| 2001 | int i; | ||
| 2002 | |||
| 2003 | for (i = 0; i < nr_nodemap_entries; i++) | ||
| 2004 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | ||
| 2005 | return i; | ||
| 2006 | |||
| 2007 | return -1; | ||
| 2008 | } | ||
| 2009 | |||
| 2010 | /* | ||
| 2011 | * Basic iterator support. Return the next active range of PFNs for a node | ||
| 2012 | * Note: nid == MAX_NUMNODES returns next region regardless of node | ||
| 2013 | */ | ||
| 2014 | static int __init next_active_region_index_in_nid(int index, int nid) | ||
| 2015 | { | ||
| 2016 | for (index = index + 1; index < nr_nodemap_entries; index++) | ||
| 2017 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
| 2018 | return index; | ||
| 2019 | |||
| 2020 | return -1; | ||
| 2021 | } | ||
| 2022 | |||
| 2023 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | ||
| 2024 | /* | ||
| 2025 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | ||
| 2026 | * Architectures may implement their own version but if add_active_range() | ||
| 2027 | * was used and there are no special requirements, this is a convenient | ||
| 2028 | * alternative | ||
| 2029 | */ | ||
| 2030 | int __init early_pfn_to_nid(unsigned long pfn) | ||
| 2031 | { | ||
| 2032 | int i; | ||
| 2033 | |||
| 2034 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
| 2035 | unsigned long start_pfn = early_node_map[i].start_pfn; | ||
| 2036 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
| 2037 | |||
| 2038 | if (start_pfn <= pfn && pfn < end_pfn) | ||
| 2039 | return early_node_map[i].nid; | ||
| 2040 | } | ||
| 2041 | |||
| 2042 | return 0; | ||
| 2043 | } | ||
| 2044 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | ||
| 2045 | |||
| 2046 | /* Basic iterator support to walk early_node_map[] */ | ||
| 2047 | #define for_each_active_range_index_in_nid(i, nid) \ | ||
| 2048 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | ||
| 2049 | i = next_active_region_index_in_nid(i, nid)) | ||
| 2050 | |||
| 2051 | /** | ||
| 2052 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | ||
| 2053 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed | ||
| 2054 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | ||
| 2055 | * | ||
| 2056 | * If an architecture guarantees that all ranges registered with | ||
| 2057 | * add_active_ranges() contain no holes and may be freed, this | ||
| 2058 | * function may be used instead of calling free_bootmem() manually. | ||
| 2059 | */ | ||
| 2060 | void __init free_bootmem_with_active_regions(int nid, | ||
| 2061 | unsigned long max_low_pfn) | ||
| 2062 | { | ||
| 2063 | int i; | ||
| 2064 | |||
| 2065 | for_each_active_range_index_in_nid(i, nid) { | ||
| 2066 | unsigned long size_pages = 0; | ||
| 2067 | unsigned long end_pfn = early_node_map[i].end_pfn; | ||
| 2068 | |||
| 2069 | if (early_node_map[i].start_pfn >= max_low_pfn) | ||
| 2070 | continue; | ||
| 2071 | |||
| 2072 | if (end_pfn > max_low_pfn) | ||
| 2073 | end_pfn = max_low_pfn; | ||
| 2074 | |||
| 2075 | size_pages = end_pfn - early_node_map[i].start_pfn; | ||
| 2076 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | ||
| 2077 | PFN_PHYS(early_node_map[i].start_pfn), | ||
| 2078 | size_pages << PAGE_SHIFT); | ||
| 2079 | } | ||
| 2080 | } | ||
| 2081 | |||
| 2082 | /** | ||
| 2083 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | ||
| 2084 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used | ||
| 2085 | * | ||
| 2086 | * If an architecture guarantees that all ranges registered with | ||
| 2087 | * add_active_ranges() contain no holes and may be freed, this | ||
| 2088 | * function may be used instead of calling memory_present() manually. | ||
| 2089 | */ | ||
| 2090 | void __init sparse_memory_present_with_active_regions(int nid) | ||
| 2091 | { | ||
| 2092 | int i; | ||
| 2093 | |||
| 2094 | for_each_active_range_index_in_nid(i, nid) | ||
| 2095 | memory_present(early_node_map[i].nid, | ||
| 2096 | early_node_map[i].start_pfn, | ||
| 2097 | early_node_map[i].end_pfn); | ||
| 2098 | } | ||
| 2099 | |||
| 2100 | /** | ||
| 2101 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
| 2102 | * @nid: The nid of the node to push the boundary for | ||
| 2103 | * @start_pfn: The start pfn of the node | ||
| 2104 | * @end_pfn: The end pfn of the node | ||
| 2105 | * | ||
| 2106 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
| 2107 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
| 2108 | * be hotplugged even though no physical memory exists. This function allows | ||
| 2109 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
| 2110 | * be used later. | ||
| 2111 | */ | ||
| 2112 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 2113 | void __init push_node_boundaries(unsigned int nid, | ||
| 2114 | unsigned long start_pfn, unsigned long end_pfn) | ||
| 2115 | { | ||
| 2116 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
| 2117 | nid, start_pfn, end_pfn); | ||
| 2118 | |||
| 2119 | /* Initialise the boundary for this node if necessary */ | ||
| 2120 | if (node_boundary_end_pfn[nid] == 0) | ||
| 2121 | node_boundary_start_pfn[nid] = -1UL; | ||
| 2122 | |||
| 2123 | /* Update the boundaries */ | ||
| 2124 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
| 2125 | node_boundary_start_pfn[nid] = start_pfn; | ||
| 2126 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
| 2127 | node_boundary_end_pfn[nid] = end_pfn; | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
| 2131 | static void __init account_node_boundary(unsigned int nid, | ||
| 2132 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
| 2133 | { | ||
| 2134 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
| 2135 | nid, *start_pfn, *end_pfn); | ||
| 2136 | |||
| 2137 | /* Return if boundary information has not been provided */ | ||
| 2138 | if (node_boundary_end_pfn[nid] == 0) | ||
| 2139 | return; | ||
| 2140 | |||
| 2141 | /* Check the boundaries and update if necessary */ | ||
| 2142 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
| 2143 | *start_pfn = node_boundary_start_pfn[nid]; | ||
| 2144 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
| 2145 | *end_pfn = node_boundary_end_pfn[nid]; | ||
| 2146 | } | ||
| 2147 | #else | ||
| 2148 | void __init push_node_boundaries(unsigned int nid, | ||
| 2149 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
| 2150 | |||
| 2151 | static void __init account_node_boundary(unsigned int nid, | ||
| 2152 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
| 2153 | #endif | ||
| 2154 | |||
| 2155 | |||
| 2156 | /** | ||
| 2157 | * get_pfn_range_for_nid - Return the start and end page frames for a node | ||
| 2158 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned | ||
| 2159 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn | ||
| 2160 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn | ||
| 2161 | * | ||
| 2162 | * It returns the start and end page frame of a node based on information | ||
| 2163 | * provided by an arch calling add_active_range(). If called for a node | ||
| 2164 | * with no available memory, a warning is printed and the start and end | ||
| 2165 | * PFNs will be 0 | ||
| 2166 | */ | ||
| 2167 | void __init get_pfn_range_for_nid(unsigned int nid, | ||
| 2168 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
| 2169 | { | ||
| 2170 | int i; | ||
| 2171 | *start_pfn = -1UL; | ||
| 2172 | *end_pfn = 0; | ||
| 2173 | |||
| 2174 | for_each_active_range_index_in_nid(i, nid) { | ||
| 2175 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | ||
| 2176 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | if (*start_pfn == -1UL) { | ||
| 2180 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
| 2181 | *start_pfn = 0; | ||
| 2182 | } | ||
| 2183 | |||
| 2184 | /* Push the node boundaries out if requested */ | ||
| 2185 | account_node_boundary(nid, start_pfn, end_pfn); | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Return the number of pages a zone spans in a node, including holes | ||
| 2190 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | ||
| 2191 | */ | ||
| 2192 | unsigned long __init zone_spanned_pages_in_node(int nid, | ||
| 2193 | unsigned long zone_type, | ||
| 2194 | unsigned long *ignored) | ||
| 2195 | { | ||
| 2196 | unsigned long node_start_pfn, node_end_pfn; | ||
| 2197 | unsigned long zone_start_pfn, zone_end_pfn; | ||
| 2198 | |||
| 2199 | /* Get the start and end of the node and zone */ | ||
| 2200 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
| 2201 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | ||
| 2202 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | ||
| 2203 | |||
| 2204 | /* Check that this node has pages within the zone's required range */ | ||
| 2205 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | ||
| 2206 | return 0; | ||
| 2207 | |||
| 2208 | /* Move the zone boundaries inside the node if necessary */ | ||
| 2209 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | ||
| 2210 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | ||
| 2211 | |||
| 2212 | /* Return the spanned pages */ | ||
| 2213 | return zone_end_pfn - zone_start_pfn; | ||
| 2214 | } | ||
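
A quick worked example of the clamping zone_spanned_pages_in_node() performs, using made-up figures: if the architecture registered the zone as PFNs [4096, 262144) and the node's active ranges span [100000, 300000), the boundaries are clamped to [100000, 262144) and 162144 spanned pages are reported; a node entirely outside the zone's window reports 0. The same arithmetic as a standalone snippet:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	/* hypothetical figures: arch-wide zone window and node span, in PFNs */
	unsigned long zone_start = 4096, zone_end = 262144;
	unsigned long node_start = 100000, node_end = 300000;

	if (zone_end < node_start || zone_start > node_end) {
		printf("node has no pages in this zone\n");
		return 0;
	}
	zone_end = min_ul(zone_end, node_end);
	zone_start = max_ul(zone_start, node_start);
	printf("spanned pages: %lu\n", zone_end - zone_start);	/* 162144 */
	return 0;
}
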
| 2215 | |||
| 2216 | /* | ||
| 2217 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | ||
| 2218 | * then all holes in the requested range will be accounted for | ||
| 2219 | */ | ||
| 2220 | unsigned long __init __absent_pages_in_range(int nid, | ||
| 2221 | unsigned long range_start_pfn, | ||
| 2222 | unsigned long range_end_pfn) | ||
| 2223 | { | ||
| 2224 | int i = 0; | ||
| 2225 | unsigned long prev_end_pfn = 0, hole_pages = 0; | ||
| 2226 | unsigned long start_pfn; | ||
| 2227 | |||
| 2228 | /* Find the end_pfn of the first active range of pfns in the node */ | ||
| 2229 | i = first_active_region_index_in_nid(nid); | ||
| 2230 | if (i == -1) | ||
| 2231 | return 0; | ||
| 2232 | |||
| 2233 | /* Account for ranges before physical memory on this node */ | ||
| 2234 | if (early_node_map[i].start_pfn > range_start_pfn) | ||
| 2235 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; | ||
| 2236 | |||
| 2237 | prev_end_pfn = early_node_map[i].start_pfn; | ||
| 2238 | |||
| 2239 | /* Find all holes for the zone within the node */ | ||
| 2240 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | ||
| 2241 | |||
| 2242 | /* No need to continue if prev_end_pfn is outside the zone */ | ||
| 2243 | if (prev_end_pfn >= range_end_pfn) | ||
| 2244 | break; | ||
| 2245 | |||
| 2246 | /* Make sure the end of the zone is not within the hole */ | ||
| 2247 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | ||
| 2248 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | ||
| 2249 | |||
| 2250 | /* Update the hole size count and move on */ | ||
| 2251 | if (start_pfn > range_start_pfn) { | ||
| 2252 | BUG_ON(prev_end_pfn > start_pfn); | ||
| 2253 | hole_pages += start_pfn - prev_end_pfn; | ||
| 2254 | } | ||
| 2255 | prev_end_pfn = early_node_map[i].end_pfn; | ||
| 2256 | } | ||
| 2257 | |||
| 2258 | /* Account for ranges past physical memory on this node */ | ||
| 2259 | if (range_end_pfn > prev_end_pfn) | ||
| 2260 | hole_pages = range_end_pfn - | ||
| 2261 | max(range_start_pfn, prev_end_pfn); | ||
| 2262 | |||
| 2263 | return hole_pages; | ||
| 2264 | } | ||
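
The hole accounting in __absent_pages_in_range() amounts to "size of the requested PFN range minus the part covered by active regions". A simplified userspace model of that idea, assuming the regions are already sorted and non-overlapping (which sort_node_map() further down guarantees for the real early_node_map[]):

#include <stdio.h>

struct range { unsigned long start, end; };	/* [start, end) in PFNs */

/* count PFNs in [range_start, range_end) not covered by any region */
static unsigned long absent_pages(const struct range *map, int n,
				  unsigned long range_start,
				  unsigned long range_end)
{
	unsigned long covered = 0;

	for (int i = 0; i < n; i++) {
		unsigned long s = map[i].start > range_start ? map[i].start : range_start;
		unsigned long e = map[i].end < range_end ? map[i].end : range_end;

		if (e > s)
			covered += e - s;
	}
	return (range_end - range_start) - covered;
}

int main(void)
{
	/* hypothetical active ranges for one node */
	struct range map[] = { { 0, 1000 }, { 2000, 3000 } };

	/* a 4000-PFN window with holes [1000,2000) and [3000,4000) => 2000 */
	printf("holes: %lu\n", absent_pages(map, 2, 0, 4000));
	return 0;
}
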
| 2265 | |||
| 2266 | /** | ||
| 2267 | * absent_pages_in_range - Return number of page frames in holes within a range | ||
| 2268 | * @start_pfn: The start PFN to start searching for holes | ||
| 2269 | * @end_pfn: The end PFN to stop searching for holes | ||
| 2270 | * | ||
| 2271 | * It returns the number of page frames in memory holes within a range | ||
| 2272 | */ | ||
| 2273 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | ||
| 2274 | unsigned long end_pfn) | ||
| 2275 | { | ||
| 2276 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | ||
| 2277 | } | ||
| 2278 | |||
| 2279 | /* Return the number of page frames in holes in a zone on a node */ | ||
| 2280 | unsigned long __init zone_absent_pages_in_node(int nid, | ||
| 2281 | unsigned long zone_type, | ||
| 2282 | unsigned long *ignored) | ||
| 2283 | { | ||
| 2284 | unsigned long node_start_pfn, node_end_pfn; | ||
| 2285 | unsigned long zone_start_pfn, zone_end_pfn; | ||
| 2286 | |||
| 2287 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
| 2288 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | ||
| 2289 | node_start_pfn); | ||
| 2290 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | ||
| 2291 | node_end_pfn); | ||
| 2292 | |||
| 2293 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | ||
| 2294 | } | ||
| 2295 | |||
| 2296 | /* Return the zone index a PFN is in */ | ||
| 2297 | int memmap_zone_idx(struct page *lmem_map) | ||
| 2298 | { | ||
| 2299 | int i; | ||
| 2300 | unsigned long phys_addr = virt_to_phys(lmem_map); | ||
| 2301 | unsigned long pfn = phys_addr >> PAGE_SHIFT; | ||
| 2302 | |||
| 2303 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2304 | if (pfn < arch_zone_highest_possible_pfn[i]) | ||
| 2305 | break; | ||
| 2306 | |||
| 2307 | return i; | ||
| 2308 | } | ||
| 2309 | #else | ||
| 2310 | static inline unsigned long zone_spanned_pages_in_node(int nid, | ||
| 2311 | unsigned long zone_type, | ||
| 2312 | unsigned long *zones_size) | ||
| 2313 | { | ||
| 2314 | return zones_size[zone_type]; | ||
| 2315 | } | ||
| 2316 | |||
| 2317 | static inline unsigned long zone_absent_pages_in_node(int nid, | ||
| 2318 | unsigned long zone_type, | ||
| 2319 | unsigned long *zholes_size) | ||
| 2320 | { | ||
| 2321 | if (!zholes_size) | ||
| 2322 | return 0; | ||
| 2323 | |||
| 2324 | return zholes_size[zone_type]; | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | static inline int memmap_zone_idx(struct page *lmem_map) | ||
| 2328 | { | ||
| 2329 | return MAX_NR_ZONES; | ||
| 2330 | } | ||
| 2331 | #endif | ||
| 2332 | |||
| 2333 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | ||
| 2334 | unsigned long *zones_size, unsigned long *zholes_size) | ||
| 2335 | { | ||
| 2336 | unsigned long realtotalpages, totalpages = 0; | ||
| 2337 | enum zone_type i; | ||
| 2338 | |||
| 2339 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2340 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | ||
| 2341 | zones_size); | ||
| 2342 | pgdat->node_spanned_pages = totalpages; | ||
| 2343 | |||
| 2344 | realtotalpages = totalpages; | ||
| 2345 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2346 | realtotalpages -= | ||
| 2347 | zone_absent_pages_in_node(pgdat->node_id, i, | ||
| 2348 | zholes_size); | ||
| 2349 | pgdat->node_present_pages = realtotalpages; | ||
| 2350 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | ||
| 2351 | realtotalpages); | ||
| 2352 | } | ||
| 2353 | |||
| 1980 | /* | 2354 | /* |
| 1981 | * Set up the zone data structures: | 2355 | * Set up the zone data structures: |
| 1982 | * - mark all pages reserved | 2356 | * - mark all pages reserved |
| @@ -1998,11 +2372,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
| 1998 | 2372 | ||
| 1999 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2373 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 2000 | struct zone *zone = pgdat->node_zones + j; | 2374 | struct zone *zone = pgdat->node_zones + j; |
| 2001 | unsigned long size, realsize; | 2375 | unsigned long size, realsize, memmap_pages; |
| 2002 | 2376 | ||
| 2003 | realsize = size = zones_size[j]; | 2377 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
| 2004 | if (zholes_size) | 2378 | realsize = size - zone_absent_pages_in_node(nid, j, |
| 2005 | realsize -= zholes_size[j]; | 2379 | zholes_size); |
| 2380 | |||
| 2381 | /* | ||
| 2382 | * Adjust realsize so that it accounts for how much memory | ||
| 2383 | * is used by this zone for memmap. This affects the watermark | ||
| 2384 | * and per-cpu initialisations | ||
| 2385 | */ | ||
| 2386 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; | ||
| 2387 | if (realsize >= memmap_pages) { | ||
| 2388 | realsize -= memmap_pages; | ||
| 2389 | printk(KERN_DEBUG | ||
| 2390 | " %s zone: %lu pages used for memmap\n", | ||
| 2391 | zone_names[j], memmap_pages); | ||
| 2392 | } else | ||
| 2393 | printk(KERN_WARNING | ||
| 2394 | " %s zone: %lu pages exceeds realsize %lu\n", | ||
| 2395 | zone_names[j], memmap_pages, realsize); | ||
| 2396 | |||
| 2397 | /* Account for reserved DMA pages */ | ||
| 2398 | if (j == ZONE_DMA && realsize > dma_reserve) { | ||
| 2399 | realsize -= dma_reserve; | ||
| 2400 | printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", | ||
| 2401 | dma_reserve); | ||
| 2402 | } | ||
| 2006 | 2403 | ||
| 2007 | if (!is_highmem_idx(j)) | 2404 | if (!is_highmem_idx(j)) |
| 2008 | nr_kernel_pages += realsize; | 2405 | nr_kernel_pages += realsize; |
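
To make the memmap adjustment above concrete with back-of-the-envelope figures (assumptions for illustration, not values from the patch): with 4 KiB pages and a 56-byte struct page, a zone spanning 262144 pages (1 GiB) needs (262144 * 56) >> 12 = 3584 pages of memmap, so realsize drops by 3584 before the watermarks and per-cpu batch sizes are derived from it. The same calculation as a standalone snippet:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;		/* 4 KiB pages (assumed) */
	unsigned long struct_page_size = 56;	/* assumed sizeof(struct page) */
	unsigned long spanned = 262144;		/* 1 GiB zone */

	unsigned long memmap_pages = (spanned * struct_page_size) >> page_shift;
	printf("memmap consumes %lu pages; realsize = %lu\n",
	       memmap_pages, spanned - memmap_pages);
	return 0;
}
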
| @@ -2011,6 +2408,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
| 2011 | zone->spanned_pages = size; | 2408 | zone->spanned_pages = size; |
| 2012 | zone->present_pages = realsize; | 2409 | zone->present_pages = realsize; |
| 2013 | #ifdef CONFIG_NUMA | 2410 | #ifdef CONFIG_NUMA |
| 2411 | zone->node = nid; | ||
| 2014 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 2412 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
| 2015 | / 100; | 2413 | / 100; |
| 2016 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 2414 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
| @@ -2073,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 2073 | /* | 2471 | /* |
| 2074 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2472 | * With no DISCONTIG, the global mem_map is just set as node 0's |
| 2075 | */ | 2473 | */ |
| 2076 | if (pgdat == NODE_DATA(0)) | 2474 | if (pgdat == NODE_DATA(0)) { |
| 2077 | mem_map = NODE_DATA(0)->node_mem_map; | 2475 | mem_map = NODE_DATA(0)->node_mem_map; |
| 2476 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 2477 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | ||
| 2478 | mem_map -= pgdat->node_start_pfn; | ||
| 2479 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
| 2480 | } | ||
| 2078 | #endif | 2481 | #endif |
| 2079 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2482 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
| 2080 | } | 2483 | } |
| @@ -2085,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
| 2085 | { | 2488 | { |
| 2086 | pgdat->node_id = nid; | 2489 | pgdat->node_id = nid; |
| 2087 | pgdat->node_start_pfn = node_start_pfn; | 2490 | pgdat->node_start_pfn = node_start_pfn; |
| 2088 | calculate_zone_totalpages(pgdat, zones_size, zholes_size); | 2491 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
| 2089 | 2492 | ||
| 2090 | alloc_node_mem_map(pgdat); | 2493 | alloc_node_mem_map(pgdat); |
| 2091 | 2494 | ||
| 2092 | free_area_init_core(pgdat, zones_size, zholes_size); | 2495 | free_area_init_core(pgdat, zones_size, zholes_size); |
| 2093 | } | 2496 | } |
| 2094 | 2497 | ||
| 2498 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | ||
| 2499 | /** | ||
| 2500 | * add_active_range - Register a range of PFNs backed by physical memory | ||
| 2501 | * @nid: The node ID the range resides on | ||
| 2502 | * @start_pfn: The start PFN of the available physical memory | ||
| 2503 | * @end_pfn: The end PFN of the available physical memory | ||
| 2504 | * | ||
| 2505 | * These ranges are stored in an early_node_map[] and later used by | ||
| 2506 | * free_area_init_nodes() to calculate zone sizes and holes. If the | ||
| 2507 | * range spans a memory hole, it is up to the architecture to ensure | ||
| 2508 | * the memory is not freed by the bootmem allocator. If possible | ||
| 2509 | * the range being registered will be merged with existing ranges. | ||
| 2510 | */ | ||
| 2511 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | ||
| 2512 | unsigned long end_pfn) | ||
| 2513 | { | ||
| 2514 | int i; | ||
| 2515 | |||
| 2516 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " | ||
| 2517 | "%d entries of %d used\n", | ||
| 2518 | nid, start_pfn, end_pfn, | ||
| 2519 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
| 2520 | |||
| 2521 | /* Merge with existing active regions if possible */ | ||
| 2522 | for (i = 0; i < nr_nodemap_entries; i++) { | ||
| 2523 | if (early_node_map[i].nid != nid) | ||
| 2524 | continue; | ||
| 2525 | |||
| 2526 | /* Skip if an existing region covers this new one */ | ||
| 2527 | if (start_pfn >= early_node_map[i].start_pfn && | ||
| 2528 | end_pfn <= early_node_map[i].end_pfn) | ||
| 2529 | return; | ||
| 2530 | |||
| 2531 | /* Merge forward if suitable */ | ||
| 2532 | if (start_pfn <= early_node_map[i].end_pfn && | ||
| 2533 | end_pfn > early_node_map[i].end_pfn) { | ||
| 2534 | early_node_map[i].end_pfn = end_pfn; | ||
| 2535 | return; | ||
| 2536 | } | ||
| 2537 | |||
| 2538 | /* Merge backward if suitable */ | ||
| 2539 | if (start_pfn < early_node_map[i].end_pfn && | ||
| 2540 | end_pfn >= early_node_map[i].start_pfn) { | ||
| 2541 | early_node_map[i].start_pfn = start_pfn; | ||
| 2542 | return; | ||
| 2543 | } | ||
| 2544 | } | ||
| 2545 | |||
| 2546 | /* Check that early_node_map is large enough */ | ||
| 2547 | if (i >= MAX_ACTIVE_REGIONS) { | ||
| 2548 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | ||
| 2549 | MAX_ACTIVE_REGIONS); | ||
| 2550 | return; | ||
| 2551 | } | ||
| 2552 | |||
| 2553 | early_node_map[i].nid = nid; | ||
| 2554 | early_node_map[i].start_pfn = start_pfn; | ||
| 2555 | early_node_map[i].end_pfn = end_pfn; | ||
| 2556 | nr_nodemap_entries = i + 1; | ||
| 2557 | } | ||
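
The merge rules in add_active_range() (skip a range that is already covered, extend an existing range forward or backward, otherwise append) can be exercised in isolation. The userspace model below mirrors the same conditions over a tiny table; the types and table size are stand-ins rather than the kernel's:

#include <stdio.h>

#define MAX_REGIONS 8

struct region { int nid; unsigned long start, end; };

static struct region map[MAX_REGIONS];
static int nr_regions;

static void add_active_range(int nid, unsigned long start, unsigned long end)
{
	int i;

	for (i = 0; i < nr_regions; i++) {
		if (map[i].nid != nid)
			continue;
		if (start >= map[i].start && end <= map[i].end)
			return;				/* already covered */
		if (start <= map[i].end && end > map[i].end) {
			map[i].end = end;		/* merge forward */
			return;
		}
		if (start < map[i].end && end >= map[i].start) {
			map[i].start = start;		/* merge backward */
			return;
		}
	}
	if (i >= MAX_REGIONS)
		return;					/* table full: drop */
	map[i].nid = nid;
	map[i].start = start;
	map[i].end = end;
	nr_regions = i + 1;
}

int main(void)
{
	add_active_range(0, 0, 1000);
	add_active_range(0, 1000, 2000);	/* merges forward with [0,1000) */
	add_active_range(0, 500, 700);		/* already covered, ignored */
	add_active_range(1, 4000, 5000);	/* new entry for another node */

	for (int i = 0; i < nr_regions; i++)
		printf("nid %d: [%lu, %lu)\n", map[i].nid, map[i].start, map[i].end);
	return 0;
}

With the calls in main(), the table ends up as nid 0: [0, 2000) and nid 1: [4000, 5000).
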
| 2558 | |||
| 2559 | /** | ||
| 2560 | * shrink_active_range - Shrink an existing registered range of PFNs | ||
| 2561 | * @nid: The node id of the range to be shrunk | ||
| 2562 | * @old_end_pfn: The old end PFN of the range | ||
| 2563 | * @new_end_pfn: The new end PFN of the range | ||
| 2564 | * | ||
| 2565 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | ||
| 2566 | * The map is kept at the end of the physical page range that has already been | ||
| 2567 | * registered with add_active_range(). This function allows an arch to shrink | ||
| 2568 | * an existing registered range. | ||
| 2569 | */ | ||
| 2570 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, | ||
| 2571 | unsigned long new_end_pfn) | ||
| 2572 | { | ||
| 2573 | int i; | ||
| 2574 | |||
| 2575 | /* Find the old active region end and shrink */ | ||
| 2576 | for_each_active_range_index_in_nid(i, nid) | ||
| 2577 | if (early_node_map[i].end_pfn == old_end_pfn) { | ||
| 2578 | early_node_map[i].end_pfn = new_end_pfn; | ||
| 2579 | break; | ||
| 2580 | } | ||
| 2581 | } | ||
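In the i386 remap case the doc-comment describes, the arch would trim its registered range after setting aside space for the node_mem_map at the top of the node. A hedged sketch; node_end_pfn[] and node_remap_pages[] are hypothetical arch-level arrays, not part of this patch.

/*
 * Hedged sketch: shrinking a node's registered range after reserving
 * node_remap_pages[nid] pages at its top for the node_mem_map.
 * node_end_pfn[] and node_remap_pages[] are hypothetical arch arrays.
 */
static void __init trim_node_for_mem_map(int nid)
{
        unsigned long new_end_pfn = node_end_pfn[nid] - node_remap_pages[nid];

        shrink_active_range(nid, node_end_pfn[nid], new_end_pfn);
        node_end_pfn[nid] = new_end_pfn;
}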
| 2582 | |||
| 2583 | /** | ||
| 2584 | * remove_all_active_ranges - Remove all currently registered regions | ||
| 2585 | * During discovery, it may be found that a table like SRAT is invalid | ||
| 2586 | * and an alternative discovery method must be used. This function removes | ||
| 2587 | * all currently registered regions. | ||
| 2588 | */ | ||
| 2589 | void __init remove_all_active_ranges(void) | ||
| 2590 | { | ||
| 2591 | memset(early_node_map, 0, sizeof(early_node_map)); | ||
| 2592 | nr_nodemap_entries = 0; | ||
| 2593 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 2594 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
| 2595 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
| 2596 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
| 2597 | } | ||
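A hedged sketch of the fallback described in the comment above: if the affinity table turns out to be unusable, wipe whatever was registered from it and re-register a single flat range. parse_numa_table() and max_pfn are illustrative placeholders for the arch's real discovery code and globals.

/*
 * Hedged sketch: discarding ranges registered from a bad SRAT-style
 * table and falling back to one flat range on node 0.
 * parse_numa_table() and max_pfn are illustrative placeholders.
 */
static void __init setup_memory_ranges(void)
{
        if (parse_numa_table() < 0) {
                remove_all_active_ranges();
                add_active_range(0, 0, max_pfn);
        }
}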
| 2598 | |||
| 2599 | /* Compare two active node_active_regions */ | ||
| 2600 | static int __init cmp_node_active_region(const void *a, const void *b) | ||
| 2601 | { | ||
| 2602 | struct node_active_region *arange = (struct node_active_region *)a; | ||
| 2603 | struct node_active_region *brange = (struct node_active_region *)b; | ||
| 2604 | |||
| 2605 | /* Done this way to avoid overflows */ | ||
| 2606 | if (arange->start_pfn > brange->start_pfn) | ||
| 2607 | return 1; | ||
| 2608 | if (arange->start_pfn < brange->start_pfn) | ||
| 2609 | return -1; | ||
| 2610 | |||
| 2611 | return 0; | ||
| 2612 | } | ||
| 2613 | |||
| 2614 | /* sort the node_map by start_pfn */ | ||
| 2615 | static void __init sort_node_map(void) | ||
| 2616 | { | ||
| 2617 | sort(early_node_map, (size_t)nr_nodemap_entries, | ||
| 2618 | sizeof(struct node_active_region), | ||
| 2619 | cmp_node_active_region, NULL); | ||
| 2620 | } | ||
| 2621 | |||
| 2622 | /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ | ||
| 2623 | unsigned long __init find_min_pfn_for_node(unsigned long nid) | ||
| 2624 | { | ||
| 2625 | int i; | ||
| 2626 | |||
| 2627 | /* Assuming a sorted map, the first range found has the starting pfn */ | ||
| 2628 | for_each_active_range_index_in_nid(i, nid) | ||
| 2629 | return early_node_map[i].start_pfn; | ||
| 2630 | |||
| 2631 | printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); | ||
| 2632 | return 0; | ||
| 2633 | } | ||
| 2634 | |||
| 2635 | /** | ||
| 2636 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | ||
| 2637 | * | ||
| 2638 | * It returns the minimum PFN based on information provided via | ||
| 2639 | * add_active_range() | ||
| 2640 | */ | ||
| 2641 | unsigned long __init find_min_pfn_with_active_regions(void) | ||
| 2642 | { | ||
| 2643 | return find_min_pfn_for_node(MAX_NUMNODES); | ||
| 2644 | } | ||
| 2645 | |||
| 2646 | /** | ||
| 2647 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
| 2648 | * | ||
| 2649 | * It returns the maximum PFN based on information provided via | ||
| 2650 | * add_active_range() | ||
| 2651 | */ | ||
| 2652 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
| 2653 | { | ||
| 2654 | int i; | ||
| 2655 | unsigned long max_pfn = 0; | ||
| 2656 | |||
| 2657 | for (i = 0; i < nr_nodemap_entries; i++) | ||
| 2658 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
| 2659 | |||
| 2660 | return max_pfn; | ||
| 2661 | } | ||
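Arch code that previously re-walked its firmware map to find the last page can derive it from the registered ranges instead; a minimal, hedged sketch in which max_pfn is used illustratively as the usual arch-level global.

/*
 * Hedged sketch: deriving the arch's max_pfn from the ranges already
 * registered via add_active_range() instead of re-walking the firmware map.
 */
static void __init setup_max_pfn(void)
{
        max_pfn = find_max_pfn_with_active_regions();
}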
| 2662 | |||
| 2663 | /** | ||
| 2664 | * free_area_init_nodes - Initialise all pg_data_t and zone data | ||
| 2665 | * @max_zone_pfn: an array of max PFNs for each zone, indexed by zone | ||
| 2666 | * (for example, max_zone_pfn[ZONE_DMA] is the maximum PFN usable | ||
| 2667 | * for ZONE_DMA, max_zone_pfn[ZONE_NORMAL] the maximum PFN usable | ||
| 2668 | * for ZONE_NORMAL, and so on) | ||
| 2669 | * | ||
| 2670 | * This will call free_area_init_node() for each active node in the system. | ||
| 2671 | * Using the page ranges provided by add_active_range(), the size of each | ||
| 2672 | * zone in each node, together with its holes, is calculated. If the | ||
| 2673 | * maximum PFNs of two adjacent zones match, the higher zone is assumed | ||
| 2674 | * to be empty. For example, if max_zone_pfn[ZONE_DMA] equals | ||
| 2675 | * max_zone_pfn[ZONE_DMA32], ZONE_DMA32 is assumed to have no pages. It | ||
| 2676 | * is also assumed that a zone starts where the previous one ended; for | ||
| 2677 | * example, ZONE_DMA32 starts at max_zone_pfn[ZONE_DMA]. | ||
| 2678 | */ | ||
| 2679 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | ||
| 2680 | { | ||
| 2681 | unsigned long nid; | ||
| 2682 | enum zone_type i; | ||
| 2683 | |||
| 2684 | /* Record where the zone boundaries are */ | ||
| 2685 | memset(arch_zone_lowest_possible_pfn, 0, | ||
| 2686 | sizeof(arch_zone_lowest_possible_pfn)); | ||
| 2687 | memset(arch_zone_highest_possible_pfn, 0, | ||
| 2688 | sizeof(arch_zone_highest_possible_pfn)); | ||
| 2689 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | ||
| 2690 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | ||
| 2691 | for (i = 1; i < MAX_NR_ZONES; i++) { | ||
| 2692 | arch_zone_lowest_possible_pfn[i] = | ||
| 2693 | arch_zone_highest_possible_pfn[i-1]; | ||
| 2694 | arch_zone_highest_possible_pfn[i] = | ||
| 2695 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | ||
| 2696 | } | ||
| 2697 | |||
| 2698 | /* Regions in the early_node_map can be in any order */ | ||
| 2699 | sort_node_map(); | ||
| 2700 | |||
| 2701 | /* Print out the zone ranges */ | ||
| 2702 | printk("Zone PFN ranges:\n"); | ||
| 2703 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
| 2704 | printk(" %-8s %8lu -> %8lu\n", | ||
| 2705 | zone_names[i], | ||
| 2706 | arch_zone_lowest_possible_pfn[i], | ||
| 2707 | arch_zone_highest_possible_pfn[i]); | ||
| 2708 | |||
| 2709 | /* Print out the early_node_map[] */ | ||
| 2710 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | ||
| 2711 | for (i = 0; i < nr_nodemap_entries; i++) | ||
| 2712 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, | ||
| 2713 | early_node_map[i].start_pfn, | ||
| 2714 | early_node_map[i].end_pfn); | ||
| 2715 | |||
| 2716 | /* Initialise every node */ | ||
| 2717 | for_each_online_node(nid) { | ||
| 2718 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 2719 | free_area_init_node(nid, pgdat, NULL, | ||
| 2720 | find_min_pfn_for_node(nid), NULL); | ||
| 2721 | } | ||
| 2722 | } | ||
| 2723 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | ||
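Tying the above together, an architecture selecting CONFIG_ARCH_POPULATES_NODE_MAP no longer computes per-node zone sizes itself; it fills a max_zone_pfn[] array and lets free_area_init_nodes() do the rest. A hedged sketch of such a zone_sizes_init(); max_low_pfn, highend_pfn and the DMA boundary calculation are illustrative arch details, not something this patch defines.

/*
 * Hedged sketch: an arch's zone_sizes_init() under
 * CONFIG_ARCH_POPULATES_NODE_MAP.  max_low_pfn, highend_pfn and the
 * DMA boundary calculation are illustrative arch-level details.
 */
static void __init zone_sizes_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] =
                virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
        max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif
        free_area_init_nodes(max_zone_pfns);
}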
| 2724 | |||
| 2725 | /** | ||
| 2726 | * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA | ||
| 2727 | * @new_dma_reserve: The number of pages to mark reserved | ||
| 2728 | * | ||
| 2729 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | ||
| 2730 | * In the DMA zone, a significant percentage may be consumed by kernel image | ||
| 2731 | * and other unfreeable allocations which can skew the watermarks badly. This | ||
| 2732 | * function may optionally be used to account for unfreeable pages in | ||
| 2733 | * ZONE_DMA. The effect will be lower watermarks and a smaller per-cpu batchsize. | ||
| 2734 | */ | ||
| 2735 | void __init set_dma_reserve(unsigned long new_dma_reserve) | ||
| 2736 | { | ||
| 2737 | dma_reserve = new_dma_reserve; | ||
| 2738 | } | ||
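A hedged sketch of the intended caller: arch setup code counting the kernel pages that happen to sit inside ZONE_DMA and reporting them so the watermark calculation can discount them. The 16MB boundary and the use of the _text/_end symbols are illustrative, not mandated by this patch.

/*
 * Hedged sketch: accounting for the part of the kernel image that lives
 * in ZONE_DMA.  The 16MB boundary and the _text/_end symbols are
 * illustrative; real arch code would use its own layout knowledge.
 */
static void __init reserve_kernel_dma_pages(void)
{
        unsigned long start_pfn = PFN_UP(__pa(_text));
        unsigned long end_pfn = PFN_UP(__pa(_end));
        unsigned long dma_limit_pfn = PFN_DOWN(16UL << 20);

        if (start_pfn >= dma_limit_pfn)
                return;
        if (end_pfn > dma_limit_pfn)
                end_pfn = dma_limit_pfn;
        set_dma_reserve(end_pfn - start_pfn);
}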
| 2739 | |||
| 2095 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2740 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 2096 | static bootmem_data_t contig_bootmem_data; | 2741 | static bootmem_data_t contig_bootmem_data; |
| 2097 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2742 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
diff --git a/mm/shmem.c b/mm/shmem.c index 8631be45b40d..eda907c3a86a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -1351,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
| 1351 | inode->i_mode = mode; | 1351 | inode->i_mode = mode; |
| 1352 | inode->i_uid = current->fsuid; | 1352 | inode->i_uid = current->fsuid; |
| 1353 | inode->i_gid = current->fsgid; | 1353 | inode->i_gid = current->fsgid; |
| 1354 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
| 1355 | inode->i_blocks = 0; | 1354 | inode->i_blocks = 0; |
| 1356 | inode->i_mapping->a_ops = &shmem_aops; | 1355 | inode->i_mapping->a_ops = &shmem_aops; |
| 1357 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1356 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
| @@ -2157,8 +2156,7 @@ static int init_inodecache(void) | |||
| 2157 | 2156 | ||
| 2158 | static void destroy_inodecache(void) | 2157 | static void destroy_inodecache(void) |
| 2159 | { | 2158 | { |
| 2160 | if (kmem_cache_destroy(shmem_inode_cachep)) | 2159 | kmem_cache_destroy(shmem_inode_cachep); |
| 2161 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); | ||
| 2162 | } | 2160 | } |
| 2163 | 2161 | ||
| 2164 | static const struct address_space_operations shmem_aops = { | 2162 | static const struct address_space_operations shmem_aops = { |
| @@ -972,7 +972,39 @@ static int transfer_objects(struct array_cache *to, | |||
| 972 | return nr; | 972 | return nr; |
| 973 | } | 973 | } |
| 974 | 974 | ||
| 975 | #ifdef CONFIG_NUMA | 975 | #ifndef CONFIG_NUMA |
| 976 | |||
| 977 | #define drain_alien_cache(cachep, alien) do { } while (0) | ||
| 978 | #define reap_alien(cachep, l3) do { } while (0) | ||
| 979 | |||
| 980 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | ||
| 981 | { | ||
| 982 | return (struct array_cache **)BAD_ALIEN_MAGIC; | ||
| 983 | } | ||
| 984 | |||
| 985 | static inline void free_alien_cache(struct array_cache **ac_ptr) | ||
| 986 | { | ||
| 987 | } | ||
| 988 | |||
| 989 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
| 990 | { | ||
| 991 | return 0; | ||
| 992 | } | ||
| 993 | |||
| 994 | static inline void *alternate_node_alloc(struct kmem_cache *cachep, | ||
| 995 | gfp_t flags) | ||
| 996 | { | ||
| 997 | return NULL; | ||
| 998 | } | ||
| 999 | |||
| 1000 | static inline void *__cache_alloc_node(struct kmem_cache *cachep, | ||
| 1001 | gfp_t flags, int nodeid) | ||
| 1002 | { | ||
| 1003 | return NULL; | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | #else /* CONFIG_NUMA */ | ||
| 1007 | |||
| 976 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 1008 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); |
| 977 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 1009 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
| 978 | 1010 | ||
| @@ -1101,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
| 1101 | } | 1133 | } |
| 1102 | return 1; | 1134 | return 1; |
| 1103 | } | 1135 | } |
| 1104 | |||
| 1105 | #else | ||
| 1106 | |||
| 1107 | #define drain_alien_cache(cachep, alien) do { } while (0) | ||
| 1108 | #define reap_alien(cachep, l3) do { } while (0) | ||
| 1109 | |||
| 1110 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | ||
| 1111 | { | ||
| 1112 | return (struct array_cache **)BAD_ALIEN_MAGIC; | ||
| 1113 | } | ||
| 1114 | |||
| 1115 | static inline void free_alien_cache(struct array_cache **ac_ptr) | ||
| 1116 | { | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
| 1120 | { | ||
| 1121 | return 0; | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | #endif | 1136 | #endif |
| 1125 | 1137 | ||
| 1126 | static int __cpuinit cpuup_callback(struct notifier_block *nfb, | 1138 | static int __cpuinit cpuup_callback(struct notifier_block *nfb, |
| @@ -1564,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1564 | */ | 1576 | */ |
| 1565 | flags |= __GFP_COMP; | 1577 | flags |= __GFP_COMP; |
| 1566 | #endif | 1578 | #endif |
| 1567 | flags |= cachep->gfpflags; | 1579 | |
| 1580 | /* | ||
| 1581 | * Under NUMA we want memory on the indicated node. We will handle | ||
| 1582 | * the needed fallback ourselves since we want to serve from our | ||
| 1583 | * per node object lists first for other nodes. | ||
| 1584 | */ | ||
| 1585 | flags |= cachep->gfpflags | GFP_THISNODE; | ||
| 1568 | 1586 | ||
| 1569 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1587 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
| 1570 | if (!page) | 1588 | if (!page) |
| @@ -2442,7 +2460,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
| 2442 | * @cachep: the cache to destroy | 2460 | * @cachep: the cache to destroy |
| 2443 | * | 2461 | * |
| 2444 | * Remove a struct kmem_cache object from the slab cache. | 2462 | * Remove a struct kmem_cache object from the slab cache. |
| 2445 | * Returns 0 on success. | ||
| 2446 | * | 2463 | * |
| 2447 | * It is expected this function will be called by a module when it is | 2464 | * It is expected this function will be called by a module when it is |
| 2448 | * unloaded. This will remove the cache completely, and avoid a duplicate | 2465 | * unloaded. This will remove the cache completely, and avoid a duplicate |
| @@ -2454,7 +2471,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
| 2454 | * The caller must guarantee that no one will allocate memory from the cache | 2471 | * The caller must guarantee that no one will allocate memory from the cache |
| 2455 | * during the kmem_cache_destroy(). | 2472 | * during the kmem_cache_destroy(). |
| 2456 | */ | 2473 | */ |
| 2457 | int kmem_cache_destroy(struct kmem_cache *cachep) | 2474 | void kmem_cache_destroy(struct kmem_cache *cachep) |
| 2458 | { | 2475 | { |
| 2459 | BUG_ON(!cachep || in_interrupt()); | 2476 | BUG_ON(!cachep || in_interrupt()); |
| 2460 | 2477 | ||
| @@ -2475,7 +2492,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2475 | list_add(&cachep->next, &cache_chain); | 2492 | list_add(&cachep->next, &cache_chain); |
| 2476 | mutex_unlock(&cache_chain_mutex); | 2493 | mutex_unlock(&cache_chain_mutex); |
| 2477 | unlock_cpu_hotplug(); | 2494 | unlock_cpu_hotplug(); |
| 2478 | return 1; | 2495 | return; |
| 2479 | } | 2496 | } |
| 2480 | 2497 | ||
| 2481 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 2498 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) |
| @@ -2483,7 +2500,6 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2483 | 2500 | ||
| 2484 | __kmem_cache_destroy(cachep); | 2501 | __kmem_cache_destroy(cachep); |
| 2485 | unlock_cpu_hotplug(); | 2502 | unlock_cpu_hotplug(); |
| 2486 | return 0; | ||
| 2487 | } | 2503 | } |
| 2488 | EXPORT_SYMBOL(kmem_cache_destroy); | 2504 | EXPORT_SYMBOL(kmem_cache_destroy); |
| 2489 | 2505 | ||
| @@ -3030,14 +3046,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3030 | void *objp; | 3046 | void *objp; |
| 3031 | struct array_cache *ac; | 3047 | struct array_cache *ac; |
| 3032 | 3048 | ||
| 3033 | #ifdef CONFIG_NUMA | ||
| 3034 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { | ||
| 3035 | objp = alternate_node_alloc(cachep, flags); | ||
| 3036 | if (objp != NULL) | ||
| 3037 | return objp; | ||
| 3038 | } | ||
| 3039 | #endif | ||
| 3040 | |||
| 3041 | check_irq_off(); | 3049 | check_irq_off(); |
| 3042 | ac = cpu_cache_get(cachep); | 3050 | ac = cpu_cache_get(cachep); |
| 3043 | if (likely(ac->avail)) { | 3051 | if (likely(ac->avail)) { |
| @@ -3055,12 +3063,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
| 3055 | gfp_t flags, void *caller) | 3063 | gfp_t flags, void *caller) |
| 3056 | { | 3064 | { |
| 3057 | unsigned long save_flags; | 3065 | unsigned long save_flags; |
| 3058 | void *objp; | 3066 | void *objp = NULL; |
| 3059 | 3067 | ||
| 3060 | cache_alloc_debugcheck_before(cachep, flags); | 3068 | cache_alloc_debugcheck_before(cachep, flags); |
| 3061 | 3069 | ||
| 3062 | local_irq_save(save_flags); | 3070 | local_irq_save(save_flags); |
| 3063 | objp = ____cache_alloc(cachep, flags); | 3071 | |
| 3072 | if (unlikely(NUMA_BUILD && | ||
| 3073 | current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) | ||
| 3074 | objp = alternate_node_alloc(cachep, flags); | ||
| 3075 | |||
| 3076 | if (!objp) | ||
| 3077 | objp = ____cache_alloc(cachep, flags); | ||
| 3078 | /* | ||
| 3079 | * We may just have run out of memory on the local node. | ||
| 3080 | * __cache_alloc_node() knows how to locate memory on other nodes | ||
| 3081 | */ | ||
| 3082 | if (NUMA_BUILD && !objp) | ||
| 3083 | objp = __cache_alloc_node(cachep, flags, numa_node_id()); | ||
| 3064 | local_irq_restore(save_flags); | 3084 | local_irq_restore(save_flags); |
| 3065 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 3085 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
| 3066 | caller); | 3086 | caller); |
| @@ -3079,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3079 | { | 3099 | { |
| 3080 | int nid_alloc, nid_here; | 3100 | int nid_alloc, nid_here; |
| 3081 | 3101 | ||
| 3082 | if (in_interrupt()) | 3102 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
| 3083 | return NULL; | 3103 | return NULL; |
| 3084 | nid_alloc = nid_here = numa_node_id(); | 3104 | nid_alloc = nid_here = numa_node_id(); |
| 3085 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3105 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
| @@ -3092,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3092 | } | 3112 | } |
| 3093 | 3113 | ||
| 3094 | /* | 3114 | /* |
| 3115 | * Fallback function if there was no memory available and no objects on a | ||
| 3116 | * certain node and we are allowed to fall back. We mimic the behavior of | ||
| 3117 | * the page allocator. We fall back according to a zonelist determined by | ||
| 3118 | * the policy layer while obeying cpuset constraints. | ||
| 3119 | */ | ||
| 3120 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | ||
| 3121 | { | ||
| 3122 | struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) | ||
| 3123 | ->node_zonelists[gfp_zone(flags)]; | ||
| 3124 | struct zone **z; | ||
| 3125 | void *obj = NULL; | ||
| 3126 | |||
| 3127 | for (z = zonelist->zones; *z && !obj; z++) | ||
| 3128 | if (zone_idx(*z) <= ZONE_NORMAL && | ||
| 3129 | cpuset_zone_allowed(*z, flags)) | ||
| 3130 | obj = __cache_alloc_node(cache, | ||
| 3131 | flags | __GFP_THISNODE, | ||
| 3132 | zone_to_nid(*z)); | ||
| 3133 | return obj; | ||
| 3134 | } | ||
| 3135 | |||
| 3136 | /* | ||
| 3095 | * An interface to enable slab creation on nodeid | 3137 | * An interface to enable slab creation on nodeid |
| 3096 | */ | 3138 | */ |
| 3097 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3139 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
| @@ -3144,11 +3186,15 @@ retry: | |||
| 3144 | must_grow: | 3186 | must_grow: |
| 3145 | spin_unlock(&l3->list_lock); | 3187 | spin_unlock(&l3->list_lock); |
| 3146 | x = cache_grow(cachep, flags, nodeid); | 3188 | x = cache_grow(cachep, flags, nodeid); |
| 3189 | if (x) | ||
| 3190 | goto retry; | ||
| 3147 | 3191 | ||
| 3148 | if (!x) | 3192 | if (!(flags & __GFP_THISNODE)) |
| 3149 | return NULL; | 3193 | /* Unable to grow the cache. Fall back to other nodes. */ |
| 3194 | return fallback_alloc(cachep, flags); | ||
| 3195 | |||
| 3196 | return NULL; | ||
| 3150 | 3197 | ||
| 3151 | goto retry; | ||
| 3152 | done: | 3198 | done: |
| 3153 | return obj; | 3199 | return obj; |
| 3154 | } | 3200 | } |
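The practical effect of the GFP_THISNODE and fallback_alloc() changes shows up at the kmem_cache_alloc_node() call site: with __GFP_THISNODE the allocation fails rather than spilling onto other nodes, while a plain GFP_KERNEL request may be satisfied elsewhere via fallback_alloc(). A hedged sketch; cachep and nid are placeholders for a real cache and node id.

/*
 * Hedged sketch of the caller-visible behaviour; cachep and nid are
 * placeholders for a real cache and node id.
 */
static void *alloc_on_node_example(struct kmem_cache *cachep, int nid)
{
        void *obj;

        /* Strict: NULL if node nid has neither free objects nor free pages. */
        obj = kmem_cache_alloc_node(cachep, GFP_KERNEL | __GFP_THISNODE, nid);
        if (obj)
                return obj;

        /* Lenient: may be served from another node via fallback_alloc(). */
        return kmem_cache_alloc_node(cachep, GFP_KERNEL, nid);
}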
| @@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 270 | } | 270 | } |
| 271 | EXPORT_SYMBOL(kmem_cache_create); | 271 | EXPORT_SYMBOL(kmem_cache_create); |
| 272 | 272 | ||
| 273 | int kmem_cache_destroy(struct kmem_cache *c) | 273 | void kmem_cache_destroy(struct kmem_cache *c) |
| 274 | { | 274 | { |
| 275 | slob_free(c, sizeof(struct kmem_cache)); | 275 | slob_free(c, sizeof(struct kmem_cache)); |
| 276 | return 0; | ||
| 277 | } | 276 | } |
| 278 | EXPORT_SYMBOL(kmem_cache_destroy); | 277 | EXPORT_SYMBOL(kmem_cache_destroy); |
| 279 | 278 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index c6ab55ec6883..a654928323dc 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
| 11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
| 12 | #include <linux/swap.h> | ||
| 12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 13 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
| 14 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
| @@ -52,36 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 52 | /* | 53 | /* |
| 53 | * This is for invalidate_inode_pages(). That function can be called at | 54 | * This is for invalidate_inode_pages(). That function can be called at |
| 54 | * any time, and is not supposed to throw away dirty pages. But pages can | 55 | * any time, and is not supposed to throw away dirty pages. But pages can |
| 55 | * be marked dirty at any time too. So we re-check the dirtiness inside | 56 | * be marked dirty at any time too, so use remove_mapping which safely |
| 56 | * ->tree_lock. That provides exclusion against the __set_page_dirty | 57 | * discards clean, unused pages. |
| 57 | * functions. | ||
| 58 | * | 58 | * |
| 59 | * Returns non-zero if the page was successfully invalidated. | 59 | * Returns non-zero if the page was successfully invalidated. |
| 60 | */ | 60 | */ |
| 61 | static int | 61 | static int |
| 62 | invalidate_complete_page(struct address_space *mapping, struct page *page) | 62 | invalidate_complete_page(struct address_space *mapping, struct page *page) |
| 63 | { | 63 | { |
| 64 | int ret; | ||
| 65 | |||
| 64 | if (page->mapping != mapping) | 66 | if (page->mapping != mapping) |
| 65 | return 0; | 67 | return 0; |
| 66 | 68 | ||
| 67 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 69 | if (PagePrivate(page) && !try_to_release_page(page, 0)) |
| 68 | return 0; | 70 | return 0; |
| 69 | 71 | ||
| 70 | write_lock_irq(&mapping->tree_lock); | 72 | ret = remove_mapping(mapping, page); |
| 71 | if (PageDirty(page)) | ||
| 72 | goto failed; | ||
| 73 | if (page_count(page) != 2) /* caller's ref + pagecache ref */ | ||
| 74 | goto failed; | ||
| 75 | |||
| 76 | BUG_ON(PagePrivate(page)); | ||
| 77 | __remove_from_page_cache(page); | ||
| 78 | write_unlock_irq(&mapping->tree_lock); | ||
| 79 | ClearPageUptodate(page); | 73 | ClearPageUptodate(page); |
| 80 | page_cache_release(page); /* pagecache ref */ | 74 | |
| 81 | return 1; | 75 | return ret; |
| 82 | failed: | ||
| 83 | write_unlock_irq(&mapping->tree_lock); | ||
| 84 | return 0; | ||
| 85 | } | 76 | } |
| 86 | 77 | ||
| 87 | /** | 78 | /** |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9aad8b0cc6ee..1ac191ce5641 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -241,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | |||
| 241 | 241 | ||
| 242 | /** | 242 | /** |
| 243 | * get_vm_area - reserve a contiguous kernel virtual area | 243 | * get_vm_area - reserve a contiguous kernel virtual area |
| 244 | * | ||
| 245 | * @size: size of the area | 244 | * @size: size of the area |
| 246 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC | 245 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC |
| 247 | * | 246 | * |
| @@ -273,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr) | |||
| 273 | } | 272 | } |
| 274 | 273 | ||
| 275 | /* Caller must hold vmlist_lock */ | 274 | /* Caller must hold vmlist_lock */ |
| 276 | struct vm_struct *__remove_vm_area(void *addr) | 275 | static struct vm_struct *__remove_vm_area(void *addr) |
| 277 | { | 276 | { |
| 278 | struct vm_struct **p, *tmp; | 277 | struct vm_struct **p, *tmp; |
| 279 | 278 | ||
| @@ -296,7 +295,6 @@ found: | |||
| 296 | 295 | ||
| 297 | /** | 296 | /** |
| 298 | * remove_vm_area - find and remove a contiguous kernel virtual area | 297 | * remove_vm_area - find and remove a contiguous kernel virtual area |
| 299 | * | ||
| 300 | * @addr: base address | 298 | * @addr: base address |
| 301 | * | 299 | * |
| 302 | * Search for the kernel VM area starting at @addr, and remove it. | 300 | * Search for the kernel VM area starting at @addr, and remove it. |
| @@ -355,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages) | |||
| 355 | 353 | ||
| 356 | /** | 354 | /** |
| 357 | * vfree - release memory allocated by vmalloc() | 355 | * vfree - release memory allocated by vmalloc() |
| 358 | * | ||
| 359 | * @addr: memory base address | 356 | * @addr: memory base address |
| 360 | * | 357 | * |
| 361 | * Free the virtually contiguous memory area starting at @addr, as | 358 | * Free the virtually contiguous memory area starting at @addr, as |
| @@ -373,7 +370,6 @@ EXPORT_SYMBOL(vfree); | |||
| 373 | 370 | ||
| 374 | /** | 371 | /** |
| 375 | * vunmap - release virtual mapping obtained by vmap() | 372 | * vunmap - release virtual mapping obtained by vmap() |
| 376 | * | ||
| 377 | * @addr: memory base address | 373 | * @addr: memory base address |
| 378 | * | 374 | * |
| 379 | * Free the virtually contiguous memory area starting at @addr, | 375 | * Free the virtually contiguous memory area starting at @addr, |
| @@ -390,7 +386,6 @@ EXPORT_SYMBOL(vunmap); | |||
| 390 | 386 | ||
| 391 | /** | 387 | /** |
| 392 | * vmap - map an array of pages into virtually contiguous space | 388 | * vmap - map an array of pages into virtually contiguous space |
| 393 | * | ||
| 394 | * @pages: array of page pointers | 389 | * @pages: array of page pointers |
| 395 | * @count: number of pages to map | 390 | * @count: number of pages to map |
| 396 | * @flags: vm_area->flags | 391 | * @flags: vm_area->flags |
| @@ -471,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 471 | 466 | ||
| 472 | /** | 467 | /** |
| 473 | * __vmalloc_node - allocate virtually contiguous memory | 468 | * __vmalloc_node - allocate virtually contiguous memory |
| 474 | * | ||
| 475 | * @size: allocation size | 469 | * @size: allocation size |
| 476 | * @gfp_mask: flags for the page level allocator | 470 | * @gfp_mask: flags for the page level allocator |
| 477 | * @prot: protection mask for the allocated pages | 471 | * @prot: protection mask for the allocated pages |
| @@ -505,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc); | |||
| 505 | 499 | ||
| 506 | /** | 500 | /** |
| 507 | * vmalloc - allocate virtually contiguous memory | 501 | * vmalloc - allocate virtually contiguous memory |
| 508 | * | ||
| 509 | * @size: allocation size | 502 | * @size: allocation size |
| 510 | * | ||
| 511 | * Allocate enough pages to cover @size from the page level | 503 | * Allocate enough pages to cover @size from the page level |
| 512 | * allocator and map them into contiguous kernel virtual space. | 504 | * allocator and map them into contiguous kernel virtual space. |
| 513 | * | 505 | * |
| @@ -521,11 +513,11 @@ void *vmalloc(unsigned long size) | |||
| 521 | EXPORT_SYMBOL(vmalloc); | 513 | EXPORT_SYMBOL(vmalloc); |
| 522 | 514 | ||
| 523 | /** | 515 | /** |
| 524 | * vmalloc_user - allocate virtually contiguous memory which has | 516 | * vmalloc_user - allocate zeroed virtually contiguous memory for userspace |
| 525 | * been zeroed so it can be mapped to userspace without | 517 | * @size: allocation size |
| 526 | * leaking data. | ||
| 527 | * | 518 | * |
| 528 | * @size: allocation size | 519 | * The resulting memory area is zeroed so it can be mapped to userspace |
| 520 | * without leaking data. | ||
| 529 | */ | 521 | */ |
| 530 | void *vmalloc_user(unsigned long size) | 522 | void *vmalloc_user(unsigned long size) |
| 531 | { | 523 | { |
| @@ -544,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user); | |||
| 544 | 536 | ||
| 545 | /** | 537 | /** |
| 546 | * vmalloc_node - allocate memory on a specific node | 538 | * vmalloc_node - allocate memory on a specific node |
| 547 | * | ||
| 548 | * @size: allocation size | 539 | * @size: allocation size |
| 549 | * @node: numa node | 540 | * @node: numa node |
| 550 | * | 541 | * |
| @@ -566,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node); | |||
| 566 | 557 | ||
| 567 | /** | 558 | /** |
| 568 | * vmalloc_exec - allocate virtually contiguous, executable memory | 559 | * vmalloc_exec - allocate virtually contiguous, executable memory |
| 569 | * | ||
| 570 | * @size: allocation size | 560 | * @size: allocation size |
| 571 | * | 561 | * |
| 572 | * Kernel-internal function to allocate enough pages to cover @size | 562 | * Kernel-internal function to allocate enough pages to cover @size |
| @@ -584,7 +574,6 @@ void *vmalloc_exec(unsigned long size) | |||
| 584 | 574 | ||
| 585 | /** | 575 | /** |
| 586 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 576 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
| 587 | * | ||
| 588 | * @size: allocation size | 577 | * @size: allocation size |
| 589 | * | 578 | * |
| 590 | * Allocate enough 32bit PA addressable pages to cover @size from the | 579 | * Allocate enough 32bit PA addressable pages to cover @size from the |
| @@ -597,11 +586,11 @@ void *vmalloc_32(unsigned long size) | |||
| 597 | EXPORT_SYMBOL(vmalloc_32); | 586 | EXPORT_SYMBOL(vmalloc_32); |
| 598 | 587 | ||
| 599 | /** | 588 | /** |
| 600 | * vmalloc_32_user - allocate virtually contiguous memory (32bit | 589 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory |
| 601 | * addressable) which is zeroed so it can be | ||
| 602 | * mapped to userspace without leaking data. | ||
| 603 | * | ||
| 604 | * @size: allocation size | 590 | * @size: allocation size |
| 591 | * | ||
| 592 | * The resulting memory area is 32bit addressable and zeroed so it can be | ||
| 593 | * mapped to userspace without leaking data. | ||
| 605 | */ | 594 | */ |
| 606 | void *vmalloc_32_user(unsigned long size) | 595 | void *vmalloc_32_user(unsigned long size) |
| 607 | { | 596 | { |
| @@ -695,7 +684,6 @@ finished: | |||
| 695 | 684 | ||
| 696 | /** | 685 | /** |
| 697 | * remap_vmalloc_range - map vmalloc pages to userspace | 686 | * remap_vmalloc_range - map vmalloc pages to userspace |
| 698 | * | ||
| 699 | * @vma: vma to cover (map full range of vma) | 687 | * @vma: vma to cover (map full range of vma) |
| 700 | * @addr: vmalloc memory | 688 | * @addr: vmalloc memory |
| 701 | * @pgoff: number of pages into addr before first page to map | 689 | * @pgoff: number of pages into addr before first page to map |
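The reworded vmalloc_user()/vmalloc_32_user() kernel-doc above describes buffers meant to be handed to userspace; a hedged sketch of the usual pairing with remap_vmalloc_range() in a driver's mmap handler. foo_buf and foo_mmap() are hypothetical, and error handling is trimmed to the essentials.

/*
 * Hedged sketch: pairing vmalloc_user() with remap_vmalloc_range() in a
 * driver mmap handler.  foo_buf and foo_mmap() are hypothetical and the
 * error handling is reduced to the essentials.
 */
static void *foo_buf;

static int foo_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        if (!foo_buf) {
                foo_buf = vmalloc_user(size);   /* zeroed, safe for userspace */
                if (!foo_buf)
                        return -ENOMEM;
        }
        /* Map the whole vma, starting at the first page of the buffer. */
        return remap_vmalloc_range(vma, foo_buf, 0);
}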
diff --git a/mm/vmscan.c b/mm/vmscan.c index 87779dda4ec6..eca70310adb2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
| 20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
| 21 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
| 22 | #include <linux/vmstat.h> | ||
| 22 | #include <linux/file.h> | 23 | #include <linux/file.h> |
| 23 | #include <linux/writeback.h> | 24 | #include <linux/writeback.h> |
| 24 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
| @@ -370,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
| 370 | /* synchronous write or broken a_ops? */ | 371 | /* synchronous write or broken a_ops? */ |
| 371 | ClearPageReclaim(page); | 372 | ClearPageReclaim(page); |
| 372 | } | 373 | } |
| 373 | 374 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | |
| 374 | return PAGE_SUCCESS; | 375 | return PAGE_SUCCESS; |
| 375 | } | 376 | } |
| 376 | 377 | ||
| @@ -383,11 +384,30 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
| 383 | BUG_ON(mapping != page_mapping(page)); | 384 | BUG_ON(mapping != page_mapping(page)); |
| 384 | 385 | ||
| 385 | write_lock_irq(&mapping->tree_lock); | 386 | write_lock_irq(&mapping->tree_lock); |
| 386 | |||
| 387 | /* | 387 | /* |
| 388 | * The non-racy check for busy page. It is critical to check | 388 | * The non racy check for a busy page. |
| 389 | * PageDirty _after_ making sure that the page is freeable and | 389 | * |
| 390 | * not in use by anybody. (pagecache + us == 2) | 390 | * Must be careful with the order of the tests. When someone has |
| 391 | * a ref to the page, it may be possible that they dirty it then | ||
| 392 | * drop the reference. So if PageDirty is tested before page_count | ||
| 393 | * here, then the following race may occur: | ||
| 394 | * | ||
| 395 | * get_user_pages(&page); | ||
| 396 | * [user mapping goes away] | ||
| 397 | * write_to(page); | ||
| 398 | * !PageDirty(page) [good] | ||
| 399 | * SetPageDirty(page); | ||
| 400 | * put_page(page); | ||
| 401 | * !page_count(page) [good, discard it] | ||
| 402 | * | ||
| 403 | * [oops, our write_to data is lost] | ||
| 404 | * | ||
| 405 | * Reversing the order of the tests ensures such a situation cannot | ||
| 406 | * escape unnoticed. The smp_rmb is needed to ensure the page->flags | ||
| 407 | * load is not satisfied before that of page->_count. | ||
| 408 | * | ||
| 409 | * Note that if SetPageDirty is always performed via set_page_dirty, | ||
| 410 | * and thus under tree_lock, then this ordering is not required. | ||
| 391 | */ | 411 | */ |
| 392 | if (unlikely(page_count(page) != 2)) | 412 | if (unlikely(page_count(page) != 2)) |
| 393 | goto cannot_free; | 413 | goto cannot_free; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 490d8c1a0ded..a2b6a9f96e5c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -371,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
| 371 | __inc_zone_state(z, NUMA_MISS); | 371 | __inc_zone_state(z, NUMA_MISS); |
| 372 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); | 372 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); |
| 373 | } | 373 | } |
| 374 | if (z->zone_pgdat == NODE_DATA(numa_node_id())) | 374 | if (z->node == numa_node_id()) |
| 375 | __inc_zone_state(z, NUMA_LOCAL); | 375 | __inc_zone_state(z, NUMA_LOCAL); |
| 376 | else | 376 | else |
| 377 | __inc_zone_state(z, NUMA_OTHER); | 377 | __inc_zone_state(z, NUMA_OTHER); |
| @@ -465,6 +465,7 @@ static char *vmstat_text[] = { | |||
| 465 | "nr_writeback", | 465 | "nr_writeback", |
| 466 | "nr_unstable", | 466 | "nr_unstable", |
| 467 | "nr_bounce", | 467 | "nr_bounce", |
| 468 | "nr_vmscan_write", | ||
| 468 | 469 | ||
| 469 | #ifdef CONFIG_NUMA | 470 | #ifdef CONFIG_NUMA |
| 470 | "numa_hit", | 471 | "numa_hit", |
