path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/memory.c      117
-rw-r--r--  mm/mempolicy.c     4
-rw-r--r--  mm/nommu.c       245
-rw-r--r--  mm/page_alloc.c  749
-rw-r--r--  mm/shmem.c         4
-rw-r--r--  mm/slab.c        126
-rw-r--r--  mm/slob.c          3
-rw-r--r--  mm/truncate.c     25
-rw-r--r--  mm/vmalloc.c      30
-rw-r--r--  mm/vmscan.c       30
-rw-r--r--  mm/vmstat.c        3
11 files changed, 1136 insertions(+), 200 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 92a3ebd8d795..601159a46ab6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2256,6 +2256,54 @@ oom:
2256} 2256}
2257 2257
2258/* 2258/*
2259 * do_no_pfn() tries to create a new page mapping for a page without
2260 * a struct_page backing it
2261 *
2262 * As this is called only for pages that do not currently exist, we
2263 * do not need to flush old virtual caches or the TLB.
2264 *
2265 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2266 * but allow concurrent faults), and pte mapped but not yet locked.
2267 * We return with mmap_sem still held, but pte unmapped and unlocked.
2268 *
2269 * It is expected that the ->nopfn handler always returns the same pfn
2270 * for a given virtual mapping.
2271 *
2272 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2273 */
2274static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2275 unsigned long address, pte_t *page_table, pmd_t *pmd,
2276 int write_access)
2277{
2278 spinlock_t *ptl;
2279 pte_t entry;
2280 unsigned long pfn;
2281 int ret = VM_FAULT_MINOR;
2282
2283 pte_unmap(page_table);
2284 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
2285 BUG_ON(is_cow_mapping(vma->vm_flags));
2286
2287 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2288 if (pfn == NOPFN_OOM)
2289 return VM_FAULT_OOM;
2290 if (pfn == NOPFN_SIGBUS)
2291 return VM_FAULT_SIGBUS;
2292
2293 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2294
2295 /* Only go through if we didn't race with anybody else... */
2296 if (pte_none(*page_table)) {
2297 entry = pfn_pte(pfn, vma->vm_page_prot);
2298 if (write_access)
2299 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2300 set_pte_at(mm, address, page_table, entry);
2301 }
2302 pte_unmap_unlock(page_table, ptl);
2303 return ret;
2304}
2305
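
To illustrate how this new fault path would be exercised, below is a minimal, hypothetical driver-side ->nopfn handler for a device that maps a contiguous physical region; mydev_phys_base and MYDEV_REGION_SIZE are illustrative names, not part of this patch. The handler returns the same pfn for a given address on every call, or NOPFN_SIGBUS/NOPFN_OOM on error, which is what do_no_pfn() above expects.

static unsigned long mydev_nopfn(struct vm_area_struct *vma,
				 unsigned long address)
{
	unsigned long offset = address - vma->vm_start;

	/* hypothetical bounds check against the device region size */
	if (offset >= MYDEV_REGION_SIZE)
		return NOPFN_SIGBUS;

	/* same pfn for a given address on every fault, as required */
	return (mydev_phys_base + offset) >> PAGE_SHIFT;
}

static struct vm_operations_struct mydev_vm_ops = {
	.nopfn	= mydev_nopfn,
};

The driver's mmap() would also be expected to have set VM_PFNMAP on the vma, since do_no_pfn() BUG_ONs otherwise.
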
2306/*
2259 * Fault of a previously existing named mapping. Repopulate the pte 2307 * Fault of a previously existing named mapping. Repopulate the pte
2260 * from the encoded file_pte if possible. This enables swappable 2308 * from the encoded file_pte if possible. This enables swappable
2261 * nonlinear vmas. 2309 * nonlinear vmas.
@@ -2317,11 +2365,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2317 old_entry = entry = *pte; 2365 old_entry = entry = *pte;
2318 if (!pte_present(entry)) { 2366 if (!pte_present(entry)) {
2319 if (pte_none(entry)) { 2367 if (pte_none(entry)) {
2320 if (!vma->vm_ops || !vma->vm_ops->nopage) 2368 if (vma->vm_ops) {
2321 return do_anonymous_page(mm, vma, address, 2369 if (vma->vm_ops->nopage)
2322 pte, pmd, write_access); 2370 return do_no_page(mm, vma, address,
2323 return do_no_page(mm, vma, address, 2371 pte, pmd,
2324 pte, pmd, write_access); 2372 write_access);
2373 if (unlikely(vma->vm_ops->nopfn))
2374 return do_no_pfn(mm, vma, address, pte,
2375 pmd, write_access);
2376 }
2377 return do_anonymous_page(mm, vma, address,
2378 pte, pmd, write_access);
2325 } 2379 }
2326 if (pte_file(entry)) 2380 if (pte_file(entry))
2327 return do_file_page(mm, vma, address, 2381 return do_file_page(mm, vma, address,
@@ -2550,3 +2604,56 @@ int in_gate_area_no_task(unsigned long addr)
2550} 2604}
2551 2605
2552#endif /* __HAVE_ARCH_GATE_AREA */ 2606#endif /* __HAVE_ARCH_GATE_AREA */
2607
2608/*
2609 * Access another process' address space.
2610 * Source/target buffer must be kernel space,
2611 * Do not walk the page table directly, use get_user_pages
2612 */
2613int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2614{
2615 struct mm_struct *mm;
2616 struct vm_area_struct *vma;
2617 struct page *page;
2618 void *old_buf = buf;
2619
2620 mm = get_task_mm(tsk);
2621 if (!mm)
2622 return 0;
2623
2624 down_read(&mm->mmap_sem);
 2625 /* ignore errors, just check how much was successfully transferred */
2626 while (len) {
2627 int bytes, ret, offset;
2628 void *maddr;
2629
2630 ret = get_user_pages(tsk, mm, addr, 1,
2631 write, 1, &page, &vma);
2632 if (ret <= 0)
2633 break;
2634
2635 bytes = len;
2636 offset = addr & (PAGE_SIZE-1);
2637 if (bytes > PAGE_SIZE-offset)
2638 bytes = PAGE_SIZE-offset;
2639
2640 maddr = kmap(page);
2641 if (write) {
2642 copy_to_user_page(vma, page, addr,
2643 maddr + offset, buf, bytes);
2644 set_page_dirty_lock(page);
2645 } else {
2646 copy_from_user_page(vma, page, addr,
2647 buf, maddr + offset, bytes);
2648 }
2649 kunmap(page);
2650 page_cache_release(page);
2651 len -= bytes;
2652 buf += bytes;
2653 addr += bytes;
2654 }
2655 up_read(&mm->mmap_sem);
2656 mmput(mm);
2657
2658 return buf - old_buf;
2659}
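
As a rough usage sketch (not part of the patch), a ptrace-style caller reading another task's memory would do something like the following; child and user_addr are assumed to be supplied by the caller:

	char kbuf[64];
	int copied;

	/* read up to 64 bytes from the child's address space */
	copied = access_process_vm(child, user_addr, kbuf, sizeof(kbuf), 0);
	if (copied < (int)sizeof(kbuf)) {
		/* partial result: the rest of the range was not mapped or accessible */
	}
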
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38f89650bc84..cf18f0942553 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1136,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1136 */ 1136 */
1137unsigned slab_node(struct mempolicy *policy) 1137unsigned slab_node(struct mempolicy *policy)
1138{ 1138{
1139 switch (policy->policy) { 1139 int pol = policy ? policy->policy : MPOL_DEFAULT;
1140
1141 switch (pol) {
1140 case MPOL_INTERLEAVE: 1142 case MPOL_INTERLEAVE:
1141 return interleave_nodes(policy); 1143 return interleave_nodes(policy);
1142 1144
diff --git a/mm/nommu.c b/mm/nommu.c
index d99dea31e443..564540662192 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp)
122} 122}
123 123
124/* 124/*
125 * The nommu dodgy version :-) 125 * get a list of pages in an address range belonging to the specified process
126 * and indicate the VMA that covers each page
 127 * - this is potentially dodgy as we may end up incrementing the page count of a
128 * slab page or a secondary page from a compound page
129 * - don't permit access to VMAs that don't support it, such as I/O mappings
126 */ 130 */
127int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 131int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int len, int write, int force, 132 unsigned long start, int len, int write, int force,
129 struct page **pages, struct vm_area_struct **vmas) 133 struct page **pages, struct vm_area_struct **vmas)
130{ 134{
135 struct vm_area_struct *vma;
136 unsigned long vm_flags;
131 int i; 137 int i;
132 static struct vm_area_struct dummy_vma; 138
139 /* calculate required read or write permissions.
140 * - if 'force' is set, we only require the "MAY" flags.
141 */
142 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
143 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
133 144
134 for (i = 0; i < len; i++) { 145 for (i = 0; i < len; i++) {
146 vma = find_vma(mm, start);
147 if (!vma)
148 goto finish_or_fault;
149
150 /* protect what we can, including chardevs */
151 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
152 !(vm_flags & vma->vm_flags))
153 goto finish_or_fault;
154
135 if (pages) { 155 if (pages) {
136 pages[i] = virt_to_page(start); 156 pages[i] = virt_to_page(start);
137 if (pages[i]) 157 if (pages[i])
138 page_cache_get(pages[i]); 158 page_cache_get(pages[i]);
139 } 159 }
140 if (vmas) 160 if (vmas)
141 vmas[i] = &dummy_vma; 161 vmas[i] = vma;
142 start += PAGE_SIZE; 162 start += PAGE_SIZE;
143 } 163 }
144 return(i); 164
165 return i;
166
167finish_or_fault:
168 return i ? : -EFAULT;
145} 169}
146 170
147EXPORT_SYMBOL(get_user_pages); 171EXPORT_SYMBOL(get_user_pages);
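
The two-step mask computation above resolves to a single required flag per case; a worked table (editorial, not in the patch itself) makes the intent clearer:

	write=0, force=0  ->  vma must have VM_READ
	write=0, force=1  ->  vma must have VM_MAYREAD
	write=1, force=0  ->  vma must have VM_WRITE
	write=1, force=1  ->  vma must have VM_MAYWRITE

In every case VM_IO/VM_PFNMAP mappings are refused outright.
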
@@ -286,6 +310,77 @@ static void show_process_blocks(void)
286} 310}
287#endif /* DEBUG */ 311#endif /* DEBUG */
288 312
313/*
314 * add a VMA into a process's mm_struct in the appropriate place in the list
315 * - should be called with mm->mmap_sem held writelocked
316 */
317static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
318{
319 struct vm_list_struct **ppv;
320
321 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
322 if ((*ppv)->vma->vm_start > vml->vma->vm_start)
323 break;
324
325 vml->next = *ppv;
326 *ppv = vml;
327}
328
329/*
330 * look up the first VMA in which addr resides, NULL if none
331 * - should be called with mm->mmap_sem at least held readlocked
332 */
333struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
334{
335 struct vm_list_struct *loop, *vml;
336
337 /* search the vm_start ordered list */
338 vml = NULL;
339 for (loop = mm->context.vmlist; loop; loop = loop->next) {
340 if (loop->vma->vm_start > addr)
341 break;
342 vml = loop;
343 }
344
345 if (vml && vml->vma->vm_end > addr)
346 return vml->vma;
347
348 return NULL;
349}
350EXPORT_SYMBOL(find_vma);
351
352/*
353 * find a VMA
354 * - we don't extend stack VMAs under NOMMU conditions
355 */
356struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
357{
358 return find_vma(mm, addr);
359}
360
361/*
 362 * look up the first VMA that exactly matches addr
363 * - should be called with mm->mmap_sem at least held readlocked
364 */
365static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
366 unsigned long addr)
367{
368 struct vm_list_struct *vml;
369
370 /* search the vm_start ordered list */
371 for (vml = mm->context.vmlist; vml; vml = vml->next) {
372 if (vml->vma->vm_start == addr)
373 return vml->vma;
374 if (vml->vma->vm_start > addr)
375 break;
376 }
377
378 return NULL;
379}
380
381/*
382 * find a VMA in the global tree
383 */
289static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 384static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
290{ 385{
291 struct vm_area_struct *vma; 386 struct vm_area_struct *vma;
@@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
305 return NULL; 400 return NULL;
306} 401}
307 402
403/*
404 * add a VMA in the global tree
405 */
308static void add_nommu_vma(struct vm_area_struct *vma) 406static void add_nommu_vma(struct vm_area_struct *vma)
309{ 407{
310 struct vm_area_struct *pvma; 408 struct vm_area_struct *pvma;
@@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma)
351 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 449 rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
352} 450}
353 451
452/*
453 * delete a VMA from the global list
454 */
354static void delete_nommu_vma(struct vm_area_struct *vma) 455static void delete_nommu_vma(struct vm_area_struct *vma)
355{ 456{
356 struct address_space *mapping; 457 struct address_space *mapping;
@@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file,
828 realalloc += kobjsize(vml); 929 realalloc += kobjsize(vml);
829 askedalloc += sizeof(*vml); 930 askedalloc += sizeof(*vml);
830 931
831 vml->next = current->mm->context.vmlist; 932 add_vma_to_mm(current->mm, vml);
832 current->mm->context.vmlist = vml;
833 933
834 up_write(&nommu_vma_sem); 934 up_write(&nommu_vma_sem);
835 935
@@ -908,6 +1008,11 @@ static void put_vma(struct vm_area_struct *vma)
908 } 1008 }
909} 1009}
910 1010
1011/*
1012 * release a mapping
1013 * - under NOMMU conditions the parameters must match exactly to the mapping to
1014 * be removed
1015 */
911int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1016int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
912{ 1017{
913 struct vm_list_struct *vml, **parent; 1018 struct vm_list_struct *vml, **parent;
@@ -917,10 +1022,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
917 printk("do_munmap:\n"); 1022 printk("do_munmap:\n");
918#endif 1023#endif
919 1024
920 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) 1025 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
1026 if ((*parent)->vma->vm_start > addr)
1027 break;
921 if ((*parent)->vma->vm_start == addr && 1028 if ((*parent)->vma->vm_start == addr &&
922 ((len == 0) || ((*parent)->vma->vm_end == end))) 1029 ((len == 0) || ((*parent)->vma->vm_end == end)))
923 goto found; 1030 goto found;
1031 }
924 1032
925 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1033 printk("munmap of non-mmaped memory by process %d (%s): %p\n",
926 current->pid, current->comm, (void *) addr); 1034 current->pid, current->comm, (void *) addr);
@@ -946,7 +1054,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
946 return 0; 1054 return 0;
947} 1055}
948 1056
949/* Release all mmaps. */ 1057asmlinkage long sys_munmap(unsigned long addr, size_t len)
1058{
1059 int ret;
1060 struct mm_struct *mm = current->mm;
1061
1062 down_write(&mm->mmap_sem);
1063 ret = do_munmap(mm, addr, len);
1064 up_write(&mm->mmap_sem);
1065 return ret;
1066}
1067
1068/*
1069 * Release all mappings
1070 */
950void exit_mmap(struct mm_struct * mm) 1071void exit_mmap(struct mm_struct * mm)
951{ 1072{
952 struct vm_list_struct *tmp; 1073 struct vm_list_struct *tmp;
@@ -973,37 +1094,26 @@ void exit_mmap(struct mm_struct * mm)
973 } 1094 }
974} 1095}
975 1096
976asmlinkage long sys_munmap(unsigned long addr, size_t len)
977{
978 int ret;
979 struct mm_struct *mm = current->mm;
980
981 down_write(&mm->mmap_sem);
982 ret = do_munmap(mm, addr, len);
983 up_write(&mm->mmap_sem);
984 return ret;
985}
986
987unsigned long do_brk(unsigned long addr, unsigned long len) 1097unsigned long do_brk(unsigned long addr, unsigned long len)
988{ 1098{
989 return -ENOMEM; 1099 return -ENOMEM;
990} 1100}
991 1101
992/* 1102/*
993 * Expand (or shrink) an existing mapping, potentially moving it at the 1103 * expand (or shrink) an existing mapping, potentially moving it at the same
994 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1104 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
995 * 1105 *
996 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 1106 * under NOMMU conditions, we only permit changing a mapping's size, and only
997 * This option implies MREMAP_MAYMOVE. 1107 * as long as it stays within the hole allocated by the kmalloc() call in
1108 * do_mmap_pgoff() and the block is not shareable
998 * 1109 *
999 * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the 1110 * MREMAP_FIXED is not supported under NOMMU conditions
1000 * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable
1001 */ 1111 */
1002unsigned long do_mremap(unsigned long addr, 1112unsigned long do_mremap(unsigned long addr,
1003 unsigned long old_len, unsigned long new_len, 1113 unsigned long old_len, unsigned long new_len,
1004 unsigned long flags, unsigned long new_addr) 1114 unsigned long flags, unsigned long new_addr)
1005{ 1115{
1006 struct vm_list_struct *vml = NULL; 1116 struct vm_area_struct *vma;
1007 1117
1008 /* insanity checks first */ 1118 /* insanity checks first */
1009 if (new_len == 0) 1119 if (new_len == 0)
@@ -1012,58 +1122,46 @@ unsigned long do_mremap(unsigned long addr,
1012 if (flags & MREMAP_FIXED && new_addr != addr) 1122 if (flags & MREMAP_FIXED && new_addr != addr)
1013 return (unsigned long) -EINVAL; 1123 return (unsigned long) -EINVAL;
1014 1124
1015 for (vml = current->mm->context.vmlist; vml; vml = vml->next) 1125 vma = find_vma_exact(current->mm, addr);
1016 if (vml->vma->vm_start == addr) 1126 if (!vma)
1017 goto found; 1127 return (unsigned long) -EINVAL;
1018
1019 return (unsigned long) -EINVAL;
1020 1128
1021 found: 1129 if (vma->vm_end != vma->vm_start + old_len)
1022 if (vml->vma->vm_end != vml->vma->vm_start + old_len)
1023 return (unsigned long) -EFAULT; 1130 return (unsigned long) -EFAULT;
1024 1131
1025 if (vml->vma->vm_flags & VM_MAYSHARE) 1132 if (vma->vm_flags & VM_MAYSHARE)
1026 return (unsigned long) -EPERM; 1133 return (unsigned long) -EPERM;
1027 1134
1028 if (new_len > kobjsize((void *) addr)) 1135 if (new_len > kobjsize((void *) addr))
1029 return (unsigned long) -ENOMEM; 1136 return (unsigned long) -ENOMEM;
1030 1137
1031 /* all checks complete - do it */ 1138 /* all checks complete - do it */
1032 vml->vma->vm_end = vml->vma->vm_start + new_len; 1139 vma->vm_end = vma->vm_start + new_len;
1033 1140
1034 askedalloc -= old_len; 1141 askedalloc -= old_len;
1035 askedalloc += new_len; 1142 askedalloc += new_len;
1036 1143
1037 return vml->vma->vm_start; 1144 return vma->vm_start;
1038} 1145}
1039 1146
1040/* 1147asmlinkage unsigned long sys_mremap(unsigned long addr,
1041 * Look up the first VMA which satisfies addr < vm_end, NULL if none 1148 unsigned long old_len, unsigned long new_len,
1042 */ 1149 unsigned long flags, unsigned long new_addr)
1043struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1044{ 1150{
1045 struct vm_list_struct *vml; 1151 unsigned long ret;
1046
1047 for (vml = mm->context.vmlist; vml; vml = vml->next)
1048 if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
1049 return vml->vma;
1050 1152
1051 return NULL; 1153 down_write(&current->mm->mmap_sem);
1154 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1155 up_write(&current->mm->mmap_sem);
1156 return ret;
1052} 1157}
1053 1158
1054EXPORT_SYMBOL(find_vma);
1055
1056struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1159struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1057 unsigned int foll_flags) 1160 unsigned int foll_flags)
1058{ 1161{
1059 return NULL; 1162 return NULL;
1060} 1163}
1061 1164
1062struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
1063{
1064 return NULL;
1065}
1066
1067int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1165int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1068 unsigned long to, unsigned long size, pgprot_t prot) 1166 unsigned long to, unsigned long size, pgprot_t prot)
1069{ 1167{
@@ -1206,3 +1304,44 @@ struct page *filemap_nopage(struct vm_area_struct *area,
1206 BUG(); 1304 BUG();
1207 return NULL; 1305 return NULL;
1208} 1306}
1307
1308/*
1309 * Access another process' address space.
1310 * - source/target buffer must be kernel space
1311 */
1312int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1313{
1314 struct vm_area_struct *vma;
1315 struct mm_struct *mm;
1316
1317 if (addr + len < addr)
1318 return 0;
1319
1320 mm = get_task_mm(tsk);
1321 if (!mm)
1322 return 0;
1323
1324 down_read(&mm->mmap_sem);
1325
1326 /* the access must start within one of the target process's mappings */
1327 vma = find_vma(mm, addr);
1328 if (vma) {
1329 /* don't overrun this mapping */
1330 if (addr + len >= vma->vm_end)
1331 len = vma->vm_end - addr;
1332
1333 /* only read or write mappings where it is permitted */
1334 if (write && vma->vm_flags & VM_MAYWRITE)
1335 len -= copy_to_user((void *) addr, buf, len);
1336 else if (!write && vma->vm_flags & VM_MAYREAD)
1337 len -= copy_from_user(buf, (void *) addr, len);
1338 else
1339 len = 0;
1340 } else {
1341 len = 0;
1342 }
1343
1344 up_read(&mm->mmap_sem);
1345 mmput(mm);
1346 return len;
1347}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..4f59d90b81e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
37#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
38#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h>
41#include <linux/pfn.h>
40 42
41#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
42#include <asm/div64.h> 44#include <asm/div64.h>
@@ -102,6 +104,38 @@ int min_free_kbytes = 1024;
102 104
103unsigned long __meminitdata nr_kernel_pages; 105unsigned long __meminitdata nr_kernel_pages;
104unsigned long __meminitdata nr_all_pages; 106unsigned long __meminitdata nr_all_pages;
107static unsigned long __initdata dma_reserve;
108
109#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
110 /*
 111 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
112 * ranges of memory (RAM) that may be registered with add_active_range().
113 * Ranges passed to add_active_range() will be merged if possible
114 * so the number of times add_active_range() can be called is
115 * related to the number of nodes and the number of holes
116 */
117 #ifdef CONFIG_MAX_ACTIVE_REGIONS
118 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
119 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
120 #else
121 #if MAX_NUMNODES >= 32
122 /* If there can be many nodes, allow up to 50 holes per node */
123 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
124 #else
125 /* By default, allow up to 256 distinct regions */
126 #define MAX_ACTIVE_REGIONS 256
127 #endif
128 #endif
129
130 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
131 int __initdata nr_nodemap_entries;
132 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
133 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
134#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
135 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
136 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
137#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
138#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
105 139
106#ifdef CONFIG_DEBUG_VM 140#ifdef CONFIG_DEBUG_VM
107static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 141static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -908,7 +942,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
908 */ 942 */
909 do { 943 do {
910 zone = *z; 944 zone = *z;
911 if (unlikely((gfp_mask & __GFP_THISNODE) && 945 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
912 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 946 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
913 break; 947 break;
914 if ((alloc_flags & ALLOC_CPUSET) && 948 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1222,14 +1256,12 @@ unsigned int nr_free_pagecache_pages(void)
1222{ 1256{
1223 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1257 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1224} 1258}
1225#ifdef CONFIG_NUMA 1259
1226static void show_node(struct zone *zone) 1260static inline void show_node(struct zone *zone)
1227{ 1261{
1228 printk("Node %ld ", zone_to_nid(zone)); 1262 if (NUMA_BUILD)
1263 printk("Node %ld ", zone_to_nid(zone));
1229} 1264}
1230#else
1231#define show_node(zone) do { } while (0)
1232#endif
1233 1265
1234void si_meminfo(struct sysinfo *val) 1266void si_meminfo(struct sysinfo *val)
1235{ 1267{
@@ -1271,34 +1303,30 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1271 */ 1303 */
1272void show_free_areas(void) 1304void show_free_areas(void)
1273{ 1305{
1274 int cpu, temperature; 1306 int cpu;
1275 unsigned long active; 1307 unsigned long active;
1276 unsigned long inactive; 1308 unsigned long inactive;
1277 unsigned long free; 1309 unsigned long free;
1278 struct zone *zone; 1310 struct zone *zone;
1279 1311
1280 for_each_zone(zone) { 1312 for_each_zone(zone) {
1281 show_node(zone); 1313 if (!populated_zone(zone))
1282 printk("%s per-cpu:", zone->name);
1283
1284 if (!populated_zone(zone)) {
1285 printk(" empty\n");
1286 continue; 1314 continue;
1287 } else 1315
1288 printk("\n"); 1316 show_node(zone);
1317 printk("%s per-cpu:\n", zone->name);
1289 1318
1290 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1291 struct per_cpu_pageset *pageset; 1320 struct per_cpu_pageset *pageset;
1292 1321
1293 pageset = zone_pcp(zone, cpu); 1322 pageset = zone_pcp(zone, cpu);
1294 1323
1295 for (temperature = 0; temperature < 2; temperature++) 1324 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1296 printk("cpu %d %s: high %d, batch %d used:%d\n", 1325 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1297 cpu, 1326 cpu, pageset->pcp[0].high,
1298 temperature ? "cold" : "hot", 1327 pageset->pcp[0].batch, pageset->pcp[0].count,
1299 pageset->pcp[temperature].high, 1328 pageset->pcp[1].high, pageset->pcp[1].batch,
1300 pageset->pcp[temperature].batch, 1329 pageset->pcp[1].count);
1301 pageset->pcp[temperature].count);
1302 } 1330 }
1303 } 1331 }
1304 1332
@@ -1320,6 +1348,9 @@ void show_free_areas(void)
1320 for_each_zone(zone) { 1348 for_each_zone(zone) {
1321 int i; 1349 int i;
1322 1350
1351 if (!populated_zone(zone))
1352 continue;
1353
1323 show_node(zone); 1354 show_node(zone);
1324 printk("%s" 1355 printk("%s"
1325 " free:%lukB" 1356 " free:%lukB"
@@ -1352,12 +1383,11 @@ void show_free_areas(void)
1352 for_each_zone(zone) { 1383 for_each_zone(zone) {
1353 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1384 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1354 1385
1386 if (!populated_zone(zone))
1387 continue;
1388
1355 show_node(zone); 1389 show_node(zone);
1356 printk("%s: ", zone->name); 1390 printk("%s: ", zone->name);
1357 if (!populated_zone(zone)) {
1358 printk("empty\n");
1359 continue;
1360 }
1361 1391
1362 spin_lock_irqsave(&zone->lock, flags); 1392 spin_lock_irqsave(&zone->lock, flags);
1363 for (order = 0; order < MAX_ORDER; order++) { 1393 for (order = 0; order < MAX_ORDER; order++) {
@@ -1561,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy)
1561void __meminit build_all_zonelists(void) 1591void __meminit build_all_zonelists(void)
1562{ 1592{
1563 if (system_state == SYSTEM_BOOTING) { 1593 if (system_state == SYSTEM_BOOTING) {
1564 __build_all_zonelists(0); 1594 __build_all_zonelists(NULL);
1565 cpuset_init_current_mems_allowed(); 1595 cpuset_init_current_mems_allowed();
1566 } else { 1596 } else {
1567 /* we have to stop all cpus to guarantee there is no user 1597
@@ -1642,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
1642 1672
1643#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1673#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1644 1674
1645static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1646 unsigned long *zones_size, unsigned long *zholes_size)
1647{
1648 unsigned long realtotalpages, totalpages = 0;
1649 enum zone_type i;
1650
1651 for (i = 0; i < MAX_NR_ZONES; i++)
1652 totalpages += zones_size[i];
1653 pgdat->node_spanned_pages = totalpages;
1654
1655 realtotalpages = totalpages;
1656 if (zholes_size)
1657 for (i = 0; i < MAX_NR_ZONES; i++)
1658 realtotalpages -= zholes_size[i];
1659 pgdat->node_present_pages = realtotalpages;
1660 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1661}
1662
1663
1664/* 1675/*
1665 * Initially all pages are reserved - free ones are freed 1676 * Initially all pages are reserved - free ones are freed
1666 * up by free_all_bootmem() once the early boot process is 1677 * up by free_all_bootmem() once the early boot process is
@@ -1818,6 +1829,9 @@ static int __cpuinit process_zones(int cpu)
1818 1829
1819 for_each_zone(zone) { 1830 for_each_zone(zone) {
1820 1831
1832 if (!populated_zone(zone))
1833 continue;
1834
1821 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 1835 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1822 GFP_KERNEL, cpu_to_node(cpu)); 1836 GFP_KERNEL, cpu_to_node(cpu));
1823 if (!zone_pcp(zone, cpu)) 1837 if (!zone_pcp(zone, cpu))
@@ -1977,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1977 return 0; 1991 return 0;
1978} 1992}
1979 1993
1994#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
1995/*
1996 * Basic iterator support. Return the first range of PFNs for a node
1997 * Note: nid == MAX_NUMNODES returns first region regardless of node
1998 */
1999static int __init first_active_region_index_in_nid(int nid)
2000{
2001 int i;
2002
2003 for (i = 0; i < nr_nodemap_entries; i++)
2004 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2005 return i;
2006
2007 return -1;
2008}
2009
2010/*
2011 * Basic iterator support. Return the next active range of PFNs for a node
 2012 * Note: nid == MAX_NUMNODES returns next region regardless of node
2013 */
2014static int __init next_active_region_index_in_nid(int index, int nid)
2015{
2016 for (index = index + 1; index < nr_nodemap_entries; index++)
2017 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2018 return index;
2019
2020 return -1;
2021}
2022
2023#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2024/*
2025 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2026 * Architectures may implement their own version but if add_active_range()
2027 * was used and there are no special requirements, this is a convenient
2028 * alternative
2029 */
2030int __init early_pfn_to_nid(unsigned long pfn)
2031{
2032 int i;
2033
2034 for (i = 0; i < nr_nodemap_entries; i++) {
2035 unsigned long start_pfn = early_node_map[i].start_pfn;
2036 unsigned long end_pfn = early_node_map[i].end_pfn;
2037
2038 if (start_pfn <= pfn && pfn < end_pfn)
2039 return early_node_map[i].nid;
2040 }
2041
2042 return 0;
2043}
2044#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2045
2046/* Basic iterator support to walk early_node_map[] */
2047#define for_each_active_range_index_in_nid(i, nid) \
2048 for (i = first_active_region_index_in_nid(nid); i != -1; \
2049 i = next_active_region_index_in_nid(i, nid))
2050
2051/**
2052 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2053 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
 2054 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2055 *
2056 * If an architecture guarantees that all ranges registered with
2057 * add_active_ranges() contain no holes and may be freed, this
 2058 * function may be used instead of calling free_bootmem() manually.
2059 */
2060void __init free_bootmem_with_active_regions(int nid,
2061 unsigned long max_low_pfn)
2062{
2063 int i;
2064
2065 for_each_active_range_index_in_nid(i, nid) {
2066 unsigned long size_pages = 0;
2067 unsigned long end_pfn = early_node_map[i].end_pfn;
2068
2069 if (early_node_map[i].start_pfn >= max_low_pfn)
2070 continue;
2071
2072 if (end_pfn > max_low_pfn)
2073 end_pfn = max_low_pfn;
2074
2075 size_pages = end_pfn - early_node_map[i].start_pfn;
2076 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2077 PFN_PHYS(early_node_map[i].start_pfn),
2078 size_pages << PAGE_SHIFT);
2079 }
2080}
2081
2082/**
2083 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2084 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
2085 *
2086 * If an architecture guarantees that all ranges registered with
2087 * add_active_ranges() contain no holes and may be freed, this
 2088 * function may be used instead of calling memory_present() manually.
2089 */
2090void __init sparse_memory_present_with_active_regions(int nid)
2091{
2092 int i;
2093
2094 for_each_active_range_index_in_nid(i, nid)
2095 memory_present(early_node_map[i].nid,
2096 early_node_map[i].start_pfn,
2097 early_node_map[i].end_pfn);
2098}
2099
2100/**
2101 * push_node_boundaries - Push node boundaries to at least the requested boundary
2102 * @nid: The nid of the node to push the boundary for
2103 * @start_pfn: The start pfn of the node
2104 * @end_pfn: The end pfn of the node
2105 *
2106 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
2107 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2108 * be hotplugged even though no physical memory exists. This function allows
2109 * an arch to push out the node boundaries so mem_map is allocated that can
2110 * be used later.
2111 */
2112#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2113void __init push_node_boundaries(unsigned int nid,
2114 unsigned long start_pfn, unsigned long end_pfn)
2115{
2116 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2117 nid, start_pfn, end_pfn);
2118
2119 /* Initialise the boundary for this node if necessary */
2120 if (node_boundary_end_pfn[nid] == 0)
2121 node_boundary_start_pfn[nid] = -1UL;
2122
2123 /* Update the boundaries */
2124 if (node_boundary_start_pfn[nid] > start_pfn)
2125 node_boundary_start_pfn[nid] = start_pfn;
2126 if (node_boundary_end_pfn[nid] < end_pfn)
2127 node_boundary_end_pfn[nid] = end_pfn;
2128}
2129
2130/* If necessary, push the node boundary out for reserve hotadd */
2131static void __init account_node_boundary(unsigned int nid,
2132 unsigned long *start_pfn, unsigned long *end_pfn)
2133{
2134 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2135 nid, *start_pfn, *end_pfn);
2136
2137 /* Return if boundary information has not been provided */
2138 if (node_boundary_end_pfn[nid] == 0)
2139 return;
2140
2141 /* Check the boundaries and update if necessary */
2142 if (node_boundary_start_pfn[nid] < *start_pfn)
2143 *start_pfn = node_boundary_start_pfn[nid];
2144 if (node_boundary_end_pfn[nid] > *end_pfn)
2145 *end_pfn = node_boundary_end_pfn[nid];
2146}
2147#else
2148void __init push_node_boundaries(unsigned int nid,
2149 unsigned long start_pfn, unsigned long end_pfn) {}
2150
2151static void __init account_node_boundary(unsigned int nid,
2152 unsigned long *start_pfn, unsigned long *end_pfn) {}
2153#endif
2154
2155
2156/**
2157 * get_pfn_range_for_nid - Return the start and end page frames for a node
2158 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
2159 * @start_pfn: Passed by reference. On return, it will have the node start_pfn
2160 * @end_pfn: Passed by reference. On return, it will have the node end_pfn
2161 *
2162 * It returns the start and end page frame of a node based on information
2163 * provided by an arch calling add_active_range(). If called for a node
2164 * with no available memory, a warning is printed and the start and end
2165 * PFNs will be 0
2166 */
2167void __init get_pfn_range_for_nid(unsigned int nid,
2168 unsigned long *start_pfn, unsigned long *end_pfn)
2169{
2170 int i;
2171 *start_pfn = -1UL;
2172 *end_pfn = 0;
2173
2174 for_each_active_range_index_in_nid(i, nid) {
2175 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2176 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2177 }
2178
2179 if (*start_pfn == -1UL) {
2180 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2181 *start_pfn = 0;
2182 }
2183
2184 /* Push the node boundaries out if requested */
2185 account_node_boundary(nid, start_pfn, end_pfn);
2186}
2187
2188/*
2189 * Return the number of pages a zone spans in a node, including holes
2190 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2191 */
2192unsigned long __init zone_spanned_pages_in_node(int nid,
2193 unsigned long zone_type,
2194 unsigned long *ignored)
2195{
2196 unsigned long node_start_pfn, node_end_pfn;
2197 unsigned long zone_start_pfn, zone_end_pfn;
2198
2199 /* Get the start and end of the node and zone */
2200 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2201 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2202 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2203
2204 /* Check that this node has pages within the zone's required range */
2205 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2206 return 0;
2207
2208 /* Move the zone boundaries inside the node if necessary */
2209 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2210 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2211
2212 /* Return the spanned pages */
2213 return zone_end_pfn - zone_start_pfn;
2214}
2215
2216/*
2217 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2218 * then all holes in the requested range will be accounted for
2219 */
2220unsigned long __init __absent_pages_in_range(int nid,
2221 unsigned long range_start_pfn,
2222 unsigned long range_end_pfn)
2223{
2224 int i = 0;
2225 unsigned long prev_end_pfn = 0, hole_pages = 0;
2226 unsigned long start_pfn;
2227
2228 /* Find the end_pfn of the first active range of pfns in the node */
2229 i = first_active_region_index_in_nid(nid);
2230 if (i == -1)
2231 return 0;
2232
2233 /* Account for ranges before physical memory on this node */
2234 if (early_node_map[i].start_pfn > range_start_pfn)
2235 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2236
2237 prev_end_pfn = early_node_map[i].start_pfn;
2238
2239 /* Find all holes for the zone within the node */
2240 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2241
2242 /* No need to continue if prev_end_pfn is outside the zone */
2243 if (prev_end_pfn >= range_end_pfn)
2244 break;
2245
2246 /* Make sure the end of the zone is not within the hole */
2247 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2248 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2249
 2250 /* Update the hole size count and move on */
2251 if (start_pfn > range_start_pfn) {
2252 BUG_ON(prev_end_pfn > start_pfn);
2253 hole_pages += start_pfn - prev_end_pfn;
2254 }
2255 prev_end_pfn = early_node_map[i].end_pfn;
2256 }
2257
2258 /* Account for ranges past physical memory on this node */
2259 if (range_end_pfn > prev_end_pfn)
2260 hole_pages = range_end_pfn -
2261 max(range_start_pfn, prev_end_pfn);
2262
2263 return hole_pages;
2264}
2265
2266/**
2267 * absent_pages_in_range - Return number of page frames in holes within a range
2268 * @start_pfn: The start PFN to start searching for holes
2269 * @end_pfn: The end PFN to stop searching for holes
2270 *
2271 * It returns the number of pages frames in memory holes within a range
2272 */
2273unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2274 unsigned long end_pfn)
2275{
2276 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2277}
2278
2279/* Return the number of page frames in holes in a zone on a node */
2280unsigned long __init zone_absent_pages_in_node(int nid,
2281 unsigned long zone_type,
2282 unsigned long *ignored)
2283{
2284 unsigned long node_start_pfn, node_end_pfn;
2285 unsigned long zone_start_pfn, zone_end_pfn;
2286
2287 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2288 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2289 node_start_pfn);
2290 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2291 node_end_pfn);
2292
2293 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2294}
2295
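
A small worked example (hypothetical numbers) of how these two helpers combine, per the present_pages formula noted above:

	node 0 registered ranges: [0x1000, 0x4000) and [0x6000, 0x9000)
	zone limits:              [0x0000, 0x10000)

	zone_spanned_pages_in_node = min(0x10000, 0x9000) - max(0x0000, 0x1000)
	                           = 0x8000 pages
	zone_absent_pages_in_node  = 0x6000 - 0x4000 = 0x2000 pages (the hole)
	present_pages              = 0x8000 - 0x2000 = 0x6000 pages
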
2296/* Return the zone index a PFN is in */
2297int memmap_zone_idx(struct page *lmem_map)
2298{
2299 int i;
2300 unsigned long phys_addr = virt_to_phys(lmem_map);
2301 unsigned long pfn = phys_addr >> PAGE_SHIFT;
2302
2303 for (i = 0; i < MAX_NR_ZONES; i++)
2304 if (pfn < arch_zone_highest_possible_pfn[i])
2305 break;
2306
2307 return i;
2308}
2309#else
2310static inline unsigned long zone_spanned_pages_in_node(int nid,
2311 unsigned long zone_type,
2312 unsigned long *zones_size)
2313{
2314 return zones_size[zone_type];
2315}
2316
2317static inline unsigned long zone_absent_pages_in_node(int nid,
2318 unsigned long zone_type,
2319 unsigned long *zholes_size)
2320{
2321 if (!zholes_size)
2322 return 0;
2323
2324 return zholes_size[zone_type];
2325}
2326
2327static inline int memmap_zone_idx(struct page *lmem_map)
2328{
2329 return MAX_NR_ZONES;
2330}
2331#endif
2332
2333static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2334 unsigned long *zones_size, unsigned long *zholes_size)
2335{
2336 unsigned long realtotalpages, totalpages = 0;
2337 enum zone_type i;
2338
2339 for (i = 0; i < MAX_NR_ZONES; i++)
2340 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2341 zones_size);
2342 pgdat->node_spanned_pages = totalpages;
2343
2344 realtotalpages = totalpages;
2345 for (i = 0; i < MAX_NR_ZONES; i++)
2346 realtotalpages -=
2347 zone_absent_pages_in_node(pgdat->node_id, i,
2348 zholes_size);
2349 pgdat->node_present_pages = realtotalpages;
2350 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2351 realtotalpages);
2352}
2353
1980/* 2354/*
1981 * Set up the zone data structures: 2355 * Set up the zone data structures:
1982 * - mark all pages reserved 2356 * - mark all pages reserved
@@ -1998,11 +2372,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
1998 2372
1999 for (j = 0; j < MAX_NR_ZONES; j++) { 2373 for (j = 0; j < MAX_NR_ZONES; j++) {
2000 struct zone *zone = pgdat->node_zones + j; 2374 struct zone *zone = pgdat->node_zones + j;
2001 unsigned long size, realsize; 2375 unsigned long size, realsize, memmap_pages;
2002 2376
2003 realsize = size = zones_size[j]; 2377 size = zone_spanned_pages_in_node(nid, j, zones_size);
2004 if (zholes_size) 2378 realsize = size - zone_absent_pages_in_node(nid, j,
2005 realsize -= zholes_size[j]; 2379 zholes_size);
2380
2381 /*
2382 * Adjust realsize so that it accounts for how much memory
2383 * is used by this zone for memmap. This affects the watermark
2384 * and per-cpu initialisations
2385 */
2386 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2387 if (realsize >= memmap_pages) {
2388 realsize -= memmap_pages;
2389 printk(KERN_DEBUG
2390 " %s zone: %lu pages used for memmap\n",
2391 zone_names[j], memmap_pages);
2392 } else
2393 printk(KERN_WARNING
2394 " %s zone: %lu pages exceeds realsize %lu\n",
2395 zone_names[j], memmap_pages, realsize);
2396
2397 /* Account for reserved DMA pages */
2398 if (j == ZONE_DMA && realsize > dma_reserve) {
2399 realsize -= dma_reserve;
2400 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2401 dma_reserve);
2402 }
2006 2403
2007 if (!is_highmem_idx(j)) 2404 if (!is_highmem_idx(j))
2008 nr_kernel_pages += realsize; 2405 nr_kernel_pages += realsize;
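
As a rough illustration of the memmap adjustment above (assumed values: 4 KiB pages, a 32-byte struct page, no holes):

	size          = 262144 pages            (a 1 GiB zone)
	memmap_pages  = (262144 * 32) >> 12     = 2048 pages
	realsize      = 262144 - 2048           = 260096 pages

i.e. about 8 MiB of the zone is charged to its own mem_map before watermarks and per-cpu batch sizes are derived from realsize.
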
@@ -2011,6 +2408,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2011 zone->spanned_pages = size; 2408 zone->spanned_pages = size;
2012 zone->present_pages = realsize; 2409 zone->present_pages = realsize;
2013#ifdef CONFIG_NUMA 2410#ifdef CONFIG_NUMA
2411 zone->node = nid;
2014 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2412 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2015 / 100; 2413 / 100;
2016 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2414 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -2073,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2073 /* 2471 /*
2074 * With no DISCONTIG, the global mem_map is just set as node 0's 2472 * With no DISCONTIG, the global mem_map is just set as node 0's
2075 */ 2473 */
2076 if (pgdat == NODE_DATA(0)) 2474 if (pgdat == NODE_DATA(0)) {
2077 mem_map = NODE_DATA(0)->node_mem_map; 2475 mem_map = NODE_DATA(0)->node_mem_map;
2476#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2477 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2478 mem_map -= pgdat->node_start_pfn;
2479#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2480 }
2078#endif 2481#endif
2079#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2482#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2080} 2483}
@@ -2085,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2085{ 2488{
2086 pgdat->node_id = nid; 2489 pgdat->node_id = nid;
2087 pgdat->node_start_pfn = node_start_pfn; 2490 pgdat->node_start_pfn = node_start_pfn;
2088 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2491 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2089 2492
2090 alloc_node_mem_map(pgdat); 2493 alloc_node_mem_map(pgdat);
2091 2494
2092 free_area_init_core(pgdat, zones_size, zholes_size); 2495 free_area_init_core(pgdat, zones_size, zholes_size);
2093} 2496}
2094 2497
2498#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2499/**
2500 * add_active_range - Register a range of PFNs backed by physical memory
2501 * @nid: The node ID the range resides on
2502 * @start_pfn: The start PFN of the available physical memory
2503 * @end_pfn: The end PFN of the available physical memory
2504 *
2505 * These ranges are stored in an early_node_map[] and later used by
2506 * free_area_init_nodes() to calculate zone sizes and holes. If the
2507 * range spans a memory hole, it is up to the architecture to ensure
2508 * the memory is not freed by the bootmem allocator. If possible
2509 * the range being registered will be merged with existing ranges.
2510 */
2511void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2512 unsigned long end_pfn)
2513{
2514 int i;
2515
2516 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2517 "%d entries of %d used\n",
2518 nid, start_pfn, end_pfn,
2519 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2520
2521 /* Merge with existing active regions if possible */
2522 for (i = 0; i < nr_nodemap_entries; i++) {
2523 if (early_node_map[i].nid != nid)
2524 continue;
2525
2526 /* Skip if an existing region covers this new one */
2527 if (start_pfn >= early_node_map[i].start_pfn &&
2528 end_pfn <= early_node_map[i].end_pfn)
2529 return;
2530
2531 /* Merge forward if suitable */
2532 if (start_pfn <= early_node_map[i].end_pfn &&
2533 end_pfn > early_node_map[i].end_pfn) {
2534 early_node_map[i].end_pfn = end_pfn;
2535 return;
2536 }
2537
2538 /* Merge backward if suitable */
2539 if (start_pfn < early_node_map[i].end_pfn &&
2540 end_pfn >= early_node_map[i].start_pfn) {
2541 early_node_map[i].start_pfn = start_pfn;
2542 return;
2543 }
2544 }
2545
2546 /* Check that early_node_map is large enough */
2547 if (i >= MAX_ACTIVE_REGIONS) {
2548 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2549 MAX_ACTIVE_REGIONS);
2550 return;
2551 }
2552
2553 early_node_map[i].nid = nid;
2554 early_node_map[i].start_pfn = start_pfn;
2555 early_node_map[i].end_pfn = end_pfn;
2556 nr_nodemap_entries = i + 1;
2557}
2558
2559/**
2560 * shrink_active_range - Shrink an existing registered range of PFNs
2561 * @nid: The node id the range is on that should be shrunk
2562 * @old_end_pfn: The old end PFN of the range
2563 * @new_end_pfn: The new PFN of the range
2564 *
2565 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
2566 * The map is kept at the end physical page range that has already been
2567 * registered with add_active_range(). This function allows an arch to shrink
2568 * an existing registered range.
2569 */
2570void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2571 unsigned long new_end_pfn)
2572{
2573 int i;
2574
2575 /* Find the old active region end and shrink */
2576 for_each_active_range_index_in_nid(i, nid)
2577 if (early_node_map[i].end_pfn == old_end_pfn) {
2578 early_node_map[i].end_pfn = new_end_pfn;
2579 break;
2580 }
2581}
2582
2583/**
2584 * remove_all_active_ranges - Remove all currently registered regions
2585 * During discovery, it may be found that a table like SRAT is invalid
2586 * and an alternative discovery method must be used. This function removes
2587 * all currently registered regions.
2588 */
2589void __init remove_all_active_ranges()
2590{
2591 memset(early_node_map, 0, sizeof(early_node_map));
2592 nr_nodemap_entries = 0;
2593#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2594 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2595 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2596#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2597}
2598
2599/* Compare two active node_active_regions */
2600static int __init cmp_node_active_region(const void *a, const void *b)
2601{
2602 struct node_active_region *arange = (struct node_active_region *)a;
2603 struct node_active_region *brange = (struct node_active_region *)b;
2604
2605 /* Done this way to avoid overflows */
2606 if (arange->start_pfn > brange->start_pfn)
2607 return 1;
2608 if (arange->start_pfn < brange->start_pfn)
2609 return -1;
2610
2611 return 0;
2612}
2613
2614/* sort the node_map by start_pfn */
2615static void __init sort_node_map(void)
2616{
2617 sort(early_node_map, (size_t)nr_nodemap_entries,
2618 sizeof(struct node_active_region),
2619 cmp_node_active_region, NULL);
2620}
2621
2622/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2623unsigned long __init find_min_pfn_for_node(unsigned long nid)
2624{
2625 int i;
2626
2627 /* Assuming a sorted map, the first range found has the starting pfn */
2628 for_each_active_range_index_in_nid(i, nid)
2629 return early_node_map[i].start_pfn;
2630
2631 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2632 return 0;
2633}
2634
2635/**
2636 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2637 *
2638 * It returns the minimum PFN based on information provided via
2639 * add_active_range()
2640 */
2641unsigned long __init find_min_pfn_with_active_regions(void)
2642{
2643 return find_min_pfn_for_node(MAX_NUMNODES);
2644}
2645
2646/**
2647 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2648 *
2649 * It returns the maximum PFN based on information provided via
2650 * add_active_range()
2651 */
2652unsigned long __init find_max_pfn_with_active_regions(void)
2653{
2654 int i;
2655 unsigned long max_pfn = 0;
2656
2657 for (i = 0; i < nr_nodemap_entries; i++)
2658 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2659
2660 return max_pfn;
2661}
2662
2663/**
2664 * free_area_init_nodes - Initialise all pg_data_t and zone data
2665 * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
2666 * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
2667 * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
2668 * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
2669 *
2670 * This will call free_area_init_node() for each active node in the system.
2671 * Using the page ranges provided by add_active_range(), the size of each
2672 * zone in each node and their holes is calculated. If the maximum PFN
2673 * between two adjacent zones match, it is assumed that the zone is empty.
2674 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2675 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2676 * starts where the previous one ended. For example, ZONE_DMA32 starts
2677 * at arch_max_dma_pfn.
2678 */
2679void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2680{
2681 unsigned long nid;
2682 enum zone_type i;
2683
2684 /* Record where the zone boundaries are */
2685 memset(arch_zone_lowest_possible_pfn, 0,
2686 sizeof(arch_zone_lowest_possible_pfn));
2687 memset(arch_zone_highest_possible_pfn, 0,
2688 sizeof(arch_zone_highest_possible_pfn));
2689 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2690 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2691 for (i = 1; i < MAX_NR_ZONES; i++) {
2692 arch_zone_lowest_possible_pfn[i] =
2693 arch_zone_highest_possible_pfn[i-1];
2694 arch_zone_highest_possible_pfn[i] =
2695 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2696 }
2697
2698 /* Regions in the early_node_map can be in any order */
2699 sort_node_map();
2700
2701 /* Print out the zone ranges */
2702 printk("Zone PFN ranges:\n");
2703 for (i = 0; i < MAX_NR_ZONES; i++)
2704 printk(" %-8s %8lu -> %8lu\n",
2705 zone_names[i],
2706 arch_zone_lowest_possible_pfn[i],
2707 arch_zone_highest_possible_pfn[i]);
2708
2709 /* Print out the early_node_map[] */
2710 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2711 for (i = 0; i < nr_nodemap_entries; i++)
2712 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2713 early_node_map[i].start_pfn,
2714 early_node_map[i].end_pfn);
2715
2716 /* Initialise every node */
2717 for_each_online_node(nid) {
2718 pg_data_t *pgdat = NODE_DATA(nid);
2719 free_area_init_node(nid, pgdat, NULL,
2720 find_min_pfn_for_node(nid), NULL);
2721 }
2722}
2723#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2724
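
For orientation, an architecture opting into CONFIG_ARCH_POPULATES_NODE_MAP would use the new interface roughly as sketched below during early boot; the helper name and the PFN ranges are hypothetical, not taken from this patch.

static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	/* one call per discovered (node, physical range); ranges made up */
	add_active_range(0, 0x00000, 0x0009f);
	add_active_range(0, 0x00100, 0x38000);

	/* hand the per-zone upper limits to the core */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA]    = 0x01000;	/* 16MB on this example box */
	max_zone_pfns[ZONE_NORMAL] = 0x38000;
	free_area_init_nodes(max_zone_pfns);
}
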
2725/**
2726 * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
2727 * @new_dma_reserve - The number of pages to mark reserved
2728 *
2729 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2730 * In the DMA zone, a significant percentage may be consumed by kernel image
2731 * and other unfreeable allocations which can skew the watermarks badly. This
2732 * function may optionally be used to account for unfreeable pages in
2733 * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
2734 */
2735void __init set_dma_reserve(unsigned long new_dma_reserve)
2736{
2737 dma_reserve = new_dma_reserve;
2738}
2739
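
A one-line usage sketch (hypothetical variable): an arch that knows how many ZONE_DMA pages are permanently pinned by the kernel image and early allocations would report them before the zones are sized:

	set_dma_reserve(dma_pages_used_by_kernel);	/* before free_area_init_node(s) */
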
2095#ifndef CONFIG_NEED_MULTIPLE_NODES 2740#ifndef CONFIG_NEED_MULTIPLE_NODES
2096static bootmem_data_t contig_bootmem_data; 2741static bootmem_data_t contig_bootmem_data;
2097struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2742struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
diff --git a/mm/shmem.c b/mm/shmem.c
index 8631be45b40d..eda907c3a86a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1351,7 +1351,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1351 inode->i_mode = mode; 1351 inode->i_mode = mode;
1352 inode->i_uid = current->fsuid; 1352 inode->i_uid = current->fsuid;
1353 inode->i_gid = current->fsgid; 1353 inode->i_gid = current->fsgid;
1354 inode->i_blksize = PAGE_CACHE_SIZE;
1355 inode->i_blocks = 0; 1354 inode->i_blocks = 0;
1356 inode->i_mapping->a_ops = &shmem_aops; 1355 inode->i_mapping->a_ops = &shmem_aops;
1357 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1356 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -2157,8 +2156,7 @@ static int init_inodecache(void)
2157 2156
2158static void destroy_inodecache(void) 2157static void destroy_inodecache(void)
2159{ 2158{
2160 if (kmem_cache_destroy(shmem_inode_cachep)) 2159 kmem_cache_destroy(shmem_inode_cachep);
2161 printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
2162} 2160}
2163 2161
2164static const struct address_space_operations shmem_aops = { 2162static const struct address_space_operations shmem_aops = {
diff --git a/mm/slab.c b/mm/slab.c
index 7a48eb1a60c8..792bfe320a8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -972,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
972 return nr; 972 return nr;
973} 973}
974 974
975#ifdef CONFIG_NUMA 975#ifndef CONFIG_NUMA
976
977#define drain_alien_cache(cachep, alien) do { } while (0)
978#define reap_alien(cachep, l3) do { } while (0)
979
980static inline struct array_cache **alloc_alien_cache(int node, int limit)
981{
982 return (struct array_cache **)BAD_ALIEN_MAGIC;
983}
984
985static inline void free_alien_cache(struct array_cache **ac_ptr)
986{
987}
988
989static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
990{
991 return 0;
992}
993
994static inline void *alternate_node_alloc(struct kmem_cache *cachep,
995 gfp_t flags)
996{
997 return NULL;
998}
999
1000static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1001 gfp_t flags, int nodeid)
1002{
1003 return NULL;
1004}
1005
1006#else /* CONFIG_NUMA */
1007
976static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1008static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
977static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1009static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
978 1010
@@ -1101,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1101 } 1133 }
1102 return 1; 1134 return 1;
1103} 1135}
1104
1105#else
1106
1107#define drain_alien_cache(cachep, alien) do { } while (0)
1108#define reap_alien(cachep, l3) do { } while (0)
1109
1110static inline struct array_cache **alloc_alien_cache(int node, int limit)
1111{
1112 return (struct array_cache **)BAD_ALIEN_MAGIC;
1113}
1114
1115static inline void free_alien_cache(struct array_cache **ac_ptr)
1116{
1117}
1118
1119static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1120{
1121 return 0;
1122}
1123
1124#endif 1136#endif
1125 1137
1126static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1138static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1564,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1564 */ 1576 */
1565 flags |= __GFP_COMP; 1577 flags |= __GFP_COMP;
1566#endif 1578#endif
1567 flags |= cachep->gfpflags; 1579
1580 /*
1581 * Under NUMA we want memory on the indicated node. We will handle
1582 * the needed fallback ourselves since we want to serve from our
1583 * per node object lists first for other nodes.
1584 */
1585 flags |= cachep->gfpflags | GFP_THISNODE;
1568 1586
1569 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1587 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1570 if (!page) 1588 if (!page)
@@ -2442,7 +2460,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2442 * @cachep: the cache to destroy 2460 * @cachep: the cache to destroy
2443 * 2461 *
2444 * Remove a struct kmem_cache object from the slab cache. 2462 * Remove a struct kmem_cache object from the slab cache.
2445 * Returns 0 on success.
2446 * 2463 *
2447 * It is expected this function will be called by a module when it is 2464 * It is expected this function will be called by a module when it is
2448 * unloaded. This will remove the cache completely, and avoid a duplicate 2465 * unloaded. This will remove the cache completely, and avoid a duplicate
@@ -2454,7 +2471,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2454 * The caller must guarantee that noone will allocate memory from the cache 2471 * The caller must guarantee that noone will allocate memory from the cache
2455 * during the kmem_cache_destroy(). 2472 * during the kmem_cache_destroy().
2456 */ 2473 */
2457int kmem_cache_destroy(struct kmem_cache *cachep) 2474void kmem_cache_destroy(struct kmem_cache *cachep)
2458{ 2475{
2459 BUG_ON(!cachep || in_interrupt()); 2476 BUG_ON(!cachep || in_interrupt());
2460 2477
@@ -2475,7 +2492,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2475 list_add(&cachep->next, &cache_chain); 2492 list_add(&cachep->next, &cache_chain);
2476 mutex_unlock(&cache_chain_mutex); 2493 mutex_unlock(&cache_chain_mutex);
2477 unlock_cpu_hotplug(); 2494 unlock_cpu_hotplug();
2478 return 1; 2495 return;
2479 } 2496 }
2480 2497
2481 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2498 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
@@ -2483,7 +2500,6 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2483 2500
2484 __kmem_cache_destroy(cachep); 2501 __kmem_cache_destroy(cachep);
2485 unlock_cpu_hotplug(); 2502 unlock_cpu_hotplug();
2486 return 0;
2487} 2503}
2488EXPORT_SYMBOL(kmem_cache_destroy); 2504EXPORT_SYMBOL(kmem_cache_destroy);
2489 2505
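With kmem_cache_destroy() now returning void, callers lose the rarely useful "not all structures were freed" indication; the shmem and slob hunks in this same patch show the matching caller-side cleanup. A minimal caller sketch, with a hypothetical cache pointer my_cachep:

static struct kmem_cache *my_cachep;

/* Before: if (kmem_cache_destroy(my_cachep))
 *                 printk(KERN_INFO "my_cache: not all structures were freed\n");
 * After: destruction is fire-and-forget; the caller must already have
 * freed every object and must prevent further allocations from the cache. */
static void my_module_cleanup(void)
{
	kmem_cache_destroy(my_cachep);
}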
@@ -3030,14 +3046,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3030 void *objp; 3046 void *objp;
3031 struct array_cache *ac; 3047 struct array_cache *ac;
3032 3048
3033#ifdef CONFIG_NUMA
3034 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
3035 objp = alternate_node_alloc(cachep, flags);
3036 if (objp != NULL)
3037 return objp;
3038 }
3039#endif
3040
3041 check_irq_off(); 3049 check_irq_off();
3042 ac = cpu_cache_get(cachep); 3050 ac = cpu_cache_get(cachep);
3043 if (likely(ac->avail)) { 3051 if (likely(ac->avail)) {
@@ -3055,12 +3063,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3055 gfp_t flags, void *caller) 3063 gfp_t flags, void *caller)
3056{ 3064{
3057 unsigned long save_flags; 3065 unsigned long save_flags;
3058 void *objp; 3066 void *objp = NULL;
3059 3067
3060 cache_alloc_debugcheck_before(cachep, flags); 3068 cache_alloc_debugcheck_before(cachep, flags);
3061 3069
3062 local_irq_save(save_flags); 3070 local_irq_save(save_flags);
3063 objp = ____cache_alloc(cachep, flags); 3071
3072 if (unlikely(NUMA_BUILD &&
3073 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3074 objp = alternate_node_alloc(cachep, flags);
3075
3076 if (!objp)
3077 objp = ____cache_alloc(cachep, flags);
3078 /*
3079 * We may just have run out of memory on the local node.
3080 * __cache_alloc_node() knows how to locate memory on other nodes
3081 */
3082 if (NUMA_BUILD && !objp)
3083 objp = __cache_alloc_node(cachep, flags, numa_node_id());
3064 local_irq_restore(save_flags); 3084 local_irq_restore(save_flags);
3065 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3085 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3066 caller); 3086 caller);
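Read as a whole, the rewritten __cache_alloc() tries its allocation sources in a fixed order. A condensed restatement of that ordering, stripped of the debug hooks and IRQ handling shown in the hunk above:

void *objp = NULL;

/* 1. honour slab memory spreading / mempolicy when the task requests it */
if (NUMA_BUILD && (current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
	objp = alternate_node_alloc(cachep, flags);
/* 2. fast path: per-CPU array cache, refilled from local node lists */
if (!objp)
	objp = ____cache_alloc(cachep, flags);
/* 3. local node exhausted: take the per-node slow path, which may in
 *    turn fall back to other nodes via fallback_alloc() */
if (NUMA_BUILD && !objp)
	objp = __cache_alloc_node(cachep, flags, numa_node_id());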
@@ -3079,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3079{ 3099{
3080 int nid_alloc, nid_here; 3100 int nid_alloc, nid_here;
3081 3101
3082 if (in_interrupt()) 3102 if (in_interrupt() || (flags & __GFP_THISNODE))
3083 return NULL; 3103 return NULL;
3084 nid_alloc = nid_here = numa_node_id(); 3104 nid_alloc = nid_here = numa_node_id();
3085 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3105 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3092,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3092} 3112}
3093 3113
3094/* 3114/*
3115 * Fallback function if there was no memory available and no objects on a
3116 * certain node and we are allowed to fall back. We mimic the behavior of
3117 * the page allocator. We fall back according to a zonelist determined by
3118 * the policy layer while obeying cpuset constraints.
3119 */
3120void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3121{
3122 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3123 ->node_zonelists[gfp_zone(flags)];
3124 struct zone **z;
3125 void *obj = NULL;
3126
3127 for (z = zonelist->zones; *z && !obj; z++)
3128 if (zone_idx(*z) <= ZONE_NORMAL &&
3129 cpuset_zone_allowed(*z, flags))
3130 obj = __cache_alloc_node(cache,
3131 flags | __GFP_THISNODE,
3132 zone_to_nid(*z));
3133 return obj;
3134}
3135
3136/*
3095 * A interface to enable slab creation on nodeid 3137 * A interface to enable slab creation on nodeid
3096 */ 3138 */
3097static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3139static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3144,11 +3186,15 @@ retry:
3144must_grow: 3186must_grow:
3145 spin_unlock(&l3->list_lock); 3187 spin_unlock(&l3->list_lock);
3146 x = cache_grow(cachep, flags, nodeid); 3188 x = cache_grow(cachep, flags, nodeid);
3189 if (x)
3190 goto retry;
3147 3191
3148 if (!x) 3192 if (!(flags & __GFP_THISNODE))
3149 return NULL; 3193 /* Unable to grow the cache. Fall back to other nodes. */
3194 return fallback_alloc(cachep, flags);
3195
3196 return NULL;
3150 3197
3151 goto retry;
3152done: 3198done:
3153 return obj; 3199 return obj;
3154} 3200}
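fallback_alloc() and __cache_alloc_node() are mutually recursive in principle, so __GFP_THISNODE doubles as the recursion guard: fallback_alloc() sets it when it retries a specific node, and __cache_alloc_node() only falls back when it is clear. A sketch of that contract (try_this_node is an illustrative stand-in for the grow-and-retry logic above):

static void *alloc_on_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
	void *obj = try_this_node(cachep, flags, nodeid);	/* illustrative */

	if (obj)
		return obj;
	if (flags & __GFP_THISNODE)
		return NULL;		/* caller pinned the node; do not wander */
	/* walk the mempolicy zonelist, pinning each candidate node in turn */
	return fallback_alloc(cachep, flags);
}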
diff --git a/mm/slob.c b/mm/slob.c
index 20188627347c..542394184a58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
270} 270}
271EXPORT_SYMBOL(kmem_cache_create); 271EXPORT_SYMBOL(kmem_cache_create);
272 272
273int kmem_cache_destroy(struct kmem_cache *c) 273void kmem_cache_destroy(struct kmem_cache *c)
274{ 274{
275 slob_free(c, sizeof(struct kmem_cache)); 275 slob_free(c, sizeof(struct kmem_cache));
276 return 0;
277} 276}
278EXPORT_SYMBOL(kmem_cache_destroy); 277EXPORT_SYMBOL(kmem_cache_destroy);
279 278
diff --git a/mm/truncate.c b/mm/truncate.c
index c6ab55ec6883..a654928323dc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/swap.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/pagevec.h> 15#include <linux/pagevec.h>
@@ -52,36 +53,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
52/* 53/*
53 * This is for invalidate_inode_pages(). That function can be called at 54 * This is for invalidate_inode_pages(). That function can be called at
54 * any time, and is not supposed to throw away dirty pages. But pages can 55 * any time, and is not supposed to throw away dirty pages. But pages can
55 * be marked dirty at any time too. So we re-check the dirtiness inside 56 * be marked dirty at any time too, so use remove_mapping which safely
56 * ->tree_lock. That provides exclusion against the __set_page_dirty 57 * discards clean, unused pages.
57 * functions.
58 * 58 *
59 * Returns non-zero if the page was successfully invalidated. 59 * Returns non-zero if the page was successfully invalidated.
60 */ 60 */
61static int 61static int
62invalidate_complete_page(struct address_space *mapping, struct page *page) 62invalidate_complete_page(struct address_space *mapping, struct page *page)
63{ 63{
64 int ret;
65
64 if (page->mapping != mapping) 66 if (page->mapping != mapping)
65 return 0; 67 return 0;
66 68
67 if (PagePrivate(page) && !try_to_release_page(page, 0)) 69 if (PagePrivate(page) && !try_to_release_page(page, 0))
68 return 0; 70 return 0;
69 71
70 write_lock_irq(&mapping->tree_lock); 72 ret = remove_mapping(mapping, page);
71 if (PageDirty(page))
72 goto failed;
73 if (page_count(page) != 2) /* caller's ref + pagecache ref */
74 goto failed;
75
76 BUG_ON(PagePrivate(page));
77 __remove_from_page_cache(page);
78 write_unlock_irq(&mapping->tree_lock);
79 ClearPageUptodate(page); 73 ClearPageUptodate(page);
80 page_cache_release(page); /* pagecache ref */ 74
81 return 1; 75 return ret;
82failed:
83 write_unlock_irq(&mapping->tree_lock);
84 return 0;
85} 76}
86 77
87/** 78/**
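The rewritten invalidate_complete_page() delegates the "is this page really discardable?" decision to remove_mapping(), which this patch also extends in mm/vmscan.c: the page is detached only when it is clean and holds exactly the pagecache reference plus the caller's. Consolidated, the post-patch function reads roughly as below (a sketch assembled from the hunk, not a new helper):

static int invalidate_complete_page(struct address_space *mapping,
				    struct page *page)
{
	int ret;

	if (page->mapping != mapping)
		return 0;			/* already truncated or remapped */
	if (PagePrivate(page) && !try_to_release_page(page, 0))
		return 0;			/* buffers could not be dropped */

	ret = remove_mapping(mapping, page);	/* discards only clean, unused pages */
	ClearPageUptodate(page);
	return ret;
}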
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9aad8b0cc6ee..1ac191ce5641 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -241,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
241 241
242/** 242/**
243 * get_vm_area - reserve a contingous kernel virtual area 243 * get_vm_area - reserve a contingous kernel virtual area
244 *
245 * @size: size of the area 244 * @size: size of the area
246 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 245 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
247 * 246 *
@@ -273,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr)
273} 272}
274 273
275/* Caller must hold vmlist_lock */ 274/* Caller must hold vmlist_lock */
276struct vm_struct *__remove_vm_area(void *addr) 275static struct vm_struct *__remove_vm_area(void *addr)
277{ 276{
278 struct vm_struct **p, *tmp; 277 struct vm_struct **p, *tmp;
279 278
@@ -296,7 +295,6 @@ found:
296 295
297/** 296/**
298 * remove_vm_area - find and remove a contingous kernel virtual area 297 * remove_vm_area - find and remove a contingous kernel virtual area
299 *
300 * @addr: base address 298 * @addr: base address
301 * 299 *
302 * Search for the kernel VM area starting at @addr, and remove it. 300 * Search for the kernel VM area starting at @addr, and remove it.
@@ -355,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages)
355 353
356/** 354/**
357 * vfree - release memory allocated by vmalloc() 355 * vfree - release memory allocated by vmalloc()
358 *
359 * @addr: memory base address 356 * @addr: memory base address
360 * 357 *
361 * Free the virtually contiguous memory area starting at @addr, as 358 * Free the virtually contiguous memory area starting at @addr, as
@@ -373,7 +370,6 @@ EXPORT_SYMBOL(vfree);
373 370
374/** 371/**
375 * vunmap - release virtual mapping obtained by vmap() 372 * vunmap - release virtual mapping obtained by vmap()
376 *
377 * @addr: memory base address 373 * @addr: memory base address
378 * 374 *
379 * Free the virtually contiguous memory area starting at @addr, 375 * Free the virtually contiguous memory area starting at @addr,
@@ -390,7 +386,6 @@ EXPORT_SYMBOL(vunmap);
390 386
391/** 387/**
392 * vmap - map an array of pages into virtually contiguous space 388 * vmap - map an array of pages into virtually contiguous space
393 *
394 * @pages: array of page pointers 389 * @pages: array of page pointers
395 * @count: number of pages to map 390 * @count: number of pages to map
396 * @flags: vm_area->flags 391 * @flags: vm_area->flags
@@ -471,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
471 466
472/** 467/**
473 * __vmalloc_node - allocate virtually contiguous memory 468 * __vmalloc_node - allocate virtually contiguous memory
474 *
475 * @size: allocation size 469 * @size: allocation size
476 * @gfp_mask: flags for the page level allocator 470 * @gfp_mask: flags for the page level allocator
477 * @prot: protection mask for the allocated pages 471 * @prot: protection mask for the allocated pages
@@ -505,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc);
505 499
506/** 500/**
507 * vmalloc - allocate virtually contiguous memory 501 * vmalloc - allocate virtually contiguous memory
508 *
509 * @size: allocation size 502 * @size: allocation size
510 *
511 * Allocate enough pages to cover @size from the page level 503 * Allocate enough pages to cover @size from the page level
512 * allocator and map them into contiguous kernel virtual space. 504 * allocator and map them into contiguous kernel virtual space.
513 * 505 *
@@ -521,11 +513,11 @@ void *vmalloc(unsigned long size)
521EXPORT_SYMBOL(vmalloc); 513EXPORT_SYMBOL(vmalloc);
522 514
523/** 515/**
524 * vmalloc_user - allocate virtually contiguous memory which has 516 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
525 * been zeroed so it can be mapped to userspace without 517 * @size: allocation size
526 * leaking data.
527 * 518 *
528 * @size: allocation size 519 * The resulting memory area is zeroed so it can be mapped to userspace
520 * without leaking data.
529 */ 521 */
530void *vmalloc_user(unsigned long size) 522void *vmalloc_user(unsigned long size)
531{ 523{
@@ -544,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user);
544 536
545/** 537/**
546 * vmalloc_node - allocate memory on a specific node 538 * vmalloc_node - allocate memory on a specific node
547 *
548 * @size: allocation size 539 * @size: allocation size
549 * @node: numa node 540 * @node: numa node
550 * 541 *
@@ -566,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node);
566 557
567/** 558/**
568 * vmalloc_exec - allocate virtually contiguous, executable memory 559 * vmalloc_exec - allocate virtually contiguous, executable memory
569 *
570 * @size: allocation size 560 * @size: allocation size
571 * 561 *
572 * Kernel-internal function to allocate enough pages to cover @size 562 * Kernel-internal function to allocate enough pages to cover @size
@@ -584,7 +574,6 @@ void *vmalloc_exec(unsigned long size)
584 574
585/** 575/**
586 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 576 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
587 *
588 * @size: allocation size 577 * @size: allocation size
589 * 578 *
590 * Allocate enough 32bit PA addressable pages to cover @size from the 579 * Allocate enough 32bit PA addressable pages to cover @size from the
@@ -597,11 +586,11 @@ void *vmalloc_32(unsigned long size)
597EXPORT_SYMBOL(vmalloc_32); 586EXPORT_SYMBOL(vmalloc_32);
598 587
599/** 588/**
600 * vmalloc_32_user - allocate virtually contiguous memory (32bit 589 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
601 * addressable) which is zeroed so it can be
602 * mapped to userspace without leaking data.
603 *
604 * @size: allocation size 590 * @size: allocation size
591 *
592 * The resulting memory area is 32bit addressable and zeroed so it can be
593 * mapped to userspace without leaking data.
605 */ 594 */
606void *vmalloc_32_user(unsigned long size) 595void *vmalloc_32_user(unsigned long size)
607{ 596{
@@ -695,7 +684,6 @@ finished:
695 684
696/** 685/**
697 * remap_vmalloc_range - map vmalloc pages to userspace 686 * remap_vmalloc_range - map vmalloc pages to userspace
698 *
699 * @vma: vma to cover (map full range of vma) 687 * @vma: vma to cover (map full range of vma)
700 * @addr: vmalloc memory 688 * @addr: vmalloc memory
701 * @pgoff: number of pages into addr before first page to map 689 * @pgoff: number of pages into addr before first page to map
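Most of the vmalloc.c changes are kerneldoc cleanups; the reworded vmalloc_user() and vmalloc_32_user() comments describe memory that is pre-zeroed precisely so a driver can expose it to userspace. A minimal sketch of that pairing with remap_vmalloc_range() (demo_buf and demo_mmap are illustrative names, not part of this patch):

static void *demo_buf;	/* set up elsewhere: demo_buf = vmalloc_user(len); */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* map the whole buffer, starting at page offset 0 of the vmalloc area */
	return remap_vmalloc_range(vma, demo_buf, 0);
}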
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 87779dda4ec6..eca70310adb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmstat.h>
22#include <linux/file.h> 23#include <linux/file.h>
23#include <linux/writeback.h> 24#include <linux/writeback.h>
24#include <linux/blkdev.h> 25#include <linux/blkdev.h>
@@ -370,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
370 /* synchronous write or broken a_ops? */ 371 /* synchronous write or broken a_ops? */
371 ClearPageReclaim(page); 372 ClearPageReclaim(page);
372 } 373 }
373 374 inc_zone_page_state(page, NR_VMSCAN_WRITE);
374 return PAGE_SUCCESS; 375 return PAGE_SUCCESS;
375 } 376 }
376 377
@@ -383,11 +384,30 @@ int remove_mapping(struct address_space *mapping, struct page *page)
383 BUG_ON(mapping != page_mapping(page)); 384 BUG_ON(mapping != page_mapping(page));
384 385
385 write_lock_irq(&mapping->tree_lock); 386 write_lock_irq(&mapping->tree_lock);
386
387 /* 387 /*
388 * The non-racy check for busy page. It is critical to check 388 * The non-racy check for a busy page.
389 * PageDirty _after_ making sure that the page is freeable and 389 *
390 * not in use by anybody. (pagecache + us == 2) 390 * Must be careful with the order of the tests. When someone has
391 * a ref to the page, it may be possible that they dirty it then
392 * drop the reference. So if PageDirty is tested before page_count
393 * here, then the following race may occur:
394 *
395 * get_user_pages(&page);
396 * [user mapping goes away]
397 * write_to(page);
398 * !PageDirty(page) [good]
399 * SetPageDirty(page);
400 * put_page(page);
401 * !page_count(page) [good, discard it]
402 *
403 * [oops, our write_to data is lost]
404 *
405 * Reversing the order of the tests ensures such a situation cannot
406 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
407 * load is not satisfied before that of page->_count.
408 *
409 * Note that if SetPageDirty is always performed via set_page_dirty,
410 * and thus under tree_lock, then this ordering is not required.
391 */ 411 */
392 if (unlikely(page_count(page) != 2)) 412 if (unlikely(page_count(page) != 2))
393 goto cannot_free; 413 goto cannot_free;
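The new comment in remove_mapping() is about test ordering: the reference count must be sampled before the dirty bit, with a read barrier in between, so a racing SetPageDirty() followed by put_page() cannot slip a dirty page past the check. A compressed sketch of that ordering (hypothetical helper; the real code runs under mapping->tree_lock):

static int freeable_and_clean(struct page *page)
{
	if (page_count(page) != 2)	/* pagecache ref + caller's ref */
		return 0;
	smp_rmb();			/* order the ->_count load before the ->flags load */
	if (PageDirty(page))
		return 0;
	return 1;
}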
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 490d8c1a0ded..a2b6a9f96e5c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -371,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
371 __inc_zone_state(z, NUMA_MISS); 371 __inc_zone_state(z, NUMA_MISS);
372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); 372 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
373 } 373 }
374 if (z->zone_pgdat == NODE_DATA(numa_node_id())) 374 if (z->node == numa_node_id())
375 __inc_zone_state(z, NUMA_LOCAL); 375 __inc_zone_state(z, NUMA_LOCAL);
376 else 376 else
377 __inc_zone_state(z, NUMA_OTHER); 377 __inc_zone_state(z, NUMA_OTHER);
@@ -465,6 +465,7 @@ static char *vmstat_text[] = {
465 "nr_writeback", 465 "nr_writeback",
466 "nr_unstable", 466 "nr_unstable",
467 "nr_bounce", 467 "nr_bounce",
468 "nr_vmscan_write",
468 469
469#ifdef CONFIG_NUMA 470#ifdef CONFIG_NUMA
470 "numa_hit", 471 "numa_hit",