Diffstat (limited to 'mm')
-rw-r--r--  mm/allocpercpu.c        9
-rw-r--r--  mm/bootmem.c            6
-rw-r--r--  mm/bounce.c             4
-rw-r--r--  mm/fadvise.c            2
-rw-r--r--  mm/filemap.c           13
-rw-r--r--  mm/filemap_xip.c        4
-rw-r--r--  mm/fremap.c             4
-rw-r--r--  mm/hugetlb.c           30
-rw-r--r--  mm/memory.c            51
-rw-r--r--  mm/memory_hotplug.c     7
-rw-r--r--  mm/mempolicy.c         12
-rw-r--r--  mm/migrate.c           19
-rw-r--r--  mm/mincore.c          183
-rw-r--r--  mm/mlock.c              2
-rw-r--r--  mm/mmap.c              14
-rw-r--r--  mm/mmzone.c             5
-rw-r--r--  mm/nommu.c             30
-rw-r--r--  mm/oom_kill.c          62
-rw-r--r--  mm/page-writeback.c   106
-rw-r--r--  mm/page_alloc.c       406
-rw-r--r--  mm/page_io.c           45
-rw-r--r--  mm/pdflush.c            1
-rw-r--r--  mm/readahead.c         12
-rw-r--r--  mm/rmap.c              36
-rw-r--r--  mm/shmem.c             35
-rw-r--r--  mm/slab.c             411
-rw-r--r--  mm/slob.c              27
-rw-r--r--  mm/sparse.c            23
-rw-r--r--  mm/swap.c              10
-rw-r--r--  mm/swapfile.c         102
-rw-r--r--  mm/thrash.c           116
-rw-r--r--  mm/tiny-shmem.c         4
-rw-r--r--  mm/truncate.c          41
-rw-r--r--  mm/vmscan.c            60
-rw-r--r--  mm/vmstat.c            22
35 files changed, 1224 insertions(+), 690 deletions(-)
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea536..b2486cf887a0 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@
 void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	if (pdata->ptrs[cpu]) {
-		kfree(pdata->ptrs[cpu]);
-		pdata->ptrs[cpu] = NULL;
-	}
+
+	kfree(pdata->ptrs[cpu]);
+	pdata->ptrs[cpu] = NULL;
 }
 EXPORT_SYMBOL_GPL(percpu_depopulate);
 
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
  */
 void percpu_free(void *__pdata)
 {
+	if (unlikely(!__pdata))
+		return;
 	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
 	kfree(__percpu_disguise(__pdata));
 }
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb404..00a96970b237 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn;
27unsigned long min_low_pfn; 27unsigned long min_low_pfn;
28unsigned long max_pfn; 28unsigned long max_pfn;
29 29
30EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
31
32static LIST_HEAD(bdata_list); 30static LIST_HEAD(bdata_list);
33#ifdef CONFIG_CRASH_DUMP 31#ifdef CONFIG_CRASH_DUMP
34/* 32/*
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
196 if (limit && bdata->node_boot_start >= limit) 194 if (limit && bdata->node_boot_start >= limit)
197 return NULL; 195 return NULL;
198 196
197 /* on nodes without memory - bootmem_map is NULL */
198 if (!bdata->node_bootmem_map)
199 return NULL;
200
199 end_pfn = bdata->node_low_pfn; 201 end_pfn = bdata->node_low_pfn;
200 limit = PFN_DOWN(limit); 202 limit = PFN_DOWN(limit);
201 if (limit && end_pfn > limit) 203 if (limit && end_pfn > limit)
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a4024..643efbe82402 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
 	if (!bio)
 		return;
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
 	 * pages
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
 	/*
 	 * slow path
 	 */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121bb..0df4c899e979 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	if (!file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b84dc814347..8332c77b1bd1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (pos < size) {
 			retval = generic_file_direct_IO(READ, iocb,
 						iov, pos, nr_segs);
-			if (retval > 0 && !is_sync_kiocb(iocb))
-				retval = -EIOCBQUEUED;
 			if (retval > 0)
 				*ppos = pos + retval;
 		}
@@ -1445,7 +1443,6 @@ no_cached_page:
 	 * effect.
 	 */
 	error = page_cache_read(file, pgoff);
-	grab_swap_token();
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1893,6 +1890,7 @@ int should_remove_suid(struct dentry *dentry)
 
 	return 0;
 }
+EXPORT_SYMBOL(should_remove_suid);
 
 int __remove_suid(struct dentry *dentry, int kill)
 {
@@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
 	 * i_mutex is held, which protects generic_osync_inode() from
-	 * livelocking.
+	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
 	 */
-	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if ((written >= 0 || written == -EIOCBQUEUED) &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (err < 0)
 			written = err;
 	}
-	if (written == count && !is_sync_kiocb(iocb))
-		written = -EIOCBQUEUED;
 	return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);
@@ -2269,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_dentry);
+	err = remove_suid(file->f_path.dentry);
 	if (err)
 		goto out;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bfb..45b3553865cf 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -189,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		pteval = ptep_clear_flush(vma, address, pte);
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		dec_mm_counter(mm, file_rss);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_dentry);
+	ret = remove_suid(filp->f_path.dentry);
 	if (ret)
 		goto out_backing;
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d246d..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			page_cache_release(page);
 		}
 	} else {
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	int err = -ENOMEM;
 	pte_t *pte;
-	pte_t pte_val;
 	spinlock_t *ptl;
 
 	pte = get_locked_pte(mm, addr, &ptl);
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
-	pte_val = *pte;
 	/*
 	 * We don't need to run update_mmu_cache() here because the "file pte"
 	 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f593a807..cb362f761f17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr)
 }
 
 static void copy_huge_page(struct page *dst, struct page *src,
-			   unsigned long addr)
+			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 
 	might_sleep();
 	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
 		cond_resched();
-		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void)
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
 	if (page) {
-		page[1].lru.next = (void *)free_huge_page;	/* dtor */
+		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		entry = *src_pte;
 		ptepage = pte_page(entry);
 		get_page(ptepage);
-		add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(dst, addr, dst_pte, entry);
 	}
 	spin_unlock(&src->page_table_lock);
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	/*
+	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * lock is used to avoid list corruption from multiple unmapping
+	 * of the same page since we are using page->lru.
+	 */
 	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	spin_lock(&mm->page_table_lock);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
 
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
 
 		page = pte_page(pte);
 		list_add(&page->lru, &page_list);
-		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
-
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
@@ -441,7 +442,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	copy_huge_page(new_page, old_page, address);
+	copy_huge_page(new_page, old_page, address, vma);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -515,7 +516,6 @@ retry:
 	if (!pte_none(*ptep))
 		goto backout;
 
-	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	set_huge_pte_at(mm, address, ptep, new_pte);
@@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
+	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
+		if (huge_pmd_unshare(mm, &address, ptep))
+			continue;
 		if (!pte_none(*ptep)) {
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
 	flush_tlb_range(vma, start, end);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 156861fcac43..af227d26e104 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (pages) {
 				pages[i] = page;
 
-				flush_anon_page(page, start);
+				flush_anon_page(vma, page, start);
 				flush_dcache_page(page);
 			}
 			if (vmas)
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	int err = 0;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		return -ENOMEM;
+		return -EAGAIN;
 	arch_enter_lazy_mmu_mode();
 	do {
 		struct page *page = ZERO_PAGE(addr);
 		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+		if (unlikely(!pte_none(*pte))) {
+			err = -EEXIST;
+			pte++;
+			break;
+		}
 		page_cache_get(page);
 		page_add_file_rmap(page);
 		inc_mm_counter(mm, file_rss);
-		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (zeromap_pte_range(mm, pmd, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pte_range(mm, pmd, addr, next, prot);
+		if (err)
+			break;
 	} while (pmd++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pud_addr_end(addr, end);
-		if (zeromap_pmd_range(mm, pud, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pmd_range(mm, pud, addr, next, prot);
+		if (err)
+			break;
 	} while (pud++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 int zeromap_page_range(struct vm_area_struct *vma,
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
 	 * If the source page was a PFN mapping, we don't have
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
 		return;
 
 	}
-	copy_user_highpage(dst, src, va);
+	copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1567,7 +1577,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, address);
+		cow_user_page(new_page, old_page, address, vma);
 	}
 
 	/*
@@ -1576,7 +1586,7 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page);
+			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -1902,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 
 	return 0;
 }
-EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
 
 /**
  * swapin_readahead - swap in pages in hope we need them soon
@@ -1991,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
+		grab_swap_token(); /* Contend for token _before_ read-in */
 		swapin_readahead(entry, address, vma);
 		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
@@ -2008,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
-		grab_swap_token();
 	}
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2191,7 +2200,7 @@ retry:
 			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 			if (!page)
 				goto oom;
-			copy_user_highpage(page, new_page, address);
+			copy_user_highpage(page, new_page, address, vma);
 			page_cache_release(new_page);
 			new_page = page;
 			anon = 1;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662eae..84279127fcd3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,12 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	zone_type = zone - pgdat->node_zones;
 	if (!populated_zone(zone)) {
 		int ret = 0;
-		ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+		ret = init_currently_empty_zone(zone, phys_start_pfn,
+						nr_pages, MEMMAP_HOTPLUG);
 		if (ret < 0)
 			return ret;
 	}
-	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
-	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+	memmap_init_zone(nr_pages, nid, zone_type,
+			 phys_start_pfn, MEMMAP_HOTPLUG);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..da9463946556 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+	max++;			/* space for zlcache_ptr (see mmzone.h) */
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
+	zl->zlcache_ptr = NULL;
 	num = 0;
 	/* First put in the highest zones from all nodes, then all the next
 	   lower zones etc. Avoid empty zones because the memory allocator
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		struct page *page;
-		unsigned int nid;
+		int nid;
 
 		if (!pte_present(*pte))
 			continue;
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
 		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
+		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
 		if (!new->v.zonelist) {
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  * Display pages allocated per node and memory policy via /proc.
  */
 
-static const char *policy_types[] = { "default", "prefer", "bind",
-				      "interleave" };
+static const char * const policy_types[] =
+	{ "default", "prefer", "bind", "interleave" };
 
 /*
  * Convert a mempolicy into a string.
@@ -1855,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_printf(m, " file=");
-		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page)
 {
-	struct page **radix_pointer;
+	void **pslot;
 
 	if (!mapping) {
 		/* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
-		*radix_pointer != page) {
+		(struct page *)radix_tree_deref_slot(pslot) != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	/*
 	 * Now we know that no one else is looking at the page.
 	 */
-	get_page(newpage);
+	get_page(newpage);	/* add cache reference */
 #ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	}
 #endif
 
-	*radix_pointer = newpage;
+	radix_tree_replace_slot(pslot, newpage);
+
+	/*
+	 * Drop cache reference from old page.
+	 * We know this isn't the last reference.
+	 */
 	__put_page(page);
+
 	write_unlock_irq(&mapping->tree_lock);
 
 	return 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
 /*
  *	linux/mm/mincore.c
  *
- * Copyright (C) 1994-1999  Linus Torvalds
+ * Copyright (C) 1994-2006  Linus Torvalds
  */
 
 /*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
 	return present;
 }
 
-static long mincore_vma(struct vm_area_struct * vma,
-	unsigned long start, unsigned long end, unsigned char __user * vec)
-{
-	long error, i, remaining;
-	unsigned char * tmp;
-
-	error = -ENOMEM;
-	if (!vma->vm_file)
-		return error;
-
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
-	error = -EAGAIN;
-	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
-	if (!tmp)
-		return error;
-
-	/* (end - start) is # of pages, and also # of bytes in "vec */
-	remaining = (end - start),
-
-	error = 0;
-	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
-		int j = 0;
-		long thispiece = (remaining < PAGE_SIZE) ?
-			remaining : PAGE_SIZE;
-
-		while (j < thispiece)
-			tmp[j++] = mincore_page(vma, start++);
-
-		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
-			error = -EFAULT;
-			break;
-		}
-	}
-
-	free_page((unsigned long) tmp);
-	return error;
-}
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
+{
+	unsigned long i, nr, pgoff;
+	struct vm_area_struct *vma = find_vma(current->mm, addr);
+
+	/*
+	 * find_vma() didn't find anything above us, or we're
+	 * in an unmapped hole in the address space: ENOMEM.
+	 */
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
+
+	/*
+	 * Ok, got it. But check whether it's a segment we support
+	 * mincore() on. Right now, we don't do any anonymous mappings.
+	 *
+	 * FIXME: This is just stupid. And returning ENOMEM is
+	 * stupid too. We should just look at the page tables. But
+	 * this is what we've traditionally done, so we'll just
+	 * continue doing it.
+	 */
+	if (!vma->vm_file)
+		return -ENOMEM;
+
+	/*
+	 * Calculate how many pages there are left in the vma, and
+	 * what the pgoff is for our address.
+	 */
+	nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+	if (nr > pages)
+		nr = pages;
+
+	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgoff += vma->vm_pgoff;
+
+	/* And then we just fill the sucker in.. */
+	for (i = 0 ; i < nr; i++, pgoff++)
+		vec[i] = mincore_page(vma, pgoff);
+
+	return nr;
+}
 
 /*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 	unsigned char __user * vec)
 {
-	int index = 0;
-	unsigned long end, limit;
-	struct vm_area_struct * vma;
-	size_t max;
-	int unmapped_error = 0;
-	long error;
-
-	/* check the arguments */
-	if (start & ~PAGE_CACHE_MASK)
-		goto einval;
-
-	limit = TASK_SIZE;
-	if (start >= limit)
-		goto enomem;
-
-	if (!len)
-		return 0;
-
-	max = limit - start;
-	len = PAGE_CACHE_ALIGN(len);
-	if (len > max || !len)
-		goto enomem;
-
-	end = start + len;
-
-	/* check the output buffer whilst holding the lock */
-	error = -EFAULT;
-	down_read(&current->mm->mmap_sem);
-
-	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
-		goto out;
-
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 */
-	error = 0;
-
-	vma = find_vma(current->mm, start);
-	while (vma) {
-		/* Here start < vma->vm_end. */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-		}
-
-		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = mincore_vma(vma, start, end,
-							&vec[index]);
-				if (error)
-					goto out;
-			}
-			error = unmapped_error;
-			goto out;
-		}
-
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
-		if (error)
-			goto out;
-		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
-		start = vma->vm_end;
-		vma = vma->vm_next;
-	}
-
-	/* we found a hole in the area queried if we arrive here */
-	error = -ENOMEM;
-
-out:
-	up_read(&current->mm->mmap_sem);
-	return error;
-
-einval:
-	return -EINVAL;
-enomem:
-	return -ENOMEM;
+	long retval;
+	unsigned long pages;
+	unsigned char *tmp;
+
+	/* Check the start address: needs to be page-aligned.. */
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
+
+	/* ..and we need to be passed a valid user-space range */
+	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+		return -ENOMEM;
+
+	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
+	pages = len >> PAGE_SHIFT;
+	pages += (len & ~PAGE_MASK) != 0;
+
+	if (!access_ok(VERIFY_WRITE, vec, pages))
+		return -EFAULT;
+
+	tmp = (void *) __get_free_page(GFP_USER);
+	if (!tmp)
+		return -EAGAIN;
+
+	retval = 0;
+	while (pages) {
+		/*
+		 * Do at most PAGE_SIZE entries per iteration, due to
+		 * the temporary buffer size.
+		 */
+		down_read(&current->mm->mmap_sem);
+		retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+		up_read(&current->mm->mmap_sem);
+
+		if (retval <= 0)
+			break;
+		if (copy_to_user(vec, tmp, retval)) {
+			retval = -EFAULT;
+			break;
+		}
+		pages -= retval;
+		vec += retval;
+		start += retval << PAGE_SHIFT;
+		retval = 0;
+	}
+	free_page((unsigned long) tmp);
+	return retval;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573abf..3446b7ef731e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success:
 		ret = make_pages_present(start, end);
 	}
 
-	vma->vm_mm->locked_vm -= pages;
+	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b40abd7cba2..9717337293c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable--;
 
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
 		struct address_space *mapping = file->f_mapping;
 
 		if (vma->vm_flags & VM_DENYWRITE)
-			atomic_dec(&file->f_dentry->d_inode->i_writecount);
+			atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
 		if (vma->vm_flags & VM_SHARED)
 			mapping->i_mmap_writable++;
 
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	 *  mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
 	if (!len)
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			return -EAGAIN;
 	}
 
-	inode = file ? file->f_dentry->d_inode : NULL;
+	inode = file ? file->f_path.dentry->d_inode : NULL;
 
 	if (file) {
 		switch (flags & MAP_TYPE) {
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
 
-	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 			vma_start < new_vma->vm_end)
 			*vmap = new_vma;
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			pol = mpol_copy(vma_policy(vma));
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c98168..eb5838634f18 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void)
 	return NODE_DATA(first_online_node);
 }
 
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
 	int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
 
 /*
  * next_zone - helper magic for for_each_zone()
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone)
 	}
 	return zone;
 }
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bdde9508f3b..23fb033e596d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -497,15 +497,17 @@ static int validate_mmap_request(struct file *file,
 	    (flags & MAP_TYPE) != MAP_SHARED)
 		return -EINVAL;
 
-	if (PAGE_ALIGN(len) == 0)
-		return addr;
-
-	if (len > TASK_SIZE)
+	if (!len)
 		return -EINVAL;
 
+	/* Careful about overflows.. */
+	len = PAGE_ALIGN(len);
+	if (!len || len > TASK_SIZE)
+		return -ENOMEM;
+
 	/* offset overflow? */
 	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
-		return -EINVAL;
+		return -EOVERFLOW;
 
 	if (file) {
 		/* validate file mapping requests */
@@ -521,7 +523,7 @@ static int validate_mmap_request(struct file *file,
 		 */
 		mapping = file->f_mapping;
 		if (!mapping)
-			mapping = file->f_dentry->d_inode->i_mapping;
+			mapping = file->f_path.dentry->d_inode->i_mapping;
 
 		capabilities = 0;
 		if (mapping && mapping->backing_dev_info)
@@ -530,7 +532,7 @@ static int validate_mmap_request(struct file *file,
 		if (!capabilities) {
 			/* no explicit capabilities set, so assume some
 			 * defaults */
-			switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+			switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
 			case S_IFREG:
 			case S_IFBLK:
 				capabilities = BDI_CAP_MAP_COPY;
@@ -561,11 +563,11 @@ static int validate_mmap_request(struct file *file,
 			    !(file->f_mode & FMODE_WRITE))
 				return -EACCES;
 
-			if (IS_APPEND(file->f_dentry->d_inode) &&
+			if (IS_APPEND(file->f_path.dentry->d_inode) &&
 			    (file->f_mode & FMODE_WRITE))
 				return -EACCES;
 
-			if (locks_verify_locked(file->f_dentry->d_inode))
+			if (locks_verify_locked(file->f_path.dentry->d_inode))
 				return -EAGAIN;
 
 			if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -596,7 +598,7 @@ static int validate_mmap_request(struct file *file,
 
 		/* handle executable mappings and implied executable
 		 * mappings */
-		if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 			if (prot & PROT_EXEC)
 				return -EPERM;
 		}
@@ -806,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vm_flags = determine_vm_flags(file, prot, flags, capabilities);
 
 	/* we're going to need to record the mapping if it works */
-	vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
+	vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
 	if (!vml)
 		goto error_getting_vml;
-	memset(vml, 0, sizeof(*vml));
 
 	down_write(&nommu_vma_sem);
 
@@ -832,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			continue;
 
 		/* search for overlapping mappings on the same file */
-		if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+		if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
 			continue;
 
 		if (vma->vm_pgoff >= pgoff + pglen)
@@ -885,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file,
 	}
 
 	/* we're going to need a VMA struct as well */
-	vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+	vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
 	if (!vma)
 		goto error_getting_vma;
 
-	memset(vma, 0, sizeof(*vma));
 	INIT_LIST_HEAD(&vma->anon_vma_node);
 	atomic_set(&vma->vm_usage, 1);
 	if (file)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a928b9..b278b8d60eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	}
 
 	/*
-	 * swapoff can easily use up all memory, so kill those first.
-	 */
-	if (p->flags & PF_SWAPOFF)
-		return ULONG_MAX;
-
-	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = mm->total_vm;
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	task_unlock(p);
 
 	/*
+	 * swapoff can easily use up all memory, so kill those first.
+	 */
+	if (p->flags & PF_SWAPOFF)
+		return ULONG_MAX;
+
+	/*
 	 * Processes which fork a lot of child processes are likely
 	 * a good choice. We add half the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes = node_online_map;
+	nodemask_t nodes;
+	int node;
+	/* node has memory ? */
+	for_each_online_node(node)
+		if (NODE_DATA(node)->node_present_pages)
+			node_set(node, nodes);
 
 	for (z = zonelist->zones; *z; z++)
-		if (cpuset_zone_allowed(*z, gfp_mask))
+		if (cpuset_zone_allowed_softwall(*z, gfp_mask))
 			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
@@ -264,7 +269,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
 * set.
 */
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, int verbose)
 {
 	if (is_init(p)) {
 		WARN_ON(1);
@@ -278,10 +283,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 
-	if (message) {
-		printk(KERN_ERR "%s: Killed process %d (%s).\n",
-				message, p->pid, p->comm);
-	}
+	if (verbose)
+		printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -294,7 +297,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 	force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(struct task_struct *p, const char *message)
+static int oom_kill_task(struct task_struct *p)
 {
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
@@ -313,15 +316,25 @@ static int oom_kill_task(struct task_struct *p, const char *message)
 	if (mm == NULL)
 		return 1;
 
-	__oom_kill_task(p, message);
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && p->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
+	__oom_kill_task(p, 1);
+
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
+	 * but are in a different thread group. Don't let them have access
+	 * to memory reserves though, otherwise we might deplete all memory.
 	 */
-	do_each_thread(g, q)
+	do_each_thread(g, q) {
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q, message);
-	while_each_thread(g, q);
+			force_sig(SIGKILL, p);
+	} while_each_thread(g, q);
 
 	return 0;
 }
@@ -337,21 +350,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
 	if (p->flags & PF_EXITING) {
-		__oom_kill_task(p, NULL);
+		__oom_kill_task(p, 0);
 		return 0;
 	}
 
-	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
-			" and children.\n", p->pid, p->comm, points);
+	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
+					message, p->pid, p->comm, points);
+
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
 		if (c->mm == p->mm)
 			continue;
-		if (!oom_kill_task(c, message))
+		if (!oom_kill_task(c))
 			return 0;
 	}
-	return oom_kill_task(p, message);
+	return oom_kill_task(p);
 }
 
 static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8d9b19f239c3..1d2fc89ca56d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/init.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -761,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
 
-		if (mapping) {
-			write_lock_irq(&mapping->tree_lock);
-			mapping2 = page_mapping(page);
-			if (mapping2) { /* Race with truncate? */
-				BUG_ON(mapping2 != mapping);
-				if (mapping_cap_account_dirty(mapping))
-					__inc_zone_page_state(page,
-							NR_FILE_DIRTY);
-				radix_tree_tag_set(&mapping->page_tree,
-					page_index(page), PAGECACHE_TAG_DIRTY);
-			}
-			write_unlock_irq(&mapping->tree_lock);
-			if (mapping->host) {
-				/* !PageAnon && !swapper_space */
-				__mark_inode_dirty(mapping->host,
-							I_DIRTY_PAGES);
-			}
+		if (!mapping)
+			return 1;
+
+		write_lock_irq(&mapping->tree_lock);
+		mapping2 = page_mapping(page);
+		if (mapping2) { /* Race with truncate? */
+			BUG_ON(mapping2 != mapping);
+			if (mapping_cap_account_dirty(mapping)) {
+				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				task_io_account_write(PAGE_CACHE_SIZE);
+			}
+			radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		write_unlock_irq(&mapping->tree_lock);
+		if (mapping->host) {
+			/* !PageAnon && !swapper_space */
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 		return 1;
 	}
@@ -843,39 +845,6 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	unsigned long flags;
-
-	if (mapping) {
-		write_lock_irqsave(&mapping->tree_lock, flags);
-		if (TestClearPageDirty(page)) {
-			radix_tree_tag_clear(&mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			/*
-			 * We can continue to use `mapping' here because the
-			 * page is locked, which pins the address_space
-			 */
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
-			return 1;
-		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-		return 0;
-	}
-	return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
  *
@@ -893,12 +862,41 @@ int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
-	if (mapping) {
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		/*
+		 * Yes, Virginia, this is indeed insane.
+		 *
+		 * We use this sequence to make sure that
+		 *  (a) we account for dirty stats properly
+		 *  (b) we tell the low-level filesystem to
+		 *      mark the whole page dirty if it was
+		 *      dirty in a pagetable. Only to then
+		 *  (c) clean the page again and return 1 to
+		 *      cause the writeback.
+		 *
+		 * This way we avoid all nasty races with the
+		 * dirty bit in multiple places and clearing
+		 * them concurrently from different threads.
+		 *
		 * Note! Normally the "set_page_dirty(page)"
+		 * has no effect on the actual dirty bit - since
+		 * that will already usually be set. But we
+		 * need the side effects, and it can help us
+		 * avoid races.
+		 *
+		 * We basically use the page "master dirty bit"
+		 * as a serialization point for all the different
+		 * threads doing their things.
+		 *
+		 * FIXME! We still have a race here: if somebody
+		 * adds the page back to the page tables in
+		 * between the "page_mkclean()" and the "TestClearPageDirty()",
+		 * we might have it mapped without the dirty bit set.
+		 */
+		if (page_mkclean(page))
+			set_page_dirty(page);
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
+			dec_zone_page_state(page, NR_FILE_DIRTY);
 			return 1;
 		}
 		return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aa6fcc7ca66f..fc5b5442e942 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
40#include <linux/sort.h> 40#include <linux/sort.h>
41#include <linux/pfn.h> 41#include <linux/pfn.h>
42#include <linux/backing-dev.h> 42#include <linux/backing-dev.h>
43#include <linux/fault-inject.h>
43 44
44#include <asm/tlbflush.h> 45#include <asm/tlbflush.h>
45#include <asm/div64.h> 46#include <asm/div64.h>
@@ -83,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
83 84
84EXPORT_SYMBOL(totalram_pages); 85EXPORT_SYMBOL(totalram_pages);
85 86
86/* 87static char * const zone_names[MAX_NR_ZONES] = {
87 * Used by page_zone() to look up the address of the struct zone whose
88 * id is encoded in the upper bits of page->flags
89 */
90struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
91EXPORT_SYMBOL(zone_table);
92
93static char *zone_names[MAX_NR_ZONES] = {
94 "DMA", 88 "DMA",
95#ifdef CONFIG_ZONE_DMA32 89#ifdef CONFIG_ZONE_DMA32
96 "DMA32", 90 "DMA32",
@@ -237,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
237 int i; 231 int i;
238 int nr_pages = 1 << order; 232 int nr_pages = 1 << order;
239 233
240 page[1].lru.next = (void *)free_compound_page; /* set dtor */ 234 set_compound_page_dtor(page, free_compound_page);
241 page[1].lru.prev = (void *)order; 235 page[1].lru.prev = (void *)order;
242 for (i = 0; i < nr_pages; i++) { 236 for (i = 0; i < nr_pages; i++) {
243 struct page *p = page + i; 237 struct page *p = page + i;
@@ -486,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
486 spin_lock(&zone->lock); 480 spin_lock(&zone->lock);
487 zone->all_unreclaimable = 0; 481 zone->all_unreclaimable = 0;
488 zone->pages_scanned = 0; 482 zone->pages_scanned = 0;
489 __free_one_page(page, zone ,order); 483 __free_one_page(page, zone, order);
490 spin_unlock(&zone->lock); 484 spin_unlock(&zone->lock);
491} 485}
492 486
@@ -605,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
605 1 << PG_checked | 1 << PG_mappedtodisk); 599 1 << PG_checked | 1 << PG_mappedtodisk);
606 set_page_private(page, 0); 600 set_page_private(page, 0);
607 set_page_refcounted(page); 601 set_page_refcounted(page);
602
603 arch_alloc_page(page, order);
608 kernel_map_pages(page, 1 << order, 1); 604 kernel_map_pages(page, 1 << order, 1);
609 605
610 if (gfp_flags & __GFP_ZERO) 606 if (gfp_flags & __GFP_ZERO)
@@ -690,9 +686,15 @@ void drain_node_pages(int nodeid)
690 686
691 pcp = &pset->pcp[i]; 687 pcp = &pset->pcp[i];
692 if (pcp->count) { 688 if (pcp->count) {
689 int to_drain;
690
693 local_irq_save(flags); 691 local_irq_save(flags);
694 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 692 if (pcp->count >= pcp->batch)
695 pcp->count = 0; 693 to_drain = pcp->batch;
694 else
695 to_drain = pcp->count;
696 free_pages_bulk(zone, to_drain, &pcp->list, 0);
697 pcp->count -= to_drain;
696 local_irq_restore(flags); 698 local_irq_restore(flags);
697 } 699 }
698 } 700 }
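The drain_node_pages() change above bounds the time spent with interrupts disabled: instead of flushing the whole per-cpu list at once, each pass frees at most pcp->batch pages. The open-coded if/else is just a clamped minimum; the plain-C sketch below shows the structure (illustrative only, not kernel code). Note that the kernel function drains one clamped batch per pcp list per call; the loop here only shows how repeated clamped passes keep every interrupts-off window bounded.

    #include <stdio.h>

    /* min(count, batch): how many pages one IRQ-off window may drain */
    static int pages_to_drain(int count, int batch)
    {
        return count >= batch ? batch : count;
    }

    int main(void)
    {
        int count = 70, batch = 16;

        while (count) {
            int n = pages_to_drain(count, batch);
            /* local_irq_save(); free_pages_bulk(zone, n, &pcp->list, 0);
             * local_irq_restore(); */
            count -= n;
            printf("drained %d, %d left on the pcp list\n", n, count);
        }
        return 0;
    }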
@@ -700,7 +702,6 @@ void drain_node_pages(int nodeid)
700} 702}
701#endif 703#endif
702 704
703#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
704static void __drain_pages(unsigned int cpu) 705static void __drain_pages(unsigned int cpu)
705{ 706{
706 unsigned long flags; 707 unsigned long flags;
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
710 for_each_zone(zone) { 711 for_each_zone(zone) {
711 struct per_cpu_pageset *pset; 712 struct per_cpu_pageset *pset;
712 713
714 if (!populated_zone(zone))
715 continue;
716
713 pset = zone_pcp(zone, cpu); 717 pset = zone_pcp(zone, cpu);
714 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 718 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
715 struct per_cpu_pages *pcp; 719 struct per_cpu_pages *pcp;
@@ -722,7 +726,6 @@ static void __drain_pages(unsigned int cpu)
722 } 726 }
723 } 727 }
724} 728}
725#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
726 729
727#ifdef CONFIG_PM 730#ifdef CONFIG_PM
728 731
@@ -893,6 +896,91 @@ failed:
893#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 896#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
894#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 897#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
895 898
899#ifdef CONFIG_FAIL_PAGE_ALLOC
900
901static struct fail_page_alloc_attr {
902 struct fault_attr attr;
903
904 u32 ignore_gfp_highmem;
905 u32 ignore_gfp_wait;
906
907#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
908
909 struct dentry *ignore_gfp_highmem_file;
910 struct dentry *ignore_gfp_wait_file;
911
912#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
913
914} fail_page_alloc = {
915 .attr = FAULT_ATTR_INITIALIZER,
916 .ignore_gfp_wait = 1,
917 .ignore_gfp_highmem = 1,
918};
919
920static int __init setup_fail_page_alloc(char *str)
921{
922 return setup_fault_attr(&fail_page_alloc.attr, str);
923}
924__setup("fail_page_alloc=", setup_fail_page_alloc);
925
926static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
927{
928 if (gfp_mask & __GFP_NOFAIL)
929 return 0;
930 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
931 return 0;
932 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
933 return 0;
934
935 return should_fail(&fail_page_alloc.attr, 1 << order);
936}
937
938#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
939
940static int __init fail_page_alloc_debugfs(void)
941{
942 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
943 struct dentry *dir;
944 int err;
945
946 err = init_fault_attr_dentries(&fail_page_alloc.attr,
947 "fail_page_alloc");
948 if (err)
949 return err;
950 dir = fail_page_alloc.attr.dentries.dir;
951
952 fail_page_alloc.ignore_gfp_wait_file =
953 debugfs_create_bool("ignore-gfp-wait", mode, dir,
954 &fail_page_alloc.ignore_gfp_wait);
955
956 fail_page_alloc.ignore_gfp_highmem_file =
957 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
958 &fail_page_alloc.ignore_gfp_highmem);
959
960 if (!fail_page_alloc.ignore_gfp_wait_file ||
961 !fail_page_alloc.ignore_gfp_highmem_file) {
962 err = -ENOMEM;
963 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
964 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
965 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
966 }
967
968 return err;
969}
970
971late_initcall(fail_page_alloc_debugfs);
972
973#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
974
975#else /* CONFIG_FAIL_PAGE_ALLOC */
976
977static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
978{
979 return 0;
980}
981
982#endif /* CONFIG_FAIL_PAGE_ALLOC */
983
896/* 984/*
897 * Return 1 if free pages are above 'mark'. This takes into account the order 985 * Return 1 if free pages are above 'mark'. This takes into account the order
898 * of the allocation. 986 * of the allocation.
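The should_fail_alloc_page() gate added above decides which allocations are even eligible for an injected failure before handing off to the generic fault_attr machinery: __GFP_NOFAIL requests are never failed, and highmem or sleepable (__GFP_WAIT) allocations can be exempted through the two debugfs booleans. Below is a userspace model of that filtering; the GFP bit values and the maybe_fail() stand-in are invented for illustration, and the real should_fail() also honours probability, interval and remaining-times settings.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative flag values only, not the kernel's gfp.h definitions. */
    #define GFP_NOFAIL   0x1u
    #define GFP_HIGHMEM  0x2u
    #define GFP_WAIT     0x4u

    static bool ignore_gfp_highmem = true;   /* "ignore-gfp-highmem" */
    static bool ignore_gfp_wait = true;      /* "ignore-gfp-wait" */

    /* Stand-in for should_fail(&fail_page_alloc.attr, 1 << order). */
    static bool maybe_fail(void)
    {
        return rand() % 100 < 10;            /* pretend a 10% failure rate */
    }

    static bool should_fail_alloc(unsigned int gfp_mask)
    {
        if (gfp_mask & GFP_NOFAIL)
            return false;
        if (ignore_gfp_highmem && (gfp_mask & GFP_HIGHMEM))
            return false;
        if (ignore_gfp_wait && (gfp_mask & GFP_WAIT))
            return false;
        return maybe_fail();
    }

    int main(void)
    {
        int injected = 0;

        srand(1);
        for (int i = 0; i < 1000; i++)
            injected += should_fail_alloc(0);   /* GFP_ATOMIC-like mask */
        printf("injected %d failures in 1000 attempts\n", injected);
        return 0;
    }

At runtime the feature is enabled with the fail_page_alloc= boot parameter registered above and tuned through the ignore-gfp-wait / ignore-gfp-highmem debugfs files; the exact parameter syntax comes from setup_fault_attr(), which is outside this hunk.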
@@ -925,31 +1013,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
925 return 1; 1013 return 1;
926} 1014}
927 1015
1016#ifdef CONFIG_NUMA
1017/*
1018 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1019 * skip over zones that are not allowed by the cpuset, or that have
1020 * been recently (in last second) found to be nearly full. See further
1021 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1022 * that have to skip over alot of full or unallowed zones.
1023 *
1024 * If the zonelist cache is present in the passed in zonelist, then
1025 * returns a pointer to the allowed node mask (either the current
1026 * tasks mems_allowed, or node_online_map.)
1027 *
1028 * If the zonelist cache is not available for this zonelist, does
1029 * nothing and returns NULL.
1030 *
1031 * If the fullzones BITMAP in the zonelist cache is stale (more than
1032 * a second since last zap'd) then we zap it out (clear its bits.)
1033 *
1034 * We hold off even calling zlc_setup, until after we've checked the
1035 * first zone in the zonelist, on the theory that most allocations will
1036 * be satisfied from that first zone, so best to examine that zone as
1037 * quickly as we can.
1038 */
1039static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1040{
1041 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1042 nodemask_t *allowednodes; /* zonelist_cache approximation */
1043
1044 zlc = zonelist->zlcache_ptr;
1045 if (!zlc)
1046 return NULL;
1047
1048 if (jiffies - zlc->last_full_zap > 1 * HZ) {
1049 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1050 zlc->last_full_zap = jiffies;
1051 }
1052
1053 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1054 &cpuset_current_mems_allowed :
1055 &node_online_map;
1056 return allowednodes;
1057}
1058
1059/*
1060 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1061 * if it is worth looking at further for free memory:
1062 * 1) Check that the zone isn't thought to be full (doesn't have its
1063 * bit set in the zonelist_cache fullzones BITMAP).
1064 * 2) Check that the zones node (obtained from the zonelist_cache
1065 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1066 * Return true (non-zero) if zone is worth looking at further, or
1067 * else return false (zero) if it is not.
1068 *
1069 * This check -ignores- the distinction between various watermarks,
1070 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1071 * found to be full for any variation of these watermarks, it will
1072 * be considered full for up to one second by all requests, unless
1073 * we are so low on memory on all allowed nodes that we are forced
1074 * into the second scan of the zonelist.
1075 *
1076 * In the second scan we ignore this zonelist cache and exactly
1077 * apply the watermarks to all zones, even it is slower to do so.
1078 * We are low on memory in the second scan, and should leave no stone
1079 * unturned looking for a free page.
1080 */
1081static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1082 nodemask_t *allowednodes)
1083{
1084 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1085 int i; /* index of *z in zonelist zones */
1086 int n; /* node that zone *z is on */
1087
1088 zlc = zonelist->zlcache_ptr;
1089 if (!zlc)
1090 return 1;
1091
1092 i = z - zonelist->zones;
1093 n = zlc->z_to_n[i];
1094
1095 /* This zone is worth trying if it is allowed but not full */
1096 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1097}
1098
1099/*
1100 * Given 'z' scanning a zonelist, set the corresponding bit in
1101 * zlc->fullzones, so that subsequent attempts to allocate a page
1102 * from that zone don't waste time re-examining it.
1103 */
1104static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1105{
1106 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1107 int i; /* index of *z in zonelist zones */
1108
1109 zlc = zonelist->zlcache_ptr;
1110 if (!zlc)
1111 return;
1112
1113 i = z - zonelist->zones;
1114
1115 set_bit(i, zlc->fullzones);
1116}
1117
1118#else /* CONFIG_NUMA */
1119
1120static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1121{
1122 return NULL;
1123}
1124
1125static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1126 nodemask_t *allowednodes)
1127{
1128 return 1;
1129}
1130
1131static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1132{
1133}
1134#endif /* CONFIG_NUMA */
1135
928/* 1136/*
929 * get_page_from_freeliest goes through the zonelist trying to allocate 1137 * get_page_from_freelist goes through the zonelist trying to allocate
930 * a page. 1138 * a page.
931 */ 1139 */
932static struct page * 1140static struct page *
933get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1141get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
934 struct zonelist *zonelist, int alloc_flags) 1142 struct zonelist *zonelist, int alloc_flags)
935{ 1143{
936 struct zone **z = zonelist->zones; 1144 struct zone **z;
937 struct page *page = NULL; 1145 struct page *page = NULL;
938 int classzone_idx = zone_idx(*z); 1146 int classzone_idx = zone_idx(zonelist->zones[0]);
939 struct zone *zone; 1147 struct zone *zone;
1148 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1149 int zlc_active = 0; /* set if using zonelist_cache */
1150 int did_zlc_setup = 0; /* just call zlc_setup() one time */
940 1151
1152zonelist_scan:
941 /* 1153 /*
942 * Go through the zonelist once, looking for a zone with enough free. 1154 * Scan zonelist, looking for a zone with enough free.
943 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1155 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
944 */ 1156 */
1157 z = zonelist->zones;
1158
945 do { 1159 do {
1160 if (NUMA_BUILD && zlc_active &&
1161 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1162 continue;
946 zone = *z; 1163 zone = *z;
947 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1164 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
948 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1165 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
949 break; 1166 break;
950 if ((alloc_flags & ALLOC_CPUSET) && 1167 if ((alloc_flags & ALLOC_CPUSET) &&
951 !cpuset_zone_allowed(zone, gfp_mask)) 1168 !cpuset_zone_allowed_softwall(zone, gfp_mask))
952 continue; 1169 goto try_next_zone;
953 1170
954 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1171 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
955 unsigned long mark; 1172 unsigned long mark;
@@ -959,18 +1176,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
959 mark = zone->pages_low; 1176 mark = zone->pages_low;
960 else 1177 else
961 mark = zone->pages_high; 1178 mark = zone->pages_high;
962 if (!zone_watermark_ok(zone , order, mark, 1179 if (!zone_watermark_ok(zone, order, mark,
963 classzone_idx, alloc_flags)) 1180 classzone_idx, alloc_flags)) {
964 if (!zone_reclaim_mode || 1181 if (!zone_reclaim_mode ||
965 !zone_reclaim(zone, gfp_mask, order)) 1182 !zone_reclaim(zone, gfp_mask, order))
966 continue; 1183 goto this_zone_full;
1184 }
967 } 1185 }
968 1186
969 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1187 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
970 if (page) { 1188 if (page)
971 break; 1189 break;
1190this_zone_full:
1191 if (NUMA_BUILD)
1192 zlc_mark_zone_full(zonelist, z);
1193try_next_zone:
1194 if (NUMA_BUILD && !did_zlc_setup) {
1195 /* we do zlc_setup after the first zone is tried */
1196 allowednodes = zlc_setup(zonelist, alloc_flags);
1197 zlc_active = 1;
1198 did_zlc_setup = 1;
972 } 1199 }
973 } while (*(++z) != NULL); 1200 } while (*(++z) != NULL);
1201
1202 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1203 /* Disable zlc cache for second zonelist scan */
1204 zlc_active = 0;
1205 goto zonelist_scan;
1206 }
974 return page; 1207 return page;
975} 1208}
976 1209
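The get_page_from_freelist() rework above adds two things: a per-zonelist cache (a bitmap of zones recently found full plus a zone-to-node table checked against the allowed-node mask) and a second, cache-disabled scan if the first pass comes up empty. The userspace model below captures just that control flow; the once-per-second bitmap zap, the cpuset and watermark checks of the real scan, and all names here are simplified or invented.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define NZONES 4

    struct zl_cache {
        bool fullzones[NZONES];     /* zones recently found full */
        int  z_to_n[NZONES];        /* zone index -> node id */
    };

    /* try_alloc() stands in for buffered_rmqueue(); true means success. */
    static bool scan_zonelist(struct zl_cache *zlc, const bool *node_allowed,
                              bool (*try_alloc)(int zone))
    {
        bool use_cache = (zlc != NULL);
    again:
        for (int i = 0; i < NZONES; i++) {
            if (use_cache &&
                (zlc->fullzones[i] || !node_allowed[zlc->z_to_n[i]]))
                continue;                   /* zlc_zone_worth_trying() said no */
            if (try_alloc(i))
                return true;
            if (use_cache)
                zlc->fullzones[i] = true;   /* zlc_mark_zone_full() */
        }
        if (use_cache) {
            use_cache = false;              /* second scan: ignore the cache */
            goto again;
        }
        return false;
    }

    static bool always_fail(int zone)
    {
        (void)zone;
        return false;
    }

    int main(void)
    {
        struct zl_cache zlc = { .z_to_n = { 0, 0, 1, 1 } };
        bool node_allowed[2] = { true, false };     /* node 1 disallowed */

        printf("got a page: %d\n", scan_zonelist(&zlc, node_allowed, always_fail));
        printf("zone 0 now marked full: %d\n", zlc.fullzones[0]);
        return 0;
    }

The non-NUMA build gets the same structure for free: with zlcache_ptr NULL the helpers degenerate to "always worth trying", which is what the stub versions in the hunk do.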
@@ -992,6 +1225,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
992 1225
993 might_sleep_if(wait); 1226 might_sleep_if(wait);
994 1227
1228 if (should_fail_alloc_page(gfp_mask, order))
1229 return NULL;
1230
995restart: 1231restart:
996 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1232 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
997 1233
@@ -1005,9 +1241,19 @@ restart:
1005 if (page) 1241 if (page)
1006 goto got_pg; 1242 goto got_pg;
1007 1243
1008 do { 1244 /*
1245 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1246 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1247 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1248 * using a larger set of nodes after it has established that the
1249 * allowed per node queues are empty and that nodes are
1250 * over allocated.
1251 */
1252 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1253 goto nopage;
1254
1255 for (z = zonelist->zones; *z; z++)
1009 wakeup_kswapd(*z, order); 1256 wakeup_kswapd(*z, order);
1010 } while (*(++z));
1011 1257
1012 /* 1258 /*
1013 * OK, we're below the kswapd watermark and have kicked background 1259 * OK, we're below the kswapd watermark and have kicked background
@@ -1041,6 +1287,7 @@ restart:
1041 1287
1042 /* This allocation should allow future memory freeing. */ 1288 /* This allocation should allow future memory freeing. */
1043 1289
1290rebalance:
1044 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1291 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1045 && !in_interrupt()) { 1292 && !in_interrupt()) {
1046 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1293 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1062,7 +1309,6 @@ nofail_alloc:
1062 if (!wait) 1309 if (!wait)
1063 goto nopage; 1310 goto nopage;
1064 1311
1065rebalance:
1066 cond_resched(); 1312 cond_resched();
1067 1313
1068 /* We now go into synchronous reclaim */ 1314 /* We now go into synchronous reclaim */
@@ -1262,7 +1508,7 @@ unsigned int nr_free_pagecache_pages(void)
1262static inline void show_node(struct zone *zone) 1508static inline void show_node(struct zone *zone)
1263{ 1509{
1264 if (NUMA_BUILD) 1510 if (NUMA_BUILD)
1265 printk("Node %ld ", zone_to_nid(zone)); 1511 printk("Node %d ", zone_to_nid(zone));
1266} 1512}
1267 1513
1268void si_meminfo(struct sysinfo *val) 1514void si_meminfo(struct sysinfo *val)
@@ -1542,6 +1788,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1542 } 1788 }
1543} 1789}
1544 1790
1791/* Construct the zonelist performance cache - see further mmzone.h */
1792static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1793{
1794 int i;
1795
1796 for (i = 0; i < MAX_NR_ZONES; i++) {
1797 struct zonelist *zonelist;
1798 struct zonelist_cache *zlc;
1799 struct zone **z;
1800
1801 zonelist = pgdat->node_zonelists + i;
1802 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1803 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1804 for (z = zonelist->zones; *z; z++)
1805 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1806 }
1807}
1808
1545#else /* CONFIG_NUMA */ 1809#else /* CONFIG_NUMA */
1546 1810
1547static void __meminit build_zonelists(pg_data_t *pgdat) 1811static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1579,14 +1843,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1579 } 1843 }
1580} 1844}
1581 1845
1846/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1847static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1848{
1849 int i;
1850
1851 for (i = 0; i < MAX_NR_ZONES; i++)
1852 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1853}
1854
1582#endif /* CONFIG_NUMA */ 1855#endif /* CONFIG_NUMA */
1583 1856
1584/* return values int ....just for stop_machine_run() */ 1857/* return values int ....just for stop_machine_run() */
1585static int __meminit __build_all_zonelists(void *dummy) 1858static int __meminit __build_all_zonelists(void *dummy)
1586{ 1859{
1587 int nid; 1860 int nid;
1588 for_each_online_node(nid) 1861
1862 for_each_online_node(nid) {
1589 build_zonelists(NODE_DATA(nid)); 1863 build_zonelists(NODE_DATA(nid));
1864 build_zonelist_cache(NODE_DATA(nid));
1865 }
1590 return 0; 1866 return 0;
1591} 1867}
1592 1868
@@ -1680,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
1680 * done. Non-atomic initialization, single-pass. 1956 * done. Non-atomic initialization, single-pass.
1681 */ 1957 */
1682void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1958void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1683 unsigned long start_pfn) 1959 unsigned long start_pfn, enum memmap_context context)
1684{ 1960{
1685 struct page *page; 1961 struct page *page;
1686 unsigned long end_pfn = start_pfn + size; 1962 unsigned long end_pfn = start_pfn + size;
1687 unsigned long pfn; 1963 unsigned long pfn;
1688 1964
1689 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1965 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1690 if (!early_pfn_valid(pfn)) 1966 /*
1691 continue; 1967 * There can be holes in boot-time mem_map[]s
1692 if (!early_pfn_in_nid(pfn, nid)) 1968 * handed to this function. They do not
1693 continue; 1969 * exist on hotplugged memory.
1970 */
1971 if (context == MEMMAP_EARLY) {
1972 if (!early_pfn_valid(pfn))
1973 continue;
1974 if (!early_pfn_in_nid(pfn, nid))
1975 continue;
1976 }
1694 page = pfn_to_page(pfn); 1977 page = pfn_to_page(pfn);
1695 set_page_links(page, zone, nid, pfn); 1978 set_page_links(page, zone, nid, pfn);
1696 init_page_count(page); 1979 init_page_count(page);
@@ -1715,23 +1998,9 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1715 } 1998 }
1716} 1999}
1717 2000
1718#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1719void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1720 unsigned long pfn, unsigned long size)
1721{
1722 unsigned long snum = pfn_to_section_nr(pfn);
1723 unsigned long end = pfn_to_section_nr(pfn + size);
1724
1725 if (FLAGS_HAS_NODE)
1726 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1727 else
1728 for (; snum <= end; snum++)
1729 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1730}
1731
1732#ifndef __HAVE_ARCH_MEMMAP_INIT 2001#ifndef __HAVE_ARCH_MEMMAP_INIT
1733#define memmap_init(size, nid, zone, start_pfn) \ 2002#define memmap_init(size, nid, zone, start_pfn) \
1734 memmap_init_zone((size), (nid), (zone), (start_pfn)) 2003 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1735#endif 2004#endif
1736 2005
1737static int __cpuinit zone_batchsize(struct zone *zone) 2006static int __cpuinit zone_batchsize(struct zone *zone)
@@ -1881,16 +2150,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1881 int ret = NOTIFY_OK; 2150 int ret = NOTIFY_OK;
1882 2151
1883 switch (action) { 2152 switch (action) {
1884 case CPU_UP_PREPARE: 2153 case CPU_UP_PREPARE:
1885 if (process_zones(cpu)) 2154 if (process_zones(cpu))
1886 ret = NOTIFY_BAD; 2155 ret = NOTIFY_BAD;
1887 break; 2156 break;
1888 case CPU_UP_CANCELED: 2157 case CPU_UP_CANCELED:
1889 case CPU_DEAD: 2158 case CPU_DEAD:
1890 free_zone_pagesets(cpu); 2159 free_zone_pagesets(cpu);
1891 break; 2160 break;
1892 default: 2161 default:
1893 break; 2162 break;
1894 } 2163 }
1895 return ret; 2164 return ret;
1896} 2165}
@@ -1977,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
1977 2246
1978__meminit int init_currently_empty_zone(struct zone *zone, 2247__meminit int init_currently_empty_zone(struct zone *zone,
1979 unsigned long zone_start_pfn, 2248 unsigned long zone_start_pfn,
1980 unsigned long size) 2249 unsigned long size,
2250 enum memmap_context context)
1981{ 2251{
1982 struct pglist_data *pgdat = zone->zone_pgdat; 2252 struct pglist_data *pgdat = zone->zone_pgdat;
1983 int ret; 2253 int ret;
@@ -2421,8 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2421 if (!size) 2691 if (!size)
2422 continue; 2692 continue;
2423 2693
2424 zonetable_add(zone, nid, j, zone_start_pfn, size); 2694 ret = init_currently_empty_zone(zone, zone_start_pfn,
2425 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 2695 size, MEMMAP_EARLY);
2426 BUG_ON(ret); 2696 BUG_ON(ret);
2427 zone_start_pfn += size; 2697 zone_start_pfn += size;
2428 } 2698 }
@@ -2736,7 +3006,6 @@ void __init free_area_init(unsigned long *zones_size)
2736 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3006 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2737} 3007}
2738 3008
2739#ifdef CONFIG_HOTPLUG_CPU
2740static int page_alloc_cpu_notify(struct notifier_block *self, 3009static int page_alloc_cpu_notify(struct notifier_block *self,
2741 unsigned long action, void *hcpu) 3010 unsigned long action, void *hcpu)
2742{ 3011{
@@ -2751,7 +3020,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
2751 } 3020 }
2752 return NOTIFY_OK; 3021 return NOTIFY_OK;
2753} 3022}
2754#endif /* CONFIG_HOTPLUG_CPU */
2755 3023
2756void __init page_alloc_init(void) 3024void __init page_alloc_init(void)
2757{ 3025{
@@ -3055,7 +3323,7 @@ void *__init alloc_large_system_hash(const char *tablename,
3055 /* allow the kernel cmdline to have a say */ 3323 /* allow the kernel cmdline to have a say */
3056 if (!numentries) { 3324 if (!numentries) {
3057 /* round applicable memory size up to nearest megabyte */ 3325 /* round applicable memory size up to nearest megabyte */
3058 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 3326 numentries = nr_kernel_pages;
3059 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3327 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3060 numentries >>= 20 - PAGE_SHIFT; 3328 numentries >>= 20 - PAGE_SHIFT;
3061 numentries <<= 20 - PAGE_SHIFT; 3329 numentries <<= 20 - PAGE_SHIFT;
@@ -3065,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename,
3065 numentries >>= (scale - PAGE_SHIFT); 3333 numentries >>= (scale - PAGE_SHIFT);
3066 else 3334 else
3067 numentries <<= (PAGE_SHIFT - scale); 3335 numentries <<= (PAGE_SHIFT - scale);
3336
3337 /* Make sure we've got at least a 0-order allocation.. */
3338 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
3339 numentries = PAGE_SIZE / bucketsize;
3068 } 3340 }
3069 numentries = roundup_pow_of_two(numentries); 3341 numentries = roundup_pow_of_two(numentries);
3070 3342
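The clamp added above makes the "let the kernel pick a size" path ask for at least one page worth of buckets before the power-of-two rounding, so a tiny computed numentries can no longer collapse the hash table to a sub-page allocation. A worked example, assuming 4 KiB pages and 64-byte buckets:

    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096, bucketsize = 64;
        unsigned long numentries = 3;               /* tiny computed value */

        /* the clamp from the hunk above */
        if (numentries * bucketsize < page_size)
            numentries = page_size / bucketsize;    /* 64 entries = one page */

        printf("numentries = %lu (%lu bytes before rounding up)\n",
               numentries, numentries * bucketsize);
        return 0;
    }

The later ilog2() changes in the same function are equivalent replacements for long_log2() and do not alter the sizing.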
@@ -3077,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename,
3077 if (numentries > max) 3349 if (numentries > max)
3078 numentries = max; 3350 numentries = max;
3079 3351
3080 log2qty = long_log2(numentries); 3352 log2qty = ilog2(numentries);
3081 3353
3082 do { 3354 do {
3083 size = bucketsize << log2qty; 3355 size = bucketsize << log2qty;
@@ -3099,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename,
3099 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3371 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3100 tablename, 3372 tablename,
3101 (1U << log2qty), 3373 (1U << log2qty),
3102 long_log2(size) - PAGE_SHIFT, 3374 ilog2(size) - PAGE_SHIFT,
3103 size); 3375 size);
3104 3376
3105 if (_hash_shift) 3377 if (_hash_shift)
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f9..dbffec0d78c9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page)
147out: 147out:
148 return ret; 148 return ret;
149} 149}
150
151#ifdef CONFIG_SOFTWARE_SUSPEND
152/*
153 * A scruffy utility function to read or write an arbitrary swap page
154 * and wait on the I/O. The caller must have a ref on the page.
155 *
156 * We use end_swap_bio_read() even for writes, because it happens to do what
157 * we want.
158 */
159int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
160 struct bio **bio_chain)
161{
162 struct bio *bio;
163 int ret = 0;
164 int bio_rw;
165
166 lock_page(page);
167
168 bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
169 if (bio == NULL) {
170 unlock_page(page);
171 ret = -ENOMEM;
172 goto out;
173 }
174
175 bio_rw = rw;
176 if (!bio_chain)
177 bio_rw |= (1 << BIO_RW_SYNC);
178 if (bio_chain)
179 bio_get(bio);
180 submit_bio(bio_rw, bio);
181 if (bio_chain == NULL) {
182 wait_on_page_locked(page);
183
184 if (!PageUptodate(page) || PageError(page))
185 ret = -EIO;
186 }
187 if (bio_chain) {
188 bio->bi_private = *bio_chain;
189 *bio_chain = bio;
190 }
191out:
192 return ret;
193}
194#endif
diff --git a/mm/pdflush.c b/mm/pdflush.c
index b02102feeb4b..8ce0900dc95c 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> // Prototypes pdflush_operation() 21#include <linux/writeback.h> // Prototypes pdflush_operation()
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/freezer.h>
24 25
25 26
26/* 27/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 23cb61a01c6e..0f539e8e827a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/task_io_accounting_ops.h>
16#include <linux/pagevec.h> 17#include <linux/pagevec.h>
17 18
18void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 19void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -148,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
148 if (!pagevec_add(&lru_pvec, page)) 149 if (!pagevec_add(&lru_pvec, page))
149 __pagevec_lru_add(&lru_pvec); 150 __pagevec_lru_add(&lru_pvec);
150 if (ret) { 151 if (ret) {
151 while (!list_empty(pages)) { 152 put_pages_list(pages);
152 struct page *victim;
153
154 victim = list_to_page(pages);
155 list_del(&victim->lru);
156 page_cache_release(victim);
157 }
158 break; 153 break;
159 } 154 }
155 task_io_account_read(PAGE_CACHE_SIZE);
160 } 156 }
161 pagevec_lru_add(&lru_pvec); 157 pagevec_lru_add(&lru_pvec);
162 return ret; 158 return ret;
@@ -456,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
456 * 452 *
457 * Note that @filp is purely used for passing on to the ->readpage[s]() 453 * Note that @filp is purely used for passing on to the ->readpage[s]()
458 * handler: it may refer to a different file from @mapping (so we may not use 454 * handler: it may refer to a different file from @mapping (so we may not use
459 * @filp->f_mapping or @filp->f_dentry->d_inode here). 455 * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
460 * Also, @ra may not be equal to &@filp->f_ra. 456 * Also, @ra may not be equal to &@filp->f_ra.
461 * 457 *
462 */ 458 */
diff --git a/mm/rmap.c b/mm/rmap.c
index d8a842a586db..669acb22b572 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h>
50 51
51#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
52 53
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
432{ 433{
433 struct mm_struct *mm = vma->vm_mm; 434 struct mm_struct *mm = vma->vm_mm;
434 unsigned long address; 435 unsigned long address;
435 pte_t *pte, entry; 436 pte_t *pte;
436 spinlock_t *ptl; 437 spinlock_t *ptl;
437 int ret = 0; 438 int ret = 0;
438 439
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
444 if (!pte) 445 if (!pte)
445 goto out; 446 goto out;
446 447
447 if (!pte_dirty(*pte) && !pte_write(*pte)) 448 if (pte_dirty(*pte) || pte_write(*pte)) {
448 goto unlock; 449 pte_t entry;
449 450
450 entry = ptep_get_and_clear(mm, address, pte); 451 flush_cache_page(vma, address, pte_pfn(*pte));
451 entry = pte_mkclean(entry); 452 entry = ptep_clear_flush(vma, address, pte);
452 entry = pte_wrprotect(entry); 453 entry = pte_wrprotect(entry);
453 ptep_establish(vma, address, pte, entry); 454 entry = pte_mkclean(entry);
454 lazy_mmu_prot_update(entry); 455 set_pte_at(mm, address, pte, entry);
455 ret = 1; 456 lazy_mmu_prot_update(entry);
457 ret = 1;
458 }
456 459
457unlock:
458 pte_unmap_unlock(pte, ptl); 460 pte_unmap_unlock(pte, ptl);
459out: 461out:
460 return ret; 462 return ret;
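The rewritten page_mkclean_one() above collapses the work into one guarded block: if the PTE is dirty or writable, flush the cache for that page, atomically clear-and-flush the PTE, then re-install it write-protected and clean, and report that the caller should treat the page as dirty. Below is a userspace model of just the flag manipulation; the bit layout and helper names are made up, and the real code's cache/TLB flushes have no analogue here.

    #include <stdbool.h>
    #include <stdio.h>

    #define PTE_DIRTY  0x1ul
    #define PTE_WRITE  0x2ul

    typedef unsigned long pte_t;

    static bool mkclean_one(pte_t *ptep)
    {
        bool ret = false;

        if (*ptep & (PTE_DIRTY | PTE_WRITE)) {
            pte_t entry = *ptep;        /* ptep_clear_flush(): read ... */
            *ptep = 0;                  /* ... and clear the live PTE */
            entry &= ~PTE_WRITE;        /* pte_wrprotect() */
            entry &= ~PTE_DIRTY;        /* pte_mkclean() */
            *ptep = entry;              /* set_pte_at(): re-install */
            ret = true;
        }
        return ret;
    }

    int main(void)
    {
        pte_t pte = 0x1000ul | PTE_DIRTY | PTE_WRITE;

        printf("was dirty/writable: %d, pte now %#lx\n", mkclean_one(&pte), pte);
        return 0;
    }

The follow-on hunk makes page_mkclean() also consult page_test_and_clear_dirty(), so architectures that track dirtiness outside the PTE feed that state into the same return value.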
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page)
489 if (mapping) 491 if (mapping)
490 ret = page_mkclean_file(mapping, page); 492 ret = page_mkclean_file(mapping, page);
491 } 493 }
494 if (page_test_and_clear_dirty(page))
495 ret = 1;
492 496
493 return ret; 497 return ret;
494} 498}
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page)
567 * 571 *
568 * The caller needs to hold the pte lock. 572 * The caller needs to hold the pte lock.
569 */ 573 */
570void page_remove_rmap(struct page *page) 574void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
571{ 575{
572 if (atomic_add_negative(-1, &page->_mapcount)) { 576 if (atomic_add_negative(-1, &page->_mapcount)) {
573 if (unlikely(page_mapcount(page) < 0)) { 577 if (unlikely(page_mapcount(page) < 0)) {
574 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 578 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
579 printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
575 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 580 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
576 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 581 printk (KERN_EMERG " page->count = %x\n", page_count(page));
577 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 582 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
583 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
584 if (vma->vm_ops)
585 print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
586 if (vma->vm_file && vma->vm_file->f_op)
587 print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
578 BUG(); 588 BUG();
579 } 589 }
580 590
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
679 dec_mm_counter(mm, file_rss); 689 dec_mm_counter(mm, file_rss);
680 690
681 691
682 page_remove_rmap(page); 692 page_remove_rmap(page, vma);
683 page_cache_release(page); 693 page_cache_release(page);
684 694
685out_unmap: 695out_unmap:
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
769 if (pte_dirty(pteval)) 779 if (pte_dirty(pteval))
770 set_page_dirty(page); 780 set_page_dirty(page);
771 781
772 page_remove_rmap(page); 782 page_remove_rmap(page, vma);
773 page_cache_release(page); 783 page_cache_release(page);
774 dec_mm_counter(mm, file_rss); 784 dec_mm_counter(mm, file_rss);
775 (*mapcount)--; 785 (*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4959535fc14c..70da7a0981bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
177 177
178static struct super_operations shmem_ops; 178static struct super_operations shmem_ops;
179static const struct address_space_operations shmem_aops; 179static const struct address_space_operations shmem_aops;
180static struct file_operations shmem_file_operations; 180static const struct file_operations shmem_file_operations;
181static struct inode_operations shmem_inode_operations; 181static struct inode_operations shmem_inode_operations;
182static struct inode_operations shmem_dir_inode_operations; 182static struct inode_operations shmem_dir_inode_operations;
183static struct inode_operations shmem_special_inode_operations; 183static struct inode_operations shmem_special_inode_operations;
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
515 size = SHMEM_NR_DIRECT; 515 size = SHMEM_NR_DIRECT;
516 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); 516 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
517 } 517 }
518 if (!topdir) 518
519 /*
520 * If there are no indirect blocks or we are punching a hole
521 * below indirect blocks, nothing to be done.
522 */
523 if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
519 goto done2; 524 goto done2;
520 525
521 BUG_ON(limit <= SHMEM_NR_DIRECT); 526 BUG_ON(limit <= SHMEM_NR_DIRECT);
@@ -1225,7 +1230,7 @@ failed:
1225 1230
1226struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) 1231struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1227{ 1232{
1228 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1233 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1229 struct page *page = NULL; 1234 struct page *page = NULL;
1230 unsigned long idx; 1235 unsigned long idx;
1231 int error; 1236 int error;
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma,
1248 unsigned long addr, unsigned long len, 1253 unsigned long addr, unsigned long len,
1249 pgprot_t prot, unsigned long pgoff, int nonblock) 1254 pgprot_t prot, unsigned long pgoff, int nonblock)
1250{ 1255{
1251 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1256 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1252 struct mm_struct *mm = vma->vm_mm; 1257 struct mm_struct *mm = vma->vm_mm;
1253 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; 1258 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1254 unsigned long size; 1259 unsigned long size;
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma,
1293#ifdef CONFIG_NUMA 1298#ifdef CONFIG_NUMA
1294int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1299int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1295{ 1300{
1296 struct inode *i = vma->vm_file->f_dentry->d_inode; 1301 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1297 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1302 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1298} 1303}
1299 1304
1300struct mempolicy * 1305struct mempolicy *
1301shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1306shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1302{ 1307{
1303 struct inode *i = vma->vm_file->f_dentry->d_inode; 1308 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1304 unsigned long idx; 1309 unsigned long idx;
1305 1310
1306 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1311 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1310 1315
1311int shmem_lock(struct file *file, int lock, struct user_struct *user) 1316int shmem_lock(struct file *file, int lock, struct user_struct *user)
1312{ 1317{
1313 struct inode *inode = file->f_dentry->d_inode; 1318 struct inode *inode = file->f_path.dentry->d_inode;
1314 struct shmem_inode_info *info = SHMEM_I(inode); 1319 struct shmem_inode_info *info = SHMEM_I(inode);
1315 int retval = -ENOMEM; 1320 int retval = -ENOMEM;
1316 1321
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
1422static ssize_t 1427static ssize_t
1423shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 1428shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1424{ 1429{
1425 struct inode *inode = file->f_dentry->d_inode; 1430 struct inode *inode = file->f_path.dentry->d_inode;
1426 loff_t pos; 1431 loff_t pos;
1427 unsigned long written; 1432 unsigned long written;
1428 ssize_t err; 1433 ssize_t err;
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1442 if (err || !count) 1447 if (err || !count)
1443 goto out; 1448 goto out;
1444 1449
1445 err = remove_suid(file->f_dentry); 1450 err = remove_suid(file->f_path.dentry);
1446 if (err) 1451 if (err)
1447 goto out; 1452 goto out;
1448 1453
@@ -1524,7 +1529,7 @@ out:
1524 1529
1525static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1530static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1526{ 1531{
1527 struct inode *inode = filp->f_dentry->d_inode; 1532 struct inode *inode = filp->f_path.dentry->d_inode;
1528 struct address_space *mapping = inode->i_mapping; 1533 struct address_space *mapping = inode->i_mapping;
1529 unsigned long index, offset; 1534 unsigned long index, offset;
1530 1535
@@ -1943,7 +1948,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name,
1943 return security_inode_setsecurity(inode, name, value, size, flags); 1948 return security_inode_setsecurity(inode, name, value, size, flags);
1944} 1949}
1945 1950
1946struct xattr_handler shmem_xattr_security_handler = { 1951static struct xattr_handler shmem_xattr_security_handler = {
1947 .prefix = XATTR_SECURITY_PREFIX, 1952 .prefix = XATTR_SECURITY_PREFIX,
1948 .list = shmem_xattr_security_list, 1953 .list = shmem_xattr_security_list,
1949 .get = shmem_xattr_security_get, 1954 .get = shmem_xattr_security_get,
@@ -2263,7 +2268,7 @@ static struct kmem_cache *shmem_inode_cachep;
2263static struct inode *shmem_alloc_inode(struct super_block *sb) 2268static struct inode *shmem_alloc_inode(struct super_block *sb)
2264{ 2269{
2265 struct shmem_inode_info *p; 2270 struct shmem_inode_info *p;
2266 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); 2271 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2267 if (!p) 2272 if (!p)
2268 return NULL; 2273 return NULL;
2269 return &p->vfs_inode; 2274 return &p->vfs_inode;
@@ -2319,7 +2324,7 @@ static const struct address_space_operations shmem_aops = {
2319 .migratepage = migrate_page, 2324 .migratepage = migrate_page,
2320}; 2325};
2321 2326
2322static struct file_operations shmem_file_operations = { 2327static const struct file_operations shmem_file_operations = {
2323 .mmap = shmem_mmap, 2328 .mmap = shmem_mmap,
2324#ifdef CONFIG_TMPFS 2329#ifdef CONFIG_TMPFS
2325 .llseek = generic_file_llseek, 2330 .llseek = generic_file_llseek,
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2493 d_instantiate(dentry, inode); 2498 d_instantiate(dentry, inode);
2494 inode->i_size = size; 2499 inode->i_size = size;
2495 inode->i_nlink = 0; /* It is unlinked */ 2500 inode->i_nlink = 0; /* It is unlinked */
2496 file->f_vfsmnt = mntget(shm_mnt); 2501 file->f_path.mnt = mntget(shm_mnt);
2497 file->f_dentry = dentry; 2502 file->f_path.dentry = dentry;
2498 file->f_mapping = inode->i_mapping; 2503 file->f_mapping = inode->i_mapping;
2499 file->f_op = &shmem_file_operations; 2504 file->f_op = &shmem_file_operations;
2500 file->f_mode = FMODE_WRITE | FMODE_READ; 2505 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/slab.c b/mm/slab.c
index 3c4a7e34eddc..c6100628a6ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,14 @@
103#include <linux/module.h> 103#include <linux/module.h>
104#include <linux/rcupdate.h> 104#include <linux/rcupdate.h>
105#include <linux/string.h> 105#include <linux/string.h>
106#include <linux/uaccess.h>
106#include <linux/nodemask.h> 107#include <linux/nodemask.h>
107#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
108#include <linux/mutex.h> 109#include <linux/mutex.h>
110#include <linux/fault-inject.h>
109#include <linux/rtmutex.h> 111#include <linux/rtmutex.h>
112#include <linux/reciprocal_div.h>
110 113
111#include <asm/uaccess.h>
112#include <asm/cacheflush.h> 114#include <asm/cacheflush.h>
113#include <asm/tlbflush.h> 115#include <asm/tlbflush.h>
114#include <asm/page.h> 116#include <asm/page.h>
@@ -313,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
313static void free_block(struct kmem_cache *cachep, void **objpp, int len, 315static void free_block(struct kmem_cache *cachep, void **objpp, int len,
314 int node); 316 int node);
315static int enable_cpucache(struct kmem_cache *cachep); 317static int enable_cpucache(struct kmem_cache *cachep);
316static void cache_reap(void *unused); 318static void cache_reap(struct work_struct *unused);
317 319
318/* 320/*
319 * This function must be completely optimized away if a constant is passed to 321 * This function must be completely optimized away if a constant is passed to
@@ -385,6 +387,7 @@ struct kmem_cache {
385 unsigned int shared; 387 unsigned int shared;
386 388
387 unsigned int buffer_size; 389 unsigned int buffer_size;
390 u32 reciprocal_buffer_size;
388/* 3) touched by every alloc & free from the backend */ 391/* 3) touched by every alloc & free from the backend */
389 struct kmem_list3 *nodelists[MAX_NUMNODES]; 392 struct kmem_list3 *nodelists[MAX_NUMNODES];
390 393
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
626 return slab->s_mem + cache->buffer_size * idx; 629 return slab->s_mem + cache->buffer_size * idx;
627} 630}
628 631
629static inline unsigned int obj_to_index(struct kmem_cache *cache, 632/*
630 struct slab *slab, void *obj) 633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 * Using the fact that buffer_size is a constant for a particular cache,
635 * we can replace (offset / cache->buffer_size) by
636 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639 const struct slab *slab, void *obj)
631{ 640{
632 return (unsigned)(obj - slab->s_mem) / cache->buffer_size; 641 u32 offset = (obj - slab->s_mem);
642 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
633} 643}
634 644
635/* 645/*
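obj_to_index() above trades a per-free integer division for a multiply-and-shift: buffer_size is constant for a given cache, so a reciprocal is computed once and (offset / buffer_size) becomes reciprocal_divide(offset, reciprocal_buffer_size). The standalone model below shows the arithmetic; the rounding in reciprocal_value() here is what this era's helper is assumed to use (roughly ceil(2^32 / B)), and the result is exact whenever offset is a multiple of B, which object offsets within a slab always are.

    #include <stdint.h>
    #include <stdio.h>

    /* Precomputed once per divisor: about 2^32 / b, rounded up (b must be > 1). */
    static uint32_t reciprocal_value(uint32_t b)
    {
        uint64_t val = ((uint64_t)1 << 32) + b - 1;

        return (uint32_t)(val / b);
    }

    /* a / b without a divide instruction, given r = reciprocal_value(b). */
    static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
    {
        return (uint32_t)(((uint64_t)a * r) >> 32);
    }

    int main(void)
    {
        uint32_t buffer_size = 192;     /* bytes per slab object, say */
        uint32_t r = reciprocal_value(buffer_size);

        for (uint32_t idx = 0; idx < 5; idx++) {
            uint32_t offset = idx * buffer_size;

            printf("offset %4u -> index %u (plain divide says %u)\n",
                   offset, reciprocal_divide(offset, r), offset / buffer_size);
        }
        return 0;
    }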
@@ -730,7 +740,10 @@ static inline void init_lock_keys(void)
730} 740}
731#endif 741#endif
732 742
733/* Guard access to the cache-chain. */ 743/*
744 * 1. Guard access to the cache-chain.
745 * 2. Protect sanity of cpu_online_map against cpu hotplug events
746 */
734static DEFINE_MUTEX(cache_chain_mutex); 747static DEFINE_MUTEX(cache_chain_mutex);
735static struct list_head cache_chain; 748static struct list_head cache_chain;
736 749
@@ -753,7 +766,7 @@ int slab_is_available(void)
753 return g_cpucache_up == FULL; 766 return g_cpucache_up == FULL;
754} 767}
755 768
756static DEFINE_PER_CPU(struct work_struct, reap_work); 769static DEFINE_PER_CPU(struct delayed_work, reap_work);
757 770
758static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 771static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
759{ 772{
@@ -866,6 +879,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
866 dump_stack(); 879 dump_stack();
867} 880}
868 881
882/*
883 * By default on NUMA we use alien caches to stage the freeing of
884 * objects allocated from other nodes. This causes massive memory
885 * inefficiencies when using fake NUMA setup to split memory into a
886 * large number of small nodes, so it can be disabled on the command
887 * line
888 */
889
890static int use_alien_caches __read_mostly = 1;
891static int __init noaliencache_setup(char *s)
892{
893 use_alien_caches = 0;
894 return 1;
895}
896__setup("noaliencache", noaliencache_setup);
897
869#ifdef CONFIG_NUMA 898#ifdef CONFIG_NUMA
870/* 899/*
871 * Special reaping functions for NUMA systems called from cache_reap(). 900 * Special reaping functions for NUMA systems called from cache_reap().
@@ -916,17 +945,18 @@ static void next_reap_node(void)
916 */ 945 */
917static void __devinit start_cpu_timer(int cpu) 946static void __devinit start_cpu_timer(int cpu)
918{ 947{
919 struct work_struct *reap_work = &per_cpu(reap_work, cpu); 948 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
920 949
921 /* 950 /*
922 * When this gets called from do_initcalls via cpucache_init(), 951 * When this gets called from do_initcalls via cpucache_init(),
923 * init_workqueues() has already run, so keventd will be setup 952 * init_workqueues() has already run, so keventd will be setup
924 * at that time. 953 * at that time.
925 */ 954 */
926 if (keventd_up() && reap_work->func == NULL) { 955 if (keventd_up() && reap_work->work.func == NULL) {
927 init_reap_node(cpu); 956 init_reap_node(cpu);
928 INIT_WORK(reap_work, cache_reap, NULL); 957 INIT_DELAYED_WORK(reap_work, cache_reap);
929 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 958 schedule_delayed_work_on(cpu, reap_work,
959 __round_jiffies_relative(HZ, cpu));
930 } 960 }
931} 961}
932 962
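start_cpu_timer() above follows the workqueue split of this kernel series: periodic work now lives in a struct delayed_work, handlers take a struct work_struct *, and the first expiry is rounded with __round_jiffies_relative(HZ, cpu) so per-CPU reapers tend to wake together instead of scattering timer interrupts. The sketch below shows the same pattern as a minimal kernel module; it uses API names from later kernels (round_jiffies_relative(), cancel_delayed_work_sync()), so treat it as illustrative rather than something guaranteed to build against this exact tree.

    #include <linux/init.h>
    #include <linux/jiffies.h>
    #include <linux/module.h>
    #include <linux/workqueue.h>

    static struct delayed_work reap_work;

    static void my_reap(struct work_struct *w)
    {
        struct delayed_work *dw = container_of(w, struct delayed_work, work);

        /* ... periodic housekeeping would go here ... */
        schedule_delayed_work(dw, round_jiffies_relative(HZ));  /* re-arm, aligned */
    }

    static int __init my_reap_init(void)
    {
        INIT_DELAYED_WORK(&reap_work, my_reap);
        schedule_delayed_work(&reap_work, round_jiffies_relative(HZ));
        return 0;
    }

    static void __exit my_reap_exit(void)
    {
        cancel_delayed_work_sync(&reap_work);
    }

    module_init(my_reap_init);
    module_exit(my_reap_exit);
    MODULE_LICENSE("GPL");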
@@ -996,7 +1026,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
996 return NULL; 1026 return NULL;
997} 1027}
998 1028
999static inline void *__cache_alloc_node(struct kmem_cache *cachep, 1029static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1000 gfp_t flags, int nodeid) 1030 gfp_t flags, int nodeid)
1001{ 1031{
1002 return NULL; 1032 return NULL;
@@ -1004,7 +1034,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1004 1034
1005#else /* CONFIG_NUMA */ 1035#else /* CONFIG_NUMA */
1006 1036
1007static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1037static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1008static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1038static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1009 1039
1010static struct array_cache **alloc_alien_cache(int node, int limit) 1040static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1114,7 +1144,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1114 * Make sure we are not freeing a object from another node to the array 1144 * Make sure we are not freeing a object from another node to the array
1115 * cache on this cpu. 1145 * cache on this cpu.
1116 */ 1146 */
1117 if (likely(slabp->nodeid == node)) 1147 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1118 return 0; 1148 return 0;
1119 1149
1120 l3 = cachep->nodelists[node]; 1150 l3 = cachep->nodelists[node];
@@ -1192,7 +1222,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1192 list_for_each_entry(cachep, &cache_chain, next) { 1222 list_for_each_entry(cachep, &cache_chain, next) {
1193 struct array_cache *nc; 1223 struct array_cache *nc;
1194 struct array_cache *shared; 1224 struct array_cache *shared;
1195 struct array_cache **alien; 1225 struct array_cache **alien = NULL;
1196 1226
1197 nc = alloc_arraycache(node, cachep->limit, 1227 nc = alloc_arraycache(node, cachep->limit,
1198 cachep->batchcount); 1228 cachep->batchcount);
@@ -1204,9 +1234,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1204 if (!shared) 1234 if (!shared)
1205 goto bad; 1235 goto bad;
1206 1236
1207 alien = alloc_alien_cache(node, cachep->limit); 1237 if (use_alien_caches) {
1208 if (!alien) 1238 alien = alloc_alien_cache(node, cachep->limit);
1209 goto bad; 1239 if (!alien)
1240 goto bad;
1241 }
1210 cachep->array[cpu] = nc; 1242 cachep->array[cpu] = nc;
1211 l3 = cachep->nodelists[node]; 1243 l3 = cachep->nodelists[node];
1212 BUG_ON(!l3); 1244 BUG_ON(!l3);
@@ -1230,12 +1262,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1230 kfree(shared); 1262 kfree(shared);
1231 free_alien_cache(alien); 1263 free_alien_cache(alien);
1232 } 1264 }
1233 mutex_unlock(&cache_chain_mutex);
1234 break; 1265 break;
1235 case CPU_ONLINE: 1266 case CPU_ONLINE:
1267 mutex_unlock(&cache_chain_mutex);
1236 start_cpu_timer(cpu); 1268 start_cpu_timer(cpu);
1237 break; 1269 break;
1238#ifdef CONFIG_HOTPLUG_CPU 1270#ifdef CONFIG_HOTPLUG_CPU
1271 case CPU_DOWN_PREPARE:
1272 mutex_lock(&cache_chain_mutex);
1273 break;
1274 case CPU_DOWN_FAILED:
1275 mutex_unlock(&cache_chain_mutex);
1276 break;
1239 case CPU_DEAD: 1277 case CPU_DEAD:
1240 /* 1278 /*
1241 * Even if all the cpus of a node are down, we don't free the 1279 * Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1284,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1246 * gets destroyed at kmem_cache_destroy(). 1284 * gets destroyed at kmem_cache_destroy().
1247 */ 1285 */
1248 /* fall thru */ 1286 /* fall thru */
1287#endif
1249 case CPU_UP_CANCELED: 1288 case CPU_UP_CANCELED:
1250 mutex_lock(&cache_chain_mutex);
1251 list_for_each_entry(cachep, &cache_chain, next) { 1289 list_for_each_entry(cachep, &cache_chain, next) {
1252 struct array_cache *nc; 1290 struct array_cache *nc;
1253 struct array_cache *shared; 1291 struct array_cache *shared;
@@ -1308,11 +1346,9 @@ free_array_cache:
1308 } 1346 }
1309 mutex_unlock(&cache_chain_mutex); 1347 mutex_unlock(&cache_chain_mutex);
1310 break; 1348 break;
1311#endif
1312 } 1349 }
1313 return NOTIFY_OK; 1350 return NOTIFY_OK;
1314bad: 1351bad:
1315 mutex_unlock(&cache_chain_mutex);
1316 return NOTIFY_BAD; 1352 return NOTIFY_BAD;
1317} 1353}
1318 1354
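The notifier rework above makes cache_chain_mutex serve double duty: besides guarding the cache chain it is now held across CPU hotplug transitions (taken at CPU_UP_PREPARE / CPU_DOWN_PREPARE, dropped at CPU_ONLINE / CPU_DOWN_FAILED or after teardown), which is what lets lock_cpu_hotplug() disappear from kmem_cache_create/shrink/destroy further down. Below is a kernel-style skeleton of that pattern, not the actual slab callback; my_chain_mutex stands in for cache_chain_mutex and the real per-cpu setup and error paths are omitted.

    static DEFINE_MUTEX(my_chain_mutex);    /* stands in for cache_chain_mutex */

    static int my_cpu_callback(struct notifier_block *nb,
                               unsigned long action, void *hcpu)
    {
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_DOWN_PREPARE:
            /* freeze the view of cpu_online_map for mutex holders */
            mutex_lock(&my_chain_mutex);
            break;
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
            mutex_unlock(&my_chain_mutex);  /* transition finished or aborted */
            break;
        case CPU_UP_CANCELED:
        case CPU_DEAD:
            /* ... tear down per-cpu state while still holding the mutex ... */
            mutex_unlock(&my_chain_mutex);
            break;
        }
        return NOTIFY_OK;
    }

Any slab path that takes the mutex therefore sees a stable set of online CPUs for as long as it holds it, which is the property the removed lock_cpu_hotplug() calls used to provide.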
@@ -1400,6 +1436,8 @@ void __init kmem_cache_init(void)
1400 1436
1401 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1437 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1402 cache_line_size()); 1438 cache_line_size());
1439 cache_cache.reciprocal_buffer_size =
1440 reciprocal_value(cache_cache.buffer_size);
1403 1441
1404 for (order = 0; order < MAX_ORDER; order++) { 1442 for (order = 0; order < MAX_ORDER; order++) {
1405 cache_estimate(order, cache_cache.buffer_size, 1443 cache_estimate(order, cache_cache.buffer_size,
@@ -1580,12 +1618,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1580 flags |= __GFP_COMP; 1618 flags |= __GFP_COMP;
1581#endif 1619#endif
1582 1620
1583 /* 1621 flags |= cachep->gfpflags;
1584 * Under NUMA we want memory on the indicated node. We will handle
1585 * the needed fallback ourselves since we want to serve from our
1586 * per node object lists first for other nodes.
1587 */
1588 flags |= cachep->gfpflags | GFP_THISNODE;
1589 1622
1590 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1623 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1591 if (!page) 1624 if (!page)
@@ -2098,15 +2131,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2098 } 2131 }
2099 2132
2100 /* 2133 /*
2101 * Prevent CPUs from coming and going. 2134 * We use cache_chain_mutex to ensure a consistent view of
2102 * lock_cpu_hotplug() nests outside cache_chain_mutex 2135 * cpu_online_map as well. Please see cpuup_callback
2103 */ 2136 */
2104 lock_cpu_hotplug();
2105
2106 mutex_lock(&cache_chain_mutex); 2137 mutex_lock(&cache_chain_mutex);
2107 2138
2108 list_for_each_entry(pc, &cache_chain, next) { 2139 list_for_each_entry(pc, &cache_chain, next) {
2109 mm_segment_t old_fs = get_fs();
2110 char tmp; 2140 char tmp;
2111 int res; 2141 int res;
2112 2142
@@ -2115,9 +2145,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2115 * destroy its slab cache and no-one else reuses the vmalloc 2145 * destroy its slab cache and no-one else reuses the vmalloc
2116 * area of the module. Print a warning. 2146 * area of the module. Print a warning.
2117 */ 2147 */
2118 set_fs(KERNEL_DS); 2148 res = probe_kernel_address(pc->name, tmp);
2119 res = __get_user(tmp, pc->name);
2120 set_fs(old_fs);
2121 if (res) { 2149 if (res) {
2122 printk("SLAB: cache with size %d has lost its name\n", 2150 printk("SLAB: cache with size %d has lost its name\n",
2123 pc->buffer_size); 2151 pc->buffer_size);
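The name check above now uses probe_kernel_address() instead of the set_fs(KERNEL_DS)/__get_user() pair: it reads a single byte from a kernel pointer that may no longer be mapped (a cache name string owned by an unloaded module) and returns -EFAULT instead of oopsing. A kernel-style fragment of the same idiom, with a made-up wrapper name:

    /* Returns nonzero if 'name' can still be dereferenced safely. */
    static int name_still_mapped(const char *name)
    {
        char tmp;

        return probe_kernel_address(name, tmp) == 0;
    }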
@@ -2197,25 +2225,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2197 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2225 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2198 ralign = BYTES_PER_WORD; 2226 ralign = BYTES_PER_WORD;
2199 2227
2200 /* 2) arch mandated alignment: disables debug if necessary */ 2228 /* 2) arch mandated alignment */
2201 if (ralign < ARCH_SLAB_MINALIGN) { 2229 if (ralign < ARCH_SLAB_MINALIGN) {
2202 ralign = ARCH_SLAB_MINALIGN; 2230 ralign = ARCH_SLAB_MINALIGN;
2203 if (ralign > BYTES_PER_WORD)
2204 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2205 } 2231 }
2206 /* 3) caller mandated alignment: disables debug if necessary */ 2232 /* 3) caller mandated alignment */
2207 if (ralign < align) { 2233 if (ralign < align) {
2208 ralign = align; 2234 ralign = align;
2209 if (ralign > BYTES_PER_WORD)
2210 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2211 } 2235 }
2236 /* disable debug if necessary */
2237 if (ralign > BYTES_PER_WORD)
2238 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2212 /* 2239 /*
2213 * 4) Store it. 2240 * 4) Store it.
2214 */ 2241 */
2215 align = ralign; 2242 align = ralign;
2216 2243
2217 /* Get cache's description obj. */ 2244 /* Get cache's description obj. */
2218 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); 2245 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2219 if (!cachep) 2246 if (!cachep)
2220 goto oops; 2247 goto oops;
2221 2248
@@ -2297,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2297 if (flags & SLAB_CACHE_DMA) 2324 if (flags & SLAB_CACHE_DMA)
2298 cachep->gfpflags |= GFP_DMA; 2325 cachep->gfpflags |= GFP_DMA;
2299 cachep->buffer_size = size; 2326 cachep->buffer_size = size;
2327 cachep->reciprocal_buffer_size = reciprocal_value(size);
2300 2328
2301 if (flags & CFLGS_OFF_SLAB) { 2329 if (flags & CFLGS_OFF_SLAB) {
2302 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2330 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -2326,7 +2354,6 @@ oops:
2326 panic("kmem_cache_create(): failed to create slab `%s'\n", 2354 panic("kmem_cache_create(): failed to create slab `%s'\n",
2327 name); 2355 name);
2328 mutex_unlock(&cache_chain_mutex); 2356 mutex_unlock(&cache_chain_mutex);
2329 unlock_cpu_hotplug();
2330 return cachep; 2357 return cachep;
2331} 2358}
2332EXPORT_SYMBOL(kmem_cache_create); 2359EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2471,7 @@ out:
2444 return nr_freed; 2471 return nr_freed;
2445} 2472}
2446 2473
2474/* Called with cache_chain_mutex held to protect against cpu hotplug */
2447static int __cache_shrink(struct kmem_cache *cachep) 2475static int __cache_shrink(struct kmem_cache *cachep)
2448{ 2476{
2449 int ret = 0, i = 0; 2477 int ret = 0, i = 0;
@@ -2474,9 +2502,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
2474 */ 2502 */
2475int kmem_cache_shrink(struct kmem_cache *cachep) 2503int kmem_cache_shrink(struct kmem_cache *cachep)
2476{ 2504{
2505 int ret;
2477 BUG_ON(!cachep || in_interrupt()); 2506 BUG_ON(!cachep || in_interrupt());
2478 2507
2479 return __cache_shrink(cachep); 2508 mutex_lock(&cache_chain_mutex);
2509 ret = __cache_shrink(cachep);
2510 mutex_unlock(&cache_chain_mutex);
2511 return ret;
2480} 2512}
2481EXPORT_SYMBOL(kmem_cache_shrink); 2513EXPORT_SYMBOL(kmem_cache_shrink);
2482 2514
@@ -2500,23 +2532,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2500{ 2532{
2501 BUG_ON(!cachep || in_interrupt()); 2533 BUG_ON(!cachep || in_interrupt());
2502 2534
2503 /* Don't let CPUs to come and go */
2504 lock_cpu_hotplug();
2505
2506 /* Find the cache in the chain of caches. */ 2535 /* Find the cache in the chain of caches. */
2507 mutex_lock(&cache_chain_mutex); 2536 mutex_lock(&cache_chain_mutex);
2508 /* 2537 /*
2509 * the chain is never empty, cache_cache is never destroyed 2538 * the chain is never empty, cache_cache is never destroyed
2510 */ 2539 */
2511 list_del(&cachep->next); 2540 list_del(&cachep->next);
2512 mutex_unlock(&cache_chain_mutex);
2513
2514 if (__cache_shrink(cachep)) { 2541 if (__cache_shrink(cachep)) {
2515 slab_error(cachep, "Can't free all objects"); 2542 slab_error(cachep, "Can't free all objects");
2516 mutex_lock(&cache_chain_mutex);
2517 list_add(&cachep->next, &cache_chain); 2543 list_add(&cachep->next, &cache_chain);
2518 mutex_unlock(&cache_chain_mutex); 2544 mutex_unlock(&cache_chain_mutex);
2519 unlock_cpu_hotplug();
2520 return; 2545 return;
2521 } 2546 }
2522 2547
@@ -2524,7 +2549,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2524 synchronize_rcu(); 2549 synchronize_rcu();
2525 2550
2526 __kmem_cache_destroy(cachep); 2551 __kmem_cache_destroy(cachep);
2527 unlock_cpu_hotplug(); 2552 mutex_unlock(&cache_chain_mutex);
2528} 2553}
2529EXPORT_SYMBOL(kmem_cache_destroy); 2554EXPORT_SYMBOL(kmem_cache_destroy);
2530 2555
@@ -2548,7 +2573,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2548 if (OFF_SLAB(cachep)) { 2573 if (OFF_SLAB(cachep)) {
2549 /* Slab management obj is off-slab. */ 2574 /* Slab management obj is off-slab. */
2550 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2575 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2551 local_flags, nodeid); 2576 local_flags & ~GFP_THISNODE, nodeid);
2552 if (!slabp) 2577 if (!slabp)
2553 return NULL; 2578 return NULL;
2554 } else { 2579 } else {
@@ -2618,7 +2643,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2618 2643
2619static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2644static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2620{ 2645{
2621 if (flags & SLAB_DMA) 2646 if (flags & GFP_DMA)
2622 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2647 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2623 else 2648 else
2624 BUG_ON(cachep->gfpflags & GFP_DMA); 2649 BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2689,10 +2714,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2689 * Grow (by 1) the number of slabs within a cache. This is called by 2714 * Grow (by 1) the number of slabs within a cache. This is called by
2690 * kmem_cache_alloc() when there are no active objs left in a cache. 2715 * kmem_cache_alloc() when there are no active objs left in a cache.
2691 */ 2716 */
2692static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2717static int cache_grow(struct kmem_cache *cachep,
2718 gfp_t flags, int nodeid, void *objp)
2693{ 2719{
2694 struct slab *slabp; 2720 struct slab *slabp;
2695 void *objp;
2696 size_t offset; 2721 size_t offset;
2697 gfp_t local_flags; 2722 gfp_t local_flags;
2698 unsigned long ctor_flags; 2723 unsigned long ctor_flags;
@@ -2702,12 +2727,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2702 * Be lazy and only check for valid flags here, keeping it out of the 2727 * Be lazy and only check for valid flags here, keeping it out of the
2703 * critical path in kmem_cache_alloc(). 2728 * critical path in kmem_cache_alloc().
2704 */ 2729 */
2705 BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); 2730 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2706 if (flags & SLAB_NO_GROW) 2731 if (flags & __GFP_NO_GROW)
2707 return 0; 2732 return 0;
2708 2733
2709 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2734 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2710 local_flags = (flags & SLAB_LEVEL_MASK); 2735 local_flags = (flags & GFP_LEVEL_MASK);
2711 if (!(local_flags & __GFP_WAIT)) 2736 if (!(local_flags & __GFP_WAIT))
2712 /* 2737 /*
2713 * Not allowed to sleep. Need to tell a constructor about 2738 * Not allowed to sleep. Need to tell a constructor about
@@ -2744,12 +2769,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2744 * Get mem for the objs. Attempt to allocate a physical page from 2769 * Get mem for the objs. Attempt to allocate a physical page from
2745 * 'nodeid'. 2770 * 'nodeid'.
2746 */ 2771 */
2747 objp = kmem_getpages(cachep, flags, nodeid); 2772 if (!objp)
2773 objp = kmem_getpages(cachep, flags, nodeid);
2748 if (!objp) 2774 if (!objp)
2749 goto failed; 2775 goto failed;
2750 2776
2751 /* Get slab management. */ 2777 /* Get slab management. */
2752 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); 2778 slabp = alloc_slabmgmt(cachep, objp, offset,
2779 local_flags & ~GFP_THISNODE, nodeid);
2753 if (!slabp) 2780 if (!slabp)
2754 goto opps1; 2781 goto opps1;
2755 2782
@@ -2987,7 +3014,7 @@ alloc_done:
2987 3014
2988 if (unlikely(!ac->avail)) { 3015 if (unlikely(!ac->avail)) {
2989 int x; 3016 int x;
2990 x = cache_grow(cachep, flags, node); 3017 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
2991 3018
2992 /* cache_grow can reenable interrupts, then ac could change. */ 3019 /* cache_grow can reenable interrupts, then ac could change. */
2993 ac = cpu_cache_get(cachep); 3020 ac = cpu_cache_get(cachep);
@@ -3063,18 +3090,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3063 3090
3064 cachep->ctor(objp, cachep, ctor_flags); 3091 cachep->ctor(objp, cachep, ctor_flags);
3065 } 3092 }
3093#if ARCH_SLAB_MINALIGN
3094 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3095 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3096 objp, ARCH_SLAB_MINALIGN);
3097 }
3098#endif
3066 return objp; 3099 return objp;
3067} 3100}
3068#else 3101#else
3069#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3102#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3070#endif 3103#endif
3071 3104
3105#ifdef CONFIG_FAILSLAB
3106
3107static struct failslab_attr {
3108
3109 struct fault_attr attr;
3110
3111 u32 ignore_gfp_wait;
3112#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3113 struct dentry *ignore_gfp_wait_file;
3114#endif
3115
3116} failslab = {
3117 .attr = FAULT_ATTR_INITIALIZER,
3118 .ignore_gfp_wait = 1,
3119};
3120
3121static int __init setup_failslab(char *str)
3122{
3123 return setup_fault_attr(&failslab.attr, str);
3124}
3125__setup("failslab=", setup_failslab);
3126
3127static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3128{
3129 if (cachep == &cache_cache)
3130 return 0;
3131 if (flags & __GFP_NOFAIL)
3132 return 0;
3133 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3134 return 0;
3135
3136 return should_fail(&failslab.attr, obj_size(cachep));
3137}
3138
3139#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3140
3141static int __init failslab_debugfs(void)
3142{
3143 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3144 struct dentry *dir;
3145 int err;
3146
3147 err = init_fault_attr_dentries(&failslab.attr, "failslab");
3148 if (err)
3149 return err;
3150 dir = failslab.attr.dentries.dir;
3151
3152 failslab.ignore_gfp_wait_file =
3153 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3154 &failslab.ignore_gfp_wait);
3155
3156 if (!failslab.ignore_gfp_wait_file) {
3157 err = -ENOMEM;
3158 debugfs_remove(failslab.ignore_gfp_wait_file);
3159 cleanup_fault_attr_dentries(&failslab.attr);
3160 }
3161
3162 return err;
3163}
3164
3165late_initcall(failslab_debugfs);
3166
3167#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3168
3169#else /* CONFIG_FAILSLAB */
3170
3171static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3172{
3173 return 0;
3174}
3175
3176#endif /* CONFIG_FAILSLAB */
3177
3072static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3178static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3073{ 3179{
3074 void *objp; 3180 void *objp;
3075 struct array_cache *ac; 3181 struct array_cache *ac;
3076 3182
3077 check_irq_off(); 3183 check_irq_off();
3184
3185 if (should_failslab(cachep, flags))
3186 return NULL;
3187
3078 ac = cpu_cache_get(cachep); 3188 ac = cpu_cache_get(cachep);
3079 if (likely(ac->avail)) { 3189 if (likely(ac->avail)) {
3080 STATS_INC_ALLOCHIT(cachep); 3190 STATS_INC_ALLOCHIT(cachep);
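
The CONFIG_FAILSLAB block and the should_failslab() check at the top of ____cache_alloc() hook slab into the generic fault-injection framework: a fault_attr seeded from the failslab= boot option (and, with CONFIG_FAULT_INJECTION_DEBUG_FS, tunable under debugfs) decides when an allocation should pretend to fail. A hedged sketch of wiring another subsystem to the same framework; failfoo and foo_alloc() are invented names:

#include <linux/fault-inject.h>
#include <linux/init.h>
#include <linux/slab.h>

static struct fault_attr failfoo = FAULT_ATTR_INITIALIZER;

static int __init setup_failfoo(char *str)
{
	/* Boot option format: failfoo=<interval>,<probability>,<space>,<times> */
	return setup_fault_attr(&failfoo, str);
}
__setup("failfoo=", setup_failfoo);

static void *foo_alloc(size_t size, gfp_t gfp)
{
	if (should_fail(&failfoo, size))
		return NULL;			/* injected failure */
	return kmalloc(size, gfp);
}
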
@@ -3105,10 +3215,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3105 objp = ____cache_alloc(cachep, flags); 3215 objp = ____cache_alloc(cachep, flags);
3106 /* 3216 /*
3107 * We may just have run out of memory on the local node. 3217 * We may just have run out of memory on the local node.
3108 * __cache_alloc_node() knows how to locate memory on other nodes 3218 * ____cache_alloc_node() knows how to locate memory on other nodes
3109 */ 3219 */
3110 if (NUMA_BUILD && !objp) 3220 if (NUMA_BUILD && !objp)
3111 objp = __cache_alloc_node(cachep, flags, numa_node_id()); 3221 objp = ____cache_alloc_node(cachep, flags, numa_node_id());
3112 local_irq_restore(save_flags); 3222 local_irq_restore(save_flags);
3113 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3223 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3114 caller); 3224 caller);
@@ -3135,15 +3245,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3135 else if (current->mempolicy) 3245 else if (current->mempolicy)
3136 nid_alloc = slab_node(current->mempolicy); 3246 nid_alloc = slab_node(current->mempolicy);
3137 if (nid_alloc != nid_here) 3247 if (nid_alloc != nid_here)
3138 return __cache_alloc_node(cachep, flags, nid_alloc); 3248 return ____cache_alloc_node(cachep, flags, nid_alloc);
3139 return NULL; 3249 return NULL;
3140} 3250}
3141 3251
3142/* 3252/*
3143 * Fallback function if there was no memory available and no objects on a 3253 * Fallback function if there was no memory available and no objects on a
3144 * certain node and we are allowed to fall back. We mimick the behavior of 3254 * certain node and fall back is permitted. First we scan all the
3145 * the page allocator. We fall back according to a zonelist determined by 3255 * available nodelists for available objects. If that fails then we
3146 * the policy layer while obeying cpuset constraints. 3256 * perform an allocation without specifying a node. This allows the page
3257 * allocator to do its reclaim / fallback magic. We then insert the
3258 * slab into the proper nodelist and then allocate from it.
3147 */ 3259 */
3148void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3260void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3149{ 3261{
@@ -3151,15 +3263,57 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3151 ->node_zonelists[gfp_zone(flags)]; 3263 ->node_zonelists[gfp_zone(flags)];
3152 struct zone **z; 3264 struct zone **z;
3153 void *obj = NULL; 3265 void *obj = NULL;
3266 int nid;
3267 gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3154 3268
3269retry:
3270 /*
3271 * Look through allowed nodes for objects available
3272 * from existing per node queues.
3273 */
3155 for (z = zonelist->zones; *z && !obj; z++) { 3274 for (z = zonelist->zones; *z && !obj; z++) {
3156 int nid = zone_to_nid(*z); 3275 nid = zone_to_nid(*z);
3157 3276
3158 if (zone_idx(*z) <= ZONE_NORMAL && 3277 if (cpuset_zone_allowed_hardwall(*z, flags) &&
3159 cpuset_zone_allowed(*z, flags) && 3278 cache->nodelists[nid] &&
3160 cache->nodelists[nid]) 3279 cache->nodelists[nid]->free_objects)
3161 obj = __cache_alloc_node(cache, 3280 obj = ____cache_alloc_node(cache,
3162 flags | __GFP_THISNODE, nid); 3281 flags | GFP_THISNODE, nid);
3282 }
3283
3284 if (!obj && !(flags & __GFP_NO_GROW)) {
3285 /*
3286 * This allocation will be performed within the constraints
3287 * of the current cpuset / memory policy requirements.
3288 * We may trigger various forms of reclaim on the allowed
3289 * set and go into memory reserves if necessary.
3290 */
3291 if (local_flags & __GFP_WAIT)
3292 local_irq_enable();
3293 kmem_flagcheck(cache, flags);
3294 obj = kmem_getpages(cache, flags, -1);
3295 if (local_flags & __GFP_WAIT)
3296 local_irq_disable();
3297 if (obj) {
3298 /*
3299 * Insert into the appropriate per node queues
3300 */
3301 nid = page_to_nid(virt_to_page(obj));
3302 if (cache_grow(cache, flags, nid, obj)) {
3303 obj = ____cache_alloc_node(cache,
3304 flags | GFP_THISNODE, nid);
3305 if (!obj)
3306 /*
3307 * Another processor may allocate the
3308 * objects in the slab since we are
3309 * not holding any locks.
3310 */
3311 goto retry;
3312 } else {
3313 /* cache_grow already freed obj */
3314 obj = NULL;
3315 }
3316 }
3163 } 3317 }
3164 return obj; 3318 return obj;
3165} 3319}
@@ -3167,7 +3321,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3167/* 3321/*
 3168 * An interface to enable slab creation on nodeid 3322 * An interface to enable slab creation on nodeid
 3168 * An interface to enable slab creation on nodeid 3322 * An interface to enable slab creation on nodeid
3169 */ 3323 */
3170static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3324static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3171 int nodeid) 3325 int nodeid)
3172{ 3326{
3173 struct list_head *entry; 3327 struct list_head *entry;
@@ -3216,7 +3370,7 @@ retry:
3216 3370
3217must_grow: 3371must_grow:
3218 spin_unlock(&l3->list_lock); 3372 spin_unlock(&l3->list_lock);
3219 x = cache_grow(cachep, flags, nodeid); 3373 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3220 if (x) 3374 if (x)
3221 goto retry; 3375 goto retry;
3222 3376
@@ -3399,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
3399 * 3553 *
3400 * Currently only used for dentry validation. 3554 * Currently only used for dentry validation.
3401 */ 3555 */
3402int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) 3556int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3403{ 3557{
3404 unsigned long addr = (unsigned long)ptr; 3558 unsigned long addr = (unsigned long)ptr;
3405 unsigned long min_addr = PAGE_OFFSET; 3559 unsigned long min_addr = PAGE_OFFSET;
@@ -3433,36 +3587,61 @@ out:
3433 * @cachep: The cache to allocate from. 3587 * @cachep: The cache to allocate from.
3434 * @flags: See kmalloc(). 3588 * @flags: See kmalloc().
3435 * @nodeid: node number of the target node. 3589 * @nodeid: node number of the target node.
3590 * @caller: return address of caller, used for debug information
3591 *
3592 * Identical to kmem_cache_alloc but it will allocate memory on the given
3593 * node, which can improve the performance for cpu bound structures.
3436 * 3594 *
3437 * Identical to kmem_cache_alloc, except that this function is slow 3595 * Fallback to other node is possible if __GFP_THISNODE is not set.
3438 * and can sleep. And it will allocate memory on the given node, which
3439 * can improve the performance for cpu bound structures.
3440 * New and improved: it will now make sure that the object gets
3441 * put on the correct node list so that there is no false sharing.
3442 */ 3596 */
3443void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3597static __always_inline void *
3598__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3599 int nodeid, void *caller)
3444{ 3600{
3445 unsigned long save_flags; 3601 unsigned long save_flags;
3446 void *ptr; 3602 void *ptr = NULL;
3447 3603
3448 cache_alloc_debugcheck_before(cachep, flags); 3604 cache_alloc_debugcheck_before(cachep, flags);
3449 local_irq_save(save_flags); 3605 local_irq_save(save_flags);
3450 3606
3451 if (nodeid == -1 || nodeid == numa_node_id() || 3607 if (unlikely(nodeid == -1))
3452 !cachep->nodelists[nodeid]) 3608 nodeid = numa_node_id();
3453 ptr = ____cache_alloc(cachep, flags);
3454 else
3455 ptr = __cache_alloc_node(cachep, flags, nodeid);
3456 local_irq_restore(save_flags);
3457 3609
3458 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, 3610 if (likely(cachep->nodelists[nodeid])) {
3459 __builtin_return_address(0)); 3611 if (nodeid == numa_node_id()) {
3612 /*
3613 * Use the locally cached objects if possible.
3614 * However ____cache_alloc does not allow fallback
3615 * to other nodes. It may fail while we still have
3616 * objects on other nodes available.
3617 */
3618 ptr = ____cache_alloc(cachep, flags);
3619 }
3620 if (!ptr) {
3621 /* ___cache_alloc_node can fall back to other nodes */
3622 ptr = ____cache_alloc_node(cachep, flags, nodeid);
3623 }
3624 } else {
3625 /* Node not bootstrapped yet */
3626 if (!(flags & __GFP_THISNODE))
3627 ptr = fallback_alloc(cachep, flags);
3628 }
3629
3630 local_irq_restore(save_flags);
3631 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3460 3632
3461 return ptr; 3633 return ptr;
3462} 3634}
3635
3636void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3637{
3638 return __cache_alloc_node(cachep, flags, nodeid,
3639 __builtin_return_address(0));
3640}
3463EXPORT_SYMBOL(kmem_cache_alloc_node); 3641EXPORT_SYMBOL(kmem_cache_alloc_node);
3464 3642
3465void *__kmalloc_node(size_t size, gfp_t flags, int node) 3643static __always_inline void *
3644__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3466{ 3645{
3467 struct kmem_cache *cachep; 3646 struct kmem_cache *cachep;
3468 3647
@@ -3471,8 +3650,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3471 return NULL; 3650 return NULL;
3472 return kmem_cache_alloc_node(cachep, flags, node); 3651 return kmem_cache_alloc_node(cachep, flags, node);
3473} 3652}
3653
3654#ifdef CONFIG_DEBUG_SLAB
3655void *__kmalloc_node(size_t size, gfp_t flags, int node)
3656{
3657 return __do_kmalloc_node(size, flags, node,
3658 __builtin_return_address(0));
3659}
3474EXPORT_SYMBOL(__kmalloc_node); 3660EXPORT_SYMBOL(__kmalloc_node);
3475#endif 3661
3662void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3663 int node, void *caller)
3664{
3665 return __do_kmalloc_node(size, flags, node, caller);
3666}
3667EXPORT_SYMBOL(__kmalloc_node_track_caller);
3668#else
3669void *__kmalloc_node(size_t size, gfp_t flags, int node)
3670{
3671 return __do_kmalloc_node(size, flags, node, NULL);
3672}
3673EXPORT_SYMBOL(__kmalloc_node);
3674#endif /* CONFIG_DEBUG_SLAB */
3675#endif /* CONFIG_NUMA */
3476 3676
3477/** 3677/**
3478 * __do_kmalloc - allocate memory 3678 * __do_kmalloc - allocate memory
@@ -3583,13 +3783,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3583 int node; 3783 int node;
3584 struct kmem_list3 *l3; 3784 struct kmem_list3 *l3;
3585 struct array_cache *new_shared; 3785 struct array_cache *new_shared;
3586 struct array_cache **new_alien; 3786 struct array_cache **new_alien = NULL;
3587 3787
3588 for_each_online_node(node) { 3788 for_each_online_node(node) {
3589 3789
3590 new_alien = alloc_alien_cache(node, cachep->limit); 3790 if (use_alien_caches) {
3591 if (!new_alien) 3791 new_alien = alloc_alien_cache(node, cachep->limit);
3592 goto fail; 3792 if (!new_alien)
3793 goto fail;
3794 }
3593 3795
3594 new_shared = alloc_arraycache(node, 3796 new_shared = alloc_arraycache(node,
3595 cachep->shared*cachep->batchcount, 3797 cachep->shared*cachep->batchcount,
@@ -3815,7 +4017,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3815 * If we cannot acquire the cache chain mutex then just give up - we'll try 4017 * If we cannot acquire the cache chain mutex then just give up - we'll try
3816 * again on the next iteration. 4018 * again on the next iteration.
3817 */ 4019 */
3818static void cache_reap(void *unused) 4020static void cache_reap(struct work_struct *unused)
3819{ 4021{
3820 struct kmem_cache *searchp; 4022 struct kmem_cache *searchp;
3821 struct kmem_list3 *l3; 4023 struct kmem_list3 *l3;
@@ -3824,7 +4026,7 @@ static void cache_reap(void *unused)
3824 if (!mutex_trylock(&cache_chain_mutex)) { 4026 if (!mutex_trylock(&cache_chain_mutex)) {
3825 /* Give up. Setup the next iteration. */ 4027 /* Give up. Setup the next iteration. */
3826 schedule_delayed_work(&__get_cpu_var(reap_work), 4028 schedule_delayed_work(&__get_cpu_var(reap_work),
3827 REAPTIMEOUT_CPUC); 4029 round_jiffies_relative(REAPTIMEOUT_CPUC));
3828 return; 4030 return;
3829 } 4031 }
3830 4032
@@ -3870,7 +4072,8 @@ next:
3870 next_reap_node(); 4072 next_reap_node();
3871 refresh_cpu_vm_stats(smp_processor_id()); 4073 refresh_cpu_vm_stats(smp_processor_id());
3872 /* Set up the next iteration */ 4074 /* Set up the next iteration */
3873 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 4075 schedule_delayed_work(&__get_cpu_var(reap_work),
4076 round_jiffies_relative(REAPTIMEOUT_CPUC));
3874} 4077}
3875 4078
3876#ifdef CONFIG_PROC_FS 4079#ifdef CONFIG_PROC_FS
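
round_jiffies_relative() (new in this series) rounds a relative timeout to a whole-second jiffy boundary so that periodic timers from many CPUs fire together and otherwise idle CPUs can stay asleep longer; cache_reap() does not care about the exact expiry. A small sketch of the same pattern for a hypothetical delayed work item (housekeeping_work/housekeeping_fn are placeholders):

#include <linux/workqueue.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static void housekeeping_fn(struct work_struct *unused);
static DECLARE_DELAYED_WORK(housekeeping_work, housekeeping_fn);

static void housekeeping_fn(struct work_struct *unused)
{
	/* ... periodic, non-time-critical maintenance ... */

	/* Re-arm roughly every 2 seconds; rounding batches wakeups. */
	schedule_delayed_work(&housekeeping_work,
			      round_jiffies_relative(2 * HZ));
}
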
@@ -4038,7 +4241,7 @@ static int s_show(struct seq_file *m, void *p)
4038 * + further values on SMP and with statistics enabled 4241 * + further values on SMP and with statistics enabled
4039 */ 4242 */
4040 4243
4041struct seq_operations slabinfo_op = { 4244const struct seq_operations slabinfo_op = {
4042 .start = s_start, 4245 .start = s_start,
4043 .next = s_next, 4246 .next = s_next,
4044 .stop = s_stop, 4247 .stop = s_stop,
@@ -4236,7 +4439,7 @@ static int leaks_show(struct seq_file *m, void *p)
4236 return 0; 4439 return 0;
4237} 4440}
4238 4441
4239struct seq_operations slabstats_op = { 4442const struct seq_operations slabstats_op = {
4240 .start = leaks_start, 4443 .start = leaks_start,
4241 .next = s_next, 4444 .next = s_next,
4242 .stop = s_stop, 4445 .stop = s_stop,
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a58..5adc29cb58dd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
60static DEFINE_SPINLOCK(block_lock); 60static DEFINE_SPINLOCK(block_lock);
61 61
62static void slob_free(void *b, int size); 62static void slob_free(void *b, int size);
63static void slob_timer_cbk(void);
64
63 65
64static void *slob_alloc(size_t size, gfp_t gfp, int align) 66static void *slob_alloc(size_t size, gfp_t gfp, int align)
65{ 67{
@@ -157,7 +159,7 @@ static int fastcall find_order(int size)
157 return order; 159 return order;
158} 160}
159 161
160void *kmalloc(size_t size, gfp_t gfp) 162void *__kmalloc(size_t size, gfp_t gfp)
161{ 163{
162 slob_t *m; 164 slob_t *m;
163 bigblock_t *bb; 165 bigblock_t *bb;
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp)
186 slob_free(bb, sizeof(bigblock_t)); 188 slob_free(bb, sizeof(bigblock_t));
187 return 0; 189 return 0;
188} 190}
189 191EXPORT_SYMBOL(__kmalloc);
190EXPORT_SYMBOL(kmalloc);
191 192
192void kfree(const void *block) 193void kfree(const void *block)
193{ 194{
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c)
327EXPORT_SYMBOL(kmem_cache_name); 328EXPORT_SYMBOL(kmem_cache_name);
328 329
329static struct timer_list slob_timer = TIMER_INITIALIZER( 330static struct timer_list slob_timer = TIMER_INITIALIZER(
330 (void (*)(unsigned long))kmem_cache_init, 0, 0); 331 (void (*)(unsigned long))slob_timer_cbk, 0, 0);
332
333int kmem_cache_shrink(struct kmem_cache *d)
334{
335 return 0;
336}
337EXPORT_SYMBOL(kmem_cache_shrink);
338
339int kmem_ptr_validate(struct kmem_cache *a, const void *b)
340{
341 return 0;
342}
343
344void __init kmem_cache_init(void)
345{
346 slob_timer_cbk();
347}
331 348
332void kmem_cache_init(void) 349static void slob_timer_cbk(void)
333{ 350{
334 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); 351 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
335 352
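
Renaming slob's kmalloc() to __kmalloc() (and adding the kmem_cache_shrink()/kmem_ptr_validate() stubs) brings the SLOB entry points in line with SLAB's, where kmalloc() itself is a thin wrapper over __kmalloc(). A sketch of such a forwarding wrapper, assuming only that __kmalloc() is the real allocator; this is illustrative, not the actual slab.h text:

#include <linux/slab.h>

/* Illustrative only: with the out-of-line allocator exporting __kmalloc(),
 * kmalloc() can stay a trivial inline the compiler folds away. */
static inline void *kmalloc_sketch(size_t size, gfp_t flags)
{
	return __kmalloc(size, flags);
}
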
diff --git a/mm/sparse.c b/mm/sparse.c
index b3c82ba30012..ac26eb0d73cd 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24#endif 24#endif
25EXPORT_SYMBOL(mem_section); 25EXPORT_SYMBOL(mem_section);
26 26
27#ifdef NODE_NOT_IN_PAGE_FLAGS
28/*
29 * If we did not store the node number in the page then we have to
30 * do a lookup in the section_to_node_table in order to find which
31 * node the page belongs to.
32 */
33#if MAX_NUMNODES <= 256
34static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
35#else
36static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
37#endif
38
39int page_to_nid(struct page *page)
40{
41 return section_to_node_table[page_to_section(page)];
42}
43EXPORT_SYMBOL(page_to_nid);
44#endif
45
27#ifdef CONFIG_SPARSEMEM_EXTREME 46#ifdef CONFIG_SPARSEMEM_EXTREME
28static struct mem_section *sparse_index_alloc(int nid) 47static struct mem_section *sparse_index_alloc(int nid)
29{ 48{
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid)
49 struct mem_section *section; 68 struct mem_section *section;
50 int ret = 0; 69 int ret = 0;
51 70
71#ifdef NODE_NOT_IN_PAGE_FLAGS
72 section_to_node_table[section_nr] = nid;
73#endif
74
52 if (mem_section[root]) 75 if (mem_section[root])
53 return -EEXIST; 76 return -EEXIST;
54 77
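
The NODE_NOT_IN_PAGE_FLAGS path added above covers configurations where section, zone and node ids no longer all fit in page->flags: only the section number is stored, and the new section_to_node_table maps it back to a node. When the node id does fit, page_to_nid() stays a shift-and-mask on page->flags, roughly as below (NODES_PGSHIFT/NODES_MASK are the mm.h field helpers of this era; treat the function as a sketch):

#include <linux/mm.h>

/* Sketch of the fast path kept for configs where the node id is encoded
 * directly in page->flags. */
static inline int page_to_nid_sketch(struct page *page)
{
	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
}
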
diff --git a/mm/swap.c b/mm/swap.c
index 2e0e871f542f..2ed7be39795e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page)
57{ 57{
58 page = (struct page *)page_private(page); 58 page = (struct page *)page_private(page);
59 if (put_page_testzero(page)) { 59 if (put_page_testzero(page)) {
60 void (*dtor)(struct page *page); 60 compound_page_dtor *dtor;
61 61
62 dtor = (void (*)(struct page *))page[1].lru.next; 62 dtor = get_compound_page_dtor(page);
63 (*dtor)(page); 63 (*dtor)(page);
64 } 64 }
65} 65}
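
put_compound_page() now goes through a compound_page_dtor typedef and get_compound_page_dtor() instead of open-coding the cast. A sketch of the helpers this relies on, which stash the destructor pointer in the first tail page's otherwise unused lru.next field (suffixed _sketch here to make clear they paraphrase, rather than quote, include/linux/mm.h):

#include <linux/mm.h>

typedef void compound_page_dtor(struct page *);

static inline void set_compound_page_dtor_sketch(struct page *page,
						 compound_page_dtor *dtor)
{
	page[1].lru.next = (void *)dtor;	/* first tail page */
}

static inline compound_page_dtor *get_compound_page_dtor_sketch(struct page *page)
{
	return (compound_page_dtor *)page[1].lru.next;
}
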
@@ -216,7 +216,7 @@ void lru_add_drain(void)
216} 216}
217 217
218#ifdef CONFIG_NUMA 218#ifdef CONFIG_NUMA
219static void lru_add_drain_per_cpu(void *dummy) 219static void lru_add_drain_per_cpu(struct work_struct *dummy)
220{ 220{
221 lru_add_drain(); 221 lru_add_drain();
222} 222}
@@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy)
226 */ 226 */
227int lru_add_drain_all(void) 227int lru_add_drain_all(void)
228{ 228{
229 return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); 229 return schedule_on_each_cpu(lru_add_drain_per_cpu);
230} 230}
231 231
232#else 232#else
@@ -514,5 +514,7 @@ void __init swap_setup(void)
 514 * Right now other parts of the system mean that we 514 * Right now other parts of the system mean that we
515 * _really_ don't want to cluster much more 515 * _really_ don't want to cluster much more
516 */ 516 */
517#ifdef CONFIG_HOTPLUG_CPU
517 hotcpu_notifier(cpu_swap_callback, 0); 518 hotcpu_notifier(cpu_swap_callback, 0);
519#endif
518} 520}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a15def63f28f..a2d9bb4e80df 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -427,34 +427,54 @@ void free_swap_and_cache(swp_entry_t entry)
427 427
428#ifdef CONFIG_SOFTWARE_SUSPEND 428#ifdef CONFIG_SOFTWARE_SUSPEND
429/* 429/*
430 * Find the swap type that corresponds to given device (if any) 430 * Find the swap type that corresponds to given device (if any).
431 * 431 *
432 * This is needed for software suspend and is done in such a way that inode 432 * @offset - number of the PAGE_SIZE-sized block of the device, starting
433 * aliasing is allowed. 433 * from 0, in which the swap header is expected to be located.
434 *
435 * This is needed for the suspend to disk (aka swsusp).
434 */ 436 */
435int swap_type_of(dev_t device) 437int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
436{ 438{
439 struct block_device *bdev = NULL;
437 int i; 440 int i;
438 441
442 if (device)
443 bdev = bdget(device);
444
439 spin_lock(&swap_lock); 445 spin_lock(&swap_lock);
440 for (i = 0; i < nr_swapfiles; i++) { 446 for (i = 0; i < nr_swapfiles; i++) {
441 struct inode *inode; 447 struct swap_info_struct *sis = swap_info + i;
442 448
443 if (!(swap_info[i].flags & SWP_WRITEOK)) 449 if (!(sis->flags & SWP_WRITEOK))
444 continue; 450 continue;
445 451
446 if (!device) { 452 if (!bdev) {
453 if (bdev_p)
454 *bdev_p = sis->bdev;
455
447 spin_unlock(&swap_lock); 456 spin_unlock(&swap_lock);
448 return i; 457 return i;
449 } 458 }
450 inode = swap_info[i].swap_file->f_dentry->d_inode; 459 if (bdev == sis->bdev) {
451 if (S_ISBLK(inode->i_mode) && 460 struct swap_extent *se;
452 device == MKDEV(imajor(inode), iminor(inode))) { 461
453 spin_unlock(&swap_lock); 462 se = list_entry(sis->extent_list.next,
454 return i; 463 struct swap_extent, list);
464 if (se->start_block == offset) {
465 if (bdev_p)
466 *bdev_p = sis->bdev;
467
468 spin_unlock(&swap_lock);
469 bdput(bdev);
470 return i;
471 }
455 } 472 }
456 } 473 }
457 spin_unlock(&swap_lock); 474 spin_unlock(&swap_lock);
475 if (bdev)
476 bdput(bdev);
477
458 return -ENODEV; 478 return -ENODEV;
459} 479}
460 480
@@ -931,6 +951,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
931 } 951 }
932} 952}
933 953
954#ifdef CONFIG_SOFTWARE_SUSPEND
955/*
956 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
957 * corresponding to given index in swap_info (swap type).
958 */
959sector_t swapdev_block(int swap_type, pgoff_t offset)
960{
961 struct swap_info_struct *sis;
962
963 if (swap_type >= nr_swapfiles)
964 return 0;
965
966 sis = swap_info + swap_type;
967 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
968}
969#endif /* CONFIG_SOFTWARE_SUSPEND */
970
934/* 971/*
935 * Free all of a swapdev's extent information 972 * Free all of a swapdev's extent information
936 */ 973 */
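
swap_type_of() now identifies a swap area by a (device, offset) pair and can hand back its block device, and swapdev_block() maps a swap offset to a device block; both exist for the suspend-to-disk path. A hedged sketch of how a swsusp-style caller might combine them, assuming the declarations this series adds to linux/swap.h; resume_device/resume_block and find_resume_swap() are placeholders, not the real kernel/power code:

#include <linux/swap.h>

static int find_resume_swap(dev_t resume_device, sector_t resume_block,
			    struct block_device **bdev)
{
	int type = swap_type_of(resume_device, resume_block, bdev);

	if (type < 0)
		return type;			/* -ENODEV: no matching swap area */

	/* Offset 1 is the first usable slot (slot 0 holds the header). */
	return swapdev_block(type, 1) ? type : -ENODEV;
}
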
@@ -1274,10 +1311,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1274 1311
1275 mutex_lock(&swapon_mutex); 1312 mutex_lock(&swapon_mutex);
1276 1313
1314 if (!l)
1315 return SEQ_START_TOKEN;
1316
1277 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1317 for (i = 0; i < nr_swapfiles; i++, ptr++) {
1278 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1318 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1279 continue; 1319 continue;
1280 if (!l--) 1320 if (!--l)
1281 return ptr; 1321 return ptr;
1282 } 1322 }
1283 1323
@@ -1286,10 +1326,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1286 1326
1287static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1327static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1288{ 1328{
1289 struct swap_info_struct *ptr = v; 1329 struct swap_info_struct *ptr;
1290 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1330 struct swap_info_struct *endptr = swap_info + nr_swapfiles;
1291 1331
1292 for (++ptr; ptr < endptr; ptr++) { 1332 if (v == SEQ_START_TOKEN)
1333 ptr = swap_info;
1334 else {
1335 ptr = v;
1336 ptr++;
1337 }
1338
1339 for (; ptr < endptr; ptr++) {
1293 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1340 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1294 continue; 1341 continue;
1295 ++*pos; 1342 ++*pos;
@@ -1310,14 +1357,16 @@ static int swap_show(struct seq_file *swap, void *v)
1310 struct file *file; 1357 struct file *file;
1311 int len; 1358 int len;
1312 1359
1313 if (v == swap_info) 1360 if (ptr == SEQ_START_TOKEN) {
1314 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1361 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1362 return 0;
1363 }
1315 1364
1316 file = ptr->swap_file; 1365 file = ptr->swap_file;
1317 len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); 1366 len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
1318 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1367 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1319 len < 40 ? 40 - len : 1, " ", 1368 len < 40 ? 40 - len : 1, " ",
1320 S_ISBLK(file->f_dentry->d_inode->i_mode) ? 1369 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1321 "partition" : "file\t", 1370 "partition" : "file\t",
1322 ptr->pages << (PAGE_SHIFT - 10), 1371 ptr->pages << (PAGE_SHIFT - 10),
1323 ptr->inuse_pages << (PAGE_SHIFT - 10), 1372 ptr->inuse_pages << (PAGE_SHIFT - 10),
@@ -1325,7 +1374,7 @@ static int swap_show(struct seq_file *swap, void *v)
1325 return 0; 1374 return 0;
1326} 1375}
1327 1376
1328static struct seq_operations swaps_op = { 1377static const struct seq_operations swaps_op = {
1329 .start = swap_start, 1378 .start = swap_start,
1330 .next = swap_next, 1379 .next = swap_next,
1331 .stop = swap_stop, 1380 .stop = swap_stop,
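
The /proc/swaps iterator now follows the standard seq_file SEQ_START_TOKEN convention: ->start returns SEQ_START_TOKEN at position 0, ->show prints the header line when it sees it, and ->next steps from the token to the first real entry. A condensed sketch of that convention over a hypothetical fixed table (the demo_* names are invented):

#include <linux/seq_file.h>

static int demo_items[4] = { 10, 20, 30, 40 };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;			/* header row */
	return *pos <= 4 ? &demo_items[*pos - 1] : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return *pos <= 4 ? &demo_items[*pos - 1] : NULL;
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(m, "Value\n");
	else
		seq_printf(m, "%d\n", *(int *)v);
	return 0;
}

static const struct seq_operations demo_op = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};
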
@@ -1337,7 +1386,7 @@ static int swaps_open(struct inode *inode, struct file *file)
1337 return seq_open(file, &swaps_op); 1386 return seq_open(file, &swaps_op);
1338} 1387}
1339 1388
1340static struct file_operations proc_swaps_operations = { 1389static const struct file_operations proc_swaps_operations = {
1341 .open = swaps_open, 1390 .open = swaps_open,
1342 .read = seq_read, 1391 .read = seq_read,
1343 .llseek = seq_lseek, 1392 .llseek = seq_lseek,
@@ -1540,6 +1589,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1540 error = -EINVAL; 1589 error = -EINVAL;
1541 if (!maxpages) 1590 if (!maxpages)
1542 goto bad_swap; 1591 goto bad_swap;
1592 if (swapfilesize && maxpages > swapfilesize) {
1593 printk(KERN_WARNING
1594 "Swap area shorter than signature indicates\n");
1595 goto bad_swap;
1596 }
1543 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1597 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1544 goto bad_swap; 1598 goto bad_swap;
1545 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1599 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
@@ -1567,12 +1621,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1567 goto bad_swap; 1621 goto bad_swap;
1568 } 1622 }
1569 1623
1570 if (swapfilesize && maxpages > swapfilesize) {
1571 printk(KERN_WARNING
1572 "Swap area shorter than signature indicates\n");
1573 error = -EINVAL;
1574 goto bad_swap;
1575 }
1576 if (nr_good_pages) { 1624 if (nr_good_pages) {
1577 p->swap_map[0] = SWAP_MAP_BAD; 1625 p->swap_map[0] = SWAP_MAP_BAD;
1578 p->max = maxpages; 1626 p->max = maxpages;
diff --git a/mm/thrash.c b/mm/thrash.c
index f4c560b4a2b7..9ef9071f99bc 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -7,100 +7,74 @@
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
10 */ 18 */
19
11#include <linux/jiffies.h> 20#include <linux/jiffies.h>
12#include <linux/mm.h> 21#include <linux/mm.h>
13#include <linux/sched.h> 22#include <linux/sched.h>
14#include <linux/swap.h> 23#include <linux/swap.h>
15 24
16static DEFINE_SPINLOCK(swap_token_lock); 25static DEFINE_SPINLOCK(swap_token_lock);
17static unsigned long swap_token_timeout; 26struct mm_struct *swap_token_mm;
18static unsigned long swap_token_check; 27static unsigned int global_faults;
19struct mm_struct * swap_token_mm = &init_mm;
20
21#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
22#define SWAP_TOKEN_TIMEOUT (300 * HZ)
23/*
24 * Currently disabled; Needs further code to work at HZ * 300.
25 */
26unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
27
28/*
29 * Take the token away if the process had no page faults
30 * in the last interval, or if it has held the token for
31 * too long.
32 */
33#define SWAP_TOKEN_ENOUGH_RSS 1
34#define SWAP_TOKEN_TIMED_OUT 2
35static int should_release_swap_token(struct mm_struct *mm)
36{
37 int ret = 0;
38 if (!mm->recent_pagein)
39 ret = SWAP_TOKEN_ENOUGH_RSS;
40 else if (time_after(jiffies, swap_token_timeout))
41 ret = SWAP_TOKEN_TIMED_OUT;
42 mm->recent_pagein = 0;
43 return ret;
44}
45 28
46/*
47 * Try to grab the swapout protection token. We only try to
48 * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
49 * SMP lock contention and to check that the process that held
50 * the token before is no longer thrashing.
51 */
52void grab_swap_token(void) 29void grab_swap_token(void)
53{ 30{
54 struct mm_struct *mm; 31 int current_interval;
55 int reason;
56 32
57 /* We have the token. Let others know we still need it. */ 33 global_faults++;
58 if (has_swap_token(current->mm)) {
59 current->mm->recent_pagein = 1;
60 if (unlikely(!swap_token_default_timeout))
61 disable_swap_token();
62 return;
63 }
64
65 if (time_after(jiffies, swap_token_check)) {
66 34
67 if (!swap_token_default_timeout) { 35 current_interval = global_faults - current->mm->faultstamp;
68 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
69 return;
70 }
71
72 /* ... or if we recently held the token. */
73 if (time_before(jiffies, current->mm->swap_token_time))
74 return;
75 36
76 if (!spin_trylock(&swap_token_lock)) 37 if (!spin_trylock(&swap_token_lock))
77 return; 38 return;
78 39
79 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; 40 /* First come first served */
41 if (swap_token_mm == NULL) {
42 current->mm->token_priority = current->mm->token_priority + 2;
43 swap_token_mm = current->mm;
44 goto out;
45 }
80 46
81 mm = swap_token_mm; 47 if (current->mm != swap_token_mm) {
82 if ((reason = should_release_swap_token(mm))) { 48 if (current_interval < current->mm->last_interval)
83 unsigned long eligible = jiffies; 49 current->mm->token_priority++;
84 if (reason == SWAP_TOKEN_TIMED_OUT) { 50 else {
85 eligible += swap_token_default_timeout; 51 current->mm->token_priority--;
86 } 52 if (unlikely(current->mm->token_priority < 0))
87 mm->swap_token_time = eligible; 53 current->mm->token_priority = 0;
88 swap_token_timeout = jiffies + swap_token_default_timeout; 54 }
55 /* Check if we deserve the token */
56 if (current->mm->token_priority >
57 swap_token_mm->token_priority) {
58 current->mm->token_priority += 2;
89 swap_token_mm = current->mm; 59 swap_token_mm = current->mm;
90 } 60 }
91 spin_unlock(&swap_token_lock); 61 } else {
62 /* Token holder came in again! */
63 current->mm->token_priority += 2;
92 } 64 }
93 return; 65
66out:
67 current->mm->faultstamp = global_faults;
68 current->mm->last_interval = current_interval;
69 spin_unlock(&swap_token_lock);
70return;
94} 71}
95 72
96/* Called on process exit. */ 73/* Called on process exit. */
97void __put_swap_token(struct mm_struct *mm) 74void __put_swap_token(struct mm_struct *mm)
98{ 75{
99 spin_lock(&swap_token_lock); 76 spin_lock(&swap_token_lock);
100 if (likely(mm == swap_token_mm)) { 77 if (likely(mm == swap_token_mm))
101 mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; 78 swap_token_mm = NULL;
102 swap_token_mm = &init_mm;
103 swap_token_check = jiffies;
104 }
105 spin_unlock(&swap_token_lock); 79 spin_unlock(&swap_token_lock);
106} 80}
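
The rewritten thrash.c replaces the jiffies-based token timeout with the priority scheme described in the new header comment: every major fault bumps a global counter, each mm remembers when it last faulted (faultstamp) and how long the previous gap was (last_interval), a contender's priority rises if its faults are arriving faster than before and decays towards zero otherwise, and the token moves (with a +2 boost) once a contender outbids the holder. A toy userspace C rendering of just that decision rule (not kernel code; struct task and fault() are invented):

#include <stdio.h>

struct task { int prio, faultstamp, last_interval; };

static struct task *holder;		/* plays the role of swap_token_mm */

static void fault(struct task *t, int global_faults)
{
	int interval = global_faults - t->faultstamp;

	if (!holder) {				/* first come, first served */
		t->prio += 2;
		holder = t;
	} else if (t != holder) {
		if (interval < t->last_interval)
			t->prio++;		/* faulting faster than before */
		else if (--t->prio < 0)
			t->prio = 0;		/* decay, floored at zero */
		if (t->prio > holder->prio) {	/* outbid the holder */
			t->prio += 2;
			holder = t;
		}
	} else {
		t->prio += 2;			/* holder keeps its boost */
	}
	t->faultstamp = global_faults;
	t->last_interval = interval;
}

int main(void)
{
	struct task a = { 0 }, b = { 0 };
	int g;

	for (g = 1; g <= 12; g++) {	/* a faults every tick, b every third */
		fault(&a, g);
		if (g % 3 == 0)
			fault(&b, g);
	}
	printf("token held by %s, prio %d\n",
	       holder == &a ? "a" : "b", holder->prio);
	return 0;
}
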
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f153c..c7f6e1914bc4 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
79 d_instantiate(dentry, inode); 79 d_instantiate(dentry, inode);
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 81
82 file->f_vfsmnt = mntget(shm_mnt); 82 file->f_path.mnt = mntget(shm_mnt);
83 file->f_dentry = dentry; 83 file->f_path.dentry = dentry;
84 file->f_mapping = inode->i_mapping; 84 file->f_mapping = inode->i_mapping;
85 file->f_op = &ramfs_file_operations; 85 file->f_op = &ramfs_file_operations;
86 file->f_mode = FMODE_WRITE | FMODE_READ; 86 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index e07b1e682c38..6c79ca4a1ca7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h>
16#include <linux/buffer_head.h> /* grr. try_to_release_page, 17#include <linux/buffer_head.h> /* grr. try_to_release_page,
17 do_invalidatepage */ 18 do_invalidatepage */
18 19
@@ -50,6 +51,26 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
50 do_invalidatepage(page, partial); 51 do_invalidatepage(page, partial);
51} 52}
52 53
54void cancel_dirty_page(struct page *page, unsigned int account_size)
55{
56 /* If we're cancelling the page, it had better not be mapped any more */
57 if (page_mapped(page)) {
58 static unsigned int warncount;
59
60 WARN_ON(++warncount < 5);
61 }
62
63 if (TestClearPageDirty(page)) {
64 struct address_space *mapping = page->mapping;
65 if (mapping && mapping_cap_account_dirty(mapping)) {
66 dec_zone_page_state(page, NR_FILE_DIRTY);
67 if (account_size)
68 task_io_account_cancelled_write(account_size);
69 }
70 }
71}
72EXPORT_SYMBOL(cancel_dirty_page);
73
53/* 74/*
54 * If truncate cannot remove the fs-private metadata from the page, the page 75 * If truncate cannot remove the fs-private metadata from the page, the page
55 * becomes anonymous. It will be left on the LRU and may even be mapped into 76 * becomes anonymous. It will be left on the LRU and may even be mapped into
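
cancel_dirty_page() is the new way to throw a dirty page away without writing it back: unlike the plain clear_page_dirty() call it replaces in truncate_complete_page(), it also fixes up NR_FILE_DIRTY and charges the cancelled bytes to per-task I/O accounting. A hedged sketch of a caller outside truncate; toss_stale_page() is hypothetical, modelled on the truncate path above:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical: discard a locked page whose contents are known stale,
 * deliberately without writing it back. */
static void toss_stale_page(struct page *page)
{
	cancel_dirty_page(page, PAGE_CACHE_SIZE);	/* fix dirty accounting */
	ClearPageUptodate(page);
	remove_from_page_cache(page);
	page_cache_release(page);
}
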
@@ -66,10 +87,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
66 if (page->mapping != mapping) 87 if (page->mapping != mapping)
67 return; 88 return;
68 89
90 cancel_dirty_page(page, PAGE_CACHE_SIZE);
91
69 if (PagePrivate(page)) 92 if (PagePrivate(page))
70 do_invalidatepage(page, 0); 93 do_invalidatepage(page, 0);
71 94
72 clear_page_dirty(page);
73 ClearPageUptodate(page); 95 ClearPageUptodate(page);
74 ClearPageMappedToDisk(page); 96 ClearPageMappedToDisk(page);
75 remove_from_page_cache(page); 97 remove_from_page_cache(page);
@@ -319,6 +341,15 @@ failed:
319 return 0; 341 return 0;
320} 342}
321 343
344static int do_launder_page(struct address_space *mapping, struct page *page)
345{
346 if (!PageDirty(page))
347 return 0;
348 if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
349 return 0;
350 return mapping->a_ops->launder_page(page);
351}
352
322/** 353/**
323 * invalidate_inode_pages2_range - remove range of pages from an address_space 354 * invalidate_inode_pages2_range - remove range of pages from an address_space
324 * @mapping: the address_space 355 * @mapping: the address_space
@@ -348,7 +379,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
348 for (i = 0; !ret && i < pagevec_count(&pvec); i++) { 379 for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
349 struct page *page = pvec.pages[i]; 380 struct page *page = pvec.pages[i];
350 pgoff_t page_index; 381 pgoff_t page_index;
351 int was_dirty;
352 382
353 lock_page(page); 383 lock_page(page);
354 if (page->mapping != mapping) { 384 if (page->mapping != mapping) {
@@ -384,12 +414,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
384 PAGE_CACHE_SIZE, 0); 414 PAGE_CACHE_SIZE, 0);
385 } 415 }
386 } 416 }
387 was_dirty = test_clear_page_dirty(page); 417 ret = do_launder_page(mapping, page);
388 if (!invalidate_complete_page2(mapping, page)) { 418 if (ret == 0 && !invalidate_complete_page2(mapping, page))
389 if (was_dirty)
390 set_page_dirty(page);
391 ret = -EIO; 419 ret = -EIO;
392 }
393 unlock_page(page); 420 unlock_page(page);
394 } 421 }
395 pagevec_release(&pvec); 422 pagevec_release(&pvec);
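
invalidate_inode_pages2_range() no longer clears and conditionally restores the dirty bit itself; instead do_launder_page() gives the filesystem a chance to write a dirty page synchronously through the new launder_page address_space operation before the page is invalidated. A hedged sketch of a filesystem wiring up that hook (the examplefs_* names and the trivial flush helper are placeholders):

#include <linux/fs.h>
#include <linux/mm.h>

static int examplefs_flush_page(struct page *page)
{
	/* Placeholder: a real filesystem writes the page out synchronously
	 * here and returns 0 on success or an error code. */
	return 0;
}

/* Called with the page locked; returning an error makes
 * invalidate_inode_pages2() fail with it instead of dropping data. */
static int examplefs_launder_page(struct page *page)
{
	return examplefs_flush_page(page);
}

static const struct address_space_operations examplefs_aops = {
	/* .readpage, .writepage, ... */
	.launder_page	= examplefs_launder_page,
};
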
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 518540a4a2a6..7430df68cb64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -36,6 +36,7 @@
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include <asm/div64.h> 42#include <asm/div64.h>
@@ -691,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
691 __count_vm_events(KSWAPD_STEAL, nr_freed); 692 __count_vm_events(KSWAPD_STEAL, nr_freed);
692 } else 693 } else
693 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 694 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
694 __count_vm_events(PGACTIVATE, nr_freed); 695 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
695 696
696 if (nr_taken == 0) 697 if (nr_taken == 0)
697 goto done; 698 goto done;
@@ -983,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
983 if (!populated_zone(zone)) 984 if (!populated_zone(zone))
984 continue; 985 continue;
985 986
986 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 987 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
987 continue; 988 continue;
988 989
989 note_zone_scanning_priority(zone, priority); 990 note_zone_scanning_priority(zone, priority);
@@ -1033,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1033 for (i = 0; zones[i] != NULL; i++) { 1034 for (i = 0; zones[i] != NULL; i++) {
1034 struct zone *zone = zones[i]; 1035 struct zone *zone = zones[i];
1035 1036
1036 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1037 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1037 continue; 1038 continue;
1038 1039
1039 lru_pages += zone->nr_active + zone->nr_inactive; 1040 lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1088,7 +1089,7 @@ out:
1088 for (i = 0; zones[i] != 0; i++) { 1089 for (i = 0; zones[i] != 0; i++) {
1089 struct zone *zone = zones[i]; 1090 struct zone *zone = zones[i];
1090 1091
1091 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1092 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1092 continue; 1093 continue;
1093 1094
1094 zone->prev_priority = priority; 1095 zone->prev_priority = priority;
@@ -1172,11 +1173,12 @@ loop_again:
1172 if (!zone_watermark_ok(zone, order, zone->pages_high, 1173 if (!zone_watermark_ok(zone, order, zone->pages_high,
1173 0, 0)) { 1174 0, 0)) {
1174 end_zone = i; 1175 end_zone = i;
1175 goto scan; 1176 break;
1176 } 1177 }
1177 } 1178 }
1178 goto out; 1179 if (i < 0)
1179scan: 1180 goto out;
1181
1180 for (i = 0; i <= end_zone; i++) { 1182 for (i = 0; i <= end_zone; i++) {
1181 struct zone *zone = pgdat->node_zones + i; 1183 struct zone *zone = pgdat->node_zones + i;
1182 1184
@@ -1259,6 +1261,9 @@ out:
1259 } 1261 }
1260 if (!all_zones_ok) { 1262 if (!all_zones_ok) {
1261 cond_resched(); 1263 cond_resched();
1264
1265 try_to_freeze();
1266
1262 goto loop_again; 1267 goto loop_again;
1263 } 1268 }
1264 1269
@@ -1349,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1349 return; 1354 return;
1350 if (pgdat->kswapd_max_order < order) 1355 if (pgdat->kswapd_max_order < order)
1351 pgdat->kswapd_max_order = order; 1356 pgdat->kswapd_max_order = order;
1352 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1357 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1353 return; 1358 return;
1354 if (!waitqueue_active(&pgdat->kswapd_wait)) 1359 if (!waitqueue_active(&pgdat->kswapd_wait))
1355 return; 1360 return;
@@ -1364,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
1364 * 1369 *
1365 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1370 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1366 */ 1371 */
1367static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, 1372static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1368 int prio, struct scan_control *sc) 1373 int pass, struct scan_control *sc)
1369{ 1374{
1370 struct zone *zone; 1375 struct zone *zone;
1371 unsigned long nr_to_scan, ret = 0; 1376 unsigned long nr_to_scan, ret = 0;
@@ -1401,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1401 return ret; 1406 return ret;
1402} 1407}
1403 1408
1409static unsigned long count_lru_pages(void)
1410{
1411 struct zone *zone;
1412 unsigned long ret = 0;
1413
1414 for_each_zone(zone)
1415 ret += zone->nr_active + zone->nr_inactive;
1416 return ret;
1417}
1418
1404/* 1419/*
1405 * Try to free `nr_pages' of memory, system-wide, and return the number of 1420 * Try to free `nr_pages' of memory, system-wide, and return the number of
1406 * freed pages. 1421 * freed pages.
@@ -1415,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1415 unsigned long ret = 0; 1430 unsigned long ret = 0;
1416 int pass; 1431 int pass;
1417 struct reclaim_state reclaim_state; 1432 struct reclaim_state reclaim_state;
1418 struct zone *zone;
1419 struct scan_control sc = { 1433 struct scan_control sc = {
1420 .gfp_mask = GFP_KERNEL, 1434 .gfp_mask = GFP_KERNEL,
1421 .may_swap = 0, 1435 .may_swap = 0,
@@ -1426,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1426 1440
1427 current->reclaim_state = &reclaim_state; 1441 current->reclaim_state = &reclaim_state;
1428 1442
1429 lru_pages = 0; 1443 lru_pages = count_lru_pages();
1430 for_each_zone(zone)
1431 lru_pages += zone->nr_active + zone->nr_inactive;
1432
1433 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 1444 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1434 /* If slab caches are huge, it's better to hit them first */ 1445 /* If slab caches are huge, it's better to hit them first */
1435 while (nr_slab >= lru_pages) { 1446 while (nr_slab >= lru_pages) {
@@ -1456,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1456 for (pass = 0; pass < 5; pass++) { 1467 for (pass = 0; pass < 5; pass++) {
1457 int prio; 1468 int prio;
1458 1469
1459 /* Needed for shrinking slab caches later on */
1460 if (!lru_pages)
1461 for_each_zone(zone) {
1462 lru_pages += zone->nr_active;
1463 lru_pages += zone->nr_inactive;
1464 }
1465
1466 /* Force reclaiming mapped pages in the passes #3 and #4 */ 1470 /* Force reclaiming mapped pages in the passes #3 and #4 */
1467 if (pass > 2) { 1471 if (pass > 2) {
1468 sc.may_swap = 1; 1472 sc.may_swap = 1;
@@ -1478,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1478 goto out; 1482 goto out;
1479 1483
1480 reclaim_state.reclaimed_slab = 0; 1484 reclaim_state.reclaimed_slab = 0;
1481 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); 1485 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1486 count_lru_pages());
1482 ret += reclaim_state.reclaimed_slab; 1487 ret += reclaim_state.reclaimed_slab;
1483 if (ret >= nr_pages) 1488 if (ret >= nr_pages)
1484 goto out; 1489 goto out;
@@ -1486,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1486 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1491 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1487 congestion_wait(WRITE, HZ / 10); 1492 congestion_wait(WRITE, HZ / 10);
1488 } 1493 }
1489
1490 lru_pages = 0;
1491 } 1494 }
1492 1495
1493 /* 1496 /*
1494 * If ret = 0, we could not shrink LRUs, but there may be something 1497 * If ret = 0, we could not shrink LRUs, but there may be something
1495 * in slab caches 1498 * in slab caches
1496 */ 1499 */
1497 if (!ret) 1500 if (!ret) {
1498 do { 1501 do {
1499 reclaim_state.reclaimed_slab = 0; 1502 reclaim_state.reclaimed_slab = 0;
1500 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1503 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
1501 ret += reclaim_state.reclaimed_slab; 1504 ret += reclaim_state.reclaimed_slab;
1502 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1505 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1506 }
1503 1507
1504out: 1508out:
1505 current->reclaim_state = NULL; 1509 current->reclaim_state = NULL;
@@ -1508,7 +1512,6 @@ out:
1508} 1512}
1509#endif 1513#endif
1510 1514
1511#ifdef CONFIG_HOTPLUG_CPU
1512/* It's optimal to keep kswapds on the same CPUs as their memory, but 1515/* It's optimal to keep kswapds on the same CPUs as their memory, but
1513 not required for correctness. So if the last cpu in a node goes 1516 not required for correctness. So if the last cpu in a node goes
1514 away, we get changed to run anywhere: as the first one comes back, 1517 away, we get changed to run anywhere: as the first one comes back,
@@ -1529,7 +1532,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1529 } 1532 }
1530 return NOTIFY_OK; 1533 return NOTIFY_OK;
1531} 1534}
1532#endif /* CONFIG_HOTPLUG_CPU */
1533 1535
1534/* 1536/*
1535 * This kswapd start function will be called by init and node-hot-add. 1537 * This kswapd start function will be called by init and node-hot-add.
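
The vmscan changes also pull in linux/freezer.h and add a try_to_freeze() call to the balance_pgdat() retry loop so kswapd parks itself cleanly during suspend. The general pattern for any long-running kernel thread is sketched below (housekeeping_thread() is a made-up example, not kswapd itself):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int housekeeping_thread(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* park here if a freeze is pending */

		/* ... one round of work ... */

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
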
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8614e8f6743b..dc005a0c96ae 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg)
430 return 0; 430 return 0;
431} 431}
432 432
433struct seq_operations fragmentation_op = { 433const struct seq_operations fragmentation_op = {
434 .start = frag_start, 434 .start = frag_start,
435 .next = frag_next, 435 .next = frag_next,
436 .stop = frag_stop, 436 .stop = frag_stop,
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = {
452#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ 452#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
453 TEXT_FOR_HIGHMEM(xx) 453 TEXT_FOR_HIGHMEM(xx)
454 454
455static char *vmstat_text[] = { 455static const char * const vmstat_text[] = {
456 /* Zoned VM counters */ 456 /* Zoned VM counters */
457 "nr_anon_pages", 457 "nr_anon_pages",
458 "nr_mapped", 458 "nr_mapped",
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
597 return 0; 597 return 0;
598} 598}
599 599
600struct seq_operations zoneinfo_op = { 600const struct seq_operations zoneinfo_op = {
601 .start = frag_start, /* iterate over all zones. The same as in 601 .start = frag_start, /* iterate over all zones. The same as in
602 * fragmentation. */ 602 * fragmentation. */
603 .next = frag_next, 603 .next = frag_next,
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg)
660 m->private = NULL; 660 m->private = NULL;
661} 661}
662 662
663struct seq_operations vmstat_op = { 663const struct seq_operations vmstat_op = {
664 .start = vmstat_start, 664 .start = vmstat_start,
665 .next = vmstat_next, 665 .next = vmstat_next,
666 .stop = vmstat_stop, 666 .stop = vmstat_stop,
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
679 void *hcpu) 679 void *hcpu)
680{ 680{
681 switch (action) { 681 switch (action) {
682 case CPU_UP_PREPARE: 682 case CPU_UP_PREPARE:
683 case CPU_UP_CANCELED: 683 case CPU_UP_CANCELED:
684 case CPU_DEAD: 684 case CPU_DEAD:
685 refresh_zone_stat_thresholds(); 685 refresh_zone_stat_thresholds();
686 break; 686 break;
687 default: 687 default:
688 break; 688 break;
689 } 689 }
690 return NOTIFY_OK; 690 return NOTIFY_OK;
691} 691}