Diffstat (limited to 'mm')
-rw-r--r-- | mm/allocpercpu.c | 9
-rw-r--r-- | mm/bootmem.c | 6
-rw-r--r-- | mm/bounce.c | 4
-rw-r--r-- | mm/fadvise.c | 2
-rw-r--r-- | mm/filemap.c | 13
-rw-r--r-- | mm/filemap_xip.c | 4
-rw-r--r-- | mm/fremap.c | 4
-rw-r--r-- | mm/hugetlb.c | 30
-rw-r--r-- | mm/memory.c | 51
-rw-r--r-- | mm/memory_hotplug.c | 7
-rw-r--r-- | mm/mempolicy.c | 12
-rw-r--r-- | mm/migrate.c | 19
-rw-r--r-- | mm/mincore.c | 183
-rw-r--r-- | mm/mlock.c | 2
-rw-r--r-- | mm/mmap.c | 14
-rw-r--r-- | mm/mmzone.c | 5
-rw-r--r-- | mm/nommu.c | 30
-rw-r--r-- | mm/oom_kill.c | 62
-rw-r--r-- | mm/page-writeback.c | 106
-rw-r--r-- | mm/page_alloc.c | 406
-rw-r--r-- | mm/page_io.c | 45
-rw-r--r-- | mm/pdflush.c | 1
-rw-r--r-- | mm/readahead.c | 12
-rw-r--r-- | mm/rmap.c | 36
-rw-r--r-- | mm/shmem.c | 35
-rw-r--r-- | mm/slab.c | 411
-rw-r--r-- | mm/slob.c | 27
-rw-r--r-- | mm/sparse.c | 23
-rw-r--r-- | mm/swap.c | 10
-rw-r--r-- | mm/swapfile.c | 102
-rw-r--r-- | mm/thrash.c | 116
-rw-r--r-- | mm/tiny-shmem.c | 4
-rw-r--r-- | mm/truncate.c | 41
-rw-r--r-- | mm/vmscan.c | 60
-rw-r--r-- | mm/vmstat.c | 22
35 files changed, 1224 insertions, 690 deletions
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index eaa9abeea536..b2486cf887a0 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -17,10 +17,9 @@ | |||
17 | void percpu_depopulate(void *__pdata, int cpu) | 17 | void percpu_depopulate(void *__pdata, int cpu) |
18 | { | 18 | { |
19 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 19 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
20 | if (pdata->ptrs[cpu]) { | 20 | |
21 | kfree(pdata->ptrs[cpu]); | 21 | kfree(pdata->ptrs[cpu]); |
22 | pdata->ptrs[cpu] = NULL; | 22 | pdata->ptrs[cpu] = NULL; |
23 | } | ||
24 | } | 23 | } |
25 | EXPORT_SYMBOL_GPL(percpu_depopulate); | 24 | EXPORT_SYMBOL_GPL(percpu_depopulate); |
26 | 25 | ||
@@ -123,6 +122,8 @@ EXPORT_SYMBOL_GPL(__percpu_alloc_mask); | |||
123 | */ | 122 | */ |
124 | void percpu_free(void *__pdata) | 123 | void percpu_free(void *__pdata) |
125 | { | 124 | { |
125 | if (unlikely(!__pdata)) | ||
126 | return; | ||
126 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); | 127 | __percpu_depopulate_mask(__pdata, &cpu_possible_map); |
127 | kfree(__percpu_disguise(__pdata)); | 128 | kfree(__percpu_disguise(__pdata)); |
128 | } | 129 | } |
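The two allocpercpu.c hunks rely on NULL-tolerant freeing: kfree(NULL) is already a no-op, and percpu_free() now returns early on NULL as well, so callers can free unconditionally on error and teardown paths. A minimal sketch of that pattern (struct stats and stats_destroy() are illustrative names, not part of the patch):

    #include <linux/percpu.h>
    #include <linux/slab.h>

    struct stats {
            void *counters;         /* from percpu_alloc(), may be NULL */
            char *name;             /* from kmalloc(), may be NULL */
    };

    static void stats_destroy(struct stats *s)
    {
            /* no if () guards needed: both calls now tolerate NULL */
            percpu_free(s->counters);
            kfree(s->name);
    }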
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb404..00a96970b237 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -27,8 +27,6 @@ unsigned long max_low_pfn; | |||
27 | unsigned long min_low_pfn; | 27 | unsigned long min_low_pfn; |
28 | unsigned long max_pfn; | 28 | unsigned long max_pfn; |
29 | 29 | ||
30 | EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ | ||
31 | |||
32 | static LIST_HEAD(bdata_list); | 30 | static LIST_HEAD(bdata_list); |
33 | #ifdef CONFIG_CRASH_DUMP | 31 | #ifdef CONFIG_CRASH_DUMP |
34 | /* | 32 | /* |
@@ -196,6 +194,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | |||
196 | if (limit && bdata->node_boot_start >= limit) | 194 | if (limit && bdata->node_boot_start >= limit) |
197 | return NULL; | 195 | return NULL; |
198 | 196 | ||
197 | /* on nodes without memory - bootmem_map is NULL */ | ||
198 | if (!bdata->node_bootmem_map) | ||
199 | return NULL; | ||
200 | |||
199 | end_pfn = bdata->node_low_pfn; | 201 | end_pfn = bdata->node_low_pfn; |
200 | limit = PFN_DOWN(limit); | 202 | limit = PFN_DOWN(limit); |
201 | if (limit && end_pfn > limit) | 203 | if (limit && end_pfn > limit) |
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a4024..643efbe82402 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
237 | if (!bio) | 237 | if (!bio) |
238 | return; | 238 | return; |
239 | 239 | ||
240 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
241 | |||
240 | /* | 242 | /* |
241 | * at least one page was bounced, fill in possible non-highmem | 243 | * at least one page was bounced, fill in possible non-highmem |
242 | * pages | 244 | * pages |
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | |||
291 | pool = isa_page_pool; | 293 | pool = isa_page_pool; |
292 | } | 294 | } |
293 | 295 | ||
294 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
295 | |||
296 | /* | 296 | /* |
297 | * slow path | 297 | * slow path |
298 | */ | 298 | */ |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121bb..0df4c899e979 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
38 | if (!file) | 38 | if (!file) |
39 | return -EBADF; | 39 | return -EBADF; |
40 | 40 | ||
41 | if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { | 41 | if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { |
42 | ret = -ESPIPE; | 42 | ret = -ESPIPE; |
43 | goto out; | 43 | goto out; |
44 | } | 44 | } |
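The f_dentry/f_vfsmnt replacements here and in the filemap, mempolicy, mmap and nommu hunks below are one mechanical conversion: struct file now embeds a struct path (f_path) that carries the dentry and the vfsmount. A sketch of the new spelling, with filp_inode() being a made-up helper name used only for illustration:

    #include <linux/fs.h>

    static inline struct inode *filp_inode(struct file *filp)
    {
            /* was filp->f_dentry->d_inode; f_vfsmnt is now filp->f_path.mnt */
            return filp->f_path.dentry->d_inode;
    }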
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b84dc814347..8332c77b1bd1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1181 | if (pos < size) { | 1181 | if (pos < size) { |
1182 | retval = generic_file_direct_IO(READ, iocb, | 1182 | retval = generic_file_direct_IO(READ, iocb, |
1183 | iov, pos, nr_segs); | 1183 | iov, pos, nr_segs); |
1184 | if (retval > 0 && !is_sync_kiocb(iocb)) | ||
1185 | retval = -EIOCBQUEUED; | ||
1186 | if (retval > 0) | 1184 | if (retval > 0) |
1187 | *ppos = pos + retval; | 1185 | *ppos = pos + retval; |
1188 | } | 1186 | } |
@@ -1445,7 +1443,6 @@ no_cached_page: | |||
1445 | * effect. | 1443 | * effect. |
1446 | */ | 1444 | */ |
1447 | error = page_cache_read(file, pgoff); | 1445 | error = page_cache_read(file, pgoff); |
1448 | grab_swap_token(); | ||
1449 | 1446 | ||
1450 | /* | 1447 | /* |
1451 | * The page we want has now been added to the page cache. | 1448 | * The page we want has now been added to the page cache. |
@@ -1893,6 +1890,7 @@ int should_remove_suid(struct dentry *dentry) | |||
1893 | 1890 | ||
1894 | return 0; | 1891 | return 0; |
1895 | } | 1892 | } |
1893 | EXPORT_SYMBOL(should_remove_suid); | ||
1896 | 1894 | ||
1897 | int __remove_suid(struct dentry *dentry, int kill) | 1895 | int __remove_suid(struct dentry *dentry, int kill) |
1898 | { | 1896 | { |
@@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2047 | * Sync the fs metadata but not the minor inode changes and | 2045 | * Sync the fs metadata but not the minor inode changes and |
2048 | * of course not the data as we did direct DMA for the IO. | 2046 | * of course not the data as we did direct DMA for the IO. |
2049 | * i_mutex is held, which protects generic_osync_inode() from | 2047 | * i_mutex is held, which protects generic_osync_inode() from |
2050 | * livelocking. | 2048 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. |
2051 | */ | 2049 | */ |
2052 | if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2050 | if ((written >= 0 || written == -EIOCBQUEUED) && |
2051 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
2053 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 2052 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
2054 | if (err < 0) | 2053 | if (err < 0) |
2055 | written = err; | 2054 | written = err; |
2056 | } | 2055 | } |
2057 | if (written == count && !is_sync_kiocb(iocb)) | ||
2058 | written = -EIOCBQUEUED; | ||
2059 | return written; | 2056 | return written; |
2060 | } | 2057 | } |
2061 | EXPORT_SYMBOL(generic_file_direct_write); | 2058 | EXPORT_SYMBOL(generic_file_direct_write); |
@@ -2269,7 +2266,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2269 | if (count == 0) | 2266 | if (count == 0) |
2270 | goto out; | 2267 | goto out; |
2271 | 2268 | ||
2272 | err = remove_suid(file->f_dentry); | 2269 | err = remove_suid(file->f_path.dentry); |
2273 | if (err) | 2270 | if (err) |
2274 | goto out; | 2271 | goto out; |
2275 | 2272 | ||
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bfb..45b3553865cf 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -189,7 +189,7 @@ __xip_unmap (struct address_space * mapping, | |||
189 | /* Nuke the page table entry. */ | 189 | /* Nuke the page table entry. */ |
190 | flush_cache_page(vma, address, pte_pfn(*pte)); | 190 | flush_cache_page(vma, address, pte_pfn(*pte)); |
191 | pteval = ptep_clear_flush(vma, address, pte); | 191 | pteval = ptep_clear_flush(vma, address, pte); |
192 | page_remove_rmap(page); | 192 | page_remove_rmap(page, vma); |
193 | dec_mm_counter(mm, file_rss); | 193 | dec_mm_counter(mm, file_rss); |
194 | BUG_ON(pte_dirty(pteval)); | 194 | BUG_ON(pte_dirty(pteval)); |
195 | pte_unmap_unlock(pte, ptl); | 195 | pte_unmap_unlock(pte, ptl); |
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
379 | if (count == 0) | 379 | if (count == 0) |
380 | goto out_backing; | 380 | goto out_backing; |
381 | 381 | ||
382 | ret = remove_suid(filp->f_dentry); | 382 | ret = remove_suid(filp->f_path.dentry); |
383 | if (ret) | 383 | if (ret) |
384 | goto out_backing; | 384 | goto out_backing; |
385 | 385 | ||
diff --git a/mm/fremap.c b/mm/fremap.c
index 7a9d0f5d246d..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
33 | if (page) { | 33 | if (page) { |
34 | if (pte_dirty(pte)) | 34 | if (pte_dirty(pte)) |
35 | set_page_dirty(page); | 35 | set_page_dirty(page); |
36 | page_remove_rmap(page); | 36 | page_remove_rmap(page, vma); |
37 | page_cache_release(page); | 37 | page_cache_release(page); |
38 | } | 38 | } |
39 | } else { | 39 | } else { |
@@ -101,7 +101,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
101 | { | 101 | { |
102 | int err = -ENOMEM; | 102 | int err = -ENOMEM; |
103 | pte_t *pte; | 103 | pte_t *pte; |
104 | pte_t pte_val; | ||
105 | spinlock_t *ptl; | 104 | spinlock_t *ptl; |
106 | 105 | ||
107 | pte = get_locked_pte(mm, addr, &ptl); | 106 | pte = get_locked_pte(mm, addr, &ptl); |
@@ -114,7 +113,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
114 | } | 113 | } |
115 | 114 | ||
116 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 115 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
117 | pte_val = *pte; | ||
118 | /* | 116 | /* |
119 | * We don't need to run update_mmu_cache() here because the "file pte" | 117 | * We don't need to run update_mmu_cache() here because the "file pte" |
120 | * being installed by install_file_pte() is not a real pte - it's a | 118 | * being installed by install_file_pte() is not a real pte - it's a |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a088f593a807..cb362f761f17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr) | |||
44 | } | 44 | } |
45 | 45 | ||
46 | static void copy_huge_page(struct page *dst, struct page *src, | 46 | static void copy_huge_page(struct page *dst, struct page *src, |
47 | unsigned long addr) | 47 | unsigned long addr, struct vm_area_struct *vma) |
48 | { | 48 | { |
49 | int i; | 49 | int i; |
50 | 50 | ||
51 | might_sleep(); | 51 | might_sleep(); |
52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | 52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { |
53 | cond_resched(); | 53 | cond_resched(); |
54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); | 54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
55 | } | 55 | } |
56 | } | 56 | } |
57 | 57 | ||
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
73 | 73 | ||
74 | for (z = zonelist->zones; *z; z++) { | 74 | for (z = zonelist->zones; *z; z++) { |
75 | nid = zone_to_nid(*z); | 75 | nid = zone_to_nid(*z); |
76 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && | 76 | if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && |
77 | !list_empty(&hugepage_freelists[nid])) | 77 | !list_empty(&hugepage_freelists[nid])) |
78 | break; | 78 | break; |
79 | } | 79 | } |
@@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void) | |||
109 | if (nid == MAX_NUMNODES) | 109 | if (nid == MAX_NUMNODES) |
110 | nid = first_node(node_online_map); | 110 | nid = first_node(node_online_map); |
111 | if (page) { | 111 | if (page) { |
112 | page[1].lru.next = (void *)free_huge_page; /* dtor */ | 112 | set_compound_page_dtor(page, free_huge_page); |
113 | spin_lock(&hugetlb_lock); | 113 | spin_lock(&hugetlb_lock); |
114 | nr_huge_pages++; | 114 | nr_huge_pages++; |
115 | nr_huge_pages_node[page_to_nid(page)]++; | 115 | nr_huge_pages_node[page_to_nid(page)]++; |
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
344 | entry = *src_pte; | 344 | entry = *src_pte; |
345 | ptepage = pte_page(entry); | 345 | ptepage = pte_page(entry); |
346 | get_page(ptepage); | 346 | get_page(ptepage); |
347 | add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
348 | set_huge_pte_at(dst, addr, dst_pte, entry); | 347 | set_huge_pte_at(dst, addr, dst_pte, entry); |
349 | } | 348 | } |
350 | spin_unlock(&src->page_table_lock); | 349 | spin_unlock(&src->page_table_lock); |
@@ -365,6 +364,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
365 | pte_t pte; | 364 | pte_t pte; |
366 | struct page *page; | 365 | struct page *page; |
367 | struct page *tmp; | 366 | struct page *tmp; |
367 | /* | ||
368 | * A page gathering list, protected by per file i_mmap_lock. The | ||
369 | * lock is used to avoid list corruption from multiple unmapping | ||
370 | * of the same page since we are using page->lru. | ||
371 | */ | ||
368 | LIST_HEAD(page_list); | 372 | LIST_HEAD(page_list); |
369 | 373 | ||
370 | WARN_ON(!is_vm_hugetlb_page(vma)); | 374 | WARN_ON(!is_vm_hugetlb_page(vma)); |
@@ -372,24 +376,21 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
372 | BUG_ON(end & ~HPAGE_MASK); | 376 | BUG_ON(end & ~HPAGE_MASK); |
373 | 377 | ||
374 | spin_lock(&mm->page_table_lock); | 378 | spin_lock(&mm->page_table_lock); |
375 | |||
376 | /* Update high watermark before we lower rss */ | ||
377 | update_hiwater_rss(mm); | ||
378 | |||
379 | for (address = start; address < end; address += HPAGE_SIZE) { | 379 | for (address = start; address < end; address += HPAGE_SIZE) { |
380 | ptep = huge_pte_offset(mm, address); | 380 | ptep = huge_pte_offset(mm, address); |
381 | if (!ptep) | 381 | if (!ptep) |
382 | continue; | 382 | continue; |
383 | 383 | ||
384 | if (huge_pmd_unshare(mm, &address, ptep)) | ||
385 | continue; | ||
386 | |||
384 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 387 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
385 | if (pte_none(pte)) | 388 | if (pte_none(pte)) |
386 | continue; | 389 | continue; |
387 | 390 | ||
388 | page = pte_page(pte); | 391 | page = pte_page(pte); |
389 | list_add(&page->lru, &page_list); | 392 | list_add(&page->lru, &page_list); |
390 | add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); | ||
391 | } | 393 | } |
392 | |||
393 | spin_unlock(&mm->page_table_lock); | 394 | spin_unlock(&mm->page_table_lock); |
394 | flush_tlb_range(vma, start, end); | 395 | flush_tlb_range(vma, start, end); |
395 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 396 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
@@ -441,7 +442,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
441 | } | 442 | } |
442 | 443 | ||
443 | spin_unlock(&mm->page_table_lock); | 444 | spin_unlock(&mm->page_table_lock); |
444 | copy_huge_page(new_page, old_page, address); | 445 | copy_huge_page(new_page, old_page, address, vma); |
445 | spin_lock(&mm->page_table_lock); | 446 | spin_lock(&mm->page_table_lock); |
446 | 447 | ||
447 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 448 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
@@ -515,7 +516,6 @@ retry: | |||
515 | if (!pte_none(*ptep)) | 516 | if (!pte_none(*ptep)) |
516 | goto backout; | 517 | goto backout; |
517 | 518 | ||
518 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
519 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 519 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
520 | && (vma->vm_flags & VM_SHARED))); | 520 | && (vma->vm_flags & VM_SHARED))); |
521 | set_huge_pte_at(mm, address, ptep, new_pte); | 521 | set_huge_pte_at(mm, address, ptep, new_pte); |
@@ -653,11 +653,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
653 | BUG_ON(address >= end); | 653 | BUG_ON(address >= end); |
654 | flush_cache_range(vma, address, end); | 654 | flush_cache_range(vma, address, end); |
655 | 655 | ||
656 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
656 | spin_lock(&mm->page_table_lock); | 657 | spin_lock(&mm->page_table_lock); |
657 | for (; address < end; address += HPAGE_SIZE) { | 658 | for (; address < end; address += HPAGE_SIZE) { |
658 | ptep = huge_pte_offset(mm, address); | 659 | ptep = huge_pte_offset(mm, address); |
659 | if (!ptep) | 660 | if (!ptep) |
660 | continue; | 661 | continue; |
662 | if (huge_pmd_unshare(mm, &address, ptep)) | ||
663 | continue; | ||
661 | if (!pte_none(*ptep)) { | 664 | if (!pte_none(*ptep)) { |
662 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 665 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
663 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 666 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
@@ -666,6 +669,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
666 | } | 669 | } |
667 | } | 670 | } |
668 | spin_unlock(&mm->page_table_lock); | 671 | spin_unlock(&mm->page_table_lock); |
672 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
669 | 673 | ||
670 | flush_tlb_range(vma, start, end); | 674 | flush_tlb_range(vma, start, end); |
671 | } | 675 | } |
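Besides the copy_user_highpage() and cpuset_zone_allowed_softwall() argument changes, alloc_fresh_huge_page() stops poking the compound-page destructor pointer directly and calls set_compound_page_dtor(). The helper presumably wraps the same page[1].lru.next store behind a typed interface, roughly along these lines (a reconstruction for illustration, not copied from this patch):

    typedef void compound_page_dtor(struct page *);

    static inline void set_compound_page_dtor(struct page *page,
                                              compound_page_dtor *dtor)
    {
            page[1].lru.next = (void *)dtor;
    }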
diff --git a/mm/memory.c b/mm/memory.c
index 156861fcac43..af227d26e104 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
681 | mark_page_accessed(page); | 681 | mark_page_accessed(page); |
682 | file_rss--; | 682 | file_rss--; |
683 | } | 683 | } |
684 | page_remove_rmap(page); | 684 | page_remove_rmap(page, vma); |
685 | tlb_remove_page(tlb, page); | 685 | tlb_remove_page(tlb, page); |
686 | continue; | 686 | continue; |
687 | } | 687 | } |
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1091 | if (pages) { | 1091 | if (pages) { |
1092 | pages[i] = page; | 1092 | pages[i] = page; |
1093 | 1093 | ||
1094 | flush_anon_page(page, start); | 1094 | flush_anon_page(vma, page, start); |
1095 | flush_dcache_page(page); | 1095 | flush_dcache_page(page); |
1096 | } | 1096 | } |
1097 | if (vmas) | 1097 | if (vmas) |
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1110 | { | 1110 | { |
1111 | pte_t *pte; | 1111 | pte_t *pte; |
1112 | spinlock_t *ptl; | 1112 | spinlock_t *ptl; |
1113 | int err = 0; | ||
1113 | 1114 | ||
1114 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1115 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1115 | if (!pte) | 1116 | if (!pte) |
1116 | return -ENOMEM; | 1117 | return -EAGAIN; |
1117 | arch_enter_lazy_mmu_mode(); | 1118 | arch_enter_lazy_mmu_mode(); |
1118 | do { | 1119 | do { |
1119 | struct page *page = ZERO_PAGE(addr); | 1120 | struct page *page = ZERO_PAGE(addr); |
1120 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | 1121 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); |
1122 | |||
1123 | if (unlikely(!pte_none(*pte))) { | ||
1124 | err = -EEXIST; | ||
1125 | pte++; | ||
1126 | break; | ||
1127 | } | ||
1121 | page_cache_get(page); | 1128 | page_cache_get(page); |
1122 | page_add_file_rmap(page); | 1129 | page_add_file_rmap(page); |
1123 | inc_mm_counter(mm, file_rss); | 1130 | inc_mm_counter(mm, file_rss); |
1124 | BUG_ON(!pte_none(*pte)); | ||
1125 | set_pte_at(mm, addr, pte, zero_pte); | 1131 | set_pte_at(mm, addr, pte, zero_pte); |
1126 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1132 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1127 | arch_leave_lazy_mmu_mode(); | 1133 | arch_leave_lazy_mmu_mode(); |
1128 | pte_unmap_unlock(pte - 1, ptl); | 1134 | pte_unmap_unlock(pte - 1, ptl); |
1129 | return 0; | 1135 | return err; |
1130 | } | 1136 | } |
1131 | 1137 | ||
1132 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | 1138 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, |
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1134 | { | 1140 | { |
1135 | pmd_t *pmd; | 1141 | pmd_t *pmd; |
1136 | unsigned long next; | 1142 | unsigned long next; |
1143 | int err; | ||
1137 | 1144 | ||
1138 | pmd = pmd_alloc(mm, pud, addr); | 1145 | pmd = pmd_alloc(mm, pud, addr); |
1139 | if (!pmd) | 1146 | if (!pmd) |
1140 | return -ENOMEM; | 1147 | return -EAGAIN; |
1141 | do { | 1148 | do { |
1142 | next = pmd_addr_end(addr, end); | 1149 | next = pmd_addr_end(addr, end); |
1143 | if (zeromap_pte_range(mm, pmd, addr, next, prot)) | 1150 | err = zeromap_pte_range(mm, pmd, addr, next, prot); |
1144 | return -ENOMEM; | 1151 | if (err) |
1152 | break; | ||
1145 | } while (pmd++, addr = next, addr != end); | 1153 | } while (pmd++, addr = next, addr != end); |
1146 | return 0; | 1154 | return err; |
1147 | } | 1155 | } |
1148 | 1156 | ||
1149 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1157 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, |
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
1151 | { | 1159 | { |
1152 | pud_t *pud; | 1160 | pud_t *pud; |
1153 | unsigned long next; | 1161 | unsigned long next; |
1162 | int err; | ||
1154 | 1163 | ||
1155 | pud = pud_alloc(mm, pgd, addr); | 1164 | pud = pud_alloc(mm, pgd, addr); |
1156 | if (!pud) | 1165 | if (!pud) |
1157 | return -ENOMEM; | 1166 | return -EAGAIN; |
1158 | do { | 1167 | do { |
1159 | next = pud_addr_end(addr, end); | 1168 | next = pud_addr_end(addr, end); |
1160 | if (zeromap_pmd_range(mm, pud, addr, next, prot)) | 1169 | err = zeromap_pmd_range(mm, pud, addr, next, prot); |
1161 | return -ENOMEM; | 1170 | if (err) |
1171 | break; | ||
1162 | } while (pud++, addr = next, addr != end); | 1172 | } while (pud++, addr = next, addr != end); |
1163 | return 0; | 1173 | return err; |
1164 | } | 1174 | } |
1165 | 1175 | ||
1166 | int zeromap_page_range(struct vm_area_struct *vma, | 1176 | int zeromap_page_range(struct vm_area_struct *vma, |
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
1431 | return pte; | 1441 | return pte; |
1432 | } | 1442 | } |
1433 | 1443 | ||
1434 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) | 1444 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
1435 | { | 1445 | { |
1436 | /* | 1446 | /* |
1437 | * If the source page was a PFN mapping, we don't have | 1447 | * If the source page was a PFN mapping, we don't have |
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
1454 | kunmap_atomic(kaddr, KM_USER0); | 1464 | kunmap_atomic(kaddr, KM_USER0); |
1455 | flush_dcache_page(dst); | 1465 | flush_dcache_page(dst); |
1456 | return; | 1466 | return; |
1457 | 1467 | ||
1458 | } | 1468 | } |
1459 | copy_user_highpage(dst, src, va); | 1469 | copy_user_highpage(dst, src, va, vma); |
1460 | } | 1470 | } |
1461 | 1471 | ||
1462 | /* | 1472 | /* |
@@ -1567,7 +1577,7 @@ gotten: | |||
1567 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 1577 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
1568 | if (!new_page) | 1578 | if (!new_page) |
1569 | goto oom; | 1579 | goto oom; |
1570 | cow_user_page(new_page, old_page, address); | 1580 | cow_user_page(new_page, old_page, address, vma); |
1571 | } | 1581 | } |
1572 | 1582 | ||
1573 | /* | 1583 | /* |
@@ -1576,7 +1586,7 @@ gotten: | |||
1576 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 1586 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1577 | if (likely(pte_same(*page_table, orig_pte))) { | 1587 | if (likely(pte_same(*page_table, orig_pte))) { |
1578 | if (old_page) { | 1588 | if (old_page) { |
1579 | page_remove_rmap(old_page); | 1589 | page_remove_rmap(old_page, vma); |
1580 | if (!PageAnon(old_page)) { | 1590 | if (!PageAnon(old_page)) { |
1581 | dec_mm_counter(mm, file_rss); | 1591 | dec_mm_counter(mm, file_rss); |
1582 | inc_mm_counter(mm, anon_rss); | 1592 | inc_mm_counter(mm, anon_rss); |
@@ -1902,7 +1912,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
1902 | 1912 | ||
1903 | return 0; | 1913 | return 0; |
1904 | } | 1914 | } |
1905 | EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ | ||
1906 | 1915 | ||
1907 | /** | 1916 | /** |
1908 | * swapin_readahead - swap in pages in hope we need them soon | 1917 | * swapin_readahead - swap in pages in hope we need them soon |
@@ -1991,6 +2000,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1991 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2000 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
1992 | page = lookup_swap_cache(entry); | 2001 | page = lookup_swap_cache(entry); |
1993 | if (!page) { | 2002 | if (!page) { |
2003 | grab_swap_token(); /* Contend for token _before_ read-in */ | ||
1994 | swapin_readahead(entry, address, vma); | 2004 | swapin_readahead(entry, address, vma); |
1995 | page = read_swap_cache_async(entry, vma, address); | 2005 | page = read_swap_cache_async(entry, vma, address); |
1996 | if (!page) { | 2006 | if (!page) { |
@@ -2008,7 +2018,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2008 | /* Had to read the page from swap area: Major fault */ | 2018 | /* Had to read the page from swap area: Major fault */ |
2009 | ret = VM_FAULT_MAJOR; | 2019 | ret = VM_FAULT_MAJOR; |
2010 | count_vm_event(PGMAJFAULT); | 2020 | count_vm_event(PGMAJFAULT); |
2011 | grab_swap_token(); | ||
2012 | } | 2021 | } |
2013 | 2022 | ||
2014 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2023 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
@@ -2191,7 +2200,7 @@ retry: | |||
2191 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 2200 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
2192 | if (!page) | 2201 | if (!page) |
2193 | goto oom; | 2202 | goto oom; |
2194 | copy_user_highpage(page, new_page, address); | 2203 | copy_user_highpage(page, new_page, address, vma); |
2195 | page_cache_release(new_page); | 2204 | page_cache_release(new_page); |
2196 | new_page = page; | 2205 | new_page = page; |
2197 | anon = 1; | 2206 | anon = 1; |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fd678a662eae..84279127fcd3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,12 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
67 | zone_type = zone - pgdat->node_zones; | 67 | zone_type = zone - pgdat->node_zones; |
68 | if (!populated_zone(zone)) { | 68 | if (!populated_zone(zone)) { |
69 | int ret = 0; | 69 | int ret = 0; |
70 | ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); | 70 | ret = init_currently_empty_zone(zone, phys_start_pfn, |
71 | nr_pages, MEMMAP_HOTPLUG); | ||
71 | if (ret < 0) | 72 | if (ret < 0) |
72 | return ret; | 73 | return ret; |
73 | } | 74 | } |
74 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 75 | memmap_init_zone(nr_pages, nid, zone_type, |
75 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | 76 | phys_start_pfn, MEMMAP_HOTPLUG); |
76 | return 0; | 77 | return 0; |
77 | } | 78 | } |
78 | 79 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 617fb31086ee..da9463946556 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -141,9 +141,11 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
141 | enum zone_type k; | 141 | enum zone_type k; |
142 | 142 | ||
143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 143 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
144 | max++; /* space for zlcache_ptr (see mmzone.h) */ | ||
144 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); | 145 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); |
145 | if (!zl) | 146 | if (!zl) |
146 | return NULL; | 147 | return NULL; |
148 | zl->zlcache_ptr = NULL; | ||
147 | num = 0; | 149 | num = 0; |
148 | /* First put in the highest zones from all nodes, then all the next | 150 | /* First put in the highest zones from all nodes, then all the next |
149 | lower zones etc. Avoid empty zones because the memory allocator | 151 | lower zones etc. Avoid empty zones because the memory allocator |
@@ -219,7 +221,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
219 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 221 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
220 | do { | 222 | do { |
221 | struct page *page; | 223 | struct page *page; |
222 | unsigned int nid; | 224 | int nid; |
223 | 225 | ||
224 | if (!pte_present(*pte)) | 226 | if (!pte_present(*pte)) |
225 | continue; | 227 | continue; |
@@ -1324,7 +1326,7 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
1324 | atomic_set(&new->refcnt, 1); | 1326 | atomic_set(&new->refcnt, 1); |
1325 | if (new->policy == MPOL_BIND) { | 1327 | if (new->policy == MPOL_BIND) { |
1326 | int sz = ksize(old->v.zonelist); | 1328 | int sz = ksize(old->v.zonelist); |
1327 | new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); | 1329 | new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); |
1328 | if (!new->v.zonelist) { | 1330 | if (!new->v.zonelist) { |
1329 | kmem_cache_free(policy_cache, new); | 1331 | kmem_cache_free(policy_cache, new); |
1330 | return ERR_PTR(-ENOMEM); | 1332 | return ERR_PTR(-ENOMEM); |
@@ -1705,8 +1707,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | |||
1705 | * Display pages allocated per node and memory policy via /proc. | 1707 | * Display pages allocated per node and memory policy via /proc. |
1706 | */ | 1708 | */ |
1707 | 1709 | ||
1708 | static const char *policy_types[] = { "default", "prefer", "bind", | 1710 | static const char * const policy_types[] = |
1709 | "interleave" }; | 1711 | { "default", "prefer", "bind", "interleave" }; |
1710 | 1712 | ||
1711 | /* | 1713 | /* |
1712 | * Convert a mempolicy into a string. | 1714 | * Convert a mempolicy into a string. |
@@ -1855,7 +1857,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1855 | 1857 | ||
1856 | if (file) { | 1858 | if (file) { |
1857 | seq_printf(m, " file="); | 1859 | seq_printf(m, " file="); |
1858 | seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); | 1860 | seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); |
1859 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1861 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1860 | seq_printf(m, " heap"); | 1862 | seq_printf(m, " heap"); |
1861 | } else if (vma->vm_start <= mm->start_stack && | 1863 | } else if (vma->vm_start <= mm->start_stack && |
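The zonelist copy also drops the legacy SLAB_KERNEL alias in favour of the plain GFP_KERNEL allocation flag expected by kmemdup()'s gfp_t parameter. A minimal kmemdup() usage sketch (struct my_cfg and copy_cfg() are made-up names):

    #include <linux/slab.h>
    #include <linux/string.h>

    struct my_cfg {
            int node;
    };

    static struct my_cfg *copy_cfg(const struct my_cfg *src)
    {
            /* allocates sizeof(*src) bytes with GFP_KERNEL and copies src in */
            return kmemdup(src, sizeof(*src), GFP_KERNEL);
    }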
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out: | |||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | 294 | static int migrate_page_move_mapping(struct address_space *mapping, |
295 | struct page *newpage, struct page *page) | 295 | struct page *newpage, struct page *page) |
296 | { | 296 | { |
297 | struct page **radix_pointer; | 297 | void **pslot; |
298 | 298 | ||
299 | if (!mapping) { | 299 | if (!mapping) { |
300 | /* Anonymous page */ | 300 | /* Anonymous page */ |
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
305 | 305 | ||
306 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
307 | 307 | ||
308 | radix_pointer = (struct page **)radix_tree_lookup_slot( | 308 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
309 | &mapping->page_tree, | 309 | page_index(page)); |
310 | page_index(page)); | ||
311 | 310 | ||
312 | if (page_count(page) != 2 + !!PagePrivate(page) || | 311 | if (page_count(page) != 2 + !!PagePrivate(page) || |
313 | *radix_pointer != page) { | 312 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
314 | write_unlock_irq(&mapping->tree_lock); | 313 | write_unlock_irq(&mapping->tree_lock); |
315 | return -EAGAIN; | 314 | return -EAGAIN; |
316 | } | 315 | } |
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
318 | /* | 317 | /* |
319 | * Now we know that no one else is looking at the page. | 318 | * Now we know that no one else is looking at the page. |
320 | */ | 319 | */ |
321 | get_page(newpage); | 320 | get_page(newpage); /* add cache reference */ |
322 | #ifdef CONFIG_SWAP | 321 | #ifdef CONFIG_SWAP |
323 | if (PageSwapCache(page)) { | 322 | if (PageSwapCache(page)) { |
324 | SetPageSwapCache(newpage); | 323 | SetPageSwapCache(newpage); |
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
326 | } | 325 | } |
327 | #endif | 326 | #endif |
328 | 327 | ||
329 | *radix_pointer = newpage; | 328 | radix_tree_replace_slot(pslot, newpage); |
329 | |||
330 | /* | ||
331 | * Drop cache reference from old page. | ||
332 | * We know this isn't the last reference. | ||
333 | */ | ||
330 | __put_page(page); | 334 | __put_page(page); |
335 | |||
331 | write_unlock_irq(&mapping->tree_lock); | 336 | write_unlock_irq(&mapping->tree_lock); |
332 | 337 | ||
333 | return 0; | 338 | return 0; |
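migrate_page_move_mapping() now goes through the radix-tree slot helpers rather than casting the raw slot pointer, keeping it within the radix-tree API used elsewhere in this series. The idiom, sketched as a generic replace-under-lock helper (replace_slot_entry() is illustrative, not a function in this patch):

    #include <linux/errno.h>
    #include <linux/radix-tree.h>

    /* caller must hold the lock protecting the tree (tree_lock here) */
    static int replace_slot_entry(struct radix_tree_root *root,
                                  unsigned long index, void *old, void *new_item)
    {
            void **pslot = radix_tree_lookup_slot(root, index);

            if (!pslot || radix_tree_deref_slot(pslot) != old)
                    return -EAGAIN;         /* entry changed under us */
            radix_tree_replace_slot(pslot, new_item);
            return 0;
    }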
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * linux/mm/mincore.c | 2 | * linux/mm/mincore.c |
3 | * | 3 | * |
4 | * Copyright (C) 1994-1999 Linus Torvalds | 4 | * Copyright (C) 1994-2006 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma, | |||
38 | return present; | 38 | return present; |
39 | } | 39 | } |
40 | 40 | ||
41 | static long mincore_vma(struct vm_area_struct * vma, | 41 | /* |
42 | unsigned long start, unsigned long end, unsigned char __user * vec) | 42 | * Do a chunk of "sys_mincore()". We've already checked |
43 | * all the arguments, we hold the mmap semaphore: we should | ||
44 | * just return the amount of info we're asked for. | ||
45 | */ | ||
46 | static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) | ||
43 | { | 47 | { |
44 | long error, i, remaining; | 48 | unsigned long i, nr, pgoff; |
45 | unsigned char * tmp; | 49 | struct vm_area_struct *vma = find_vma(current->mm, addr); |
46 | |||
47 | error = -ENOMEM; | ||
48 | if (!vma->vm_file) | ||
49 | return error; | ||
50 | |||
51 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
52 | if (end > vma->vm_end) | ||
53 | end = vma->vm_end; | ||
54 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
55 | 50 | ||
56 | error = -EAGAIN; | 51 | /* |
57 | tmp = (unsigned char *) __get_free_page(GFP_KERNEL); | 52 | * find_vma() didn't find anything above us, or we're |
58 | if (!tmp) | 53 | * in an unmapped hole in the address space: ENOMEM. |
59 | return error; | 54 | */ |
55 | if (!vma || addr < vma->vm_start) | ||
56 | return -ENOMEM; | ||
60 | 57 | ||
61 | /* (end - start) is # of pages, and also # of bytes in "vec */ | 58 | /* |
62 | remaining = (end - start), | 59 | * Ok, got it. But check whether it's a segment we support |
60 | * mincore() on. Right now, we don't do any anonymous mappings. | ||
61 | * | ||
62 | * FIXME: This is just stupid. And returning ENOMEM is | ||
63 | * stupid too. We should just look at the page tables. But | ||
64 | * this is what we've traditionally done, so we'll just | ||
65 | * continue doing it. | ||
66 | */ | ||
67 | if (!vma->vm_file) | ||
68 | return -ENOMEM; | ||
63 | 69 | ||
64 | error = 0; | 70 | /* |
65 | for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { | 71 | * Calculate how many pages there are left in the vma, and |
66 | int j = 0; | 72 | * what the pgoff is for our address. |
67 | long thispiece = (remaining < PAGE_SIZE) ? | 73 | */ |
68 | remaining : PAGE_SIZE; | 74 | nr = (vma->vm_end - addr) >> PAGE_SHIFT; |
75 | if (nr > pages) | ||
76 | nr = pages; | ||
69 | 77 | ||
70 | while (j < thispiece) | 78 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; |
71 | tmp[j++] = mincore_page(vma, start++); | 79 | pgoff += vma->vm_pgoff; |
72 | 80 | ||
73 | if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { | 81 | /* And then we just fill the sucker in.. */ |
74 | error = -EFAULT; | 82 | for (i = 0 ; i < nr; i++, pgoff++) |
75 | break; | 83 | vec[i] = mincore_page(vma, pgoff); |
76 | } | ||
77 | } | ||
78 | 84 | ||
79 | free_page((unsigned long) tmp); | 85 | return nr; |
80 | return error; | ||
81 | } | 86 | } |
82 | 87 | ||
83 | /* | 88 | /* |
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma, | |||
107 | asmlinkage long sys_mincore(unsigned long start, size_t len, | 112 | asmlinkage long sys_mincore(unsigned long start, size_t len, |
108 | unsigned char __user * vec) | 113 | unsigned char __user * vec) |
109 | { | 114 | { |
110 | int index = 0; | 115 | long retval; |
111 | unsigned long end, limit; | 116 | unsigned long pages; |
112 | struct vm_area_struct * vma; | 117 | unsigned char *tmp; |
113 | size_t max; | ||
114 | int unmapped_error = 0; | ||
115 | long error; | ||
116 | |||
117 | /* check the arguments */ | ||
118 | if (start & ~PAGE_CACHE_MASK) | ||
119 | goto einval; | ||
120 | |||
121 | limit = TASK_SIZE; | ||
122 | if (start >= limit) | ||
123 | goto enomem; | ||
124 | |||
125 | if (!len) | ||
126 | return 0; | ||
127 | |||
128 | max = limit - start; | ||
129 | len = PAGE_CACHE_ALIGN(len); | ||
130 | if (len > max || !len) | ||
131 | goto enomem; | ||
132 | 118 | ||
133 | end = start + len; | 119 | /* Check the start address: needs to be page-aligned.. */ |
120 | if (start & ~PAGE_CACHE_MASK) | ||
121 | return -EINVAL; | ||
134 | 122 | ||
135 | /* check the output buffer whilst holding the lock */ | 123 | /* ..and we need to be passed a valid user-space range */ |
136 | error = -EFAULT; | 124 | if (!access_ok(VERIFY_READ, (void __user *) start, len)) |
137 | down_read(¤t->mm->mmap_sem); | 125 | return -ENOMEM; |
138 | 126 | ||
139 | if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT)) | 127 | /* This also avoids any overflows on PAGE_CACHE_ALIGN */ |
140 | goto out; | 128 | pages = len >> PAGE_SHIFT; |
129 | pages += (len & ~PAGE_MASK) != 0; | ||
141 | 130 | ||
142 | /* | 131 | if (!access_ok(VERIFY_WRITE, vec, pages)) |
143 | * If the interval [start,end) covers some unmapped address | 132 | return -EFAULT; |
144 | * ranges, just ignore them, but return -ENOMEM at the end. | ||
145 | */ | ||
146 | error = 0; | ||
147 | |||
148 | vma = find_vma(current->mm, start); | ||
149 | while (vma) { | ||
150 | /* Here start < vma->vm_end. */ | ||
151 | if (start < vma->vm_start) { | ||
152 | unmapped_error = -ENOMEM; | ||
153 | start = vma->vm_start; | ||
154 | } | ||
155 | 133 | ||
156 | /* Here vma->vm_start <= start < vma->vm_end. */ | 134 | tmp = (void *) __get_free_page(GFP_USER); |
157 | if (end <= vma->vm_end) { | 135 | if (!tmp) |
158 | if (start < end) { | 136 | return -EAGAIN; |
159 | error = mincore_vma(vma, start, end, | 137 | |
160 | &vec[index]); | 138 | retval = 0; |
161 | if (error) | 139 | while (pages) { |
162 | goto out; | 140 | /* |
163 | } | 141 | * Do at most PAGE_SIZE entries per iteration, due to |
164 | error = unmapped_error; | 142 | * the temporary buffer size. |
165 | goto out; | 143 | */ |
144 | down_read(¤t->mm->mmap_sem); | ||
145 | retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); | ||
146 | up_read(¤t->mm->mmap_sem); | ||
147 | |||
148 | if (retval <= 0) | ||
149 | break; | ||
150 | if (copy_to_user(vec, tmp, retval)) { | ||
151 | retval = -EFAULT; | ||
152 | break; | ||
166 | } | 153 | } |
167 | 154 | pages -= retval; | |
168 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | 155 | vec += retval; |
169 | error = mincore_vma(vma, start, vma->vm_end, &vec[index]); | 156 | start += retval << PAGE_SHIFT; |
170 | if (error) | 157 | retval = 0; |
171 | goto out; | ||
172 | index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; | ||
173 | start = vma->vm_end; | ||
174 | vma = vma->vm_next; | ||
175 | } | 158 | } |
176 | 159 | free_page((unsigned long) tmp); | |
177 | /* we found a hole in the area queried if we arrive here */ | 160 | return retval; |
178 | error = -ENOMEM; | ||
179 | |||
180 | out: | ||
181 | up_read(¤t->mm->mmap_sem); | ||
182 | return error; | ||
183 | |||
184 | einval: | ||
185 | return -EINVAL; | ||
186 | enomem: | ||
187 | return -ENOMEM; | ||
188 | } | 161 | } |
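The rewritten sys_mincore() copies results out in chunks of at most PAGE_SIZE status bytes per mmap_sem hold, but the user-visible contract is unchanged: one byte per page of the queried range, bit 0 set when the page is resident. Note that at this point it still returns ENOMEM for anonymous mappings (see the FIXME above), so the sketch below probes a file-backed mapping; it is an ordinary user-space program, not kernel code:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            long psz = sysconf(_SC_PAGESIZE);
            struct stat st;
            unsigned char *vec;
            size_t npages, i;
            void *map;
            int fd;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0 || fstat(fd, &st) < 0)
                    return 1;
            npages = (st.st_size + psz - 1) / psz;
            vec = malloc(npages);
            map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
            if (!vec || map == MAP_FAILED)
                    return 1;
            if (mincore(map, st.st_size, vec) == 0)
                    for (i = 0; i < npages; i++)
                            printf("page %zu: %s\n", i,
                                   vec[i] & 1 ? "in core" : "not in core");
            munmap(map, st.st_size);
            free(vec);
            return 0;
    }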
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573abf..3446b7ef731e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -65,7 +65,7 @@ success: | |||
65 | ret = make_pages_present(start, end); | 65 | ret = make_pages_present(start, end); |
66 | } | 66 | } |
67 | 67 | ||
68 | vma->vm_mm->locked_vm -= pages; | 68 | mm->locked_vm -= pages; |
69 | out: | 69 | out: |
70 | if (ret == -ENOMEM) | 70 | if (ret == -ENOMEM) |
71 | ret = -EAGAIN; | 71 | ret = -EAGAIN; |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
188 | struct file *file, struct address_space *mapping) | 188 | struct file *file, struct address_space *mapping) |
189 | { | 189 | { |
190 | if (vma->vm_flags & VM_DENYWRITE) | 190 | if (vma->vm_flags & VM_DENYWRITE) |
191 | atomic_inc(&file->f_dentry->d_inode->i_writecount); | 191 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); |
192 | if (vma->vm_flags & VM_SHARED) | 192 | if (vma->vm_flags & VM_SHARED) |
193 | mapping->i_mmap_writable--; | 193 | mapping->i_mmap_writable--; |
194 | 194 | ||
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma) | |||
399 | struct address_space *mapping = file->f_mapping; | 399 | struct address_space *mapping = file->f_mapping; |
400 | 400 | ||
401 | if (vma->vm_flags & VM_DENYWRITE) | 401 | if (vma->vm_flags & VM_DENYWRITE) |
402 | atomic_dec(&file->f_dentry->d_inode->i_writecount); | 402 | atomic_dec(&file->f_path.dentry->d_inode->i_writecount); |
403 | if (vma->vm_flags & VM_SHARED) | 403 | if (vma->vm_flags & VM_SHARED) |
404 | mapping->i_mmap_writable++; | 404 | mapping->i_mmap_writable++; |
405 | 405 | ||
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
907 | * mounted, in which case we dont add PROT_EXEC.) | 907 | * mounted, in which case we dont add PROT_EXEC.) |
908 | */ | 908 | */ |
909 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | 909 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
910 | if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))) | 910 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) |
911 | prot |= PROT_EXEC; | 911 | prot |= PROT_EXEC; |
912 | 912 | ||
913 | if (!len) | 913 | if (!len) |
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
960 | return -EAGAIN; | 960 | return -EAGAIN; |
961 | } | 961 | } |
962 | 962 | ||
963 | inode = file ? file->f_dentry->d_inode : NULL; | 963 | inode = file ? file->f_path.dentry->d_inode : NULL; |
964 | 964 | ||
965 | if (file) { | 965 | if (file) { |
966 | switch (flags & MAP_TYPE) { | 966 | switch (flags & MAP_TYPE) { |
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
989 | case MAP_PRIVATE: | 989 | case MAP_PRIVATE: |
990 | if (!(file->f_mode & FMODE_READ)) | 990 | if (!(file->f_mode & FMODE_READ)) |
991 | return -EACCES; | 991 | return -EACCES; |
992 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | 992 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
993 | if (vm_flags & VM_EXEC) | 993 | if (vm_flags & VM_EXEC) |
994 | return -EPERM; | 994 | return -EPERM; |
995 | vm_flags &= ~VM_MAYEXEC; | 995 | vm_flags &= ~VM_MAYEXEC; |
@@ -1736,7 +1736,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1736 | if (mm->map_count >= sysctl_max_map_count) | 1736 | if (mm->map_count >= sysctl_max_map_count) |
1737 | return -ENOMEM; | 1737 | return -ENOMEM; |
1738 | 1738 | ||
1739 | new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1739 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1740 | if (!new) | 1740 | if (!new) |
1741 | return -ENOMEM; | 1741 | return -ENOMEM; |
1742 | 1742 | ||
@@ -2057,7 +2057,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2057 | vma_start < new_vma->vm_end) | 2057 | vma_start < new_vma->vm_end) |
2058 | *vmap = new_vma; | 2058 | *vmap = new_vma; |
2059 | } else { | 2059 | } else { |
2060 | new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 2060 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2061 | if (new_vma) { | 2061 | if (new_vma) { |
2062 | *new_vma = *vma; | 2062 | *new_vma = *vma; |
2063 | pol = mpol_copy(vma_policy(vma)); | 2063 | pol = mpol_copy(vma_policy(vma)); |
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c98168..eb5838634f18 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,8 +14,6 @@ struct pglist_data *first_online_pgdat(void) | |||
14 | return NODE_DATA(first_online_node); | 14 | return NODE_DATA(first_online_node); |
15 | } | 15 | } |
16 | 16 | ||
17 | EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ | ||
18 | |||
19 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | 17 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) |
20 | { | 18 | { |
21 | int nid = next_online_node(pgdat->node_id); | 19 | int nid = next_online_node(pgdat->node_id); |
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | |||
24 | return NULL; | 22 | return NULL; |
25 | return NODE_DATA(nid); | 23 | return NODE_DATA(nid); |
26 | } | 24 | } |
27 | EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ | ||
28 | |||
29 | 25 | ||
30 | /* | 26 | /* |
31 | * next_zone - helper magic for for_each_zone() | 27 | * next_zone - helper magic for for_each_zone() |
@@ -45,5 +41,4 @@ struct zone *next_zone(struct zone *zone) | |||
45 | } | 41 | } |
46 | return zone; | 42 | return zone; |
47 | } | 43 | } |
48 | EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ | ||
49 | 44 | ||
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bdde9508f3b..23fb033e596d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -497,15 +497,17 @@ static int validate_mmap_request(struct file *file, | |||
497 | (flags & MAP_TYPE) != MAP_SHARED) | 497 | (flags & MAP_TYPE) != MAP_SHARED) |
498 | return -EINVAL; | 498 | return -EINVAL; |
499 | 499 | ||
500 | if (PAGE_ALIGN(len) == 0) | 500 | if (!len) |
501 | return addr; | ||
502 | |||
503 | if (len > TASK_SIZE) | ||
504 | return -EINVAL; | 501 | return -EINVAL; |
505 | 502 | ||
503 | /* Careful about overflows.. */ | ||
504 | len = PAGE_ALIGN(len); | ||
505 | if (!len || len > TASK_SIZE) | ||
506 | return -ENOMEM; | ||
507 | |||
506 | /* offset overflow? */ | 508 | /* offset overflow? */ |
507 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 509 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) |
508 | return -EINVAL; | 510 | return -EOVERFLOW; |
509 | 511 | ||
510 | if (file) { | 512 | if (file) { |
511 | /* validate file mapping requests */ | 513 | /* validate file mapping requests */ |
@@ -521,7 +523,7 @@ static int validate_mmap_request(struct file *file, | |||
521 | */ | 523 | */ |
522 | mapping = file->f_mapping; | 524 | mapping = file->f_mapping; |
523 | if (!mapping) | 525 | if (!mapping) |
524 | mapping = file->f_dentry->d_inode->i_mapping; | 526 | mapping = file->f_path.dentry->d_inode->i_mapping; |
525 | 527 | ||
526 | capabilities = 0; | 528 | capabilities = 0; |
527 | if (mapping && mapping->backing_dev_info) | 529 | if (mapping && mapping->backing_dev_info) |
@@ -530,7 +532,7 @@ static int validate_mmap_request(struct file *file, | |||
530 | if (!capabilities) { | 532 | if (!capabilities) { |
531 | /* no explicit capabilities set, so assume some | 533 | /* no explicit capabilities set, so assume some |
532 | * defaults */ | 534 | * defaults */ |
533 | switch (file->f_dentry->d_inode->i_mode & S_IFMT) { | 535 | switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { |
534 | case S_IFREG: | 536 | case S_IFREG: |
535 | case S_IFBLK: | 537 | case S_IFBLK: |
536 | capabilities = BDI_CAP_MAP_COPY; | 538 | capabilities = BDI_CAP_MAP_COPY; |
@@ -561,11 +563,11 @@ static int validate_mmap_request(struct file *file, | |||
561 | !(file->f_mode & FMODE_WRITE)) | 563 | !(file->f_mode & FMODE_WRITE)) |
562 | return -EACCES; | 564 | return -EACCES; |
563 | 565 | ||
564 | if (IS_APPEND(file->f_dentry->d_inode) && | 566 | if (IS_APPEND(file->f_path.dentry->d_inode) && |
565 | (file->f_mode & FMODE_WRITE)) | 567 | (file->f_mode & FMODE_WRITE)) |
566 | return -EACCES; | 568 | return -EACCES; |
567 | 569 | ||
568 | if (locks_verify_locked(file->f_dentry->d_inode)) | 570 | if (locks_verify_locked(file->f_path.dentry->d_inode)) |
569 | return -EAGAIN; | 571 | return -EAGAIN; |
570 | 572 | ||
571 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 573 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
@@ -596,7 +598,7 @@ static int validate_mmap_request(struct file *file, | |||
596 | 598 | ||
597 | /* handle executable mappings and implied executable | 599 | /* handle executable mappings and implied executable |
598 | * mappings */ | 600 | * mappings */ |
599 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | 601 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
600 | if (prot & PROT_EXEC) | 602 | if (prot & PROT_EXEC) |
601 | return -EPERM; | 603 | return -EPERM; |
602 | } | 604 | } |
@@ -806,10 +808,9 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
806 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 808 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
807 | 809 | ||
808 | /* we're going to need to record the mapping if it works */ | 810 | /* we're going to need to record the mapping if it works */ |
809 | vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL); | 811 | vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); |
810 | if (!vml) | 812 | if (!vml) |
811 | goto error_getting_vml; | 813 | goto error_getting_vml; |
812 | memset(vml, 0, sizeof(*vml)); | ||
813 | 814 | ||
814 | down_write(&nommu_vma_sem); | 815 | down_write(&nommu_vma_sem); |
815 | 816 | ||
@@ -832,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
832 | continue; | 833 | continue; |
833 | 834 | ||
834 | /* search for overlapping mappings on the same file */ | 835 | /* search for overlapping mappings on the same file */ |
835 | if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) | 836 | if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) |
836 | continue; | 837 | continue; |
837 | 838 | ||
838 | if (vma->vm_pgoff >= pgoff + pglen) | 839 | if (vma->vm_pgoff >= pgoff + pglen) |
@@ -885,11 +886,10 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
885 | } | 886 | } |
886 | 887 | ||
887 | /* we're going to need a VMA struct as well */ | 888 | /* we're going to need a VMA struct as well */ |
888 | vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL); | 889 | vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); |
889 | if (!vma) | 890 | if (!vma) |
890 | goto error_getting_vma; | 891 | goto error_getting_vma; |
891 | 892 | ||
892 | memset(vma, 0, sizeof(*vma)); | ||
893 | INIT_LIST_HEAD(&vma->anon_vma_node); | 893 | INIT_LIST_HEAD(&vma->anon_vma_node); |
894 | atomic_set(&vma->vm_usage, 1); | 894 | atomic_set(&vma->vm_usage, 1); |
895 | if (file) | 895 | if (file) |
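The two kmalloc()+memset() pairs in do_mmap_pgoff() collapse into kzalloc(), which returns already zeroed memory with the same GFP flags; behaviour is otherwise unchanged. In isolation the pattern looks like this (alloc_vml() is an illustrative wrapper, not in the patch):

    #include <linux/slab.h>

    static struct vm_list_struct *alloc_vml(void)
    {
            /* kzalloc(n, flags) == kmalloc(n, flags) followed by memset(p, 0, n) */
            return kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
    }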
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2e3ce3a928b9..b278b8d60eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
61 | } | 61 | } |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * swapoff can easily use up all memory, so kill those first. | ||
65 | */ | ||
66 | if (p->flags & PF_SWAPOFF) | ||
67 | return ULONG_MAX; | ||
68 | |||
69 | /* | ||
70 | * The memory size of the process is the basis for the badness. | 64 | * The memory size of the process is the basis for the badness. |
71 | */ | 65 | */ |
72 | points = mm->total_vm; | 66 | points = mm->total_vm; |
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
77 | task_unlock(p); | 71 | task_unlock(p); |
78 | 72 | ||
79 | /* | 73 | /* |
74 | * swapoff can easily use up all memory, so kill those first. | ||
75 | */ | ||
76 | if (p->flags & PF_SWAPOFF) | ||
77 | return ULONG_MAX; | ||
78 | |||
79 | /* | ||
80 | * Processes which fork a lot of child processes are likely | 80 | * Processes which fork a lot of child processes are likely |
81 | * a good choice. We add half the vmsize of the children if they | 81 | * a good choice. We add half the vmsize of the children if they |
82 | * have an own mm. This prevents forking servers to flood the | 82 | * have an own mm. This prevents forking servers to flood the |
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
174 | { | 174 | { |
175 | #ifdef CONFIG_NUMA | 175 | #ifdef CONFIG_NUMA |
176 | struct zone **z; | 176 | struct zone **z; |
177 | nodemask_t nodes = node_online_map; | 177 | nodemask_t nodes; |
178 | int node; | ||
179 | /* node has memory ? */ | ||
180 | for_each_online_node(node) | ||
181 | if (NODE_DATA(node)->node_present_pages) | ||
182 | node_set(node, nodes); | ||
178 | 183 | ||
179 | for (z = zonelist->zones; *z; z++) | 184 | for (z = zonelist->zones; *z; z++) |
180 | if (cpuset_zone_allowed(*z, gfp_mask)) | 185 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) |
181 | node_clear(zone_to_nid(*z), nodes); | 186 | node_clear(zone_to_nid(*z), nodes); |
182 | else | 187 | else |
183 | return CONSTRAINT_CPUSET; | 188 | return CONSTRAINT_CPUSET; |
@@ -264,7 +269,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
264 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 269 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
265 | * set. | 270 | * set. |
266 | */ | 271 | */ |
267 | static void __oom_kill_task(struct task_struct *p, const char *message) | 272 | static void __oom_kill_task(struct task_struct *p, int verbose) |
268 | { | 273 | { |
269 | if (is_init(p)) { | 274 | if (is_init(p)) { |
270 | WARN_ON(1); | 275 | WARN_ON(1); |
@@ -278,10 +283,8 @@ static void __oom_kill_task(struct task_struct *p, const char *message) | |||
278 | return; | 283 | return; |
279 | } | 284 | } |
280 | 285 | ||
281 | if (message) { | 286 | if (verbose) |
282 | printk(KERN_ERR "%s: Killed process %d (%s).\n", | 287 | printk(KERN_ERR "Killed process %d (%s)\n", p->pid, p->comm); |
283 | message, p->pid, p->comm); | ||
284 | } | ||
285 | 288 | ||
286 | /* | 289 | /* |
287 | * We give our sacrificial lamb high priority and access to | 290 | * We give our sacrificial lamb high priority and access to |
@@ -294,7 +297,7 @@ static void __oom_kill_task(struct task_struct *p, const char *message) | |||
294 | force_sig(SIGKILL, p); | 297 | force_sig(SIGKILL, p); |
295 | } | 298 | } |
296 | 299 | ||
297 | static int oom_kill_task(struct task_struct *p, const char *message) | 300 | static int oom_kill_task(struct task_struct *p) |
298 | { | 301 | { |
299 | struct mm_struct *mm; | 302 | struct mm_struct *mm; |
300 | struct task_struct *g, *q; | 303 | struct task_struct *g, *q; |
@@ -313,15 +316,25 @@ static int oom_kill_task(struct task_struct *p, const char *message) | |||
313 | if (mm == NULL) | 316 | if (mm == NULL) |
314 | return 1; | 317 | return 1; |
315 | 318 | ||
316 | __oom_kill_task(p, message); | 319 | /* |
320 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
321 | */ | ||
322 | do_each_thread(g, q) { | ||
323 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
324 | return 1; | ||
325 | } while_each_thread(g, q); | ||
326 | |||
327 | __oom_kill_task(p, 1); | ||
328 | |||
317 | /* | 329 | /* |
318 | * kill all processes that share the ->mm (i.e. all threads), | 330 | * kill all processes that share the ->mm (i.e. all threads), |
319 | * but are in a different thread group | 331 | * but are in a different thread group. Don't let them have access |
332 | * to memory reserves though, otherwise we might deplete all memory. | ||
320 | */ | 333 | */ |
321 | do_each_thread(g, q) | 334 | do_each_thread(g, q) { |
322 | if (q->mm == mm && q->tgid != p->tgid) | 335 | if (q->mm == mm && q->tgid != p->tgid) |
323 | __oom_kill_task(q, message); | 336 | force_sig(SIGKILL, q); |
324 | while_each_thread(g, q); | 337 | } while_each_thread(g, q); |
325 | 338 | ||
326 | return 0; | 339 | return 0; |
327 | } | 340 | } |
@@ -337,21 +350,22 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
337 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 350 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
338 | */ | 351 | */ |
339 | if (p->flags & PF_EXITING) { | 352 | if (p->flags & PF_EXITING) { |
340 | __oom_kill_task(p, NULL); | 353 | __oom_kill_task(p, 0); |
341 | return 0; | 354 | return 0; |
342 | } | 355 | } |
343 | 356 | ||
344 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" | 357 | printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", |
345 | " and children.\n", p->pid, p->comm, points); | 358 | message, p->pid, p->comm, points); |
359 | |||
346 | /* Try to kill a child first */ | 360 | /* Try to kill a child first */ |
347 | list_for_each(tsk, &p->children) { | 361 | list_for_each(tsk, &p->children) { |
348 | c = list_entry(tsk, struct task_struct, sibling); | 362 | c = list_entry(tsk, struct task_struct, sibling); |
349 | if (c->mm == p->mm) | 363 | if (c->mm == p->mm) |
350 | continue; | 364 | continue; |
351 | if (!oom_kill_task(c, message)) | 365 | if (!oom_kill_task(c)) |
352 | return 0; | 366 | return 0; |
353 | } | 367 | } |
354 | return oom_kill_task(p, message); | 368 | return oom_kill_task(p); |
355 | } | 369 | } |
356 | 370 | ||
357 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 371 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
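OOM_DISABLE, as tested in oom_kill_task() above, is the value a task gets when the lowest setting is written to its /proc/<pid>/oom_adj file in this kernel series. A minimal sketch of opting the current process out of OOM killing (error handling kept to the bare checks; writing the file may require appropriate privileges):

	#include <stdio.h>

	/* Ask the kernel to never OOM-kill the current process. */
	int oom_disable_self(void)
	{
		FILE *f = fopen("/proc/self/oom_adj", "w");

		if (!f)
			return -1;
		/* -17 is OOM_DISABLE in this kernel series */
		if (fprintf(f, "-17\n") < 0) {
			fclose(f);
			return -1;
		}
		return fclose(f);
	}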
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8d9b19f239c3..1d2fc89ca56d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> |
24 | #include <linux/task_io_accounting_ops.h> | ||
24 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
25 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
26 | #include <linux/rmap.h> | 27 | #include <linux/rmap.h> |
@@ -761,23 +762,24 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
761 | struct address_space *mapping = page_mapping(page); | 762 | struct address_space *mapping = page_mapping(page); |
762 | struct address_space *mapping2; | 763 | struct address_space *mapping2; |
763 | 764 | ||
764 | if (mapping) { | 765 | if (!mapping) |
765 | write_lock_irq(&mapping->tree_lock); | 766 | return 1; |
766 | mapping2 = page_mapping(page); | 767 | |
767 | if (mapping2) { /* Race with truncate? */ | 768 | write_lock_irq(&mapping->tree_lock); |
768 | BUG_ON(mapping2 != mapping); | 769 | mapping2 = page_mapping(page); |
769 | if (mapping_cap_account_dirty(mapping)) | 770 | if (mapping2) { /* Race with truncate? */ |
770 | __inc_zone_page_state(page, | 771 | BUG_ON(mapping2 != mapping); |
771 | NR_FILE_DIRTY); | 772 | if (mapping_cap_account_dirty(mapping)) { |
772 | radix_tree_tag_set(&mapping->page_tree, | 773 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
773 | page_index(page), PAGECACHE_TAG_DIRTY); | 774 | task_io_account_write(PAGE_CACHE_SIZE); |
774 | } | ||
775 | write_unlock_irq(&mapping->tree_lock); | ||
776 | if (mapping->host) { | ||
777 | /* !PageAnon && !swapper_space */ | ||
778 | __mark_inode_dirty(mapping->host, | ||
779 | I_DIRTY_PAGES); | ||
780 | } | 775 | } |
776 | radix_tree_tag_set(&mapping->page_tree, | ||
777 | page_index(page), PAGECACHE_TAG_DIRTY); | ||
778 | } | ||
779 | write_unlock_irq(&mapping->tree_lock); | ||
780 | if (mapping->host) { | ||
781 | /* !PageAnon && !swapper_space */ | ||
782 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | ||
781 | } | 783 | } |
782 | return 1; | 784 | return 1; |
783 | } | 785 | } |
@@ -843,39 +845,6 @@ int set_page_dirty_lock(struct page *page) | |||
843 | EXPORT_SYMBOL(set_page_dirty_lock); | 845 | EXPORT_SYMBOL(set_page_dirty_lock); |
844 | 846 | ||
845 | /* | 847 | /* |
846 | * Clear a page's dirty flag, while caring for dirty memory accounting. | ||
847 | * Returns true if the page was previously dirty. | ||
848 | */ | ||
849 | int test_clear_page_dirty(struct page *page) | ||
850 | { | ||
851 | struct address_space *mapping = page_mapping(page); | ||
852 | unsigned long flags; | ||
853 | |||
854 | if (mapping) { | ||
855 | write_lock_irqsave(&mapping->tree_lock, flags); | ||
856 | if (TestClearPageDirty(page)) { | ||
857 | radix_tree_tag_clear(&mapping->page_tree, | ||
858 | page_index(page), | ||
859 | PAGECACHE_TAG_DIRTY); | ||
860 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
861 | /* | ||
862 | * We can continue to use `mapping' here because the | ||
863 | * page is locked, which pins the address_space | ||
864 | */ | ||
865 | if (mapping_cap_account_dirty(mapping)) { | ||
866 | page_mkclean(page); | ||
867 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
868 | } | ||
869 | return 1; | ||
870 | } | ||
871 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
872 | return 0; | ||
873 | } | ||
874 | return TestClearPageDirty(page); | ||
875 | } | ||
876 | EXPORT_SYMBOL(test_clear_page_dirty); | ||
877 | |||
878 | /* | ||
879 | * Clear a page's dirty flag, while caring for dirty memory accounting. | 848 | * Clear a page's dirty flag, while caring for dirty memory accounting. |
880 | * Returns true if the page was previously dirty. | 849 | * Returns true if the page was previously dirty. |
881 | * | 850 | * |
@@ -893,12 +862,41 @@ int clear_page_dirty_for_io(struct page *page) | |||
893 | { | 862 | { |
894 | struct address_space *mapping = page_mapping(page); | 863 | struct address_space *mapping = page_mapping(page); |
895 | 864 | ||
896 | if (mapping) { | 865 | if (mapping && mapping_cap_account_dirty(mapping)) { |
866 | /* | ||
867 | * Yes, Virginia, this is indeed insane. | ||
868 | * | ||
869 | * We use this sequence to make sure that | ||
870 | * (a) we account for dirty stats properly | ||
871 | * (b) we tell the low-level filesystem to | ||
872 | * mark the whole page dirty if it was | ||
873 | * dirty in a pagetable. Only to then | ||
874 | * (c) clean the page again and return 1 to | ||
875 | * cause the writeback. | ||
876 | * | ||
877 | * This way we avoid all nasty races with the | ||
878 | * dirty bit in multiple places and clearing | ||
879 | * them concurrently from different threads. | ||
880 | * | ||
881 | * Note! Normally the "set_page_dirty(page)" | ||
882 | * has no effect on the actual dirty bit - since | ||
883 | * that will already usually be set. But we | ||
884 | * need the side effects, and it can help us | ||
885 | * avoid races. | ||
886 | * | ||
887 | * We basically use the page "master dirty bit" | ||
888 | * as a serialization point for all the different | ||
889 | * threads doing their things. | ||
890 | * | ||
891 | * FIXME! We still have a race here: if somebody | ||
892 | * adds the page back to the page tables in | ||
893 | * between the "page_mkclean()" and the "TestClearPageDirty()", | ||
894 | * we might have it mapped without the dirty bit set. | ||
895 | */ | ||
896 | if (page_mkclean(page)) | ||
897 | set_page_dirty(page); | ||
897 | if (TestClearPageDirty(page)) { | 898 | if (TestClearPageDirty(page)) { |
898 | if (mapping_cap_account_dirty(mapping)) { | 899 | dec_zone_page_state(page, NR_FILE_DIRTY); |
899 | page_mkclean(page); | ||
900 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
901 | } | ||
902 | return 1; | 900 | return 1; |
903 | } | 901 | } |
904 | return 0; | 902 | return 0; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa6fcc7ca66f..fc5b5442e942 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/sort.h> | 40 | #include <linux/sort.h> |
41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
43 | #include <linux/fault-inject.h> | ||
43 | 44 | ||
44 | #include <asm/tlbflush.h> | 45 | #include <asm/tlbflush.h> |
45 | #include <asm/div64.h> | 46 | #include <asm/div64.h> |
@@ -83,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | |||
83 | 84 | ||
84 | EXPORT_SYMBOL(totalram_pages); | 85 | EXPORT_SYMBOL(totalram_pages); |
85 | 86 | ||
86 | /* | 87 | static char * const zone_names[MAX_NR_ZONES] = { |
87 | * Used by page_zone() to look up the address of the struct zone whose | ||
88 | * id is encoded in the upper bits of page->flags | ||
89 | */ | ||
90 | struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; | ||
91 | EXPORT_SYMBOL(zone_table); | ||
92 | |||
93 | static char *zone_names[MAX_NR_ZONES] = { | ||
94 | "DMA", | 88 | "DMA", |
95 | #ifdef CONFIG_ZONE_DMA32 | 89 | #ifdef CONFIG_ZONE_DMA32 |
96 | "DMA32", | 90 | "DMA32", |
@@ -237,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
237 | int i; | 231 | int i; |
238 | int nr_pages = 1 << order; | 232 | int nr_pages = 1 << order; |
239 | 233 | ||
240 | page[1].lru.next = (void *)free_compound_page; /* set dtor */ | 234 | set_compound_page_dtor(page, free_compound_page); |
241 | page[1].lru.prev = (void *)order; | 235 | page[1].lru.prev = (void *)order; |
242 | for (i = 0; i < nr_pages; i++) { | 236 | for (i = 0; i < nr_pages; i++) { |
243 | struct page *p = page + i; | 237 | struct page *p = page + i; |
@@ -486,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order) | |||
486 | spin_lock(&zone->lock); | 480 | spin_lock(&zone->lock); |
487 | zone->all_unreclaimable = 0; | 481 | zone->all_unreclaimable = 0; |
488 | zone->pages_scanned = 0; | 482 | zone->pages_scanned = 0; |
489 | __free_one_page(page, zone ,order); | 483 | __free_one_page(page, zone, order); |
490 | spin_unlock(&zone->lock); | 484 | spin_unlock(&zone->lock); |
491 | } | 485 | } |
492 | 486 | ||
@@ -605,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
605 | 1 << PG_checked | 1 << PG_mappedtodisk); | 599 | 1 << PG_checked | 1 << PG_mappedtodisk); |
606 | set_page_private(page, 0); | 600 | set_page_private(page, 0); |
607 | set_page_refcounted(page); | 601 | set_page_refcounted(page); |
602 | |||
603 | arch_alloc_page(page, order); | ||
608 | kernel_map_pages(page, 1 << order, 1); | 604 | kernel_map_pages(page, 1 << order, 1); |
609 | 605 | ||
610 | if (gfp_flags & __GFP_ZERO) | 606 | if (gfp_flags & __GFP_ZERO) |
@@ -690,9 +686,15 @@ void drain_node_pages(int nodeid) | |||
690 | 686 | ||
691 | pcp = &pset->pcp[i]; | 687 | pcp = &pset->pcp[i]; |
692 | if (pcp->count) { | 688 | if (pcp->count) { |
689 | int to_drain; | ||
690 | |||
693 | local_irq_save(flags); | 691 | local_irq_save(flags); |
694 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 692 | if (pcp->count >= pcp->batch) |
695 | pcp->count = 0; | 693 | to_drain = pcp->batch; |
694 | else | ||
695 | to_drain = pcp->count; | ||
696 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
697 | pcp->count -= to_drain; | ||
696 | local_irq_restore(flags); | 698 | local_irq_restore(flags); |
697 | } | 699 | } |
698 | } | 700 | } |
@@ -700,7 +702,6 @@ void drain_node_pages(int nodeid) | |||
700 | } | 702 | } |
701 | #endif | 703 | #endif |
702 | 704 | ||
703 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | ||
704 | static void __drain_pages(unsigned int cpu) | 705 | static void __drain_pages(unsigned int cpu) |
705 | { | 706 | { |
706 | unsigned long flags; | 707 | unsigned long flags; |
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu) | |||
710 | for_each_zone(zone) { | 711 | for_each_zone(zone) { |
711 | struct per_cpu_pageset *pset; | 712 | struct per_cpu_pageset *pset; |
712 | 713 | ||
714 | if (!populated_zone(zone)) | ||
715 | continue; | ||
716 | |||
713 | pset = zone_pcp(zone, cpu); | 717 | pset = zone_pcp(zone, cpu); |
714 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 718 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
715 | struct per_cpu_pages *pcp; | 719 | struct per_cpu_pages *pcp; |
@@ -722,7 +726,6 @@ static void __drain_pages(unsigned int cpu) | |||
722 | } | 726 | } |
723 | } | 727 | } |
724 | } | 728 | } |
725 | #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ | ||
726 | 729 | ||
727 | #ifdef CONFIG_PM | 730 | #ifdef CONFIG_PM |
728 | 731 | ||
@@ -893,6 +896,91 @@ failed: | |||
893 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 896 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
894 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 897 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
895 | 898 | ||
899 | #ifdef CONFIG_FAIL_PAGE_ALLOC | ||
900 | |||
901 | static struct fail_page_alloc_attr { | ||
902 | struct fault_attr attr; | ||
903 | |||
904 | u32 ignore_gfp_highmem; | ||
905 | u32 ignore_gfp_wait; | ||
906 | |||
907 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
908 | |||
909 | struct dentry *ignore_gfp_highmem_file; | ||
910 | struct dentry *ignore_gfp_wait_file; | ||
911 | |||
912 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
913 | |||
914 | } fail_page_alloc = { | ||
915 | .attr = FAULT_ATTR_INITIALIZER, | ||
916 | .ignore_gfp_wait = 1, | ||
917 | .ignore_gfp_highmem = 1, | ||
918 | }; | ||
919 | |||
920 | static int __init setup_fail_page_alloc(char *str) | ||
921 | { | ||
922 | return setup_fault_attr(&fail_page_alloc.attr, str); | ||
923 | } | ||
924 | __setup("fail_page_alloc=", setup_fail_page_alloc); | ||
925 | |||
926 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
927 | { | ||
928 | if (gfp_mask & __GFP_NOFAIL) | ||
929 | return 0; | ||
930 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | ||
931 | return 0; | ||
932 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | ||
933 | return 0; | ||
934 | |||
935 | return should_fail(&fail_page_alloc.attr, 1 << order); | ||
936 | } | ||
937 | |||
938 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
939 | |||
940 | static int __init fail_page_alloc_debugfs(void) | ||
941 | { | ||
942 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
943 | struct dentry *dir; | ||
944 | int err; | ||
945 | |||
946 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
947 | "fail_page_alloc"); | ||
948 | if (err) | ||
949 | return err; | ||
950 | dir = fail_page_alloc.attr.dentries.dir; | ||
951 | |||
952 | fail_page_alloc.ignore_gfp_wait_file = | ||
953 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
954 | &fail_page_alloc.ignore_gfp_wait); | ||
955 | |||
956 | fail_page_alloc.ignore_gfp_highmem_file = | ||
957 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
958 | &fail_page_alloc.ignore_gfp_highmem); | ||
959 | |||
960 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
961 | !fail_page_alloc.ignore_gfp_highmem_file) { | ||
962 | err = -ENOMEM; | ||
963 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
964 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
965 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
966 | } | ||
967 | |||
968 | return err; | ||
969 | } | ||
970 | |||
971 | late_initcall(fail_page_alloc_debugfs); | ||
972 | |||
973 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
974 | |||
975 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | ||
976 | |||
977 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
978 | { | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | ||
983 | |||
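should_fail_alloc_page() above is only a gate; the actual policy lives in the generic fault_attr configured via the fail_page_alloc= boot parameter and debugfs. As a loose userspace analogue of such a gate, not the kernel's fault-injection code, a fail-every-Nth-call injector with a failure budget could look like this (all names here are made up for the sketch):

	#include <stdbool.h>

	/* Toy injector: fail every Nth eligible call, up to a fixed budget. */
	struct toy_fail_gate {
		unsigned int interval;	/* inject on every interval-th call */
		unsigned int remaining;	/* failures still allowed */
		unsigned int calls;	/* eligible calls seen so far */
	};

	bool toy_should_fail(struct toy_fail_gate *g, bool must_not_fail)
	{
		if (must_not_fail)	/* models the __GFP_NOFAIL exemption above */
			return false;
		if (!g->interval || !g->remaining)
			return false;
		if (++g->calls % g->interval)
			return false;
		g->remaining--;
		return true;
	}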
896 | /* | 984 | /* |
897 | * Return 1 if free pages are above 'mark'. This takes into account the order | 985 | * Return 1 if free pages are above 'mark'. This takes into account the order |
898 | * of the allocation. | 986 | * of the allocation. |
@@ -925,31 +1013,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
925 | return 1; | 1013 | return 1; |
926 | } | 1014 | } |
927 | 1015 | ||
1016 | #ifdef CONFIG_NUMA | ||
1017 | /* | ||
1018 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | ||
1019 | * skip over zones that are not allowed by the cpuset, or that have | ||
1020 | * been recently (in last second) found to be nearly full. See further | ||
1021 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | ||
1022 | * that have to skip over a lot of full or unallowed zones. | ||
1023 | * | ||
1024 | * If the zonelist cache is present in the passed in zonelist, then | ||
1025 | * returns a pointer to the allowed node mask (either the current | ||
1026 | * task's mems_allowed, or node_online_map). | ||
1027 | * | ||
1028 | * If the zonelist cache is not available for this zonelist, does | ||
1029 | * nothing and returns NULL. | ||
1030 | * | ||
1031 | * If the fullzones BITMAP in the zonelist cache is stale (more than | ||
1032 | * a second since last zap'd) then we zap it out (clear its bits.) | ||
1033 | * | ||
1034 | * We hold off even calling zlc_setup, until after we've checked the | ||
1035 | * first zone in the zonelist, on the theory that most allocations will | ||
1036 | * be satisfied from that first zone, so best to examine that zone as | ||
1037 | * quickly as we can. | ||
1038 | */ | ||
1039 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1040 | { | ||
1041 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1042 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | ||
1043 | |||
1044 | zlc = zonelist->zlcache_ptr; | ||
1045 | if (!zlc) | ||
1046 | return NULL; | ||
1047 | |||
1048 | if (jiffies - zlc->last_full_zap > 1 * HZ) { | ||
1049 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1050 | zlc->last_full_zap = jiffies; | ||
1051 | } | ||
1052 | |||
1053 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | ||
1054 | &cpuset_current_mems_allowed : | ||
1055 | &node_online_map; | ||
1056 | return allowednodes; | ||
1057 | } | ||
1058 | |||
1059 | /* | ||
1060 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | ||
1061 | * if it is worth looking at further for free memory: | ||
1062 | * 1) Check that the zone isn't thought to be full (doesn't have its | ||
1063 | * bit set in the zonelist_cache fullzones BITMAP). | ||
1064 | * 2) Check that the zone's node (obtained from the zonelist_cache | ||
1065 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | ||
1066 | * Return true (non-zero) if zone is worth looking at further, or | ||
1067 | * else return false (zero) if it is not. | ||
1068 | * | ||
1069 | * This check -ignores- the distinction between various watermarks, | ||
1070 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | ||
1071 | * found to be full for any variation of these watermarks, it will | ||
1072 | * be considered full for up to one second by all requests, unless | ||
1073 | * we are so low on memory on all allowed nodes that we are forced | ||
1074 | * into the second scan of the zonelist. | ||
1075 | * | ||
1076 | * In the second scan we ignore this zonelist cache and exactly | ||
1077 | * apply the watermarks to all zones, even if it is slower to do so. | ||
1078 | * We are low on memory in the second scan, and should leave no stone | ||
1079 | * unturned looking for a free page. | ||
1080 | */ | ||
1081 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1082 | nodemask_t *allowednodes) | ||
1083 | { | ||
1084 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1085 | int i; /* index of *z in zonelist zones */ | ||
1086 | int n; /* node that zone *z is on */ | ||
1087 | |||
1088 | zlc = zonelist->zlcache_ptr; | ||
1089 | if (!zlc) | ||
1090 | return 1; | ||
1091 | |||
1092 | i = z - zonelist->zones; | ||
1093 | n = zlc->z_to_n[i]; | ||
1094 | |||
1095 | /* This zone is worth trying if it is allowed but not full */ | ||
1096 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | ||
1097 | } | ||
1098 | |||
1099 | /* | ||
1100 | * Given 'z' scanning a zonelist, set the corresponding bit in | ||
1101 | * zlc->fullzones, so that subsequent attempts to allocate a page | ||
1102 | * from that zone don't waste time re-examining it. | ||
1103 | */ | ||
1104 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1105 | { | ||
1106 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1107 | int i; /* index of *z in zonelist zones */ | ||
1108 | |||
1109 | zlc = zonelist->zlcache_ptr; | ||
1110 | if (!zlc) | ||
1111 | return; | ||
1112 | |||
1113 | i = z - zonelist->zones; | ||
1114 | |||
1115 | set_bit(i, zlc->fullzones); | ||
1116 | } | ||
1117 | |||
1118 | #else /* CONFIG_NUMA */ | ||
1119 | |||
1120 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | ||
1121 | { | ||
1122 | return NULL; | ||
1123 | } | ||
1124 | |||
1125 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | ||
1126 | nodemask_t *allowednodes) | ||
1127 | { | ||
1128 | return 1; | ||
1129 | } | ||
1130 | |||
1131 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | ||
1132 | { | ||
1133 | } | ||
1134 | #endif /* CONFIG_NUMA */ | ||
1135 | |||
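Stripped of the kernel plumbing, the zonelist cache described above is a per-zonelist "recently full" bitmap plus a timestamp that gets cleared about once a second. A compact userspace model of that idea (field names, the fixed zone limit, and the time(2) clock are assumptions of this sketch):

	#include <stdbool.h>
	#include <string.h>
	#include <time.h>

	#define TOY_MAX_ZONES 64

	struct toy_zlc {
		unsigned char full[TOY_MAX_ZONES];	/* 1 = zone recently found full */
		time_t last_zap;			/* when the verdicts were last cleared */
	};

	/* Forget stale "full" verdicts roughly once per second. */
	static void toy_zlc_maybe_zap(struct toy_zlc *zlc)
	{
		time_t now = time(NULL);

		if (now - zlc->last_zap >= 1) {
			memset(zlc->full, 0, sizeof(zlc->full));
			zlc->last_zap = now;
		}
	}

	bool toy_zlc_worth_trying(struct toy_zlc *zlc, int zone)
	{
		toy_zlc_maybe_zap(zlc);
		return !zlc->full[zone];
	}

	void toy_zlc_mark_full(struct toy_zlc *zlc, int zone)
	{
		zlc->full[zone] = 1;
	}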
928 | /* | 1136 | /* |
929 | * get_page_from_freeliest goes through the zonelist trying to allocate | 1137 | * get_page_from_freelist goes through the zonelist trying to allocate |
930 | * a page. | 1138 | * a page. |
931 | */ | 1139 | */ |
932 | static struct page * | 1140 | static struct page * |
933 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1141 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
934 | struct zonelist *zonelist, int alloc_flags) | 1142 | struct zonelist *zonelist, int alloc_flags) |
935 | { | 1143 | { |
936 | struct zone **z = zonelist->zones; | 1144 | struct zone **z; |
937 | struct page *page = NULL; | 1145 | struct page *page = NULL; |
938 | int classzone_idx = zone_idx(*z); | 1146 | int classzone_idx = zone_idx(zonelist->zones[0]); |
939 | struct zone *zone; | 1147 | struct zone *zone; |
1148 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | ||
1149 | int zlc_active = 0; /* set if using zonelist_cache */ | ||
1150 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | ||
940 | 1151 | ||
1152 | zonelist_scan: | ||
941 | /* | 1153 | /* |
942 | * Go through the zonelist once, looking for a zone with enough free. | 1154 | * Scan zonelist, looking for a zone with enough free. |
943 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1155 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
944 | */ | 1156 | */ |
1157 | z = zonelist->zones; | ||
1158 | |||
945 | do { | 1159 | do { |
1160 | if (NUMA_BUILD && zlc_active && | ||
1161 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1162 | continue; | ||
946 | zone = *z; | 1163 | zone = *z; |
947 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | 1164 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
948 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1165 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
949 | break; | 1166 | break; |
950 | if ((alloc_flags & ALLOC_CPUSET) && | 1167 | if ((alloc_flags & ALLOC_CPUSET) && |
951 | !cpuset_zone_allowed(zone, gfp_mask)) | 1168 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
952 | continue; | 1169 | goto try_next_zone; |
953 | 1170 | ||
954 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1171 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
955 | unsigned long mark; | 1172 | unsigned long mark; |
@@ -959,18 +1176,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
959 | mark = zone->pages_low; | 1176 | mark = zone->pages_low; |
960 | else | 1177 | else |
961 | mark = zone->pages_high; | 1178 | mark = zone->pages_high; |
962 | if (!zone_watermark_ok(zone , order, mark, | 1179 | if (!zone_watermark_ok(zone, order, mark, |
963 | classzone_idx, alloc_flags)) | 1180 | classzone_idx, alloc_flags)) { |
964 | if (!zone_reclaim_mode || | 1181 | if (!zone_reclaim_mode || |
965 | !zone_reclaim(zone, gfp_mask, order)) | 1182 | !zone_reclaim(zone, gfp_mask, order)) |
966 | continue; | 1183 | goto this_zone_full; |
1184 | } | ||
967 | } | 1185 | } |
968 | 1186 | ||
969 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1187 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
970 | if (page) { | 1188 | if (page) |
971 | break; | 1189 | break; |
1190 | this_zone_full: | ||
1191 | if (NUMA_BUILD) | ||
1192 | zlc_mark_zone_full(zonelist, z); | ||
1193 | try_next_zone: | ||
1194 | if (NUMA_BUILD && !did_zlc_setup) { | ||
1195 | /* we do zlc_setup after the first zone is tried */ | ||
1196 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1197 | zlc_active = 1; | ||
1198 | did_zlc_setup = 1; | ||
972 | } | 1199 | } |
973 | } while (*(++z) != NULL); | 1200 | } while (*(++z) != NULL); |
1201 | |||
1202 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | ||
1203 | /* Disable zlc cache for second zonelist scan */ | ||
1204 | zlc_active = 0; | ||
1205 | goto zonelist_scan; | ||
1206 | } | ||
974 | return page; | 1207 | return page; |
975 | } | 1208 | } |
976 | 1209 | ||
@@ -992,6 +1225,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
992 | 1225 | ||
993 | might_sleep_if(wait); | 1226 | might_sleep_if(wait); |
994 | 1227 | ||
1228 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1229 | return NULL; | ||
1230 | |||
995 | restart: | 1231 | restart: |
996 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1232 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
997 | 1233 | ||
@@ -1005,9 +1241,19 @@ restart: | |||
1005 | if (page) | 1241 | if (page) |
1006 | goto got_pg; | 1242 | goto got_pg; |
1007 | 1243 | ||
1008 | do { | 1244 | /* |
1245 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | ||
1246 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | ||
1247 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | ||
1248 | * using a larger set of nodes after it has established that the | ||
1249 | * allowed per node queues are empty and that nodes are | ||
1250 | * over allocated. | ||
1251 | */ | ||
1252 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
1253 | goto nopage; | ||
1254 | |||
1255 | for (z = zonelist->zones; *z; z++) | ||
1009 | wakeup_kswapd(*z, order); | 1256 | wakeup_kswapd(*z, order); |
1010 | } while (*(++z)); | ||
1011 | 1257 | ||
1012 | /* | 1258 | /* |
1013 | * OK, we're below the kswapd watermark and have kicked background | 1259 | * OK, we're below the kswapd watermark and have kicked background |
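The new early exit relies on (gfp_mask & GFP_THISNODE) == GFP_THISNODE being a subset test: per the comment, GFP_THISNODE combines __GFP_THISNODE, __GFP_NORETRY and __GFP_NOWARN, so a bare & would also match allocations that set only one of those bits. A standalone illustration of the idiom, with placeholder flag values rather than the real gfp bits:

	#include <assert.h>

	#define F_THISNODE 0x1u
	#define F_NORETRY  0x2u
	#define F_NOWARN   0x4u
	#define F_COMBINED (F_THISNODE | F_NORETRY | F_NOWARN)

	int main(void)
	{
		unsigned int only_one = F_THISNODE;
		unsigned int all_three = F_COMBINED;

		/* subset test: true only when every bit of the combination is set */
		assert((all_three & F_COMBINED) == F_COMBINED);
		assert((only_one & F_COMBINED) != F_COMBINED);
		/* a bare '&' test would wrongly accept only_one as well */
		assert(only_one & F_COMBINED);
		return 0;
	}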
@@ -1041,6 +1287,7 @@ restart: | |||
1041 | 1287 | ||
1042 | /* This allocation should allow future memory freeing. */ | 1288 | /* This allocation should allow future memory freeing. */ |
1043 | 1289 | ||
1290 | rebalance: | ||
1044 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1291 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1045 | && !in_interrupt()) { | 1292 | && !in_interrupt()) { |
1046 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1293 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
@@ -1062,7 +1309,6 @@ nofail_alloc: | |||
1062 | if (!wait) | 1309 | if (!wait) |
1063 | goto nopage; | 1310 | goto nopage; |
1064 | 1311 | ||
1065 | rebalance: | ||
1066 | cond_resched(); | 1312 | cond_resched(); |
1067 | 1313 | ||
1068 | /* We now go into synchronous reclaim */ | 1314 | /* We now go into synchronous reclaim */ |
@@ -1262,7 +1508,7 @@ unsigned int nr_free_pagecache_pages(void) | |||
1262 | static inline void show_node(struct zone *zone) | 1508 | static inline void show_node(struct zone *zone) |
1263 | { | 1509 | { |
1264 | if (NUMA_BUILD) | 1510 | if (NUMA_BUILD) |
1265 | printk("Node %ld ", zone_to_nid(zone)); | 1511 | printk("Node %d ", zone_to_nid(zone)); |
1266 | } | 1512 | } |
1267 | 1513 | ||
1268 | void si_meminfo(struct sysinfo *val) | 1514 | void si_meminfo(struct sysinfo *val) |
@@ -1542,6 +1788,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1542 | } | 1788 | } |
1543 | } | 1789 | } |
1544 | 1790 | ||
1791 | /* Construct the zonelist performance cache - see further mmzone.h */ | ||
1792 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1793 | { | ||
1794 | int i; | ||
1795 | |||
1796 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1797 | struct zonelist *zonelist; | ||
1798 | struct zonelist_cache *zlc; | ||
1799 | struct zone **z; | ||
1800 | |||
1801 | zonelist = pgdat->node_zonelists + i; | ||
1802 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | ||
1803 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1804 | for (z = zonelist->zones; *z; z++) | ||
1805 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | ||
1806 | } | ||
1807 | } | ||
1808 | |||
1545 | #else /* CONFIG_NUMA */ | 1809 | #else /* CONFIG_NUMA */ |
1546 | 1810 | ||
1547 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1811 | static void __meminit build_zonelists(pg_data_t *pgdat) |
@@ -1579,14 +1843,26 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1579 | } | 1843 | } |
1580 | } | 1844 | } |
1581 | 1845 | ||
1846 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | ||
1847 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | ||
1848 | { | ||
1849 | int i; | ||
1850 | |||
1851 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
1852 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
1853 | } | ||
1854 | |||
1582 | #endif /* CONFIG_NUMA */ | 1855 | #endif /* CONFIG_NUMA */ |
1583 | 1856 | ||
1584 | /* return values int ....just for stop_machine_run() */ | 1857 | /* return values int ....just for stop_machine_run() */ |
1585 | static int __meminit __build_all_zonelists(void *dummy) | 1858 | static int __meminit __build_all_zonelists(void *dummy) |
1586 | { | 1859 | { |
1587 | int nid; | 1860 | int nid; |
1588 | for_each_online_node(nid) | 1861 | |
1862 | for_each_online_node(nid) { | ||
1589 | build_zonelists(NODE_DATA(nid)); | 1863 | build_zonelists(NODE_DATA(nid)); |
1864 | build_zonelist_cache(NODE_DATA(nid)); | ||
1865 | } | ||
1590 | return 0; | 1866 | return 0; |
1591 | } | 1867 | } |
1592 | 1868 | ||
@@ -1680,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
1680 | * done. Non-atomic initialization, single-pass. | 1956 | * done. Non-atomic initialization, single-pass. |
1681 | */ | 1957 | */ |
1682 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1958 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
1683 | unsigned long start_pfn) | 1959 | unsigned long start_pfn, enum memmap_context context) |
1684 | { | 1960 | { |
1685 | struct page *page; | 1961 | struct page *page; |
1686 | unsigned long end_pfn = start_pfn + size; | 1962 | unsigned long end_pfn = start_pfn + size; |
1687 | unsigned long pfn; | 1963 | unsigned long pfn; |
1688 | 1964 | ||
1689 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1965 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1690 | if (!early_pfn_valid(pfn)) | 1966 | /* |
1691 | continue; | 1967 | * There can be holes in boot-time mem_map[]s |
1692 | if (!early_pfn_in_nid(pfn, nid)) | 1968 | * handed to this function. They do not |
1693 | continue; | 1969 | * exist on hotplugged memory. |
1970 | */ | ||
1971 | if (context == MEMMAP_EARLY) { | ||
1972 | if (!early_pfn_valid(pfn)) | ||
1973 | continue; | ||
1974 | if (!early_pfn_in_nid(pfn, nid)) | ||
1975 | continue; | ||
1976 | } | ||
1694 | page = pfn_to_page(pfn); | 1977 | page = pfn_to_page(pfn); |
1695 | set_page_links(page, zone, nid, pfn); | 1978 | set_page_links(page, zone, nid, pfn); |
1696 | init_page_count(page); | 1979 | init_page_count(page); |
@@ -1715,23 +1998,9 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1715 | } | 1998 | } |
1716 | } | 1999 | } |
1717 | 2000 | ||
1718 | #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) | ||
1719 | void zonetable_add(struct zone *zone, int nid, enum zone_type zid, | ||
1720 | unsigned long pfn, unsigned long size) | ||
1721 | { | ||
1722 | unsigned long snum = pfn_to_section_nr(pfn); | ||
1723 | unsigned long end = pfn_to_section_nr(pfn + size); | ||
1724 | |||
1725 | if (FLAGS_HAS_NODE) | ||
1726 | zone_table[ZONETABLE_INDEX(nid, zid)] = zone; | ||
1727 | else | ||
1728 | for (; snum <= end; snum++) | ||
1729 | zone_table[ZONETABLE_INDEX(snum, zid)] = zone; | ||
1730 | } | ||
1731 | |||
1732 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 2001 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1733 | #define memmap_init(size, nid, zone, start_pfn) \ | 2002 | #define memmap_init(size, nid, zone, start_pfn) \ |
1734 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 2003 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
1735 | #endif | 2004 | #endif |
1736 | 2005 | ||
1737 | static int __cpuinit zone_batchsize(struct zone *zone) | 2006 | static int __cpuinit zone_batchsize(struct zone *zone) |
@@ -1881,16 +2150,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
1881 | int ret = NOTIFY_OK; | 2150 | int ret = NOTIFY_OK; |
1882 | 2151 | ||
1883 | switch (action) { | 2152 | switch (action) { |
1884 | case CPU_UP_PREPARE: | 2153 | case CPU_UP_PREPARE: |
1885 | if (process_zones(cpu)) | 2154 | if (process_zones(cpu)) |
1886 | ret = NOTIFY_BAD; | 2155 | ret = NOTIFY_BAD; |
1887 | break; | 2156 | break; |
1888 | case CPU_UP_CANCELED: | 2157 | case CPU_UP_CANCELED: |
1889 | case CPU_DEAD: | 2158 | case CPU_DEAD: |
1890 | free_zone_pagesets(cpu); | 2159 | free_zone_pagesets(cpu); |
1891 | break; | 2160 | break; |
1892 | default: | 2161 | default: |
1893 | break; | 2162 | break; |
1894 | } | 2163 | } |
1895 | return ret; | 2164 | return ret; |
1896 | } | 2165 | } |
@@ -1977,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
1977 | 2246 | ||
1978 | __meminit int init_currently_empty_zone(struct zone *zone, | 2247 | __meminit int init_currently_empty_zone(struct zone *zone, |
1979 | unsigned long zone_start_pfn, | 2248 | unsigned long zone_start_pfn, |
1980 | unsigned long size) | 2249 | unsigned long size, |
2250 | enum memmap_context context) | ||
1981 | { | 2251 | { |
1982 | struct pglist_data *pgdat = zone->zone_pgdat; | 2252 | struct pglist_data *pgdat = zone->zone_pgdat; |
1983 | int ret; | 2253 | int ret; |
@@ -2421,8 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2421 | if (!size) | 2691 | if (!size) |
2422 | continue; | 2692 | continue; |
2423 | 2693 | ||
2424 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 2694 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
2425 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 2695 | size, MEMMAP_EARLY); |
2426 | BUG_ON(ret); | 2696 | BUG_ON(ret); |
2427 | zone_start_pfn += size; | 2697 | zone_start_pfn += size; |
2428 | } | 2698 | } |
@@ -2736,7 +3006,6 @@ void __init free_area_init(unsigned long *zones_size) | |||
2736 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 3006 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2737 | } | 3007 | } |
2738 | 3008 | ||
2739 | #ifdef CONFIG_HOTPLUG_CPU | ||
2740 | static int page_alloc_cpu_notify(struct notifier_block *self, | 3009 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2741 | unsigned long action, void *hcpu) | 3010 | unsigned long action, void *hcpu) |
2742 | { | 3011 | { |
@@ -2751,7 +3020,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
2751 | } | 3020 | } |
2752 | return NOTIFY_OK; | 3021 | return NOTIFY_OK; |
2753 | } | 3022 | } |
2754 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2755 | 3023 | ||
2756 | void __init page_alloc_init(void) | 3024 | void __init page_alloc_init(void) |
2757 | { | 3025 | { |
@@ -3055,7 +3323,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3055 | /* allow the kernel cmdline to have a say */ | 3323 | /* allow the kernel cmdline to have a say */ |
3056 | if (!numentries) { | 3324 | if (!numentries) { |
3057 | /* round applicable memory size up to nearest megabyte */ | 3325 | /* round applicable memory size up to nearest megabyte */ |
3058 | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | 3326 | numentries = nr_kernel_pages; |
3059 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 3327 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
3060 | numentries >>= 20 - PAGE_SHIFT; | 3328 | numentries >>= 20 - PAGE_SHIFT; |
3061 | numentries <<= 20 - PAGE_SHIFT; | 3329 | numentries <<= 20 - PAGE_SHIFT; |
@@ -3065,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3065 | numentries >>= (scale - PAGE_SHIFT); | 3333 | numentries >>= (scale - PAGE_SHIFT); |
3066 | else | 3334 | else |
3067 | numentries <<= (PAGE_SHIFT - scale); | 3335 | numentries <<= (PAGE_SHIFT - scale); |
3336 | |||
3337 | /* Make sure we've got at least a 0-order allocation.. */ | ||
3338 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | ||
3339 | numentries = PAGE_SIZE / bucketsize; | ||
3068 | } | 3340 | } |
3069 | numentries = roundup_pow_of_two(numentries); | 3341 | numentries = roundup_pow_of_two(numentries); |
3070 | 3342 | ||
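The sizing logic above is easier to check with concrete numbers. Below is a standalone walk-through of the same arithmetic, assuming 4 KiB pages; the input values (pages of low memory, bucket size, scale) are placeholders, not taken from any particular system.

	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)

	static unsigned long roundup_pow2(unsigned long x)
	{
		unsigned long r = 1;

		while (r < x)
			r <<= 1;
		return r;
	}

	int main(void)
	{
		unsigned long nr_kernel_pages = 4000;	/* ~16 MB of low memory, made up */
		unsigned long bucketsize = 64;		/* bytes per hash bucket, made up */
		unsigned int scale = 14;		/* one entry per 16 KB of memory */
		unsigned long n = nr_kernel_pages;

		/* round the memory size up to a whole number of megabytes (in pages) */
		n += (1UL << (20 - PAGE_SHIFT)) - 1;
		n >>= 20 - PAGE_SHIFT;
		n <<= 20 - PAGE_SHIFT;

		/* scale down: one entry per 2^scale bytes of memory */
		if (scale > PAGE_SHIFT)
			n >>= scale - PAGE_SHIFT;
		else
			n <<= PAGE_SHIFT - scale;

		/* the new lower bound: never ask for less than one page of buckets */
		if (n * bucketsize < PAGE_SIZE)
			n = PAGE_SIZE / bucketsize;

		printf("%lu entries -> %lu after rounding to a power of two\n",
		       n, roundup_pow2(n));
		return 0;
	}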
@@ -3077,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3077 | if (numentries > max) | 3349 | if (numentries > max) |
3078 | numentries = max; | 3350 | numentries = max; |
3079 | 3351 | ||
3080 | log2qty = long_log2(numentries); | 3352 | log2qty = ilog2(numentries); |
3081 | 3353 | ||
3082 | do { | 3354 | do { |
3083 | size = bucketsize << log2qty; | 3355 | size = bucketsize << log2qty; |
@@ -3099,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3099 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3371 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
3100 | tablename, | 3372 | tablename, |
3101 | (1U << log2qty), | 3373 | (1U << log2qty), |
3102 | long_log2(size) - PAGE_SHIFT, | 3374 | ilog2(size) - PAGE_SHIFT, |
3103 | size); | 3375 | size); |
3104 | 3376 | ||
3105 | if (_hash_shift) | 3377 | if (_hash_shift) |
diff --git a/mm/page_io.c b/mm/page_io.c index d4840ecbf8f9..dbffec0d78c9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -147,48 +147,3 @@ int swap_readpage(struct file *file, struct page *page) | |||
147 | out: | 147 | out: |
148 | return ret; | 148 | return ret; |
149 | } | 149 | } |
150 | |||
151 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
152 | /* | ||
153 | * A scruffy utility function to read or write an arbitrary swap page | ||
154 | * and wait on the I/O. The caller must have a ref on the page. | ||
155 | * | ||
156 | * We use end_swap_bio_read() even for writes, because it happens to do what | ||
157 | * we want. | ||
158 | */ | ||
159 | int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, | ||
160 | struct bio **bio_chain) | ||
161 | { | ||
162 | struct bio *bio; | ||
163 | int ret = 0; | ||
164 | int bio_rw; | ||
165 | |||
166 | lock_page(page); | ||
167 | |||
168 | bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read); | ||
169 | if (bio == NULL) { | ||
170 | unlock_page(page); | ||
171 | ret = -ENOMEM; | ||
172 | goto out; | ||
173 | } | ||
174 | |||
175 | bio_rw = rw; | ||
176 | if (!bio_chain) | ||
177 | bio_rw |= (1 << BIO_RW_SYNC); | ||
178 | if (bio_chain) | ||
179 | bio_get(bio); | ||
180 | submit_bio(bio_rw, bio); | ||
181 | if (bio_chain == NULL) { | ||
182 | wait_on_page_locked(page); | ||
183 | |||
184 | if (!PageUptodate(page) || PageError(page)) | ||
185 | ret = -EIO; | ||
186 | } | ||
187 | if (bio_chain) { | ||
188 | bio->bi_private = *bio_chain; | ||
189 | *bio_chain = bio; | ||
190 | } | ||
191 | out: | ||
192 | return ret; | ||
193 | } | ||
194 | #endif | ||
diff --git a/mm/pdflush.c b/mm/pdflush.c index b02102feeb4b..8ce0900dc95c 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> // Prototypes pdflush_operation() | 21 | #include <linux/writeback.h> // Prototypes pdflush_operation() |
22 | #include <linux/kthread.h> | 22 | #include <linux/kthread.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | #include <linux/freezer.h> | ||
24 | 25 | ||
25 | 26 | ||
26 | /* | 27 | /* |
diff --git a/mm/readahead.c b/mm/readahead.c index 23cb61a01c6e..0f539e8e827a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/blkdev.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/task_io_accounting_ops.h> | ||
16 | #include <linux/pagevec.h> | 17 | #include <linux/pagevec.h> |
17 | 18 | ||
18 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 19 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
@@ -148,15 +149,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
148 | if (!pagevec_add(&lru_pvec, page)) | 149 | if (!pagevec_add(&lru_pvec, page)) |
149 | __pagevec_lru_add(&lru_pvec); | 150 | __pagevec_lru_add(&lru_pvec); |
150 | if (ret) { | 151 | if (ret) { |
151 | while (!list_empty(pages)) { | 152 | put_pages_list(pages); |
152 | struct page *victim; | ||
153 | |||
154 | victim = list_to_page(pages); | ||
155 | list_del(&victim->lru); | ||
156 | page_cache_release(victim); | ||
157 | } | ||
158 | break; | 153 | break; |
159 | } | 154 | } |
155 | task_io_account_read(PAGE_CACHE_SIZE); | ||
160 | } | 156 | } |
161 | pagevec_lru_add(&lru_pvec); | 157 | pagevec_lru_add(&lru_pvec); |
162 | return ret; | 158 | return ret; |
@@ -456,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, | |||
456 | * | 452 | * |
457 | * Note that @filp is purely used for passing on to the ->readpage[s]() | 453 | * Note that @filp is purely used for passing on to the ->readpage[s]() |
458 | * handler: it may refer to a different file from @mapping (so we may not use | 454 | * handler: it may refer to a different file from @mapping (so we may not use |
459 | * @filp->f_mapping or @filp->f_dentry->d_inode here). | 455 | * @filp->f_mapping or @filp->f_path.dentry->d_inode here). |
460 | * Also, @ra may not be equal to &@filp->f_ra. | 456 | * Also, @ra may not be equal to &@filp->f_ra. |
461 | * | 457 | * |
462 | */ | 458 | */ |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | ||
50 | 51 | ||
51 | #include <asm/tlbflush.h> | 52 | #include <asm/tlbflush.h> |
52 | 53 | ||
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
432 | { | 433 | { |
433 | struct mm_struct *mm = vma->vm_mm; | 434 | struct mm_struct *mm = vma->vm_mm; |
434 | unsigned long address; | 435 | unsigned long address; |
435 | pte_t *pte, entry; | 436 | pte_t *pte; |
436 | spinlock_t *ptl; | 437 | spinlock_t *ptl; |
437 | int ret = 0; | 438 | int ret = 0; |
438 | 439 | ||
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
444 | if (!pte) | 445 | if (!pte) |
445 | goto out; | 446 | goto out; |
446 | 447 | ||
447 | if (!pte_dirty(*pte) && !pte_write(*pte)) | 448 | if (pte_dirty(*pte) || pte_write(*pte)) { |
448 | goto unlock; | 449 | pte_t entry; |
449 | 450 | ||
450 | entry = ptep_get_and_clear(mm, address, pte); | 451 | flush_cache_page(vma, address, pte_pfn(*pte)); |
451 | entry = pte_mkclean(entry); | 452 | entry = ptep_clear_flush(vma, address, pte); |
452 | entry = pte_wrprotect(entry); | 453 | entry = pte_wrprotect(entry); |
453 | ptep_establish(vma, address, pte, entry); | 454 | entry = pte_mkclean(entry); |
454 | lazy_mmu_prot_update(entry); | 455 | set_pte_at(mm, address, pte, entry); |
455 | ret = 1; | 456 | lazy_mmu_prot_update(entry); |
457 | ret = 1; | ||
458 | } | ||
456 | 459 | ||
457 | unlock: | ||
458 | pte_unmap_unlock(pte, ptl); | 460 | pte_unmap_unlock(pte, ptl); |
459 | out: | 461 | out: |
460 | return ret; | 462 | return ret; |
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page) | |||
489 | if (mapping) | 491 | if (mapping) |
490 | ret = page_mkclean_file(mapping, page); | 492 | ret = page_mkclean_file(mapping, page); |
491 | } | 493 | } |
494 | if (page_test_and_clear_dirty(page)) | ||
495 | ret = 1; | ||
492 | 496 | ||
493 | return ret; | 497 | return ret; |
494 | } | 498 | } |
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page) | |||
567 | * | 571 | * |
568 | * The caller needs to hold the pte lock. | 572 | * The caller needs to hold the pte lock. |
569 | */ | 573 | */ |
570 | void page_remove_rmap(struct page *page) | 574 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) |
571 | { | 575 | { |
572 | if (atomic_add_negative(-1, &page->_mapcount)) { | 576 | if (atomic_add_negative(-1, &page->_mapcount)) { |
573 | if (unlikely(page_mapcount(page) < 0)) { | 577 | if (unlikely(page_mapcount(page) < 0)) { |
574 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 578 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
579 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); | ||
575 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 580 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
576 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 581 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
577 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 582 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
583 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | ||
584 | if (vma->vm_ops) | ||
585 | print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); | ||
586 | if (vma->vm_file && vma->vm_file->f_op) | ||
587 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); | ||
578 | BUG(); | 588 | BUG(); |
579 | } | 589 | } |
580 | 590 | ||
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
679 | dec_mm_counter(mm, file_rss); | 689 | dec_mm_counter(mm, file_rss); |
680 | 690 | ||
681 | 691 | ||
682 | page_remove_rmap(page); | 692 | page_remove_rmap(page, vma); |
683 | page_cache_release(page); | 693 | page_cache_release(page); |
684 | 694 | ||
685 | out_unmap: | 695 | out_unmap: |
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
769 | if (pte_dirty(pteval)) | 779 | if (pte_dirty(pteval)) |
770 | set_page_dirty(page); | 780 | set_page_dirty(page); |
771 | 781 | ||
772 | page_remove_rmap(page); | 782 | page_remove_rmap(page, vma); |
773 | page_cache_release(page); | 783 | page_cache_release(page); |
774 | dec_mm_counter(mm, file_rss); | 784 | dec_mm_counter(mm, file_rss); |
775 | (*mapcount)--; | 785 | (*mapcount)--; |
diff --git a/mm/shmem.c b/mm/shmem.c index 4959535fc14c..70da7a0981bf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -177,7 +177,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
177 | 177 | ||
178 | static struct super_operations shmem_ops; | 178 | static struct super_operations shmem_ops; |
179 | static const struct address_space_operations shmem_aops; | 179 | static const struct address_space_operations shmem_aops; |
180 | static struct file_operations shmem_file_operations; | 180 | static const struct file_operations shmem_file_operations; |
181 | static struct inode_operations shmem_inode_operations; | 181 | static struct inode_operations shmem_inode_operations; |
182 | static struct inode_operations shmem_dir_inode_operations; | 182 | static struct inode_operations shmem_dir_inode_operations; |
183 | static struct inode_operations shmem_special_inode_operations; | 183 | static struct inode_operations shmem_special_inode_operations; |
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | |||
515 | size = SHMEM_NR_DIRECT; | 515 | size = SHMEM_NR_DIRECT; |
516 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); | 516 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); |
517 | } | 517 | } |
518 | if (!topdir) | 518 | |
519 | /* | ||
520 | * If there are no indirect blocks or we are punching a hole | ||
521 | * below indirect blocks, nothing to be done. | ||
522 | */ | ||
523 | if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) | ||
519 | goto done2; | 524 | goto done2; |
520 | 525 | ||
521 | BUG_ON(limit <= SHMEM_NR_DIRECT); | 526 | BUG_ON(limit <= SHMEM_NR_DIRECT); |
@@ -1225,7 +1230,7 @@ failed: | |||
1225 | 1230 | ||
1226 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) | 1231 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) |
1227 | { | 1232 | { |
1228 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1233 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1229 | struct page *page = NULL; | 1234 | struct page *page = NULL; |
1230 | unsigned long idx; | 1235 | unsigned long idx; |
1231 | int error; | 1236 | int error; |
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1248 | unsigned long addr, unsigned long len, | 1253 | unsigned long addr, unsigned long len, |
1249 | pgprot_t prot, unsigned long pgoff, int nonblock) | 1254 | pgprot_t prot, unsigned long pgoff, int nonblock) |
1250 | { | 1255 | { |
1251 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1256 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1252 | struct mm_struct *mm = vma->vm_mm; | 1257 | struct mm_struct *mm = vma->vm_mm; |
1253 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; | 1258 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; |
1254 | unsigned long size; | 1259 | unsigned long size; |
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1293 | #ifdef CONFIG_NUMA | 1298 | #ifdef CONFIG_NUMA |
1294 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1299 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
1295 | { | 1300 | { |
1296 | struct inode *i = vma->vm_file->f_dentry->d_inode; | 1301 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1297 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1302 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
1298 | } | 1303 | } |
1299 | 1304 | ||
1300 | struct mempolicy * | 1305 | struct mempolicy * |
1301 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1306 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) |
1302 | { | 1307 | { |
1303 | struct inode *i = vma->vm_file->f_dentry->d_inode; | 1308 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1304 | unsigned long idx; | 1309 | unsigned long idx; |
1305 | 1310 | ||
1306 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1311 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | |||
1310 | 1315 | ||
1311 | int shmem_lock(struct file *file, int lock, struct user_struct *user) | 1316 | int shmem_lock(struct file *file, int lock, struct user_struct *user) |
1312 | { | 1317 | { |
1313 | struct inode *inode = file->f_dentry->d_inode; | 1318 | struct inode *inode = file->f_path.dentry->d_inode; |
1314 | struct shmem_inode_info *info = SHMEM_I(inode); | 1319 | struct shmem_inode_info *info = SHMEM_I(inode); |
1315 | int retval = -ENOMEM; | 1320 | int retval = -ENOMEM; |
1316 | 1321 | ||
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig | |||
1422 | static ssize_t | 1427 | static ssize_t |
1423 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | 1428 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) |
1424 | { | 1429 | { |
1425 | struct inode *inode = file->f_dentry->d_inode; | 1430 | struct inode *inode = file->f_path.dentry->d_inode; |
1426 | loff_t pos; | 1431 | loff_t pos; |
1427 | unsigned long written; | 1432 | unsigned long written; |
1428 | ssize_t err; | 1433 | ssize_t err; |
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t | |||
1442 | if (err || !count) | 1447 | if (err || !count) |
1443 | goto out; | 1448 | goto out; |
1444 | 1449 | ||
1445 | err = remove_suid(file->f_dentry); | 1450 | err = remove_suid(file->f_path.dentry); |
1446 | if (err) | 1451 | if (err) |
1447 | goto out; | 1452 | goto out; |
1448 | 1453 | ||
@@ -1524,7 +1529,7 @@ out: | |||
1524 | 1529 | ||
1525 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) | 1530 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) |
1526 | { | 1531 | { |
1527 | struct inode *inode = filp->f_dentry->d_inode; | 1532 | struct inode *inode = filp->f_path.dentry->d_inode; |
1528 | struct address_space *mapping = inode->i_mapping; | 1533 | struct address_space *mapping = inode->i_mapping; |
1529 | unsigned long index, offset; | 1534 | unsigned long index, offset; |
1530 | 1535 | ||
@@ -1943,7 +1948,7 @@ static int shmem_xattr_security_set(struct inode *inode, const char *name, | |||
1943 | return security_inode_setsecurity(inode, name, value, size, flags); | 1948 | return security_inode_setsecurity(inode, name, value, size, flags); |
1944 | } | 1949 | } |
1945 | 1950 | ||
1946 | struct xattr_handler shmem_xattr_security_handler = { | 1951 | static struct xattr_handler shmem_xattr_security_handler = { |
1947 | .prefix = XATTR_SECURITY_PREFIX, | 1952 | .prefix = XATTR_SECURITY_PREFIX, |
1948 | .list = shmem_xattr_security_list, | 1953 | .list = shmem_xattr_security_list, |
1949 | .get = shmem_xattr_security_get, | 1954 | .get = shmem_xattr_security_get, |
@@ -2263,7 +2268,7 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2263 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2268 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2264 | { | 2269 | { |
2265 | struct shmem_inode_info *p; | 2270 | struct shmem_inode_info *p; |
2266 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL); | 2271 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2267 | if (!p) | 2272 | if (!p) |
2268 | return NULL; | 2273 | return NULL; |
2269 | return &p->vfs_inode; | 2274 | return &p->vfs_inode; |
@@ -2319,7 +2324,7 @@ static const struct address_space_operations shmem_aops = { | |||
2319 | .migratepage = migrate_page, | 2324 | .migratepage = migrate_page, |
2320 | }; | 2325 | }; |
2321 | 2326 | ||
2322 | static struct file_operations shmem_file_operations = { | 2327 | static const struct file_operations shmem_file_operations = { |
2323 | .mmap = shmem_mmap, | 2328 | .mmap = shmem_mmap, |
2324 | #ifdef CONFIG_TMPFS | 2329 | #ifdef CONFIG_TMPFS |
2325 | .llseek = generic_file_llseek, | 2330 | .llseek = generic_file_llseek, |
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2493 | d_instantiate(dentry, inode); | 2498 | d_instantiate(dentry, inode); |
2494 | inode->i_size = size; | 2499 | inode->i_size = size; |
2495 | inode->i_nlink = 0; /* It is unlinked */ | 2500 | inode->i_nlink = 0; /* It is unlinked */ |
2496 | file->f_vfsmnt = mntget(shm_mnt); | 2501 | file->f_path.mnt = mntget(shm_mnt); |
2497 | file->f_dentry = dentry; | 2502 | file->f_path.dentry = dentry; |
2498 | file->f_mapping = inode->i_mapping; | 2503 | file->f_mapping = inode->i_mapping; |
2499 | file->f_op = &shmem_file_operations; | 2504 | file->f_op = &shmem_file_operations; |
2500 | file->f_mode = FMODE_WRITE | FMODE_READ; | 2505 | file->f_mode = FMODE_WRITE | FMODE_READ; |
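The shmem hunks above are part of the tree-wide switch from file->f_dentry / file->f_vfsmnt to an embedded struct path. As a rough sketch of the shape these conversions assume (illustrative fragments only, not the full VFS definitions; unrelated fields omitted):

struct path {
	struct vfsmount *mnt;		/* previously duplicated as file->f_vfsmnt */
	struct dentry *dentry;		/* previously duplicated as file->f_dentry */
};

struct file {
	struct path f_path;		/* f_path.mnt / f_path.dentry replace the old fields */
	/* ... */
};

/* old: inode = file->f_dentry->d_inode;
 * new: inode = file->f_path.dentry->d_inode; */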
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c
@@ -103,12 +103,14 @@ | |||
103 | #include <linux/module.h> | 103 | #include <linux/module.h> |
104 | #include <linux/rcupdate.h> | 104 | #include <linux/rcupdate.h> |
105 | #include <linux/string.h> | 105 | #include <linux/string.h> |
106 | #include <linux/uaccess.h> | ||
106 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> |
107 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> |
108 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> |
110 | #include <linux/fault-inject.h> | ||
109 | #include <linux/rtmutex.h> | 111 | #include <linux/rtmutex.h> |
112 | #include <linux/reciprocal_div.h> | ||
110 | 113 | ||
111 | #include <asm/uaccess.h> | ||
112 | #include <asm/cacheflush.h> | 114 | #include <asm/cacheflush.h> |
113 | #include <asm/tlbflush.h> | 115 | #include <asm/tlbflush.h> |
114 | #include <asm/page.h> | 116 | #include <asm/page.h> |
@@ -313,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache, | |||
313 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 315 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
314 | int node); | 316 | int node); |
315 | static int enable_cpucache(struct kmem_cache *cachep); | 317 | static int enable_cpucache(struct kmem_cache *cachep); |
316 | static void cache_reap(void *unused); | 318 | static void cache_reap(struct work_struct *unused); |
317 | 319 | ||
318 | /* | 320 | /* |
319 | * This function must be completely optimized away if a constant is passed to | 321 | * This function must be completely optimized away if a constant is passed to |
@@ -385,6 +387,7 @@ struct kmem_cache { | |||
385 | unsigned int shared; | 387 | unsigned int shared; |
386 | 388 | ||
387 | unsigned int buffer_size; | 389 | unsigned int buffer_size; |
390 | u32 reciprocal_buffer_size; | ||
388 | /* 3) touched by every alloc & free from the backend */ | 391 | /* 3) touched by every alloc & free from the backend */ |
389 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 392 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
390 | 393 | ||
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, | |||
626 | return slab->s_mem + cache->buffer_size * idx; | 629 | return slab->s_mem + cache->buffer_size * idx; |
627 | } | 630 | } |
628 | 631 | ||
629 | static inline unsigned int obj_to_index(struct kmem_cache *cache, | 632 | /* |
630 | struct slab *slab, void *obj) | 633 | * We want to avoid an expensive divide : (offset / cache->buffer_size) |
634 | * Using the fact that buffer_size is a constant for a particular cache, | ||
635 | * we can replace (offset / cache->buffer_size) by | ||
636 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | ||
637 | */ | ||
638 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | ||
639 | const struct slab *slab, void *obj) | ||
631 | { | 640 | { |
632 | return (unsigned)(obj - slab->s_mem) / cache->buffer_size; | 641 | u32 offset = (obj - slab->s_mem); |
642 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | ||
633 | } | 643 | } |
634 | 644 | ||
635 | /* | 645 | /* |
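The obj_to_index() rewrite above is the classic reciprocal-divide trick: because buffer_size is fixed per cache, the divide can be replaced by a multiply with a precomputed constant plus a shift. A standalone sketch of the arithmetic (the _sketch names are mine; the kernel's versions come from <linux/reciprocal_div.h>):

#include <stdint.h>

/* Precompute R = ceil(2^32 / B) once per cache, with B = buffer_size. */
static inline uint32_t reciprocal_value_sketch(uint32_t b)
{
	return (uint32_t)((((uint64_t)1 << 32) + b - 1) / b);
}

/* Then a / B becomes a 64-bit multiply and a shift. */
static inline uint32_t reciprocal_divide_sketch(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

/* Example: B = 192, offset = 5 * 192 = 960 -> index 5, no divide issued. */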
@@ -730,7 +740,10 @@ static inline void init_lock_keys(void) | |||
730 | } | 740 | } |
731 | #endif | 741 | #endif |
732 | 742 | ||
733 | /* Guard access to the cache-chain. */ | 743 | /* |
744 | * 1. Guard access to the cache-chain. | ||
745 | * 2. Protect sanity of cpu_online_map against cpu hotplug events | ||
746 | */ | ||
734 | static DEFINE_MUTEX(cache_chain_mutex); | 747 | static DEFINE_MUTEX(cache_chain_mutex); |
735 | static struct list_head cache_chain; | 748 | static struct list_head cache_chain; |
736 | 749 | ||
@@ -753,7 +766,7 @@ int slab_is_available(void) | |||
753 | return g_cpucache_up == FULL; | 766 | return g_cpucache_up == FULL; |
754 | } | 767 | } |
755 | 768 | ||
756 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 769 | static DEFINE_PER_CPU(struct delayed_work, reap_work); |
757 | 770 | ||
758 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 771 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
759 | { | 772 | { |
@@ -866,6 +879,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
866 | dump_stack(); | 879 | dump_stack(); |
867 | } | 880 | } |
868 | 881 | ||
882 | /* | ||
883 | * By default on NUMA we use alien caches to stage the freeing of | ||
884 | * objects allocated from other nodes. This causes massive memory | ||
885 | * inefficiencies when using fake NUMA setup to split memory into a | ||
886 | * large number of small nodes, so it can be disabled on the command | ||
887 | * line | ||
888 | */ | ||
889 | |||
890 | static int use_alien_caches __read_mostly = 1; | ||
891 | static int __init noaliencache_setup(char *s) | ||
892 | { | ||
893 | use_alien_caches = 0; | ||
894 | return 1; | ||
895 | } | ||
896 | __setup("noaliencache", noaliencache_setup); | ||
897 | |||
869 | #ifdef CONFIG_NUMA | 898 | #ifdef CONFIG_NUMA |
870 | /* | 899 | /* |
871 | * Special reaping functions for NUMA systems called from cache_reap(). | 900 | * Special reaping functions for NUMA systems called from cache_reap(). |
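The __setup() above registers a boot-time switch, so alien caches can be turned off without a rebuild. Purely as an illustration of the intended use case (numa=fake= is the x86-64 fake-NUMA knob of this era, shown only for context):

	# kernel command line for a fake-NUMA test box, alien caches disabled
	... numa=fake=16 noaliencache ...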
@@ -916,17 +945,18 @@ static void next_reap_node(void) | |||
916 | */ | 945 | */ |
917 | static void __devinit start_cpu_timer(int cpu) | 946 | static void __devinit start_cpu_timer(int cpu) |
918 | { | 947 | { |
919 | struct work_struct *reap_work = &per_cpu(reap_work, cpu); | 948 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); |
920 | 949 | ||
921 | /* | 950 | /* |
922 | * When this gets called from do_initcalls via cpucache_init(), | 951 | * When this gets called from do_initcalls via cpucache_init(), |
923 | * init_workqueues() has already run, so keventd will be setup | 952 | * init_workqueues() has already run, so keventd will be setup |
924 | * at that time. | 953 | * at that time. |
925 | */ | 954 | */ |
926 | if (keventd_up() && reap_work->func == NULL) { | 955 | if (keventd_up() && reap_work->work.func == NULL) { |
927 | init_reap_node(cpu); | 956 | init_reap_node(cpu); |
928 | INIT_WORK(reap_work, cache_reap, NULL); | 957 | INIT_DELAYED_WORK(reap_work, cache_reap); |
929 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | 958 | schedule_delayed_work_on(cpu, reap_work, |
959 | __round_jiffies_relative(HZ, cpu)); | ||
930 | } | 960 | } |
931 | } | 961 | } |
932 | 962 | ||
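start_cpu_timer() and cache_reap() follow the 2.6.20 workqueue conversion: delayed work gets its own struct delayed_work, and work functions now receive the work_struct pointer instead of an opaque void *. A minimal self-rearming sketch of that pattern (the my_* names are hypothetical; real users recover their state with container_of()):

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

struct my_state {
	int ticks;
	struct delayed_work dwork;	/* wraps a work_struct plus a timer */
};

static void my_work_fn(struct work_struct *work)
{
	/* The embedding object is recovered from the work_struct argument. */
	struct my_state *s = container_of(work, struct my_state, dwork.work);

	s->ticks++;
	/* Re-arm, rounded so periodic timers across CPUs batch their wakeups. */
	schedule_delayed_work(&s->dwork, round_jiffies_relative(HZ));
}

static void my_start(struct my_state *s)
{
	INIT_DELAYED_WORK(&s->dwork, my_work_fn);
	schedule_delayed_work(&s->dwork, HZ);
}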
@@ -996,7 +1026,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep, | |||
996 | return NULL; | 1026 | return NULL; |
997 | } | 1027 | } |
998 | 1028 | ||
999 | static inline void *__cache_alloc_node(struct kmem_cache *cachep, | 1029 | static inline void *____cache_alloc_node(struct kmem_cache *cachep, |
1000 | gfp_t flags, int nodeid) | 1030 | gfp_t flags, int nodeid) |
1001 | { | 1031 | { |
1002 | return NULL; | 1032 | return NULL; |
@@ -1004,7 +1034,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep, | |||
1004 | 1034 | ||
1005 | #else /* CONFIG_NUMA */ | 1035 | #else /* CONFIG_NUMA */ |
1006 | 1036 | ||
1007 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 1037 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
1008 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 1038 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
1009 | 1039 | ||
1010 | static struct array_cache **alloc_alien_cache(int node, int limit) | 1040 | static struct array_cache **alloc_alien_cache(int node, int limit) |
@@ -1114,7 +1144,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1114 | * Make sure we are not freeing a object from another node to the array | 1144 | * Make sure we are not freeing a object from another node to the array |
1115 | * cache on this cpu. | 1145 | * cache on this cpu. |
1116 | */ | 1146 | */ |
1117 | if (likely(slabp->nodeid == node)) | 1147 | if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) |
1118 | return 0; | 1148 | return 0; |
1119 | 1149 | ||
1120 | l3 = cachep->nodelists[node]; | 1150 | l3 = cachep->nodelists[node]; |
@@ -1192,7 +1222,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1192 | list_for_each_entry(cachep, &cache_chain, next) { | 1222 | list_for_each_entry(cachep, &cache_chain, next) { |
1193 | struct array_cache *nc; | 1223 | struct array_cache *nc; |
1194 | struct array_cache *shared; | 1224 | struct array_cache *shared; |
1195 | struct array_cache **alien; | 1225 | struct array_cache **alien = NULL; |
1196 | 1226 | ||
1197 | nc = alloc_arraycache(node, cachep->limit, | 1227 | nc = alloc_arraycache(node, cachep->limit, |
1198 | cachep->batchcount); | 1228 | cachep->batchcount); |
@@ -1204,9 +1234,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1204 | if (!shared) | 1234 | if (!shared) |
1205 | goto bad; | 1235 | goto bad; |
1206 | 1236 | ||
1207 | alien = alloc_alien_cache(node, cachep->limit); | 1237 | if (use_alien_caches) { |
1208 | if (!alien) | 1238 | alien = alloc_alien_cache(node, cachep->limit); |
1209 | goto bad; | 1239 | if (!alien) |
1240 | goto bad; | ||
1241 | } | ||
1210 | cachep->array[cpu] = nc; | 1242 | cachep->array[cpu] = nc; |
1211 | l3 = cachep->nodelists[node]; | 1243 | l3 = cachep->nodelists[node]; |
1212 | BUG_ON(!l3); | 1244 | BUG_ON(!l3); |
@@ -1230,12 +1262,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1230 | kfree(shared); | 1262 | kfree(shared); |
1231 | free_alien_cache(alien); | 1263 | free_alien_cache(alien); |
1232 | } | 1264 | } |
1233 | mutex_unlock(&cache_chain_mutex); | ||
1234 | break; | 1265 | break; |
1235 | case CPU_ONLINE: | 1266 | case CPU_ONLINE: |
1267 | mutex_unlock(&cache_chain_mutex); | ||
1236 | start_cpu_timer(cpu); | 1268 | start_cpu_timer(cpu); |
1237 | break; | 1269 | break; |
1238 | #ifdef CONFIG_HOTPLUG_CPU | 1270 | #ifdef CONFIG_HOTPLUG_CPU |
1271 | case CPU_DOWN_PREPARE: | ||
1272 | mutex_lock(&cache_chain_mutex); | ||
1273 | break; | ||
1274 | case CPU_DOWN_FAILED: | ||
1275 | mutex_unlock(&cache_chain_mutex); | ||
1276 | break; | ||
1239 | case CPU_DEAD: | 1277 | case CPU_DEAD: |
1240 | /* | 1278 | /* |
1241 | * Even if all the cpus of a node are down, we don't free the | 1279 | * Even if all the cpus of a node are down, we don't free the |
@@ -1246,8 +1284,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1246 | * gets destroyed at kmem_cache_destroy(). | 1284 | * gets destroyed at kmem_cache_destroy(). |
1247 | */ | 1285 | */ |
1248 | /* fall thru */ | 1286 | /* fall thru */ |
1287 | #endif | ||
1249 | case CPU_UP_CANCELED: | 1288 | case CPU_UP_CANCELED: |
1250 | mutex_lock(&cache_chain_mutex); | ||
1251 | list_for_each_entry(cachep, &cache_chain, next) { | 1289 | list_for_each_entry(cachep, &cache_chain, next) { |
1252 | struct array_cache *nc; | 1290 | struct array_cache *nc; |
1253 | struct array_cache *shared; | 1291 | struct array_cache *shared; |
@@ -1308,11 +1346,9 @@ free_array_cache: | |||
1308 | } | 1346 | } |
1309 | mutex_unlock(&cache_chain_mutex); | 1347 | mutex_unlock(&cache_chain_mutex); |
1310 | break; | 1348 | break; |
1311 | #endif | ||
1312 | } | 1349 | } |
1313 | return NOTIFY_OK; | 1350 | return NOTIFY_OK; |
1314 | bad: | 1351 | bad: |
1315 | mutex_unlock(&cache_chain_mutex); | ||
1316 | return NOTIFY_BAD; | 1352 | return NOTIFY_BAD; |
1317 | } | 1353 | } |
1318 | 1354 | ||
@@ -1400,6 +1436,8 @@ void __init kmem_cache_init(void) | |||
1400 | 1436 | ||
1401 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, | 1437 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, |
1402 | cache_line_size()); | 1438 | cache_line_size()); |
1439 | cache_cache.reciprocal_buffer_size = | ||
1440 | reciprocal_value(cache_cache.buffer_size); | ||
1403 | 1441 | ||
1404 | for (order = 0; order < MAX_ORDER; order++) { | 1442 | for (order = 0; order < MAX_ORDER; order++) { |
1405 | cache_estimate(order, cache_cache.buffer_size, | 1443 | cache_estimate(order, cache_cache.buffer_size, |
@@ -1580,12 +1618,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1580 | flags |= __GFP_COMP; | 1618 | flags |= __GFP_COMP; |
1581 | #endif | 1619 | #endif |
1582 | 1620 | ||
1583 | /* | 1621 | flags |= cachep->gfpflags; |
1584 | * Under NUMA we want memory on the indicated node. We will handle | ||
1585 | * the needed fallback ourselves since we want to serve from our | ||
1586 | * per node object lists first for other nodes. | ||
1587 | */ | ||
1588 | flags |= cachep->gfpflags | GFP_THISNODE; | ||
1589 | 1622 | ||
1590 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1623 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
1591 | if (!page) | 1624 | if (!page) |
@@ -2098,15 +2131,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2098 | } | 2131 | } |
2099 | 2132 | ||
2100 | /* | 2133 | /* |
2101 | * Prevent CPUs from coming and going. | 2134 | * We use cache_chain_mutex to ensure a consistent view of |
2102 | * lock_cpu_hotplug() nests outside cache_chain_mutex | 2135 | * cpu_online_map as well. Please see cpuup_callback |
2103 | */ | 2136 | */ |
2104 | lock_cpu_hotplug(); | ||
2105 | |||
2106 | mutex_lock(&cache_chain_mutex); | 2137 | mutex_lock(&cache_chain_mutex); |
2107 | 2138 | ||
2108 | list_for_each_entry(pc, &cache_chain, next) { | 2139 | list_for_each_entry(pc, &cache_chain, next) { |
2109 | mm_segment_t old_fs = get_fs(); | ||
2110 | char tmp; | 2140 | char tmp; |
2111 | int res; | 2141 | int res; |
2112 | 2142 | ||
@@ -2115,9 +2145,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2115 | * destroy its slab cache and no-one else reuses the vmalloc | 2145 | * destroy its slab cache and no-one else reuses the vmalloc |
2116 | * area of the module. Print a warning. | 2146 | * area of the module. Print a warning. |
2117 | */ | 2147 | */ |
2118 | set_fs(KERNEL_DS); | 2148 | res = probe_kernel_address(pc->name, tmp); |
2119 | res = __get_user(tmp, pc->name); | ||
2120 | set_fs(old_fs); | ||
2121 | if (res) { | 2149 | if (res) { |
2122 | printk("SLAB: cache with size %d has lost its name\n", | 2150 | printk("SLAB: cache with size %d has lost its name\n", |
2123 | pc->buffer_size); | 2151 | pc->buffer_size); |
@@ -2197,25 +2225,24 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2197 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | 2225 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) |
2198 | ralign = BYTES_PER_WORD; | 2226 | ralign = BYTES_PER_WORD; |
2199 | 2227 | ||
2200 | /* 2) arch mandated alignment: disables debug if necessary */ | 2228 | /* 2) arch mandated alignment */ |
2201 | if (ralign < ARCH_SLAB_MINALIGN) { | 2229 | if (ralign < ARCH_SLAB_MINALIGN) { |
2202 | ralign = ARCH_SLAB_MINALIGN; | 2230 | ralign = ARCH_SLAB_MINALIGN; |
2203 | if (ralign > BYTES_PER_WORD) | ||
2204 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2205 | } | 2231 | } |
2206 | /* 3) caller mandated alignment: disables debug if necessary */ | 2232 | /* 3) caller mandated alignment */ |
2207 | if (ralign < align) { | 2233 | if (ralign < align) { |
2208 | ralign = align; | 2234 | ralign = align; |
2209 | if (ralign > BYTES_PER_WORD) | ||
2210 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2211 | } | 2235 | } |
2236 | /* disable debug if necessary */ | ||
2237 | if (ralign > BYTES_PER_WORD) | ||
2238 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
2212 | /* | 2239 | /* |
2213 | * 4) Store it. | 2240 | * 4) Store it. |
2214 | */ | 2241 | */ |
2215 | align = ralign; | 2242 | align = ralign; |
2216 | 2243 | ||
2217 | /* Get cache's description obj. */ | 2244 | /* Get cache's description obj. */ |
2218 | cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); | 2245 | cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); |
2219 | if (!cachep) | 2246 | if (!cachep) |
2220 | goto oops; | 2247 | goto oops; |
2221 | 2248 | ||
@@ -2297,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2297 | if (flags & SLAB_CACHE_DMA) | 2324 | if (flags & SLAB_CACHE_DMA) |
2298 | cachep->gfpflags |= GFP_DMA; | 2325 | cachep->gfpflags |= GFP_DMA; |
2299 | cachep->buffer_size = size; | 2326 | cachep->buffer_size = size; |
2327 | cachep->reciprocal_buffer_size = reciprocal_value(size); | ||
2300 | 2328 | ||
2301 | if (flags & CFLGS_OFF_SLAB) { | 2329 | if (flags & CFLGS_OFF_SLAB) { |
2302 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); | 2330 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); |
@@ -2326,7 +2354,6 @@ oops: | |||
2326 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2354 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2327 | name); | 2355 | name); |
2328 | mutex_unlock(&cache_chain_mutex); | 2356 | mutex_unlock(&cache_chain_mutex); |
2329 | unlock_cpu_hotplug(); | ||
2330 | return cachep; | 2357 | return cachep; |
2331 | } | 2358 | } |
2332 | EXPORT_SYMBOL(kmem_cache_create); | 2359 | EXPORT_SYMBOL(kmem_cache_create); |
@@ -2444,6 +2471,7 @@ out: | |||
2444 | return nr_freed; | 2471 | return nr_freed; |
2445 | } | 2472 | } |
2446 | 2473 | ||
2474 | /* Called with cache_chain_mutex held to protect against cpu hotplug */ | ||
2447 | static int __cache_shrink(struct kmem_cache *cachep) | 2475 | static int __cache_shrink(struct kmem_cache *cachep) |
2448 | { | 2476 | { |
2449 | int ret = 0, i = 0; | 2477 | int ret = 0, i = 0; |
@@ -2474,9 +2502,13 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2474 | */ | 2502 | */ |
2475 | int kmem_cache_shrink(struct kmem_cache *cachep) | 2503 | int kmem_cache_shrink(struct kmem_cache *cachep) |
2476 | { | 2504 | { |
2505 | int ret; | ||
2477 | BUG_ON(!cachep || in_interrupt()); | 2506 | BUG_ON(!cachep || in_interrupt()); |
2478 | 2507 | ||
2479 | return __cache_shrink(cachep); | 2508 | mutex_lock(&cache_chain_mutex); |
2509 | ret = __cache_shrink(cachep); | ||
2510 | mutex_unlock(&cache_chain_mutex); | ||
2511 | return ret; | ||
2480 | } | 2512 | } |
2481 | EXPORT_SYMBOL(kmem_cache_shrink); | 2513 | EXPORT_SYMBOL(kmem_cache_shrink); |
2482 | 2514 | ||
@@ -2500,23 +2532,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2500 | { | 2532 | { |
2501 | BUG_ON(!cachep || in_interrupt()); | 2533 | BUG_ON(!cachep || in_interrupt()); |
2502 | 2534 | ||
2503 | /* Don't let CPUs to come and go */ | ||
2504 | lock_cpu_hotplug(); | ||
2505 | |||
2506 | /* Find the cache in the chain of caches. */ | 2535 | /* Find the cache in the chain of caches. */ |
2507 | mutex_lock(&cache_chain_mutex); | 2536 | mutex_lock(&cache_chain_mutex); |
2508 | /* | 2537 | /* |
2509 | * the chain is never empty, cache_cache is never destroyed | 2538 | * the chain is never empty, cache_cache is never destroyed |
2510 | */ | 2539 | */ |
2511 | list_del(&cachep->next); | 2540 | list_del(&cachep->next); |
2512 | mutex_unlock(&cache_chain_mutex); | ||
2513 | |||
2514 | if (__cache_shrink(cachep)) { | 2541 | if (__cache_shrink(cachep)) { |
2515 | slab_error(cachep, "Can't free all objects"); | 2542 | slab_error(cachep, "Can't free all objects"); |
2516 | mutex_lock(&cache_chain_mutex); | ||
2517 | list_add(&cachep->next, &cache_chain); | 2543 | list_add(&cachep->next, &cache_chain); |
2518 | mutex_unlock(&cache_chain_mutex); | 2544 | mutex_unlock(&cache_chain_mutex); |
2519 | unlock_cpu_hotplug(); | ||
2520 | return; | 2545 | return; |
2521 | } | 2546 | } |
2522 | 2547 | ||
@@ -2524,7 +2549,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2524 | synchronize_rcu(); | 2549 | synchronize_rcu(); |
2525 | 2550 | ||
2526 | __kmem_cache_destroy(cachep); | 2551 | __kmem_cache_destroy(cachep); |
2527 | unlock_cpu_hotplug(); | 2552 | mutex_unlock(&cache_chain_mutex); |
2528 | } | 2553 | } |
2529 | EXPORT_SYMBOL(kmem_cache_destroy); | 2554 | EXPORT_SYMBOL(kmem_cache_destroy); |
2530 | 2555 | ||
@@ -2548,7 +2573,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2548 | if (OFF_SLAB(cachep)) { | 2573 | if (OFF_SLAB(cachep)) { |
2549 | /* Slab management obj is off-slab. */ | 2574 | /* Slab management obj is off-slab. */ |
2550 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, | 2575 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, |
2551 | local_flags, nodeid); | 2576 | local_flags & ~GFP_THISNODE, nodeid); |
2552 | if (!slabp) | 2577 | if (!slabp) |
2553 | return NULL; | 2578 | return NULL; |
2554 | } else { | 2579 | } else { |
@@ -2618,7 +2643,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2618 | 2643 | ||
2619 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2644 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
2620 | { | 2645 | { |
2621 | if (flags & SLAB_DMA) | 2646 | if (flags & GFP_DMA) |
2622 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); | 2647 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); |
2623 | else | 2648 | else |
2624 | BUG_ON(cachep->gfpflags & GFP_DMA); | 2649 | BUG_ON(cachep->gfpflags & GFP_DMA); |
@@ -2689,10 +2714,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | |||
2689 | * Grow (by 1) the number of slabs within a cache. This is called by | 2714 | * Grow (by 1) the number of slabs within a cache. This is called by |
2690 | * kmem_cache_alloc() when there are no active objs left in a cache. | 2715 | * kmem_cache_alloc() when there are no active objs left in a cache. |
2691 | */ | 2716 | */ |
2692 | static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 2717 | static int cache_grow(struct kmem_cache *cachep, |
2718 | gfp_t flags, int nodeid, void *objp) | ||
2693 | { | 2719 | { |
2694 | struct slab *slabp; | 2720 | struct slab *slabp; |
2695 | void *objp; | ||
2696 | size_t offset; | 2721 | size_t offset; |
2697 | gfp_t local_flags; | 2722 | gfp_t local_flags; |
2698 | unsigned long ctor_flags; | 2723 | unsigned long ctor_flags; |
@@ -2702,12 +2727,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2702 | * Be lazy and only check for valid flags here, keeping it out of the | 2727 | * Be lazy and only check for valid flags here, keeping it out of the |
2703 | * critical path in kmem_cache_alloc(). | 2728 | * critical path in kmem_cache_alloc(). |
2704 | */ | 2729 | */ |
2705 | BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)); | 2730 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); |
2706 | if (flags & SLAB_NO_GROW) | 2731 | if (flags & __GFP_NO_GROW) |
2707 | return 0; | 2732 | return 0; |
2708 | 2733 | ||
2709 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2734 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
2710 | local_flags = (flags & SLAB_LEVEL_MASK); | 2735 | local_flags = (flags & GFP_LEVEL_MASK); |
2711 | if (!(local_flags & __GFP_WAIT)) | 2736 | if (!(local_flags & __GFP_WAIT)) |
2712 | /* | 2737 | /* |
2713 | * Not allowed to sleep. Need to tell a constructor about | 2738 | * Not allowed to sleep. Need to tell a constructor about |
@@ -2744,12 +2769,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2744 | * Get mem for the objs. Attempt to allocate a physical page from | 2769 | * Get mem for the objs. Attempt to allocate a physical page from |
2745 | * 'nodeid'. | 2770 | * 'nodeid'. |
2746 | */ | 2771 | */ |
2747 | objp = kmem_getpages(cachep, flags, nodeid); | 2772 | if (!objp) |
2773 | objp = kmem_getpages(cachep, flags, nodeid); | ||
2748 | if (!objp) | 2774 | if (!objp) |
2749 | goto failed; | 2775 | goto failed; |
2750 | 2776 | ||
2751 | /* Get slab management. */ | 2777 | /* Get slab management. */ |
2752 | slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid); | 2778 | slabp = alloc_slabmgmt(cachep, objp, offset, |
2779 | local_flags & ~GFP_THISNODE, nodeid); | ||
2753 | if (!slabp) | 2780 | if (!slabp) |
2754 | goto opps1; | 2781 | goto opps1; |
2755 | 2782 | ||
@@ -2987,7 +3014,7 @@ alloc_done: | |||
2987 | 3014 | ||
2988 | if (unlikely(!ac->avail)) { | 3015 | if (unlikely(!ac->avail)) { |
2989 | int x; | 3016 | int x; |
2990 | x = cache_grow(cachep, flags, node); | 3017 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
2991 | 3018 | ||
2992 | /* cache_grow can reenable interrupts, then ac could change. */ | 3019 | /* cache_grow can reenable interrupts, then ac could change. */ |
2993 | ac = cpu_cache_get(cachep); | 3020 | ac = cpu_cache_get(cachep); |
@@ -3063,18 +3090,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3063 | 3090 | ||
3064 | cachep->ctor(objp, cachep, ctor_flags); | 3091 | cachep->ctor(objp, cachep, ctor_flags); |
3065 | } | 3092 | } |
3093 | #if ARCH_SLAB_MINALIGN | ||
3094 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | ||
3095 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | ||
3096 | objp, ARCH_SLAB_MINALIGN); | ||
3097 | } | ||
3098 | #endif | ||
3066 | return objp; | 3099 | return objp; |
3067 | } | 3100 | } |
3068 | #else | 3101 | #else |
3069 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) | 3102 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) |
3070 | #endif | 3103 | #endif |
3071 | 3104 | ||
3105 | #ifdef CONFIG_FAILSLAB | ||
3106 | |||
3107 | static struct failslab_attr { | ||
3108 | |||
3109 | struct fault_attr attr; | ||
3110 | |||
3111 | u32 ignore_gfp_wait; | ||
3112 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
3113 | struct dentry *ignore_gfp_wait_file; | ||
3114 | #endif | ||
3115 | |||
3116 | } failslab = { | ||
3117 | .attr = FAULT_ATTR_INITIALIZER, | ||
3118 | .ignore_gfp_wait = 1, | ||
3119 | }; | ||
3120 | |||
3121 | static int __init setup_failslab(char *str) | ||
3122 | { | ||
3123 | return setup_fault_attr(&failslab.attr, str); | ||
3124 | } | ||
3125 | __setup("failslab=", setup_failslab); | ||
3126 | |||
3127 | static int should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
3128 | { | ||
3129 | if (cachep == &cache_cache) | ||
3130 | return 0; | ||
3131 | if (flags & __GFP_NOFAIL) | ||
3132 | return 0; | ||
3133 | if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) | ||
3134 | return 0; | ||
3135 | |||
3136 | return should_fail(&failslab.attr, obj_size(cachep)); | ||
3137 | } | ||
3138 | |||
3139 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
3140 | |||
3141 | static int __init failslab_debugfs(void) | ||
3142 | { | ||
3143 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
3144 | struct dentry *dir; | ||
3145 | int err; | ||
3146 | |||
3147 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | ||
3148 | if (err) | ||
3149 | return err; | ||
3150 | dir = failslab.attr.dentries.dir; | ||
3151 | |||
3152 | failslab.ignore_gfp_wait_file = | ||
3153 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
3154 | &failslab.ignore_gfp_wait); | ||
3155 | |||
3156 | if (!failslab.ignore_gfp_wait_file) { | ||
3157 | err = -ENOMEM; | ||
3158 | debugfs_remove(failslab.ignore_gfp_wait_file); | ||
3159 | cleanup_fault_attr_dentries(&failslab.attr); | ||
3160 | } | ||
3161 | |||
3162 | return err; | ||
3163 | } | ||
3164 | |||
3165 | late_initcall(failslab_debugfs); | ||
3166 | |||
3167 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
3168 | |||
3169 | #else /* CONFIG_FAILSLAB */ | ||
3170 | |||
3171 | static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
3172 | { | ||
3173 | return 0; | ||
3174 | } | ||
3175 | |||
3176 | #endif /* CONFIG_FAILSLAB */ | ||
3177 | |||
3072 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3178 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3073 | { | 3179 | { |
3074 | void *objp; | 3180 | void *objp; |
3075 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3076 | 3182 | ||
3077 | check_irq_off(); | 3183 | check_irq_off(); |
3184 | |||
3185 | if (should_failslab(cachep, flags)) | ||
3186 | return NULL; | ||
3187 | |||
3078 | ac = cpu_cache_get(cachep); | 3188 | ac = cpu_cache_get(cachep); |
3079 | if (likely(ac->avail)) { | 3189 | if (likely(ac->avail)) { |
3080 | STATS_INC_ALLOCHIT(cachep); | 3190 | STATS_INC_ALLOCHIT(cachep); |
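The failslab block above hooks the generic fault-injection framework into every slab allocation so that error paths can be exercised deliberately. As a usage illustration only (the exact parameter order is whatever setup_fault_attr() parses, and the debugfs path assumes debugfs is mounted at /sys/kernel/debug):

	# boot-time, roughly failslab=<interval>,<probability>,<space>,<times>
	... failslab=1,10,0,-1 ...

	# run-time, with CONFIG_FAULT_INJECTION_DEBUG_FS: also fail __GFP_WAIT allocations
	echo 0 > /sys/kernel/debug/failslab/ignore-gfp-wait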
@@ -3105,10 +3215,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
3105 | objp = ____cache_alloc(cachep, flags); | 3215 | objp = ____cache_alloc(cachep, flags); |
3106 | /* | 3216 | /* |
3107 | * We may just have run out of memory on the local node. | 3217 | * We may just have run out of memory on the local node. |
3108 | * __cache_alloc_node() knows how to locate memory on other nodes | 3218 | * ____cache_alloc_node() knows how to locate memory on other nodes |
3109 | */ | 3219 | */ |
3110 | if (NUMA_BUILD && !objp) | 3220 | if (NUMA_BUILD && !objp) |
3111 | objp = __cache_alloc_node(cachep, flags, numa_node_id()); | 3221 | objp = ____cache_alloc_node(cachep, flags, numa_node_id()); |
3112 | local_irq_restore(save_flags); | 3222 | local_irq_restore(save_flags); |
3113 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 3223 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
3114 | caller); | 3224 | caller); |
@@ -3135,15 +3245,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3135 | else if (current->mempolicy) | 3245 | else if (current->mempolicy) |
3136 | nid_alloc = slab_node(current->mempolicy); | 3246 | nid_alloc = slab_node(current->mempolicy); |
3137 | if (nid_alloc != nid_here) | 3247 | if (nid_alloc != nid_here) |
3138 | return __cache_alloc_node(cachep, flags, nid_alloc); | 3248 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3139 | return NULL; | 3249 | return NULL; |
3140 | } | 3250 | } |
3141 | 3251 | ||
3142 | /* | 3252 | /* |
3143 | * Fallback function if there was no memory available and no objects on a | 3253 | * Fallback function if there was no memory available and no objects on a |
3144 | * certain node and we are allowed to fall back. We mimick the behavior of | 3254 | * certain node and fall back is permitted. First we scan all the |
3145 | * the page allocator. We fall back according to a zonelist determined by | 3255 | * available nodelists for available objects. If that fails then we |
3146 | * the policy layer while obeying cpuset constraints. | 3256 | * perform an allocation without specifying a node. This allows the page |
3257 | * allocator to do its reclaim / fallback magic. We then insert the | ||
3258 | * slab into the proper nodelist and then allocate from it. | ||
3147 | */ | 3259 | */ |
3148 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | 3260 | void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) |
3149 | { | 3261 | { |
@@ -3151,15 +3263,57 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3151 | ->node_zonelists[gfp_zone(flags)]; | 3263 | ->node_zonelists[gfp_zone(flags)]; |
3152 | struct zone **z; | 3264 | struct zone **z; |
3153 | void *obj = NULL; | 3265 | void *obj = NULL; |
3266 | int nid; | ||
3267 | gfp_t local_flags = (flags & GFP_LEVEL_MASK); | ||
3154 | 3268 | ||
3269 | retry: | ||
3270 | /* | ||
3271 | * Look through allowed nodes for objects available | ||
3272 | * from existing per node queues. | ||
3273 | */ | ||
3155 | for (z = zonelist->zones; *z && !obj; z++) { | 3274 | for (z = zonelist->zones; *z && !obj; z++) { |
3156 | int nid = zone_to_nid(*z); | 3275 | nid = zone_to_nid(*z); |
3157 | 3276 | ||
3158 | if (zone_idx(*z) <= ZONE_NORMAL && | 3277 | if (cpuset_zone_allowed_hardwall(*z, flags) && |
3159 | cpuset_zone_allowed(*z, flags) && | 3278 | cache->nodelists[nid] && |
3160 | cache->nodelists[nid]) | 3279 | cache->nodelists[nid]->free_objects) |
3161 | obj = __cache_alloc_node(cache, | 3280 | obj = ____cache_alloc_node(cache, |
3162 | flags | __GFP_THISNODE, nid); | 3281 | flags | GFP_THISNODE, nid); |
3282 | } | ||
3283 | |||
3284 | if (!obj && !(flags & __GFP_NO_GROW)) { | ||
3285 | /* | ||
3286 | * This allocation will be performed within the constraints | ||
3287 | * of the current cpuset / memory policy requirements. | ||
3288 | * We may trigger various forms of reclaim on the allowed | ||
3289 | * set and go into memory reserves if necessary. | ||
3290 | */ | ||
3291 | if (local_flags & __GFP_WAIT) | ||
3292 | local_irq_enable(); | ||
3293 | kmem_flagcheck(cache, flags); | ||
3294 | obj = kmem_getpages(cache, flags, -1); | ||
3295 | if (local_flags & __GFP_WAIT) | ||
3296 | local_irq_disable(); | ||
3297 | if (obj) { | ||
3298 | /* | ||
3299 | * Insert into the appropriate per node queues | ||
3300 | */ | ||
3301 | nid = page_to_nid(virt_to_page(obj)); | ||
3302 | if (cache_grow(cache, flags, nid, obj)) { | ||
3303 | obj = ____cache_alloc_node(cache, | ||
3304 | flags | GFP_THISNODE, nid); | ||
3305 | if (!obj) | ||
3306 | /* | ||
3307 | * Another processor may allocate the | ||
3308 | * objects in the slab since we are | ||
3309 | * not holding any locks. | ||
3310 | */ | ||
3311 | goto retry; | ||
3312 | } else { | ||
3313 | /* cache_grow already freed obj */ | ||
3314 | obj = NULL; | ||
3315 | } | ||
3316 | } | ||
3163 | } | 3317 | } |
3164 | return obj; | 3318 | return obj; |
3165 | } | 3319 | } |
@@ -3167,7 +3321,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3167 | /* | 3321 | /* |
3168 | * A interface to enable slab creation on nodeid | 3322 | * A interface to enable slab creation on nodeid |
3169 | */ | 3323 | */ |
3170 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 3324 | static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
3171 | int nodeid) | 3325 | int nodeid) |
3172 | { | 3326 | { |
3173 | struct list_head *entry; | 3327 | struct list_head *entry; |
@@ -3216,7 +3370,7 @@ retry: | |||
3216 | 3370 | ||
3217 | must_grow: | 3371 | must_grow: |
3218 | spin_unlock(&l3->list_lock); | 3372 | spin_unlock(&l3->list_lock); |
3219 | x = cache_grow(cachep, flags, nodeid); | 3373 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); |
3220 | if (x) | 3374 | if (x) |
3221 | goto retry; | 3375 | goto retry; |
3222 | 3376 | ||
@@ -3399,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc); | |||
3399 | * | 3553 | * |
3400 | * Currently only used for dentry validation. | 3554 | * Currently only used for dentry validation. |
3401 | */ | 3555 | */ |
3402 | int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) | 3556 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) |
3403 | { | 3557 | { |
3404 | unsigned long addr = (unsigned long)ptr; | 3558 | unsigned long addr = (unsigned long)ptr; |
3405 | unsigned long min_addr = PAGE_OFFSET; | 3559 | unsigned long min_addr = PAGE_OFFSET; |
@@ -3433,36 +3587,61 @@ out: | |||
3433 | * @cachep: The cache to allocate from. | 3587 | * @cachep: The cache to allocate from. |
3434 | * @flags: See kmalloc(). | 3588 | * @flags: See kmalloc(). |
3435 | * @nodeid: node number of the target node. | 3589 | * @nodeid: node number of the target node. |
3590 | * @caller: return address of caller, used for debug information | ||
3591 | * | ||
3592 | * Identical to kmem_cache_alloc but it will allocate memory on the given | ||
3593 | * node, which can improve the performance for cpu bound structures. | ||
3436 | * | 3594 | * |
3437 | * Identical to kmem_cache_alloc, except that this function is slow | 3595 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3438 | * and can sleep. And it will allocate memory on the given node, which | ||
3439 | * can improve the performance for cpu bound structures. | ||
3440 | * New and improved: it will now make sure that the object gets | ||
3441 | * put on the correct node list so that there is no false sharing. | ||
3442 | */ | 3596 | */ |
3443 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3597 | static __always_inline void * |
3598 | __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | ||
3599 | int nodeid, void *caller) | ||
3444 | { | 3600 | { |
3445 | unsigned long save_flags; | 3601 | unsigned long save_flags; |
3446 | void *ptr; | 3602 | void *ptr = NULL; |
3447 | 3603 | ||
3448 | cache_alloc_debugcheck_before(cachep, flags); | 3604 | cache_alloc_debugcheck_before(cachep, flags); |
3449 | local_irq_save(save_flags); | 3605 | local_irq_save(save_flags); |
3450 | 3606 | ||
3451 | if (nodeid == -1 || nodeid == numa_node_id() || | 3607 | if (unlikely(nodeid == -1)) |
3452 | !cachep->nodelists[nodeid]) | 3608 | nodeid = numa_node_id(); |
3453 | ptr = ____cache_alloc(cachep, flags); | ||
3454 | else | ||
3455 | ptr = __cache_alloc_node(cachep, flags, nodeid); | ||
3456 | local_irq_restore(save_flags); | ||
3457 | 3609 | ||
3458 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, | 3610 | if (likely(cachep->nodelists[nodeid])) { |
3459 | __builtin_return_address(0)); | 3611 | if (nodeid == numa_node_id()) { |
3612 | /* | ||
3613 | * Use the locally cached objects if possible. | ||
3614 | * However ____cache_alloc does not allow fallback | ||
3615 | * to other nodes. It may fail while we still have | ||
3616 | * objects on other nodes available. | ||
3617 | */ | ||
3618 | ptr = ____cache_alloc(cachep, flags); | ||
3619 | } | ||
3620 | if (!ptr) { | ||
3621 | /* ___cache_alloc_node can fall back to other nodes */ | ||
3622 | ptr = ____cache_alloc_node(cachep, flags, nodeid); | ||
3623 | } | ||
3624 | } else { | ||
3625 | /* Node not bootstrapped yet */ | ||
3626 | if (!(flags & __GFP_THISNODE)) | ||
3627 | ptr = fallback_alloc(cachep, flags); | ||
3628 | } | ||
3629 | |||
3630 | local_irq_restore(save_flags); | ||
3631 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | ||
3460 | 3632 | ||
3461 | return ptr; | 3633 | return ptr; |
3462 | } | 3634 | } |
3635 | |||
3636 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | ||
3637 | { | ||
3638 | return __cache_alloc_node(cachep, flags, nodeid, | ||
3639 | __builtin_return_address(0)); | ||
3640 | } | ||
3463 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3641 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3464 | 3642 | ||
3465 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3643 | static __always_inline void * |
3644 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | ||
3466 | { | 3645 | { |
3467 | struct kmem_cache *cachep; | 3646 | struct kmem_cache *cachep; |
3468 | 3647 | ||
@@ -3471,8 +3650,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3471 | return NULL; | 3650 | return NULL; |
3472 | return kmem_cache_alloc_node(cachep, flags, node); | 3651 | return kmem_cache_alloc_node(cachep, flags, node); |
3473 | } | 3652 | } |
3653 | |||
3654 | #ifdef CONFIG_DEBUG_SLAB | ||
3655 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3656 | { | ||
3657 | return __do_kmalloc_node(size, flags, node, | ||
3658 | __builtin_return_address(0)); | ||
3659 | } | ||
3474 | EXPORT_SYMBOL(__kmalloc_node); | 3660 | EXPORT_SYMBOL(__kmalloc_node); |
3475 | #endif | 3661 | |
3662 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | ||
3663 | int node, void *caller) | ||
3664 | { | ||
3665 | return __do_kmalloc_node(size, flags, node, caller); | ||
3666 | } | ||
3667 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | ||
3668 | #else | ||
3669 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3670 | { | ||
3671 | return __do_kmalloc_node(size, flags, node, NULL); | ||
3672 | } | ||
3673 | EXPORT_SYMBOL(__kmalloc_node); | ||
3674 | #endif /* CONFIG_DEBUG_SLAB */ | ||
3675 | #endif /* CONFIG_NUMA */ | ||
3476 | 3676 | ||
3477 | /** | 3677 | /** |
3478 | * __do_kmalloc - allocate memory | 3678 | * __do_kmalloc - allocate memory |
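The CONFIG_DEBUG_SLAB split above exists so the true caller address can be threaded through for slab debugging (e.g. leak attribution); with debugging off the extra argument disappears. A hypothetical header-side illustration of how such a wrapper is typically wired (my_kmalloc_node is not a real kernel macro, just a sketch of the pattern):

#ifdef CONFIG_DEBUG_SLAB
#define my_kmalloc_node(size, flags, node) \
	__kmalloc_node_track_caller(size, flags, node, __builtin_return_address(0))
#else
#define my_kmalloc_node(size, flags, node) \
	__kmalloc_node(size, flags, node)
#endif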
@@ -3583,13 +3783,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3583 | int node; | 3783 | int node; |
3584 | struct kmem_list3 *l3; | 3784 | struct kmem_list3 *l3; |
3585 | struct array_cache *new_shared; | 3785 | struct array_cache *new_shared; |
3586 | struct array_cache **new_alien; | 3786 | struct array_cache **new_alien = NULL; |
3587 | 3787 | ||
3588 | for_each_online_node(node) { | 3788 | for_each_online_node(node) { |
3589 | 3789 | ||
3590 | new_alien = alloc_alien_cache(node, cachep->limit); | 3790 | if (use_alien_caches) { |
3591 | if (!new_alien) | 3791 | new_alien = alloc_alien_cache(node, cachep->limit); |
3592 | goto fail; | 3792 | if (!new_alien) |
3793 | goto fail; | ||
3794 | } | ||
3593 | 3795 | ||
3594 | new_shared = alloc_arraycache(node, | 3796 | new_shared = alloc_arraycache(node, |
3595 | cachep->shared*cachep->batchcount, | 3797 | cachep->shared*cachep->batchcount, |
@@ -3815,7 +4017,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | |||
3815 | * If we cannot acquire the cache chain mutex then just give up - we'll try | 4017 | * If we cannot acquire the cache chain mutex then just give up - we'll try |
3816 | * again on the next iteration. | 4018 | * again on the next iteration. |
3817 | */ | 4019 | */ |
3818 | static void cache_reap(void *unused) | 4020 | static void cache_reap(struct work_struct *unused) |
3819 | { | 4021 | { |
3820 | struct kmem_cache *searchp; | 4022 | struct kmem_cache *searchp; |
3821 | struct kmem_list3 *l3; | 4023 | struct kmem_list3 *l3; |
@@ -3824,7 +4026,7 @@ static void cache_reap(void *unused) | |||
3824 | if (!mutex_trylock(&cache_chain_mutex)) { | 4026 | if (!mutex_trylock(&cache_chain_mutex)) { |
3825 | /* Give up. Setup the next iteration. */ | 4027 | /* Give up. Setup the next iteration. */ |
3826 | schedule_delayed_work(&__get_cpu_var(reap_work), | 4028 | schedule_delayed_work(&__get_cpu_var(reap_work), |
3827 | REAPTIMEOUT_CPUC); | 4029 | round_jiffies_relative(REAPTIMEOUT_CPUC)); |
3828 | return; | 4030 | return; |
3829 | } | 4031 | } |
3830 | 4032 | ||
@@ -3870,7 +4072,8 @@ next: | |||
3870 | next_reap_node(); | 4072 | next_reap_node(); |
3871 | refresh_cpu_vm_stats(smp_processor_id()); | 4073 | refresh_cpu_vm_stats(smp_processor_id()); |
3872 | /* Set up the next iteration */ | 4074 | /* Set up the next iteration */ |
3873 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 4075 | schedule_delayed_work(&__get_cpu_var(reap_work), |
4076 | round_jiffies_relative(REAPTIMEOUT_CPUC)); | ||
3874 | } | 4077 | } |
3875 | 4078 | ||
3876 | #ifdef CONFIG_PROC_FS | 4079 | #ifdef CONFIG_PROC_FS |
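Both re-arm paths of cache_reap() now round their timeout. The intent is that many per-CPU timers expire on the same jiffy boundary, so an idle system wakes once instead of once per CPU. A self-contained sketch of the rounding idea (plain C with explicit parameters; the kernel helper additionally rounds to the nearest second and applies a per-CPU skew):

#include <stdint.h>

/* Round a relative timeout so that now + delta lands on a whole-second
 * boundary (hz ticks per second), never rounding into the past. */
static uint64_t round_relative_sketch(uint64_t now, uint64_t delta, uint64_t hz)
{
	uint64_t expires = now + delta;
	uint64_t rounded = expires - (expires % hz);

	if (rounded <= now)
		rounded += hz;
	return rounded - now;
}
/* e.g. now = 1030, delta = 400, hz = 1000: expires 1430 rounds forward to 2000,
 * so the function returns 970 and the timer fires exactly on a second. */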
@@ -4038,7 +4241,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4038 | * + further values on SMP and with statistics enabled | 4241 | * + further values on SMP and with statistics enabled |
4039 | */ | 4242 | */ |
4040 | 4243 | ||
4041 | struct seq_operations slabinfo_op = { | 4244 | const struct seq_operations slabinfo_op = { |
4042 | .start = s_start, | 4245 | .start = s_start, |
4043 | .next = s_next, | 4246 | .next = s_next, |
4044 | .stop = s_stop, | 4247 | .stop = s_stop, |
@@ -4236,7 +4439,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4236 | return 0; | 4439 | return 0; |
4237 | } | 4440 | } |
4238 | 4441 | ||
4239 | struct seq_operations slabstats_op = { | 4442 | const struct seq_operations slabstats_op = { |
4240 | .start = leaks_start, | 4443 | .start = leaks_start, |
4241 | .next = s_next, | 4444 | .next = s_next, |
4242 | .stop = s_stop, | 4445 | .stop = s_stop, |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock); | |||
60 | static DEFINE_SPINLOCK(block_lock); | 60 | static DEFINE_SPINLOCK(block_lock); |
61 | 61 | ||
62 | static void slob_free(void *b, int size); | 62 | static void slob_free(void *b, int size); |
63 | static void slob_timer_cbk(void); | ||
64 | |||
63 | 65 | ||
64 | static void *slob_alloc(size_t size, gfp_t gfp, int align) | 66 | static void *slob_alloc(size_t size, gfp_t gfp, int align) |
65 | { | 67 | { |
@@ -157,7 +159,7 @@ static int fastcall find_order(int size) | |||
157 | return order; | 159 | return order; |
158 | } | 160 | } |
159 | 161 | ||
160 | void *kmalloc(size_t size, gfp_t gfp) | 162 | void *__kmalloc(size_t size, gfp_t gfp) |
161 | { | 163 | { |
162 | slob_t *m; | 164 | slob_t *m; |
163 | bigblock_t *bb; | 165 | bigblock_t *bb; |
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp) | |||
186 | slob_free(bb, sizeof(bigblock_t)); | 188 | slob_free(bb, sizeof(bigblock_t)); |
187 | return 0; | 189 | return 0; |
188 | } | 190 | } |
189 | 191 | EXPORT_SYMBOL(__kmalloc); | |
190 | EXPORT_SYMBOL(kmalloc); | ||
191 | 192 | ||
192 | void kfree(const void *block) | 193 | void kfree(const void *block) |
193 | { | 194 | { |
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c) | |||
327 | EXPORT_SYMBOL(kmem_cache_name); | 328 | EXPORT_SYMBOL(kmem_cache_name); |
328 | 329 | ||
329 | static struct timer_list slob_timer = TIMER_INITIALIZER( | 330 | static struct timer_list slob_timer = TIMER_INITIALIZER( |
330 | (void (*)(unsigned long))kmem_cache_init, 0, 0); | 331 | (void (*)(unsigned long))slob_timer_cbk, 0, 0); |
332 | |||
333 | int kmem_cache_shrink(struct kmem_cache *d) | ||
334 | { | ||
335 | return 0; | ||
336 | } | ||
337 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
338 | |||
339 | int kmem_ptr_validate(struct kmem_cache *a, const void *b) | ||
340 | { | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | void __init kmem_cache_init(void) | ||
345 | { | ||
346 | slob_timer_cbk(); | ||
347 | } | ||
331 | 348 | ||
332 | void kmem_cache_init(void) | 349 | static void slob_timer_cbk(void) |
333 | { | 350 | { |
334 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); | 351 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); |
335 | 352 | ||
diff --git a/mm/sparse.c b/mm/sparse.c index b3c82ba30012..ac26eb0d73cd 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -24,6 +24,25 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] | |||
24 | #endif | 24 | #endif |
25 | EXPORT_SYMBOL(mem_section); | 25 | EXPORT_SYMBOL(mem_section); |
26 | 26 | ||
27 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
28 | /* | ||
29 | * If we did not store the node number in the page then we have to | ||
30 | * do a lookup in the section_to_node_table in order to find which | ||
31 | * node the page belongs to. | ||
32 | */ | ||
33 | #if MAX_NUMNODES <= 256 | ||
34 | static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | ||
35 | #else | ||
36 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | ||
37 | #endif | ||
38 | |||
39 | int page_to_nid(struct page *page) | ||
40 | { | ||
41 | return section_to_node_table[page_to_section(page)]; | ||
42 | } | ||
43 | EXPORT_SYMBOL(page_to_nid); | ||
44 | #endif | ||
45 | |||
27 | #ifdef CONFIG_SPARSEMEM_EXTREME | 46 | #ifdef CONFIG_SPARSEMEM_EXTREME |
28 | static struct mem_section *sparse_index_alloc(int nid) | 47 | static struct mem_section *sparse_index_alloc(int nid) |
29 | { | 48 | { |
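For scale, a purely illustrative sizing of the table above (the real numbers depend on the architecture's SECTION_SIZE_BITS and MAX_PHYSMEM_BITS): with 128 MiB sections (2^27 bytes) and a 40-bit physical address space, NR_MEM_SECTIONS = 2^(40-27) = 8192, so the lookup table costs 8 KiB with u8 entries (MAX_NUMNODES <= 256) or 16 KiB with u16 entries, a modest price for freeing the node bits in page->flags.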
@@ -49,6 +68,10 @@ static int sparse_index_init(unsigned long section_nr, int nid) | |||
49 | struct mem_section *section; | 68 | struct mem_section *section; |
50 | int ret = 0; | 69 | int ret = 0; |
51 | 70 | ||
71 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
72 | section_to_node_table[section_nr] = nid; | ||
73 | #endif | ||
74 | |||
52 | if (mem_section[root]) | 75 | if (mem_section[root]) |
53 | return -EEXIST; | 76 | return -EEXIST; |
54 | 77 | ||
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c
@@ -57,9 +57,9 @@ static void put_compound_page(struct page *page) | |||
57 | { | 57 | { |
58 | page = (struct page *)page_private(page); | 58 | page = (struct page *)page_private(page); |
59 | if (put_page_testzero(page)) { | 59 | if (put_page_testzero(page)) { |
60 | void (*dtor)(struct page *page); | 60 | compound_page_dtor *dtor; |
61 | 61 | ||
62 | dtor = (void (*)(struct page *))page[1].lru.next; | 62 | dtor = get_compound_page_dtor(page); |
63 | (*dtor)(page); | 63 | (*dtor)(page); |
64 | } | 64 | } |
65 | } | 65 | } |
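put_compound_page() now goes through a typed accessor instead of casting page[1].lru.next by hand. The storage convention it wraps is visible in the removed line; roughly (the _sketch helpers are illustrative stand-ins for the real accessors in the mm headers):

/* The destructor of a compound (multi-page) allocation is stashed in an
 * otherwise unused field of the second page of the group. */
typedef void compound_page_dtor(struct page *);

static inline void set_compound_page_dtor_sketch(struct page *page,
						 compound_page_dtor *dtor)
{
	page[1].lru.next = (void *)dtor;
}

static inline compound_page_dtor *get_compound_page_dtor_sketch(struct page *page)
{
	return (compound_page_dtor *)page[1].lru.next;
}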
@@ -216,7 +216,7 @@ void lru_add_drain(void) | |||
216 | } | 216 | } |
217 | 217 | ||
218 | #ifdef CONFIG_NUMA | 218 | #ifdef CONFIG_NUMA |
219 | static void lru_add_drain_per_cpu(void *dummy) | 219 | static void lru_add_drain_per_cpu(struct work_struct *dummy) |
220 | { | 220 | { |
221 | lru_add_drain(); | 221 | lru_add_drain(); |
222 | } | 222 | } |
@@ -226,7 +226,7 @@ static void lru_add_drain_per_cpu(void *dummy) | |||
226 | */ | 226 | */ |
227 | int lru_add_drain_all(void) | 227 | int lru_add_drain_all(void) |
228 | { | 228 | { |
229 | return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); | 229 | return schedule_on_each_cpu(lru_add_drain_per_cpu); |
230 | } | 230 | } |
231 | 231 | ||
232 | #else | 232 | #else |
@@ -514,5 +514,7 @@ void __init swap_setup(void) | |||
514 | * Right now other parts of the system means that we | 514 | * Right now other parts of the system means that we |
515 | * _really_ don't want to cluster much more | 515 | * _really_ don't want to cluster much more |
516 | */ | 516 | */ |
517 | #ifdef CONFIG_HOTPLUG_CPU | ||
517 | hotcpu_notifier(cpu_swap_callback, 0); | 518 | hotcpu_notifier(cpu_swap_callback, 0); |
519 | #endif | ||
518 | } | 520 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index a15def63f28f..a2d9bb4e80df 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -427,34 +427,54 @@ void free_swap_and_cache(swp_entry_t entry) | |||
427 | 427 | ||
428 | #ifdef CONFIG_SOFTWARE_SUSPEND | 428 | #ifdef CONFIG_SOFTWARE_SUSPEND |
429 | /* | 429 | /* |
430 | * Find the swap type that corresponds to given device (if any) | 430 | * Find the swap type that corresponds to given device (if any). |
431 | * | 431 | * |
432 | * This is needed for software suspend and is done in such a way that inode | 432 | * @offset - number of the PAGE_SIZE-sized block of the device, starting |
433 | * aliasing is allowed. | 433 | * from 0, in which the swap header is expected to be located. |
434 | * | ||
435 | * This is needed for the suspend to disk (aka swsusp). | ||
434 | */ | 436 | */ |
435 | int swap_type_of(dev_t device) | 437 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
436 | { | 438 | { |
439 | struct block_device *bdev = NULL; | ||
437 | int i; | 440 | int i; |
438 | 441 | ||
442 | if (device) | ||
443 | bdev = bdget(device); | ||
444 | |||
439 | spin_lock(&swap_lock); | 445 | spin_lock(&swap_lock); |
440 | for (i = 0; i < nr_swapfiles; i++) { | 446 | for (i = 0; i < nr_swapfiles; i++) { |
441 | struct inode *inode; | 447 | struct swap_info_struct *sis = swap_info + i; |
442 | 448 | ||
443 | if (!(swap_info[i].flags & SWP_WRITEOK)) | 449 | if (!(sis->flags & SWP_WRITEOK)) |
444 | continue; | 450 | continue; |
445 | 451 | ||
446 | if (!device) { | 452 | if (!bdev) { |
453 | if (bdev_p) | ||
454 | *bdev_p = sis->bdev; | ||
455 | |||
447 | spin_unlock(&swap_lock); | 456 | spin_unlock(&swap_lock); |
448 | return i; | 457 | return i; |
449 | } | 458 | } |
450 | inode = swap_info[i].swap_file->f_dentry->d_inode; | 459 | if (bdev == sis->bdev) { |
451 | if (S_ISBLK(inode->i_mode) && | 460 | struct swap_extent *se; |
452 | device == MKDEV(imajor(inode), iminor(inode))) { | 461 | |
453 | spin_unlock(&swap_lock); | 462 | se = list_entry(sis->extent_list.next, |
454 | return i; | 463 | struct swap_extent, list); |
464 | if (se->start_block == offset) { | ||
465 | if (bdev_p) | ||
466 | *bdev_p = sis->bdev; | ||
467 | |||
468 | spin_unlock(&swap_lock); | ||
469 | bdput(bdev); | ||
470 | return i; | ||
471 | } | ||
455 | } | 472 | } |
456 | } | 473 | } |
457 | spin_unlock(&swap_lock); | 474 | spin_unlock(&swap_lock); |
475 | if (bdev) | ||
476 | bdput(bdev); | ||
477 | |||
458 | return -ENODEV; | 478 | return -ENODEV; |
459 | } | 479 | } |
460 | 480 | ||
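The reworked swap_type_of() above serves suspend-to-disk: given the resume device and the PAGE_SIZE block where the swap header sits, it returns the swap type (index into swap_info) and can hand back the block device. A hedged sketch of a caller (resume_dev and resume_offset are stand-ins for whatever the resume= / resume_offset= handling provides):

	struct block_device *resume_bdev;
	int type;

	type = swap_type_of(resume_dev, resume_offset, &resume_bdev);
	if (type < 0)
		return type;	/* -ENODEV: no active swap matches */
	/* ... read the image from resume_bdev using extents of swap "type" ... */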
@@ -931,6 +951,23 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
931 | } | 951 | } |
932 | } | 952 | } |
933 | 953 | ||
954 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
955 | /* | ||
956 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
957 | * corresponding to given index in swap_info (swap type). | ||
958 | */ | ||
959 | sector_t swapdev_block(int swap_type, pgoff_t offset) | ||
960 | { | ||
961 | struct swap_info_struct *sis; | ||
962 | |||
963 | if (swap_type >= nr_swapfiles) | ||
964 | return 0; | ||
965 | |||
966 | sis = swap_info + swap_type; | ||
967 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
968 | } | ||
969 | #endif /* CONFIG_SOFTWARE_SUSPEND */ | ||
970 | |||
934 | /* | 971 | /* |
935 | * Free all of a swapdev's extent information | 972 | * Free all of a swapdev's extent information |
936 | */ | 973 | */ |
@@ -1274,10 +1311,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1274 | 1311 | ||
1275 | mutex_lock(&swapon_mutex); | 1312 | mutex_lock(&swapon_mutex); |
1276 | 1313 | ||
1314 | if (!l) | ||
1315 | return SEQ_START_TOKEN; | ||
1316 | |||
1277 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1317 | for (i = 0; i < nr_swapfiles; i++, ptr++) { |
1278 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1318 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1279 | continue; | 1319 | continue; |
1280 | if (!l--) | 1320 | if (!--l) |
1281 | return ptr; | 1321 | return ptr; |
1282 | } | 1322 | } |
1283 | 1323 | ||
@@ -1286,10 +1326,17 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1286 | 1326 | ||
1287 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1327 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
1288 | { | 1328 | { |
1289 | struct swap_info_struct *ptr = v; | 1329 | struct swap_info_struct *ptr; |
1290 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1330 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; |
1291 | 1331 | ||
1292 | for (++ptr; ptr < endptr; ptr++) { | 1332 | if (v == SEQ_START_TOKEN) |
1333 | ptr = swap_info; | ||
1334 | else { | ||
1335 | ptr = v; | ||
1336 | ptr++; | ||
1337 | } | ||
1338 | |||
1339 | for (; ptr < endptr; ptr++) { | ||
1293 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1340 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
1294 | continue; | 1341 | continue; |
1295 | ++*pos; | 1342 | ++*pos; |
@@ -1310,14 +1357,16 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1310 | struct file *file; | 1357 | struct file *file; |
1311 | int len; | 1358 | int len; |
1312 | 1359 | ||
1313 | if (v == swap_info) | 1360 | if (ptr == SEQ_START_TOKEN) { |
1314 | seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1361 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
1362 | return 0; | ||
1363 | } | ||
1315 | 1364 | ||
1316 | file = ptr->swap_file; | 1365 | file = ptr->swap_file; |
1317 | len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); | 1366 | len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\"); |
1318 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1367 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1319 | len < 40 ? 40 - len : 1, " ", | 1368 | len < 40 ? 40 - len : 1, " ", |
1320 | S_ISBLK(file->f_dentry->d_inode->i_mode) ? | 1369 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1321 | "partition" : "file\t", | 1370 | "partition" : "file\t", |
1322 | ptr->pages << (PAGE_SHIFT - 10), | 1371 | ptr->pages << (PAGE_SHIFT - 10), |
1323 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1372 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
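The swap_start()/swap_next()/swap_show() changes above adopt the stock seq_file idiom for header rows: ->start() hands back SEQ_START_TOKEN for position zero, and ->show() prints the column headings when it sees that token, so the header no longer depends on guessing which record comes first. A generic, self-contained sketch of the idiom (my_* names are invented):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static const char *my_items[] = { "alpha", "beta", "gamma" };

static void *my_seq_start(struct seq_file *m, loff_t *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;		/* position 0 = header row */
	if (*pos <= (loff_t)ARRAY_SIZE(my_items))
		return (void *)my_items[*pos - 1];
	return NULL;
}

static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return my_seq_start(m, pos);
}

static void my_seq_stop(struct seq_file *m, void *v)
{
}

static int my_seq_show(struct seq_file *m, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(m, "Name\n");
	else
		seq_printf(m, "%s\n", (const char *)v);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start	= my_seq_start,
	.next	= my_seq_next,
	.stop	= my_seq_stop,
	.show	= my_seq_show,
};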
@@ -1325,7 +1374,7 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1325 | return 0; | 1374 | return 0; |
1326 | } | 1375 | } |
1327 | 1376 | ||
1328 | static struct seq_operations swaps_op = { | 1377 | static const struct seq_operations swaps_op = { |
1329 | .start = swap_start, | 1378 | .start = swap_start, |
1330 | .next = swap_next, | 1379 | .next = swap_next, |
1331 | .stop = swap_stop, | 1380 | .stop = swap_stop, |
@@ -1337,7 +1386,7 @@ static int swaps_open(struct inode *inode, struct file *file) | |||
1337 | return seq_open(file, &swaps_op); | 1386 | return seq_open(file, &swaps_op); |
1338 | } | 1387 | } |
1339 | 1388 | ||
1340 | static struct file_operations proc_swaps_operations = { | 1389 | static const struct file_operations proc_swaps_operations = { |
1341 | .open = swaps_open, | 1390 | .open = swaps_open, |
1342 | .read = seq_read, | 1391 | .read = seq_read, |
1343 | .llseek = seq_lseek, | 1392 | .llseek = seq_lseek, |
@@ -1540,6 +1589,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1540 | error = -EINVAL; | 1589 | error = -EINVAL; |
1541 | if (!maxpages) | 1590 | if (!maxpages) |
1542 | goto bad_swap; | 1591 | goto bad_swap; |
1592 | if (swapfilesize && maxpages > swapfilesize) { | ||
1593 | printk(KERN_WARNING | ||
1594 | "Swap area shorter than signature indicates\n"); | ||
1595 | goto bad_swap; | ||
1596 | } | ||
1543 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 1597 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
1544 | goto bad_swap; | 1598 | goto bad_swap; |
1545 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1599 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
@@ -1567,12 +1621,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1567 | goto bad_swap; | 1621 | goto bad_swap; |
1568 | } | 1622 | } |
1569 | 1623 | ||
1570 | if (swapfilesize && maxpages > swapfilesize) { | ||
1571 | printk(KERN_WARNING | ||
1572 | "Swap area shorter than signature indicates\n"); | ||
1573 | error = -EINVAL; | ||
1574 | goto bad_swap; | ||
1575 | } | ||
1576 | if (nr_good_pages) { | 1624 | if (nr_good_pages) { |
1577 | p->swap_map[0] = SWAP_MAP_BAD; | 1625 | p->swap_map[0] = SWAP_MAP_BAD; |
1578 | p->max = maxpages; | 1626 | p->max = maxpages; |
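
The swap_start()/swap_next()/swap_show() hunks above convert the /proc/swaps iterator to the seq_file SEQ_START_TOKEN convention: start() hands out a sentinel for position 0 so the header row is printed exactly once, and next() restarts at the first swap_info slot when it sees that sentinel instead of dereferencing it. A minimal user-space sketch of the same start/next/show shape follows; the entries[] table, the START_TOKEN value and the element type are invented for illustration and are not kernel API.

/* Illustrative user-space sketch of the SEQ_START_TOKEN pattern used above;
 * the entries[] table and START_TOKEN value are made up for this example. */
#include <stdio.h>

#define START_TOKEN ((void *)1)	/* sentinel: "emit the header first" */

static const char *entries[] = { "/dev/sda2", "/swapfile" };
static const int nr_entries = 2;

/* start(): position 0 yields the sentinel, positions 1..n map to entries */
static void *it_start(long pos)
{
	if (pos == 0)
		return START_TOKEN;
	return (pos <= nr_entries) ? (void *)&entries[pos - 1] : NULL;
}

/* next(): after the sentinel restart at the first entry, otherwise advance */
static void *it_next(void *v, long *pos)
{
	const char **p = (v == START_TOKEN) ? &entries[0] : (const char **)v + 1;

	++*pos;
	return (p < &entries[nr_entries]) ? (void *)p : NULL;
}

static void it_show(void *v)
{
	if (v == START_TOKEN) {
		puts("Filename\t\t\t\tType");
		return;
	}
	printf("%-40s%s\n", *(const char **)v, "partition");
}

int main(void)
{
	long pos = 0;
	void *v;

	for (v = it_start(pos); v != NULL; v = it_next(v, &pos))
		it_show(v);
	return 0;
}
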
diff --git a/mm/thrash.c b/mm/thrash.c index f4c560b4a2b7..9ef9071f99bc 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -7,100 +7,74 @@ | |||
7 | * | 7 | * |
8 | * Simple token based thrashing protection, using the algorithm | 8 | * Simple token based thrashing protection, using the algorithm |
9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | 9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf |
10 | * | ||
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | ||
12 | * Improved algorithm to pass token: | ||
13 | * Each task has a priority which is incremented if it contended | ||
14 | * for the token in an interval less than its previous attempt. | ||
15 | * If the token is acquired, that task's priority is boosted to prevent | ||
16 | * the token from bouncing around too often and to let the task make | ||
17 | * some progress in its execution. | ||
10 | */ | 18 | */ |
19 | |||
11 | #include <linux/jiffies.h> | 20 | #include <linux/jiffies.h> |
12 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
13 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
14 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
15 | 24 | ||
16 | static DEFINE_SPINLOCK(swap_token_lock); | 25 | static DEFINE_SPINLOCK(swap_token_lock); |
17 | static unsigned long swap_token_timeout; | 26 | struct mm_struct *swap_token_mm; |
18 | static unsigned long swap_token_check; | 27 | static unsigned int global_faults; |
19 | struct mm_struct * swap_token_mm = &init_mm; | ||
20 | |||
21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) | ||
22 | #define SWAP_TOKEN_TIMEOUT (300 * HZ) | ||
23 | /* | ||
24 | * Currently disabled; Needs further code to work at HZ * 300. | ||
25 | */ | ||
26 | unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT; | ||
27 | |||
28 | /* | ||
29 | * Take the token away if the process had no page faults | ||
30 | * in the last interval, or if it has held the token for | ||
31 | * too long. | ||
32 | */ | ||
33 | #define SWAP_TOKEN_ENOUGH_RSS 1 | ||
34 | #define SWAP_TOKEN_TIMED_OUT 2 | ||
35 | static int should_release_swap_token(struct mm_struct *mm) | ||
36 | { | ||
37 | int ret = 0; | ||
38 | if (!mm->recent_pagein) | ||
39 | ret = SWAP_TOKEN_ENOUGH_RSS; | ||
40 | else if (time_after(jiffies, swap_token_timeout)) | ||
41 | ret = SWAP_TOKEN_TIMED_OUT; | ||
42 | mm->recent_pagein = 0; | ||
43 | return ret; | ||
44 | } | ||
45 | 28 | ||
46 | /* | ||
47 | * Try to grab the swapout protection token. We only try to | ||
48 | * grab it once every TOKEN_CHECK_INTERVAL, both to prevent | ||
49 | * SMP lock contention and to check that the process that held | ||
50 | * the token before is no longer thrashing. | ||
51 | */ | ||
52 | void grab_swap_token(void) | 29 | void grab_swap_token(void) |
53 | { | 30 | { |
54 | struct mm_struct *mm; | 31 | int current_interval; |
55 | int reason; | ||
56 | 32 | ||
57 | /* We have the token. Let others know we still need it. */ | 33 | global_faults++; |
58 | if (has_swap_token(current->mm)) { | ||
59 | current->mm->recent_pagein = 1; | ||
60 | if (unlikely(!swap_token_default_timeout)) | ||
61 | disable_swap_token(); | ||
62 | return; | ||
63 | } | ||
64 | |||
65 | if (time_after(jiffies, swap_token_check)) { | ||
66 | 34 | ||
67 | if (!swap_token_default_timeout) { | 35 | current_interval = global_faults - current->mm->faultstamp; |
68 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | ||
69 | return; | ||
70 | } | ||
71 | |||
72 | /* ... or if we recently held the token. */ | ||
73 | if (time_before(jiffies, current->mm->swap_token_time)) | ||
74 | return; | ||
75 | 36 | ||
76 | if (!spin_trylock(&swap_token_lock)) | 37 | if (!spin_trylock(&swap_token_lock)) |
77 | return; | 38 | return; |
78 | 39 | ||
79 | swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | 40 | /* First come first served */ |
41 | if (swap_token_mm == NULL) { | ||
42 | current->mm->token_priority = current->mm->token_priority + 2; | ||
43 | swap_token_mm = current->mm; | ||
44 | goto out; | ||
45 | } | ||
80 | 46 | ||
81 | mm = swap_token_mm; | 47 | if (current->mm != swap_token_mm) { |
82 | if ((reason = should_release_swap_token(mm))) { | 48 | if (current_interval < current->mm->last_interval) |
83 | unsigned long eligible = jiffies; | 49 | current->mm->token_priority++; |
84 | if (reason == SWAP_TOKEN_TIMED_OUT) { | 50 | else { |
85 | eligible += swap_token_default_timeout; | 51 | current->mm->token_priority--; |
86 | } | 52 | if (unlikely(current->mm->token_priority < 0)) |
87 | mm->swap_token_time = eligible; | 53 | current->mm->token_priority = 0; |
88 | swap_token_timeout = jiffies + swap_token_default_timeout; | 54 | } |
55 | /* Check if we deserve the token */ | ||
56 | if (current->mm->token_priority > | ||
57 | swap_token_mm->token_priority) { | ||
58 | current->mm->token_priority += 2; | ||
89 | swap_token_mm = current->mm; | 59 | swap_token_mm = current->mm; |
90 | } | 60 | } |
91 | spin_unlock(&swap_token_lock); | 61 | } else { |
62 | /* Token holder came in again! */ | ||
63 | current->mm->token_priority += 2; | ||
92 | } | 64 | } |
93 | return; | 65 | |
66 | out: | ||
67 | current->mm->faultstamp = global_faults; | ||
68 | current->mm->last_interval = current_interval; | ||
69 | spin_unlock(&swap_token_lock); | ||
70 | return; | ||
94 | } | 71 | } |
95 | 72 | ||
96 | /* Called on process exit. */ | 73 | /* Called on process exit. */ |
97 | void __put_swap_token(struct mm_struct *mm) | 74 | void __put_swap_token(struct mm_struct *mm) |
98 | { | 75 | { |
99 | spin_lock(&swap_token_lock); | 76 | spin_lock(&swap_token_lock); |
100 | if (likely(mm == swap_token_mm)) { | 77 | if (likely(mm == swap_token_mm)) |
101 | mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; | 78 | swap_token_mm = NULL; |
102 | swap_token_mm = &init_mm; | ||
103 | swap_token_check = jiffies; | ||
104 | } | ||
105 | spin_unlock(&swap_token_lock); | 79 | spin_unlock(&swap_token_lock); |
106 | } | 80 | } |
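
The rewritten grab_swap_token() above drops the jiffies-based timeout in favour of the priority scheme described in the new header comment: every mm remembers when it last contended for the token (faultstamp) and how long that interval was (last_interval); contending again within a shorter interval raises its token_priority, a longer interval decays it, and the token only moves when a contender's priority exceeds the holder's, or when the holder exits and __put_swap_token() clears swap_token_mm. The following single-threaded user-space sketch mirrors that decision logic with swap_token_lock and the rest of mm_struct stripped out; the struct mm, the driver in main() and the printed output are illustrative only.

/* Single-threaded sketch of the token-priority logic above; struct mm, the
 * driver in main() and the omission of swap_token_lock are simplifications
 * for illustration, not kernel code. */
#include <stdio.h>

struct mm {
	const char *name;
	int token_priority;
	unsigned int faultstamp;	/* global_faults at the last contention */
	unsigned int last_interval;	/* faults between the last two attempts */
};

static struct mm *swap_token_mm;	/* current token holder, or NULL */
static unsigned int global_faults;	/* incremented on every major fault */

static void grab_swap_token(struct mm *mm)
{
	unsigned int interval = ++global_faults - mm->faultstamp;

	if (swap_token_mm == NULL) {		/* first come, first served */
		mm->token_priority += 2;
		swap_token_mm = mm;
	} else if (mm != swap_token_mm) {
		/* faulting sooner than last time raises priority, else decay */
		if (interval < mm->last_interval)
			mm->token_priority++;
		else if (mm->token_priority > 0)
			mm->token_priority--;
		/* the token moves only when the contender outranks the holder */
		if (mm->token_priority > swap_token_mm->token_priority) {
			mm->token_priority += 2;
			swap_token_mm = mm;
		}
	} else {
		mm->token_priority += 2;	/* the holder is still thrashing */
	}
	mm->faultstamp = global_faults;
	mm->last_interval = interval;
}

/* counterpart of __put_swap_token(): called when the holder exits */
static void put_swap_token(struct mm *mm)
{
	if (mm == swap_token_mm)
		swap_token_mm = NULL;
}

int main(void)
{
	struct mm a = { "A", 0, 0, 0 }, b = { "B", 0, 0, 0 };

	grab_swap_token(&a);	/* token is free: A takes it */
	grab_swap_token(&b);	/* B contends but cannot outrank A yet */
	grab_swap_token(&b);	/* B faulted again sooner: its priority rises */
	printf("holder=%s A.prio=%d B.prio=%d\n",
	       swap_token_mm->name, a.token_priority, b.token_priority);

	put_swap_token(&a);	/* A exits and releases the token */
	grab_swap_token(&b);	/* B now picks it up first-come-first-served */
	printf("holder=%s\n", swap_token_mm->name);
	return 0;
}
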
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 5f2cbf0f153c..c7f6e1914bc4 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
79 | d_instantiate(dentry, inode); | 79 | d_instantiate(dentry, inode); |
80 | inode->i_nlink = 0; /* It is unlinked */ | 80 | inode->i_nlink = 0; /* It is unlinked */ |
81 | 81 | ||
82 | file->f_vfsmnt = mntget(shm_mnt); | 82 | file->f_path.mnt = mntget(shm_mnt); |
83 | file->f_dentry = dentry; | 83 | file->f_path.dentry = dentry; |
84 | file->f_mapping = inode->i_mapping; | 84 | file->f_mapping = inode->i_mapping; |
85 | file->f_op = &ramfs_file_operations; | 85 | file->f_op = &ramfs_file_operations; |
86 | file->f_mode = FMODE_WRITE | FMODE_READ; | 86 | file->f_mode = FMODE_WRITE | FMODE_READ; |
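
The tiny-shmem.c hunk, like the swap_show() hunk earlier, is part of the tree-wide conversion from the separate file->f_vfsmnt and file->f_dentry pointers to the embedded struct path (file->f_path.mnt / file->f_path.dentry). A small stand-alone sketch of that refactoring pattern follows; the struct definitions here are mock-ups, not the kernel's struct file.

/* Sketch of the f_vfsmnt/f_dentry -> f_path conversion: two related pointers
 * become one embedded struct that can be copied and passed around as a unit.
 * All types here are mock-ups, not the kernel's definitions. */
#include <stdio.h>

struct dentry   { const char *name; };
struct vfsmount { const char *mountpoint; };

struct path {
	struct vfsmount *mnt;
	struct dentry   *dentry;
};

struct file {
	struct path f_path;	/* replaces the separate f_vfsmnt / f_dentry */
};

static void show(const struct file *f)
{
	/* old code: f->f_dentry->name, f->f_vfsmnt->mountpoint */
	printf("%s on %s\n", f->f_path.dentry->name, f->f_path.mnt->mountpoint);
}

int main(void)
{
	struct dentry   d = { "swapfile" };
	struct vfsmount m = { "/" };
	struct file     f = { { &m, &d } };

	show(&f);
	return 0;
}
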
diff --git a/mm/truncate.c b/mm/truncate.c index e07b1e682c38..6c79ca4a1ca7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
16 | #include <linux/task_io_accounting_ops.h> | ||
16 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 17 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
17 | do_invalidatepage */ | 18 | do_invalidatepage */ |
18 | 19 | ||
@@ -50,6 +51,26 @@ static inline void truncate_partial_page(struct page *page, unsigned partial) | |||
50 | do_invalidatepage(page, partial); | 51 | do_invalidatepage(page, partial); |
51 | } | 52 | } |
52 | 53 | ||
54 | void cancel_dirty_page(struct page *page, unsigned int account_size) | ||
55 | { | ||
56 | /* If we're cancelling the page, it had better not be mapped any more */ | ||
57 | if (page_mapped(page)) { | ||
58 | static unsigned int warncount; | ||
59 | |||
60 | WARN_ON(++warncount < 5); | ||
61 | } | ||
62 | |||
63 | if (TestClearPageDirty(page)) { | ||
64 | struct address_space *mapping = page->mapping; | ||
65 | if (mapping && mapping_cap_account_dirty(mapping)) { | ||
66 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
67 | if (account_size) | ||
68 | task_io_account_cancelled_write(account_size); | ||
69 | } | ||
70 | } | ||
71 | } | ||
72 | EXPORT_SYMBOL(cancel_dirty_page); | ||
73 | |||
53 | /* | 74 | /* |
54 | * If truncate cannot remove the fs-private metadata from the page, the page | 75 | * If truncate cannot remove the fs-private metadata from the page, the page |
55 | * becomes anonymous. It will be left on the LRU and may even be mapped into | 76 | * becomes anonymous. It will be left on the LRU and may even be mapped into |
@@ -66,10 +87,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
66 | if (page->mapping != mapping) | 87 | if (page->mapping != mapping) |
67 | return; | 88 | return; |
68 | 89 | ||
90 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | ||
91 | |||
69 | if (PagePrivate(page)) | 92 | if (PagePrivate(page)) |
70 | do_invalidatepage(page, 0); | 93 | do_invalidatepage(page, 0); |
71 | 94 | ||
72 | clear_page_dirty(page); | ||
73 | ClearPageUptodate(page); | 95 | ClearPageUptodate(page); |
74 | ClearPageMappedToDisk(page); | 96 | ClearPageMappedToDisk(page); |
75 | remove_from_page_cache(page); | 97 | remove_from_page_cache(page); |
@@ -319,6 +341,15 @@ failed: | |||
319 | return 0; | 341 | return 0; |
320 | } | 342 | } |
321 | 343 | ||
344 | static int do_launder_page(struct address_space *mapping, struct page *page) | ||
345 | { | ||
346 | if (!PageDirty(page)) | ||
347 | return 0; | ||
348 | if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) | ||
349 | return 0; | ||
350 | return mapping->a_ops->launder_page(page); | ||
351 | } | ||
352 | |||
322 | /** | 353 | /** |
323 | * invalidate_inode_pages2_range - remove range of pages from an address_space | 354 | * invalidate_inode_pages2_range - remove range of pages from an address_space |
324 | * @mapping: the address_space | 355 | * @mapping: the address_space |
@@ -348,7 +379,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
348 | for (i = 0; !ret && i < pagevec_count(&pvec); i++) { | 379 | for (i = 0; !ret && i < pagevec_count(&pvec); i++) { |
349 | struct page *page = pvec.pages[i]; | 380 | struct page *page = pvec.pages[i]; |
350 | pgoff_t page_index; | 381 | pgoff_t page_index; |
351 | int was_dirty; | ||
352 | 382 | ||
353 | lock_page(page); | 383 | lock_page(page); |
354 | if (page->mapping != mapping) { | 384 | if (page->mapping != mapping) { |
@@ -384,12 +414,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
384 | PAGE_CACHE_SIZE, 0); | 414 | PAGE_CACHE_SIZE, 0); |
385 | } | 415 | } |
386 | } | 416 | } |
387 | was_dirty = test_clear_page_dirty(page); | 417 | ret = do_launder_page(mapping, page); |
388 | if (!invalidate_complete_page2(mapping, page)) { | 418 | if (ret == 0 && !invalidate_complete_page2(mapping, page)) |
389 | if (was_dirty) | ||
390 | set_page_dirty(page); | ||
391 | ret = -EIO; | 419 | ret = -EIO; |
392 | } | ||
393 | unlock_page(page); | 420 | unlock_page(page); |
394 | } | 421 | } |
395 | pagevec_release(&pvec); | 422 | pagevec_release(&pvec); |
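
The truncate.c hunks replace the old test_clear_page_dirty()/set_page_dirty() round trip in invalidate_inode_pages2_range() with two pieces: cancel_dirty_page(), which clears the dirty bit and rolls back the NR_FILE_DIRTY and task I/O accounting for data that is about to be discarded, and do_launder_page(), which gives the filesystem one ->launder_page() shot at writing a dirty page back before it is invalidated. A compressed sketch of the resulting per-page control flow follows; the page type and both helpers are stand-ins for illustration, not the kernel API.

/* Sketch of the per-page flow after this change: launder first, then try to
 * invalidate, and report -EIO only if invalidation itself fails.  The page
 * type and both helpers are stand-ins for illustration, not the kernel API. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct page { bool dirty; bool busy; };

/* stand-in for ->launder_page(): write a dirty page back, return 0 or -errno */
static int do_launder_page(struct page *pg)
{
	if (!pg->dirty)
		return 0;
	pg->dirty = false;	/* pretend the writeback succeeded */
	return 0;
}

/* stand-in for invalidate_complete_page2(): fails if the page is still busy */
static bool invalidate_complete_page2(struct page *pg)
{
	return !pg->busy;
}

static int invalidate_one(struct page *pg)
{
	int ret = do_launder_page(pg);

	if (ret == 0 && !invalidate_complete_page2(pg))
		ret = -EIO;
	return ret;
}

int main(void)
{
	struct page clean  = { false, false };
	struct page dirty  = { true,  false };
	struct page pinned = { false, true  };

	printf("clean:  %d\n", invalidate_one(&clean));		/* 0 */
	printf("dirty:  %d\n", invalidate_one(&dirty));		/* 0, laundered */
	printf("pinned: %d\n", invalidate_one(&pinned));	/* -EIO */
	return 0;
}
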
diff --git a/mm/vmscan.c b/mm/vmscan.c index 518540a4a2a6..7430df68cb64 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
38 | #include <linux/kthread.h> | 38 | #include <linux/kthread.h> |
39 | #include <linux/freezer.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include <asm/div64.h> | 42 | #include <asm/div64.h> |
@@ -691,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
691 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 692 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
692 | } else | 693 | } else |
693 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 694 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
694 | __count_vm_events(PGACTIVATE, nr_freed); | 695 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
695 | 696 | ||
696 | if (nr_taken == 0) | 697 | if (nr_taken == 0) |
697 | goto done; | 698 | goto done; |
@@ -983,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
983 | if (!populated_zone(zone)) | 984 | if (!populated_zone(zone)) |
984 | continue; | 985 | continue; |
985 | 986 | ||
986 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 987 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
987 | continue; | 988 | continue; |
988 | 989 | ||
989 | note_zone_scanning_priority(zone, priority); | 990 | note_zone_scanning_priority(zone, priority); |
@@ -1033,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1033 | for (i = 0; zones[i] != NULL; i++) { | 1034 | for (i = 0; zones[i] != NULL; i++) { |
1034 | struct zone *zone = zones[i]; | 1035 | struct zone *zone = zones[i]; |
1035 | 1036 | ||
1036 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1037 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1037 | continue; | 1038 | continue; |
1038 | 1039 | ||
1039 | lru_pages += zone->nr_active + zone->nr_inactive; | 1040 | lru_pages += zone->nr_active + zone->nr_inactive; |
@@ -1088,7 +1089,7 @@ out: | |||
1088 | for (i = 0; zones[i] != 0; i++) { | 1089 | for (i = 0; zones[i] != 0; i++) { |
1089 | struct zone *zone = zones[i]; | 1090 | struct zone *zone = zones[i]; |
1090 | 1091 | ||
1091 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1092 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1092 | continue; | 1093 | continue; |
1093 | 1094 | ||
1094 | zone->prev_priority = priority; | 1095 | zone->prev_priority = priority; |
@@ -1172,11 +1173,12 @@ loop_again: | |||
1172 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1173 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1173 | 0, 0)) { | 1174 | 0, 0)) { |
1174 | end_zone = i; | 1175 | end_zone = i; |
1175 | goto scan; | 1176 | break; |
1176 | } | 1177 | } |
1177 | } | 1178 | } |
1178 | goto out; | 1179 | if (i < 0) |
1179 | scan: | 1180 | goto out; |
1181 | |||
1180 | for (i = 0; i <= end_zone; i++) { | 1182 | for (i = 0; i <= end_zone; i++) { |
1181 | struct zone *zone = pgdat->node_zones + i; | 1183 | struct zone *zone = pgdat->node_zones + i; |
1182 | 1184 | ||
@@ -1259,6 +1261,9 @@ out: | |||
1259 | } | 1261 | } |
1260 | if (!all_zones_ok) { | 1262 | if (!all_zones_ok) { |
1261 | cond_resched(); | 1263 | cond_resched(); |
1264 | |||
1265 | try_to_freeze(); | ||
1266 | |||
1262 | goto loop_again; | 1267 | goto loop_again; |
1263 | } | 1268 | } |
1264 | 1269 | ||
@@ -1349,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1349 | return; | 1354 | return; |
1350 | if (pgdat->kswapd_max_order < order) | 1355 | if (pgdat->kswapd_max_order < order) |
1351 | pgdat->kswapd_max_order = order; | 1356 | pgdat->kswapd_max_order = order; |
1352 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1357 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1353 | return; | 1358 | return; |
1354 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 1359 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
1355 | return; | 1360 | return; |
@@ -1364,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1364 | * | 1369 | * |
1365 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | 1370 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages |
1366 | */ | 1371 | */ |
1367 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | 1372 | static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, |
1368 | int prio, struct scan_control *sc) | 1373 | int pass, struct scan_control *sc) |
1369 | { | 1374 | { |
1370 | struct zone *zone; | 1375 | struct zone *zone; |
1371 | unsigned long nr_to_scan, ret = 0; | 1376 | unsigned long nr_to_scan, ret = 0; |
@@ -1401,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | |||
1401 | return ret; | 1406 | return ret; |
1402 | } | 1407 | } |
1403 | 1408 | ||
1409 | static unsigned long count_lru_pages(void) | ||
1410 | { | ||
1411 | struct zone *zone; | ||
1412 | unsigned long ret = 0; | ||
1413 | |||
1414 | for_each_zone(zone) | ||
1415 | ret += zone->nr_active + zone->nr_inactive; | ||
1416 | return ret; | ||
1417 | } | ||
1418 | |||
1404 | /* | 1419 | /* |
1405 | * Try to free `nr_pages' of memory, system-wide, and return the number of | 1420 | * Try to free `nr_pages' of memory, system-wide, and return the number of |
1406 | * freed pages. | 1421 | * freed pages. |
@@ -1415,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1415 | unsigned long ret = 0; | 1430 | unsigned long ret = 0; |
1416 | int pass; | 1431 | int pass; |
1417 | struct reclaim_state reclaim_state; | 1432 | struct reclaim_state reclaim_state; |
1418 | struct zone *zone; | ||
1419 | struct scan_control sc = { | 1433 | struct scan_control sc = { |
1420 | .gfp_mask = GFP_KERNEL, | 1434 | .gfp_mask = GFP_KERNEL, |
1421 | .may_swap = 0, | 1435 | .may_swap = 0, |
@@ -1426,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1426 | 1440 | ||
1427 | current->reclaim_state = &reclaim_state; | 1441 | current->reclaim_state = &reclaim_state; |
1428 | 1442 | ||
1429 | lru_pages = 0; | 1443 | lru_pages = count_lru_pages(); |
1430 | for_each_zone(zone) | ||
1431 | lru_pages += zone->nr_active + zone->nr_inactive; | ||
1432 | |||
1433 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 1444 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); |
1434 | /* If slab caches are huge, it's better to hit them first */ | 1445 | /* If slab caches are huge, it's better to hit them first */ |
1435 | while (nr_slab >= lru_pages) { | 1446 | while (nr_slab >= lru_pages) { |
@@ -1456,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1456 | for (pass = 0; pass < 5; pass++) { | 1467 | for (pass = 0; pass < 5; pass++) { |
1457 | int prio; | 1468 | int prio; |
1458 | 1469 | ||
1459 | /* Needed for shrinking slab caches later on */ | ||
1460 | if (!lru_pages) | ||
1461 | for_each_zone(zone) { | ||
1462 | lru_pages += zone->nr_active; | ||
1463 | lru_pages += zone->nr_inactive; | ||
1464 | } | ||
1465 | |||
1466 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | 1470 | /* Force reclaiming mapped pages in the passes #3 and #4 */ |
1467 | if (pass > 2) { | 1471 | if (pass > 2) { |
1468 | sc.may_swap = 1; | 1472 | sc.may_swap = 1; |
@@ -1478,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1478 | goto out; | 1482 | goto out; |
1479 | 1483 | ||
1480 | reclaim_state.reclaimed_slab = 0; | 1484 | reclaim_state.reclaimed_slab = 0; |
1481 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); | 1485 | shrink_slab(sc.nr_scanned, sc.gfp_mask, |
1486 | count_lru_pages()); | ||
1482 | ret += reclaim_state.reclaimed_slab; | 1487 | ret += reclaim_state.reclaimed_slab; |
1483 | if (ret >= nr_pages) | 1488 | if (ret >= nr_pages) |
1484 | goto out; | 1489 | goto out; |
@@ -1486,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1486 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | 1491 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) |
1487 | congestion_wait(WRITE, HZ / 10); | 1492 | congestion_wait(WRITE, HZ / 10); |
1488 | } | 1493 | } |
1489 | |||
1490 | lru_pages = 0; | ||
1491 | } | 1494 | } |
1492 | 1495 | ||
1493 | /* | 1496 | /* |
1494 | * If ret = 0, we could not shrink LRUs, but there may be something | 1497 | * If ret = 0, we could not shrink LRUs, but there may be something |
1495 | * in slab caches | 1498 | * in slab caches |
1496 | */ | 1499 | */ |
1497 | if (!ret) | 1500 | if (!ret) { |
1498 | do { | 1501 | do { |
1499 | reclaim_state.reclaimed_slab = 0; | 1502 | reclaim_state.reclaimed_slab = 0; |
1500 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | 1503 | shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); |
1501 | ret += reclaim_state.reclaimed_slab; | 1504 | ret += reclaim_state.reclaimed_slab; |
1502 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | 1505 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); |
1506 | } | ||
1503 | 1507 | ||
1504 | out: | 1508 | out: |
1505 | current->reclaim_state = NULL; | 1509 | current->reclaim_state = NULL; |
@@ -1508,7 +1512,6 @@ out: | |||
1508 | } | 1512 | } |
1509 | #endif | 1513 | #endif |
1510 | 1514 | ||
1511 | #ifdef CONFIG_HOTPLUG_CPU | ||
1512 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 1515 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
1513 | not required for correctness. So if the last cpu in a node goes | 1516 | not required for correctness. So if the last cpu in a node goes |
1514 | away, we get changed to run anywhere: as the first one comes back, | 1517 | away, we get changed to run anywhere: as the first one comes back, |
@@ -1529,7 +1532,6 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1529 | } | 1532 | } |
1530 | return NOTIFY_OK; | 1533 | return NOTIFY_OK; |
1531 | } | 1534 | } |
1532 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1533 | 1535 | ||
1534 | /* | 1536 | /* |
1535 | * This kswapd start function will be called by init and node-hot-add. | 1537 | * This kswapd start function will be called by init and node-hot-add. |
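
In the balance_pgdat() hunk above, the goto scan/goto out pair becomes a plain break: zones are scanned from the highest index down, the first one below its watermark records end_zone and breaks out, and a loop that runs off the end (i < 0) now means every zone is balanced. A small stand-alone sketch of that idiom follows; the zone table and the watermark check are mocked for illustration.

/* Sketch of the restructured zone scan in balance_pgdat(): walk zones from the
 * highest index down, remember the highest one below its watermark, and treat
 * "the loop ran off the end" (i < 0) as "all zones are balanced".  The zone
 * table and the watermark check are mocked for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

static const bool zone_ok[NR_ZONES] = { true, false, true };

int main(void)
{
	int i, end_zone = 0;

	for (i = NR_ZONES - 1; i >= 0; i--) {
		if (!zone_ok[i]) {
			end_zone = i;	/* highest zone that needs reclaim */
			break;		/* was "goto scan" before this change */
		}
	}
	if (i < 0) {			/* nothing broke out: all balanced */
		puts("all zones ok, nothing to reclaim");
		return 0;
	}

	/* the reclaim pass covers zones 0..end_zone, as the old "scan:" did */
	for (i = 0; i <= end_zone; i++)
		printf("shrink zone %d\n", i);
	return 0;
}
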
diff --git a/mm/vmstat.c b/mm/vmstat.c index 8614e8f6743b..dc005a0c96ae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -430,7 +430,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
430 | return 0; | 430 | return 0; |
431 | } | 431 | } |
432 | 432 | ||
433 | struct seq_operations fragmentation_op = { | 433 | const struct seq_operations fragmentation_op = { |
434 | .start = frag_start, | 434 | .start = frag_start, |
435 | .next = frag_next, | 435 | .next = frag_next, |
436 | .stop = frag_stop, | 436 | .stop = frag_stop, |
@@ -452,7 +452,7 @@ struct seq_operations fragmentation_op = { | |||
452 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ | 452 | #define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ |
453 | TEXT_FOR_HIGHMEM(xx) | 453 | TEXT_FOR_HIGHMEM(xx) |
454 | 454 | ||
455 | static char *vmstat_text[] = { | 455 | static const char * const vmstat_text[] = { |
456 | /* Zoned VM counters */ | 456 | /* Zoned VM counters */ |
457 | "nr_anon_pages", | 457 | "nr_anon_pages", |
458 | "nr_mapped", | 458 | "nr_mapped", |
@@ -597,7 +597,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
597 | return 0; | 597 | return 0; |
598 | } | 598 | } |
599 | 599 | ||
600 | struct seq_operations zoneinfo_op = { | 600 | const struct seq_operations zoneinfo_op = { |
601 | .start = frag_start, /* iterate over all zones. The same as in | 601 | .start = frag_start, /* iterate over all zones. The same as in |
602 | * fragmentation. */ | 602 | * fragmentation. */ |
603 | .next = frag_next, | 603 | .next = frag_next, |
@@ -660,7 +660,7 @@ static void vmstat_stop(struct seq_file *m, void *arg) | |||
660 | m->private = NULL; | 660 | m->private = NULL; |
661 | } | 661 | } |
662 | 662 | ||
663 | struct seq_operations vmstat_op = { | 663 | const struct seq_operations vmstat_op = { |
664 | .start = vmstat_start, | 664 | .start = vmstat_start, |
665 | .next = vmstat_next, | 665 | .next = vmstat_next, |
666 | .stop = vmstat_stop, | 666 | .stop = vmstat_stop, |
@@ -679,13 +679,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
679 | void *hcpu) | 679 | void *hcpu) |
680 | { | 680 | { |
681 | switch (action) { | 681 | switch (action) { |
682 | case CPU_UP_PREPARE: | 682 | case CPU_UP_PREPARE: |
683 | case CPU_UP_CANCELED: | 683 | case CPU_UP_CANCELED: |
684 | case CPU_DEAD: | 684 | case CPU_DEAD: |
685 | refresh_zone_stat_thresholds(); | 685 | refresh_zone_stat_thresholds(); |
686 | break; | 686 | break; |
687 | default: | 687 | default: |
688 | break; | 688 | break; |
689 | } | 689 | } |
690 | return NOTIFY_OK; | 690 | return NOTIFY_OK; |
691 | } | 691 | } |
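
Several hunks above (swaps_op, proc_swaps_operations, fragmentation_op, zoneinfo_op, vmstat_op) only add a const qualifier to seq_operations/file_operations tables, so the function-pointer tables can live in read-only data and cannot be patched at run time. A trivial user-space illustration of the same pattern follows; the ops structure and functions are invented for the example.

/* User-space illustration of const-qualifying an ops table: the function
 * pointers end up in read-only data on typical toolchains and cannot be
 * overwritten at run time.  The ops structure here is invented for the
 * example. */
#include <stdio.h>

struct ops {
	void (*start)(void);
	void (*stop)(void);
};

static void my_start(void) { puts("start"); }
static void my_stop(void)  { puts("stop"); }

static const struct ops my_ops = {	/* const: typically placed in .rodata */
	.start	= my_start,
	.stop	= my_stop,
};

static void run(const struct ops *ops)
{
	ops->start();
	ops->stop();
}

int main(void)
{
	run(&my_ops);
	return 0;
}
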