diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/allocpercpu.c | 20 | ||||
-rw-r--r-- | mm/bootmem.c | 37 | ||||
-rw-r--r-- | mm/bounce.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 265 | ||||
-rw-r--r-- | mm/filemap_xip.c | 70 | ||||
-rw-r--r-- | mm/fremap.c | 3 | ||||
-rw-r--r-- | mm/highmem.c | 5 | ||||
-rw-r--r-- | mm/hugetlb.c | 97 | ||||
-rw-r--r-- | mm/madvise.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 23 | ||||
-rw-r--r-- | mm/memory.c | 79 | ||||
-rw-r--r-- | mm/mempolicy.c | 1 | ||||
-rw-r--r-- | mm/migrate.c | 33 | ||||
-rw-r--r-- | mm/mlock.c | 2 | ||||
-rw-r--r-- | mm/mm_init.c | 10 | ||||
-rw-r--r-- | mm/mmap.c | 172 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 277 | ||||
-rw-r--r-- | mm/mmzone.c | 2 | ||||
-rw-r--r-- | mm/mprotect.c | 3 | ||||
-rw-r--r-- | mm/mremap.c | 6 | ||||
-rw-r--r-- | mm/nommu.c | 25 | ||||
-rw-r--r-- | mm/oom_kill.c | 6 | ||||
-rw-r--r-- | mm/page-writeback.c | 12 | ||||
-rw-r--r-- | mm/page_alloc.c | 45 | ||||
-rw-r--r-- | mm/page_isolation.c | 13 | ||||
-rw-r--r-- | mm/quicklist.c | 9 | ||||
-rw-r--r-- | mm/readahead.c | 6 | ||||
-rw-r--r-- | mm/rmap.c | 55 | ||||
-rw-r--r-- | mm/shmem.c | 15 | ||||
-rw-r--r-- | mm/shmem_acl.c | 2 | ||||
-rw-r--r-- | mm/slab.c | 12 | ||||
-rw-r--r-- | mm/slob.c | 16 | ||||
-rw-r--r-- | mm/slub.c | 45 | ||||
-rw-r--r-- | mm/sparse.c | 3 | ||||
-rw-r--r-- | mm/swap.c | 9 | ||||
-rw-r--r-- | mm/swap_state.c | 40 | ||||
-rw-r--r-- | mm/swapfile.c | 16 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 26 | ||||
-rw-r--r-- | mm/truncate.c | 16 | ||||
-rw-r--r-- | mm/util.c | 70 | ||||
-rw-r--r-- | mm/vmalloc.c | 13 | ||||
-rw-r--r-- | mm/vmscan.c | 88 | ||||
-rw-r--r-- | mm/vmstat.c | 19 |
45 files changed, 1304 insertions, 372 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index aa799007a11b..0bd9c2dbb2a0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -205,3 +205,6 @@ config NR_QUICK | |||
205 | config VIRT_TO_BUS | 205 | config VIRT_TO_BUS |
206 | def_bool y | 206 | def_bool y |
207 | depends on !ARCH_NO_VIRT_TO_BUS | 207 | depends on !ARCH_NO_VIRT_TO_BUS |
208 | |||
209 | config MMU_NOTIFIER | ||
210 | bool | ||
diff --git a/mm/Makefile b/mm/Makefile index 06ca2381fef1..da4ccf015aea 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o | |||
25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 26 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
27 | obj-$(CONFIG_SLOB) += slob.o | 27 | obj-$(CONFIG_SLOB) += slob.o |
28 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | ||
28 | obj-$(CONFIG_SLAB) += slab.o | 29 | obj-$(CONFIG_SLAB) += slab.o |
29 | obj-$(CONFIG_SLUB) += slub.o | 30 | obj-$(CONFIG_SLUB) += slub.o |
30 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 843364594e23..4297bc41bfd2 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c | |||
@@ -18,27 +18,28 @@ | |||
18 | * Depopulating per-cpu data for a cpu going offline would be a typical | 18 | * Depopulating per-cpu data for a cpu going offline would be a typical |
19 | * use case. You need to register a cpu hotplug handler for that purpose. | 19 | * use case. You need to register a cpu hotplug handler for that purpose. |
20 | */ | 20 | */ |
21 | void percpu_depopulate(void *__pdata, int cpu) | 21 | static void percpu_depopulate(void *__pdata, int cpu) |
22 | { | 22 | { |
23 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 23 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
24 | 24 | ||
25 | kfree(pdata->ptrs[cpu]); | 25 | kfree(pdata->ptrs[cpu]); |
26 | pdata->ptrs[cpu] = NULL; | 26 | pdata->ptrs[cpu] = NULL; |
27 | } | 27 | } |
28 | EXPORT_SYMBOL_GPL(percpu_depopulate); | ||
29 | 28 | ||
30 | /** | 29 | /** |
31 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's | 30 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's |
32 | * @__pdata: per-cpu data to depopulate | 31 | * @__pdata: per-cpu data to depopulate |
33 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | 32 | * @mask: depopulate per-cpu data for cpu's selected through mask bits |
34 | */ | 33 | */ |
35 | void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) | 34 | static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) |
36 | { | 35 | { |
37 | int cpu; | 36 | int cpu; |
38 | for_each_cpu_mask_nr(cpu, *mask) | 37 | for_each_cpu_mask_nr(cpu, *mask) |
39 | percpu_depopulate(__pdata, cpu); | 38 | percpu_depopulate(__pdata, cpu); |
40 | } | 39 | } |
41 | EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); | 40 | |
41 | #define percpu_depopulate_mask(__pdata, mask) \ | ||
42 | __percpu_depopulate_mask((__pdata), &(mask)) | ||
42 | 43 | ||
43 | /** | 44 | /** |
44 | * percpu_populate - populate per-cpu data for given cpu | 45 | * percpu_populate - populate per-cpu data for given cpu |
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); | |||
51 | * use case. You need to register a cpu hotplug handler for that purpose. | 52 | * use case. You need to register a cpu hotplug handler for that purpose. |
52 | * Per-cpu object is populated with zeroed buffer. | 53 | * Per-cpu object is populated with zeroed buffer. |
53 | */ | 54 | */ |
54 | void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | 55 | static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) |
55 | { | 56 | { |
56 | struct percpu_data *pdata = __percpu_disguise(__pdata); | 57 | struct percpu_data *pdata = __percpu_disguise(__pdata); |
57 | int node = cpu_to_node(cpu); | 58 | int node = cpu_to_node(cpu); |
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | |||
68 | pdata->ptrs[cpu] = kzalloc(size, gfp); | 69 | pdata->ptrs[cpu] = kzalloc(size, gfp); |
69 | return pdata->ptrs[cpu]; | 70 | return pdata->ptrs[cpu]; |
70 | } | 71 | } |
71 | EXPORT_SYMBOL_GPL(percpu_populate); | ||
72 | 72 | ||
73 | /** | 73 | /** |
74 | * percpu_populate_mask - populate per-cpu data for more cpu's | 74 | * percpu_populate_mask - populate per-cpu data for more cpu's |
@@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate); | |||
79 | * | 79 | * |
80 | * Per-cpu objects are populated with zeroed buffers. | 80 | * Per-cpu objects are populated with zeroed buffers. |
81 | */ | 81 | */ |
82 | int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | 82 | static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, |
83 | cpumask_t *mask) | 83 | cpumask_t *mask) |
84 | { | 84 | { |
85 | cpumask_t populated; | 85 | cpumask_t populated; |
86 | int cpu; | 86 | int cpu; |
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | |||
94 | cpu_set(cpu, populated); | 94 | cpu_set(cpu, populated); |
95 | return 0; | 95 | return 0; |
96 | } | 96 | } |
97 | EXPORT_SYMBOL_GPL(__percpu_populate_mask); | 97 | |
98 | #define percpu_populate_mask(__pdata, size, gfp, mask) \ | ||
99 | __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) | ||
98 | 100 | ||
99 | /** | 101 | /** |
100 | * percpu_alloc_mask - initial setup of per-cpu data | 102 | * percpu_alloc_mask - initial setup of per-cpu data |
diff --git a/mm/bootmem.c b/mm/bootmem.c index 4af15d0340ad..ad8eec6e44a8 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, | |||
405 | } | 405 | } |
406 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 406 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ |
407 | 407 | ||
408 | static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | ||
409 | unsigned long step) | ||
410 | { | ||
411 | unsigned long base = bdata->node_min_pfn; | ||
412 | |||
413 | /* | ||
414 | * Align the index with respect to the node start so that the | ||
415 | * combination of both satisfies the requested alignment. | ||
416 | */ | ||
417 | |||
418 | return ALIGN(base + idx, step) - base; | ||
419 | } | ||
420 | |||
421 | static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, | ||
422 | unsigned long align) | ||
423 | { | ||
424 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); | ||
425 | |||
426 | /* Same as align_idx for byte offsets */ | ||
427 | |||
428 | return ALIGN(base + off, align) - base; | ||
429 | } | ||
430 | |||
408 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | 431 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, |
409 | unsigned long size, unsigned long align, | 432 | unsigned long size, unsigned long align, |
410 | unsigned long goal, unsigned long limit) | 433 | unsigned long goal, unsigned long limit) |
@@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
441 | else | 464 | else |
442 | start = ALIGN(min, step); | 465 | start = ALIGN(min, step); |
443 | 466 | ||
444 | sidx = start - bdata->node_min_pfn;; | 467 | sidx = start - bdata->node_min_pfn; |
445 | midx = max - bdata->node_min_pfn; | 468 | midx = max - bdata->node_min_pfn; |
446 | 469 | ||
447 | if (bdata->hint_idx > sidx) { | 470 | if (bdata->hint_idx > sidx) { |
@@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
450 | * catch the fallback below. | 473 | * catch the fallback below. |
451 | */ | 474 | */ |
452 | fallback = sidx + 1; | 475 | fallback = sidx + 1; |
453 | sidx = ALIGN(bdata->hint_idx, step); | 476 | sidx = align_idx(bdata, bdata->hint_idx, step); |
454 | } | 477 | } |
455 | 478 | ||
456 | while (1) { | 479 | while (1) { |
@@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | |||
459 | unsigned long eidx, i, start_off, end_off; | 482 | unsigned long eidx, i, start_off, end_off; |
460 | find_block: | 483 | find_block: |
461 | sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); | 484 | sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); |
462 | sidx = ALIGN(sidx, step); | 485 | sidx = align_idx(bdata, sidx, step); |
463 | eidx = sidx + PFN_UP(size); | 486 | eidx = sidx + PFN_UP(size); |
464 | 487 | ||
465 | if (sidx >= midx || eidx > midx) | 488 | if (sidx >= midx || eidx > midx) |
@@ -467,15 +490,15 @@ find_block: | |||
467 | 490 | ||
468 | for (i = sidx; i < eidx; i++) | 491 | for (i = sidx; i < eidx; i++) |
469 | if (test_bit(i, bdata->node_bootmem_map)) { | 492 | if (test_bit(i, bdata->node_bootmem_map)) { |
470 | sidx = ALIGN(i, step); | 493 | sidx = align_idx(bdata, i, step); |
471 | if (sidx == i) | 494 | if (sidx == i) |
472 | sidx += step; | 495 | sidx += step; |
473 | goto find_block; | 496 | goto find_block; |
474 | } | 497 | } |
475 | 498 | ||
476 | if (bdata->last_end_off && | 499 | if (bdata->last_end_off & (PAGE_SIZE - 1) && |
477 | PFN_DOWN(bdata->last_end_off) + 1 == sidx) | 500 | PFN_DOWN(bdata->last_end_off) + 1 == sidx) |
478 | start_off = ALIGN(bdata->last_end_off, align); | 501 | start_off = align_off(bdata, bdata->last_end_off, align); |
479 | else | 502 | else |
480 | start_off = PFN_PHYS(sidx); | 503 | start_off = PFN_PHYS(sidx); |
481 | 504 | ||
@@ -499,7 +522,7 @@ find_block: | |||
499 | } | 522 | } |
500 | 523 | ||
501 | if (fallback) { | 524 | if (fallback) { |
502 | sidx = ALIGN(fallback - 1, step); | 525 | sidx = align_idx(bdata, fallback - 1, step); |
503 | fallback = 0; | 526 | fallback = 0; |
504 | goto find_block; | 527 | goto find_block; |
505 | } | 528 | } |
diff --git a/mm/bounce.c b/mm/bounce.c index b6d2d0f1019b..06722c403058 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
267 | /* | 267 | /* |
268 | * Data-less bio, nothing to bounce | 268 | * Data-less bio, nothing to bounce |
269 | */ | 269 | */ |
270 | if (bio_empty_barrier(*bio_orig)) | 270 | if (!bio_has_data(*bio_orig)) |
271 | return; | 271 | return; |
272 | 272 | ||
273 | /* | 273 | /* |
diff --git a/mm/filemap.c b/mm/filemap.c index 2d3ec1ffc66e..876bc595d0f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -109,7 +109,7 @@ | |||
109 | /* | 109 | /* |
110 | * Remove a page from the page cache and free it. Caller has to make | 110 | * Remove a page from the page cache and free it. Caller has to make |
111 | * sure the page is locked and that nobody else uses it - or that usage | 111 | * sure the page is locked and that nobody else uses it - or that usage |
112 | * is safe. The caller must hold a write_lock on the mapping's tree_lock. | 112 | * is safe. The caller must hold the mapping's tree_lock. |
113 | */ | 113 | */ |
114 | void __remove_from_page_cache(struct page *page) | 114 | void __remove_from_page_cache(struct page *page) |
115 | { | 115 | { |
@@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page) | |||
141 | 141 | ||
142 | BUG_ON(!PageLocked(page)); | 142 | BUG_ON(!PageLocked(page)); |
143 | 143 | ||
144 | write_lock_irq(&mapping->tree_lock); | 144 | spin_lock_irq(&mapping->tree_lock); |
145 | __remove_from_page_cache(page); | 145 | __remove_from_page_cache(page); |
146 | write_unlock_irq(&mapping->tree_lock); | 146 | spin_unlock_irq(&mapping->tree_lock); |
147 | } | 147 | } |
148 | 148 | ||
149 | static int sync_page(void *word) | 149 | static int sync_page(void *word) |
@@ -442,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
442 | } | 442 | } |
443 | 443 | ||
444 | /** | 444 | /** |
445 | * add_to_page_cache - add newly allocated pagecache pages | 445 | * add_to_page_cache_locked - add a locked page to the pagecache |
446 | * @page: page to add | 446 | * @page: page to add |
447 | * @mapping: the page's address_space | 447 | * @mapping: the page's address_space |
448 | * @offset: page index | 448 | * @offset: page index |
449 | * @gfp_mask: page allocation mode | 449 | * @gfp_mask: page allocation mode |
450 | * | 450 | * |
451 | * This function is used to add newly allocated pagecache pages; | 451 | * This function is used to add a page to the pagecache. It must be locked. |
452 | * the page is new, so we can just run SetPageLocked() against it. | ||
453 | * The other page state flags were set by rmqueue(). | ||
454 | * | ||
455 | * This function does not add the page to the LRU. The caller must do that. | 452 | * This function does not add the page to the LRU. The caller must do that. |
456 | */ | 453 | */ |
457 | int add_to_page_cache(struct page *page, struct address_space *mapping, | 454 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
458 | pgoff_t offset, gfp_t gfp_mask) | 455 | pgoff_t offset, gfp_t gfp_mask) |
459 | { | 456 | { |
460 | int error = mem_cgroup_cache_charge(page, current->mm, | 457 | int error; |
458 | |||
459 | VM_BUG_ON(!PageLocked(page)); | ||
460 | |||
461 | error = mem_cgroup_cache_charge(page, current->mm, | ||
461 | gfp_mask & ~__GFP_HIGHMEM); | 462 | gfp_mask & ~__GFP_HIGHMEM); |
462 | if (error) | 463 | if (error) |
463 | goto out; | 464 | goto out; |
464 | 465 | ||
465 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 466 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
466 | if (error == 0) { | 467 | if (error == 0) { |
467 | write_lock_irq(&mapping->tree_lock); | 468 | page_cache_get(page); |
469 | page->mapping = mapping; | ||
470 | page->index = offset; | ||
471 | |||
472 | spin_lock_irq(&mapping->tree_lock); | ||
468 | error = radix_tree_insert(&mapping->page_tree, offset, page); | 473 | error = radix_tree_insert(&mapping->page_tree, offset, page); |
469 | if (!error) { | 474 | if (likely(!error)) { |
470 | page_cache_get(page); | ||
471 | SetPageLocked(page); | ||
472 | page->mapping = mapping; | ||
473 | page->index = offset; | ||
474 | mapping->nrpages++; | 475 | mapping->nrpages++; |
475 | __inc_zone_page_state(page, NR_FILE_PAGES); | 476 | __inc_zone_page_state(page, NR_FILE_PAGES); |
476 | } else | 477 | } else { |
478 | page->mapping = NULL; | ||
477 | mem_cgroup_uncharge_cache_page(page); | 479 | mem_cgroup_uncharge_cache_page(page); |
480 | page_cache_release(page); | ||
481 | } | ||
478 | 482 | ||
479 | write_unlock_irq(&mapping->tree_lock); | 483 | spin_unlock_irq(&mapping->tree_lock); |
480 | radix_tree_preload_end(); | 484 | radix_tree_preload_end(); |
481 | } else | 485 | } else |
482 | mem_cgroup_uncharge_cache_page(page); | 486 | mem_cgroup_uncharge_cache_page(page); |
483 | out: | 487 | out: |
484 | return error; | 488 | return error; |
485 | } | 489 | } |
486 | EXPORT_SYMBOL(add_to_page_cache); | 490 | EXPORT_SYMBOL(add_to_page_cache_locked); |
487 | 491 | ||
488 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 492 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
489 | pgoff_t offset, gfp_t gfp_mask) | 493 | pgoff_t offset, gfp_t gfp_mask) |
@@ -554,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit); | |||
554 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | 558 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
555 | * | 559 | * |
556 | * The first mb is necessary to safely close the critical section opened by the | 560 | * The first mb is necessary to safely close the critical section opened by the |
557 | * TestSetPageLocked(), the second mb is necessary to enforce ordering between | 561 | * test_and_set_bit() to lock the page; the second mb is necessary to enforce |
558 | * the clear_bit and the read of the waitqueue (to avoid SMP races with a | 562 | * ordering between the clear_bit and the read of the waitqueue (to avoid SMP |
559 | * parallel wait_on_page_locked()). | 563 | * races with a parallel wait_on_page_locked()). |
560 | */ | 564 | */ |
561 | void unlock_page(struct page *page) | 565 | void unlock_page(struct page *page) |
562 | { | 566 | { |
563 | smp_mb__before_clear_bit(); | 567 | smp_mb__before_clear_bit(); |
564 | if (!TestClearPageLocked(page)) | 568 | if (!test_and_clear_bit(PG_locked, &page->flags)) |
565 | BUG(); | 569 | BUG(); |
566 | smp_mb__after_clear_bit(); | 570 | smp_mb__after_clear_bit(); |
567 | wake_up_page(page, PG_locked); | 571 | wake_up_page(page, PG_locked); |
@@ -633,15 +637,35 @@ void __lock_page_nosync(struct page *page) | |||
633 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 637 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
634 | * If yes, increment its refcount and return it; if no, return NULL. | 638 | * If yes, increment its refcount and return it; if no, return NULL. |
635 | */ | 639 | */ |
636 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) | 640 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) |
637 | { | 641 | { |
642 | void **pagep; | ||
638 | struct page *page; | 643 | struct page *page; |
639 | 644 | ||
640 | read_lock_irq(&mapping->tree_lock); | 645 | rcu_read_lock(); |
641 | page = radix_tree_lookup(&mapping->page_tree, offset); | 646 | repeat: |
642 | if (page) | 647 | page = NULL; |
643 | page_cache_get(page); | 648 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
644 | read_unlock_irq(&mapping->tree_lock); | 649 | if (pagep) { |
650 | page = radix_tree_deref_slot(pagep); | ||
651 | if (unlikely(!page || page == RADIX_TREE_RETRY)) | ||
652 | goto repeat; | ||
653 | |||
654 | if (!page_cache_get_speculative(page)) | ||
655 | goto repeat; | ||
656 | |||
657 | /* | ||
658 | * Has the page moved? | ||
659 | * This is part of the lockless pagecache protocol. See | ||
660 | * include/linux/pagemap.h for details. | ||
661 | */ | ||
662 | if (unlikely(page != *pagep)) { | ||
663 | page_cache_release(page); | ||
664 | goto repeat; | ||
665 | } | ||
666 | } | ||
667 | rcu_read_unlock(); | ||
668 | |||
645 | return page; | 669 | return page; |
646 | } | 670 | } |
647 | EXPORT_SYMBOL(find_get_page); | 671 | EXPORT_SYMBOL(find_get_page); |
@@ -656,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); | |||
656 | * | 680 | * |
657 | * Returns zero if the page was not present. find_lock_page() may sleep. | 681 | * Returns zero if the page was not present. find_lock_page() may sleep. |
658 | */ | 682 | */ |
659 | struct page *find_lock_page(struct address_space *mapping, | 683 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) |
660 | pgoff_t offset) | ||
661 | { | 684 | { |
662 | struct page *page; | 685 | struct page *page; |
663 | 686 | ||
664 | repeat: | 687 | repeat: |
665 | read_lock_irq(&mapping->tree_lock); | 688 | page = find_get_page(mapping, offset); |
666 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
667 | if (page) { | 689 | if (page) { |
668 | page_cache_get(page); | 690 | lock_page(page); |
669 | if (TestSetPageLocked(page)) { | 691 | /* Has the page been truncated? */ |
670 | read_unlock_irq(&mapping->tree_lock); | 692 | if (unlikely(page->mapping != mapping)) { |
671 | __lock_page(page); | 693 | unlock_page(page); |
672 | 694 | page_cache_release(page); | |
673 | /* Has the page been truncated while we slept? */ | 695 | goto repeat; |
674 | if (unlikely(page->mapping != mapping)) { | ||
675 | unlock_page(page); | ||
676 | page_cache_release(page); | ||
677 | goto repeat; | ||
678 | } | ||
679 | VM_BUG_ON(page->index != offset); | ||
680 | goto out; | ||
681 | } | 696 | } |
697 | VM_BUG_ON(page->index != offset); | ||
682 | } | 698 | } |
683 | read_unlock_irq(&mapping->tree_lock); | ||
684 | out: | ||
685 | return page; | 699 | return page; |
686 | } | 700 | } |
687 | EXPORT_SYMBOL(find_lock_page); | 701 | EXPORT_SYMBOL(find_lock_page); |
@@ -747,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
747 | { | 761 | { |
748 | unsigned int i; | 762 | unsigned int i; |
749 | unsigned int ret; | 763 | unsigned int ret; |
764 | unsigned int nr_found; | ||
765 | |||
766 | rcu_read_lock(); | ||
767 | restart: | ||
768 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
769 | (void ***)pages, start, nr_pages); | ||
770 | ret = 0; | ||
771 | for (i = 0; i < nr_found; i++) { | ||
772 | struct page *page; | ||
773 | repeat: | ||
774 | page = radix_tree_deref_slot((void **)pages[i]); | ||
775 | if (unlikely(!page)) | ||
776 | continue; | ||
777 | /* | ||
778 | * this can only trigger if nr_found == 1, making livelock | ||
779 | * a non issue. | ||
780 | */ | ||
781 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
782 | goto restart; | ||
783 | |||
784 | if (!page_cache_get_speculative(page)) | ||
785 | goto repeat; | ||
786 | |||
787 | /* Has the page moved? */ | ||
788 | if (unlikely(page != *((void **)pages[i]))) { | ||
789 | page_cache_release(page); | ||
790 | goto repeat; | ||
791 | } | ||
750 | 792 | ||
751 | read_lock_irq(&mapping->tree_lock); | 793 | pages[ret] = page; |
752 | ret = radix_tree_gang_lookup(&mapping->page_tree, | 794 | ret++; |
753 | (void **)pages, start, nr_pages); | 795 | } |
754 | for (i = 0; i < ret; i++) | 796 | rcu_read_unlock(); |
755 | page_cache_get(pages[i]); | ||
756 | read_unlock_irq(&mapping->tree_lock); | ||
757 | return ret; | 797 | return ret; |
758 | } | 798 | } |
759 | 799 | ||
@@ -774,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
774 | { | 814 | { |
775 | unsigned int i; | 815 | unsigned int i; |
776 | unsigned int ret; | 816 | unsigned int ret; |
817 | unsigned int nr_found; | ||
818 | |||
819 | rcu_read_lock(); | ||
820 | restart: | ||
821 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
822 | (void ***)pages, index, nr_pages); | ||
823 | ret = 0; | ||
824 | for (i = 0; i < nr_found; i++) { | ||
825 | struct page *page; | ||
826 | repeat: | ||
827 | page = radix_tree_deref_slot((void **)pages[i]); | ||
828 | if (unlikely(!page)) | ||
829 | continue; | ||
830 | /* | ||
831 | * this can only trigger if nr_found == 1, making livelock | ||
832 | * a non issue. | ||
833 | */ | ||
834 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
835 | goto restart; | ||
777 | 836 | ||
778 | read_lock_irq(&mapping->tree_lock); | 837 | if (page->mapping == NULL || page->index != index) |
779 | ret = radix_tree_gang_lookup(&mapping->page_tree, | ||
780 | (void **)pages, index, nr_pages); | ||
781 | for (i = 0; i < ret; i++) { | ||
782 | if (pages[i]->mapping == NULL || pages[i]->index != index) | ||
783 | break; | 838 | break; |
784 | 839 | ||
785 | page_cache_get(pages[i]); | 840 | if (!page_cache_get_speculative(page)) |
841 | goto repeat; | ||
842 | |||
843 | /* Has the page moved? */ | ||
844 | if (unlikely(page != *((void **)pages[i]))) { | ||
845 | page_cache_release(page); | ||
846 | goto repeat; | ||
847 | } | ||
848 | |||
849 | pages[ret] = page; | ||
850 | ret++; | ||
786 | index++; | 851 | index++; |
787 | } | 852 | } |
788 | read_unlock_irq(&mapping->tree_lock); | 853 | rcu_read_unlock(); |
789 | return i; | 854 | return ret; |
790 | } | 855 | } |
791 | EXPORT_SYMBOL(find_get_pages_contig); | 856 | EXPORT_SYMBOL(find_get_pages_contig); |
792 | 857 | ||
@@ -806,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
806 | { | 871 | { |
807 | unsigned int i; | 872 | unsigned int i; |
808 | unsigned int ret; | 873 | unsigned int ret; |
874 | unsigned int nr_found; | ||
875 | |||
876 | rcu_read_lock(); | ||
877 | restart: | ||
878 | nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, | ||
879 | (void ***)pages, *index, nr_pages, tag); | ||
880 | ret = 0; | ||
881 | for (i = 0; i < nr_found; i++) { | ||
882 | struct page *page; | ||
883 | repeat: | ||
884 | page = radix_tree_deref_slot((void **)pages[i]); | ||
885 | if (unlikely(!page)) | ||
886 | continue; | ||
887 | /* | ||
888 | * this can only trigger if nr_found == 1, making livelock | ||
889 | * a non issue. | ||
890 | */ | ||
891 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
892 | goto restart; | ||
893 | |||
894 | if (!page_cache_get_speculative(page)) | ||
895 | goto repeat; | ||
896 | |||
897 | /* Has the page moved? */ | ||
898 | if (unlikely(page != *((void **)pages[i]))) { | ||
899 | page_cache_release(page); | ||
900 | goto repeat; | ||
901 | } | ||
902 | |||
903 | pages[ret] = page; | ||
904 | ret++; | ||
905 | } | ||
906 | rcu_read_unlock(); | ||
809 | 907 | ||
810 | read_lock_irq(&mapping->tree_lock); | ||
811 | ret = radix_tree_gang_lookup_tag(&mapping->page_tree, | ||
812 | (void **)pages, *index, nr_pages, tag); | ||
813 | for (i = 0; i < ret; i++) | ||
814 | page_cache_get(pages[i]); | ||
815 | if (ret) | 908 | if (ret) |
816 | *index = pages[ret - 1]->index + 1; | 909 | *index = pages[ret - 1]->index + 1; |
817 | read_unlock_irq(&mapping->tree_lock); | 910 | |
818 | return ret; | 911 | return ret; |
819 | } | 912 | } |
820 | EXPORT_SYMBOL(find_get_pages_tag); | 913 | EXPORT_SYMBOL(find_get_pages_tag); |
@@ -838,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
838 | struct page *page = find_get_page(mapping, index); | 931 | struct page *page = find_get_page(mapping, index); |
839 | 932 | ||
840 | if (page) { | 933 | if (page) { |
841 | if (!TestSetPageLocked(page)) | 934 | if (trylock_page(page)) |
842 | return page; | 935 | return page; |
843 | page_cache_release(page); | 936 | page_cache_release(page); |
844 | return NULL; | 937 | return NULL; |
@@ -930,8 +1023,17 @@ find_page: | |||
930 | ra, filp, page, | 1023 | ra, filp, page, |
931 | index, last_index - index); | 1024 | index, last_index - index); |
932 | } | 1025 | } |
933 | if (!PageUptodate(page)) | 1026 | if (!PageUptodate(page)) { |
934 | goto page_not_up_to_date; | 1027 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || |
1028 | !mapping->a_ops->is_partially_uptodate) | ||
1029 | goto page_not_up_to_date; | ||
1030 | if (!trylock_page(page)) | ||
1031 | goto page_not_up_to_date; | ||
1032 | if (!mapping->a_ops->is_partially_uptodate(page, | ||
1033 | desc, offset)) | ||
1034 | goto page_not_up_to_date_locked; | ||
1035 | unlock_page(page); | ||
1036 | } | ||
935 | page_ok: | 1037 | page_ok: |
936 | /* | 1038 | /* |
937 | * i_size must be checked after we know the page is Uptodate. | 1039 | * i_size must be checked after we know the page is Uptodate. |
@@ -1001,6 +1103,7 @@ page_not_up_to_date: | |||
1001 | if (lock_page_killable(page)) | 1103 | if (lock_page_killable(page)) |
1002 | goto readpage_eio; | 1104 | goto readpage_eio; |
1003 | 1105 | ||
1106 | page_not_up_to_date_locked: | ||
1004 | /* Did it get truncated before we got the lock? */ | 1107 | /* Did it get truncated before we got the lock? */ |
1005 | if (!page->mapping) { | 1108 | if (!page->mapping) { |
1006 | unlock_page(page); | 1109 | unlock_page(page); |
@@ -1665,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill) | |||
1665 | return notify_change(dentry, &newattrs); | 1768 | return notify_change(dentry, &newattrs); |
1666 | } | 1769 | } |
1667 | 1770 | ||
1668 | int remove_suid(struct dentry *dentry) | 1771 | int file_remove_suid(struct file *file) |
1669 | { | 1772 | { |
1773 | struct dentry *dentry = file->f_path.dentry; | ||
1670 | int killsuid = should_remove_suid(dentry); | 1774 | int killsuid = should_remove_suid(dentry); |
1671 | int killpriv = security_inode_need_killpriv(dentry); | 1775 | int killpriv = security_inode_need_killpriv(dentry); |
1672 | int error = 0; | 1776 | int error = 0; |
@@ -1680,7 +1784,7 @@ int remove_suid(struct dentry *dentry) | |||
1680 | 1784 | ||
1681 | return error; | 1785 | return error; |
1682 | } | 1786 | } |
1683 | EXPORT_SYMBOL(remove_suid); | 1787 | EXPORT_SYMBOL(file_remove_suid); |
1684 | 1788 | ||
1685 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 1789 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
1686 | const struct iovec *iov, size_t base, size_t bytes) | 1790 | const struct iovec *iov, size_t base, size_t bytes) |
@@ -1775,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
1775 | * The !iov->iov_len check ensures we skip over unlikely | 1879 | * The !iov->iov_len check ensures we skip over unlikely |
1776 | * zero-length segments (without overrunning the iovec). | 1880 | * zero-length segments (without overrunning the iovec). |
1777 | */ | 1881 | */ |
1778 | while (bytes || unlikely(!iov->iov_len && i->count)) { | 1882 | while (bytes || unlikely(i->count && !iov->iov_len)) { |
1779 | int copy; | 1883 | int copy; |
1780 | 1884 | ||
1781 | copy = min(bytes, iov->iov_len - base); | 1885 | copy = min(bytes, iov->iov_len - base); |
@@ -2025,13 +2129,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2025 | * After a write we want buffered reads to be sure to go to disk to get | 2129 | * After a write we want buffered reads to be sure to go to disk to get |
2026 | * the new data. We invalidate clean cached pages from the region we're | 2130 | * the new data. We invalidate clean cached pages from the region we're |
2027 | * about to write. We do this *before* the write so that we can return | 2131 | * about to write. We do this *before* the write so that we can return |
2028 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | 2132 | * without clobbering -EIOCBQUEUED from ->direct_IO(). |
2029 | */ | 2133 | */ |
2030 | if (mapping->nrpages) { | 2134 | if (mapping->nrpages) { |
2031 | written = invalidate_inode_pages2_range(mapping, | 2135 | written = invalidate_inode_pages2_range(mapping, |
2032 | pos >> PAGE_CACHE_SHIFT, end); | 2136 | pos >> PAGE_CACHE_SHIFT, end); |
2033 | if (written) | 2137 | /* |
2138 | * If a page can not be invalidated, return 0 to fall back | ||
2139 | * to buffered write. | ||
2140 | */ | ||
2141 | if (written) { | ||
2142 | if (written == -EBUSY) | ||
2143 | return 0; | ||
2034 | goto out; | 2144 | goto out; |
2145 | } | ||
2035 | } | 2146 | } |
2036 | 2147 | ||
2037 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); | 2148 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); |
@@ -2436,7 +2547,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2436 | if (count == 0) | 2547 | if (count == 0) |
2437 | goto out; | 2548 | goto out; |
2438 | 2549 | ||
2439 | err = remove_suid(file->f_path.dentry); | 2550 | err = file_remove_suid(file); |
2440 | if (err) | 2551 | if (err) |
2441 | goto out; | 2552 | goto out; |
2442 | 2553 | ||
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9d..b5167dfb2f2d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -13,7 +13,10 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/mmu_notifier.h> | ||
16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | #include <linux/seqlock.h> | ||
19 | #include <linux/mutex.h> | ||
17 | #include <asm/tlbflush.h> | 20 | #include <asm/tlbflush.h> |
18 | #include <asm/io.h> | 21 | #include <asm/io.h> |
19 | 22 | ||
@@ -21,22 +24,18 @@ | |||
21 | * We do use our own empty page to avoid interference with other users | 24 | * We do use our own empty page to avoid interference with other users |
22 | * of ZERO_PAGE(), such as /dev/zero | 25 | * of ZERO_PAGE(), such as /dev/zero |
23 | */ | 26 | */ |
27 | static DEFINE_MUTEX(xip_sparse_mutex); | ||
28 | static seqcount_t xip_sparse_seq = SEQCNT_ZERO; | ||
24 | static struct page *__xip_sparse_page; | 29 | static struct page *__xip_sparse_page; |
25 | 30 | ||
31 | /* called under xip_sparse_mutex */ | ||
26 | static struct page *xip_sparse_page(void) | 32 | static struct page *xip_sparse_page(void) |
27 | { | 33 | { |
28 | if (!__xip_sparse_page) { | 34 | if (!__xip_sparse_page) { |
29 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); | 35 | struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); |
30 | 36 | ||
31 | if (page) { | 37 | if (page) |
32 | static DEFINE_SPINLOCK(xip_alloc_lock); | 38 | __xip_sparse_page = page; |
33 | spin_lock(&xip_alloc_lock); | ||
34 | if (!__xip_sparse_page) | ||
35 | __xip_sparse_page = page; | ||
36 | else | ||
37 | __free_page(page); | ||
38 | spin_unlock(&xip_alloc_lock); | ||
39 | } | ||
40 | } | 39 | } |
41 | return __xip_sparse_page; | 40 | return __xip_sparse_page; |
42 | } | 41 | } |
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping, | |||
173 | pte_t pteval; | 172 | pte_t pteval; |
174 | spinlock_t *ptl; | 173 | spinlock_t *ptl; |
175 | struct page *page; | 174 | struct page *page; |
175 | unsigned count; | ||
176 | int locked = 0; | ||
177 | |||
178 | count = read_seqcount_begin(&xip_sparse_seq); | ||
176 | 179 | ||
177 | page = __xip_sparse_page; | 180 | page = __xip_sparse_page; |
178 | if (!page) | 181 | if (!page) |
179 | return; | 182 | return; |
180 | 183 | ||
184 | retry: | ||
181 | spin_lock(&mapping->i_mmap_lock); | 185 | spin_lock(&mapping->i_mmap_lock); |
182 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 186 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
183 | mm = vma->vm_mm; | 187 | mm = vma->vm_mm; |
184 | address = vma->vm_start + | 188 | address = vma->vm_start + |
185 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
186 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 190 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
187 | pte = page_check_address(page, mm, address, &ptl); | 191 | pte = page_check_address(page, mm, address, &ptl, 1); |
188 | if (pte) { | 192 | if (pte) { |
189 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
190 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
191 | pteval = ptep_clear_flush(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
192 | page_remove_rmap(page, vma); | 196 | page_remove_rmap(page, vma); |
193 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, file_rss); |
194 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping, | |||
197 | } | 201 | } |
198 | } | 202 | } |
199 | spin_unlock(&mapping->i_mmap_lock); | 203 | spin_unlock(&mapping->i_mmap_lock); |
204 | |||
205 | if (locked) { | ||
206 | mutex_unlock(&xip_sparse_mutex); | ||
207 | } else if (read_seqcount_retry(&xip_sparse_seq, count)) { | ||
208 | mutex_lock(&xip_sparse_mutex); | ||
209 | locked = 1; | ||
210 | goto retry; | ||
211 | } | ||
200 | } | 212 | } |
201 | 213 | ||
202 | /* | 214 | /* |
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
217 | int error; | 229 | int error; |
218 | 230 | ||
219 | /* XXX: are VM_FAULT_ codes OK? */ | 231 | /* XXX: are VM_FAULT_ codes OK? */ |
220 | 232 | again: | |
221 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 233 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
222 | if (vmf->pgoff >= size) | 234 | if (vmf->pgoff >= size) |
223 | return VM_FAULT_SIGBUS; | 235 | return VM_FAULT_SIGBUS; |
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
236 | int err; | 248 | int err; |
237 | 249 | ||
238 | /* maybe shared writable, allocate new block */ | 250 | /* maybe shared writable, allocate new block */ |
251 | mutex_lock(&xip_sparse_mutex); | ||
239 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, | 252 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, |
240 | &xip_mem, &xip_pfn); | 253 | &xip_mem, &xip_pfn); |
254 | mutex_unlock(&xip_sparse_mutex); | ||
241 | if (error) | 255 | if (error) |
242 | return VM_FAULT_SIGBUS; | 256 | return VM_FAULT_SIGBUS; |
243 | /* unmap sparse mappings at pgoff from all other vmas */ | 257 | /* unmap sparse mappings at pgoff from all other vmas */ |
@@ -251,14 +265,34 @@ found: | |||
251 | BUG_ON(err); | 265 | BUG_ON(err); |
252 | return VM_FAULT_NOPAGE; | 266 | return VM_FAULT_NOPAGE; |
253 | } else { | 267 | } else { |
268 | int err, ret = VM_FAULT_OOM; | ||
269 | |||
270 | mutex_lock(&xip_sparse_mutex); | ||
271 | write_seqcount_begin(&xip_sparse_seq); | ||
272 | error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, | ||
273 | &xip_mem, &xip_pfn); | ||
274 | if (unlikely(!error)) { | ||
275 | write_seqcount_end(&xip_sparse_seq); | ||
276 | mutex_unlock(&xip_sparse_mutex); | ||
277 | goto again; | ||
278 | } | ||
279 | if (error != -ENODATA) | ||
280 | goto out; | ||
254 | /* not shared and writable, use xip_sparse_page() */ | 281 | /* not shared and writable, use xip_sparse_page() */ |
255 | page = xip_sparse_page(); | 282 | page = xip_sparse_page(); |
256 | if (!page) | 283 | if (!page) |
257 | return VM_FAULT_OOM; | 284 | goto out; |
285 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, | ||
286 | page); | ||
287 | if (err == -ENOMEM) | ||
288 | goto out; | ||
258 | 289 | ||
259 | page_cache_get(page); | 290 | ret = VM_FAULT_NOPAGE; |
260 | vmf->page = page; | 291 | out: |
261 | return 0; | 292 | write_seqcount_end(&xip_sparse_seq); |
293 | mutex_unlock(&xip_sparse_mutex); | ||
294 | |||
295 | return ret; | ||
262 | } | 296 | } |
263 | } | 297 | } |
264 | 298 | ||
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
307 | &xip_mem, &xip_pfn); | 341 | &xip_mem, &xip_pfn); |
308 | if (status == -ENODATA) { | 342 | if (status == -ENODATA) { |
309 | /* we allocate a new page unmap it */ | 343 | /* we allocate a new page unmap it */ |
344 | mutex_lock(&xip_sparse_mutex); | ||
310 | status = a_ops->get_xip_mem(mapping, index, 1, | 345 | status = a_ops->get_xip_mem(mapping, index, 1, |
311 | &xip_mem, &xip_pfn); | 346 | &xip_mem, &xip_pfn); |
347 | mutex_unlock(&xip_sparse_mutex); | ||
312 | if (!status) | 348 | if (!status) |
313 | /* unmap page at pgoff from all other vmas */ | 349 | /* unmap page at pgoff from all other vmas */ |
314 | __xip_unmap(mapping, index); | 350 | __xip_unmap(mapping, index); |
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
380 | if (count == 0) | 416 | if (count == 0) |
381 | goto out_backing; | 417 | goto out_backing; |
382 | 418 | ||
383 | ret = remove_suid(filp->f_path.dentry); | 419 | ret = file_remove_suid(filp); |
384 | if (ret) | 420 | if (ret) |
385 | goto out_backing; | 421 | goto out_backing; |
386 | 422 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index 07a9c82ce1a3..7881638e4a12 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/mmu_notifier.h> | ||
18 | 19 | ||
19 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
20 | #include <asm/cacheflush.h> | 21 | #include <asm/cacheflush.h> |
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
214 | spin_unlock(&mapping->i_mmap_lock); | 215 | spin_unlock(&mapping->i_mmap_lock); |
215 | } | 216 | } |
216 | 217 | ||
218 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
217 | err = populate_range(mm, vma, start, size, pgoff); | 219 | err = populate_range(mm, vma, start, size, pgoff); |
220 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
218 | if (!err && !(flags & MAP_NONBLOCK)) { | 221 | if (!err && !(flags & MAP_NONBLOCK)) { |
219 | if (unlikely(has_write_lock)) { | 222 | if (unlikely(has_write_lock)) { |
220 | downgrade_write(&mm->mmap_sem); | 223 | downgrade_write(&mm->mmap_sem); |
diff --git a/mm/highmem.c b/mm/highmem.c index e16e1523b688..b36b83b920ff 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
70 | static void flush_all_zero_pkmaps(void) | 70 | static void flush_all_zero_pkmaps(void) |
71 | { | 71 | { |
72 | int i; | 72 | int i; |
73 | int need_flush = 0; | ||
73 | 74 | ||
74 | flush_cache_kmaps(); | 75 | flush_cache_kmaps(); |
75 | 76 | ||
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void) | |||
101 | &pkmap_page_table[i]); | 102 | &pkmap_page_table[i]); |
102 | 103 | ||
103 | set_page_address(page, NULL); | 104 | set_page_address(page, NULL); |
105 | need_flush = 1; | ||
104 | } | 106 | } |
105 | flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); | 107 | if (need_flush) |
108 | flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); | ||
106 | } | 109 | } |
107 | 110 | ||
108 | /** | 111 | /** |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8bf4ab01f86..67a71191136e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/sysctl.h> | 10 | #include <linux/sysctl.h> |
11 | #include <linux/highmem.h> | 11 | #include <linux/highmem.h> |
12 | #include <linux/mmu_notifier.h> | ||
12 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
13 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | 15 | #include <linux/mempolicy.h> |
@@ -19,6 +20,7 @@ | |||
19 | 20 | ||
20 | #include <asm/page.h> | 21 | #include <asm/page.h> |
21 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
23 | #include <asm/io.h> | ||
22 | 24 | ||
23 | #include <linux/hugetlb.h> | 25 | #include <linux/hugetlb.h> |
24 | #include "internal.h" | 26 | #include "internal.h" |
@@ -563,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
563 | huge_page_order(h)); | 565 | huge_page_order(h)); |
564 | if (page) { | 566 | if (page) { |
565 | if (arch_prepare_hugepage(page)) { | 567 | if (arch_prepare_hugepage(page)) { |
566 | __free_pages(page, HUGETLB_PAGE_ORDER); | 568 | __free_pages(page, huge_page_order(h)); |
567 | return NULL; | 569 | return NULL; |
568 | } | 570 | } |
569 | prep_new_huge_page(h, page, nid); | 571 | prep_new_huge_page(h, page, nid); |
@@ -663,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
663 | __GFP_REPEAT|__GFP_NOWARN, | 665 | __GFP_REPEAT|__GFP_NOWARN, |
664 | huge_page_order(h)); | 666 | huge_page_order(h)); |
665 | 667 | ||
668 | if (page && arch_prepare_hugepage(page)) { | ||
669 | __free_pages(page, huge_page_order(h)); | ||
670 | return NULL; | ||
671 | } | ||
672 | |||
666 | spin_lock(&hugetlb_lock); | 673 | spin_lock(&hugetlb_lock); |
667 | if (page) { | 674 | if (page) { |
668 | /* | 675 | /* |
@@ -1026,18 +1033,6 @@ static void __init report_hugepages(void) | |||
1026 | } | 1033 | } |
1027 | } | 1034 | } |
1028 | 1035 | ||
1029 | static unsigned int cpuset_mems_nr(unsigned int *array) | ||
1030 | { | ||
1031 | int node; | ||
1032 | unsigned int nr = 0; | ||
1033 | |||
1034 | for_each_node_mask(node, cpuset_current_mems_allowed) | ||
1035 | nr += array[node]; | ||
1036 | |||
1037 | return nr; | ||
1038 | } | ||
1039 | |||
1040 | #ifdef CONFIG_SYSCTL | ||
1041 | #ifdef CONFIG_HIGHMEM | 1036 | #ifdef CONFIG_HIGHMEM |
1042 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1037 | static void try_to_free_low(struct hstate *h, unsigned long count) |
1043 | { | 1038 | { |
@@ -1293,7 +1288,12 @@ module_exit(hugetlb_exit); | |||
1293 | 1288 | ||
1294 | static int __init hugetlb_init(void) | 1289 | static int __init hugetlb_init(void) |
1295 | { | 1290 | { |
1296 | BUILD_BUG_ON(HPAGE_SHIFT == 0); | 1291 | /* Some platforms decide whether they support huge pages at boot |
1292 | * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when | ||
1293 | * there is no such support | ||
1294 | */ | ||
1295 | if (HPAGE_SHIFT == 0) | ||
1296 | return 0; | ||
1297 | 1297 | ||
1298 | if (!size_to_hstate(default_hstate_size)) { | 1298 | if (!size_to_hstate(default_hstate_size)) { |
1299 | default_hstate_size = HPAGE_SIZE; | 1299 | default_hstate_size = HPAGE_SIZE; |
@@ -1386,6 +1386,18 @@ static int __init hugetlb_default_setup(char *s) | |||
1386 | } | 1386 | } |
1387 | __setup("default_hugepagesz=", hugetlb_default_setup); | 1387 | __setup("default_hugepagesz=", hugetlb_default_setup); |
1388 | 1388 | ||
1389 | static unsigned int cpuset_mems_nr(unsigned int *array) | ||
1390 | { | ||
1391 | int node; | ||
1392 | unsigned int nr = 0; | ||
1393 | |||
1394 | for_each_node_mask(node, cpuset_current_mems_allowed) | ||
1395 | nr += array[node]; | ||
1396 | |||
1397 | return nr; | ||
1398 | } | ||
1399 | |||
1400 | #ifdef CONFIG_SYSCTL | ||
1389 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1401 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
1390 | struct file *file, void __user *buffer, | 1402 | struct file *file, void __user *buffer, |
1391 | size_t *length, loff_t *ppos) | 1403 | size_t *length, loff_t *ppos) |
@@ -1672,6 +1684,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1672 | BUG_ON(start & ~huge_page_mask(h)); | 1684 | BUG_ON(start & ~huge_page_mask(h)); |
1673 | BUG_ON(end & ~huge_page_mask(h)); | 1685 | BUG_ON(end & ~huge_page_mask(h)); |
1674 | 1686 | ||
1687 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
1675 | spin_lock(&mm->page_table_lock); | 1688 | spin_lock(&mm->page_table_lock); |
1676 | for (address = start; address < end; address += sz) { | 1689 | for (address = start; address < end; address += sz) { |
1677 | ptep = huge_pte_offset(mm, address); | 1690 | ptep = huge_pte_offset(mm, address); |
@@ -1713,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1713 | } | 1726 | } |
1714 | spin_unlock(&mm->page_table_lock); | 1727 | spin_unlock(&mm->page_table_lock); |
1715 | flush_tlb_range(vma, start, end); | 1728 | flush_tlb_range(vma, start, end); |
1729 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1716 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 1730 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
1717 | list_del(&page->lru); | 1731 | list_del(&page->lru); |
1718 | put_page(page); | 1732 | put_page(page); |
@@ -1928,6 +1942,18 @@ retry: | |||
1928 | lock_page(page); | 1942 | lock_page(page); |
1929 | } | 1943 | } |
1930 | 1944 | ||
1945 | /* | ||
1946 | * If we are going to COW a private mapping later, we examine the | ||
1947 | * pending reservations for this page now. This will ensure that | ||
1948 | * any allocations necessary to record that reservation occur outside | ||
1949 | * the spinlock. | ||
1950 | */ | ||
1951 | if (write_access && !(vma->vm_flags & VM_SHARED)) | ||
1952 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
1953 | ret = VM_FAULT_OOM; | ||
1954 | goto backout_unlocked; | ||
1955 | } | ||
1956 | |||
1931 | spin_lock(&mm->page_table_lock); | 1957 | spin_lock(&mm->page_table_lock); |
1932 | size = i_size_read(mapping->host) >> huge_page_shift(h); | 1958 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
1933 | if (idx >= size) | 1959 | if (idx >= size) |
@@ -1953,6 +1979,7 @@ out: | |||
1953 | 1979 | ||
1954 | backout: | 1980 | backout: |
1955 | spin_unlock(&mm->page_table_lock); | 1981 | spin_unlock(&mm->page_table_lock); |
1982 | backout_unlocked: | ||
1956 | unlock_page(page); | 1983 | unlock_page(page); |
1957 | put_page(page); | 1984 | put_page(page); |
1958 | goto out; | 1985 | goto out; |
@@ -1964,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1964 | pte_t *ptep; | 1991 | pte_t *ptep; |
1965 | pte_t entry; | 1992 | pte_t entry; |
1966 | int ret; | 1993 | int ret; |
1994 | struct page *pagecache_page = NULL; | ||
1967 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 1995 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
1968 | struct hstate *h = hstate_vma(vma); | 1996 | struct hstate *h = hstate_vma(vma); |
1969 | 1997 | ||
@@ -1980,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1980 | entry = huge_ptep_get(ptep); | 2008 | entry = huge_ptep_get(ptep); |
1981 | if (huge_pte_none(entry)) { | 2009 | if (huge_pte_none(entry)) { |
1982 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 2010 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
1983 | mutex_unlock(&hugetlb_instantiation_mutex); | 2011 | goto out_unlock; |
1984 | return ret; | ||
1985 | } | 2012 | } |
1986 | 2013 | ||
1987 | ret = 0; | 2014 | ret = 0; |
1988 | 2015 | ||
2016 | /* | ||
2017 | * If we are going to COW the mapping later, we examine the pending | ||
2018 | * reservations for this page now. This will ensure that any | ||
2019 | * allocations necessary to record that reservation occur outside the | ||
2020 | * spinlock. For private mappings, we also look up the pagecache | ||
2021 | * page now as it is used to determine if a reservation has been | ||
2022 | * consumed. | ||
2023 | */ | ||
2024 | if (write_access && !pte_write(entry)) { | ||
2025 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
2026 | ret = VM_FAULT_OOM; | ||
2027 | goto out_unlock; | ||
2028 | } | ||
2029 | |||
2030 | if (!(vma->vm_flags & VM_SHARED)) | ||
2031 | pagecache_page = hugetlbfs_pagecache_page(h, | ||
2032 | vma, address); | ||
2033 | } | ||
2034 | |||
1989 | spin_lock(&mm->page_table_lock); | 2035 | spin_lock(&mm->page_table_lock); |
1990 | /* Check for a racing update before calling hugetlb_cow */ | 2036 | /* Check for a racing update before calling hugetlb_cow */ |
1991 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 2037 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) |
1992 | if (write_access && !pte_write(entry)) { | 2038 | if (write_access && !pte_write(entry)) |
1993 | struct page *page; | 2039 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
1994 | page = hugetlbfs_pagecache_page(h, vma, address); | 2040 | pagecache_page); |
1995 | ret = hugetlb_cow(mm, vma, address, ptep, entry, page); | ||
1996 | if (page) { | ||
1997 | unlock_page(page); | ||
1998 | put_page(page); | ||
1999 | } | ||
2000 | } | ||
2001 | spin_unlock(&mm->page_table_lock); | 2041 | spin_unlock(&mm->page_table_lock); |
2042 | |||
2043 | if (pagecache_page) { | ||
2044 | unlock_page(pagecache_page); | ||
2045 | put_page(pagecache_page); | ||
2046 | } | ||
2047 | |||
2048 | out_unlock: | ||
2002 | mutex_unlock(&hugetlb_instantiation_mutex); | 2049 | mutex_unlock(&hugetlb_instantiation_mutex); |
2003 | 2050 | ||
2004 | return ret; | 2051 | return ret; |
diff --git a/mm/madvise.c b/mm/madvise.c index 23a0ec3e0ea0..f9349c18a1b5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
132 | * Application no longer needs these pages. If the pages are dirty, | 132 | * Application no longer needs these pages. If the pages are dirty, |
133 | * it's OK to just throw them away. The app will be more careful about | 133 | * it's OK to just throw them away. The app will be more careful about |
134 | * data it wants to keep. Be sure to free swap resources too. The | 134 | * data it wants to keep. Be sure to free swap resources too. The |
135 | * zap_page_range call sets things up for refill_inactive to actually free | 135 | * zap_page_range call sets things up for shrink_active_list to actually free |
136 | * these pages later if no one else has touched them in the meantime, | 136 | * these pages later if no one else has touched them in the meantime, |
137 | * although we could add these pages to a global reuse list for | 137 | * although we could add these pages to a global reuse list for |
138 | * refill_inactive to pick up before reclaiming other pages. | 138 | * shrink_active_list to pick up before reclaiming other pages. |
139 | * | 139 | * |
140 | * NB: This interface discards data rather than pushes it out to swap, | 140 | * NB: This interface discards data rather than pushes it out to swap, |
141 | * as some implementations do. This has performance implications for | 141 | * as some implementations do. This has performance implications for |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fba566c51322..36896f3eb7f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | |||
250 | 250 | ||
251 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 251 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
252 | { | 252 | { |
253 | /* | ||
254 | * mm_update_next_owner() may clear mm->owner to NULL | ||
255 | * if it races with swapoff, page migration, etc. | ||
256 | * So this can be called with p == NULL. | ||
257 | */ | ||
258 | if (unlikely(!p)) | ||
259 | return NULL; | ||
260 | |||
253 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 261 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
254 | struct mem_cgroup, css); | 262 | struct mem_cgroup, css); |
255 | } | 263 | } |
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
549 | if (likely(!memcg)) { | 557 | if (likely(!memcg)) { |
550 | rcu_read_lock(); | 558 | rcu_read_lock(); |
551 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 559 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
560 | if (unlikely(!mem)) { | ||
561 | rcu_read_unlock(); | ||
562 | kmem_cache_free(page_cgroup_cache, pc); | ||
563 | return 0; | ||
564 | } | ||
552 | /* | 565 | /* |
553 | * For every charge from the cgroup, increment reference count | 566 | * For every charge from the cgroup, increment reference count |
554 | */ | 567 | */ |
@@ -796,14 +809,21 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | |||
796 | 809 | ||
797 | if (mem_cgroup_subsys.disabled) | 810 | if (mem_cgroup_subsys.disabled) |
798 | return 0; | 811 | return 0; |
812 | if (!mm) | ||
813 | return 0; | ||
799 | 814 | ||
800 | rcu_read_lock(); | 815 | rcu_read_lock(); |
801 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 816 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
817 | if (unlikely(!mem)) { | ||
818 | rcu_read_unlock(); | ||
819 | return 0; | ||
820 | } | ||
802 | css_get(&mem->css); | 821 | css_get(&mem->css); |
803 | rcu_read_unlock(); | 822 | rcu_read_unlock(); |
804 | 823 | ||
805 | do { | 824 | do { |
806 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); | 825 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); |
826 | progress += res_counter_check_under_limit(&mem->res); | ||
807 | } while (!progress && --retry); | 827 | } while (!progress && --retry); |
808 | 828 | ||
809 | css_put(&mem->css); | 829 | css_put(&mem->css); |
@@ -1168,9 +1188,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
1168 | mem = mem_cgroup_from_cont(cont); | 1188 | mem = mem_cgroup_from_cont(cont); |
1169 | old_mem = mem_cgroup_from_cont(old_cont); | 1189 | old_mem = mem_cgroup_from_cont(old_cont); |
1170 | 1190 | ||
1171 | if (mem == old_mem) | ||
1172 | goto out; | ||
1173 | |||
1174 | /* | 1191 | /* |
1175 | * Only thread group leaders are allowed to migrate, the mm_struct is | 1192 | * Only thread group leaders are allowed to migrate, the mm_struct is |
1176 | * in effect owned by the leader | 1193 | * in effect owned by the leader |
diff --git a/mm/memory.c b/mm/memory.c index 262e3eb6601a..1002f473f497 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
54 | #include <linux/mmu_notifier.h> | ||
54 | 55 | ||
55 | #include <asm/pgalloc.h> | 56 | #include <asm/pgalloc.h> |
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -374,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
374 | * | 375 | * |
375 | * The calling function must still handle the error. | 376 | * The calling function must still handle the error. |
376 | */ | 377 | */ |
377 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | 378 | static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, |
379 | unsigned long vaddr) | ||
378 | { | 380 | { |
379 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 381 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " |
380 | "vm_flags = %lx, vaddr = %lx\n", | 382 | "vm_flags = %lx, vaddr = %lx\n", |
@@ -651,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
651 | unsigned long next; | 653 | unsigned long next; |
652 | unsigned long addr = vma->vm_start; | 654 | unsigned long addr = vma->vm_start; |
653 | unsigned long end = vma->vm_end; | 655 | unsigned long end = vma->vm_end; |
656 | int ret; | ||
654 | 657 | ||
655 | /* | 658 | /* |
656 | * Don't copy ptes where a page fault will fill them correctly. | 659 | * Don't copy ptes where a page fault will fill them correctly. |
@@ -666,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
666 | if (is_vm_hugetlb_page(vma)) | 669 | if (is_vm_hugetlb_page(vma)) |
667 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 670 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
668 | 671 | ||
672 | /* | ||
673 | * We need to invalidate the secondary MMU mappings only when | ||
674 | * there could be a permission downgrade on the ptes of the | ||
675 | * parent mm. And a permission downgrade will only happen if | ||
676 | * is_cow_mapping() returns true. | ||
677 | */ | ||
678 | if (is_cow_mapping(vma->vm_flags)) | ||
679 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | ||
680 | |||
681 | ret = 0; | ||
669 | dst_pgd = pgd_offset(dst_mm, addr); | 682 | dst_pgd = pgd_offset(dst_mm, addr); |
670 | src_pgd = pgd_offset(src_mm, addr); | 683 | src_pgd = pgd_offset(src_mm, addr); |
671 | do { | 684 | do { |
672 | next = pgd_addr_end(addr, end); | 685 | next = pgd_addr_end(addr, end); |
673 | if (pgd_none_or_clear_bad(src_pgd)) | 686 | if (pgd_none_or_clear_bad(src_pgd)) |
674 | continue; | 687 | continue; |
675 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | 688 | if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, |
676 | vma, addr, next)) | 689 | vma, addr, next))) { |
677 | return -ENOMEM; | 690 | ret = -ENOMEM; |
691 | break; | ||
692 | } | ||
678 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 693 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
679 | return 0; | 694 | |
695 | if (is_cow_mapping(vma->vm_flags)) | ||
696 | mmu_notifier_invalidate_range_end(src_mm, | ||
697 | vma->vm_start, end); | ||
698 | return ret; | ||
680 | } | 699 | } |
681 | 700 | ||
682 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 701 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
@@ -880,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
880 | unsigned long start = start_addr; | 899 | unsigned long start = start_addr; |
881 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 900 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
882 | int fullmm = (*tlbp)->fullmm; | 901 | int fullmm = (*tlbp)->fullmm; |
902 | struct mm_struct *mm = vma->vm_mm; | ||
883 | 903 | ||
904 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | ||
884 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 905 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
885 | unsigned long end; | 906 | unsigned long end; |
886 | 907 | ||
@@ -945,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
945 | } | 966 | } |
946 | } | 967 | } |
947 | out: | 968 | out: |
969 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | ||
948 | return start; /* which is now the end (or restart) address */ | 970 | return start; /* which is now the end (or restart) address */ |
949 | } | 971 | } |
950 | 972 | ||
@@ -972,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
972 | return end; | 994 | return end; |
973 | } | 995 | } |
974 | 996 | ||
997 | /** | ||
998 | * zap_vma_ptes - remove ptes mapping the vma | ||
999 | * @vma: vm_area_struct holding ptes to be zapped | ||
1000 | * @address: starting address of pages to zap | ||
1001 | * @size: number of bytes to zap | ||
1002 | * | ||
1003 | * This function only unmaps ptes assigned to VM_PFNMAP vmas. | ||
1004 | * | ||
1005 | * The entire address range must be fully contained within the vma. | ||
1006 | * | ||
1007 | * Returns 0 if successful. | ||
1008 | */ | ||
1009 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | ||
1010 | unsigned long size) | ||
1011 | { | ||
1012 | if (address < vma->vm_start || address + size > vma->vm_end || | ||
1013 | !(vma->vm_flags & VM_PFNMAP)) | ||
1014 | return -1; | ||
1015 | zap_page_range(vma, address, size, NULL); | ||
1016 | return 0; | ||
1017 | } | ||
1018 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | ||
1019 | |||
975 | /* | 1020 | /* |
976 | * Do a quick page-table lookup for a single page. | 1021 | * Do a quick page-table lookup for a single page. |
977 | */ | 1022 | */ |
@@ -1615,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1615 | { | 1660 | { |
1616 | pgd_t *pgd; | 1661 | pgd_t *pgd; |
1617 | unsigned long next; | 1662 | unsigned long next; |
1618 | unsigned long end = addr + size; | 1663 | unsigned long start = addr, end = addr + size; |
1619 | int err; | 1664 | int err; |
1620 | 1665 | ||
1621 | BUG_ON(addr >= end); | 1666 | BUG_ON(addr >= end); |
1667 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
1622 | pgd = pgd_offset(mm, addr); | 1668 | pgd = pgd_offset(mm, addr); |
1623 | do { | 1669 | do { |
1624 | next = pgd_addr_end(addr, end); | 1670 | next = pgd_addr_end(addr, end); |
@@ -1626,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1626 | if (err) | 1672 | if (err) |
1627 | break; | 1673 | break; |
1628 | } while (pgd++, addr = next, addr != end); | 1674 | } while (pgd++, addr = next, addr != end); |
1675 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
1629 | return err; | 1676 | return err; |
1630 | } | 1677 | } |
1631 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1678 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
@@ -1742,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1742 | * not dirty accountable. | 1789 | * not dirty accountable. |
1743 | */ | 1790 | */ |
1744 | if (PageAnon(old_page)) { | 1791 | if (PageAnon(old_page)) { |
1745 | if (!TestSetPageLocked(old_page)) { | 1792 | if (trylock_page(old_page)) { |
1746 | reuse = can_share_swap_page(old_page); | 1793 | reuse = can_share_swap_page(old_page); |
1747 | unlock_page(old_page); | 1794 | unlock_page(old_page); |
1748 | } | 1795 | } |
@@ -1838,7 +1885,7 @@ gotten: | |||
1838 | * seen in the presence of one thread doing SMC and another | 1885 | * seen in the presence of one thread doing SMC and another |
1839 | * thread doing COW. | 1886 | * thread doing COW. |
1840 | */ | 1887 | */ |
1841 | ptep_clear_flush(vma, address, page_table); | 1888 | ptep_clear_flush_notify(vma, address, page_table); |
1842 | set_pte_at(mm, address, page_table, entry); | 1889 | set_pte_at(mm, address, page_table, entry); |
1843 | update_mmu_cache(vma, address, entry); | 1890 | update_mmu_cache(vma, address, entry); |
1844 | lru_cache_add_active(new_page); | 1891 | lru_cache_add_active(new_page); |
@@ -2718,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
2718 | 2765 | ||
2719 | vma = find_vma(current->mm, addr); | 2766 | vma = find_vma(current->mm, addr); |
2720 | if (!vma) | 2767 | if (!vma) |
2721 | return -1; | 2768 | return -ENOMEM; |
2722 | write = (vma->vm_flags & VM_WRITE) != 0; | 2769 | write = (vma->vm_flags & VM_WRITE) != 0; |
2723 | BUG_ON(addr >= end); | 2770 | BUG_ON(addr >= end); |
2724 | BUG_ON(end > vma->vm_end); | 2771 | BUG_ON(end > vma->vm_end); |
2725 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 2772 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
2726 | ret = get_user_pages(current, current->mm, addr, | 2773 | ret = get_user_pages(current, current->mm, addr, |
2727 | len, write, 0, NULL, NULL); | 2774 | len, write, 0, NULL, NULL); |
2728 | if (ret < 0) | 2775 | if (ret < 0) { |
2776 | /* | ||
2777 | SUS require strange return value to mlock | ||
2778 | - invalid addr generate to ENOMEM. | ||
2779 | - out of memory should generate EAGAIN. | ||
2780 | */ | ||
2781 | if (ret == -EFAULT) | ||
2782 | ret = -ENOMEM; | ||
2783 | else if (ret == -ENOMEM) | ||
2784 | ret = -EAGAIN; | ||
2729 | return ret; | 2785 | return ret; |
2730 | return ret == len ? 0 : -1; | 2786 | } |
2787 | return ret == len ? 0 : -ENOMEM; | ||
2731 | } | 2788 | } |
2732 | 2789 | ||
2733 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2790 | #if !defined(__HAVE_ARCH_GATE_AREA) |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e550bec20582..83369058ec13 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
803 | int do_migrate_pages(struct mm_struct *mm, | 803 | int do_migrate_pages(struct mm_struct *mm, |
804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) |
805 | { | 805 | { |
806 | LIST_HEAD(pagelist); | ||
807 | int busy = 0; | 806 | int busy = 0; |
808 | int err = 0; | 807 | int err = 0; |
809 | nodemask_t tmp; | 808 | nodemask_t tmp; |
diff --git a/mm/migrate.c b/mm/migrate.c index d8c65a65c61d..2a80136b23bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | |||
285 | 285 | ||
286 | page = migration_entry_to_page(entry); | 286 | page = migration_entry_to_page(entry); |
287 | 287 | ||
288 | get_page(page); | 288 | /* |
289 | * Once radix-tree replacement of page migration started, page_count | ||
290 | * *must* be zero. And, we don't want to call wait_on_page_locked() | ||
291 | * against a page without get_page(). | ||
292 | * So, we use get_page_unless_zero(), here. Even failed, page fault | ||
293 | * will occur again. | ||
294 | */ | ||
295 | if (!get_page_unless_zero(page)) | ||
296 | goto out; | ||
289 | pte_unmap_unlock(ptep, ptl); | 297 | pte_unmap_unlock(ptep, ptl); |
290 | wait_on_page_locked(page); | 298 | wait_on_page_locked(page); |
291 | put_page(page); | 299 | put_page(page); |
@@ -305,6 +313,7 @@ out: | |||
305 | static int migrate_page_move_mapping(struct address_space *mapping, | 313 | static int migrate_page_move_mapping(struct address_space *mapping, |
306 | struct page *newpage, struct page *page) | 314 | struct page *newpage, struct page *page) |
307 | { | 315 | { |
316 | int expected_count; | ||
308 | void **pslot; | 317 | void **pslot; |
309 | 318 | ||
310 | if (!mapping) { | 319 | if (!mapping) { |
@@ -314,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
314 | return 0; | 323 | return 0; |
315 | } | 324 | } |
316 | 325 | ||
317 | write_lock_irq(&mapping->tree_lock); | 326 | spin_lock_irq(&mapping->tree_lock); |
318 | 327 | ||
319 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | 328 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
320 | page_index(page)); | 329 | page_index(page)); |
321 | 330 | ||
322 | if (page_count(page) != 2 + !!PagePrivate(page) || | 331 | expected_count = 2 + !!PagePrivate(page); |
332 | if (page_count(page) != expected_count || | ||
323 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 333 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
324 | write_unlock_irq(&mapping->tree_lock); | 334 | spin_unlock_irq(&mapping->tree_lock); |
335 | return -EAGAIN; | ||
336 | } | ||
337 | |||
338 | if (!page_freeze_refs(page, expected_count)) { | ||
339 | spin_unlock_irq(&mapping->tree_lock); | ||
325 | return -EAGAIN; | 340 | return -EAGAIN; |
326 | } | 341 | } |
327 | 342 | ||
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
338 | 353 | ||
339 | radix_tree_replace_slot(pslot, newpage); | 354 | radix_tree_replace_slot(pslot, newpage); |
340 | 355 | ||
356 | page_unfreeze_refs(page, expected_count); | ||
341 | /* | 357 | /* |
342 | * Drop cache reference from old page. | 358 | * Drop cache reference from old page. |
343 | * We know this isn't the last reference. | 359 | * We know this isn't the last reference. |
@@ -357,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
357 | __dec_zone_page_state(page, NR_FILE_PAGES); | 373 | __dec_zone_page_state(page, NR_FILE_PAGES); |
358 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 374 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
359 | 375 | ||
360 | write_unlock_irq(&mapping->tree_lock); | 376 | spin_unlock_irq(&mapping->tree_lock); |
361 | if (!PageSwapCache(newpage)) { | 377 | if (!PageSwapCache(newpage)) |
362 | mem_cgroup_uncharge_cache_page(page); | 378 | mem_cgroup_uncharge_cache_page(page); |
363 | } | ||
364 | 379 | ||
365 | return 0; | 380 | return 0; |
366 | } | 381 | } |
@@ -590,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
590 | * establishing additional references. We are the only one | 605 | * establishing additional references. We are the only one |
591 | * holding a reference to the new page at this point. | 606 | * holding a reference to the new page at this point. |
592 | */ | 607 | */ |
593 | if (TestSetPageLocked(newpage)) | 608 | if (!trylock_page(newpage)) |
594 | BUG(); | 609 | BUG(); |
595 | 610 | ||
596 | /* Prepare mapping for the new page.*/ | 611 | /* Prepare mapping for the new page.*/ |
@@ -652,7 +667,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
652 | BUG_ON(charge); | 667 | BUG_ON(charge); |
653 | 668 | ||
654 | rc = -EAGAIN; | 669 | rc = -EAGAIN; |
655 | if (TestSetPageLocked(page)) { | 670 | if (!trylock_page(page)) { |
656 | if (!force) | 671 | if (!force) |
657 | goto move_newpage; | 672 | goto move_newpage; |
658 | lock_page(page); | 673 | lock_page(page); |
diff --git a/mm/mlock.c b/mm/mlock.c index 7b2656055d6a..01fbe93eff5c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -78,8 +78,6 @@ success: | |||
78 | 78 | ||
79 | mm->locked_vm -= pages; | 79 | mm->locked_vm -= pages; |
80 | out: | 80 | out: |
81 | if (ret == -ENOMEM) | ||
82 | ret = -EAGAIN; | ||
83 | return ret; | 81 | return ret; |
84 | } | 82 | } |
85 | 83 | ||
diff --git a/mm/mm_init.c b/mm/mm_init.c index c6af41ea9994..4e0e26591dfa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -12,7 +12,11 @@ | |||
12 | #include "internal.h" | 12 | #include "internal.h" |
13 | 13 | ||
14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 14 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
15 | int __meminitdata mminit_loglevel; | 15 | int mminit_loglevel; |
16 | |||
17 | #ifndef SECTIONS_SHIFT | ||
18 | #define SECTIONS_SHIFT 0 | ||
19 | #endif | ||
16 | 20 | ||
17 | /* The zonelists are simply reported, validation is manual. */ | 21 | /* The zonelists are simply reported, validation is manual. */ |
18 | void mminit_verify_zonelist(void) | 22 | void mminit_verify_zonelist(void) |
@@ -74,11 +78,7 @@ void __init mminit_verify_pageflags_layout(void) | |||
74 | NR_PAGEFLAGS); | 78 | NR_PAGEFLAGS); |
75 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 79 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
76 | "Section %d Node %d Zone %d\n", | 80 | "Section %d Node %d Zone %d\n", |
77 | #ifdef SECTIONS_SHIFT | ||
78 | SECTIONS_SHIFT, | 81 | SECTIONS_SHIFT, |
79 | #else | ||
80 | 0, | ||
81 | #endif | ||
82 | NODES_SHIFT, | 82 | NODES_SHIFT, |
83 | ZONES_SHIFT); | 83 | ZONES_SHIFT); |
84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | 84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", |
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
29 | #include <linux/mmu_notifier.h> | ||
29 | 30 | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include <asm/cacheflush.h> | 32 | #include <asm/cacheflush.h> |
@@ -369,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
369 | if (vma_tmp->vm_end > addr) { | 370 | if (vma_tmp->vm_end > addr) { |
370 | vma = vma_tmp; | 371 | vma = vma_tmp; |
371 | if (vma_tmp->vm_start <= addr) | 372 | if (vma_tmp->vm_start <= addr) |
372 | return vma; | 373 | break; |
373 | __rb_link = &__rb_parent->rb_left; | 374 | __rb_link = &__rb_parent->rb_left; |
374 | } else { | 375 | } else { |
375 | rb_prev = __rb_parent; | 376 | rb_prev = __rb_parent; |
@@ -1029,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
1029 | } else { | 1030 | } else { |
1030 | switch (flags & MAP_TYPE) { | 1031 | switch (flags & MAP_TYPE) { |
1031 | case MAP_SHARED: | 1032 | case MAP_SHARED: |
1033 | /* | ||
1034 | * Ignore pgoff. | ||
1035 | */ | ||
1036 | pgoff = 0; | ||
1032 | vm_flags |= VM_SHARED | VM_MAYSHARE; | 1037 | vm_flags |= VM_SHARED | VM_MAYSHARE; |
1033 | break; | 1038 | break; |
1034 | case MAP_PRIVATE: | 1039 | case MAP_PRIVATE: |
@@ -2061,6 +2066,7 @@ void exit_mmap(struct mm_struct *mm) | |||
2061 | 2066 | ||
2062 | /* mm's last user has gone, and its about to be pulled down */ | 2067 | /* mm's last user has gone, and its about to be pulled down */ |
2063 | arch_exit_mmap(mm); | 2068 | arch_exit_mmap(mm); |
2069 | mmu_notifier_release(mm); | ||
2064 | 2070 | ||
2065 | lru_add_drain(); | 2071 | lru_add_drain(); |
2066 | flush_cache_mm(mm); | 2072 | flush_cache_mm(mm); |
@@ -2268,3 +2274,167 @@ int install_special_mapping(struct mm_struct *mm, | |||
2268 | 2274 | ||
2269 | return 0; | 2275 | return 0; |
2270 | } | 2276 | } |
2277 | |||
2278 | static DEFINE_MUTEX(mm_all_locks_mutex); | ||
2279 | |||
2280 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | ||
2281 | { | ||
2282 | if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { | ||
2283 | /* | ||
2284 | * The LSB of head.next can't change from under us | ||
2285 | * because we hold the mm_all_locks_mutex. | ||
2286 | */ | ||
2287 | spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); | ||
2288 | /* | ||
2289 | * We can safely modify head.next after taking the | ||
2290 | * anon_vma->lock. If some other vma in this mm shares | ||
2291 | * the same anon_vma we won't take it again. | ||
2292 | * | ||
2293 | * No need of atomic instructions here, head.next | ||
2294 | * can't change from under us thanks to the | ||
2295 | * anon_vma->lock. | ||
2296 | */ | ||
2297 | if (__test_and_set_bit(0, (unsigned long *) | ||
2298 | &anon_vma->head.next)) | ||
2299 | BUG(); | ||
2300 | } | ||
2301 | } | ||
2302 | |||
2303 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | ||
2304 | { | ||
2305 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | ||
2306 | /* | ||
2307 | * AS_MM_ALL_LOCKS can't change from under us because | ||
2308 | * we hold the mm_all_locks_mutex. | ||
2309 | * | ||
2310 | * Operations on ->flags have to be atomic because | ||
2311 | * even if AS_MM_ALL_LOCKS is stable thanks to the | ||
2312 | * mm_all_locks_mutex, there may be other cpus | ||
2313 | * changing other bitflags in parallel to us. | ||
2314 | */ | ||
2315 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | ||
2316 | BUG(); | ||
2317 | spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); | ||
2318 | } | ||
2319 | } | ||
2320 | |||
2321 | /* | ||
2322 | * This operation locks against the VM for all pte/vma/mm related | ||
2323 | * operations that could ever happen on a certain mm. This includes | ||
2324 | * vmtruncate, try_to_unmap, and all page faults. | ||
2325 | * | ||
2326 | * The caller must take the mmap_sem in write mode before calling | ||
2327 | * mm_take_all_locks(). The caller isn't allowed to release the | ||
2328 | * mmap_sem until mm_drop_all_locks() returns. | ||
2329 | * | ||
2330 | * mmap_sem in write mode is required in order to block all operations | ||
2331 | * that could modify pagetables and free pages without need of | ||
2332 | * altering the vma layout (for example populate_range() with | ||
2333 | * nonlinear vmas). It's also needed in write mode to avoid new | ||
2334 | * anon_vmas to be associated with existing vmas. | ||
2335 | * | ||
2336 | * A single task can't take more than one mm_take_all_locks() in a row | ||
2337 | * or it would deadlock. | ||
2338 | * | ||
2339 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | ||
2340 | * mapping->flags avoid to take the same lock twice, if more than one | ||
2341 | * vma in this mm is backed by the same anon_vma or address_space. | ||
2342 | * | ||
2343 | * We can take all the locks in random order because the VM code | ||
2344 | * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never | ||
2345 | * takes more than one of them in a row. Secondly we're protected | ||
2346 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | ||
2347 | * | ||
2348 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations | ||
2349 | * that may have to take thousand of locks. | ||
2350 | * | ||
2351 | * mm_take_all_locks() can fail if it's interrupted by signals. | ||
2352 | */ | ||
2353 | int mm_take_all_locks(struct mm_struct *mm) | ||
2354 | { | ||
2355 | struct vm_area_struct *vma; | ||
2356 | int ret = -EINTR; | ||
2357 | |||
2358 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | ||
2359 | |||
2360 | mutex_lock(&mm_all_locks_mutex); | ||
2361 | |||
2362 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
2363 | if (signal_pending(current)) | ||
2364 | goto out_unlock; | ||
2365 | if (vma->vm_file && vma->vm_file->f_mapping) | ||
2366 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | ||
2367 | } | ||
2368 | |||
2369 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
2370 | if (signal_pending(current)) | ||
2371 | goto out_unlock; | ||
2372 | if (vma->anon_vma) | ||
2373 | vm_lock_anon_vma(mm, vma->anon_vma); | ||
2374 | } | ||
2375 | |||
2376 | ret = 0; | ||
2377 | |||
2378 | out_unlock: | ||
2379 | if (ret) | ||
2380 | mm_drop_all_locks(mm); | ||
2381 | |||
2382 | return ret; | ||
2383 | } | ||
2384 | |||
2385 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | ||
2386 | { | ||
2387 | if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { | ||
2388 | /* | ||
2389 | * The LSB of head.next can't change to 0 from under | ||
2390 | * us because we hold the mm_all_locks_mutex. | ||
2391 | * | ||
2392 | * We must however clear the bitflag before unlocking | ||
2393 | * the vma so the users using the anon_vma->head will | ||
2394 | * never see our bitflag. | ||
2395 | * | ||
2396 | * No need of atomic instructions here, head.next | ||
2397 | * can't change from under us until we release the | ||
2398 | * anon_vma->lock. | ||
2399 | */ | ||
2400 | if (!__test_and_clear_bit(0, (unsigned long *) | ||
2401 | &anon_vma->head.next)) | ||
2402 | BUG(); | ||
2403 | spin_unlock(&anon_vma->lock); | ||
2404 | } | ||
2405 | } | ||
2406 | |||
2407 | static void vm_unlock_mapping(struct address_space *mapping) | ||
2408 | { | ||
2409 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | ||
2410 | /* | ||
2411 | * AS_MM_ALL_LOCKS can't change to 0 from under us | ||
2412 | * because we hold the mm_all_locks_mutex. | ||
2413 | */ | ||
2414 | spin_unlock(&mapping->i_mmap_lock); | ||
2415 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | ||
2416 | &mapping->flags)) | ||
2417 | BUG(); | ||
2418 | } | ||
2419 | } | ||
2420 | |||
2421 | /* | ||
2422 | * The mmap_sem cannot be released by the caller until | ||
2423 | * mm_drop_all_locks() returns. | ||
2424 | */ | ||
2425 | void mm_drop_all_locks(struct mm_struct *mm) | ||
2426 | { | ||
2427 | struct vm_area_struct *vma; | ||
2428 | |||
2429 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | ||
2430 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | ||
2431 | |||
2432 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
2433 | if (vma->anon_vma) | ||
2434 | vm_unlock_anon_vma(vma->anon_vma); | ||
2435 | if (vma->vm_file && vma->vm_file->f_mapping) | ||
2436 | vm_unlock_mapping(vma->vm_file->f_mapping); | ||
2437 | } | ||
2438 | |||
2439 | mutex_unlock(&mm_all_locks_mutex); | ||
2440 | } | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000000..5f4ef0250bee --- /dev/null +++ b/mm/mmu_notifier.c | |||
@@ -0,0 +1,277 @@ | |||
1 | /* | ||
2 | * linux/mm/mmu_notifier.c | ||
3 | * | ||
4 | * Copyright (C) 2008 Qumranet, Inc. | ||
5 | * Copyright (C) 2008 SGI | ||
6 | * Christoph Lameter <clameter@sgi.com> | ||
7 | * | ||
8 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
9 | * the COPYING file in the top-level directory. | ||
10 | */ | ||
11 | |||
12 | #include <linux/rculist.h> | ||
13 | #include <linux/mmu_notifier.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <linux/rcupdate.h> | ||
18 | #include <linux/sched.h> | ||
19 | |||
20 | /* | ||
21 | * This function can't run concurrently against mmu_notifier_register | ||
22 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | ||
23 | * runs with mm_users == 0. Other tasks may still invoke mmu notifiers | ||
24 | * in parallel despite there being no task using this mm any more, | ||
25 | * through the vmas outside of the exit_mmap context, such as with | ||
26 | * vmtruncate. This serializes against mmu_notifier_unregister with | ||
27 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | ||
28 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | ||
29 | * can't go away from under us as exit_mmap holds an mm_count pin | ||
30 | * itself. | ||
31 | */ | ||
32 | void __mmu_notifier_release(struct mm_struct *mm) | ||
33 | { | ||
34 | struct mmu_notifier *mn; | ||
35 | |||
36 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
37 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | ||
38 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | ||
39 | struct mmu_notifier, | ||
40 | hlist); | ||
41 | /* | ||
42 | * We arrived before mmu_notifier_unregister so | ||
43 | * mmu_notifier_unregister will do nothing other than | ||
44 | * to wait ->release to finish and | ||
45 | * mmu_notifier_unregister to return. | ||
46 | */ | ||
47 | hlist_del_init_rcu(&mn->hlist); | ||
48 | /* | ||
49 | * RCU here will block mmu_notifier_unregister until | ||
50 | * ->release returns. | ||
51 | */ | ||
52 | rcu_read_lock(); | ||
53 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
54 | /* | ||
55 | * if ->release runs before mmu_notifier_unregister it | ||
56 | * must be handled as it's the only way for the driver | ||
57 | * to flush all existing sptes and stop the driver | ||
58 | * from establishing any more sptes before all the | ||
59 | * pages in the mm are freed. | ||
60 | */ | ||
61 | if (mn->ops->release) | ||
62 | mn->ops->release(mn, mm); | ||
63 | rcu_read_unlock(); | ||
64 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
65 | } | ||
66 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
67 | |||
68 | /* | ||
69 | * synchronize_rcu here prevents mmu_notifier_release to | ||
70 | * return to exit_mmap (which would proceed freeing all pages | ||
71 | * in the mm) until the ->release method returns, if it was | ||
72 | * invoked by mmu_notifier_unregister. | ||
73 | * | ||
74 | * The mmu_notifier_mm can't go away from under us because one | ||
75 | * mm_count is hold by exit_mmap. | ||
76 | */ | ||
77 | synchronize_rcu(); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * If no young bitflag is supported by the hardware, ->clear_flush_young can | ||
82 | * unmap the address and return 1 or 0 depending if the mapping previously | ||
83 | * existed or not. | ||
84 | */ | ||
85 | int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
86 | unsigned long address) | ||
87 | { | ||
88 | struct mmu_notifier *mn; | ||
89 | struct hlist_node *n; | ||
90 | int young = 0; | ||
91 | |||
92 | rcu_read_lock(); | ||
93 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
94 | if (mn->ops->clear_flush_young) | ||
95 | young |= mn->ops->clear_flush_young(mn, mm, address); | ||
96 | } | ||
97 | rcu_read_unlock(); | ||
98 | |||
99 | return young; | ||
100 | } | ||
101 | |||
102 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
103 | unsigned long address) | ||
104 | { | ||
105 | struct mmu_notifier *mn; | ||
106 | struct hlist_node *n; | ||
107 | |||
108 | rcu_read_lock(); | ||
109 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
110 | if (mn->ops->invalidate_page) | ||
111 | mn->ops->invalidate_page(mn, mm, address); | ||
112 | } | ||
113 | rcu_read_unlock(); | ||
114 | } | ||
115 | |||
116 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
117 | unsigned long start, unsigned long end) | ||
118 | { | ||
119 | struct mmu_notifier *mn; | ||
120 | struct hlist_node *n; | ||
121 | |||
122 | rcu_read_lock(); | ||
123 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
124 | if (mn->ops->invalidate_range_start) | ||
125 | mn->ops->invalidate_range_start(mn, mm, start, end); | ||
126 | } | ||
127 | rcu_read_unlock(); | ||
128 | } | ||
129 | |||
130 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
131 | unsigned long start, unsigned long end) | ||
132 | { | ||
133 | struct mmu_notifier *mn; | ||
134 | struct hlist_node *n; | ||
135 | |||
136 | rcu_read_lock(); | ||
137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
138 | if (mn->ops->invalidate_range_end) | ||
139 | mn->ops->invalidate_range_end(mn, mm, start, end); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | } | ||
143 | |||
144 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | ||
145 | struct mm_struct *mm, | ||
146 | int take_mmap_sem) | ||
147 | { | ||
148 | struct mmu_notifier_mm *mmu_notifier_mm; | ||
149 | int ret; | ||
150 | |||
151 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
152 | |||
153 | ret = -ENOMEM; | ||
154 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | ||
155 | if (unlikely(!mmu_notifier_mm)) | ||
156 | goto out; | ||
157 | |||
158 | if (take_mmap_sem) | ||
159 | down_write(&mm->mmap_sem); | ||
160 | ret = mm_take_all_locks(mm); | ||
161 | if (unlikely(ret)) | ||
162 | goto out_cleanup; | ||
163 | |||
164 | if (!mm_has_notifiers(mm)) { | ||
165 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | ||
166 | spin_lock_init(&mmu_notifier_mm->lock); | ||
167 | mm->mmu_notifier_mm = mmu_notifier_mm; | ||
168 | mmu_notifier_mm = NULL; | ||
169 | } | ||
170 | atomic_inc(&mm->mm_count); | ||
171 | |||
172 | /* | ||
173 | * Serialize the update against mmu_notifier_unregister. A | ||
174 | * side note: mmu_notifier_release can't run concurrently with | ||
175 | * us because we hold the mm_users pin (either implicitly as | ||
176 | * current->mm or explicitly with get_task_mm() or similar). | ||
177 | * We can't race against any other mmu notifier method either | ||
178 | * thanks to mm_take_all_locks(). | ||
179 | */ | ||
180 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
181 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | ||
182 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
183 | |||
184 | mm_drop_all_locks(mm); | ||
185 | out_cleanup: | ||
186 | if (take_mmap_sem) | ||
187 | up_write(&mm->mmap_sem); | ||
188 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | ||
189 | kfree(mmu_notifier_mm); | ||
190 | out: | ||
191 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
192 | return ret; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Must not hold mmap_sem nor any other VM related lock when calling | ||
197 | * this registration function. Must also ensure mm_users can't go down | ||
198 | * to zero while this runs to avoid races with mmu_notifier_release, | ||
199 | * so mm has to be current->mm or the mm should be pinned safely such | ||
200 | * as with get_task_mm(). If the mm is not current->mm, the mm_users | ||
201 | * pin should be released by calling mmput after mmu_notifier_register | ||
202 | * returns. mmu_notifier_unregister must be always called to | ||
203 | * unregister the notifier. mm_count is automatically pinned to allow | ||
204 | * mmu_notifier_unregister to safely run at any time later, before or | ||
205 | * after exit_mmap. ->release will always be called before exit_mmap | ||
206 | * frees the pages. | ||
207 | */ | ||
208 | int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
209 | { | ||
210 | return do_mmu_notifier_register(mn, mm, 1); | ||
211 | } | ||
212 | EXPORT_SYMBOL_GPL(mmu_notifier_register); | ||
213 | |||
214 | /* | ||
215 | * Same as mmu_notifier_register but here the caller must hold the | ||
216 | * mmap_sem in write mode. | ||
217 | */ | ||
218 | int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
219 | { | ||
220 | return do_mmu_notifier_register(mn, mm, 0); | ||
221 | } | ||
222 | EXPORT_SYMBOL_GPL(__mmu_notifier_register); | ||
223 | |||
224 | /* this is called after the last mmu_notifier_unregister() returned */ | ||
225 | void __mmu_notifier_mm_destroy(struct mm_struct *mm) | ||
226 | { | ||
227 | BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); | ||
228 | kfree(mm->mmu_notifier_mm); | ||
229 | mm->mmu_notifier_mm = LIST_POISON1; /* debug */ | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * This releases the mm_count pin automatically and frees the mm | ||
234 | * structure if it was the last user of it. It serializes against | ||
235 | * running mmu notifiers with RCU and against mmu_notifier_unregister | ||
236 | * with the unregister lock + RCU. All sptes must be dropped before | ||
237 | * calling mmu_notifier_unregister. ->release or any other notifier | ||
238 | * method may be invoked concurrently with mmu_notifier_unregister, | ||
239 | * and only after mmu_notifier_unregister returned we're guaranteed | ||
240 | * that ->release or any other method can't run anymore. | ||
241 | */ | ||
242 | void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | ||
243 | { | ||
244 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
245 | |||
246 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
247 | if (!hlist_unhashed(&mn->hlist)) { | ||
248 | hlist_del_rcu(&mn->hlist); | ||
249 | |||
250 | /* | ||
251 | * RCU here will force exit_mmap to wait ->release to finish | ||
252 | * before freeing the pages. | ||
253 | */ | ||
254 | rcu_read_lock(); | ||
255 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
256 | /* | ||
257 | * exit_mmap will block in mmu_notifier_release to | ||
258 | * guarantee ->release is called before freeing the | ||
259 | * pages. | ||
260 | */ | ||
261 | if (mn->ops->release) | ||
262 | mn->ops->release(mn, mm); | ||
263 | rcu_read_unlock(); | ||
264 | } else | ||
265 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
266 | |||
267 | /* | ||
268 | * Wait any running method to finish, of course including | ||
269 | * ->release if it was run by mmu_notifier_relase instead of us. | ||
270 | */ | ||
271 | synchronize_rcu(); | ||
272 | |||
273 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
274 | |||
275 | mmdrop(mm); | ||
276 | } | ||
277 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | ||
diff --git a/mm/mmzone.c b/mm/mmzone.c index 486ed595ee6f..16ce8b955dcf 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
69 | (z->zone && !zref_in_nodemask(z, nodes))) | 69 | (z->zone && !zref_in_nodemask(z, nodes))) |
70 | z++; | 70 | z++; |
71 | 71 | ||
72 | *zone = zonelist_zone(z++); | 72 | *zone = zonelist_zone(z); |
73 | return z; | 73 | return z; |
74 | } | 74 | } |
diff --git a/mm/mprotect.c b/mm/mprotect.c index abd645a3b0a0..fded06f923f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/mmu_notifier.h> | ||
24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
25 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
26 | #include <asm/cacheflush.h> | 27 | #include <asm/cacheflush.h> |
@@ -203,10 +204,12 @@ success: | |||
203 | dirty_accountable = 1; | 204 | dirty_accountable = 1; |
204 | } | 205 | } |
205 | 206 | ||
207 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
206 | if (is_vm_hugetlb_page(vma)) | 208 | if (is_vm_hugetlb_page(vma)) |
207 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 209 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); |
208 | else | 210 | else |
209 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | 211 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); |
212 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
210 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 213 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
211 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 214 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
212 | return 0; | 215 | return 0; |
diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f2bd15..1a7743923c8c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
19 | #include <linux/security.h> | 19 | #include <linux/security.h> |
20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
21 | #include <linux/mmu_notifier.h> | ||
21 | 22 | ||
22 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
23 | #include <asm/cacheflush.h> | 24 | #include <asm/cacheflush.h> |
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
74 | struct mm_struct *mm = vma->vm_mm; | 75 | struct mm_struct *mm = vma->vm_mm; |
75 | pte_t *old_pte, *new_pte, pte; | 76 | pte_t *old_pte, *new_pte, pte; |
76 | spinlock_t *old_ptl, *new_ptl; | 77 | spinlock_t *old_ptl, *new_ptl; |
78 | unsigned long old_start; | ||
77 | 79 | ||
80 | old_start = old_addr; | ||
81 | mmu_notifier_invalidate_range_start(vma->vm_mm, | ||
82 | old_start, old_end); | ||
78 | if (vma->vm_file) { | 83 | if (vma->vm_file) { |
79 | /* | 84 | /* |
80 | * Subtle point from Rajesh Venkatasubramanian: before | 85 | * Subtle point from Rajesh Venkatasubramanian: before |
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 121 | pte_unmap_unlock(old_pte - 1, old_ptl); |
117 | if (mapping) | 122 | if (mapping) |
118 | spin_unlock(&mapping->i_mmap_lock); | 123 | spin_unlock(&mapping->i_mmap_lock); |
124 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | ||
119 | } | 125 | } |
120 | 126 | ||
121 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 127 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
diff --git a/mm/nommu.c b/mm/nommu.c index 4462b6a3fcb9..ed75bc962fbe 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/ptrace.h> | 25 | #include <linux/tracehook.h> |
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node) | |||
266 | } | 266 | } |
267 | EXPORT_SYMBOL(vmalloc_node); | 267 | EXPORT_SYMBOL(vmalloc_node); |
268 | 268 | ||
269 | #ifndef PAGE_KERNEL_EXEC | ||
270 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
271 | #endif | ||
272 | |||
273 | /** | ||
274 | * vmalloc_exec - allocate virtually contiguous, executable memory | ||
275 | * @size: allocation size | ||
276 | * | ||
277 | * Kernel-internal function to allocate enough pages to cover @size | ||
278 | * the page level allocator and map them into contiguous and | ||
279 | * executable kernel virtual space. | ||
280 | * | ||
281 | * For tight control over page level allocator and protection flags | ||
282 | * use __vmalloc() instead. | ||
283 | */ | ||
284 | |||
285 | void *vmalloc_exec(unsigned long size) | ||
286 | { | ||
287 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | ||
288 | } | ||
289 | |||
269 | /** | 290 | /** |
270 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 291 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
271 | * @size: allocation size | 292 | * @size: allocation size |
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
745 | * it's being traced - otherwise breakpoints set in it may interfere | 766 | * it's being traced - otherwise breakpoints set in it may interfere |
746 | * with another untraced process | 767 | * with another untraced process |
747 | */ | 768 | */ |
748 | if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) | 769 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) |
749 | vm_flags &= ~VM_MAYSHARE; | 770 | vm_flags &= ~VM_MAYSHARE; |
750 | 771 | ||
751 | return vm_flags; | 772 | return vm_flags; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee6265..64e5b4bcd964 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
28 | #include <linux/memcontrol.h> | 28 | #include <linux/memcontrol.h> |
29 | #include <linux/security.h> | ||
29 | 30 | ||
30 | int sysctl_panic_on_oom; | 31 | int sysctl_panic_on_oom; |
31 | int sysctl_oom_kill_allocating_task; | 32 | int sysctl_oom_kill_allocating_task; |
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
128 | * Superuser processes are usually more important, so we make it | 129 | * Superuser processes are usually more important, so we make it |
129 | * less likely that we kill those. | 130 | * less likely that we kill those. |
130 | */ | 131 | */ |
131 | if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) | 132 | if (has_capability(p, CAP_SYS_ADMIN) || |
133 | has_capability(p, CAP_SYS_RESOURCE)) | ||
132 | points /= 4; | 134 | points /= 4; |
133 | 135 | ||
134 | /* | 136 | /* |
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
137 | * tend to only have this flag set on applications they think | 139 | * tend to only have this flag set on applications they think |
138 | * of as important. | 140 | * of as important. |
139 | */ | 141 | */ |
140 | if (__capable(p, CAP_SYS_RAWIO)) | 142 | if (has_capability(p, CAP_SYS_RAWIO)) |
141 | points /= 4; | 143 | points /= 4; |
142 | 144 | ||
143 | /* | 145 | /* |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab3..24de8b65fdbd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
1088 | if (!mapping) | 1088 | if (!mapping) |
1089 | return 1; | 1089 | return 1; |
1090 | 1090 | ||
1091 | write_lock_irq(&mapping->tree_lock); | 1091 | spin_lock_irq(&mapping->tree_lock); |
1092 | mapping2 = page_mapping(page); | 1092 | mapping2 = page_mapping(page); |
1093 | if (mapping2) { /* Race with truncate? */ | 1093 | if (mapping2) { /* Race with truncate? */ |
1094 | BUG_ON(mapping2 != mapping); | 1094 | BUG_ON(mapping2 != mapping); |
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
1102 | radix_tree_tag_set(&mapping->page_tree, | 1102 | radix_tree_tag_set(&mapping->page_tree, |
1103 | page_index(page), PAGECACHE_TAG_DIRTY); | 1103 | page_index(page), PAGECACHE_TAG_DIRTY); |
1104 | } | 1104 | } |
1105 | write_unlock_irq(&mapping->tree_lock); | 1105 | spin_unlock_irq(&mapping->tree_lock); |
1106 | if (mapping->host) { | 1106 | if (mapping->host) { |
1107 | /* !PageAnon && !swapper_space */ | 1107 | /* !PageAnon && !swapper_space */ |
1108 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 1108 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page) | |||
1258 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1258 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1259 | unsigned long flags; | 1259 | unsigned long flags; |
1260 | 1260 | ||
1261 | write_lock_irqsave(&mapping->tree_lock, flags); | 1261 | spin_lock_irqsave(&mapping->tree_lock, flags); |
1262 | ret = TestClearPageWriteback(page); | 1262 | ret = TestClearPageWriteback(page); |
1263 | if (ret) { | 1263 | if (ret) { |
1264 | radix_tree_tag_clear(&mapping->page_tree, | 1264 | radix_tree_tag_clear(&mapping->page_tree, |
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page) | |||
1269 | __bdi_writeout_inc(bdi); | 1269 | __bdi_writeout_inc(bdi); |
1270 | } | 1270 | } |
1271 | } | 1271 | } |
1272 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1272 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
1273 | } else { | 1273 | } else { |
1274 | ret = TestClearPageWriteback(page); | 1274 | ret = TestClearPageWriteback(page); |
1275 | } | 1275 | } |
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page) | |||
1287 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1287 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1288 | unsigned long flags; | 1288 | unsigned long flags; |
1289 | 1289 | ||
1290 | write_lock_irqsave(&mapping->tree_lock, flags); | 1290 | spin_lock_irqsave(&mapping->tree_lock, flags); |
1291 | ret = TestSetPageWriteback(page); | 1291 | ret = TestSetPageWriteback(page); |
1292 | if (!ret) { | 1292 | if (!ret) { |
1293 | radix_tree_tag_set(&mapping->page_tree, | 1293 | radix_tree_tag_set(&mapping->page_tree, |
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page) | |||
1300 | radix_tree_tag_clear(&mapping->page_tree, | 1300 | radix_tree_tag_clear(&mapping->page_tree, |
1301 | page_index(page), | 1301 | page_index(page), |
1302 | PAGECACHE_TAG_DIRTY); | 1302 | PAGECACHE_TAG_DIRTY); |
1303 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1303 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
1304 | } else { | 1304 | } else { |
1305 | ret = TestSetPageWriteback(page); | 1305 | ret = TestSetPageWriteback(page); |
1306 | } | 1306 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6da667274df5..27b8681139fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
268 | { | 268 | { |
269 | int i; | 269 | int i; |
270 | int nr_pages = 1 << order; | 270 | int nr_pages = 1 << order; |
271 | struct page *p = page + 1; | ||
271 | 272 | ||
272 | set_compound_page_dtor(page, free_compound_page); | 273 | set_compound_page_dtor(page, free_compound_page); |
273 | set_compound_order(page, order); | 274 | set_compound_order(page, order); |
274 | __SetPageHead(page); | 275 | __SetPageHead(page); |
275 | for (i = 1; i < nr_pages; i++) { | 276 | for (i = 1; i < nr_pages; i++, p++) { |
276 | struct page *p = page + i; | 277 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) |
277 | 278 | p = pfn_to_page(page_to_pfn(page) + i); | |
278 | __SetPageTail(p); | 279 | __SetPageTail(p); |
279 | p->first_page = page; | 280 | p->first_page = page; |
280 | } | 281 | } |
@@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
284 | { | 285 | { |
285 | int i; | 286 | int i; |
286 | int nr_pages = 1 << order; | 287 | int nr_pages = 1 << order; |
288 | struct page *p = page + 1; | ||
287 | 289 | ||
288 | if (unlikely(compound_order(page) != order)) | 290 | if (unlikely(compound_order(page) != order)) |
289 | bad_page(page); | 291 | bad_page(page); |
@@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
291 | if (unlikely(!PageHead(page))) | 293 | if (unlikely(!PageHead(page))) |
292 | bad_page(page); | 294 | bad_page(page); |
293 | __ClearPageHead(page); | 295 | __ClearPageHead(page); |
294 | for (i = 1; i < nr_pages; i++) { | 296 | for (i = 1; i < nr_pages; i++, p++) { |
295 | struct page *p = page + i; | 297 | if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) |
298 | p = pfn_to_page(page_to_pfn(page) + i); | ||
296 | 299 | ||
297 | if (unlikely(!PageTail(p) | | 300 | if (unlikely(!PageTail(p) | |
298 | (p->first_page != page))) | 301 | (p->first_page != page))) |
@@ -694,6 +697,9 @@ static int move_freepages(struct zone *zone, | |||
694 | #endif | 697 | #endif |
695 | 698 | ||
696 | for (page = start_page; page <= end_page;) { | 699 | for (page = start_page; page <= end_page;) { |
700 | /* Make sure we are not inadvertently changing nodes */ | ||
701 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | ||
702 | |||
697 | if (!pfn_valid_within(page_to_pfn(page))) { | 703 | if (!pfn_valid_within(page_to_pfn(page))) { |
698 | page++; | 704 | page++; |
699 | continue; | 705 | continue; |
@@ -2372,7 +2378,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2372 | 2378 | ||
2373 | #endif /* CONFIG_NUMA */ | 2379 | #endif /* CONFIG_NUMA */ |
2374 | 2380 | ||
2375 | /* return values int ....just for stop_machine_run() */ | 2381 | /* return values int ....just for stop_machine() */ |
2376 | static int __build_all_zonelists(void *dummy) | 2382 | static int __build_all_zonelists(void *dummy) |
2377 | { | 2383 | { |
2378 | int nid; | 2384 | int nid; |
@@ -2397,7 +2403,7 @@ void build_all_zonelists(void) | |||
2397 | } else { | 2403 | } else { |
2398 | /* we have to stop all cpus to guarantee there is no user | 2404 | /* we have to stop all cpus to guarantee there is no user |
2399 | of zonelist */ | 2405 | of zonelist */ |
2400 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | 2406 | stop_machine(__build_all_zonelists, NULL, NULL); |
2401 | /* cpuset refresh routine should be here */ | 2407 | /* cpuset refresh routine should be here */ |
2402 | } | 2408 | } |
2403 | vm_total_pages = nr_free_pagecache_pages(); | 2409 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -2516,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2516 | continue; | 2522 | continue; |
2517 | page = pfn_to_page(pfn); | 2523 | page = pfn_to_page(pfn); |
2518 | 2524 | ||
2525 | /* Watch out for overlapping nodes */ | ||
2526 | if (page_to_nid(page) != zone_to_nid(zone)) | ||
2527 | continue; | ||
2528 | |||
2519 | /* Blocks with reserved pages will never free, skip them. */ | 2529 | /* Blocks with reserved pages will never free, skip them. */ |
2520 | if (PageReserved(page)) | 2530 | if (PageReserved(page)) |
2521 | continue; | 2531 | continue; |
@@ -3753,23 +3763,6 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
3753 | return find_min_pfn_for_node(MAX_NUMNODES); | 3763 | return find_min_pfn_for_node(MAX_NUMNODES); |
3754 | } | 3764 | } |
3755 | 3765 | ||
3756 | /** | ||
3757 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
3758 | * | ||
3759 | * It returns the maximum PFN based on information provided via | ||
3760 | * add_active_range(). | ||
3761 | */ | ||
3762 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
3763 | { | ||
3764 | int i; | ||
3765 | unsigned long max_pfn = 0; | ||
3766 | |||
3767 | for (i = 0; i < nr_nodemap_entries; i++) | ||
3768 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
3769 | |||
3770 | return max_pfn; | ||
3771 | } | ||
3772 | |||
3773 | /* | 3766 | /* |
3774 | * early_calculate_totalpages() | 3767 | * early_calculate_totalpages() |
3775 | * Sum pages in active regions for movable zone. | 3768 | * Sum pages in active regions for movable zone. |
@@ -4081,7 +4074,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4081 | } | 4074 | } |
4082 | 4075 | ||
4083 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4076 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4084 | struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4077 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; |
4085 | EXPORT_SYMBOL(contig_page_data); | 4078 | EXPORT_SYMBOL(contig_page_data); |
4086 | #endif | 4079 | #endif |
4087 | 4080 | ||
@@ -4454,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4454 | do { | 4447 | do { |
4455 | size = bucketsize << log2qty; | 4448 | size = bucketsize << log2qty; |
4456 | if (flags & HASH_EARLY) | 4449 | if (flags & HASH_EARLY) |
4457 | table = alloc_bootmem(size); | 4450 | table = alloc_bootmem_nopanic(size); |
4458 | else if (hashdist) | 4451 | else if (hashdist) |
4459 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4452 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4460 | else { | 4453 | else { |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3444b58033c8..b70a7fec1ff6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -2,7 +2,6 @@ | |||
2 | * linux/mm/page_isolation.c | 2 | * linux/mm/page_isolation.c |
3 | */ | 3 | */ |
4 | 4 | ||
5 | #include <stddef.h> | ||
6 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
7 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
8 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
115 | 114 | ||
116 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | 115 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) |
117 | { | 116 | { |
118 | unsigned long pfn; | 117 | unsigned long pfn, flags; |
119 | struct page *page; | 118 | struct page *page; |
119 | struct zone *zone; | ||
120 | int ret; | ||
120 | 121 | ||
121 | pfn = start_pfn; | 122 | pfn = start_pfn; |
122 | /* | 123 | /* |
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
132 | if (pfn < end_pfn) | 133 | if (pfn < end_pfn) |
133 | return -EBUSY; | 134 | return -EBUSY; |
134 | /* Check all pages are free or Marked as ISOLATED */ | 135 | /* Check all pages are free or Marked as ISOLATED */ |
135 | if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) | 136 | zone = page_zone(pfn_to_page(pfn)); |
136 | return 0; | 137 | spin_lock_irqsave(&zone->lock, flags); |
137 | return -EBUSY; | 138 | ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); |
139 | spin_unlock_irqrestore(&zone->lock, flags); | ||
140 | return ret ? 0 : -EBUSY; | ||
138 | } | 141 | } |
diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7cb398..8dbb6805ef35 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; | |||
26 | static unsigned long max_pages(unsigned long min_pages) | 26 | static unsigned long max_pages(unsigned long min_pages) |
27 | { | 27 | { |
28 | unsigned long node_free_pages, max; | 28 | unsigned long node_free_pages, max; |
29 | struct zone *zones = NODE_DATA(numa_node_id())->node_zones; | 29 | int node = numa_node_id(); |
30 | struct zone *zones = NODE_DATA(node)->node_zones; | ||
31 | int num_cpus_on_node; | ||
32 | node_to_cpumask_ptr(cpumask_on_node, node); | ||
30 | 33 | ||
31 | node_free_pages = | 34 | node_free_pages = |
32 | #ifdef CONFIG_ZONE_DMA | 35 | #ifdef CONFIG_ZONE_DMA |
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) | |||
38 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); | 41 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); |
39 | 42 | ||
40 | max = node_free_pages / FRACTION_OF_NODE_MEM; | 43 | max = node_free_pages / FRACTION_OF_NODE_MEM; |
44 | |||
45 | num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); | ||
46 | max /= num_cpus_on_node; | ||
47 | |||
41 | return max(max, min_pages); | 48 | return max(max, min_pages); |
42 | } | 49 | } |
43 | 50 | ||
diff --git a/mm/readahead.c b/mm/readahead.c index d8723a5f6496..77e8ddf945e9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping, | |||
382 | if (hit_readahead_marker) { | 382 | if (hit_readahead_marker) { |
383 | pgoff_t start; | 383 | pgoff_t start; |
384 | 384 | ||
385 | read_lock_irq(&mapping->tree_lock); | 385 | rcu_read_lock(); |
386 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | 386 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); |
387 | read_unlock_irq(&mapping->tree_lock); | 387 | rcu_read_unlock(); |
388 | 388 | ||
389 | if (!start || start - offset > max) | 389 | if (!start || start - offset > max) |
390 | return 0; | 390 | return 0; |
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
51 | #include <linux/memcontrol.h> | 51 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | ||
52 | 53 | ||
53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
54 | 55 | ||
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
138 | anon_vma_free(anon_vma); | 139 | anon_vma_free(anon_vma); |
139 | } | 140 | } |
140 | 141 | ||
141 | static void anon_vma_ctor(struct kmem_cache *cachep, void *data) | 142 | static void anon_vma_ctor(void *data) |
142 | { | 143 | { |
143 | struct anon_vma *anon_vma = data; | 144 | struct anon_vma *anon_vma = data; |
144 | 145 | ||
@@ -223,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
223 | /* | 224 | /* |
224 | * Check that @page is mapped at @address into @mm. | 225 | * Check that @page is mapped at @address into @mm. |
225 | * | 226 | * |
227 | * If @sync is false, page_check_address may perform a racy check to avoid | ||
228 | * the page table lock when the pte is not present (helpful when reclaiming | ||
229 | * highly shared pages). | ||
230 | * | ||
226 | * On success returns with pte mapped and locked. | 231 | * On success returns with pte mapped and locked. |
227 | */ | 232 | */ |
228 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 233 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
229 | unsigned long address, spinlock_t **ptlp) | 234 | unsigned long address, spinlock_t **ptlp, int sync) |
230 | { | 235 | { |
231 | pgd_t *pgd; | 236 | pgd_t *pgd; |
232 | pud_t *pud; | 237 | pud_t *pud; |
@@ -248,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
248 | 253 | ||
249 | pte = pte_offset_map(pmd, address); | 254 | pte = pte_offset_map(pmd, address); |
250 | /* Make a quick check before getting the lock */ | 255 | /* Make a quick check before getting the lock */ |
251 | if (!pte_present(*pte)) { | 256 | if (!sync && !pte_present(*pte)) { |
252 | pte_unmap(pte); | 257 | pte_unmap(pte); |
253 | return NULL; | 258 | return NULL; |
254 | } | 259 | } |
@@ -280,14 +285,14 @@ static int page_referenced_one(struct page *page, | |||
280 | if (address == -EFAULT) | 285 | if (address == -EFAULT) |
281 | goto out; | 286 | goto out; |
282 | 287 | ||
283 | pte = page_check_address(page, mm, address, &ptl); | 288 | pte = page_check_address(page, mm, address, &ptl, 0); |
284 | if (!pte) | 289 | if (!pte) |
285 | goto out; | 290 | goto out; |
286 | 291 | ||
287 | if (vma->vm_flags & VM_LOCKED) { | 292 | if (vma->vm_flags & VM_LOCKED) { |
288 | referenced++; | 293 | referenced++; |
289 | *mapcount = 1; /* break early from loop */ | 294 | *mapcount = 1; /* break early from loop */ |
290 | } else if (ptep_clear_flush_young(vma, address, pte)) | 295 | } else if (ptep_clear_flush_young_notify(vma, address, pte)) |
291 | referenced++; | 296 | referenced++; |
292 | 297 | ||
293 | /* Pretend the page is referenced if the task has the | 298 | /* Pretend the page is referenced if the task has the |
@@ -421,7 +426,7 @@ int page_referenced(struct page *page, int is_locked, | |||
421 | referenced += page_referenced_anon(page, mem_cont); | 426 | referenced += page_referenced_anon(page, mem_cont); |
422 | else if (is_locked) | 427 | else if (is_locked) |
423 | referenced += page_referenced_file(page, mem_cont); | 428 | referenced += page_referenced_file(page, mem_cont); |
424 | else if (TestSetPageLocked(page)) | 429 | else if (!trylock_page(page)) |
425 | referenced++; | 430 | referenced++; |
426 | else { | 431 | else { |
427 | if (page->mapping) | 432 | if (page->mapping) |
@@ -449,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
449 | if (address == -EFAULT) | 454 | if (address == -EFAULT) |
450 | goto out; | 455 | goto out; |
451 | 456 | ||
452 | pte = page_check_address(page, mm, address, &ptl); | 457 | pte = page_check_address(page, mm, address, &ptl, 1); |
453 | if (!pte) | 458 | if (!pte) |
454 | goto out; | 459 | goto out; |
455 | 460 | ||
@@ -457,7 +462,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
457 | pte_t entry; | 462 | pte_t entry; |
458 | 463 | ||
459 | flush_cache_page(vma, address, pte_pfn(*pte)); | 464 | flush_cache_page(vma, address, pte_pfn(*pte)); |
460 | entry = ptep_clear_flush(vma, address, pte); | 465 | entry = ptep_clear_flush_notify(vma, address, pte); |
461 | entry = pte_wrprotect(entry); | 466 | entry = pte_wrprotect(entry); |
462 | entry = pte_mkclean(entry); | 467 | entry = pte_mkclean(entry); |
463 | set_pte_at(mm, address, pte, entry); | 468 | set_pte_at(mm, address, pte, entry); |
@@ -658,6 +663,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
658 | } | 663 | } |
659 | 664 | ||
660 | /* | 665 | /* |
666 | * Now that the last pte has gone, s390 must transfer dirty | ||
667 | * flag from storage key to struct page. We can usually skip | ||
668 | * this if the page is anon, so about to be freed; but perhaps | ||
669 | * not if it's in swapcache - there might be another pte slot | ||
670 | * containing the swap entry, but page not yet written to swap. | ||
671 | */ | ||
672 | if ((!PageAnon(page) || PageSwapCache(page)) && | ||
673 | page_test_dirty(page)) { | ||
674 | page_clear_dirty(page); | ||
675 | set_page_dirty(page); | ||
676 | } | ||
677 | |||
678 | mem_cgroup_uncharge_page(page); | ||
679 | __dec_zone_page_state(page, | ||
680 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
681 | /* | ||
661 | * It would be tidy to reset the PageAnon mapping here, | 682 | * It would be tidy to reset the PageAnon mapping here, |
662 | * but that might overwrite a racing page_add_anon_rmap | 683 | * but that might overwrite a racing page_add_anon_rmap |
663 | * which increments mapcount after us but sets mapping | 684 | * which increments mapcount after us but sets mapping |
@@ -666,14 +687,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
666 | * Leaving it set also helps swapoff to reinstate ptes | 687 | * Leaving it set also helps swapoff to reinstate ptes |
667 | * faster for those pages still in swapcache. | 688 | * faster for those pages still in swapcache. |
668 | */ | 689 | */ |
669 | if (page_test_dirty(page)) { | ||
670 | page_clear_dirty(page); | ||
671 | set_page_dirty(page); | ||
672 | } | ||
673 | mem_cgroup_uncharge_page(page); | ||
674 | |||
675 | __dec_zone_page_state(page, | ||
676 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
677 | } | 690 | } |
678 | } | 691 | } |
679 | 692 | ||
@@ -695,7 +708,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
695 | if (address == -EFAULT) | 708 | if (address == -EFAULT) |
696 | goto out; | 709 | goto out; |
697 | 710 | ||
698 | pte = page_check_address(page, mm, address, &ptl); | 711 | pte = page_check_address(page, mm, address, &ptl, 0); |
699 | if (!pte) | 712 | if (!pte) |
700 | goto out; | 713 | goto out; |
701 | 714 | ||
@@ -705,14 +718,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
705 | * skipped over this mm) then we should reactivate it. | 718 | * skipped over this mm) then we should reactivate it. |
706 | */ | 719 | */ |
707 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 720 | if (!migration && ((vma->vm_flags & VM_LOCKED) || |
708 | (ptep_clear_flush_young(vma, address, pte)))) { | 721 | (ptep_clear_flush_young_notify(vma, address, pte)))) { |
709 | ret = SWAP_FAIL; | 722 | ret = SWAP_FAIL; |
710 | goto out_unmap; | 723 | goto out_unmap; |
711 | } | 724 | } |
712 | 725 | ||
713 | /* Nuke the page table entry. */ | 726 | /* Nuke the page table entry. */ |
714 | flush_cache_page(vma, address, page_to_pfn(page)); | 727 | flush_cache_page(vma, address, page_to_pfn(page)); |
715 | pteval = ptep_clear_flush(vma, address, pte); | 728 | pteval = ptep_clear_flush_notify(vma, address, pte); |
716 | 729 | ||
717 | /* Move the dirty bit to the physical page now the pte is gone. */ | 730 | /* Move the dirty bit to the physical page now the pte is gone. */ |
718 | if (pte_dirty(pteval)) | 731 | if (pte_dirty(pteval)) |
@@ -837,12 +850,12 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
837 | page = vm_normal_page(vma, address, *pte); | 850 | page = vm_normal_page(vma, address, *pte); |
838 | BUG_ON(!page || PageAnon(page)); | 851 | BUG_ON(!page || PageAnon(page)); |
839 | 852 | ||
840 | if (ptep_clear_flush_young(vma, address, pte)) | 853 | if (ptep_clear_flush_young_notify(vma, address, pte)) |
841 | continue; | 854 | continue; |
842 | 855 | ||
843 | /* Nuke the page table entry. */ | 856 | /* Nuke the page table entry. */ |
844 | flush_cache_page(vma, address, pte_pfn(*pte)); | 857 | flush_cache_page(vma, address, pte_pfn(*pte)); |
845 | pteval = ptep_clear_flush(vma, address, pte); | 858 | pteval = ptep_clear_flush_notify(vma, address, pte); |
846 | 859 | ||
847 | /* If nonlinear, store the file page offset in the pte. */ | 860 | /* If nonlinear, store the file page offset in the pte. */ |
848 | if (page->index != linear_page_index(vma, address)) | 861 | if (page->index != linear_page_index(vma, address)) |
diff --git a/mm/shmem.c b/mm/shmem.c index f92fea94d037..04fb4f1ab88e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -936,7 +936,7 @@ found: | |||
936 | spin_lock(&info->lock); | 936 | spin_lock(&info->lock); |
937 | ptr = shmem_swp_entry(info, idx, NULL); | 937 | ptr = shmem_swp_entry(info, idx, NULL); |
938 | if (ptr && ptr->val == entry.val) { | 938 | if (ptr && ptr->val == entry.val) { |
939 | error = add_to_page_cache(page, inode->i_mapping, | 939 | error = add_to_page_cache_locked(page, inode->i_mapping, |
940 | idx, GFP_NOWAIT); | 940 | idx, GFP_NOWAIT); |
941 | /* does mem_cgroup_uncharge_cache_page on error */ | 941 | /* does mem_cgroup_uncharge_cache_page on error */ |
942 | } else /* we must compensate for our precharge above */ | 942 | } else /* we must compensate for our precharge above */ |
@@ -1265,7 +1265,7 @@ repeat: | |||
1265 | } | 1265 | } |
1266 | 1266 | ||
1267 | /* We have to do this with page locked to prevent races */ | 1267 | /* We have to do this with page locked to prevent races */ |
1268 | if (TestSetPageLocked(swappage)) { | 1268 | if (!trylock_page(swappage)) { |
1269 | shmem_swp_unmap(entry); | 1269 | shmem_swp_unmap(entry); |
1270 | spin_unlock(&info->lock); | 1270 | spin_unlock(&info->lock); |
1271 | wait_on_page_locked(swappage); | 1271 | wait_on_page_locked(swappage); |
@@ -1301,8 +1301,8 @@ repeat: | |||
1301 | SetPageUptodate(filepage); | 1301 | SetPageUptodate(filepage); |
1302 | set_page_dirty(filepage); | 1302 | set_page_dirty(filepage); |
1303 | swap_free(swap); | 1303 | swap_free(swap); |
1304 | } else if (!(error = add_to_page_cache( | 1304 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, |
1305 | swappage, mapping, idx, GFP_NOWAIT))) { | 1305 | idx, GFP_NOWAIT))) { |
1306 | info->flags |= SHMEM_PAGEIN; | 1306 | info->flags |= SHMEM_PAGEIN; |
1307 | shmem_swp_set(info, entry, 0); | 1307 | shmem_swp_set(info, entry, 0); |
1308 | shmem_swp_unmap(entry); | 1308 | shmem_swp_unmap(entry); |
@@ -1329,7 +1329,7 @@ repeat: | |||
1329 | shmem_swp_unmap(entry); | 1329 | shmem_swp_unmap(entry); |
1330 | filepage = find_get_page(mapping, idx); | 1330 | filepage = find_get_page(mapping, idx); |
1331 | if (filepage && | 1331 | if (filepage && |
1332 | (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { | 1332 | (!PageUptodate(filepage) || !trylock_page(filepage))) { |
1333 | spin_unlock(&info->lock); | 1333 | spin_unlock(&info->lock); |
1334 | wait_on_page_locked(filepage); | 1334 | wait_on_page_locked(filepage); |
1335 | page_cache_release(filepage); | 1335 | page_cache_release(filepage); |
@@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1513 | inode->i_uid = current->fsuid; | 1513 | inode->i_uid = current->fsuid; |
1514 | inode->i_gid = current->fsgid; | 1514 | inode->i_gid = current->fsgid; |
1515 | inode->i_blocks = 0; | 1515 | inode->i_blocks = 0; |
1516 | inode->i_mapping->a_ops = &shmem_aops; | ||
1517 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1516 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
1518 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1517 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1519 | inode->i_generation = get_seconds(); | 1518 | inode->i_generation = get_seconds(); |
@@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1528 | init_special_inode(inode, mode, dev); | 1527 | init_special_inode(inode, mode, dev); |
1529 | break; | 1528 | break; |
1530 | case S_IFREG: | 1529 | case S_IFREG: |
1530 | inode->i_mapping->a_ops = &shmem_aops; | ||
1531 | inode->i_op = &shmem_inode_operations; | 1531 | inode->i_op = &shmem_inode_operations; |
1532 | inode->i_fop = &shmem_file_operations; | 1532 | inode->i_fop = &shmem_file_operations; |
1533 | mpol_shared_policy_init(&info->policy, | 1533 | mpol_shared_policy_init(&info->policy, |
@@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1929 | return error; | 1929 | return error; |
1930 | } | 1930 | } |
1931 | unlock_page(page); | 1931 | unlock_page(page); |
1932 | inode->i_mapping->a_ops = &shmem_aops; | ||
1932 | inode->i_op = &shmem_symlink_inode_operations; | 1933 | inode->i_op = &shmem_symlink_inode_operations; |
1933 | kaddr = kmap_atomic(page, KM_USER0); | 1934 | kaddr = kmap_atomic(page, KM_USER0); |
1934 | memcpy(kaddr, symname, len); | 1935 | memcpy(kaddr, symname, len); |
@@ -2352,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2352 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2353 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2353 | } | 2354 | } |
2354 | 2355 | ||
2355 | static void init_once(struct kmem_cache *cachep, void *foo) | 2356 | static void init_once(void *foo) |
2356 | { | 2357 | { |
2357 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2358 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2358 | 2359 | ||
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index f5664c5b9eb1..8e5aadd7dcd6 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c | |||
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) | |||
191 | * shmem_permission - permission() inode operation | 191 | * shmem_permission - permission() inode operation |
192 | */ | 192 | */ |
193 | int | 193 | int |
194 | shmem_permission(struct inode *inode, int mask, struct nameidata *nd) | 194 | shmem_permission(struct inode *inode, int mask) |
195 | { | 195 | { |
196 | return generic_permission(inode, mask, shmem_check_acl); | 196 | return generic_permission(inode, mask, shmem_check_acl); |
197 | } | 197 | } |
@@ -406,7 +406,7 @@ struct kmem_cache { | |||
406 | unsigned int dflags; /* dynamic flags */ | 406 | unsigned int dflags; /* dynamic flags */ |
407 | 407 | ||
408 | /* constructor func */ | 408 | /* constructor func */ |
409 | void (*ctor)(struct kmem_cache *, void *); | 409 | void (*ctor)(void *obj); |
410 | 410 | ||
411 | /* 5) cache creation/removal */ | 411 | /* 5) cache creation/removal */ |
412 | const char *name; | 412 | const char *name; |
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2137 | */ | 2137 | */ |
2138 | struct kmem_cache * | 2138 | struct kmem_cache * |
2139 | kmem_cache_create (const char *name, size_t size, size_t align, | 2139 | kmem_cache_create (const char *name, size_t size, size_t align, |
2140 | unsigned long flags, | 2140 | unsigned long flags, void (*ctor)(void *)) |
2141 | void (*ctor)(struct kmem_cache *, void *)) | ||
2142 | { | 2141 | { |
2143 | size_t left_over, slab_size, ralign; | 2142 | size_t left_over, slab_size, ralign; |
2144 | struct kmem_cache *cachep = NULL, *pc; | 2143 | struct kmem_cache *cachep = NULL, *pc; |
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2653 | * They must also be threaded. | 2652 | * They must also be threaded. |
2654 | */ | 2653 | */ |
2655 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2654 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2656 | cachep->ctor(cachep, objp + obj_offset(cachep)); | 2655 | cachep->ctor(objp + obj_offset(cachep)); |
2657 | 2656 | ||
2658 | if (cachep->flags & SLAB_RED_ZONE) { | 2657 | if (cachep->flags & SLAB_RED_ZONE) { |
2659 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2658 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2669 | cachep->buffer_size / PAGE_SIZE, 0); | 2668 | cachep->buffer_size / PAGE_SIZE, 0); |
2670 | #else | 2669 | #else |
2671 | if (cachep->ctor) | 2670 | if (cachep->ctor) |
2672 | cachep->ctor(cachep, objp); | 2671 | cachep->ctor(objp); |
2673 | #endif | 2672 | #endif |
2674 | slab_bufctl(slabp)[i] = i + 1; | 2673 | slab_bufctl(slabp)[i] = i + 1; |
2675 | } | 2674 | } |
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3093 | #endif | 3092 | #endif |
3094 | objp += obj_offset(cachep); | 3093 | objp += obj_offset(cachep); |
3095 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3094 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3096 | cachep->ctor(cachep, objp); | 3095 | cachep->ctor(objp); |
3097 | #if ARCH_SLAB_MINALIGN | 3096 | #if ARCH_SLAB_MINALIGN |
3098 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3097 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3099 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3098 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
@@ -4473,4 +4472,3 @@ size_t ksize(const void *objp) | |||
4473 | 4472 | ||
4474 | return obj_size(virt_to_cache(objp)); | 4473 | return obj_size(virt_to_cache(objp)); |
4475 | } | 4474 | } |
4476 | EXPORT_SYMBOL(ksize); | ||
@@ -514,23 +514,23 @@ size_t ksize(const void *block) | |||
514 | return 0; | 514 | return 0; |
515 | 515 | ||
516 | sp = (struct slob_page *)virt_to_page(block); | 516 | sp = (struct slob_page *)virt_to_page(block); |
517 | if (slob_page(sp)) | 517 | if (slob_page(sp)) { |
518 | return ((slob_t *)block - 1)->units + SLOB_UNIT; | 518 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
519 | else | 519 | unsigned int *m = (unsigned int *)(block - align); |
520 | return SLOB_UNITS(*m) * SLOB_UNIT; | ||
521 | } else | ||
520 | return sp->page.private; | 522 | return sp->page.private; |
521 | } | 523 | } |
522 | EXPORT_SYMBOL(ksize); | ||
523 | 524 | ||
524 | struct kmem_cache { | 525 | struct kmem_cache { |
525 | unsigned int size, align; | 526 | unsigned int size, align; |
526 | unsigned long flags; | 527 | unsigned long flags; |
527 | const char *name; | 528 | const char *name; |
528 | void (*ctor)(struct kmem_cache *, void *); | 529 | void (*ctor)(void *); |
529 | }; | 530 | }; |
530 | 531 | ||
531 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 532 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
532 | size_t align, unsigned long flags, | 533 | size_t align, unsigned long flags, void (*ctor)(void *)) |
533 | void (*ctor)(struct kmem_cache *, void *)) | ||
534 | { | 534 | { |
535 | struct kmem_cache *c; | 535 | struct kmem_cache *c; |
536 | 536 | ||
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
575 | b = slob_new_page(flags, get_order(c->size), node); | 575 | b = slob_new_page(flags, get_order(c->size), node); |
576 | 576 | ||
577 | if (c->ctor) | 577 | if (c->ctor) |
578 | c->ctor(c, b); | 578 | c->ctor(b); |
579 | 579 | ||
580 | return b; | 580 | return b; |
581 | } | 581 | } |
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug); | |||
1012 | 1012 | ||
1013 | static unsigned long kmem_cache_flags(unsigned long objsize, | 1013 | static unsigned long kmem_cache_flags(unsigned long objsize, |
1014 | unsigned long flags, const char *name, | 1014 | unsigned long flags, const char *name, |
1015 | void (*ctor)(struct kmem_cache *, void *)) | 1015 | void (*ctor)(void *)) |
1016 | { | 1016 | { |
1017 | /* | 1017 | /* |
1018 | * Enable debugging if selected on the kernel commandline. | 1018 | * Enable debugging if selected on the kernel commandline. |
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1040 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1040 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} |
1041 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1041 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1042 | unsigned long flags, const char *name, | 1042 | unsigned long flags, const char *name, |
1043 | void (*ctor)(struct kmem_cache *, void *)) | 1043 | void (*ctor)(void *)) |
1044 | { | 1044 | { |
1045 | return flags; | 1045 | return flags; |
1046 | } | 1046 | } |
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
1103 | { | 1103 | { |
1104 | setup_object_debug(s, page, object); | 1104 | setup_object_debug(s, page, object); |
1105 | if (unlikely(s->ctor)) | 1105 | if (unlikely(s->ctor)) |
1106 | s->ctor(s, object); | 1106 | s->ctor(object); |
1107 | } | 1107 | } |
1108 | 1108 | ||
1109 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1109 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1329,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1329 | n = get_node(s, zone_to_nid(zone)); | 1329 | n = get_node(s, zone_to_nid(zone)); |
1330 | 1330 | ||
1331 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1331 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1332 | n->nr_partial > MIN_PARTIAL) { | 1332 | n->nr_partial > n->min_partial) { |
1333 | page = get_partial_node(n); | 1333 | page = get_partial_node(n); |
1334 | if (page) | 1334 | if (page) |
1335 | return page; | 1335 | return page; |
@@ -1381,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1381 | slab_unlock(page); | 1381 | slab_unlock(page); |
1382 | } else { | 1382 | } else { |
1383 | stat(c, DEACTIVATE_EMPTY); | 1383 | stat(c, DEACTIVATE_EMPTY); |
1384 | if (n->nr_partial < MIN_PARTIAL) { | 1384 | if (n->nr_partial < n->min_partial) { |
1385 | /* | 1385 | /* |
1386 | * Adding an empty slab to the partial slabs in order | 1386 | * Adding an empty slab to the partial slabs in order |
1387 | * to avoid page allocator overhead. This slab needs | 1387 | * to avoid page allocator overhead. This slab needs |
@@ -1913,13 +1913,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, | |||
1913 | #endif | 1913 | #endif |
1914 | } | 1914 | } |
1915 | 1915 | ||
1916 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1916 | static void |
1917 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | ||
1917 | { | 1918 | { |
1918 | n->nr_partial = 0; | 1919 | n->nr_partial = 0; |
1920 | |||
1921 | /* | ||
1922 | * The larger the object size is, the more pages we want on the partial | ||
1923 | * list to avoid pounding the page allocator excessively. | ||
1924 | */ | ||
1925 | n->min_partial = ilog2(s->size); | ||
1926 | if (n->min_partial < MIN_PARTIAL) | ||
1927 | n->min_partial = MIN_PARTIAL; | ||
1928 | else if (n->min_partial > MAX_PARTIAL) | ||
1929 | n->min_partial = MAX_PARTIAL; | ||
1930 | |||
1919 | spin_lock_init(&n->list_lock); | 1931 | spin_lock_init(&n->list_lock); |
1920 | INIT_LIST_HEAD(&n->partial); | 1932 | INIT_LIST_HEAD(&n->partial); |
1921 | #ifdef CONFIG_SLUB_DEBUG | 1933 | #ifdef CONFIG_SLUB_DEBUG |
1922 | atomic_long_set(&n->nr_slabs, 0); | 1934 | atomic_long_set(&n->nr_slabs, 0); |
1935 | atomic_long_set(&n->total_objects, 0); | ||
1923 | INIT_LIST_HEAD(&n->full); | 1936 | INIT_LIST_HEAD(&n->full); |
1924 | #endif | 1937 | #endif |
1925 | } | 1938 | } |
@@ -2087,7 +2100,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
2087 | init_object(kmalloc_caches, n, 1); | 2100 | init_object(kmalloc_caches, n, 1); |
2088 | init_tracking(kmalloc_caches, n); | 2101 | init_tracking(kmalloc_caches, n); |
2089 | #endif | 2102 | #endif |
2090 | init_kmem_cache_node(n); | 2103 | init_kmem_cache_node(n, kmalloc_caches); |
2091 | inc_slabs_node(kmalloc_caches, node, page->objects); | 2104 | inc_slabs_node(kmalloc_caches, node, page->objects); |
2092 | 2105 | ||
2093 | /* | 2106 | /* |
@@ -2144,7 +2157,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
2144 | 2157 | ||
2145 | } | 2158 | } |
2146 | s->node[node] = n; | 2159 | s->node[node] = n; |
2147 | init_kmem_cache_node(n); | 2160 | init_kmem_cache_node(n, s); |
2148 | } | 2161 | } |
2149 | return 1; | 2162 | return 1; |
2150 | } | 2163 | } |
@@ -2155,7 +2168,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
2155 | 2168 | ||
2156 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | 2169 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) |
2157 | { | 2170 | { |
2158 | init_kmem_cache_node(&s->local_node); | 2171 | init_kmem_cache_node(&s->local_node, s); |
2159 | return 1; | 2172 | return 1; |
2160 | } | 2173 | } |
2161 | #endif | 2174 | #endif |
@@ -2286,7 +2299,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2286 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 2299 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
2287 | const char *name, size_t size, | 2300 | const char *name, size_t size, |
2288 | size_t align, unsigned long flags, | 2301 | size_t align, unsigned long flags, |
2289 | void (*ctor)(struct kmem_cache *, void *)) | 2302 | void (*ctor)(void *)) |
2290 | { | 2303 | { |
2291 | memset(s, 0, kmem_size); | 2304 | memset(s, 0, kmem_size); |
2292 | s->name = name; | 2305 | s->name = name; |
@@ -2300,7 +2313,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2300 | 2313 | ||
2301 | s->refcount = 1; | 2314 | s->refcount = 1; |
2302 | #ifdef CONFIG_NUMA | 2315 | #ifdef CONFIG_NUMA |
2303 | s->remote_node_defrag_ratio = 100; | 2316 | s->remote_node_defrag_ratio = 1000; |
2304 | #endif | 2317 | #endif |
2305 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2318 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) |
2306 | goto error; | 2319 | goto error; |
@@ -2715,7 +2728,6 @@ size_t ksize(const void *object) | |||
2715 | */ | 2728 | */ |
2716 | return s->size; | 2729 | return s->size; |
2717 | } | 2730 | } |
2718 | EXPORT_SYMBOL(ksize); | ||
2719 | 2731 | ||
2720 | void kfree(const void *x) | 2732 | void kfree(const void *x) |
2721 | { | 2733 | { |
@@ -2890,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
2890 | ret = -ENOMEM; | 2902 | ret = -ENOMEM; |
2891 | goto out; | 2903 | goto out; |
2892 | } | 2904 | } |
2893 | init_kmem_cache_node(n); | 2905 | init_kmem_cache_node(n, s); |
2894 | s->node[nid] = n; | 2906 | s->node[nid] = n; |
2895 | } | 2907 | } |
2896 | out: | 2908 | out: |
@@ -3042,7 +3054,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3042 | 3054 | ||
3043 | static struct kmem_cache *find_mergeable(size_t size, | 3055 | static struct kmem_cache *find_mergeable(size_t size, |
3044 | size_t align, unsigned long flags, const char *name, | 3056 | size_t align, unsigned long flags, const char *name, |
3045 | void (*ctor)(struct kmem_cache *, void *)) | 3057 | void (*ctor)(void *)) |
3046 | { | 3058 | { |
3047 | struct kmem_cache *s; | 3059 | struct kmem_cache *s; |
3048 | 3060 | ||
@@ -3082,8 +3094,7 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3082 | } | 3094 | } |
3083 | 3095 | ||
3084 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 3096 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
3085 | size_t align, unsigned long flags, | 3097 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3086 | void (*ctor)(struct kmem_cache *, void *)) | ||
3087 | { | 3098 | { |
3088 | struct kmem_cache *s; | 3099 | struct kmem_cache *s; |
3089 | 3100 | ||
@@ -4048,7 +4059,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, | |||
4048 | if (err) | 4059 | if (err) |
4049 | return err; | 4060 | return err; |
4050 | 4061 | ||
4051 | if (ratio < 100) | 4062 | if (ratio <= 100) |
4052 | s->remote_node_defrag_ratio = ratio * 10; | 4063 | s->remote_node_defrag_ratio = ratio * 10; |
4053 | 4064 | ||
4054 | return length; | 4065 | return length; |
diff --git a/mm/sparse.c b/mm/sparse.c index 8ffc08990008..39db301b920d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -12,7 +12,6 @@ | |||
12 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
13 | #include <asm/pgalloc.h> | 13 | #include <asm/pgalloc.h> |
14 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
15 | #include "internal.h" | ||
16 | 15 | ||
17 | /* | 16 | /* |
18 | * Permanent SPARSEMEM data: | 17 | * Permanent SPARSEMEM data: |
@@ -377,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
377 | } | 376 | } |
378 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 377 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
379 | 378 | ||
380 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 379 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
381 | { | 380 | { |
382 | struct page *map; | 381 | struct page *map; |
383 | struct mem_section *ms = __nr_to_section(pnum); | 382 | struct mem_section *ms = __nr_to_section(pnum); |
@@ -278,9 +278,10 @@ int lru_add_drain_all(void) | |||
278 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it | 278 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it |
279 | * for the remainder of the operation. | 279 | * for the remainder of the operation. |
280 | * | 280 | * |
281 | * The locking in this function is against shrink_cache(): we recheck the | 281 | * The locking in this function is against shrink_inactive_list(): we recheck |
282 | * page count inside the lock to see whether shrink_cache grabbed the page | 282 | * the page count inside the lock to see whether shrink_inactive_list() |
283 | * via the LRU. If it did, give up: shrink_cache will free it. | 283 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
284 | * will free it. | ||
284 | */ | 285 | */ |
285 | void release_pages(struct page **pages, int nr, int cold) | 286 | void release_pages(struct page **pages, int nr, int cold) |
286 | { | 287 | { |
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec) | |||
443 | for (i = 0; i < pagevec_count(pvec); i++) { | 444 | for (i = 0; i < pagevec_count(pvec); i++) { |
444 | struct page *page = pvec->pages[i]; | 445 | struct page *page = pvec->pages[i]; |
445 | 446 | ||
446 | if (PagePrivate(page) && !TestSetPageLocked(page)) { | 447 | if (PagePrivate(page) && trylock_page(page)) { |
447 | if (PagePrivate(page)) | 448 | if (PagePrivate(page)) |
448 | try_to_release_page(page, 0); | 449 | try_to_release_page(page, 0); |
449 | unlock_page(page); | 450 | unlock_page(page); |
diff --git a/mm/swap_state.c b/mm/swap_state.c index d8aadaf2a0ba..797c3831cbec 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { | |||
39 | 39 | ||
40 | struct address_space swapper_space = { | 40 | struct address_space swapper_space = { |
41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
42 | .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), | 42 | .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), |
43 | .a_ops = &swap_aops, | 43 | .a_ops = &swap_aops, |
44 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 44 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), |
45 | .backing_dev_info = &swap_backing_dev_info, | 45 | .backing_dev_info = &swap_backing_dev_info, |
@@ -56,15 +56,16 @@ static struct { | |||
56 | 56 | ||
57 | void show_swap_cache_info(void) | 57 | void show_swap_cache_info(void) |
58 | { | 58 | { |
59 | printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", | 59 | printk("%lu pages in swap cache\n", total_swapcache_pages); |
60 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | ||
60 | swap_cache_info.add_total, swap_cache_info.del_total, | 61 | swap_cache_info.add_total, swap_cache_info.del_total, |
61 | swap_cache_info.find_success, swap_cache_info.find_total); | 62 | swap_cache_info.find_success, swap_cache_info.find_total); |
62 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 63 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); |
63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 64 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
64 | } | 65 | } |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * add_to_swap_cache resembles add_to_page_cache on swapper_space, | 68 | * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
68 | * but sets SwapCache flag and private instead of mapping and index. | 69 | * but sets SwapCache flag and private instead of mapping and index. |
69 | */ | 70 | */ |
70 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | 71 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) |
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
76 | BUG_ON(PagePrivate(page)); | 77 | BUG_ON(PagePrivate(page)); |
77 | error = radix_tree_preload(gfp_mask); | 78 | error = radix_tree_preload(gfp_mask); |
78 | if (!error) { | 79 | if (!error) { |
79 | write_lock_irq(&swapper_space.tree_lock); | 80 | page_cache_get(page); |
81 | SetPageSwapCache(page); | ||
82 | set_page_private(page, entry.val); | ||
83 | |||
84 | spin_lock_irq(&swapper_space.tree_lock); | ||
80 | error = radix_tree_insert(&swapper_space.page_tree, | 85 | error = radix_tree_insert(&swapper_space.page_tree, |
81 | entry.val, page); | 86 | entry.val, page); |
82 | if (!error) { | 87 | if (likely(!error)) { |
83 | page_cache_get(page); | ||
84 | SetPageSwapCache(page); | ||
85 | set_page_private(page, entry.val); | ||
86 | total_swapcache_pages++; | 88 | total_swapcache_pages++; |
87 | __inc_zone_page_state(page, NR_FILE_PAGES); | 89 | __inc_zone_page_state(page, NR_FILE_PAGES); |
88 | INC_CACHE_INFO(add_total); | 90 | INC_CACHE_INFO(add_total); |
89 | } | 91 | } |
90 | write_unlock_irq(&swapper_space.tree_lock); | 92 | spin_unlock_irq(&swapper_space.tree_lock); |
91 | radix_tree_preload_end(); | 93 | radix_tree_preload_end(); |
94 | |||
95 | if (unlikely(error)) { | ||
96 | set_page_private(page, 0UL); | ||
97 | ClearPageSwapCache(page); | ||
98 | page_cache_release(page); | ||
99 | } | ||
92 | } | 100 | } |
93 | return error; | 101 | return error; |
94 | } | 102 | } |
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page) | |||
175 | 183 | ||
176 | entry.val = page_private(page); | 184 | entry.val = page_private(page); |
177 | 185 | ||
178 | write_lock_irq(&swapper_space.tree_lock); | 186 | spin_lock_irq(&swapper_space.tree_lock); |
179 | __delete_from_swap_cache(page); | 187 | __delete_from_swap_cache(page); |
180 | write_unlock_irq(&swapper_space.tree_lock); | 188 | spin_unlock_irq(&swapper_space.tree_lock); |
181 | 189 | ||
182 | swap_free(entry); | 190 | swap_free(entry); |
183 | page_cache_release(page); | 191 | page_cache_release(page); |
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page) | |||
193 | */ | 201 | */ |
194 | static inline void free_swap_cache(struct page *page) | 202 | static inline void free_swap_cache(struct page *page) |
195 | { | 203 | { |
196 | if (PageSwapCache(page) && !TestSetPageLocked(page)) { | 204 | if (PageSwapCache(page) && trylock_page(page)) { |
197 | remove_exclusive_swap_page(page); | 205 | remove_exclusive_swap_page(page); |
198 | unlock_page(page); | 206 | unlock_page(page); |
199 | } | 207 | } |
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
294 | * re-using the just freed swap entry for an existing page. | 302 | * re-using the just freed swap entry for an existing page. |
295 | * May fail (-ENOMEM) if radix-tree node allocation failed. | 303 | * May fail (-ENOMEM) if radix-tree node allocation failed. |
296 | */ | 304 | */ |
297 | SetPageLocked(new_page); | 305 | set_page_locked(new_page); |
298 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); | 306 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); |
299 | if (!err) { | 307 | if (likely(!err)) { |
300 | /* | 308 | /* |
301 | * Initiate read into locked page and return. | 309 | * Initiate read into locked page and return. |
302 | */ | 310 | */ |
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
304 | swap_readpage(NULL, new_page); | 312 | swap_readpage(NULL, new_page); |
305 | return new_page; | 313 | return new_page; |
306 | } | 314 | } |
307 | ClearPageLocked(new_page); | 315 | clear_page_locked(new_page); |
308 | swap_free(entry); | 316 | swap_free(entry); |
309 | } while (err != -ENOMEM); | 317 | } while (err != -ENOMEM); |
310 | 318 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 2f33edb8bee9..1e330f2998fa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -33,8 +33,8 @@ | |||
33 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
34 | #include <linux/swapops.h> | 34 | #include <linux/swapops.h> |
35 | 35 | ||
36 | DEFINE_SPINLOCK(swap_lock); | 36 | static DEFINE_SPINLOCK(swap_lock); |
37 | unsigned int nr_swapfiles; | 37 | static unsigned int nr_swapfiles; |
38 | long total_swap_pages; | 38 | long total_swap_pages; |
39 | static int swap_overflow; | 39 | static int swap_overflow; |
40 | static int least_priority; | 40 | static int least_priority; |
@@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
44 | static const char Bad_offset[] = "Bad swap offset entry "; | 44 | static const char Bad_offset[] = "Bad swap offset entry "; |
45 | static const char Unused_offset[] = "Unused swap offset entry "; | 45 | static const char Unused_offset[] = "Unused swap offset entry "; |
46 | 46 | ||
47 | struct swap_list_t swap_list = {-1, -1}; | 47 | static struct swap_list_t swap_list = {-1, -1}; |
48 | 48 | ||
49 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 49 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
50 | 50 | ||
@@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) | |||
369 | retval = 0; | 369 | retval = 0; |
370 | if (p->swap_map[swp_offset(entry)] == 1) { | 370 | if (p->swap_map[swp_offset(entry)] == 1) { |
371 | /* Recheck the page count with the swapcache lock held.. */ | 371 | /* Recheck the page count with the swapcache lock held.. */ |
372 | write_lock_irq(&swapper_space.tree_lock); | 372 | spin_lock_irq(&swapper_space.tree_lock); |
373 | if ((page_count(page) == 2) && !PageWriteback(page)) { | 373 | if ((page_count(page) == 2) && !PageWriteback(page)) { |
374 | __delete_from_swap_cache(page); | 374 | __delete_from_swap_cache(page); |
375 | SetPageDirty(page); | 375 | SetPageDirty(page); |
376 | retval = 1; | 376 | retval = 1; |
377 | } | 377 | } |
378 | write_unlock_irq(&swapper_space.tree_lock); | 378 | spin_unlock_irq(&swapper_space.tree_lock); |
379 | } | 379 | } |
380 | spin_unlock(&swap_lock); | 380 | spin_unlock(&swap_lock); |
381 | 381 | ||
@@ -403,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry) | |||
403 | if (p) { | 403 | if (p) { |
404 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 404 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
405 | page = find_get_page(&swapper_space, entry.val); | 405 | page = find_get_page(&swapper_space, entry.val); |
406 | if (page && unlikely(TestSetPageLocked(page))) { | 406 | if (page && unlikely(!trylock_page(page))) { |
407 | page_cache_release(page); | 407 | page_cache_release(page); |
408 | page = NULL; | 408 | page = NULL; |
409 | } | 409 | } |
@@ -656,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm, | |||
656 | 656 | ||
657 | if (!down_read_trylock(&mm->mmap_sem)) { | 657 | if (!down_read_trylock(&mm->mmap_sem)) { |
658 | /* | 658 | /* |
659 | * Activate page so shrink_cache is unlikely to unmap its | 659 | * Activate page so shrink_inactive_list is unlikely to unmap |
660 | * ptes while lock is dropped, so swapoff can make progress. | 660 | * its ptes while lock is dropped, so swapoff can make progress. |
661 | */ | 661 | */ |
662 | activate_page(page); | 662 | activate_page(page); |
663 | unlock_page(page); | 663 | unlock_page(page); |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index ae532f501943..8d7a27a6335c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -65,31 +65,31 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
65 | if (!dentry) | 65 | if (!dentry) |
66 | goto put_memory; | 66 | goto put_memory; |
67 | 67 | ||
68 | error = -ENFILE; | ||
69 | file = get_empty_filp(); | ||
70 | if (!file) | ||
71 | goto put_dentry; | ||
72 | |||
68 | error = -ENOSPC; | 73 | error = -ENOSPC; |
69 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | 74 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); |
70 | if (!inode) | 75 | if (!inode) |
71 | goto put_dentry; | 76 | goto close_file; |
72 | 77 | ||
73 | d_instantiate(dentry, inode); | 78 | d_instantiate(dentry, inode); |
74 | error = -ENFILE; | 79 | inode->i_size = size; |
75 | file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
76 | &ramfs_file_operations); | ||
77 | if (!file) | ||
78 | goto put_dentry; | ||
79 | |||
80 | inode->i_nlink = 0; /* It is unlinked */ | 80 | inode->i_nlink = 0; /* It is unlinked */ |
81 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
82 | &ramfs_file_operations); | ||
81 | 83 | ||
82 | /* notify everyone as to the change of file size */ | 84 | #ifndef CONFIG_MMU |
83 | error = do_truncate(dentry, size, 0, file); | 85 | error = ramfs_nommu_expand_for_mapping(inode, size); |
84 | if (error < 0) | 86 | if (error) |
85 | goto close_file; | 87 | goto close_file; |
86 | 88 | #endif | |
87 | return file; | 89 | return file; |
88 | 90 | ||
89 | close_file: | 91 | close_file: |
90 | put_filp(file); | 92 | put_filp(file); |
91 | return ERR_PTR(error); | ||
92 | |||
93 | put_dentry: | 93 | put_dentry: |
94 | dput(dentry); | 94 | dput(dentry); |
95 | put_memory: | 95 | put_memory: |
diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb63414..6650c1d878b4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
105 | 105 | ||
106 | remove_from_page_cache(page); | 106 | remove_from_page_cache(page); |
107 | ClearPageUptodate(page); | ||
108 | ClearPageMappedToDisk(page); | 107 | ClearPageMappedToDisk(page); |
109 | page_cache_release(page); /* pagecache ref */ | 108 | page_cache_release(page); /* pagecache ref */ |
110 | } | 109 | } |
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
188 | if (page_index > next) | 187 | if (page_index > next) |
189 | next = page_index; | 188 | next = page_index; |
190 | next++; | 189 | next++; |
191 | if (TestSetPageLocked(page)) | 190 | if (!trylock_page(page)) |
192 | continue; | 191 | continue; |
193 | if (PageWriteback(page)) { | 192 | if (PageWriteback(page)) { |
194 | unlock_page(page); | 193 | unlock_page(page); |
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping, | |||
281 | pgoff_t index; | 280 | pgoff_t index; |
282 | int lock_failed; | 281 | int lock_failed; |
283 | 282 | ||
284 | lock_failed = TestSetPageLocked(page); | 283 | lock_failed = !trylock_page(page); |
285 | 284 | ||
286 | /* | 285 | /* |
287 | * We really shouldn't be looking at the ->index of an | 286 | * We really shouldn't be looking at the ->index of an |
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
349 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) | 348 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) |
350 | return 0; | 349 | return 0; |
351 | 350 | ||
352 | write_lock_irq(&mapping->tree_lock); | 351 | spin_lock_irq(&mapping->tree_lock); |
353 | if (PageDirty(page)) | 352 | if (PageDirty(page)) |
354 | goto failed; | 353 | goto failed; |
355 | 354 | ||
356 | BUG_ON(PagePrivate(page)); | 355 | BUG_ON(PagePrivate(page)); |
357 | __remove_from_page_cache(page); | 356 | __remove_from_page_cache(page); |
358 | write_unlock_irq(&mapping->tree_lock); | 357 | spin_unlock_irq(&mapping->tree_lock); |
359 | ClearPageUptodate(page); | ||
360 | page_cache_release(page); /* pagecache ref */ | 358 | page_cache_release(page); /* pagecache ref */ |
361 | return 1; | 359 | return 1; |
362 | failed: | 360 | failed: |
363 | write_unlock_irq(&mapping->tree_lock); | 361 | spin_unlock_irq(&mapping->tree_lock); |
364 | return 0; | 362 | return 0; |
365 | } | 363 | } |
366 | 364 | ||
@@ -382,7 +380,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) | |||
382 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 380 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
383 | * invalidation. | 381 | * invalidation. |
384 | * | 382 | * |
385 | * Returns -EIO if any pages could not be invalidated. | 383 | * Returns -EBUSY if any pages could not be invalidated. |
386 | */ | 384 | */ |
387 | int invalidate_inode_pages2_range(struct address_space *mapping, | 385 | int invalidate_inode_pages2_range(struct address_space *mapping, |
388 | pgoff_t start, pgoff_t end) | 386 | pgoff_t start, pgoff_t end) |
@@ -442,7 +440,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
442 | ret2 = do_launder_page(mapping, page); | 440 | ret2 = do_launder_page(mapping, page); |
443 | if (ret2 == 0) { | 441 | if (ret2 == 0) { |
444 | if (!invalidate_complete_page2(mapping, page)) | 442 | if (!invalidate_complete_page2(mapping, page)) |
445 | ret2 = -EIO; | 443 | ret2 = -EBUSY; |
446 | } | 444 | } |
447 | if (ret2 < 0) | 445 | if (ret2 < 0) |
448 | ret = ret2; | 446 | ret = ret2; |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -1,7 +1,9 @@ | |||
1 | #include <linux/mm.h> | ||
1 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
2 | #include <linux/string.h> | 3 | #include <linux/string.h> |
3 | #include <linux/module.h> | 4 | #include <linux/module.h> |
4 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | ||
5 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
6 | 8 | ||
7 | /** | 9 | /** |
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) | |||
68 | EXPORT_SYMBOL(kmemdup); | 70 | EXPORT_SYMBOL(kmemdup); |
69 | 71 | ||
70 | /** | 72 | /** |
71 | * krealloc - reallocate memory. The contents will remain unchanged. | 73 | * __krealloc - like krealloc() but don't free @p. |
72 | * @p: object to reallocate memory for. | 74 | * @p: object to reallocate memory for. |
73 | * @new_size: how many bytes of memory are required. | 75 | * @new_size: how many bytes of memory are required. |
74 | * @flags: the type of memory to allocate. | 76 | * @flags: the type of memory to allocate. |
75 | * | 77 | * |
76 | * The contents of the object pointed to are preserved up to the | 78 | * This function is like krealloc() except it never frees the originally |
77 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | 79 | * allocated buffer. Use this if you don't want to free the buffer immediately |
78 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | 80 | * like, for example, with RCU. |
79 | * %NULL pointer, the object pointed to is freed. | ||
80 | */ | 81 | */ |
81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 82 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) |
82 | { | 83 | { |
83 | void *ret; | 84 | void *ret; |
84 | size_t ks = 0; | 85 | size_t ks = 0; |
85 | 86 | ||
86 | if (unlikely(!new_size)) { | 87 | if (unlikely(!new_size)) |
87 | kfree(p); | ||
88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
89 | } | ||
90 | 89 | ||
91 | if (p) | 90 | if (p) |
92 | ks = ksize(p); | 91 | ks = ksize(p); |
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
95 | return (void *)p; | 94 | return (void *)p; |
96 | 95 | ||
97 | ret = kmalloc_track_caller(new_size, flags); | 96 | ret = kmalloc_track_caller(new_size, flags); |
98 | if (ret && p) { | 97 | if (ret && p) |
99 | memcpy(ret, p, ks); | 98 | memcpy(ret, p, ks); |
99 | |||
100 | return ret; | ||
101 | } | ||
102 | EXPORT_SYMBOL(__krealloc); | ||
103 | |||
104 | /** | ||
105 | * krealloc - reallocate memory. The contents will remain unchanged. | ||
106 | * @p: object to reallocate memory for. | ||
107 | * @new_size: how many bytes of memory are required. | ||
108 | * @flags: the type of memory to allocate. | ||
109 | * | ||
110 | * The contents of the object pointed to are preserved up to the | ||
111 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | ||
112 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a | ||
113 | * %NULL pointer, the object pointed to is freed. | ||
114 | */ | ||
115 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | ||
116 | { | ||
117 | void *ret; | ||
118 | |||
119 | if (unlikely(!new_size)) { | ||
100 | kfree(p); | 120 | kfree(p); |
121 | return ZERO_SIZE_PTR; | ||
101 | } | 122 | } |
123 | |||
124 | ret = __krealloc(p, new_size, flags); | ||
125 | if (ret && p != ret) | ||
126 | kfree(p); | ||
127 | |||
102 | return ret; | 128 | return ret; |
103 | } | 129 | } |
104 | EXPORT_SYMBOL(krealloc); | 130 | EXPORT_SYMBOL(krealloc); |
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n) | |||
136 | return p; | 162 | return p; |
137 | } | 163 | } |
138 | EXPORT_SYMBOL(strndup_user); | 164 | EXPORT_SYMBOL(strndup_user); |
165 | |||
166 | #ifndef HAVE_ARCH_PICK_MMAP_LAYOUT | ||
167 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
168 | { | ||
169 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
170 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
171 | mm->unmap_area = arch_unmap_area; | ||
172 | } | ||
173 | #endif | ||
174 | |||
175 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | ||
176 | int nr_pages, int write, struct page **pages) | ||
177 | { | ||
178 | struct mm_struct *mm = current->mm; | ||
179 | int ret; | ||
180 | |||
181 | down_read(&mm->mmap_sem); | ||
182 | ret = get_user_pages(current, mm, start, nr_pages, | ||
183 | write, 0, pages, NULL); | ||
184 | up_read(&mm->mmap_sem); | ||
185 | |||
186 | return ret; | ||
187 | } | ||
188 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35f293816294..bba06c41fc59 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -180,6 +180,13 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) | |||
180 | pmd_t *pmd; | 180 | pmd_t *pmd; |
181 | pte_t *ptep, pte; | 181 | pte_t *ptep, pte; |
182 | 182 | ||
183 | /* | ||
184 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for | ||
185 | * architectures that do not vmalloc module space | ||
186 | */ | ||
187 | VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && | ||
188 | !is_module_address(addr)); | ||
189 | |||
183 | if (!pgd_none(*pgd)) { | 190 | if (!pgd_none(*pgd)) { |
184 | pud = pud_offset(pgd, addr); | 191 | pud = pud_offset(pgd, addr); |
185 | if (!pud_none(*pud)) { | 192 | if (!pud_none(*pud)) { |
@@ -381,16 +388,14 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
381 | return; | 388 | return; |
382 | 389 | ||
383 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | 390 | if ((PAGE_SIZE-1) & (unsigned long)addr) { |
384 | printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | 391 | WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); |
385 | WARN_ON(1); | ||
386 | return; | 392 | return; |
387 | } | 393 | } |
388 | 394 | ||
389 | area = remove_vm_area(addr); | 395 | area = remove_vm_area(addr); |
390 | if (unlikely(!area)) { | 396 | if (unlikely(!area)) { |
391 | printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", | 397 | WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", |
392 | addr); | 398 | addr); |
393 | WARN_ON(1); | ||
394 | return; | 399 | return; |
395 | } | 400 | } |
396 | 401 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 26672c6cd3ce..1ff1a58e7c10 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -391,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
391 | } | 391 | } |
392 | 392 | ||
393 | /* | 393 | /* |
394 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | 394 | * Same as remove_mapping, but if the page is removed from the mapping, it |
395 | * someone else has a ref on the page, abort and return 0. If it was | 395 | * gets returned with a refcount of 0. |
396 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
397 | * this page. | ||
398 | */ | 396 | */ |
399 | int remove_mapping(struct address_space *mapping, struct page *page) | 397 | static int __remove_mapping(struct address_space *mapping, struct page *page) |
400 | { | 398 | { |
401 | BUG_ON(!PageLocked(page)); | 399 | BUG_ON(!PageLocked(page)); |
402 | BUG_ON(mapping != page_mapping(page)); | 400 | BUG_ON(mapping != page_mapping(page)); |
403 | 401 | ||
404 | write_lock_irq(&mapping->tree_lock); | 402 | spin_lock_irq(&mapping->tree_lock); |
405 | /* | 403 | /* |
406 | * The non racy check for a busy page. | 404 | * The non racy check for a busy page. |
407 | * | 405 | * |
@@ -427,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
427 | * Note that if SetPageDirty is always performed via set_page_dirty, | 425 | * Note that if SetPageDirty is always performed via set_page_dirty, |
428 | * and thus under tree_lock, then this ordering is not required. | 426 | * and thus under tree_lock, then this ordering is not required. |
429 | */ | 427 | */ |
430 | if (unlikely(page_count(page) != 2)) | 428 | if (!page_freeze_refs(page, 2)) |
431 | goto cannot_free; | 429 | goto cannot_free; |
432 | smp_rmb(); | 430 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ |
433 | if (unlikely(PageDirty(page))) | 431 | if (unlikely(PageDirty(page))) { |
432 | page_unfreeze_refs(page, 2); | ||
434 | goto cannot_free; | 433 | goto cannot_free; |
434 | } | ||
435 | 435 | ||
436 | if (PageSwapCache(page)) { | 436 | if (PageSwapCache(page)) { |
437 | swp_entry_t swap = { .val = page_private(page) }; | 437 | swp_entry_t swap = { .val = page_private(page) }; |
438 | __delete_from_swap_cache(page); | 438 | __delete_from_swap_cache(page); |
439 | write_unlock_irq(&mapping->tree_lock); | 439 | spin_unlock_irq(&mapping->tree_lock); |
440 | swap_free(swap); | 440 | swap_free(swap); |
441 | __put_page(page); /* The pagecache ref */ | 441 | } else { |
442 | return 1; | 442 | __remove_from_page_cache(page); |
443 | spin_unlock_irq(&mapping->tree_lock); | ||
443 | } | 444 | } |
444 | 445 | ||
445 | __remove_from_page_cache(page); | ||
446 | write_unlock_irq(&mapping->tree_lock); | ||
447 | __put_page(page); | ||
448 | return 1; | 446 | return 1; |
449 | 447 | ||
450 | cannot_free: | 448 | cannot_free: |
451 | write_unlock_irq(&mapping->tree_lock); | 449 | spin_unlock_irq(&mapping->tree_lock); |
450 | return 0; | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | ||
455 | * someone else has a ref on the page, abort and return 0. If it was | ||
456 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
457 | * this page. | ||
458 | */ | ||
459 | int remove_mapping(struct address_space *mapping, struct page *page) | ||
460 | { | ||
461 | if (__remove_mapping(mapping, page)) { | ||
462 | /* | ||
463 | * Unfreezing the refcount with 1 rather than 2 effectively | ||
464 | * drops the pagecache ref for us without requiring another | ||
465 | * atomic operation. | ||
466 | */ | ||
467 | page_unfreeze_refs(page, 1); | ||
468 | return 1; | ||
469 | } | ||
452 | return 0; | 470 | return 0; |
453 | } | 471 | } |
454 | 472 | ||
@@ -478,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
478 | page = lru_to_page(page_list); | 496 | page = lru_to_page(page_list); |
479 | list_del(&page->lru); | 497 | list_del(&page->lru); |
480 | 498 | ||
481 | if (TestSetPageLocked(page)) | 499 | if (!trylock_page(page)) |
482 | goto keep; | 500 | goto keep; |
483 | 501 | ||
484 | VM_BUG_ON(PageActive(page)); | 502 | VM_BUG_ON(PageActive(page)); |
@@ -564,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
564 | * A synchronous write - probably a ramdisk. Go | 582 | * A synchronous write - probably a ramdisk. Go |
565 | * ahead and try to reclaim the page. | 583 | * ahead and try to reclaim the page. |
566 | */ | 584 | */ |
567 | if (TestSetPageLocked(page)) | 585 | if (!trylock_page(page)) |
568 | goto keep; | 586 | goto keep; |
569 | if (PageDirty(page) || PageWriteback(page)) | 587 | if (PageDirty(page) || PageWriteback(page)) |
570 | goto keep_locked; | 588 | goto keep_locked; |
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
598 | if (PagePrivate(page)) { | 616 | if (PagePrivate(page)) { |
599 | if (!try_to_release_page(page, sc->gfp_mask)) | 617 | if (!try_to_release_page(page, sc->gfp_mask)) |
600 | goto activate_locked; | 618 | goto activate_locked; |
601 | if (!mapping && page_count(page) == 1) | 619 | if (!mapping && page_count(page) == 1) { |
602 | goto free_it; | 620 | unlock_page(page); |
621 | if (put_page_testzero(page)) | ||
622 | goto free_it; | ||
623 | else { | ||
624 | /* | ||
625 | * rare race with speculative reference. | ||
626 | * the speculative reference will free | ||
627 | * this page shortly, so we may | ||
628 | * increment nr_reclaimed here (and | ||
629 | * leave it off the LRU). | ||
630 | */ | ||
631 | nr_reclaimed++; | ||
632 | continue; | ||
633 | } | ||
634 | } | ||
603 | } | 635 | } |
604 | 636 | ||
605 | if (!mapping || !remove_mapping(mapping, page)) | 637 | if (!mapping || !__remove_mapping(mapping, page)) |
606 | goto keep_locked; | 638 | goto keep_locked; |
607 | 639 | ||
608 | free_it: | ||
609 | unlock_page(page); | 640 | unlock_page(page); |
641 | free_it: | ||
610 | nr_reclaimed++; | 642 | nr_reclaimed++; |
611 | if (!pagevec_add(&freed_pvec, page)) | 643 | if (!pagevec_add(&freed_pvec, page)) { |
612 | __pagevec_release_nonlru(&freed_pvec); | 644 | __pagevec_free(&freed_pvec); |
645 | pagevec_reinit(&freed_pvec); | ||
646 | } | ||
613 | continue; | 647 | continue; |
614 | 648 | ||
615 | activate_locked: | 649 | activate_locked: |
@@ -623,7 +657,7 @@ keep: | |||
623 | } | 657 | } |
624 | list_splice(&ret_pages, page_list); | 658 | list_splice(&ret_pages, page_list); |
625 | if (pagevec_count(&freed_pvec)) | 659 | if (pagevec_count(&freed_pvec)) |
626 | __pagevec_release_nonlru(&freed_pvec); | 660 | __pagevec_free(&freed_pvec); |
627 | count_vm_events(PGACTIVATE, pgactivate); | 661 | count_vm_events(PGACTIVATE, pgactivate); |
628 | return nr_reclaimed; | 662 | return nr_reclaimed; |
629 | } | 663 | } |
@@ -1374,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1374 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1408 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) |
1375 | congestion_wait(WRITE, HZ/10); | 1409 | congestion_wait(WRITE, HZ/10); |
1376 | } | 1410 | } |
1377 | /* top priority shrink_caches still had more to do? don't OOM, then */ | 1411 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1378 | if (!sc->all_unreclaimable && scan_global_lru(sc)) | 1412 | if (!sc->all_unreclaimable && scan_global_lru(sc)) |
1379 | ret = nr_reclaimed; | 1413 | ret = nr_reclaimed; |
1380 | out: | 1414 | out: |
@@ -1945,7 +1979,7 @@ module_init(kswapd_init) | |||
1945 | int zone_reclaim_mode __read_mostly; | 1979 | int zone_reclaim_mode __read_mostly; |
1946 | 1980 | ||
1947 | #define RECLAIM_OFF 0 | 1981 | #define RECLAIM_OFF 0 |
1948 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 1982 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ |
1949 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 1983 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
1950 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 1984 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ |
1951 | 1985 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index b0d08e667ece..d7826af2fb07 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -516,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
516 | continue; | 516 | continue; |
517 | 517 | ||
518 | page = pfn_to_page(pfn); | 518 | page = pfn_to_page(pfn); |
519 | #ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES | ||
520 | /* | ||
521 | * Ordinarily, memory holes in flatmem still have a valid | ||
522 | * memmap for the PFN range. However, an architecture for | ||
523 | * embedded systems (e.g. ARM) can free up the memmap backing | ||
524 | * holes to save memory on the assumption the memmap is | ||
525 | * never used. The page_zone linkages are then broken even | ||
526 | * though pfn_valid() returns true. Skip the page if the | ||
527 | * linkages are broken. Even if this test passed, the impact | ||
528 | * is that the counters for the movable type are off but | ||
529 | * fragmentation monitoring is likely meaningless on small | ||
530 | * systems. | ||
531 | */ | ||
532 | if (page_zone(page) != zone) | ||
533 | continue; | ||
534 | #endif | ||
519 | mtype = get_pageblock_migratetype(page); | 535 | mtype = get_pageblock_migratetype(page); |
520 | 536 | ||
521 | count[mtype]++; | 537 | if (mtype < MIGRATE_TYPES) |
538 | count[mtype]++; | ||
522 | } | 539 | } |
523 | 540 | ||
524 | /* Print counts */ | 541 | /* Print counts */ |