| author | Jeff Garzik <jgarzik@pobox.com> | 2005-10-30 20:37:44 -0500 |
|---|---|---|
| committer | Jeff Garzik <jgarzik@pobox.com> | 2005-10-30 20:37:44 -0500 |
| commit | 9e0cb06b17be7e562cbdaba2768649f025826dc6 | |
| tree | aaf5ef8c6cd11764d222df9c446ad9af17e0020e | /mm |
| parent | 23da0c20ef1c1f0432f373e0e2233a6b6ab2678f | |
| parent | 6e9d6b8ee4e0c37d3952256e6472c57490d6780d | |
Merge branch 'master'
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 21 |
| -rw-r--r-- | mm/Makefile | 2 |
| -rw-r--r-- | mm/bootmem.c | 1 |
| -rw-r--r-- | mm/filemap.c | 12 |
| -rw-r--r-- | mm/filemap_xip.c | 22 |
| -rw-r--r-- | mm/fremap.c | 86 |
| -rw-r--r-- | mm/hugetlb.c | 207 |
| -rw-r--r-- | mm/madvise.c | 2 |
| -rw-r--r-- | mm/memory.c | 993 |
| -rw-r--r-- | mm/memory_hotplug.c | 138 |
| -rw-r--r-- | mm/mempolicy.c | 393 |
| -rw-r--r-- | mm/mmap.c | 126 |
| -rw-r--r-- | mm/mprotect.c | 19 |
| -rw-r--r-- | mm/mremap.c | 193 |
| -rw-r--r-- | mm/msync.c | 78 |
| -rw-r--r-- | mm/nommu.c | 18 |
| -rw-r--r-- | mm/page_alloc.c | 207 |
| -rw-r--r-- | mm/page_io.c | 6 |
| -rw-r--r-- | mm/rmap.c | 146 |
| -rw-r--r-- | mm/shmem.c | 28 |
| -rw-r--r-- | mm/slab.c | 5 |
| -rw-r--r-- | mm/sparse.c | 99 |
| -rw-r--r-- | mm/swap.c | 6 |
| -rw-r--r-- | mm/swap_state.c | 11 |
| -rw-r--r-- | mm/swapfile.c | 41 |
| -rw-r--r-- | mm/thrash.c | 2 |
| -rw-r--r-- | mm/vmalloc.c | 77 |
| -rw-r--r-- | mm/vmscan.c | 6 |
28 files changed, 1617 insertions, 1328 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d136..1a4473fcb2ca 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -111,3 +111,24 @@ config SPARSEMEM_STATIC | |||
| 111 | config SPARSEMEM_EXTREME | 111 | config SPARSEMEM_EXTREME |
| 112 | def_bool y | 112 | def_bool y |
| 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
| 114 | |||
| 115 | # eventually, we can have this option just 'select SPARSEMEM' | ||
| 116 | config MEMORY_HOTPLUG | ||
| 117 | bool "Allow for memory hot-add" | ||
| 118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND | ||
| 119 | |||
| 120 | comment "Memory hotplug is currently incompatible with Software Suspend" | ||
| 121 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | ||
| 122 | |||
| 123 | # Heavily threaded applications may benefit from splitting the mm-wide | ||
| 124 | # page_table_lock, so that faults on different parts of the user address | ||
| 125 | # space can be handled with less contention: split it at this NR_CPUS. | ||
| 126 | # Default to 4 for wider testing, though 8 might be more appropriate. | ||
| 127 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. | ||
| 128 | # PA-RISC's debug spinlock_t is too large for the 32-bit struct page. | ||
| 129 | # | ||
| 130 | config SPLIT_PTLOCK_CPUS | ||
| 131 | int | ||
| 132 | default "4096" if ARM && !CPU_CACHE_VIPT | ||
| 133 | default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT | ||
| 134 | default "4" | ||
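Note: the help text above motivates splitting the single mm-wide page_table_lock into finer-grained per-page-table locks once NR_CPUS reaches SPLIT_PTLOCK_CPUS, so that faults touching different parts of one address space stop serializing on a single lock. As a rough illustration only (a userspace analogy, not kernel code; every name below is invented), striping one lock across independent buckets lets threads working on different regions proceed without contending:

```c
/* Illustrative userspace analogy only: one lock per "bucket" of a shared
 * table instead of a single table-wide lock, so threads working on
 * different buckets do not contend.  All names here are invented. */
#include <pthread.h>
#include <stdio.h>

#define NBUCKETS		4
#define ENTRIES_PER_BUCKET	1024

static long table[NBUCKETS][ENTRIES_PER_BUCKET];
static pthread_mutex_t bucket_lock[NBUCKETS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void *worker(void *arg)
{
	int bucket = (int)(long)arg;
	int i;

	/* Each worker takes only the lock covering its own bucket,
	 * analogous to a fault taking only one page table's lock. */
	pthread_mutex_lock(&bucket_lock[bucket]);
	for (i = 0; i < ENTRIES_PER_BUCKET; i++)
		table[bucket][i]++;
	pthread_mutex_unlock(&bucket_lock[bucket]);
	return NULL;
}

int main(void)
{
	pthread_t tid[NBUCKETS];
	int i;

	for (i = 0; i < NBUCKETS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)(long)i);
	for (i = 0; i < NBUCKETS; i++)
		pthread_join(tid[i], NULL);
	printf("table[0][0] = %ld\n", table[0][0]);
	return 0;
}
```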
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce421..2fa6d2ca9f28 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o | |||
| 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
| 19 | obj-$(CONFIG_SHMEM) += shmem.o | 19 | obj-$(CONFIG_SHMEM) += shmem.o |
| 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
| 21 | 21 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | |
| 22 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 22 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 305 | if (j + 16 < BITS_PER_LONG) | 305 | if (j + 16 < BITS_PER_LONG) |
| 306 | prefetchw(page + j + 16); | 306 | prefetchw(page + j + 16); |
| 307 | __ClearPageReserved(page + j); | 307 | __ClearPageReserved(page + j); |
| 308 | set_page_count(page + j, 0); | ||
| 308 | } | 309 | } |
| 309 | __free_pages(page, order); | 310 | __free_pages(page, order); |
| 310 | i += BITS_PER_LONG; | 311 | i += BITS_PER_LONG; |
diff --git a/mm/filemap.c b/mm/filemap.c
index 1c31b2fd2ca5..768687f1d46b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
| 66 | * | 66 | * |
| 67 | * ->mmap_sem | 67 | * ->mmap_sem |
| 68 | * ->i_mmap_lock | 68 | * ->i_mmap_lock |
| 69 | * ->page_table_lock (various places, mainly in mmap.c) | 69 | * ->page_table_lock or pte_lock (various, mainly in memory.c) |
| 70 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) | 70 | * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) |
| 71 | * | 71 | * |
| 72 | * ->mmap_sem | 72 | * ->mmap_sem |
| @@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
| 86 | * ->anon_vma.lock (vma_adjust) | 86 | * ->anon_vma.lock (vma_adjust) |
| 87 | * | 87 | * |
| 88 | * ->anon_vma.lock | 88 | * ->anon_vma.lock |
| 89 | * ->page_table_lock (anon_vma_prepare and various) | 89 | * ->page_table_lock or pte_lock (anon_vma_prepare and various) |
| 90 | * | 90 | * |
| 91 | * ->page_table_lock | 91 | * ->page_table_lock or pte_lock |
| 92 | * ->swap_lock (try_to_unmap_one) | 92 | * ->swap_lock (try_to_unmap_one) |
| 93 | * ->private_lock (try_to_unmap_one) | 93 | * ->private_lock (try_to_unmap_one) |
| 94 | * ->tree_lock (try_to_unmap_one) | 94 | * ->tree_lock (try_to_unmap_one) |
| @@ -152,7 +152,7 @@ static int sync_page(void *word) | |||
| 152 | * in the ->sync_page() methods make essential use of the | 152 | * in the ->sync_page() methods make essential use of the |
| 153 | * page_mapping(), merely passing the page down to the backing | 153 | * page_mapping(), merely passing the page down to the backing |
| 154 | * device's unplug functions when it's non-NULL, which in turn | 154 | * device's unplug functions when it's non-NULL, which in turn |
| 155 | * ignore it for all cases but swap, where only page->private is | 155 | * ignore it for all cases but swap, where only page_private(page) is |
| 156 | * of interest. When page_mapping() does go NULL, the entire | 156 | * of interest. When page_mapping() does go NULL, the entire |
| 157 | * call stack gracefully ignores the page and returns. | 157 | * call stack gracefully ignores the page and returns. |
| 158 | * -- wli | 158 | * -- wli |
| @@ -1520,7 +1520,7 @@ repeat: | |||
| 1520 | page_cache_release(page); | 1520 | page_cache_release(page); |
| 1521 | return err; | 1521 | return err; |
| 1522 | } | 1522 | } |
| 1523 | } else { | 1523 | } else if (vma->vm_flags & VM_NONLINEAR) { |
| 1524 | /* No page was found just because we can't read it in now (being | 1524 | /* No page was found just because we can't read it in now (being |
| 1525 | * here implies nonblock != 0), but the page may exist, so set | 1525 | * here implies nonblock != 0), but the page may exist, so set |
| 1526 | * the PTE to fault it in later. */ | 1526 | * the PTE to fault it in later. */ |
| @@ -1537,6 +1537,7 @@ repeat: | |||
| 1537 | 1537 | ||
| 1538 | return 0; | 1538 | return 0; |
| 1539 | } | 1539 | } |
| 1540 | EXPORT_SYMBOL(filemap_populate); | ||
| 1540 | 1541 | ||
| 1541 | struct vm_operations_struct generic_file_vm_ops = { | 1542 | struct vm_operations_struct generic_file_vm_ops = { |
| 1542 | .nopage = filemap_nopage, | 1543 | .nopage = filemap_nopage, |
| @@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
| 1555 | vma->vm_ops = &generic_file_vm_ops; | 1556 | vma->vm_ops = &generic_file_vm_ops; |
| 1556 | return 0; | 1557 | return 0; |
| 1557 | } | 1558 | } |
| 1558 | EXPORT_SYMBOL(filemap_populate); | ||
| 1559 | 1559 | ||
| 1560 | /* | 1560 | /* |
| 1561 | * This is for filesystems which do not implement ->writepage. | 1561 | * This is for filesystems which do not implement ->writepage. |
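Note: the comment block updated above documents the lock ordering (mmap_sem, then i_mmap_lock, then page_table_lock or pte lock, and so on); the "or pte_lock" wording reflects that the per-page-table lock now sits at the same level of that hierarchy. The rule such a comment encodes is simply that nested locks must always be taken in the documented order, never reversed. A minimal userspace sketch of that discipline (invented names, not kernel code):

```c
/* Illustrative userspace sketch only: nested locks are always taken in one
 * fixed, documented order; a path that reversed the order could deadlock
 * against this one.  All names below are invented. */
#include <pthread.h>

static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER; /* taken first  */
static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER; /* taken second */

static void update_both(int *a, int *b)
{
	/* Correct: outer before inner, matching the documented order. */
	pthread_mutex_lock(&outer_lock);
	pthread_mutex_lock(&inner_lock);
	(*a)++;
	(*b)++;
	pthread_mutex_unlock(&inner_lock);
	pthread_mutex_unlock(&outer_lock);
}

int main(void)
{
	int a = 0, b = 0;

	update_both(&a, &b);
	return 0;
}
```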
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9cf687e4a29a 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
| @@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping, | |||
| 174 | unsigned long address; | 174 | unsigned long address; |
| 175 | pte_t *pte; | 175 | pte_t *pte; |
| 176 | pte_t pteval; | 176 | pte_t pteval; |
| 177 | spinlock_t *ptl; | ||
| 178 | struct page *page; | ||
| 177 | 179 | ||
| 178 | spin_lock(&mapping->i_mmap_lock); | 180 | spin_lock(&mapping->i_mmap_lock); |
| 179 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 181 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| @@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping, | |||
| 181 | address = vma->vm_start + | 183 | address = vma->vm_start + |
| 182 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 184 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
| 183 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 184 | /* | 186 | page = ZERO_PAGE(address); |
| 185 | * We need the page_table_lock to protect us from page faults, | 187 | pte = page_check_address(page, mm, address, &ptl); |
| 186 | * munmap, fork, etc... | 188 | if (pte) { |
| 187 | */ | ||
| 188 | pte = page_check_address(ZERO_PAGE(address), mm, | ||
| 189 | address); | ||
| 190 | if (!IS_ERR(pte)) { | ||
| 191 | /* Nuke the page table entry. */ | 189 | /* Nuke the page table entry. */ |
| 192 | flush_cache_page(vma, address, pte_pfn(*pte)); | 190 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 193 | pteval = ptep_clear_flush(vma, address, pte); | 191 | pteval = ptep_clear_flush(vma, address, pte); |
| 192 | page_remove_rmap(page); | ||
| 193 | dec_mm_counter(mm, file_rss); | ||
| 194 | BUG_ON(pte_dirty(pteval)); | 194 | BUG_ON(pte_dirty(pteval)); |
| 195 | pte_unmap(pte); | 195 | pte_unmap_unlock(pte, ptl); |
| 196 | spin_unlock(&mm->page_table_lock); | 196 | page_cache_release(page); |
| 197 | } | 197 | } |
| 198 | } | 198 | } |
| 199 | spin_unlock(&mapping->i_mmap_lock); | 199 | spin_unlock(&mapping->i_mmap_lock); |
| @@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area, | |||
| 228 | 228 | ||
| 229 | page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); | 229 | page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); |
| 230 | if (!IS_ERR(page)) { | 230 | if (!IS_ERR(page)) { |
| 231 | return page; | 231 | goto out; |
| 232 | } | 232 | } |
| 233 | if (PTR_ERR(page) != -ENODATA) | 233 | if (PTR_ERR(page) != -ENODATA) |
| 234 | return NULL; | 234 | return NULL; |
| @@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area, | |||
| 249 | page = ZERO_PAGE(address); | 249 | page = ZERO_PAGE(address); |
| 250 | } | 250 | } |
| 251 | 251 | ||
| 252 | out: | ||
| 253 | page_cache_get(page); | ||
| 252 | return page; | 254 | return page; |
| 253 | } | 255 | } |
| 254 | 256 | ||
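Note: in the filemap_xip.c changes above, xip_file_nopage() now takes a page_cache_get() reference before returning the page, and __xip_unmap() drops the rmap and releases that reference when the mapping is torn down. The underlying pattern is plain get/put reference counting; a minimal userspace sketch with invented names (not kernel code):

```c
/* Illustrative userspace sketch only: an object lives as long as someone
 * holds a reference; the last put frees it.  All names are invented. */
#include <stdlib.h>

struct object {
	int refcount;
	int payload;
};

static struct object *object_get(struct object *obj)
{
	obj->refcount++;		/* caller now owns one reference */
	return obj;
}

static void object_put(struct object *obj)
{
	if (--obj->refcount == 0)	/* last reference frees the object */
		free(obj);
}

int main(void)
{
	struct object *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return 1;
	obj->refcount = 1;	/* creation reference */
	object_get(obj);	/* reference handed out, as in ->nopage */
	obj->payload = 42;
	object_put(obj);	/* user of the mapping done, as in __xip_unmap */
	object_put(obj);	/* drop the creation reference; object freed here */
	return 0;
}
```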
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c35..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
| @@ -20,33 +20,32 @@ | |||
| 20 | #include <asm/cacheflush.h> | 20 | #include <asm/cacheflush.h> |
| 21 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
| 22 | 22 | ||
| 23 | static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | 23 | static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, |
| 24 | unsigned long addr, pte_t *ptep) | 24 | unsigned long addr, pte_t *ptep) |
| 25 | { | 25 | { |
| 26 | pte_t pte = *ptep; | 26 | pte_t pte = *ptep; |
| 27 | struct page *page = NULL; | ||
| 27 | 28 | ||
| 28 | if (pte_none(pte)) | ||
| 29 | return; | ||
| 30 | if (pte_present(pte)) { | 29 | if (pte_present(pte)) { |
| 31 | unsigned long pfn = pte_pfn(pte); | 30 | unsigned long pfn = pte_pfn(pte); |
| 32 | |||
| 33 | flush_cache_page(vma, addr, pfn); | 31 | flush_cache_page(vma, addr, pfn); |
| 34 | pte = ptep_clear_flush(vma, addr, ptep); | 32 | pte = ptep_clear_flush(vma, addr, ptep); |
| 35 | if (pfn_valid(pfn)) { | 33 | if (unlikely(!pfn_valid(pfn))) { |
| 36 | struct page *page = pfn_to_page(pfn); | 34 | print_bad_pte(vma, pte, addr); |
| 37 | if (!PageReserved(page)) { | 35 | goto out; |
| 38 | if (pte_dirty(pte)) | ||
| 39 | set_page_dirty(page); | ||
| 40 | page_remove_rmap(page); | ||
| 41 | page_cache_release(page); | ||
| 42 | dec_mm_counter(mm, rss); | ||
| 43 | } | ||
| 44 | } | 36 | } |
| 37 | page = pfn_to_page(pfn); | ||
| 38 | if (pte_dirty(pte)) | ||
| 39 | set_page_dirty(page); | ||
| 40 | page_remove_rmap(page); | ||
| 41 | page_cache_release(page); | ||
| 45 | } else { | 42 | } else { |
| 46 | if (!pte_file(pte)) | 43 | if (!pte_file(pte)) |
| 47 | free_swap_and_cache(pte_to_swp_entry(pte)); | 44 | free_swap_and_cache(pte_to_swp_entry(pte)); |
| 48 | pte_clear(mm, addr, ptep); | 45 | pte_clear(mm, addr, ptep); |
| 49 | } | 46 | } |
| 47 | out: | ||
| 48 | return !!page; | ||
| 50 | } | 49 | } |
| 51 | 50 | ||
| 52 | /* | 51 | /* |
| @@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 64 | pud_t *pud; | 63 | pud_t *pud; |
| 65 | pgd_t *pgd; | 64 | pgd_t *pgd; |
| 66 | pte_t pte_val; | 65 | pte_t pte_val; |
| 66 | spinlock_t *ptl; | ||
| 67 | |||
| 68 | BUG_ON(vma->vm_flags & VM_RESERVED); | ||
| 67 | 69 | ||
| 68 | pgd = pgd_offset(mm, addr); | 70 | pgd = pgd_offset(mm, addr); |
| 69 | spin_lock(&mm->page_table_lock); | ||
| 70 | |||
| 71 | pud = pud_alloc(mm, pgd, addr); | 71 | pud = pud_alloc(mm, pgd, addr); |
| 72 | if (!pud) | 72 | if (!pud) |
| 73 | goto err_unlock; | 73 | goto out; |
| 74 | |||
| 75 | pmd = pmd_alloc(mm, pud, addr); | 74 | pmd = pmd_alloc(mm, pud, addr); |
| 76 | if (!pmd) | 75 | if (!pmd) |
| 77 | goto err_unlock; | 76 | goto out; |
| 78 | 77 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | |
| 79 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 80 | if (!pte) | 78 | if (!pte) |
| 81 | goto err_unlock; | 79 | goto out; |
| 82 | 80 | ||
| 83 | /* | 81 | /* |
| 84 | * This page may have been truncated. Tell the | 82 | * This page may have been truncated. Tell the |
| @@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 88 | inode = vma->vm_file->f_mapping->host; | 86 | inode = vma->vm_file->f_mapping->host; |
| 89 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 87 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 90 | if (!page->mapping || page->index >= size) | 88 | if (!page->mapping || page->index >= size) |
| 91 | goto err_unlock; | 89 | goto unlock; |
| 92 | err = -ENOMEM; | 90 | err = -ENOMEM; |
| 93 | if (page_mapcount(page) > INT_MAX/2) | 91 | if (page_mapcount(page) > INT_MAX/2) |
| 94 | goto err_unlock; | 92 | goto unlock; |
| 95 | 93 | ||
| 96 | zap_pte(mm, vma, addr, pte); | 94 | if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) |
| 95 | inc_mm_counter(mm, file_rss); | ||
| 97 | 96 | ||
| 98 | inc_mm_counter(mm,rss); | ||
| 99 | flush_icache_page(vma, page); | 97 | flush_icache_page(vma, page); |
| 100 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 98 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
| 101 | page_add_file_rmap(page); | 99 | page_add_file_rmap(page); |
| 102 | pte_val = *pte; | 100 | pte_val = *pte; |
| 103 | pte_unmap(pte); | ||
| 104 | update_mmu_cache(vma, addr, pte_val); | 101 | update_mmu_cache(vma, addr, pte_val); |
| 105 | |||
| 106 | err = 0; | 102 | err = 0; |
| 107 | err_unlock: | 103 | unlock: |
| 108 | spin_unlock(&mm->page_table_lock); | 104 | pte_unmap_unlock(pte, ptl); |
| 105 | out: | ||
| 109 | return err; | 106 | return err; |
| 110 | } | 107 | } |
| 111 | EXPORT_SYMBOL(install_page); | 108 | EXPORT_SYMBOL(install_page); |
| 112 | 109 | ||
| 113 | |||
| 114 | /* | 110 | /* |
| 115 | * Install a file pte to a given virtual memory address, release any | 111 | * Install a file pte to a given virtual memory address, release any |
| 116 | * previously existing mapping. | 112 | * previously existing mapping. |
| @@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 124 | pud_t *pud; | 120 | pud_t *pud; |
| 125 | pgd_t *pgd; | 121 | pgd_t *pgd; |
| 126 | pte_t pte_val; | 122 | pte_t pte_val; |
| 123 | spinlock_t *ptl; | ||
| 124 | |||
| 125 | BUG_ON(vma->vm_flags & VM_RESERVED); | ||
| 127 | 126 | ||
| 128 | pgd = pgd_offset(mm, addr); | 127 | pgd = pgd_offset(mm, addr); |
| 129 | spin_lock(&mm->page_table_lock); | ||
| 130 | |||
| 131 | pud = pud_alloc(mm, pgd, addr); | 128 | pud = pud_alloc(mm, pgd, addr); |
| 132 | if (!pud) | 129 | if (!pud) |
| 133 | goto err_unlock; | 130 | goto out; |
| 134 | |||
| 135 | pmd = pmd_alloc(mm, pud, addr); | 131 | pmd = pmd_alloc(mm, pud, addr); |
| 136 | if (!pmd) | 132 | if (!pmd) |
| 137 | goto err_unlock; | 133 | goto out; |
| 138 | 134 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | |
| 139 | pte = pte_alloc_map(mm, pmd, addr); | ||
| 140 | if (!pte) | 135 | if (!pte) |
| 141 | goto err_unlock; | 136 | goto out; |
| 142 | 137 | ||
| 143 | zap_pte(mm, vma, addr, pte); | 138 | if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { |
| 139 | update_hiwater_rss(mm); | ||
| 140 | dec_mm_counter(mm, file_rss); | ||
| 141 | } | ||
| 144 | 142 | ||
| 145 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 143 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
| 146 | pte_val = *pte; | 144 | pte_val = *pte; |
| 147 | pte_unmap(pte); | ||
| 148 | update_mmu_cache(vma, addr, pte_val); | 145 | update_mmu_cache(vma, addr, pte_val); |
| 149 | spin_unlock(&mm->page_table_lock); | 146 | pte_unmap_unlock(pte, ptl); |
| 150 | return 0; | 147 | err = 0; |
| 151 | 148 | out: | |
| 152 | err_unlock: | ||
| 153 | spin_unlock(&mm->page_table_lock); | ||
| 154 | return err; | 149 | return err; |
| 155 | } | 150 | } |
| 156 | 151 | ||
| 157 | |||
| 158 | /*** | 152 | /*** |
| 159 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store | 153 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store |
| 160 | * file within an existing vma. | 154 | * file within an existing vma. |
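Note: in the fremap.c changes above, the explicit spin_lock(&mm->page_table_lock) around the pud/pmd/pte allocation is replaced by pte_alloc_map_lock(), with pte_unmap_unlock() on a single exit path and separate "out" and "unlock" labels depending on whether the lock was ever taken. A minimal userspace sketch of that goto-based shape (invented names, not kernel code):

```c
/* Illustrative userspace sketch only: early failures jump past the unlock,
 * failures after the lock is held go through a single unlock label.
 * All names below are invented. */
#include <pthread.h>
#include <errno.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static int slot;

static int install_value(int value)
{
	int err = -EINVAL;

	if (value < 0)
		goto out;		/* nothing locked yet, fail early */

	pthread_mutex_lock(&slot_lock);
	err = -EBUSY;
	if (slot != 0)
		goto unlock;		/* locked: must exit via unlock */

	slot = value;
	err = 0;
unlock:
	pthread_mutex_unlock(&slot_lock);
out:
	return err;
}

int main(void)
{
	return install_value(42) ? 1 : 0;
}
```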
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
| 277 | unsigned long addr; | 277 | unsigned long addr; |
| 278 | 278 | ||
| 279 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 279 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { |
| 280 | src_pte = huge_pte_offset(src, addr); | ||
| 281 | if (!src_pte) | ||
| 282 | continue; | ||
| 280 | dst_pte = huge_pte_alloc(dst, addr); | 283 | dst_pte = huge_pte_alloc(dst, addr); |
| 281 | if (!dst_pte) | 284 | if (!dst_pte) |
| 282 | goto nomem; | 285 | goto nomem; |
| 286 | spin_lock(&dst->page_table_lock); | ||
| 283 | spin_lock(&src->page_table_lock); | 287 | spin_lock(&src->page_table_lock); |
| 284 | src_pte = huge_pte_offset(src, addr); | 288 | if (!pte_none(*src_pte)) { |
| 285 | if (src_pte && !pte_none(*src_pte)) { | ||
| 286 | entry = *src_pte; | 289 | entry = *src_pte; |
| 287 | ptepage = pte_page(entry); | 290 | ptepage = pte_page(entry); |
| 288 | get_page(ptepage); | 291 | get_page(ptepage); |
| 289 | add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); | 292 | add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); |
| 290 | set_huge_pte_at(dst, addr, dst_pte, entry); | 293 | set_huge_pte_at(dst, addr, dst_pte, entry); |
| 291 | } | 294 | } |
| 292 | spin_unlock(&src->page_table_lock); | 295 | spin_unlock(&src->page_table_lock); |
| 296 | spin_unlock(&dst->page_table_lock); | ||
| 293 | } | 297 | } |
| 294 | return 0; | 298 | return 0; |
| 295 | 299 | ||
| @@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 310 | BUG_ON(start & ~HPAGE_MASK); | 314 | BUG_ON(start & ~HPAGE_MASK); |
| 311 | BUG_ON(end & ~HPAGE_MASK); | 315 | BUG_ON(end & ~HPAGE_MASK); |
| 312 | 316 | ||
| 317 | spin_lock(&mm->page_table_lock); | ||
| 318 | |||
| 319 | /* Update high watermark before we lower rss */ | ||
| 320 | update_hiwater_rss(mm); | ||
| 321 | |||
| 313 | for (address = start; address < end; address += HPAGE_SIZE) { | 322 | for (address = start; address < end; address += HPAGE_SIZE) { |
| 314 | ptep = huge_pte_offset(mm, address); | 323 | ptep = huge_pte_offset(mm, address); |
| 315 | if (! ptep) | 324 | if (!ptep) |
| 316 | /* This can happen on truncate, or if an | ||
| 317 | * mmap() is aborted due to an error before | ||
| 318 | * the prefault */ | ||
| 319 | continue; | 325 | continue; |
| 320 | 326 | ||
| 321 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 327 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
| @@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 324 | 330 | ||
| 325 | page = pte_page(pte); | 331 | page = pte_page(pte); |
| 326 | put_page(page); | 332 | put_page(page); |
| 327 | add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); | 333 | add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); |
| 328 | } | 334 | } |
| 329 | flush_tlb_range(vma, start, end); | ||
| 330 | } | ||
| 331 | |||
| 332 | void zap_hugepage_range(struct vm_area_struct *vma, | ||
| 333 | unsigned long start, unsigned long length) | ||
| 334 | { | ||
| 335 | struct mm_struct *mm = vma->vm_mm; | ||
| 336 | 335 | ||
| 337 | spin_lock(&mm->page_table_lock); | ||
| 338 | unmap_hugepage_range(vma, start, start + length); | ||
| 339 | spin_unlock(&mm->page_table_lock); | 336 | spin_unlock(&mm->page_table_lock); |
| 337 | flush_tlb_range(vma, start, end); | ||
| 340 | } | 338 | } |
| 341 | 339 | ||
| 342 | int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) | 340 | static struct page *find_lock_huge_page(struct address_space *mapping, |
| 341 | unsigned long idx) | ||
| 343 | { | 342 | { |
| 344 | struct mm_struct *mm = current->mm; | 343 | struct page *page; |
| 345 | unsigned long addr; | 344 | int err; |
| 346 | int ret = 0; | 345 | struct inode *inode = mapping->host; |
| 347 | 346 | unsigned long size; | |
| 348 | WARN_ON(!is_vm_hugetlb_page(vma)); | 347 | |
| 349 | BUG_ON(vma->vm_start & ~HPAGE_MASK); | 348 | retry: |
| 350 | BUG_ON(vma->vm_end & ~HPAGE_MASK); | 349 | page = find_lock_page(mapping, idx); |
| 351 | 350 | if (page) | |
| 352 | hugetlb_prefault_arch_hook(mm); | 351 | goto out; |
| 353 | 352 | ||
| 354 | spin_lock(&mm->page_table_lock); | 353 | /* Check to make sure the mapping hasn't been truncated */ |
| 355 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 354 | size = i_size_read(inode) >> HPAGE_SHIFT; |
| 356 | unsigned long idx; | 355 | if (idx >= size) |
| 357 | pte_t *pte = huge_pte_alloc(mm, addr); | 356 | goto out; |
| 358 | struct page *page; | 357 | |
| 359 | 358 | if (hugetlb_get_quota(mapping)) | |
| 360 | if (!pte) { | 359 | goto out; |
| 361 | ret = -ENOMEM; | 360 | page = alloc_huge_page(); |
| 362 | goto out; | 361 | if (!page) { |
| 363 | } | 362 | hugetlb_put_quota(mapping); |
| 363 | goto out; | ||
| 364 | } | ||
| 364 | 365 | ||
| 365 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | 366 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); |
| 366 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | 367 | if (err) { |
| 367 | page = find_get_page(mapping, idx); | 368 | put_page(page); |
| 368 | if (!page) { | 369 | hugetlb_put_quota(mapping); |
| 369 | /* charge the fs quota first */ | 370 | if (err == -EEXIST) |
| 370 | if (hugetlb_get_quota(mapping)) { | 371 | goto retry; |
| 371 | ret = -ENOMEM; | 372 | page = NULL; |
| 372 | goto out; | ||
| 373 | } | ||
| 374 | page = alloc_huge_page(); | ||
| 375 | if (!page) { | ||
| 376 | hugetlb_put_quota(mapping); | ||
| 377 | ret = -ENOMEM; | ||
| 378 | goto out; | ||
| 379 | } | ||
| 380 | ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); | ||
| 381 | if (! ret) { | ||
| 382 | unlock_page(page); | ||
| 383 | } else { | ||
| 384 | hugetlb_put_quota(mapping); | ||
| 385 | free_huge_page(page); | ||
| 386 | goto out; | ||
| 387 | } | ||
| 388 | } | ||
| 389 | add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); | ||
| 390 | set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); | ||
| 391 | } | 373 | } |
| 392 | out: | 374 | out: |
| 393 | spin_unlock(&mm->page_table_lock); | 375 | return page; |
| 394 | return ret; | ||
| 395 | } | 376 | } |
| 396 | 377 | ||
| 397 | /* | ||
| 398 | * On ia64 at least, it is possible to receive a hugetlb fault from a | ||
| 399 | * stale zero entry left in the TLB from earlier hardware prefetching. | ||
| 400 | * Low-level arch code should already have flushed the stale entry as | ||
| 401 | * part of its fault handling, but we do need to accept this minor fault | ||
| 402 | * and return successfully. Whereas the "normal" case is that this is | ||
| 403 | * an access to a hugetlb page which has been truncated off since mmap. | ||
| 404 | */ | ||
| 405 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 378 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 406 | unsigned long address, int write_access) | 379 | unsigned long address, int write_access) |
| 407 | { | 380 | { |
| 408 | int ret = VM_FAULT_SIGBUS; | 381 | int ret = VM_FAULT_SIGBUS; |
| 382 | unsigned long idx; | ||
| 383 | unsigned long size; | ||
| 409 | pte_t *pte; | 384 | pte_t *pte; |
| 385 | struct page *page; | ||
| 386 | struct address_space *mapping; | ||
| 387 | |||
| 388 | pte = huge_pte_alloc(mm, address); | ||
| 389 | if (!pte) | ||
| 390 | goto out; | ||
| 391 | |||
| 392 | mapping = vma->vm_file->f_mapping; | ||
| 393 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) | ||
| 394 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
| 395 | |||
| 396 | /* | ||
| 397 | * Use page lock to guard against racing truncation | ||
| 398 | * before we get page_table_lock. | ||
| 399 | */ | ||
| 400 | page = find_lock_huge_page(mapping, idx); | ||
| 401 | if (!page) | ||
| 402 | goto out; | ||
| 410 | 403 | ||
| 411 | spin_lock(&mm->page_table_lock); | 404 | spin_lock(&mm->page_table_lock); |
| 412 | pte = huge_pte_offset(mm, address); | 405 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; |
| 413 | if (pte && !pte_none(*pte)) | 406 | if (idx >= size) |
| 414 | ret = VM_FAULT_MINOR; | 407 | goto backout; |
| 408 | |||
| 409 | ret = VM_FAULT_MINOR; | ||
| 410 | if (!pte_none(*pte)) | ||
| 411 | goto backout; | ||
| 412 | |||
| 413 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | ||
| 414 | set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); | ||
| 415 | spin_unlock(&mm->page_table_lock); | 415 | spin_unlock(&mm->page_table_lock); |
| 416 | unlock_page(page); | ||
| 417 | out: | ||
| 416 | return ret; | 418 | return ret; |
| 419 | |||
| 420 | backout: | ||
| 421 | spin_unlock(&mm->page_table_lock); | ||
| 422 | hugetlb_put_quota(mapping); | ||
| 423 | unlock_page(page); | ||
| 424 | put_page(page); | ||
| 425 | goto out; | ||
| 417 | } | 426 | } |
| 418 | 427 | ||
| 419 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 428 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| @@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 423 | unsigned long vpfn, vaddr = *position; | 432 | unsigned long vpfn, vaddr = *position; |
| 424 | int remainder = *length; | 433 | int remainder = *length; |
| 425 | 434 | ||
| 426 | BUG_ON(!is_vm_hugetlb_page(vma)); | ||
| 427 | |||
| 428 | vpfn = vaddr/PAGE_SIZE; | 435 | vpfn = vaddr/PAGE_SIZE; |
| 429 | spin_lock(&mm->page_table_lock); | 436 | spin_lock(&mm->page_table_lock); |
| 430 | while (vaddr < vma->vm_end && remainder) { | 437 | while (vaddr < vma->vm_end && remainder) { |
| 438 | pte_t *pte; | ||
| 439 | struct page *page; | ||
| 431 | 440 | ||
| 432 | if (pages) { | 441 | /* |
| 433 | pte_t *pte; | 442 | * Some archs (sparc64, sh*) have multiple pte_ts to |
| 434 | struct page *page; | 443 | * each hugepage. We have to make * sure we get the |
| 435 | 444 | * first, for the page indexing below to work. | |
| 436 | /* Some archs (sparc64, sh*) have multiple | 445 | */ |
| 437 | * pte_ts to each hugepage. We have to make | 446 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); |
| 438 | * sure we get the first, for the page | ||
| 439 | * indexing below to work. */ | ||
| 440 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | ||
| 441 | |||
| 442 | /* the hugetlb file might have been truncated */ | ||
| 443 | if (!pte || pte_none(*pte)) { | ||
| 444 | remainder = 0; | ||
| 445 | if (!i) | ||
| 446 | i = -EFAULT; | ||
| 447 | break; | ||
| 448 | } | ||
| 449 | 447 | ||
| 450 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | 448 | if (!pte || pte_none(*pte)) { |
| 449 | int ret; | ||
| 451 | 450 | ||
| 452 | WARN_ON(!PageCompound(page)); | 451 | spin_unlock(&mm->page_table_lock); |
| 452 | ret = hugetlb_fault(mm, vma, vaddr, 0); | ||
| 453 | spin_lock(&mm->page_table_lock); | ||
| 454 | if (ret == VM_FAULT_MINOR) | ||
| 455 | continue; | ||
| 456 | |||
| 457 | remainder = 0; | ||
| 458 | if (!i) | ||
| 459 | i = -EFAULT; | ||
| 460 | break; | ||
| 461 | } | ||
| 453 | 462 | ||
| 463 | if (pages) { | ||
| 464 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
| 454 | get_page(page); | 465 | get_page(page); |
| 455 | pages[i] = page; | 466 | pages[i] = page; |
| 456 | } | 467 | } |
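Note: find_lock_huge_page() above implements a lookup-or-allocate loop: if the page is missing it allocates one and tries to add it to the page cache, and when add_to_page_cache() returns -EEXIST because another task won the race it frees its own copy and retries the lookup. A minimal userspace sketch of that retry-on-race control flow (invented names, single-threaded for brevity, not kernel code):

```c
/* Illustrative userspace sketch only: look up an entry; if absent, allocate
 * and try to insert; if the insert reports the slot was populated in the
 * meantime, drop our copy and retry the lookup.  All names are invented. */
#include <errno.h>
#include <stdlib.h>

#define NSLOTS 16

static int *cache[NSLOTS];

static int cache_insert(unsigned int idx, int *item)
{
	if (cache[idx])
		return -EEXIST;		/* someone else populated it first */
	cache[idx] = item;
	return 0;
}

static int *cache_find_or_create(unsigned int idx)
{
	int *item;
	int err;

retry:
	item = cache[idx];		/* fast path: already present */
	if (item)
		return item;

	item = malloc(sizeof(*item));
	if (!item)
		return NULL;
	*item = 0;

	err = cache_insert(idx, item);
	if (err) {
		free(item);		/* lost the race: drop our copy ... */
		if (err == -EEXIST)
			goto retry;	/* ... and pick up the winner's entry */
		return NULL;
	}
	return item;
}

int main(void)
{
	return cache_find_or_create(3) ? 0 : 1;
}
```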
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
| @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, | |||
| 126 | unsigned long start, unsigned long end) | 126 | unsigned long start, unsigned long end) |
| 127 | { | 127 | { |
| 128 | *prev = vma; | 128 | *prev = vma; |
| 129 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) | 129 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) |
| 130 | return -EINVAL; | 130 | return -EINVAL; |
| 131 | 131 | ||
| 132 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 132 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { |
diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e55..0f60baf6f69b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | |||
| 114 | { | 114 | { |
| 115 | struct page *page = pmd_page(*pmd); | 115 | struct page *page = pmd_page(*pmd); |
| 116 | pmd_clear(pmd); | 116 | pmd_clear(pmd); |
| 117 | pte_lock_deinit(page); | ||
| 117 | pte_free_tlb(tlb, page); | 118 | pte_free_tlb(tlb, page); |
| 118 | dec_page_state(nr_page_table_pages); | 119 | dec_page_state(nr_page_table_pages); |
| 119 | tlb->mm->nr_ptes--; | 120 | tlb->mm->nr_ptes--; |
| @@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb, | |||
| 249 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); | 250 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); |
| 250 | } while (pgd++, addr = next, addr != end); | 251 | } while (pgd++, addr = next, addr != end); |
| 251 | 252 | ||
| 252 | if (!tlb_is_full_mm(*tlb)) | 253 | if (!(*tlb)->fullmm) |
| 253 | flush_tlb_pgtables((*tlb)->mm, start, end); | 254 | flush_tlb_pgtables((*tlb)->mm, start, end); |
| 254 | } | 255 | } |
| 255 | 256 | ||
| @@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
| 260 | struct vm_area_struct *next = vma->vm_next; | 261 | struct vm_area_struct *next = vma->vm_next; |
| 261 | unsigned long addr = vma->vm_start; | 262 | unsigned long addr = vma->vm_start; |
| 262 | 263 | ||
| 264 | /* | ||
| 265 | * Hide vma from rmap and vmtruncate before freeing pgtables | ||
| 266 | */ | ||
| 267 | anon_vma_unlink(vma); | ||
| 268 | unlink_file_vma(vma); | ||
| 269 | |||
| 263 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { | 270 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { |
| 264 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 271 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
| 265 | floor, next? next->vm_start: ceiling); | 272 | floor, next? next->vm_start: ceiling); |
| @@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
| 272 | HPAGE_SIZE)) { | 279 | HPAGE_SIZE)) { |
| 273 | vma = next; | 280 | vma = next; |
| 274 | next = vma->vm_next; | 281 | next = vma->vm_next; |
| 282 | anon_vma_unlink(vma); | ||
| 283 | unlink_file_vma(vma); | ||
| 275 | } | 284 | } |
| 276 | free_pgd_range(tlb, addr, vma->vm_end, | 285 | free_pgd_range(tlb, addr, vma->vm_end, |
| 277 | floor, next? next->vm_start: ceiling); | 286 | floor, next? next->vm_start: ceiling); |
| @@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
| 280 | } | 289 | } |
| 281 | } | 290 | } |
| 282 | 291 | ||
| 283 | pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, | 292 | int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) |
| 284 | unsigned long address) | ||
| 285 | { | 293 | { |
| 286 | if (!pmd_present(*pmd)) { | 294 | struct page *new = pte_alloc_one(mm, address); |
| 287 | struct page *new; | 295 | if (!new) |
| 288 | 296 | return -ENOMEM; | |
| 289 | spin_unlock(&mm->page_table_lock); | 297 | |
| 290 | new = pte_alloc_one(mm, address); | 298 | pte_lock_init(new); |
| 291 | spin_lock(&mm->page_table_lock); | 299 | spin_lock(&mm->page_table_lock); |
| 292 | if (!new) | 300 | if (pmd_present(*pmd)) { /* Another has populated it */ |
| 293 | return NULL; | 301 | pte_lock_deinit(new); |
| 294 | /* | 302 | pte_free(new); |
| 295 | * Because we dropped the lock, we should re-check the | 303 | } else { |
| 296 | * entry, as somebody else could have populated it.. | ||
| 297 | */ | ||
| 298 | if (pmd_present(*pmd)) { | ||
| 299 | pte_free(new); | ||
| 300 | goto out; | ||
| 301 | } | ||
| 302 | mm->nr_ptes++; | 304 | mm->nr_ptes++; |
| 303 | inc_page_state(nr_page_table_pages); | 305 | inc_page_state(nr_page_table_pages); |
| 304 | pmd_populate(mm, pmd, new); | 306 | pmd_populate(mm, pmd, new); |
| 305 | } | 307 | } |
| 306 | out: | 308 | spin_unlock(&mm->page_table_lock); |
| 307 | return pte_offset_map(pmd, address); | 309 | return 0; |
| 308 | } | 310 | } |
| 309 | 311 | ||
| 310 | pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 312 | int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) |
| 311 | { | 313 | { |
| 312 | if (!pmd_present(*pmd)) { | 314 | pte_t *new = pte_alloc_one_kernel(&init_mm, address); |
| 313 | pte_t *new; | 315 | if (!new) |
| 316 | return -ENOMEM; | ||
| 314 | 317 | ||
| 315 | spin_unlock(&mm->page_table_lock); | 318 | spin_lock(&init_mm.page_table_lock); |
| 316 | new = pte_alloc_one_kernel(mm, address); | 319 | if (pmd_present(*pmd)) /* Another has populated it */ |
| 317 | spin_lock(&mm->page_table_lock); | 320 | pte_free_kernel(new); |
| 318 | if (!new) | 321 | else |
| 319 | return NULL; | 322 | pmd_populate_kernel(&init_mm, pmd, new); |
| 323 | spin_unlock(&init_mm.page_table_lock); | ||
| 324 | return 0; | ||
| 325 | } | ||
| 320 | 326 | ||
| 321 | /* | 327 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) |
| 322 | * Because we dropped the lock, we should re-check the | 328 | { |
| 323 | * entry, as somebody else could have populated it.. | 329 | if (file_rss) |
| 324 | */ | 330 | add_mm_counter(mm, file_rss, file_rss); |
| 325 | if (pmd_present(*pmd)) { | 331 | if (anon_rss) |
| 326 | pte_free_kernel(new); | 332 | add_mm_counter(mm, anon_rss, anon_rss); |
| 327 | goto out; | 333 | } |
| 328 | } | 334 | |
| 329 | pmd_populate_kernel(mm, pmd, new); | 335 | /* |
| 330 | } | 336 | * This function is called to print an error when a pte in a |
| 331 | out: | 337 | * !VM_RESERVED region is found pointing to an invalid pfn (which |
| 332 | return pte_offset_kernel(pmd, address); | 338 | * is an error. |
| 339 | * | ||
| 340 | * The calling function must still handle the error. | ||
| 341 | */ | ||
| 342 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | ||
| 343 | { | ||
| 344 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | ||
| 345 | "vm_flags = %lx, vaddr = %lx\n", | ||
| 346 | (long long)pte_val(pte), | ||
| 347 | (vma->vm_mm == current->mm ? current->comm : "???"), | ||
| 348 | vma->vm_flags, vaddr); | ||
| 349 | dump_stack(); | ||
| 333 | } | 350 | } |
| 334 | 351 | ||
| 335 | /* | 352 | /* |
| 336 | * copy one vm_area from one task to the other. Assumes the page tables | 353 | * copy one vm_area from one task to the other. Assumes the page tables |
| 337 | * already present in the new task to be cleared in the whole range | 354 | * already present in the new task to be cleared in the whole range |
| 338 | * covered by this vma. | 355 | * covered by this vma. |
| 339 | * | ||
| 340 | * dst->page_table_lock is held on entry and exit, | ||
| 341 | * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). | ||
| 342 | */ | 356 | */ |
| 343 | 357 | ||
| 344 | static inline void | 358 | static inline void |
| 345 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 359 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| 346 | pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, | 360 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
| 347 | unsigned long addr) | 361 | unsigned long addr, int *rss) |
| 348 | { | 362 | { |
| 363 | unsigned long vm_flags = vma->vm_flags; | ||
| 349 | pte_t pte = *src_pte; | 364 | pte_t pte = *src_pte; |
| 350 | struct page *page; | 365 | struct page *page; |
| 351 | unsigned long pfn; | 366 | unsigned long pfn; |
| @@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 357 | /* make sure dst_mm is on swapoff's mmlist. */ | 372 | /* make sure dst_mm is on swapoff's mmlist. */ |
| 358 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 373 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
| 359 | spin_lock(&mmlist_lock); | 374 | spin_lock(&mmlist_lock); |
| 360 | list_add(&dst_mm->mmlist, &src_mm->mmlist); | 375 | if (list_empty(&dst_mm->mmlist)) |
| 376 | list_add(&dst_mm->mmlist, | ||
| 377 | &src_mm->mmlist); | ||
| 361 | spin_unlock(&mmlist_lock); | 378 | spin_unlock(&mmlist_lock); |
| 362 | } | 379 | } |
| 363 | } | 380 | } |
| 364 | set_pte_at(dst_mm, addr, dst_pte, pte); | 381 | goto out_set_pte; |
| 365 | return; | ||
| 366 | } | 382 | } |
| 367 | 383 | ||
| 368 | pfn = pte_pfn(pte); | 384 | /* If the region is VM_RESERVED, the mapping is not |
| 369 | /* the pte points outside of valid memory, the | 385 | * mapped via rmap - duplicate the pte as is. |
| 370 | * mapping is assumed to be good, meaningful | ||
| 371 | * and not mapped via rmap - duplicate the | ||
| 372 | * mapping as is. | ||
| 373 | */ | 386 | */ |
| 374 | page = NULL; | 387 | if (vm_flags & VM_RESERVED) |
| 375 | if (pfn_valid(pfn)) | 388 | goto out_set_pte; |
| 376 | page = pfn_to_page(pfn); | ||
| 377 | 389 | ||
| 378 | if (!page || PageReserved(page)) { | 390 | pfn = pte_pfn(pte); |
| 379 | set_pte_at(dst_mm, addr, dst_pte, pte); | 391 | /* If the pte points outside of valid memory but |
| 380 | return; | 392 | * the region is not VM_RESERVED, we have a problem. |
| 393 | */ | ||
| 394 | if (unlikely(!pfn_valid(pfn))) { | ||
| 395 | print_bad_pte(vma, pte, addr); | ||
| 396 | goto out_set_pte; /* try to do something sane */ | ||
| 381 | } | 397 | } |
| 382 | 398 | ||
| 399 | page = pfn_to_page(pfn); | ||
| 400 | |||
| 383 | /* | 401 | /* |
| 384 | * If it's a COW mapping, write protect it both | 402 | * If it's a COW mapping, write protect it both |
| 385 | * in the parent and the child | 403 | * in the parent and the child |
| @@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 397 | pte = pte_mkclean(pte); | 415 | pte = pte_mkclean(pte); |
| 398 | pte = pte_mkold(pte); | 416 | pte = pte_mkold(pte); |
| 399 | get_page(page); | 417 | get_page(page); |
| 400 | inc_mm_counter(dst_mm, rss); | ||
| 401 | if (PageAnon(page)) | ||
| 402 | inc_mm_counter(dst_mm, anon_rss); | ||
| 403 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
| 404 | page_dup_rmap(page); | 418 | page_dup_rmap(page); |
| 419 | rss[!!PageAnon(page)]++; | ||
| 420 | |||
| 421 | out_set_pte: | ||
| 422 | set_pte_at(dst_mm, addr, dst_pte, pte); | ||
| 405 | } | 423 | } |
| 406 | 424 | ||
| 407 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 425 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| @@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 409 | unsigned long addr, unsigned long end) | 427 | unsigned long addr, unsigned long end) |
| 410 | { | 428 | { |
| 411 | pte_t *src_pte, *dst_pte; | 429 | pte_t *src_pte, *dst_pte; |
| 412 | unsigned long vm_flags = vma->vm_flags; | 430 | spinlock_t *src_ptl, *dst_ptl; |
| 413 | int progress; | 431 | int progress = 0; |
| 432 | int rss[2]; | ||
| 414 | 433 | ||
| 415 | again: | 434 | again: |
| 416 | dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); | 435 | rss[1] = rss[0] = 0; |
| 436 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | ||
| 417 | if (!dst_pte) | 437 | if (!dst_pte) |
| 418 | return -ENOMEM; | 438 | return -ENOMEM; |
| 419 | src_pte = pte_offset_map_nested(src_pmd, addr); | 439 | src_pte = pte_offset_map_nested(src_pmd, addr); |
| 440 | src_ptl = pte_lockptr(src_mm, src_pmd); | ||
| 441 | spin_lock(src_ptl); | ||
| 420 | 442 | ||
| 421 | progress = 0; | ||
| 422 | spin_lock(&src_mm->page_table_lock); | ||
| 423 | do { | 443 | do { |
| 424 | /* | 444 | /* |
| 425 | * We are holding two locks at this point - either of them | 445 | * We are holding two locks at this point - either of them |
| 426 | * could generate latencies in another task on another CPU. | 446 | * could generate latencies in another task on another CPU. |
| 427 | */ | 447 | */ |
| 428 | if (progress >= 32 && (need_resched() || | 448 | if (progress >= 32) { |
| 429 | need_lockbreak(&src_mm->page_table_lock) || | 449 | progress = 0; |
| 430 | need_lockbreak(&dst_mm->page_table_lock))) | 450 | if (need_resched() || |
| 431 | break; | 451 | need_lockbreak(src_ptl) || |
| 452 | need_lockbreak(dst_ptl)) | ||
| 453 | break; | ||
| 454 | } | ||
| 432 | if (pte_none(*src_pte)) { | 455 | if (pte_none(*src_pte)) { |
| 433 | progress++; | 456 | progress++; |
| 434 | continue; | 457 | continue; |
| 435 | } | 458 | } |
| 436 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); | 459 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); |
| 437 | progress += 8; | 460 | progress += 8; |
| 438 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 461 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
| 439 | spin_unlock(&src_mm->page_table_lock); | ||
| 440 | 462 | ||
| 463 | spin_unlock(src_ptl); | ||
| 441 | pte_unmap_nested(src_pte - 1); | 464 | pte_unmap_nested(src_pte - 1); |
| 442 | pte_unmap(dst_pte - 1); | 465 | add_mm_rss(dst_mm, rss[0], rss[1]); |
| 443 | cond_resched_lock(&dst_mm->page_table_lock); | 466 | pte_unmap_unlock(dst_pte - 1, dst_ptl); |
| 467 | cond_resched(); | ||
| 444 | if (addr != end) | 468 | if (addr != end) |
| 445 | goto again; | 469 | goto again; |
| 446 | return 0; | 470 | return 0; |
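Note: copy_pte_range() above stops updating the mm counters one page at a time; it accumulates file and anon counts in a local rss[2] array while the pte locks are held and folds the totals in once via add_mm_rss() afterwards, which keeps the locked section short. A minimal userspace sketch of that batching idea (invented names, not kernel code):

```c
/* Illustrative userspace sketch only: count items into local totals while
 * walking a range, then update the shared counters once for the whole batch
 * instead of once per item.  All names below are invented. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;
static long shared_file_count;
static long shared_anon_count;

static void copy_range(const int *types, int n)
{
	long local[2] = { 0, 0 };	/* [0] = file, [1] = anon */
	int i;

	for (i = 0; i < n; i++)
		local[types[i] ? 1 : 0]++;	/* no shared update per item */

	/* one update of the shared counters for the whole batch */
	pthread_mutex_lock(&counter_lock);
	shared_file_count += local[0];
	shared_anon_count += local[1];
	pthread_mutex_unlock(&counter_lock);
}

int main(void)
{
	int types[] = { 0, 1, 1, 0, 1 };

	copy_range(types, 5);
	printf("file=%ld anon=%ld\n", shared_file_count, shared_anon_count);
	return 0;
}
```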
| @@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 525 | return 0; | 549 | return 0; |
| 526 | } | 550 | } |
| 527 | 551 | ||
| 528 | static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | 552 | static void zap_pte_range(struct mmu_gather *tlb, |
| 553 | struct vm_area_struct *vma, pmd_t *pmd, | ||
| 529 | unsigned long addr, unsigned long end, | 554 | unsigned long addr, unsigned long end, |
| 530 | struct zap_details *details) | 555 | struct zap_details *details) |
| 531 | { | 556 | { |
| 557 | struct mm_struct *mm = tlb->mm; | ||
| 532 | pte_t *pte; | 558 | pte_t *pte; |
| 559 | spinlock_t *ptl; | ||
| 560 | int file_rss = 0; | ||
| 561 | int anon_rss = 0; | ||
| 533 | 562 | ||
| 534 | pte = pte_offset_map(pmd, addr); | 563 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
| 535 | do { | 564 | do { |
| 536 | pte_t ptent = *pte; | 565 | pte_t ptent = *pte; |
| 537 | if (pte_none(ptent)) | 566 | if (pte_none(ptent)) |
| 538 | continue; | 567 | continue; |
| 539 | if (pte_present(ptent)) { | 568 | if (pte_present(ptent)) { |
| 540 | struct page *page = NULL; | 569 | struct page *page = NULL; |
| 541 | unsigned long pfn = pte_pfn(ptent); | 570 | if (!(vma->vm_flags & VM_RESERVED)) { |
| 542 | if (pfn_valid(pfn)) { | 571 | unsigned long pfn = pte_pfn(ptent); |
| 543 | page = pfn_to_page(pfn); | 572 | if (unlikely(!pfn_valid(pfn))) |
| 544 | if (PageReserved(page)) | 573 | print_bad_pte(vma, ptent, addr); |
| 545 | page = NULL; | 574 | else |
| 575 | page = pfn_to_page(pfn); | ||
| 546 | } | 576 | } |
| 547 | if (unlikely(details) && page) { | 577 | if (unlikely(details) && page) { |
| 548 | /* | 578 | /* |
| @@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
| 562 | page->index > details->last_index)) | 592 | page->index > details->last_index)) |
| 563 | continue; | 593 | continue; |
| 564 | } | 594 | } |
| 565 | ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, | 595 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
| 566 | tlb->fullmm); | 596 | tlb->fullmm); |
| 567 | tlb_remove_tlb_entry(tlb, pte, addr); | 597 | tlb_remove_tlb_entry(tlb, pte, addr); |
| 568 | if (unlikely(!page)) | 598 | if (unlikely(!page)) |
| @@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
| 570 | if (unlikely(details) && details->nonlinear_vma | 600 | if (unlikely(details) && details->nonlinear_vma |
| 571 | && linear_page_index(details->nonlinear_vma, | 601 | && linear_page_index(details->nonlinear_vma, |
| 572 | addr) != page->index) | 602 | addr) != page->index) |
| 573 | set_pte_at(tlb->mm, addr, pte, | 603 | set_pte_at(mm, addr, pte, |
| 574 | pgoff_to_pte(page->index)); | 604 | pgoff_to_pte(page->index)); |
| 575 | if (pte_dirty(ptent)) | ||
| 576 | set_page_dirty(page); | ||
| 577 | if (PageAnon(page)) | 605 | if (PageAnon(page)) |
| 578 | dec_mm_counter(tlb->mm, anon_rss); | 606 | anon_rss--; |
| 579 | else if (pte_young(ptent)) | 607 | else { |
| 580 | mark_page_accessed(page); | 608 | if (pte_dirty(ptent)) |
| 581 | tlb->freed++; | 609 | set_page_dirty(page); |
| 610 | if (pte_young(ptent)) | ||
| 611 | mark_page_accessed(page); | ||
| 612 | file_rss--; | ||
| 613 | } | ||
| 582 | page_remove_rmap(page); | 614 | page_remove_rmap(page); |
| 583 | tlb_remove_page(tlb, page); | 615 | tlb_remove_page(tlb, page); |
| 584 | continue; | 616 | continue; |
| @@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | |||
| 591 | continue; | 623 | continue; |
| 592 | if (!pte_file(ptent)) | 624 | if (!pte_file(ptent)) |
| 593 | free_swap_and_cache(pte_to_swp_entry(ptent)); | 625 | free_swap_and_cache(pte_to_swp_entry(ptent)); |
| 594 | pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); | 626 | pte_clear_full(mm, addr, pte, tlb->fullmm); |
| 595 | } while (pte++, addr += PAGE_SIZE, addr != end); | 627 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 596 | pte_unmap(pte - 1); | 628 | |
| 629 | add_mm_rss(mm, file_rss, anon_rss); | ||
| 630 | pte_unmap_unlock(pte - 1, ptl); | ||
| 597 | } | 631 | } |
| 598 | 632 | ||
| 599 | static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 633 | static inline void zap_pmd_range(struct mmu_gather *tlb, |
| 634 | struct vm_area_struct *vma, pud_t *pud, | ||
| 600 | unsigned long addr, unsigned long end, | 635 | unsigned long addr, unsigned long end, |
| 601 | struct zap_details *details) | 636 | struct zap_details *details) |
| 602 | { | 637 | { |
| @@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
| 608 | next = pmd_addr_end(addr, end); | 643 | next = pmd_addr_end(addr, end); |
| 609 | if (pmd_none_or_clear_bad(pmd)) | 644 | if (pmd_none_or_clear_bad(pmd)) |
| 610 | continue; | 645 | continue; |
| 611 | zap_pte_range(tlb, pmd, addr, next, details); | 646 | zap_pte_range(tlb, vma, pmd, addr, next, details); |
| 612 | } while (pmd++, addr = next, addr != end); | 647 | } while (pmd++, addr = next, addr != end); |
| 613 | } | 648 | } |
| 614 | 649 | ||
| 615 | static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 650 | static inline void zap_pud_range(struct mmu_gather *tlb, |
| 651 | struct vm_area_struct *vma, pgd_t *pgd, | ||
| 616 | unsigned long addr, unsigned long end, | 652 | unsigned long addr, unsigned long end, |
| 617 | struct zap_details *details) | 653 | struct zap_details *details) |
| 618 | { | 654 | { |
| @@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
| 624 | next = pud_addr_end(addr, end); | 660 | next = pud_addr_end(addr, end); |
| 625 | if (pud_none_or_clear_bad(pud)) | 661 | if (pud_none_or_clear_bad(pud)) |
| 626 | continue; | 662 | continue; |
| 627 | zap_pmd_range(tlb, pud, addr, next, details); | 663 | zap_pmd_range(tlb, vma, pud, addr, next, details); |
| 628 | } while (pud++, addr = next, addr != end); | 664 | } while (pud++, addr = next, addr != end); |
| 629 | } | 665 | } |
| 630 | 666 | ||
| @@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 645 | next = pgd_addr_end(addr, end); | 681 | next = pgd_addr_end(addr, end); |
| 646 | if (pgd_none_or_clear_bad(pgd)) | 682 | if (pgd_none_or_clear_bad(pgd)) |
| 647 | continue; | 683 | continue; |
| 648 | zap_pud_range(tlb, pgd, addr, next, details); | 684 | zap_pud_range(tlb, vma, pgd, addr, next, details); |
| 649 | } while (pgd++, addr = next, addr != end); | 685 | } while (pgd++, addr = next, addr != end); |
| 650 | tlb_end_vma(tlb, vma); | 686 | tlb_end_vma(tlb, vma); |
| 651 | } | 687 | } |
| @@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 660 | /** | 696 | /** |
| 661 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 697 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
| 662 | * @tlbp: address of the caller's struct mmu_gather | 698 | * @tlbp: address of the caller's struct mmu_gather |
| 663 | * @mm: the controlling mm_struct | ||
| 664 | * @vma: the starting vma | 699 | * @vma: the starting vma |
| 665 | * @start_addr: virtual address at which to start unmapping | 700 | * @start_addr: virtual address at which to start unmapping |
| 666 | * @end_addr: virtual address at which to end unmapping | 701 | * @end_addr: virtual address at which to end unmapping |
| @@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 669 | * | 704 | * |
| 670 | * Returns the end address of the unmapping (restart addr if interrupted). | 705 | * Returns the end address of the unmapping (restart addr if interrupted). |
| 671 | * | 706 | * |
| 672 | * Unmap all pages in the vma list. Called under page_table_lock. | 707 | * Unmap all pages in the vma list. |
| 673 | * | 708 | * |
| 674 | * We aim to not hold page_table_lock for too long (for scheduling latency | 709 | * We aim to not hold locks for too long (for scheduling latency reasons). |
| 675 | * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | 710 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to |
| 676 | * return the ending mmu_gather to the caller. | 711 | * return the ending mmu_gather to the caller. |
| 677 | * | 712 | * |
| 678 | * Only addresses between `start' and `end' will be unmapped. | 713 | * Only addresses between `start' and `end' will be unmapped. |
| @@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 684 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() | 719 | * ensure that any thus-far unmapped pages are flushed before unmap_vmas() |
| 685 | * drops the lock and schedules. | 720 | * drops the lock and schedules. |
| 686 | */ | 721 | */ |
| 687 | unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | 722 | unsigned long unmap_vmas(struct mmu_gather **tlbp, |
| 688 | struct vm_area_struct *vma, unsigned long start_addr, | 723 | struct vm_area_struct *vma, unsigned long start_addr, |
| 689 | unsigned long end_addr, unsigned long *nr_accounted, | 724 | unsigned long end_addr, unsigned long *nr_accounted, |
| 690 | struct zap_details *details) | 725 | struct zap_details *details) |
| @@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | |||
| 694 | int tlb_start_valid = 0; | 729 | int tlb_start_valid = 0; |
| 695 | unsigned long start = start_addr; | 730 | unsigned long start = start_addr; |
| 696 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 731 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
| 697 | int fullmm = tlb_is_full_mm(*tlbp); | 732 | int fullmm = (*tlbp)->fullmm; |
| 698 | 733 | ||
| 699 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 734 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
| 700 | unsigned long end; | 735 | unsigned long end; |
| @@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, | |||
| 734 | tlb_finish_mmu(*tlbp, tlb_start, start); | 769 | tlb_finish_mmu(*tlbp, tlb_start, start); |
| 735 | 770 | ||
| 736 | if (need_resched() || | 771 | if (need_resched() || |
| 737 | need_lockbreak(&mm->page_table_lock) || | ||
| 738 | (i_mmap_lock && need_lockbreak(i_mmap_lock))) { | 772 | (i_mmap_lock && need_lockbreak(i_mmap_lock))) { |
| 739 | if (i_mmap_lock) { | 773 | if (i_mmap_lock) { |
| 740 | /* must reset count of rss freed */ | 774 | *tlbp = NULL; |
| 741 | *tlbp = tlb_gather_mmu(mm, fullmm); | ||
| 742 | goto out; | 775 | goto out; |
| 743 | } | 776 | } |
| 744 | spin_unlock(&mm->page_table_lock); | ||
| 745 | cond_resched(); | 777 | cond_resched(); |
| 746 | spin_lock(&mm->page_table_lock); | ||
| 747 | } | 778 | } |
| 748 | 779 | ||
| 749 | *tlbp = tlb_gather_mmu(mm, fullmm); | 780 | *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); |
| 750 | tlb_start_valid = 0; | 781 | tlb_start_valid = 0; |
| 751 | zap_bytes = ZAP_BLOCK_SIZE; | 782 | zap_bytes = ZAP_BLOCK_SIZE; |
| 752 | } | 783 | } |
| @@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
| 770 | unsigned long end = address + size; | 801 | unsigned long end = address + size; |
| 771 | unsigned long nr_accounted = 0; | 802 | unsigned long nr_accounted = 0; |
| 772 | 803 | ||
| 773 | if (is_vm_hugetlb_page(vma)) { | ||
| 774 | zap_hugepage_range(vma, address, size); | ||
| 775 | return end; | ||
| 776 | } | ||
| 777 | |||
| 778 | lru_add_drain(); | 804 | lru_add_drain(); |
| 779 | spin_lock(&mm->page_table_lock); | ||
| 780 | tlb = tlb_gather_mmu(mm, 0); | 805 | tlb = tlb_gather_mmu(mm, 0); |
| 781 | end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); | 806 | update_hiwater_rss(mm); |
| 782 | tlb_finish_mmu(tlb, address, end); | 807 | end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); |
| 783 | spin_unlock(&mm->page_table_lock); | 808 | if (tlb) |
| 809 | tlb_finish_mmu(tlb, address, end); | ||
| 784 | return end; | 810 | return end; |
| 785 | } | 811 | } |
| 786 | 812 | ||
| 787 | /* | 813 | /* |
| 788 | * Do a quick page-table lookup for a single page. | 814 | * Do a quick page-table lookup for a single page. |
| 789 | * mm->page_table_lock must be held. | ||
| 790 | */ | 815 | */ |
| 791 | static struct page *__follow_page(struct mm_struct *mm, unsigned long address, | 816 | struct page *follow_page(struct mm_struct *mm, unsigned long address, |
| 792 | int read, int write, int accessed) | 817 | unsigned int flags) |
| 793 | { | 818 | { |
| 794 | pgd_t *pgd; | 819 | pgd_t *pgd; |
| 795 | pud_t *pud; | 820 | pud_t *pud; |
| 796 | pmd_t *pmd; | 821 | pmd_t *pmd; |
| 797 | pte_t *ptep, pte; | 822 | pte_t *ptep, pte; |
| 823 | spinlock_t *ptl; | ||
| 798 | unsigned long pfn; | 824 | unsigned long pfn; |
| 799 | struct page *page; | 825 | struct page *page; |
| 800 | 826 | ||
| 801 | page = follow_huge_addr(mm, address, write); | 827 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
| 802 | if (! IS_ERR(page)) | 828 | if (!IS_ERR(page)) { |
| 803 | return page; | 829 | BUG_ON(flags & FOLL_GET); |
| 830 | goto out; | ||
| 831 | } | ||
| 804 | 832 | ||
| 833 | page = NULL; | ||
| 805 | pgd = pgd_offset(mm, address); | 834 | pgd = pgd_offset(mm, address); |
| 806 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 835 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
| 807 | goto out; | 836 | goto no_page_table; |
| 808 | 837 | ||
| 809 | pud = pud_offset(pgd, address); | 838 | pud = pud_offset(pgd, address); |
| 810 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | 839 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) |
| 811 | goto out; | 840 | goto no_page_table; |
| 812 | 841 | ||
| 813 | pmd = pmd_offset(pud, address); | 842 | pmd = pmd_offset(pud, address); |
| 814 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | 843 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
| 844 | goto no_page_table; | ||
| 845 | |||
| 846 | if (pmd_huge(*pmd)) { | ||
| 847 | BUG_ON(flags & FOLL_GET); | ||
| 848 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | ||
| 815 | goto out; | 849 | goto out; |
| 816 | if (pmd_huge(*pmd)) | 850 | } |
| 817 | return follow_huge_pmd(mm, address, pmd, write); | ||
| 818 | 851 | ||
| 819 | ptep = pte_offset_map(pmd, address); | 852 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 820 | if (!ptep) | 853 | if (!ptep) |
| 821 | goto out; | 854 | goto out; |
| 822 | 855 | ||
| 823 | pte = *ptep; | 856 | pte = *ptep; |
| 824 | pte_unmap(ptep); | 857 | if (!pte_present(pte)) |
| 825 | if (pte_present(pte)) { | 858 | goto unlock; |
| 826 | if (write && !pte_write(pte)) | 859 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
| 827 | goto out; | 860 | goto unlock; |
| 828 | if (read && !pte_read(pte)) | 861 | pfn = pte_pfn(pte); |
| 829 | goto out; | 862 | if (!pfn_valid(pfn)) |
| 830 | pfn = pte_pfn(pte); | 863 | goto unlock; |
| 831 | if (pfn_valid(pfn)) { | 864 | |
| 832 | page = pfn_to_page(pfn); | 865 | page = pfn_to_page(pfn); |
| 833 | if (accessed) { | 866 | if (flags & FOLL_GET) |
| 834 | if (write && !pte_dirty(pte) &&!PageDirty(page)) | 867 | get_page(page); |
| 835 | set_page_dirty(page); | 868 | if (flags & FOLL_TOUCH) { |
| 836 | mark_page_accessed(page); | 869 | if ((flags & FOLL_WRITE) && |
| 837 | } | 870 | !pte_dirty(pte) && !PageDirty(page)) |
| 838 | return page; | 871 | set_page_dirty(page); |
| 839 | } | 872 | mark_page_accessed(page); |
| 840 | } | 873 | } |
| 841 | 874 | unlock: | |
| 875 | pte_unmap_unlock(ptep, ptl); | ||
| 842 | out: | 876 | out: |
| 843 | return NULL; | 877 | return page; |
| 844 | } | ||
| 845 | |||
| 846 | inline struct page * | ||
| 847 | follow_page(struct mm_struct *mm, unsigned long address, int write) | ||
| 848 | { | ||
| 849 | return __follow_page(mm, address, 0, write, 1); | ||
| 850 | } | ||
| 851 | |||
| 852 | /* | ||
| 853 | * check_user_page_readable() can be called from interrupt context by oprofile, | ||
| 854 | * so we need to avoid taking any non-irq-safe locks | ||
| 855 | */ | ||
| 856 | int check_user_page_readable(struct mm_struct *mm, unsigned long address) | ||
| 857 | { | ||
| 858 | return __follow_page(mm, address, 1, 0, 0) != NULL; | ||
| 859 | } | ||
| 860 | EXPORT_SYMBOL(check_user_page_readable); | ||
| 861 | |||
| 862 | static inline int | ||
| 863 | untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, | ||
| 864 | unsigned long address) | ||
| 865 | { | ||
| 866 | pgd_t *pgd; | ||
| 867 | pud_t *pud; | ||
| 868 | pmd_t *pmd; | ||
| 869 | |||
| 870 | /* Check if the vma is for an anonymous mapping. */ | ||
| 871 | if (vma->vm_ops && vma->vm_ops->nopage) | ||
| 872 | return 0; | ||
| 873 | |||
| 874 | /* Check if page directory entry exists. */ | ||
| 875 | pgd = pgd_offset(mm, address); | ||
| 876 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
| 877 | return 1; | ||
| 878 | |||
| 879 | pud = pud_offset(pgd, address); | ||
| 880 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | ||
| 881 | return 1; | ||
| 882 | |||
| 883 | /* Check if page middle directory entry exists. */ | ||
| 884 | pmd = pmd_offset(pud, address); | ||
| 885 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | ||
| 886 | return 1; | ||
| 887 | 878 | ||
| 888 | /* There is a pte slot for 'address' in 'mm'. */ | 879 | no_page_table: |
| 889 | return 0; | 880 | /* |
| 881 | * When core dumping an enormous anonymous area that nobody | ||
| 882 | * has touched so far, we don't want to allocate page tables. | ||
| 883 | */ | ||
| 884 | if (flags & FOLL_ANON) { | ||
| 885 | page = ZERO_PAGE(address); | ||
| 886 | if (flags & FOLL_GET) | ||
| 887 | get_page(page); | ||
| 888 | BUG_ON(flags & FOLL_WRITE); | ||
| 889 | } | ||
| 890 | return page; | ||
| 890 | } | 891 | } |
| 891 | 892 | ||
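The rewritten follow_page() replaces the old read/write/accessed arguments with a flags word; below is a hedged sketch of a lookup using the FOLL_GET and FOLL_TOUCH flags from this series (the helper name is hypothetical):

```c
/*
 * Hypothetical helper: look up the page backing one user address.
 * FOLL_GET makes follow_page() return the page with a reference held,
 * so the caller must put_page() it; FOLL_TOUCH marks it accessed.
 */
static struct page *peek_user_page(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;

	down_read(&mm->mmap_sem);
	page = follow_page(mm, addr, FOLL_GET | FOLL_TOUCH);
	up_read(&mm->mmap_sem);
	return page;		/* NULL if nothing is mapped there */
}
```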
| 892 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 893 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| @@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 894 | struct page **pages, struct vm_area_struct **vmas) | 895 | struct page **pages, struct vm_area_struct **vmas) |
| 895 | { | 896 | { |
| 896 | int i; | 897 | int i; |
| 897 | unsigned int flags; | 898 | unsigned int vm_flags; |
| 898 | 899 | ||
| 899 | /* | 900 | /* |
| 900 | * Require read or write permissions. | 901 | * Require read or write permissions. |
| 901 | * If 'force' is set, we only require the "MAY" flags. | 902 | * If 'force' is set, we only require the "MAY" flags. |
| 902 | */ | 903 | */ |
| 903 | flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 904 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
| 904 | flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 905 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
| 905 | i = 0; | 906 | i = 0; |
| 906 | 907 | ||
| 907 | do { | 908 | do { |
| 908 | struct vm_area_struct * vma; | 909 | struct vm_area_struct *vma; |
| 910 | unsigned int foll_flags; | ||
| 909 | 911 | ||
| 910 | vma = find_extend_vma(mm, start); | 912 | vma = find_extend_vma(mm, start); |
| 911 | if (!vma && in_gate_area(tsk, start)) { | 913 | if (!vma && in_gate_area(tsk, start)) { |
| @@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 945 | continue; | 947 | continue; |
| 946 | } | 948 | } |
| 947 | 949 | ||
| 948 | if (!vma || (vma->vm_flags & VM_IO) | 950 | if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) |
| 949 | || !(flags & vma->vm_flags)) | 951 | || !(vm_flags & vma->vm_flags)) |
| 950 | return i ? : -EFAULT; | 952 | return i ? : -EFAULT; |
| 951 | 953 | ||
| 952 | if (is_vm_hugetlb_page(vma)) { | 954 | if (is_vm_hugetlb_page(vma)) { |
| @@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 954 | &start, &len, i); | 956 | &start, &len, i); |
| 955 | continue; | 957 | continue; |
| 956 | } | 958 | } |
| 957 | spin_lock(&mm->page_table_lock); | 959 | |
| 960 | foll_flags = FOLL_TOUCH; | ||
| 961 | if (pages) | ||
| 962 | foll_flags |= FOLL_GET; | ||
| 963 | if (!write && !(vma->vm_flags & VM_LOCKED) && | ||
| 964 | (!vma->vm_ops || !vma->vm_ops->nopage)) | ||
| 965 | foll_flags |= FOLL_ANON; | ||
| 966 | |||
| 958 | do { | 967 | do { |
| 959 | int write_access = write; | ||
| 960 | struct page *page; | 968 | struct page *page; |
| 961 | 969 | ||
| 962 | cond_resched_lock(&mm->page_table_lock); | 970 | if (write) |
| 963 | while (!(page = follow_page(mm, start, write_access))) { | 971 | foll_flags |= FOLL_WRITE; |
| 964 | int ret; | ||
| 965 | |||
| 966 | /* | ||
| 967 | * Shortcut for anonymous pages. We don't want | ||
| 968 | * to force the creation of pages tables for | ||
| 969 | * insanely big anonymously mapped areas that | ||
| 970 | * nobody touched so far. This is important | ||
| 971 | * for doing a core dump for these mappings. | ||
| 972 | */ | ||
| 973 | if (!write && untouched_anonymous_page(mm,vma,start)) { | ||
| 974 | page = ZERO_PAGE(start); | ||
| 975 | break; | ||
| 976 | } | ||
| 977 | spin_unlock(&mm->page_table_lock); | ||
| 978 | ret = __handle_mm_fault(mm, vma, start, write_access); | ||
| 979 | 972 | ||
| 973 | cond_resched(); | ||
| 974 | while (!(page = follow_page(mm, start, foll_flags))) { | ||
| 975 | int ret; | ||
| 976 | ret = __handle_mm_fault(mm, vma, start, | ||
| 977 | foll_flags & FOLL_WRITE); | ||
| 980 | /* | 978 | /* |
| 981 | * The VM_FAULT_WRITE bit tells us that do_wp_page has | 979 | * The VM_FAULT_WRITE bit tells us that do_wp_page has |
| 982 | * broken COW when necessary, even if maybe_mkwrite | 980 | * broken COW when necessary, even if maybe_mkwrite |
| @@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 984 | * subsequent page lookups as if they were reads. | 982 | * subsequent page lookups as if they were reads. |
| 985 | */ | 983 | */ |
| 986 | if (ret & VM_FAULT_WRITE) | 984 | if (ret & VM_FAULT_WRITE) |
| 987 | write_access = 0; | 985 | foll_flags &= ~FOLL_WRITE; |
| 988 | 986 | ||
| 989 | switch (ret & ~VM_FAULT_WRITE) { | 987 | switch (ret & ~VM_FAULT_WRITE) { |
| 990 | case VM_FAULT_MINOR: | 988 | case VM_FAULT_MINOR: |
| @@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1000 | default: | 998 | default: |
| 1001 | BUG(); | 999 | BUG(); |
| 1002 | } | 1000 | } |
| 1003 | spin_lock(&mm->page_table_lock); | ||
| 1004 | } | 1001 | } |
| 1005 | if (pages) { | 1002 | if (pages) { |
| 1006 | pages[i] = page; | 1003 | pages[i] = page; |
| 1007 | flush_dcache_page(page); | 1004 | flush_dcache_page(page); |
| 1008 | if (!PageReserved(page)) | ||
| 1009 | page_cache_get(page); | ||
| 1010 | } | 1005 | } |
| 1011 | if (vmas) | 1006 | if (vmas) |
| 1012 | vmas[i] = vma; | 1007 | vmas[i] = vma; |
| @@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1014 | start += PAGE_SIZE; | 1009 | start += PAGE_SIZE; |
| 1015 | len--; | 1010 | len--; |
| 1016 | } while (len && start < vma->vm_end); | 1011 | } while (len && start < vma->vm_end); |
| 1017 | spin_unlock(&mm->page_table_lock); | ||
| 1018 | } while (len); | 1012 | } while (len); |
| 1019 | return i; | 1013 | return i; |
| 1020 | } | 1014 | } |
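With the page_table_lock gone from this path, get_user_pages() is called under mmap_sem alone, and every returned page now carries a plain reference (the old PageReserved special case is gone). A sketch of pinning and releasing a user buffer; names and the error handling are illustrative:

```c
/*
 * Illustrative: pin 'nr' pages of the current task's buffer for I/O,
 * then drop them.  After this patch the release is unconditional --
 * no PageReserved check is needed before page_cache_release().
 */
static int with_pinned_pages(unsigned long uaddr, int nr, int write,
			     struct page **pages)
{
	int i, got;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr, nr,
			     write, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);
	if (got <= 0)
		return got ? got : -EFAULT;

	/* ... do the I/O against pages[0..got-1] here ... */

	for (i = 0; i < got; i++)
		page_cache_release(pages[i]);
	return got;
}
```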
| @@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 1024 | unsigned long addr, unsigned long end, pgprot_t prot) | 1018 | unsigned long addr, unsigned long end, pgprot_t prot) |
| 1025 | { | 1019 | { |
| 1026 | pte_t *pte; | 1020 | pte_t *pte; |
| 1021 | spinlock_t *ptl; | ||
| 1027 | 1022 | ||
| 1028 | pte = pte_alloc_map(mm, pmd, addr); | 1023 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
| 1029 | if (!pte) | 1024 | if (!pte) |
| 1030 | return -ENOMEM; | 1025 | return -ENOMEM; |
| 1031 | do { | 1026 | do { |
| 1032 | pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); | 1027 | struct page *page = ZERO_PAGE(addr); |
| 1028 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | ||
| 1029 | page_cache_get(page); | ||
| 1030 | page_add_file_rmap(page); | ||
| 1031 | inc_mm_counter(mm, file_rss); | ||
| 1033 | BUG_ON(!pte_none(*pte)); | 1032 | BUG_ON(!pte_none(*pte)); |
| 1034 | set_pte_at(mm, addr, pte, zero_pte); | 1033 | set_pte_at(mm, addr, pte, zero_pte); |
| 1035 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1034 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 1036 | pte_unmap(pte - 1); | 1035 | pte_unmap_unlock(pte - 1, ptl); |
| 1037 | return 0; | 1036 | return 0; |
| 1038 | } | 1037 | } |
| 1039 | 1038 | ||
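zeromap_pte_range() above shows the split-ptlock idiom this patch converts pte walkers to: pte_alloc_map_lock() maps the pte page and takes its lock (mm-wide or per page table, depending on CONFIG_SPLIT_PTLOCK_CPUS), and pte_unmap_unlock() undoes both. A generic sketch of the pattern, with a hypothetical per-pte action:

```c
/*
 * Sketch of the pte_alloc_map_lock()/pte_unmap_unlock() pattern.
 * handle_one_pte() is hypothetical and stands in for whatever the
 * walker does to each entry while the pte lock is held.
 */
static int walk_pte_range(struct mm_struct *mm, pmd_t *pmd,
			  unsigned long addr, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	do {
		handle_one_pte(mm, addr, pte);	/* hypothetical callback */
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}
```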
| @@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma, | |||
| 1083 | BUG_ON(addr >= end); | 1082 | BUG_ON(addr >= end); |
| 1084 | pgd = pgd_offset(mm, addr); | 1083 | pgd = pgd_offset(mm, addr); |
| 1085 | flush_cache_range(vma, addr, end); | 1084 | flush_cache_range(vma, addr, end); |
| 1086 | spin_lock(&mm->page_table_lock); | ||
| 1087 | do { | 1085 | do { |
| 1088 | next = pgd_addr_end(addr, end); | 1086 | next = pgd_addr_end(addr, end); |
| 1089 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | 1087 | err = zeromap_pud_range(mm, pgd, addr, next, prot); |
| 1090 | if (err) | 1088 | if (err) |
| 1091 | break; | 1089 | break; |
| 1092 | } while (pgd++, addr = next, addr != end); | 1090 | } while (pgd++, addr = next, addr != end); |
| 1093 | spin_unlock(&mm->page_table_lock); | ||
| 1094 | return err; | 1091 | return err; |
| 1095 | } | 1092 | } |
| 1096 | 1093 | ||
| @@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 1104 | unsigned long pfn, pgprot_t prot) | 1101 | unsigned long pfn, pgprot_t prot) |
| 1105 | { | 1102 | { |
| 1106 | pte_t *pte; | 1103 | pte_t *pte; |
| 1104 | spinlock_t *ptl; | ||
| 1107 | 1105 | ||
| 1108 | pte = pte_alloc_map(mm, pmd, addr); | 1106 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
| 1109 | if (!pte) | 1107 | if (!pte) |
| 1110 | return -ENOMEM; | 1108 | return -ENOMEM; |
| 1111 | do { | 1109 | do { |
| 1112 | BUG_ON(!pte_none(*pte)); | 1110 | BUG_ON(!pte_none(*pte)); |
| 1113 | if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) | 1111 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); |
| 1114 | set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); | ||
| 1115 | pfn++; | 1112 | pfn++; |
| 1116 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1113 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 1117 | pte_unmap(pte - 1); | 1114 | pte_unmap_unlock(pte - 1, ptl); |
| 1118 | return 0; | 1115 | return 0; |
| 1119 | } | 1116 | } |
| 1120 | 1117 | ||
| @@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
| 1173 | * rest of the world about it: | 1170 | * rest of the world about it: |
| 1174 | * VM_IO tells people not to look at these pages | 1171 | * VM_IO tells people not to look at these pages |
| 1175 | * (accesses can have side effects). | 1172 | * (accesses can have side effects). |
| 1176 | * VM_RESERVED tells swapout not to try to touch | 1173 | * VM_RESERVED tells the core MM not to "manage" these pages |
| 1177 | * this region. | 1174 | * (e.g. refcount, mapcount, try to swap them out). |
| 1178 | */ | 1175 | */ |
| 1179 | vma->vm_flags |= VM_IO | VM_RESERVED; | 1176 | vma->vm_flags |= VM_IO | VM_RESERVED; |
| 1180 | 1177 | ||
| @@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
| 1182 | pfn -= addr >> PAGE_SHIFT; | 1179 | pfn -= addr >> PAGE_SHIFT; |
| 1183 | pgd = pgd_offset(mm, addr); | 1180 | pgd = pgd_offset(mm, addr); |
| 1184 | flush_cache_range(vma, addr, end); | 1181 | flush_cache_range(vma, addr, end); |
| 1185 | spin_lock(&mm->page_table_lock); | ||
| 1186 | do { | 1182 | do { |
| 1187 | next = pgd_addr_end(addr, end); | 1183 | next = pgd_addr_end(addr, end); |
| 1188 | err = remap_pud_range(mm, pgd, addr, next, | 1184 | err = remap_pud_range(mm, pgd, addr, next, |
| @@ -1190,12 +1186,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
| 1190 | if (err) | 1186 | if (err) |
| 1191 | break; | 1187 | break; |
| 1192 | } while (pgd++, addr = next, addr != end); | 1188 | } while (pgd++, addr = next, addr != end); |
| 1193 | spin_unlock(&mm->page_table_lock); | ||
| 1194 | return err; | 1189 | return err; |
| 1195 | } | 1190 | } |
| 1196 | EXPORT_SYMBOL(remap_pfn_range); | 1191 | EXPORT_SYMBOL(remap_pfn_range); |
| 1197 | 1192 | ||
| 1198 | /* | 1193 | /* |
| 1194 | * handle_pte_fault chooses page fault handler according to an entry | ||
| 1195 | * which was read non-atomically. Before making any commitment, on | ||
| 1196 | * those architectures or configurations (e.g. i386 with PAE) which | ||
| 1197 | * might give a mix of unmatched parts, do_swap_page and do_file_page | ||
| 1198 | * must check under lock before unmapping the pte and proceeding | ||
| 1199 | * (but do_wp_page is only called after already making such a check; | ||
| 1200 | * and do_anonymous_page and do_no_page can safely check later on). | ||
| 1201 | */ | ||
| 1202 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | ||
| 1203 | pte_t *page_table, pte_t orig_pte) | ||
| 1204 | { | ||
| 1205 | int same = 1; | ||
| 1206 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | ||
| 1207 | if (sizeof(pte_t) > sizeof(unsigned long)) { | ||
| 1208 | spinlock_t *ptl = pte_lockptr(mm, pmd); | ||
| 1209 | spin_lock(ptl); | ||
| 1210 | same = pte_same(*page_table, orig_pte); | ||
| 1211 | spin_unlock(ptl); | ||
| 1212 | } | ||
| 1213 | #endif | ||
| 1214 | pte_unmap(page_table); | ||
| 1215 | return same; | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | /* | ||
| 1199 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | 1219 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when |
| 1200 | * servicing faults for write access. In the normal case, do always want | 1220 | * servicing faults for write access. In the normal case, do always want |
| 1201 | * pte_mkwrite. But get_user_pages can cause write faults for mappings | 1221 | * pte_mkwrite. But get_user_pages can cause write faults for mappings |
| @@ -1209,28 +1229,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
| 1209 | } | 1229 | } |
| 1210 | 1230 | ||
| 1211 | /* | 1231 | /* |
| 1212 | * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock | ||
| 1213 | */ | ||
| 1214 | static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, | ||
| 1215 | pte_t *page_table) | ||
| 1216 | { | ||
| 1217 | pte_t entry; | ||
| 1218 | |||
| 1219 | entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), | ||
| 1220 | vma); | ||
| 1221 | ptep_establish(vma, address, page_table, entry); | ||
| 1222 | update_mmu_cache(vma, address, entry); | ||
| 1223 | lazy_mmu_prot_update(entry); | ||
| 1224 | } | ||
| 1225 | |||
| 1226 | /* | ||
| 1227 | * This routine handles present pages, when users try to write | 1232 | * This routine handles present pages, when users try to write |
| 1228 | * to a shared page. It is done by copying the page to a new address | 1233 | * to a shared page. It is done by copying the page to a new address |
| 1229 | * and decrementing the shared-page counter for the old page. | 1234 | * and decrementing the shared-page counter for the old page. |
| 1230 | * | 1235 | * |
| 1231 | * Goto-purists beware: the only reason for goto's here is that it results | ||
| 1232 | * in better assembly code.. The "default" path will see no jumps at all. | ||
| 1233 | * | ||
| 1234 | * Note that this routine assumes that the protection checks have been | 1236 | * Note that this routine assumes that the protection checks have been |
| 1235 | * done by the caller (the low-level page fault routine in most cases). | 1237 | * done by the caller (the low-level page fault routine in most cases). |
| 1236 | * Thus we can safely just mark it writable once we've done any necessary | 1238 | * Thus we can safely just mark it writable once we've done any necessary |
| @@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page | |||
| 1240 | * change only once the write actually happens. This avoids a few races, | 1242 | * change only once the write actually happens. This avoids a few races, |
| 1241 | * and potentially makes it more efficient. | 1243 | * and potentially makes it more efficient. |
| 1242 | * | 1244 | * |
| 1243 | * We hold the mm semaphore and the page_table_lock on entry and exit | 1245 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
| 1244 | * with the page_table_lock released. | 1246 | * but allow concurrent faults), with pte both mapped and locked. |
| 1247 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 1245 | */ | 1248 | */ |
| 1246 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | 1249 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1247 | unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) | 1250 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 1251 | spinlock_t *ptl, pte_t orig_pte) | ||
| 1248 | { | 1252 | { |
| 1249 | struct page *old_page, *new_page; | 1253 | struct page *old_page, *new_page; |
| 1250 | unsigned long pfn = pte_pfn(pte); | 1254 | unsigned long pfn = pte_pfn(orig_pte); |
| 1251 | pte_t entry; | 1255 | pte_t entry; |
| 1252 | int ret; | 1256 | int ret = VM_FAULT_MINOR; |
| 1257 | |||
| 1258 | BUG_ON(vma->vm_flags & VM_RESERVED); | ||
| 1253 | 1259 | ||
| 1254 | if (unlikely(!pfn_valid(pfn))) { | 1260 | if (unlikely(!pfn_valid(pfn))) { |
| 1255 | /* | 1261 | /* |
| 1256 | * This should really halt the system so it can be debugged or | 1262 | * Page table corrupted: show pte and kill process. |
| 1257 | * at least the kernel stops what it's doing before it corrupts | ||
| 1258 | * data, but for the moment just pretend this is OOM. | ||
| 1259 | */ | 1263 | */ |
| 1260 | pte_unmap(page_table); | 1264 | print_bad_pte(vma, orig_pte, address); |
| 1261 | printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", | 1265 | ret = VM_FAULT_OOM; |
| 1262 | address); | 1266 | goto unlock; |
| 1263 | spin_unlock(&mm->page_table_lock); | ||
| 1264 | return VM_FAULT_OOM; | ||
| 1265 | } | 1267 | } |
| 1266 | old_page = pfn_to_page(pfn); | 1268 | old_page = pfn_to_page(pfn); |
| 1267 | 1269 | ||
| @@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | |||
| 1270 | unlock_page(old_page); | 1272 | unlock_page(old_page); |
| 1271 | if (reuse) { | 1273 | if (reuse) { |
| 1272 | flush_cache_page(vma, address, pfn); | 1274 | flush_cache_page(vma, address, pfn); |
| 1273 | entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), | 1275 | entry = pte_mkyoung(orig_pte); |
| 1274 | vma); | 1276 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1275 | ptep_set_access_flags(vma, address, page_table, entry, 1); | 1277 | ptep_set_access_flags(vma, address, page_table, entry, 1); |
| 1276 | update_mmu_cache(vma, address, entry); | 1278 | update_mmu_cache(vma, address, entry); |
| 1277 | lazy_mmu_prot_update(entry); | 1279 | lazy_mmu_prot_update(entry); |
| 1278 | pte_unmap(page_table); | 1280 | ret |= VM_FAULT_WRITE; |
| 1279 | spin_unlock(&mm->page_table_lock); | 1281 | goto unlock; |
| 1280 | return VM_FAULT_MINOR|VM_FAULT_WRITE; | ||
| 1281 | } | 1282 | } |
| 1282 | } | 1283 | } |
| 1283 | pte_unmap(page_table); | ||
| 1284 | 1284 | ||
| 1285 | /* | 1285 | /* |
| 1286 | * Ok, we need to copy. Oh, well.. | 1286 | * Ok, we need to copy. Oh, well.. |
| 1287 | */ | 1287 | */ |
| 1288 | if (!PageReserved(old_page)) | 1288 | page_cache_get(old_page); |
| 1289 | page_cache_get(old_page); | 1289 | pte_unmap_unlock(page_table, ptl); |
| 1290 | spin_unlock(&mm->page_table_lock); | ||
| 1291 | 1290 | ||
| 1292 | if (unlikely(anon_vma_prepare(vma))) | 1291 | if (unlikely(anon_vma_prepare(vma))) |
| 1293 | goto no_new_page; | 1292 | goto oom; |
| 1294 | if (old_page == ZERO_PAGE(address)) { | 1293 | if (old_page == ZERO_PAGE(address)) { |
| 1295 | new_page = alloc_zeroed_user_highpage(vma, address); | 1294 | new_page = alloc_zeroed_user_highpage(vma, address); |
| 1296 | if (!new_page) | 1295 | if (!new_page) |
| 1297 | goto no_new_page; | 1296 | goto oom; |
| 1298 | } else { | 1297 | } else { |
| 1299 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 1298 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
| 1300 | if (!new_page) | 1299 | if (!new_page) |
| 1301 | goto no_new_page; | 1300 | goto oom; |
| 1302 | copy_user_highpage(new_page, old_page, address); | 1301 | copy_user_highpage(new_page, old_page, address); |
| 1303 | } | 1302 | } |
| 1303 | |||
| 1304 | /* | 1304 | /* |
| 1305 | * Re-check the pte - we dropped the lock | 1305 | * Re-check the pte - we dropped the lock |
| 1306 | */ | 1306 | */ |
| 1307 | ret = VM_FAULT_MINOR; | 1307 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 1308 | spin_lock(&mm->page_table_lock); | 1308 | if (likely(pte_same(*page_table, orig_pte))) { |
| 1309 | page_table = pte_offset_map(pmd, address); | 1309 | page_remove_rmap(old_page); |
| 1310 | if (likely(pte_same(*page_table, pte))) { | 1310 | if (!PageAnon(old_page)) { |
| 1311 | if (PageAnon(old_page)) | 1311 | inc_mm_counter(mm, anon_rss); |
| 1312 | dec_mm_counter(mm, anon_rss); | 1312 | dec_mm_counter(mm, file_rss); |
| 1313 | if (PageReserved(old_page)) | 1313 | } |
| 1314 | inc_mm_counter(mm, rss); | ||
| 1315 | else | ||
| 1316 | page_remove_rmap(old_page); | ||
| 1317 | flush_cache_page(vma, address, pfn); | 1314 | flush_cache_page(vma, address, pfn); |
| 1318 | break_cow(vma, new_page, address, page_table); | 1315 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 1316 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 1317 | ptep_establish(vma, address, page_table, entry); | ||
| 1318 | update_mmu_cache(vma, address, entry); | ||
| 1319 | lazy_mmu_prot_update(entry); | ||
| 1319 | lru_cache_add_active(new_page); | 1320 | lru_cache_add_active(new_page); |
| 1320 | page_add_anon_rmap(new_page, vma, address); | 1321 | page_add_anon_rmap(new_page, vma, address); |
| 1321 | 1322 | ||
| @@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | |||
| 1323 | new_page = old_page; | 1324 | new_page = old_page; |
| 1324 | ret |= VM_FAULT_WRITE; | 1325 | ret |= VM_FAULT_WRITE; |
| 1325 | } | 1326 | } |
| 1326 | pte_unmap(page_table); | ||
| 1327 | page_cache_release(new_page); | 1327 | page_cache_release(new_page); |
| 1328 | page_cache_release(old_page); | 1328 | page_cache_release(old_page); |
| 1329 | spin_unlock(&mm->page_table_lock); | 1329 | unlock: |
| 1330 | pte_unmap_unlock(page_table, ptl); | ||
| 1330 | return ret; | 1331 | return ret; |
| 1331 | 1332 | oom: | |
| 1332 | no_new_page: | ||
| 1333 | page_cache_release(old_page); | 1333 | page_cache_release(old_page); |
| 1334 | return VM_FAULT_OOM; | 1334 | return VM_FAULT_OOM; |
| 1335 | } | 1335 | } |
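do_wp_page() now works entirely under the per-page-table lock: it drops the lock to allocate, then retakes it and re-checks pte_same() before committing, since another thread may have serviced the fault meanwhile. A reduced sketch of that check-after-relock discipline; rmap and counter updates are omitted, and make_replacement_pte() is a hypothetical stand-in for the real COW pte construction:

```c
/*
 * Reduced sketch of the unlock/allocate/relock/pte_same() discipline
 * used by the rewritten fault handlers.  make_replacement_pte() is
 * hypothetical; accounting and rmap updates are left out.
 */
static int cow_like_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmd, pte_t orig_pte)
{
	struct page *new_page;
	spinlock_t *ptl;
	pte_t *page_table;

	new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
	if (!new_page)
		return VM_FAULT_OOM;

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte)))
		set_pte_at(mm, address, page_table,
			   make_replacement_pte(vma, new_page));
	else
		page_cache_release(new_page);	/* lost the race, back out */
	pte_unmap_unlock(page_table, ptl);
	return VM_FAULT_MINOR;
}
```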
| @@ -1399,13 +1399,6 @@ again: | |||
| 1399 | 1399 | ||
| 1400 | restart_addr = zap_page_range(vma, start_addr, | 1400 | restart_addr = zap_page_range(vma, start_addr, |
| 1401 | end_addr - start_addr, details); | 1401 | end_addr - start_addr, details); |
| 1402 | |||
| 1403 | /* | ||
| 1404 | * We cannot rely on the break test in unmap_vmas: | ||
| 1405 | * on the one hand, we don't want to restart our loop | ||
| 1406 | * just because that broke out for the page_table_lock; | ||
| 1407 | * on the other hand, it does no test when vma is small. | ||
| 1408 | */ | ||
| 1409 | need_break = need_resched() || | 1402 | need_break = need_resched() || |
| 1410 | need_lockbreak(details->i_mmap_lock); | 1403 | need_lockbreak(details->i_mmap_lock); |
| 1411 | 1404 | ||
| @@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc | |||
| 1654 | } | 1647 | } |
| 1655 | 1648 | ||
| 1656 | /* | 1649 | /* |
| 1657 | * We hold the mm semaphore and the page_table_lock on entry and | 1650 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
| 1658 | * should release the pagetable lock on exit.. | 1651 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 1652 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 1659 | */ | 1653 | */ |
| 1660 | static int do_swap_page(struct mm_struct * mm, | 1654 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1661 | struct vm_area_struct * vma, unsigned long address, | 1655 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 1662 | pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) | 1656 | int write_access, pte_t orig_pte) |
| 1663 | { | 1657 | { |
| 1658 | spinlock_t *ptl; | ||
| 1664 | struct page *page; | 1659 | struct page *page; |
| 1665 | swp_entry_t entry = pte_to_swp_entry(orig_pte); | 1660 | swp_entry_t entry; |
| 1666 | pte_t pte; | 1661 | pte_t pte; |
| 1667 | int ret = VM_FAULT_MINOR; | 1662 | int ret = VM_FAULT_MINOR; |
| 1668 | 1663 | ||
| 1669 | pte_unmap(page_table); | 1664 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
| 1670 | spin_unlock(&mm->page_table_lock); | 1665 | goto out; |
| 1666 | |||
| 1667 | entry = pte_to_swp_entry(orig_pte); | ||
| 1671 | page = lookup_swap_cache(entry); | 1668 | page = lookup_swap_cache(entry); |
| 1672 | if (!page) { | 1669 | if (!page) { |
| 1673 | swapin_readahead(entry, address, vma); | 1670 | swapin_readahead(entry, address, vma); |
| 1674 | page = read_swap_cache_async(entry, vma, address); | 1671 | page = read_swap_cache_async(entry, vma, address); |
| 1675 | if (!page) { | 1672 | if (!page) { |
| 1676 | /* | 1673 | /* |
| 1677 | * Back out if somebody else faulted in this pte while | 1674 | * Back out if somebody else faulted in this pte |
| 1678 | * we released the page table lock. | 1675 | * while we released the pte lock. |
| 1679 | */ | 1676 | */ |
| 1680 | spin_lock(&mm->page_table_lock); | 1677 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 1681 | page_table = pte_offset_map(pmd, address); | ||
| 1682 | if (likely(pte_same(*page_table, orig_pte))) | 1678 | if (likely(pte_same(*page_table, orig_pte))) |
| 1683 | ret = VM_FAULT_OOM; | 1679 | ret = VM_FAULT_OOM; |
| 1684 | else | 1680 | goto unlock; |
| 1685 | ret = VM_FAULT_MINOR; | ||
| 1686 | pte_unmap(page_table); | ||
| 1687 | spin_unlock(&mm->page_table_lock); | ||
| 1688 | goto out; | ||
| 1689 | } | 1681 | } |
| 1690 | 1682 | ||
| 1691 | /* Had to read the page from swap area: Major fault */ | 1683 | /* Had to read the page from swap area: Major fault */ |
| @@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm, | |||
| 1698 | lock_page(page); | 1690 | lock_page(page); |
| 1699 | 1691 | ||
| 1700 | /* | 1692 | /* |
| 1701 | * Back out if somebody else faulted in this pte while we | 1693 | * Back out if somebody else already faulted in this pte. |
| 1702 | * released the page table lock. | ||
| 1703 | */ | 1694 | */ |
| 1704 | spin_lock(&mm->page_table_lock); | 1695 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 1705 | page_table = pte_offset_map(pmd, address); | 1696 | if (unlikely(!pte_same(*page_table, orig_pte))) |
| 1706 | if (unlikely(!pte_same(*page_table, orig_pte))) { | ||
| 1707 | ret = VM_FAULT_MINOR; | ||
| 1708 | goto out_nomap; | 1697 | goto out_nomap; |
| 1709 | } | ||
| 1710 | 1698 | ||
| 1711 | if (unlikely(!PageUptodate(page))) { | 1699 | if (unlikely(!PageUptodate(page))) { |
| 1712 | ret = VM_FAULT_SIGBUS; | 1700 | ret = VM_FAULT_SIGBUS; |
| @@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm, | |||
| 1715 | 1703 | ||
| 1716 | /* The page isn't present yet, go ahead with the fault. */ | 1704 | /* The page isn't present yet, go ahead with the fault. */ |
| 1717 | 1705 | ||
| 1718 | inc_mm_counter(mm, rss); | 1706 | inc_mm_counter(mm, anon_rss); |
| 1719 | pte = mk_pte(page, vma->vm_page_prot); | 1707 | pte = mk_pte(page, vma->vm_page_prot); |
| 1720 | if (write_access && can_share_swap_page(page)) { | 1708 | if (write_access && can_share_swap_page(page)) { |
| 1721 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 1709 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
| @@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm, | |||
| 1733 | 1721 | ||
| 1734 | if (write_access) { | 1722 | if (write_access) { |
| 1735 | if (do_wp_page(mm, vma, address, | 1723 | if (do_wp_page(mm, vma, address, |
| 1736 | page_table, pmd, pte) == VM_FAULT_OOM) | 1724 | page_table, pmd, ptl, pte) == VM_FAULT_OOM) |
| 1737 | ret = VM_FAULT_OOM; | 1725 | ret = VM_FAULT_OOM; |
| 1738 | goto out; | 1726 | goto out; |
| 1739 | } | 1727 | } |
| @@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm, | |||
| 1741 | /* No need to invalidate - it was non-present before */ | 1729 | /* No need to invalidate - it was non-present before */ |
| 1742 | update_mmu_cache(vma, address, pte); | 1730 | update_mmu_cache(vma, address, pte); |
| 1743 | lazy_mmu_prot_update(pte); | 1731 | lazy_mmu_prot_update(pte); |
| 1744 | pte_unmap(page_table); | 1732 | unlock: |
| 1745 | spin_unlock(&mm->page_table_lock); | 1733 | pte_unmap_unlock(page_table, ptl); |
| 1746 | out: | 1734 | out: |
| 1747 | return ret; | 1735 | return ret; |
| 1748 | out_nomap: | 1736 | out_nomap: |
| 1749 | pte_unmap(page_table); | 1737 | pte_unmap_unlock(page_table, ptl); |
| 1750 | spin_unlock(&mm->page_table_lock); | ||
| 1751 | unlock_page(page); | 1738 | unlock_page(page); |
| 1752 | page_cache_release(page); | 1739 | page_cache_release(page); |
| 1753 | goto out; | 1740 | return ret; |
| 1754 | } | 1741 | } |
| 1755 | 1742 | ||
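Note the accounting change: the single rss counter is split into file_rss and anon_rss throughout this patch, so the swap-in above now charges anon_rss. Where a total is still wanted, the two counters are simply summed, e.g.:

```c
/* Illustrative: total resident pages from the split rss counters. */
static unsigned long mm_total_rss(struct mm_struct *mm)
{
	return get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss);
}
```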
| 1756 | /* | 1743 | /* |
| 1757 | * We are called with the MM semaphore and page_table_lock | 1744 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
| 1758 | * spinlock held to protect against concurrent faults in | 1745 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 1759 | * multithreaded programs. | 1746 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
| 1760 | */ | 1747 | */ |
| 1761 | static int | 1748 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1762 | do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1749 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 1763 | pte_t *page_table, pmd_t *pmd, int write_access, | 1750 | int write_access) |
| 1764 | unsigned long addr) | ||
| 1765 | { | 1751 | { |
| 1752 | struct page *page; | ||
| 1753 | spinlock_t *ptl; | ||
| 1766 | pte_t entry; | 1754 | pte_t entry; |
| 1767 | struct page * page = ZERO_PAGE(addr); | ||
| 1768 | |||
| 1769 | /* Read-only mapping of ZERO_PAGE. */ | ||
| 1770 | entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); | ||
| 1771 | 1755 | ||
| 1772 | /* ..except if it's a write access */ | ||
| 1773 | if (write_access) { | 1756 | if (write_access) { |
| 1774 | /* Allocate our own private page. */ | 1757 | /* Allocate our own private page. */ |
| 1775 | pte_unmap(page_table); | 1758 | pte_unmap(page_table); |
| 1776 | spin_unlock(&mm->page_table_lock); | ||
| 1777 | 1759 | ||
| 1778 | if (unlikely(anon_vma_prepare(vma))) | 1760 | if (unlikely(anon_vma_prepare(vma))) |
| 1779 | goto no_mem; | 1761 | goto oom; |
| 1780 | page = alloc_zeroed_user_highpage(vma, addr); | 1762 | page = alloc_zeroed_user_highpage(vma, address); |
| 1781 | if (!page) | 1763 | if (!page) |
| 1782 | goto no_mem; | 1764 | goto oom; |
| 1783 | 1765 | ||
| 1784 | spin_lock(&mm->page_table_lock); | 1766 | entry = mk_pte(page, vma->vm_page_prot); |
| 1785 | page_table = pte_offset_map(pmd, addr); | 1767 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1786 | 1768 | ||
| 1787 | if (!pte_none(*page_table)) { | 1769 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 1788 | pte_unmap(page_table); | 1770 | if (!pte_none(*page_table)) |
| 1789 | page_cache_release(page); | 1771 | goto release; |
| 1790 | spin_unlock(&mm->page_table_lock); | 1772 | inc_mm_counter(mm, anon_rss); |
| 1791 | goto out; | ||
| 1792 | } | ||
| 1793 | inc_mm_counter(mm, rss); | ||
| 1794 | entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, | ||
| 1795 | vma->vm_page_prot)), | ||
| 1796 | vma); | ||
| 1797 | lru_cache_add_active(page); | 1773 | lru_cache_add_active(page); |
| 1798 | SetPageReferenced(page); | 1774 | SetPageReferenced(page); |
| 1799 | page_add_anon_rmap(page, vma, addr); | 1775 | page_add_anon_rmap(page, vma, address); |
| 1776 | } else { | ||
| 1777 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | ||
| 1778 | page = ZERO_PAGE(address); | ||
| 1779 | page_cache_get(page); | ||
| 1780 | entry = mk_pte(page, vma->vm_page_prot); | ||
| 1781 | |||
| 1782 | ptl = pte_lockptr(mm, pmd); | ||
| 1783 | spin_lock(ptl); | ||
| 1784 | if (!pte_none(*page_table)) | ||
| 1785 | goto release; | ||
| 1786 | inc_mm_counter(mm, file_rss); | ||
| 1787 | page_add_file_rmap(page); | ||
| 1800 | } | 1788 | } |
| 1801 | 1789 | ||
| 1802 | set_pte_at(mm, addr, page_table, entry); | 1790 | set_pte_at(mm, address, page_table, entry); |
| 1803 | pte_unmap(page_table); | ||
| 1804 | 1791 | ||
| 1805 | /* No need to invalidate - it was non-present before */ | 1792 | /* No need to invalidate - it was non-present before */ |
| 1806 | update_mmu_cache(vma, addr, entry); | 1793 | update_mmu_cache(vma, address, entry); |
| 1807 | lazy_mmu_prot_update(entry); | 1794 | lazy_mmu_prot_update(entry); |
| 1808 | spin_unlock(&mm->page_table_lock); | 1795 | unlock: |
| 1809 | out: | 1796 | pte_unmap_unlock(page_table, ptl); |
| 1810 | return VM_FAULT_MINOR; | 1797 | return VM_FAULT_MINOR; |
| 1811 | no_mem: | 1798 | release: |
| 1799 | page_cache_release(page); | ||
| 1800 | goto unlock; | ||
| 1801 | oom: | ||
| 1812 | return VM_FAULT_OOM; | 1802 | return VM_FAULT_OOM; |
| 1813 | } | 1803 | } |
| 1814 | 1804 | ||
| @@ -1821,25 +1811,23 @@ no_mem: | |||
| 1821 | * As this is called only for pages that do not currently exist, we | 1811 | * As this is called only for pages that do not currently exist, we |
| 1822 | * do not need to flush old virtual caches or the TLB. | 1812 | * do not need to flush old virtual caches or the TLB. |
| 1823 | * | 1813 | * |
| 1824 | * This is called with the MM semaphore held and the page table | 1814 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
| 1825 | * spinlock held. Exit with the spinlock released. | 1815 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 1816 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 1826 | */ | 1817 | */ |
| 1827 | static int | 1818 | static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1828 | do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1819 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 1829 | unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) | 1820 | int write_access) |
| 1830 | { | 1821 | { |
| 1831 | struct page * new_page; | 1822 | spinlock_t *ptl; |
| 1823 | struct page *new_page; | ||
| 1832 | struct address_space *mapping = NULL; | 1824 | struct address_space *mapping = NULL; |
| 1833 | pte_t entry; | 1825 | pte_t entry; |
| 1834 | unsigned int sequence = 0; | 1826 | unsigned int sequence = 0; |
| 1835 | int ret = VM_FAULT_MINOR; | 1827 | int ret = VM_FAULT_MINOR; |
| 1836 | int anon = 0; | 1828 | int anon = 0; |
| 1837 | 1829 | ||
| 1838 | if (!vma->vm_ops || !vma->vm_ops->nopage) | ||
| 1839 | return do_anonymous_page(mm, vma, page_table, | ||
| 1840 | pmd, write_access, address); | ||
| 1841 | pte_unmap(page_table); | 1830 | pte_unmap(page_table); |
| 1842 | spin_unlock(&mm->page_table_lock); | ||
| 1843 | 1831 | ||
| 1844 | if (vma->vm_file) { | 1832 | if (vma->vm_file) { |
| 1845 | mapping = vma->vm_file->f_mapping; | 1833 | mapping = vma->vm_file->f_mapping; |
| @@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1847 | smp_rmb(); /* serializes i_size against truncate_count */ | 1835 | smp_rmb(); /* serializes i_size against truncate_count */ |
| 1848 | } | 1836 | } |
| 1849 | retry: | 1837 | retry: |
| 1850 | cond_resched(); | ||
| 1851 | new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); | 1838 | new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); |
| 1852 | /* | 1839 | /* |
| 1853 | * No smp_rmb is needed here as long as there's a full | 1840 | * No smp_rmb is needed here as long as there's a full |
| @@ -1880,19 +1867,20 @@ retry: | |||
| 1880 | anon = 1; | 1867 | anon = 1; |
| 1881 | } | 1868 | } |
| 1882 | 1869 | ||
| 1883 | spin_lock(&mm->page_table_lock); | 1870 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 1884 | /* | 1871 | /* |
| 1885 | * For a file-backed vma, someone could have truncated or otherwise | 1872 | * For a file-backed vma, someone could have truncated or otherwise |
| 1886 | * invalidated this page. If unmap_mapping_range got called, | 1873 | * invalidated this page. If unmap_mapping_range got called, |
| 1887 | * retry getting the page. | 1874 | * retry getting the page. |
| 1888 | */ | 1875 | */ |
| 1889 | if (mapping && unlikely(sequence != mapping->truncate_count)) { | 1876 | if (mapping && unlikely(sequence != mapping->truncate_count)) { |
| 1890 | sequence = mapping->truncate_count; | 1877 | pte_unmap_unlock(page_table, ptl); |
| 1891 | spin_unlock(&mm->page_table_lock); | ||
| 1892 | page_cache_release(new_page); | 1878 | page_cache_release(new_page); |
| 1879 | cond_resched(); | ||
| 1880 | sequence = mapping->truncate_count; | ||
| 1881 | smp_rmb(); | ||
| 1893 | goto retry; | 1882 | goto retry; |
| 1894 | } | 1883 | } |
| 1895 | page_table = pte_offset_map(pmd, address); | ||
| 1896 | 1884 | ||
| 1897 | /* | 1885 | /* |
| 1898 | * This silly early PAGE_DIRTY setting removes a race | 1886 | * This silly early PAGE_DIRTY setting removes a race |
| @@ -1906,68 +1894,67 @@ retry: | |||
| 1906 | */ | 1894 | */ |
| 1907 | /* Only go through if we didn't race with anybody else... */ | 1895 | /* Only go through if we didn't race with anybody else... */ |
| 1908 | if (pte_none(*page_table)) { | 1896 | if (pte_none(*page_table)) { |
| 1909 | if (!PageReserved(new_page)) | ||
| 1910 | inc_mm_counter(mm, rss); | ||
| 1911 | |||
| 1912 | flush_icache_page(vma, new_page); | 1897 | flush_icache_page(vma, new_page); |
| 1913 | entry = mk_pte(new_page, vma->vm_page_prot); | 1898 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 1914 | if (write_access) | 1899 | if (write_access) |
| 1915 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1900 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1916 | set_pte_at(mm, address, page_table, entry); | 1901 | set_pte_at(mm, address, page_table, entry); |
| 1917 | if (anon) { | 1902 | if (anon) { |
| 1903 | inc_mm_counter(mm, anon_rss); | ||
| 1918 | lru_cache_add_active(new_page); | 1904 | lru_cache_add_active(new_page); |
| 1919 | page_add_anon_rmap(new_page, vma, address); | 1905 | page_add_anon_rmap(new_page, vma, address); |
| 1920 | } else | 1906 | } else if (!(vma->vm_flags & VM_RESERVED)) { |
| 1907 | inc_mm_counter(mm, file_rss); | ||
| 1921 | page_add_file_rmap(new_page); | 1908 | page_add_file_rmap(new_page); |
| 1922 | pte_unmap(page_table); | 1909 | } |
| 1923 | } else { | 1910 | } else { |
| 1924 | /* One of our sibling threads was faster, back out. */ | 1911 | /* One of our sibling threads was faster, back out. */ |
| 1925 | pte_unmap(page_table); | ||
| 1926 | page_cache_release(new_page); | 1912 | page_cache_release(new_page); |
| 1927 | spin_unlock(&mm->page_table_lock); | 1913 | goto unlock; |
| 1928 | goto out; | ||
| 1929 | } | 1914 | } |
| 1930 | 1915 | ||
| 1931 | /* no need to invalidate: a not-present page shouldn't be cached */ | 1916 | /* no need to invalidate: a not-present page shouldn't be cached */ |
| 1932 | update_mmu_cache(vma, address, entry); | 1917 | update_mmu_cache(vma, address, entry); |
| 1933 | lazy_mmu_prot_update(entry); | 1918 | lazy_mmu_prot_update(entry); |
| 1934 | spin_unlock(&mm->page_table_lock); | 1919 | unlock: |
| 1935 | out: | 1920 | pte_unmap_unlock(page_table, ptl); |
| 1936 | return ret; | 1921 | return ret; |
| 1937 | oom: | 1922 | oom: |
| 1938 | page_cache_release(new_page); | 1923 | page_cache_release(new_page); |
| 1939 | ret = VM_FAULT_OOM; | 1924 | return VM_FAULT_OOM; |
| 1940 | goto out; | ||
| 1941 | } | 1925 | } |
| 1942 | 1926 | ||
| 1943 | /* | 1927 | /* |
| 1944 | * Fault of a previously existing named mapping. Repopulate the pte | 1928 | * Fault of a previously existing named mapping. Repopulate the pte |
| 1945 | * from the encoded file_pte if possible. This enables swappable | 1929 | * from the encoded file_pte if possible. This enables swappable |
| 1946 | * nonlinear vmas. | 1930 | * nonlinear vmas. |
| 1931 | * | ||
| 1932 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 1933 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
| 1934 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 1947 | */ | 1935 | */ |
| 1948 | static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, | 1936 | static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1949 | unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) | 1937 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 1938 | int write_access, pte_t orig_pte) | ||
| 1950 | { | 1939 | { |
| 1951 | unsigned long pgoff; | 1940 | pgoff_t pgoff; |
| 1952 | int err; | 1941 | int err; |
| 1953 | 1942 | ||
| 1954 | BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); | 1943 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
| 1955 | /* | 1944 | return VM_FAULT_MINOR; |
| 1956 | * Fall back to the linear mapping if the fs does not support | ||
| 1957 | * ->populate: | ||
| 1958 | */ | ||
| 1959 | if (!vma->vm_ops->populate || | ||
| 1960 | (write_access && !(vma->vm_flags & VM_SHARED))) { | ||
| 1961 | pte_clear(mm, address, pte); | ||
| 1962 | return do_no_page(mm, vma, address, write_access, pte, pmd); | ||
| 1963 | } | ||
| 1964 | |||
| 1965 | pgoff = pte_to_pgoff(*pte); | ||
| 1966 | 1945 | ||
| 1967 | pte_unmap(pte); | 1946 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { |
| 1968 | spin_unlock(&mm->page_table_lock); | 1947 | /* |
| 1948 | * Page table corrupted: show pte and kill process. | ||
| 1949 | */ | ||
| 1950 | print_bad_pte(vma, orig_pte, address); | ||
| 1951 | return VM_FAULT_OOM; | ||
| 1952 | } | ||
| 1953 | /* We can then assume vma->vm_ops && vma->vm_ops->populate */ | ||
| 1969 | 1954 | ||
| 1970 | err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); | 1955 | pgoff = pte_to_pgoff(orig_pte); |
| 1956 | err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, | ||
| 1957 | vma->vm_page_prot, pgoff, 0); | ||
| 1971 | if (err == -ENOMEM) | 1958 | if (err == -ENOMEM) |
| 1972 | return VM_FAULT_OOM; | 1959 | return VM_FAULT_OOM; |
| 1973 | if (err) | 1960 | if (err) |
| @@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1984 | * with external mmu caches can use to update those (ie the Sparc or | 1971 | * with external mmu caches can use to update those (ie the Sparc or |
| 1985 | * PowerPC hashed page tables that act as extended TLBs). | 1972 | * PowerPC hashed page tables that act as extended TLBs). |
| 1986 | * | 1973 | * |
| 1987 | * Note the "page_table_lock". It is to protect against kswapd removing | 1974 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
| 1988 | * pages from under us. Note that kswapd only ever _removes_ pages, never | 1975 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 1989 | * adds them. As such, once we have noticed that the page is not present, | 1976 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
| 1990 | * we can drop the lock early. | ||
| 1991 | * | ||
| 1992 | * The adding of pages is protected by the MM semaphore (which we hold), | ||
| 1993 | * so we don't need to worry about a page being suddenly been added into | ||
| 1994 | * our VM. | ||
| 1995 | * | ||
| 1996 | * We enter with the pagetable spinlock held, we are supposed to | ||
| 1997 | * release it when done. | ||
| 1998 | */ | 1977 | */ |
| 1999 | static inline int handle_pte_fault(struct mm_struct *mm, | 1978 | static inline int handle_pte_fault(struct mm_struct *mm, |
| 2000 | struct vm_area_struct * vma, unsigned long address, | 1979 | struct vm_area_struct *vma, unsigned long address, |
| 2001 | int write_access, pte_t *pte, pmd_t *pmd) | 1980 | pte_t *pte, pmd_t *pmd, int write_access) |
| 2002 | { | 1981 | { |
| 2003 | pte_t entry; | 1982 | pte_t entry; |
| 1983 | pte_t old_entry; | ||
| 1984 | spinlock_t *ptl; | ||
| 2004 | 1985 | ||
| 2005 | entry = *pte; | 1986 | old_entry = entry = *pte; |
| 2006 | if (!pte_present(entry)) { | 1987 | if (!pte_present(entry)) { |
| 2007 | /* | 1988 | if (pte_none(entry)) { |
| 2008 | * If it truly wasn't present, we know that kswapd | 1989 | if (!vma->vm_ops || !vma->vm_ops->nopage) |
| 2009 | * and the PTE updates will not touch it later. So | 1990 | return do_anonymous_page(mm, vma, address, |
| 2010 | * drop the lock. | 1991 | pte, pmd, write_access); |
| 2011 | */ | 1992 | return do_no_page(mm, vma, address, |
| 2012 | if (pte_none(entry)) | 1993 | pte, pmd, write_access); |
| 2013 | return do_no_page(mm, vma, address, write_access, pte, pmd); | 1994 | } |
| 2014 | if (pte_file(entry)) | 1995 | if (pte_file(entry)) |
| 2015 | return do_file_page(mm, vma, address, write_access, pte, pmd); | 1996 | return do_file_page(mm, vma, address, |
| 2016 | return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); | 1997 | pte, pmd, write_access, entry); |
| 1998 | return do_swap_page(mm, vma, address, | ||
| 1999 | pte, pmd, write_access, entry); | ||
| 2017 | } | 2000 | } |
| 2018 | 2001 | ||
| 2002 | ptl = pte_lockptr(mm, pmd); | ||
| 2003 | spin_lock(ptl); | ||
| 2004 | if (unlikely(!pte_same(*pte, entry))) | ||
| 2005 | goto unlock; | ||
| 2019 | if (write_access) { | 2006 | if (write_access) { |
| 2020 | if (!pte_write(entry)) | 2007 | if (!pte_write(entry)) |
| 2021 | return do_wp_page(mm, vma, address, pte, pmd, entry); | 2008 | return do_wp_page(mm, vma, address, |
| 2009 | pte, pmd, ptl, entry); | ||
| 2022 | entry = pte_mkdirty(entry); | 2010 | entry = pte_mkdirty(entry); |
| 2023 | } | 2011 | } |
| 2024 | entry = pte_mkyoung(entry); | 2012 | entry = pte_mkyoung(entry); |
| 2025 | ptep_set_access_flags(vma, address, pte, entry, write_access); | 2013 | if (!pte_same(old_entry, entry)) { |
| 2026 | update_mmu_cache(vma, address, entry); | 2014 | ptep_set_access_flags(vma, address, pte, entry, write_access); |
| 2027 | lazy_mmu_prot_update(entry); | 2015 | update_mmu_cache(vma, address, entry); |
| 2028 | pte_unmap(pte); | 2016 | lazy_mmu_prot_update(entry); |
| 2029 | spin_unlock(&mm->page_table_lock); | 2017 | } else { |
| 2018 | /* | ||
| 2019 | * This is needed only for protection faults but the arch code | ||
| 2020 | * is not yet telling us if this is a protection fault or not. | ||
| 2021 | * This still avoids useless tlb flushes for .text page faults | ||
| 2022 | * with threads. | ||
| 2023 | */ | ||
| 2024 | if (write_access) | ||
| 2025 | flush_tlb_page(vma, address); | ||
| 2026 | } | ||
| 2027 | unlock: | ||
| 2028 | pte_unmap_unlock(pte, ptl); | ||
| 2030 | return VM_FAULT_MINOR; | 2029 | return VM_FAULT_MINOR; |
| 2031 | } | 2030 | } |
| 2032 | 2031 | ||
| 2033 | /* | 2032 | /* |
| 2034 | * By the time we get here, we already hold the mm semaphore | 2033 | * By the time we get here, we already hold the mm semaphore |
| 2035 | */ | 2034 | */ |
| 2036 | int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, | 2035 | int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2037 | unsigned long address, int write_access) | 2036 | unsigned long address, int write_access) |
| 2038 | { | 2037 | { |
| 2039 | pgd_t *pgd; | 2038 | pgd_t *pgd; |
| @@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, | |||
| 2048 | if (unlikely(is_vm_hugetlb_page(vma))) | 2047 | if (unlikely(is_vm_hugetlb_page(vma))) |
| 2049 | return hugetlb_fault(mm, vma, address, write_access); | 2048 | return hugetlb_fault(mm, vma, address, write_access); |
| 2050 | 2049 | ||
| 2051 | /* | ||
| 2052 | * We need the page table lock to synchronize with kswapd | ||
| 2053 | * and the SMP-safe atomic PTE updates. | ||
| 2054 | */ | ||
| 2055 | pgd = pgd_offset(mm, address); | 2050 | pgd = pgd_offset(mm, address); |
| 2056 | spin_lock(&mm->page_table_lock); | ||
| 2057 | |||
| 2058 | pud = pud_alloc(mm, pgd, address); | 2051 | pud = pud_alloc(mm, pgd, address); |
| 2059 | if (!pud) | 2052 | if (!pud) |
| 2060 | goto oom; | 2053 | return VM_FAULT_OOM; |
| 2061 | |||
| 2062 | pmd = pmd_alloc(mm, pud, address); | 2054 | pmd = pmd_alloc(mm, pud, address); |
| 2063 | if (!pmd) | 2055 | if (!pmd) |
| 2064 | goto oom; | 2056 | return VM_FAULT_OOM; |
| 2065 | |||
| 2066 | pte = pte_alloc_map(mm, pmd, address); | 2057 | pte = pte_alloc_map(mm, pmd, address); |
| 2067 | if (!pte) | 2058 | if (!pte) |
| 2068 | goto oom; | 2059 | return VM_FAULT_OOM; |
| 2069 | |||
| 2070 | return handle_pte_fault(mm, vma, address, write_access, pte, pmd); | ||
| 2071 | 2060 | ||
| 2072 | oom: | 2061 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); |
| 2073 | spin_unlock(&mm->page_table_lock); | ||
| 2074 | return VM_FAULT_OOM; | ||
| 2075 | } | 2062 | } |
| 2076 | 2063 | ||
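__handle_mm_fault() now allocates the upper levels without holding page_table_lock (the helpers below take it only around the populate step) and leaves all pte locking to handle_pte_fault(). A sketch of an arch-style caller holding only mmap_sem for read; the names and the minimal vma check are illustrative:

```c
/*
 * Illustrative fault entry: only mmap_sem (read) is held around the
 * call; no page_table_lock is taken at this level any more.
 */
static int fault_in_user_address(struct mm_struct *mm,
				 unsigned long address, int write)
{
	struct vm_area_struct *vma;
	int ret = VM_FAULT_SIGBUS;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (vma && address >= vma->vm_start)
		ret = __handle_mm_fault(mm, vma, address, write);
	up_read(&mm->mmap_sem);
	return ret;
}
```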
| 2077 | #ifndef __PAGETABLE_PUD_FOLDED | 2064 | #ifndef __PAGETABLE_PUD_FOLDED |
| 2078 | /* | 2065 | /* |
| 2079 | * Allocate page upper directory. | 2066 | * Allocate page upper directory. |
| 2080 | * | 2067 | * We've already handled the fast-path in-line. |
| 2081 | * We've already handled the fast-path in-line, and we own the | ||
| 2082 | * page table lock. | ||
| 2083 | */ | 2068 | */ |
| 2084 | pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) | 2069 | int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) |
| 2085 | { | 2070 | { |
| 2086 | pud_t *new; | 2071 | pud_t *new = pud_alloc_one(mm, address); |
| 2087 | |||
| 2088 | spin_unlock(&mm->page_table_lock); | ||
| 2089 | new = pud_alloc_one(mm, address); | ||
| 2090 | spin_lock(&mm->page_table_lock); | ||
| 2091 | if (!new) | 2072 | if (!new) |
| 2092 | return NULL; | 2073 | return -ENOMEM; |
| 2093 | 2074 | ||
| 2094 | /* | 2075 | spin_lock(&mm->page_table_lock); |
| 2095 | * Because we dropped the lock, we should re-check the | 2076 | if (pgd_present(*pgd)) /* Another has populated it */ |
| 2096 | * entry, as somebody else could have populated it.. | ||
| 2097 | */ | ||
| 2098 | if (pgd_present(*pgd)) { | ||
| 2099 | pud_free(new); | 2077 | pud_free(new); |
| 2100 | goto out; | 2078 | else |
| 2101 | } | 2079 | pgd_populate(mm, pgd, new); |
| 2102 | pgd_populate(mm, pgd, new); | 2080 | spin_unlock(&mm->page_table_lock); |
| 2103 | out: | 2081 | return 0; |
| 2104 | return pud_offset(pgd, address); | ||
| 2105 | } | 2082 | } |
| 2106 | #endif /* __PAGETABLE_PUD_FOLDED */ | 2083 | #endif /* __PAGETABLE_PUD_FOLDED */ |
| 2107 | 2084 | ||
| 2108 | #ifndef __PAGETABLE_PMD_FOLDED | 2085 | #ifndef __PAGETABLE_PMD_FOLDED |
| 2109 | /* | 2086 | /* |
| 2110 | * Allocate page middle directory. | 2087 | * Allocate page middle directory. |
| 2111 | * | 2088 | * We've already handled the fast-path in-line. |
| 2112 | * We've already handled the fast-path in-line, and we own the | ||
| 2113 | * page table lock. | ||
| 2114 | */ | 2089 | */ |
| 2115 | pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | 2090 | int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) |
| 2116 | { | 2091 | { |
| 2117 | pmd_t *new; | 2092 | pmd_t *new = pmd_alloc_one(mm, address); |
| 2118 | |||
| 2119 | spin_unlock(&mm->page_table_lock); | ||
| 2120 | new = pmd_alloc_one(mm, address); | ||
| 2121 | spin_lock(&mm->page_table_lock); | ||
| 2122 | if (!new) | 2093 | if (!new) |
| 2123 | return NULL; | 2094 | return -ENOMEM; |
| 2124 | 2095 | ||
| 2125 | /* | 2096 | spin_lock(&mm->page_table_lock); |
| 2126 | * Because we dropped the lock, we should re-check the | ||
| 2127 | * entry, as somebody else could have populated it.. | ||
| 2128 | */ | ||
| 2129 | #ifndef __ARCH_HAS_4LEVEL_HACK | 2097 | #ifndef __ARCH_HAS_4LEVEL_HACK |
| 2130 | if (pud_present(*pud)) { | 2098 | if (pud_present(*pud)) /* Another has populated it */ |
| 2131 | pmd_free(new); | 2099 | pmd_free(new); |
| 2132 | goto out; | 2100 | else |
| 2133 | } | 2101 | pud_populate(mm, pud, new); |
| 2134 | pud_populate(mm, pud, new); | ||
| 2135 | #else | 2102 | #else |
| 2136 | if (pgd_present(*pud)) { | 2103 | if (pgd_present(*pud)) /* Another has populated it */ |
| 2137 | pmd_free(new); | 2104 | pmd_free(new); |
| 2138 | goto out; | 2105 | else |
| 2139 | } | 2106 | pgd_populate(mm, pud, new); |
| 2140 | pgd_populate(mm, pud, new); | ||
| 2141 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 2107 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
| 2142 | 2108 | spin_unlock(&mm->page_table_lock); | |
| 2143 | out: | 2109 | return 0; |
| 2144 | return pmd_offset(pud, address); | ||
| 2145 | } | 2110 | } |
| 2146 | #endif /* __PAGETABLE_PMD_FOLDED */ | 2111 | #endif /* __PAGETABLE_PMD_FOLDED */ |
| 2147 | 2112 | ||
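__pud_alloc() and __pmd_alloc() now return 0 or -ENOMEM instead of a pointer, taking page_table_lock only around the populate step; the inline fast paths in include/linux/mm.h then recompute the offset themselves. An illustrative rewrite of that fast-path shape:

```c
/*
 * Illustrative: the shape of the inline fast path that pairs with the
 * int-returning __pmd_alloc() above (the real wrapper is pmd_alloc()
 * in include/linux/mm.h).
 */
static pmd_t *alloc_pmd_slot(struct mm_struct *mm, pud_t *pud,
			     unsigned long address)
{
	if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
		return NULL;		/* slow path failed: -ENOMEM */
	return pmd_offset(pud, address);
}
```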
| @@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) | |||
| 2206 | 2171 | ||
| 2207 | EXPORT_SYMBOL(vmalloc_to_pfn); | 2172 | EXPORT_SYMBOL(vmalloc_to_pfn); |
| 2208 | 2173 | ||
| 2209 | /* | ||
| 2210 | * update_mem_hiwater | ||
| 2211 | * - update per process rss and vm high water data | ||
| 2212 | */ | ||
| 2213 | void update_mem_hiwater(struct task_struct *tsk) | ||
| 2214 | { | ||
| 2215 | if (tsk->mm) { | ||
| 2216 | unsigned long rss = get_mm_counter(tsk->mm, rss); | ||
| 2217 | |||
| 2218 | if (tsk->mm->hiwater_rss < rss) | ||
| 2219 | tsk->mm->hiwater_rss = rss; | ||
| 2220 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
| 2221 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
| 2222 | } | ||
| 2223 | } | ||
| 2224 | |||
| 2225 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2174 | #if !defined(__HAVE_ARCH_GATE_AREA) |
| 2226 | 2175 | ||
| 2227 | #if defined(AT_SYSINFO_EHDR) | 2176 | #if defined(AT_SYSINFO_EHDR) |
| @@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void) | |||
| 2233 | gate_vma.vm_start = FIXADDR_USER_START; | 2182 | gate_vma.vm_start = FIXADDR_USER_START; |
| 2234 | gate_vma.vm_end = FIXADDR_USER_END; | 2183 | gate_vma.vm_end = FIXADDR_USER_END; |
| 2235 | gate_vma.vm_page_prot = PAGE_READONLY; | 2184 | gate_vma.vm_page_prot = PAGE_READONLY; |
| 2236 | gate_vma.vm_flags = 0; | 2185 | gate_vma.vm_flags = VM_RESERVED; |
| 2237 | return 0; | 2186 | return 0; |
| 2238 | } | 2187 | } |
| 2239 | __initcall(gate_vma_init); | 2188 | __initcall(gate_vma_init); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c new file mode 100644 index 000000000000..431a64f021c0 --- /dev/null +++ b/mm/memory_hotplug.c | |||
| @@ -0,0 +1,138 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/memory_hotplug.c | ||
| 3 | * | ||
| 4 | * Copyright (C) | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/config.h> | ||
| 8 | #include <linux/stddef.h> | ||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/swap.h> | ||
| 11 | #include <linux/interrupt.h> | ||
| 12 | #include <linux/pagemap.h> | ||
| 13 | #include <linux/bootmem.h> | ||
| 14 | #include <linux/compiler.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/pagevec.h> | ||
| 17 | #include <linux/slab.h> | ||
| 18 | #include <linux/sysctl.h> | ||
| 19 | #include <linux/cpu.h> | ||
| 20 | #include <linux/memory.h> | ||
| 21 | #include <linux/memory_hotplug.h> | ||
| 22 | #include <linux/highmem.h> | ||
| 23 | #include <linux/vmalloc.h> | ||
| 24 | |||
| 25 | #include <asm/tlbflush.h> | ||
| 26 | |||
| 27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | ||
| 28 | unsigned long size); | ||
| 29 | static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | ||
| 30 | { | ||
| 31 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 32 | int nr_pages = PAGES_PER_SECTION; | ||
| 33 | int nid = pgdat->node_id; | ||
| 34 | int zone_type; | ||
| 35 | |||
| 36 | zone_type = zone - pgdat->node_zones; | ||
| 37 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | ||
| 38 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | ||
| 39 | } | ||
| 40 | |||
| 41 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | ||
| 42 | int nr_pages); | ||
| 43 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | ||
| 44 | { | ||
| 45 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 46 | int nr_pages = PAGES_PER_SECTION; | ||
| 47 | int ret; | ||
| 48 | |||
| 49 | ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); | ||
| 50 | |||
| 51 | if (ret < 0) | ||
| 52 | return ret; | ||
| 53 | |||
| 54 | __add_zone(zone, phys_start_pfn); | ||
| 55 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | ||
| 56 | } | ||
| 57 | |||
| 58 | /* | ||
| 59 | * Reasonably generic function for adding memory. It is | ||
| 60 | * expected that archs that support memory hotplug will | ||
| 61 | * call this function after deciding the zone to which to | ||
| 62 | * add the new pages. | ||
| 63 | */ | ||
| 64 | int __add_pages(struct zone *zone, unsigned long phys_start_pfn, | ||
| 65 | unsigned long nr_pages) | ||
| 66 | { | ||
| 67 | unsigned long i; | ||
| 68 | int err = 0; | ||
| 69 | |||
| 70 | for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { | ||
| 71 | err = __add_section(zone, phys_start_pfn + i); | ||
| 72 | |||
| 73 | if (err) | ||
| 74 | break; | ||
| 75 | } | ||
| 76 | |||
| 77 | return err; | ||
| 78 | } | ||
| 79 | |||
| 80 | static void grow_zone_span(struct zone *zone, | ||
| 81 | unsigned long start_pfn, unsigned long end_pfn) | ||
| 82 | { | ||
| 83 | unsigned long old_zone_end_pfn; | ||
| 84 | |||
| 85 | zone_span_writelock(zone); | ||
| 86 | |||
| 87 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
| 88 | if (start_pfn < zone->zone_start_pfn) | ||
| 89 | zone->zone_start_pfn = start_pfn; | ||
| 90 | |||
| 91 | if (end_pfn > old_zone_end_pfn) | ||
| 92 | zone->spanned_pages = end_pfn - zone->zone_start_pfn; | ||
| 93 | |||
| 94 | zone_span_writeunlock(zone); | ||
| 95 | } | ||
| 96 | |||
| 97 | static void grow_pgdat_span(struct pglist_data *pgdat, | ||
| 98 | unsigned long start_pfn, unsigned long end_pfn) | ||
| 99 | { | ||
| 100 | unsigned long old_pgdat_end_pfn = | ||
| 101 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
| 102 | |||
| 103 | if (start_pfn < pgdat->node_start_pfn) | ||
| 104 | pgdat->node_start_pfn = start_pfn; | ||
| 105 | |||
| 106 | if (end_pfn > old_pgdat_end_pfn) | ||
| 107 | pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; | ||
| 108 | } | ||
| 109 | |||
| 110 | int online_pages(unsigned long pfn, unsigned long nr_pages) | ||
| 111 | { | ||
| 112 | unsigned long i; | ||
| 113 | unsigned long flags; | ||
| 114 | unsigned long onlined_pages = 0; | ||
| 115 | struct zone *zone; | ||
| 116 | |||
| 117 | /* | ||
| 118 | * This doesn't need a lock to do pfn_to_page(). | ||
| 119 | * The section can't be removed here because of the | ||
| 120 | * memory_block->state_sem. | ||
| 121 | */ | ||
| 122 | zone = page_zone(pfn_to_page(pfn)); | ||
| 123 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
| 124 | grow_zone_span(zone, pfn, pfn + nr_pages); | ||
| 125 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); | ||
| 126 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | ||
| 127 | |||
| 128 | for (i = 0; i < nr_pages; i++) { | ||
| 129 | struct page *page = pfn_to_page(pfn + i); | ||
| 130 | online_page(page); | ||
| 131 | onlined_pages++; | ||
| 132 | } | ||
| 133 | zone->present_pages += onlined_pages; | ||
| 134 | |||
| 135 | setup_per_zone_pages_min(); | ||
| 136 | |||
| 137 | return 0; | ||
| 138 | } | ||
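The new file provides two building blocks: __add_pages() creates sparsemem sections for a pfn range and registers them, while online_pages() grows the zone and pgdat spans and hands the pages to the allocator. A rough sketch of how an architecture hot-add hook might drive them; the function name, the node-0/ZONE_NORMAL choice and the direct online_pages() call are assumptions made for illustration.

/*
 * Illustrative sketch only, not part of the diff.  In practice onlining is
 * normally triggered later through the memory_block sysfs state file.
 */
static int example_arch_add_memory(u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	struct zone *zone = &NODE_DATA(0)->node_zones[ZONE_NORMAL];
	int ret;

	/* Create the sparsemem sections and register them with sysfs. */
	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		return ret;

	/* Grow the zone/pgdat spans and hand the pages to the allocator. */
	return online_pages(start_pfn, nr_pages);
}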
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1d5c64df1653..2076b1542b8a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | * Simple NUMA memory policy for the Linux kernel. | 2 | * Simple NUMA memory policy for the Linux kernel. |
| 3 | * | 3 | * |
| 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. | 4 | * Copyright 2003,2004 Andi Kleen, SuSE Labs. |
| 5 | * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. | ||
| 5 | * Subject to the GNU Public License, version 2. | 6 | * Subject to the GNU Public License, version 2. |
| 6 | * | 7 | * |
| 7 | * NUMA policy allows the user to give hints in which node(s) memory should | 8 | * NUMA policy allows the user to give hints in which node(s) memory should |
| @@ -17,13 +18,19 @@ | |||
| 17 | * offset into the backing object or offset into the mapping | 18 | * offset into the backing object or offset into the mapping |
| 18 | * for anonymous memory. For process policy an process counter | 19 | * for anonymous memory. For process policy an process counter |
| 19 | * is used. | 20 | * is used. |
| 21 | * | ||
| 20 | * bind Only allocate memory on a specific set of nodes, | 22 | * bind Only allocate memory on a specific set of nodes, |
| 21 | * no fallback. | 23 | * no fallback. |
| 24 | * FIXME: memory is allocated starting with the first node | ||
| 25 | * to the last. It would be better if bind would truly restrict | ||
| 26 | * the allocation to memory nodes instead | ||
| 27 | * | ||
| 22 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
| 23 | * As a special case node -1 here means do the allocation | 29 | * As a special case node -1 here means do the allocation |
| 24 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
| 25 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
| 26 | * process policy. | 32 | * process policy. |
| 33 | * | ||
| 27 | * default Allocate on the local node first, or when on a VMA | 34 | * default Allocate on the local node first, or when on a VMA |
| 28 | * use the process policy. This is what Linux always did | 35 | * use the process policy. This is what Linux always did |
| 29 | * in a NUMA aware kernel and still does by, ahem, default. | 36 | * in a NUMA aware kernel and still does by, ahem, default. |
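The comment block above describes the user-visible policies. A hedged userspace sketch of requesting MPOL_INTERLEAVE across nodes 0 and 1 through the raw set_mempolicy() syscall follows; the constants are assumed from the kernel ABI and a real application would normally use libnuma.

/*
 * Hedged userspace sketch.  The MPOL_INTERLEAVE value and the
 * SYS_set_mempolicy number are assumed from the kernel ABI headers and may
 * differ per architecture; a real program would include <numaif.h> instead
 * of redefining them.
 */
#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_INTERLEAVE 3	/* assumed ABI value */

static int interleave_nodes_0_and_1(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* maxnode is one more than the number of bits the kernel examines */
	return syscall(SYS_set_mempolicy, MPOL_INTERLEAVE, &nodemask, 2 + 1);
}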
| @@ -93,23 +100,10 @@ struct mempolicy default_policy = { | |||
| 93 | .policy = MPOL_DEFAULT, | 100 | .policy = MPOL_DEFAULT, |
| 94 | }; | 101 | }; |
| 95 | 102 | ||
| 96 | /* Check if all specified nodes are online */ | ||
| 97 | static int nodes_online(unsigned long *nodes) | ||
| 98 | { | ||
| 99 | DECLARE_BITMAP(online2, MAX_NUMNODES); | ||
| 100 | |||
| 101 | bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); | ||
| 102 | if (bitmap_empty(online2, MAX_NUMNODES)) | ||
| 103 | set_bit(0, online2); | ||
| 104 | if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) | ||
| 105 | return -EINVAL; | ||
| 106 | return 0; | ||
| 107 | } | ||
| 108 | |||
| 109 | /* Do sanity checking on a policy */ | 103 | /* Do sanity checking on a policy */ |
| 110 | static int mpol_check_policy(int mode, unsigned long *nodes) | 104 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
| 111 | { | 105 | { |
| 112 | int empty = bitmap_empty(nodes, MAX_NUMNODES); | 106 | int empty = nodes_empty(*nodes); |
| 113 | 107 | ||
| 114 | switch (mode) { | 108 | switch (mode) { |
| 115 | case MPOL_DEFAULT: | 109 | case MPOL_DEFAULT: |
| @@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes) | |||
| 124 | return -EINVAL; | 118 | return -EINVAL; |
| 125 | break; | 119 | break; |
| 126 | } | 120 | } |
| 127 | return nodes_online(nodes); | 121 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; |
| 128 | } | ||
| 129 | |||
| 130 | /* Copy a node mask from user space. */ | ||
| 131 | static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, | ||
| 132 | unsigned long maxnode, int mode) | ||
| 133 | { | ||
| 134 | unsigned long k; | ||
| 135 | unsigned long nlongs; | ||
| 136 | unsigned long endmask; | ||
| 137 | |||
| 138 | --maxnode; | ||
| 139 | bitmap_zero(nodes, MAX_NUMNODES); | ||
| 140 | if (maxnode == 0 || !nmask) | ||
| 141 | return 0; | ||
| 142 | |||
| 143 | nlongs = BITS_TO_LONGS(maxnode); | ||
| 144 | if ((maxnode % BITS_PER_LONG) == 0) | ||
| 145 | endmask = ~0UL; | ||
| 146 | else | ||
| 147 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
| 148 | |||
| 149 | /* When the user specified more nodes than supported just check | ||
| 150 | if the non supported part is all zero. */ | ||
| 151 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
| 152 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
| 153 | return -EINVAL; | ||
| 154 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
| 155 | unsigned long t; | ||
| 156 | if (get_user(t, nmask + k)) | ||
| 157 | return -EFAULT; | ||
| 158 | if (k == nlongs - 1) { | ||
| 159 | if (t & endmask) | ||
| 160 | return -EINVAL; | ||
| 161 | } else if (t) | ||
| 162 | return -EINVAL; | ||
| 163 | } | ||
| 164 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
| 165 | endmask = ~0UL; | ||
| 166 | } | ||
| 167 | |||
| 168 | if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) | ||
| 169 | return -EFAULT; | ||
| 170 | nodes[nlongs-1] &= endmask; | ||
| 171 | /* Update current mems_allowed */ | ||
| 172 | cpuset_update_current_mems_allowed(); | ||
| 173 | /* Ignore nodes not set in current->mems_allowed */ | ||
| 174 | cpuset_restrict_to_mems_allowed(nodes); | ||
| 175 | return mpol_check_policy(mode, nodes); | ||
| 176 | } | 122 | } |
| 177 | |||
| 178 | /* Generate a custom zonelist for the BIND policy. */ | 123 | /* Generate a custom zonelist for the BIND policy. */ |
| 179 | static struct zonelist *bind_zonelist(unsigned long *nodes) | 124 | static struct zonelist *bind_zonelist(nodemask_t *nodes) |
| 180 | { | 125 | { |
| 181 | struct zonelist *zl; | 126 | struct zonelist *zl; |
| 182 | int num, max, nd; | 127 | int num, max, nd; |
| 183 | 128 | ||
| 184 | max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); | 129 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
| 185 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); | 130 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); |
| 186 | if (!zl) | 131 | if (!zl) |
| 187 | return NULL; | 132 | return NULL; |
| 188 | num = 0; | 133 | num = 0; |
| 189 | for (nd = find_first_bit(nodes, MAX_NUMNODES); | 134 | for_each_node_mask(nd, *nodes) { |
| 190 | nd < MAX_NUMNODES; | ||
| 191 | nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { | ||
| 192 | int k; | 135 | int k; |
| 193 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | 136 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { |
| 194 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | 137 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; |
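From this hunk onward the file passes node sets as nodemask_t instead of raw unsigned long bitmaps, so the MAX_NUMNODES length argument disappears from every call site. A small sketch of the replacement helpers, limited to the ones the converted code itself uses:

/*
 * Illustrative sketch, not part of the diff, using the same nodemask_t
 * helpers the converted code calls (nodes_empty, nodes_subset,
 * for_each_node_mask, node_online_map).
 */
static int example_nodemask_usage(nodemask_t *req)
{
	nodemask_t nodes = *req;
	int nid, count = 0;

	if (nodes_empty(nodes))			/* was bitmap_empty(nodes, MAX_NUMNODES) */
		return -EINVAL;
	if (!nodes_subset(nodes, node_online_map))	/* was nodes_online() */
		return -EINVAL;
	for_each_node_mask(nid, nodes)		/* was find_first_bit/find_next_bit */
		count++;
	return count;				/* equals nodes_weight(nodes) */
}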
| @@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes) | |||
| 199 | policy_zone = k; | 142 | policy_zone = k; |
| 200 | } | 143 | } |
| 201 | } | 144 | } |
| 202 | BUG_ON(num >= max); | ||
| 203 | zl->zones[num] = NULL; | 145 | zl->zones[num] = NULL; |
| 204 | return zl; | 146 | return zl; |
| 205 | } | 147 | } |
| 206 | 148 | ||
| 207 | /* Create a new policy */ | 149 | /* Create a new policy */ |
| 208 | static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | 150 | static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) |
| 209 | { | 151 | { |
| 210 | struct mempolicy *policy; | 152 | struct mempolicy *policy; |
| 211 | 153 | ||
| 212 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); | 154 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); |
| 213 | if (mode == MPOL_DEFAULT) | 155 | if (mode == MPOL_DEFAULT) |
| 214 | return NULL; | 156 | return NULL; |
| 215 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 157 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
| @@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
| 218 | atomic_set(&policy->refcnt, 1); | 160 | atomic_set(&policy->refcnt, 1); |
| 219 | switch (mode) { | 161 | switch (mode) { |
| 220 | case MPOL_INTERLEAVE: | 162 | case MPOL_INTERLEAVE: |
| 221 | bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); | 163 | policy->v.nodes = *nodes; |
| 222 | break; | 164 | break; |
| 223 | case MPOL_PREFERRED: | 165 | case MPOL_PREFERRED: |
| 224 | policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); | 166 | policy->v.preferred_node = first_node(*nodes); |
| 225 | if (policy->v.preferred_node >= MAX_NUMNODES) | 167 | if (policy->v.preferred_node >= MAX_NUMNODES) |
| 226 | policy->v.preferred_node = -1; | 168 | policy->v.preferred_node = -1; |
| 227 | break; | 169 | break; |
| @@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
| 238 | } | 180 | } |
| 239 | 181 | ||
| 240 | /* Ensure all existing pages follow the policy. */ | 182 | /* Ensure all existing pages follow the policy. */ |
| 241 | static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, | 183 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
| 242 | unsigned long addr, unsigned long end, unsigned long *nodes) | 184 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 243 | { | 185 | { |
| 244 | pte_t *orig_pte; | 186 | pte_t *orig_pte; |
| 245 | pte_t *pte; | 187 | pte_t *pte; |
| 188 | spinlock_t *ptl; | ||
| 246 | 189 | ||
| 247 | spin_lock(&mm->page_table_lock); | 190 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 248 | orig_pte = pte = pte_offset_map(pmd, addr); | ||
| 249 | do { | 191 | do { |
| 250 | unsigned long pfn; | 192 | unsigned long pfn; |
| 251 | unsigned int nid; | 193 | unsigned int nid; |
| @@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 253 | if (!pte_present(*pte)) | 195 | if (!pte_present(*pte)) |
| 254 | continue; | 196 | continue; |
| 255 | pfn = pte_pfn(*pte); | 197 | pfn = pte_pfn(*pte); |
| 256 | if (!pfn_valid(pfn)) | 198 | if (!pfn_valid(pfn)) { |
| 199 | print_bad_pte(vma, *pte, addr); | ||
| 257 | continue; | 200 | continue; |
| 201 | } | ||
| 258 | nid = pfn_to_nid(pfn); | 202 | nid = pfn_to_nid(pfn); |
| 259 | if (!test_bit(nid, nodes)) | 203 | if (!node_isset(nid, *nodes)) |
| 260 | break; | 204 | break; |
| 261 | } while (pte++, addr += PAGE_SIZE, addr != end); | 205 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 262 | pte_unmap(orig_pte); | 206 | pte_unmap_unlock(orig_pte, ptl); |
| 263 | spin_unlock(&mm->page_table_lock); | ||
| 264 | return addr != end; | 207 | return addr != end; |
| 265 | } | 208 | } |
| 266 | 209 | ||
| 267 | static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | 210 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| 268 | unsigned long addr, unsigned long end, unsigned long *nodes) | 211 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 269 | { | 212 | { |
| 270 | pmd_t *pmd; | 213 | pmd_t *pmd; |
| 271 | unsigned long next; | 214 | unsigned long next; |
| @@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
| 275 | next = pmd_addr_end(addr, end); | 218 | next = pmd_addr_end(addr, end); |
| 276 | if (pmd_none_or_clear_bad(pmd)) | 219 | if (pmd_none_or_clear_bad(pmd)) |
| 277 | continue; | 220 | continue; |
| 278 | if (check_pte_range(mm, pmd, addr, next, nodes)) | 221 | if (check_pte_range(vma, pmd, addr, next, nodes)) |
| 279 | return -EIO; | 222 | return -EIO; |
| 280 | } while (pmd++, addr = next, addr != end); | 223 | } while (pmd++, addr = next, addr != end); |
| 281 | return 0; | 224 | return 0; |
| 282 | } | 225 | } |
| 283 | 226 | ||
| 284 | static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, | 227 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
| 285 | unsigned long addr, unsigned long end, unsigned long *nodes) | 228 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 286 | { | 229 | { |
| 287 | pud_t *pud; | 230 | pud_t *pud; |
| 288 | unsigned long next; | 231 | unsigned long next; |
| @@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
| 292 | next = pud_addr_end(addr, end); | 235 | next = pud_addr_end(addr, end); |
| 293 | if (pud_none_or_clear_bad(pud)) | 236 | if (pud_none_or_clear_bad(pud)) |
| 294 | continue; | 237 | continue; |
| 295 | if (check_pmd_range(mm, pud, addr, next, nodes)) | 238 | if (check_pmd_range(vma, pud, addr, next, nodes)) |
| 296 | return -EIO; | 239 | return -EIO; |
| 297 | } while (pud++, addr = next, addr != end); | 240 | } while (pud++, addr = next, addr != end); |
| 298 | return 0; | 241 | return 0; |
| 299 | } | 242 | } |
| 300 | 243 | ||
| 301 | static inline int check_pgd_range(struct mm_struct *mm, | 244 | static inline int check_pgd_range(struct vm_area_struct *vma, |
| 302 | unsigned long addr, unsigned long end, unsigned long *nodes) | 245 | unsigned long addr, unsigned long end, nodemask_t *nodes) |
| 303 | { | 246 | { |
| 304 | pgd_t *pgd; | 247 | pgd_t *pgd; |
| 305 | unsigned long next; | 248 | unsigned long next; |
| 306 | 249 | ||
| 307 | pgd = pgd_offset(mm, addr); | 250 | pgd = pgd_offset(vma->vm_mm, addr); |
| 308 | do { | 251 | do { |
| 309 | next = pgd_addr_end(addr, end); | 252 | next = pgd_addr_end(addr, end); |
| 310 | if (pgd_none_or_clear_bad(pgd)) | 253 | if (pgd_none_or_clear_bad(pgd)) |
| 311 | continue; | 254 | continue; |
| 312 | if (check_pud_range(mm, pgd, addr, next, nodes)) | 255 | if (check_pud_range(vma, pgd, addr, next, nodes)) |
| 313 | return -EIO; | 256 | return -EIO; |
| 314 | } while (pgd++, addr = next, addr != end); | 257 | } while (pgd++, addr = next, addr != end); |
| 315 | return 0; | 258 | return 0; |
| @@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm, | |||
| 318 | /* Step 1: check the range */ | 261 | /* Step 1: check the range */ |
| 319 | static struct vm_area_struct * | 262 | static struct vm_area_struct * |
| 320 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 263 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
| 321 | unsigned long *nodes, unsigned long flags) | 264 | nodemask_t *nodes, unsigned long flags) |
| 322 | { | 265 | { |
| 323 | int err; | 266 | int err; |
| 324 | struct vm_area_struct *first, *vma, *prev; | 267 | struct vm_area_struct *first, *vma, *prev; |
| @@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 326 | first = find_vma(mm, start); | 269 | first = find_vma(mm, start); |
| 327 | if (!first) | 270 | if (!first) |
| 328 | return ERR_PTR(-EFAULT); | 271 | return ERR_PTR(-EFAULT); |
| 272 | if (first->vm_flags & VM_RESERVED) | ||
| 273 | return ERR_PTR(-EACCES); | ||
| 329 | prev = NULL; | 274 | prev = NULL; |
| 330 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 275 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
| 331 | if (!vma->vm_next && vma->vm_end < end) | 276 | if (!vma->vm_next && vma->vm_end < end) |
| @@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 338 | endvma = end; | 283 | endvma = end; |
| 339 | if (vma->vm_start > start) | 284 | if (vma->vm_start > start) |
| 340 | start = vma->vm_start; | 285 | start = vma->vm_start; |
| 341 | err = check_pgd_range(vma->vm_mm, | 286 | err = check_pgd_range(vma, start, endvma, nodes); |
| 342 | start, endvma, nodes); | ||
| 343 | if (err) { | 287 | if (err) { |
| 344 | first = ERR_PTR(err); | 288 | first = ERR_PTR(err); |
| 345 | break; | 289 | break; |
| @@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, | |||
| 393 | return err; | 337 | return err; |
| 394 | } | 338 | } |
| 395 | 339 | ||
| 396 | /* Change policy for a memory range */ | 340 | static int contextualize_policy(int mode, nodemask_t *nodes) |
| 397 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | 341 | { |
| 398 | unsigned long mode, | 342 | if (!nodes) |
| 399 | unsigned long __user *nmask, unsigned long maxnode, | 343 | return 0; |
| 400 | unsigned flags) | 344 | |
| 345 | /* Update current mems_allowed */ | ||
| 346 | cpuset_update_current_mems_allowed(); | ||
| 347 | /* Ignore nodes not set in current->mems_allowed */ | ||
| 348 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
| 349 | return mpol_check_policy(mode, nodes); | ||
| 350 | } | ||
| 351 | |||
| 352 | long do_mbind(unsigned long start, unsigned long len, | ||
| 353 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
| 401 | { | 354 | { |
| 402 | struct vm_area_struct *vma; | 355 | struct vm_area_struct *vma; |
| 403 | struct mm_struct *mm = current->mm; | 356 | struct mm_struct *mm = current->mm; |
| 404 | struct mempolicy *new; | 357 | struct mempolicy *new; |
| 405 | unsigned long end; | 358 | unsigned long end; |
| 406 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 407 | int err; | 359 | int err; |
| 408 | 360 | ||
| 409 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | 361 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) |
| @@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
| 418 | return -EINVAL; | 370 | return -EINVAL; |
| 419 | if (end == start) | 371 | if (end == start) |
| 420 | return 0; | 372 | return 0; |
| 421 | 373 | if (mpol_check_policy(mode, nmask)) | |
| 422 | err = get_nodes(nodes, nmask, maxnode, mode); | 374 | return -EINVAL; |
| 423 | if (err) | 375 | new = mpol_new(mode, nmask); |
| 424 | return err; | ||
| 425 | |||
| 426 | new = mpol_new(mode, nodes); | ||
| 427 | if (IS_ERR(new)) | 376 | if (IS_ERR(new)) |
| 428 | return PTR_ERR(new); | 377 | return PTR_ERR(new); |
| 429 | 378 | ||
| 430 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | 379 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, |
| 431 | mode,nodes[0]); | 380 | mode,nodes_addr(nodes)[0]); |
| 432 | 381 | ||
| 433 | down_write(&mm->mmap_sem); | 382 | down_write(&mm->mmap_sem); |
| 434 | vma = check_range(mm, start, end, nodes, flags); | 383 | vma = check_range(mm, start, end, nmask, flags); |
| 435 | err = PTR_ERR(vma); | 384 | err = PTR_ERR(vma); |
| 436 | if (!IS_ERR(vma)) | 385 | if (!IS_ERR(vma)) |
| 437 | err = mbind_range(vma, start, end, new); | 386 | err = mbind_range(vma, start, end, new); |
| @@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
| 441 | } | 390 | } |
| 442 | 391 | ||
| 443 | /* Set the process memory policy */ | 392 | /* Set the process memory policy */ |
| 444 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | 393 | long do_set_mempolicy(int mode, nodemask_t *nodes) |
| 445 | unsigned long maxnode) | ||
| 446 | { | 394 | { |
| 447 | int err; | ||
| 448 | struct mempolicy *new; | 395 | struct mempolicy *new; |
| 449 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | ||
| 450 | 396 | ||
| 451 | if (mode < 0 || mode > MPOL_MAX) | 397 | if (contextualize_policy(mode, nodes)) |
| 452 | return -EINVAL; | 398 | return -EINVAL; |
| 453 | err = get_nodes(nodes, nmask, maxnode, mode); | ||
| 454 | if (err) | ||
| 455 | return err; | ||
| 456 | new = mpol_new(mode, nodes); | 399 | new = mpol_new(mode, nodes); |
| 457 | if (IS_ERR(new)) | 400 | if (IS_ERR(new)) |
| 458 | return PTR_ERR(new); | 401 | return PTR_ERR(new); |
| 459 | mpol_free(current->mempolicy); | 402 | mpol_free(current->mempolicy); |
| 460 | current->mempolicy = new; | 403 | current->mempolicy = new; |
| 461 | if (new && new->policy == MPOL_INTERLEAVE) | 404 | if (new && new->policy == MPOL_INTERLEAVE) |
| 462 | current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); | 405 | current->il_next = first_node(new->v.nodes); |
| 463 | return 0; | 406 | return 0; |
| 464 | } | 407 | } |
| 465 | 408 | ||
| 466 | /* Fill a zone bitmap for a policy */ | 409 | /* Fill a zone bitmap for a policy */ |
| 467 | static void get_zonemask(struct mempolicy *p, unsigned long *nodes) | 410 | static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) |
| 468 | { | 411 | { |
| 469 | int i; | 412 | int i; |
| 470 | 413 | ||
| 471 | bitmap_zero(nodes, MAX_NUMNODES); | 414 | nodes_clear(*nodes); |
| 472 | switch (p->policy) { | 415 | switch (p->policy) { |
| 473 | case MPOL_BIND: | 416 | case MPOL_BIND: |
| 474 | for (i = 0; p->v.zonelist->zones[i]; i++) | 417 | for (i = 0; p->v.zonelist->zones[i]; i++) |
| 475 | __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); | 418 | node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, |
| 419 | *nodes); | ||
| 476 | break; | 420 | break; |
| 477 | case MPOL_DEFAULT: | 421 | case MPOL_DEFAULT: |
| 478 | break; | 422 | break; |
| 479 | case MPOL_INTERLEAVE: | 423 | case MPOL_INTERLEAVE: |
| 480 | bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); | 424 | *nodes = p->v.nodes; |
| 481 | break; | 425 | break; |
| 482 | case MPOL_PREFERRED: | 426 | case MPOL_PREFERRED: |
| 483 | /* or use current node instead of online map? */ | 427 | /* or use current node instead of online map? */ |
| 484 | if (p->v.preferred_node < 0) | 428 | if (p->v.preferred_node < 0) |
| 485 | bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); | 429 | *nodes = node_online_map; |
| 486 | else | 430 | else |
| 487 | __set_bit(p->v.preferred_node, nodes); | 431 | node_set(p->v.preferred_node, *nodes); |
| 488 | break; | 432 | break; |
| 489 | default: | 433 | default: |
| 490 | BUG(); | 434 | BUG(); |
| @@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
| 504 | return err; | 448 | return err; |
| 505 | } | 449 | } |
| 506 | 450 | ||
| 507 | /* Copy a kernel node mask to user space */ | ||
| 508 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
| 509 | void *nodes, unsigned nbytes) | ||
| 510 | { | ||
| 511 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
| 512 | |||
| 513 | if (copy > nbytes) { | ||
| 514 | if (copy > PAGE_SIZE) | ||
| 515 | return -EINVAL; | ||
| 516 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
| 517 | return -EFAULT; | ||
| 518 | copy = nbytes; | ||
| 519 | } | ||
| 520 | return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; | ||
| 521 | } | ||
| 522 | |||
| 523 | /* Retrieve NUMA policy */ | 451 | /* Retrieve NUMA policy */ |
| 524 | asmlinkage long sys_get_mempolicy(int __user *policy, | 452 | long do_get_mempolicy(int *policy, nodemask_t *nmask, |
| 525 | unsigned long __user *nmask, | 453 | unsigned long addr, unsigned long flags) |
| 526 | unsigned long maxnode, | ||
| 527 | unsigned long addr, unsigned long flags) | ||
| 528 | { | 454 | { |
| 529 | int err, pval; | 455 | int err; |
| 530 | struct mm_struct *mm = current->mm; | 456 | struct mm_struct *mm = current->mm; |
| 531 | struct vm_area_struct *vma = NULL; | 457 | struct vm_area_struct *vma = NULL; |
| 532 | struct mempolicy *pol = current->mempolicy; | 458 | struct mempolicy *pol = current->mempolicy; |
| 533 | 459 | ||
| 534 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 460 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
| 535 | return -EINVAL; | 461 | return -EINVAL; |
| 536 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
| 537 | return -EINVAL; | ||
| 538 | if (flags & MPOL_F_ADDR) { | 462 | if (flags & MPOL_F_ADDR) { |
| 539 | down_read(&mm->mmap_sem); | 463 | down_read(&mm->mmap_sem); |
| 540 | vma = find_vma_intersection(mm, addr, addr+1); | 464 | vma = find_vma_intersection(mm, addr, addr+1); |
| @@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
| 557 | err = lookup_node(mm, addr); | 481 | err = lookup_node(mm, addr); |
| 558 | if (err < 0) | 482 | if (err < 0) |
| 559 | goto out; | 483 | goto out; |
| 560 | pval = err; | 484 | *policy = err; |
| 561 | } else if (pol == current->mempolicy && | 485 | } else if (pol == current->mempolicy && |
| 562 | pol->policy == MPOL_INTERLEAVE) { | 486 | pol->policy == MPOL_INTERLEAVE) { |
| 563 | pval = current->il_next; | 487 | *policy = current->il_next; |
| 564 | } else { | 488 | } else { |
| 565 | err = -EINVAL; | 489 | err = -EINVAL; |
| 566 | goto out; | 490 | goto out; |
| 567 | } | 491 | } |
| 568 | } else | 492 | } else |
| 569 | pval = pol->policy; | 493 | *policy = pol->policy; |
| 570 | 494 | ||
| 571 | if (vma) { | 495 | if (vma) { |
| 572 | up_read(&current->mm->mmap_sem); | 496 | up_read(&current->mm->mmap_sem);
| 573 | vma = NULL; | 497 | vma = NULL; |
| 574 | } | 498 | } |
| 575 | 499 | ||
| 576 | if (policy && put_user(pval, policy)) | ||
| 577 | return -EFAULT; | ||
| 578 | |||
| 579 | err = 0; | 500 | err = 0; |
| 580 | if (nmask) { | 501 | if (nmask) |
| 581 | DECLARE_BITMAP(nodes, MAX_NUMNODES); | 502 | get_zonemask(pol, nmask); |
| 582 | get_zonemask(pol, nodes); | ||
| 583 | err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); | ||
| 584 | } | ||
| 585 | 503 | ||
| 586 | out: | 504 | out: |
| 587 | if (vma) | 505 | if (vma) |
| @@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
| 589 | return err; | 507 | return err; |
| 590 | } | 508 | } |
| 591 | 509 | ||
| 510 | /* | ||
| 511 | * User space interface with variable sized bitmaps for nodelists. | ||
| 512 | */ | ||
| 513 | |||
| 514 | /* Copy a node mask from user space. */ | ||
| 515 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | ||
| 516 | unsigned long maxnode) | ||
| 517 | { | ||
| 518 | unsigned long k; | ||
| 519 | unsigned long nlongs; | ||
| 520 | unsigned long endmask; | ||
| 521 | |||
| 522 | --maxnode; | ||
| 523 | nodes_clear(*nodes); | ||
| 524 | if (maxnode == 0 || !nmask) | ||
| 525 | return 0; | ||
| 526 | |||
| 527 | nlongs = BITS_TO_LONGS(maxnode); | ||
| 528 | if ((maxnode % BITS_PER_LONG) == 0) | ||
| 529 | endmask = ~0UL; | ||
| 530 | else | ||
| 531 | endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; | ||
| 532 | |||
| 533 | /* When the user specified more nodes than supported just check | ||
| 534 | if the non supported part is all zero. */ | ||
| 535 | if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { | ||
| 536 | if (nlongs > PAGE_SIZE/sizeof(long)) | ||
| 537 | return -EINVAL; | ||
| 538 | for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { | ||
| 539 | unsigned long t; | ||
| 540 | if (get_user(t, nmask + k)) | ||
| 541 | return -EFAULT; | ||
| 542 | if (k == nlongs - 1) { | ||
| 543 | if (t & endmask) | ||
| 544 | return -EINVAL; | ||
| 545 | } else if (t) | ||
| 546 | return -EINVAL; | ||
| 547 | } | ||
| 548 | nlongs = BITS_TO_LONGS(MAX_NUMNODES); | ||
| 549 | endmask = ~0UL; | ||
| 550 | } | ||
| 551 | |||
| 552 | if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) | ||
| 553 | return -EFAULT; | ||
| 554 | nodes_addr(*nodes)[nlongs-1] &= endmask; | ||
| 555 | return 0; | ||
| 556 | } | ||
| 557 | |||
| 558 | /* Copy a kernel node mask to user space */ | ||
| 559 | static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, | ||
| 560 | nodemask_t *nodes) | ||
| 561 | { | ||
| 562 | unsigned long copy = ALIGN(maxnode-1, 64) / 8; | ||
| 563 | const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); | ||
| 564 | |||
| 565 | if (copy > nbytes) { | ||
| 566 | if (copy > PAGE_SIZE) | ||
| 567 | return -EINVAL; | ||
| 568 | if (clear_user((char __user *)mask + nbytes, copy - nbytes)) | ||
| 569 | return -EFAULT; | ||
| 570 | copy = nbytes; | ||
| 571 | } | ||
| 572 | return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; | ||
| 573 | } | ||
| 574 | |||
| 575 | asmlinkage long sys_mbind(unsigned long start, unsigned long len, | ||
| 576 | unsigned long mode, | ||
| 577 | unsigned long __user *nmask, unsigned long maxnode, | ||
| 578 | unsigned flags) | ||
| 579 | { | ||
| 580 | nodemask_t nodes; | ||
| 581 | int err; | ||
| 582 | |||
| 583 | err = get_nodes(&nodes, nmask, maxnode); | ||
| 584 | if (err) | ||
| 585 | return err; | ||
| 586 | return do_mbind(start, len, mode, &nodes, flags); | ||
| 587 | } | ||
| 588 | |||
| 589 | /* Set the process memory policy */ | ||
| 590 | asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | ||
| 591 | unsigned long maxnode) | ||
| 592 | { | ||
| 593 | int err; | ||
| 594 | nodemask_t nodes; | ||
| 595 | |||
| 596 | if (mode < 0 || mode > MPOL_MAX) | ||
| 597 | return -EINVAL; | ||
| 598 | err = get_nodes(&nodes, nmask, maxnode); | ||
| 599 | if (err) | ||
| 600 | return err; | ||
| 601 | return do_set_mempolicy(mode, &nodes); | ||
| 602 | } | ||
| 603 | |||
| 604 | /* Retrieve NUMA policy */ | ||
| 605 | asmlinkage long sys_get_mempolicy(int __user *policy, | ||
| 606 | unsigned long __user *nmask, | ||
| 607 | unsigned long maxnode, | ||
| 608 | unsigned long addr, unsigned long flags) | ||
| 609 | { | ||
| 610 | int err, pval; | ||
| 611 | nodemask_t nodes; | ||
| 612 | |||
| 613 | if (nmask != NULL && maxnode < MAX_NUMNODES) | ||
| 614 | return -EINVAL; | ||
| 615 | |||
| 616 | err = do_get_mempolicy(&pval, &nodes, addr, flags); | ||
| 617 | |||
| 618 | if (err) | ||
| 619 | return err; | ||
| 620 | |||
| 621 | if (policy && put_user(pval, policy)) | ||
| 622 | return -EFAULT; | ||
| 623 | |||
| 624 | if (nmask) | ||
| 625 | err = copy_nodes_to_user(nmask, maxnode, &nodes); | ||
| 626 | |||
| 627 | return err; | ||
| 628 | } | ||
| 629 | |||
| 592 | #ifdef CONFIG_COMPAT | 630 | #ifdef CONFIG_COMPAT |
| 593 | 631 | ||
| 594 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, | 632 | asmlinkage long compat_sys_get_mempolicy(int __user *policy, |
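get_nodes(), moved here as part of the new variable-sized user-space bitmap interface, keeps the old endmask arithmetic: only maxnode - 1 bits of the user bitmap are meaningful and the final long is masked down. The masking in isolation, as a hypothetical helper:

/*
 * Illustrative helper, not part of the diff: the masking applied to the
 * last user-supplied long.  For maxnode = 5 it returns 0xf, so only bits
 * 0..3 of that long are kept.
 */
static unsigned long example_endmask(unsigned long maxnode)
{
	--maxnode;			/* the kernel examines maxnode - 1 bits */
	if ((maxnode % BITS_PER_LONG) == 0)
		return ~0UL;
	return (1UL << (maxnode % BITS_PER_LONG)) - 1;
}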
| @@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
| 649 | long err = 0; | 687 | long err = 0; |
| 650 | unsigned long __user *nm = NULL; | 688 | unsigned long __user *nm = NULL; |
| 651 | unsigned long nr_bits, alloc_size; | 689 | unsigned long nr_bits, alloc_size; |
| 652 | DECLARE_BITMAP(bm, MAX_NUMNODES); | 690 | nodemask_t bm; |
| 653 | 691 | ||
| 654 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); | 692 | nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); |
| 655 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; | 693 | alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; |
| 656 | 694 | ||
| 657 | if (nmask) { | 695 | if (nmask) { |
| 658 | err = compat_get_bitmap(bm, nmask, nr_bits); | 696 | err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); |
| 659 | nm = compat_alloc_user_space(alloc_size); | 697 | nm = compat_alloc_user_space(alloc_size); |
| 660 | err |= copy_to_user(nm, bm, alloc_size); | 698 | err |= copy_to_user(nm, nodes_addr(bm), alloc_size); |
| 661 | } | 699 | } |
| 662 | 700 | ||
| 663 | if (err) | 701 | if (err) |
| @@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo | |||
| 676 | 714 | ||
| 677 | if (vma) { | 715 | if (vma) { |
| 678 | if (vma->vm_ops && vma->vm_ops->get_policy) | 716 | if (vma->vm_ops && vma->vm_ops->get_policy) |
| 679 | pol = vma->vm_ops->get_policy(vma, addr); | 717 | pol = vma->vm_ops->get_policy(vma, addr); |
| 680 | else if (vma->vm_policy && | 718 | else if (vma->vm_policy && |
| 681 | vma->vm_policy->policy != MPOL_DEFAULT) | 719 | vma->vm_policy->policy != MPOL_DEFAULT) |
| 682 | pol = vma->vm_policy; | 720 | pol = vma->vm_policy; |
| @@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
| 722 | struct task_struct *me = current; | 760 | struct task_struct *me = current; |
| 723 | 761 | ||
| 724 | nid = me->il_next; | 762 | nid = me->il_next; |
| 725 | BUG_ON(nid >= MAX_NUMNODES); | 763 | next = next_node(nid, policy->v.nodes); |
| 726 | next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); | ||
| 727 | if (next >= MAX_NUMNODES) | 764 | if (next >= MAX_NUMNODES) |
| 728 | next = find_first_bit(policy->v.nodes, MAX_NUMNODES); | 765 | next = first_node(policy->v.nodes); |
| 729 | me->il_next = next; | 766 | me->il_next = next; |
| 730 | return nid; | 767 | return nid; |
| 731 | } | 768 | } |
| @@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
| 734 | static unsigned offset_il_node(struct mempolicy *pol, | 771 | static unsigned offset_il_node(struct mempolicy *pol, |
| 735 | struct vm_area_struct *vma, unsigned long off) | 772 | struct vm_area_struct *vma, unsigned long off) |
| 736 | { | 773 | { |
| 737 | unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); | 774 | unsigned nnodes = nodes_weight(pol->v.nodes); |
| 738 | unsigned target = (unsigned)off % nnodes; | 775 | unsigned target = (unsigned)off % nnodes; |
| 739 | int c; | 776 | int c; |
| 740 | int nid = -1; | 777 | int nid = -1; |
| 741 | 778 | ||
| 742 | c = 0; | 779 | c = 0; |
| 743 | do { | 780 | do { |
| 744 | nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); | 781 | nid = next_node(nid, pol->v.nodes); |
| 745 | c++; | 782 | c++; |
| 746 | } while (c <= target); | 783 | } while (c <= target); |
| 747 | BUG_ON(nid >= MAX_NUMNODES); | ||
| 748 | BUG_ON(!test_bit(nid, pol->v.nodes)); | ||
| 749 | return nid; | 784 | return nid; |
| 750 | } | 785 | } |
| 751 | 786 | ||
| 752 | /* Allocate a page in interleaved policy. | 787 | /* Allocate a page in interleaved policy. |
| 753 | Own path because it needs to do special accounting. */ | 788 | Own path because it needs to do special accounting. */ |
| 754 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) | 789 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
| 790 | unsigned nid) | ||
| 755 | { | 791 | { |
| 756 | struct zonelist *zl; | 792 | struct zonelist *zl; |
| 757 | struct page *page; | 793 | struct page *page; |
| 758 | 794 | ||
| 759 | BUG_ON(!node_online(nid)); | ||
| 760 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 795 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); |
| 761 | page = __alloc_pages(gfp, order, zl); | 796 | page = __alloc_pages(gfp, order, zl); |
| 762 | if (page && page_zone(page) == zl->zones[0]) { | 797 | if (page && page_zone(page) == zl->zones[0]) { |
| @@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 799 | unsigned nid; | 834 | unsigned nid; |
| 800 | if (vma) { | 835 | if (vma) { |
| 801 | unsigned long off; | 836 | unsigned long off; |
| 802 | BUG_ON(addr >= vma->vm_end); | ||
| 803 | BUG_ON(addr < vma->vm_start); | ||
| 804 | off = vma->vm_pgoff; | 837 | off = vma->vm_pgoff; |
| 805 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | 838 | off += (addr - vma->vm_start) >> PAGE_SHIFT; |
| 806 | nid = offset_il_node(pol, vma, off); | 839 | nid = offset_il_node(pol, vma, off); |
| @@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
| 878 | case MPOL_DEFAULT: | 911 | case MPOL_DEFAULT: |
| 879 | return 1; | 912 | return 1; |
| 880 | case MPOL_INTERLEAVE: | 913 | case MPOL_INTERLEAVE: |
| 881 | return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); | 914 | return nodes_equal(a->v.nodes, b->v.nodes); |
| 882 | case MPOL_PREFERRED: | 915 | case MPOL_PREFERRED: |
| 883 | return a->v.preferred_node == b->v.preferred_node; | 916 | return a->v.preferred_node == b->v.preferred_node; |
| 884 | case MPOL_BIND: { | 917 | case MPOL_BIND: { |
| @@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
| 1117 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", | 1150 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", |
| 1118 | vma->vm_pgoff, | 1151 | vma->vm_pgoff, |
| 1119 | sz, npol? npol->policy : -1, | 1152 | sz, npol? npol->policy : -1, |
| 1120 | npol ? npol->v.nodes[0] : -1); | 1153 | npol ? nodes_addr(npol->v.nodes)[0] : -1); |
| 1121 | 1154 | ||
| 1122 | if (npol) { | 1155 | if (npol) { |
| 1123 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 1156 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
| @@ -1164,14 +1197,12 @@ void __init numa_policy_init(void) | |||
| 1164 | /* Set interleaving policy for system init. This way not all | 1197 | /* Set interleaving policy for system init. This way not all |
| 1165 | the data structures allocated at system boot end up in node zero. */ | 1198 | the data structures allocated at system boot end up in node zero. */ |
| 1166 | 1199 | ||
| 1167 | if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), | 1200 | if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) |
| 1168 | MAX_NUMNODES) < 0) | ||
| 1169 | printk("numa_policy_init: interleaving failed\n"); | 1201 | printk("numa_policy_init: interleaving failed\n"); |
| 1170 | } | 1202 | } |
| 1171 | 1203 | ||
| 1172 | /* Reset policy of current process to default. | 1204 | /* Reset policy of current process to default */ |
| 1173 | * Assumes fs == KERNEL_DS */ | ||
| 1174 | void numa_default_policy(void) | 1205 | void numa_default_policy(void) |
| 1175 | { | 1206 | { |
| 1176 | sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); | 1207 | do_set_mempolicy(MPOL_DEFAULT, NULL); |
| 1177 | } | 1208 | } |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -181,26 +181,36 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
| 181 | } | 181 | } |
| 182 | 182 | ||
| 183 | /* | 183 | /* |
| 184 | * Remove one vm structure and free it. | 184 | * Unlink a file-based vm structure from its prio_tree, to hide |
| 185 | * vma from rmap and vmtruncate before freeing its page tables. | ||
| 185 | */ | 186 | */ |
| 186 | static void remove_vm_struct(struct vm_area_struct *vma) | 187 | void unlink_file_vma(struct vm_area_struct *vma) |
| 187 | { | 188 | { |
| 188 | struct file *file = vma->vm_file; | 189 | struct file *file = vma->vm_file; |
| 189 | 190 | ||
| 190 | might_sleep(); | ||
| 191 | if (file) { | 191 | if (file) { |
| 192 | struct address_space *mapping = file->f_mapping; | 192 | struct address_space *mapping = file->f_mapping; |
| 193 | spin_lock(&mapping->i_mmap_lock); | 193 | spin_lock(&mapping->i_mmap_lock); |
| 194 | __remove_shared_vm_struct(vma, file, mapping); | 194 | __remove_shared_vm_struct(vma, file, mapping); |
| 195 | spin_unlock(&mapping->i_mmap_lock); | 195 | spin_unlock(&mapping->i_mmap_lock); |
| 196 | } | 196 | } |
| 197 | } | ||
| 198 | |||
| 199 | /* | ||
| 200 | * Close a vm structure and free it, returning the next. | ||
| 201 | */ | ||
| 202 | static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | ||
| 203 | { | ||
| 204 | struct vm_area_struct *next = vma->vm_next; | ||
| 205 | |||
| 206 | might_sleep(); | ||
| 197 | if (vma->vm_ops && vma->vm_ops->close) | 207 | if (vma->vm_ops && vma->vm_ops->close) |
| 198 | vma->vm_ops->close(vma); | 208 | vma->vm_ops->close(vma); |
| 199 | if (file) | 209 | if (vma->vm_file) |
| 200 | fput(file); | 210 | fput(vma->vm_file); |
| 201 | anon_vma_unlink(vma); | ||
| 202 | mpol_free(vma_policy(vma)); | 211 | mpol_free(vma_policy(vma)); |
| 203 | kmem_cache_free(vm_area_cachep, vma); | 212 | kmem_cache_free(vm_area_cachep, vma); |
| 213 | return next; | ||
| 204 | } | 214 | } |
| 205 | 215 | ||
| 206 | asmlinkage unsigned long sys_brk(unsigned long brk) | 216 | asmlinkage unsigned long sys_brk(unsigned long brk) |
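The split above separates hiding a vma (unlink_file_vma(), called before its page tables are freed) from closing and freeing it (remove_vma(), called afterwards with no mm lock held, since ->close() and fput() may sleep). Because remove_vma() returns the next vma, list teardown reduces to the loop in the hypothetical wrapper below.

/*
 * Hypothetical wrapper, not part of the diff (remove_vma() is static to
 * mm/mmap.c): teardown of an already-unmapped vma list.
 */
static void example_free_vma_list(struct vm_area_struct *vma)
{
	while (vma)
		vma = remove_vma(vma);	/* closes, fputs, frees, returns next */
}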
| @@ -832,7 +842,7 @@ none: | |||
| 832 | } | 842 | } |
| 833 | 843 | ||
| 834 | #ifdef CONFIG_PROC_FS | 844 | #ifdef CONFIG_PROC_FS |
| 835 | void __vm_stat_account(struct mm_struct *mm, unsigned long flags, | 845 | void vm_stat_account(struct mm_struct *mm, unsigned long flags, |
| 836 | struct file *file, long pages) | 846 | struct file *file, long pages) |
| 837 | { | 847 | { |
| 838 | const unsigned long stack_flags | 848 | const unsigned long stack_flags |
| @@ -1070,6 +1080,17 @@ munmap_back: | |||
| 1070 | error = file->f_op->mmap(file, vma); | 1080 | error = file->f_op->mmap(file, vma); |
| 1071 | if (error) | 1081 | if (error) |
| 1072 | goto unmap_and_free_vma; | 1082 | goto unmap_and_free_vma; |
| 1083 | if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) | ||
| 1084 | == (VM_WRITE | VM_RESERVED)) { | ||
| 1085 | printk(KERN_WARNING "program %s is using MAP_PRIVATE, " | ||
| 1086 | "PROT_WRITE mmap of VM_RESERVED memory, which " | ||
| 1087 | "is deprecated. Please report this to " | ||
| 1088 | "linux-kernel@vger.kernel.org\n",current->comm); | ||
| 1089 | if (vma->vm_ops && vma->vm_ops->close) | ||
| 1090 | vma->vm_ops->close(vma); | ||
| 1091 | error = -EACCES; | ||
| 1092 | goto unmap_and_free_vma; | ||
| 1093 | } | ||
| 1073 | } else if (vm_flags & VM_SHARED) { | 1094 | } else if (vm_flags & VM_SHARED) { |
| 1074 | error = shmem_zero_setup(vma); | 1095 | error = shmem_zero_setup(vma); |
| 1075 | if (error) | 1096 | if (error) |
| @@ -1110,7 +1131,7 @@ munmap_back: | |||
| 1110 | } | 1131 | } |
| 1111 | out: | 1132 | out: |
| 1112 | mm->total_vm += len >> PAGE_SHIFT; | 1133 | mm->total_vm += len >> PAGE_SHIFT; |
| 1113 | __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1134 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
| 1114 | if (vm_flags & VM_LOCKED) { | 1135 | if (vm_flags & VM_LOCKED) { |
| 1115 | mm->locked_vm += len >> PAGE_SHIFT; | 1136 | mm->locked_vm += len >> PAGE_SHIFT; |
| 1116 | make_pages_present(addr, addr + len); | 1137 | make_pages_present(addr, addr + len); |
| @@ -1475,15 +1496,19 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
| 1475 | mm->total_vm += grow; | 1496 | mm->total_vm += grow; |
| 1476 | if (vma->vm_flags & VM_LOCKED) | 1497 | if (vma->vm_flags & VM_LOCKED) |
| 1477 | mm->locked_vm += grow; | 1498 | mm->locked_vm += grow; |
| 1478 | __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
| 1479 | return 0; | 1500 | return 0; |
| 1480 | } | 1501 | } |
| 1481 | 1502 | ||
| 1482 | #ifdef CONFIG_STACK_GROWSUP | 1503 | #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) |
| 1483 | /* | 1504 | /* |
| 1484 | * vma is the first one with address > vma->vm_end. Have to extend vma. | 1505 | * PA-RISC uses this for its stack; IA64 for its Register Backing Store. |
| 1506 | * vma is the last one with address > vma->vm_end. Have to extend vma. | ||
| 1485 | */ | 1507 | */ |
| 1486 | int expand_stack(struct vm_area_struct * vma, unsigned long address) | 1508 | #ifdef CONFIG_STACK_GROWSUP |
| 1509 | static inline | ||
| 1510 | #endif | ||
| 1511 | int expand_upwards(struct vm_area_struct *vma, unsigned long address) | ||
| 1487 | { | 1512 | { |
| 1488 | int error; | 1513 | int error; |
| 1489 | 1514 | ||
| @@ -1521,6 +1546,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) | |||
| 1521 | anon_vma_unlock(vma); | 1546 | anon_vma_unlock(vma); |
| 1522 | return error; | 1547 | return error; |
| 1523 | } | 1548 | } |
| 1549 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | ||
| 1550 | |||
| 1551 | #ifdef CONFIG_STACK_GROWSUP | ||
| 1552 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
| 1553 | { | ||
| 1554 | return expand_upwards(vma, address); | ||
| 1555 | } | ||
| 1524 | 1556 | ||
| 1525 | struct vm_area_struct * | 1557 | struct vm_area_struct * |
| 1526 | find_extend_vma(struct mm_struct *mm, unsigned long addr) | 1558 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
| @@ -1603,36 +1635,24 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
| 1603 | } | 1635 | } |
| 1604 | #endif | 1636 | #endif |
| 1605 | 1637 | ||
| 1606 | /* Normal function to fix up a mapping | ||
| 1607 | * This function is the default for when an area has no specific | ||
| 1608 | * function. This may be used as part of a more specific routine. | ||
| 1609 | * | ||
| 1610 | * By the time this function is called, the area struct has been | ||
| 1611 | * removed from the process mapping list. | ||
| 1612 | */ | ||
| 1613 | static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) | ||
| 1614 | { | ||
| 1615 | size_t len = area->vm_end - area->vm_start; | ||
| 1616 | |||
| 1617 | area->vm_mm->total_vm -= len >> PAGE_SHIFT; | ||
| 1618 | if (area->vm_flags & VM_LOCKED) | ||
| 1619 | area->vm_mm->locked_vm -= len >> PAGE_SHIFT; | ||
| 1620 | vm_stat_unaccount(area); | ||
| 1621 | remove_vm_struct(area); | ||
| 1622 | } | ||
| 1623 | |||
| 1624 | /* | 1638 | /* |
| 1625 | * Update the VMA and inode share lists. | 1639 | * Ok - we have the memory areas we should free on the vma list, |
| 1626 | * | ||
| 1627 | * Ok - we have the memory areas we should free on the 'free' list, | ||
| 1628 | * so release them, and do the vma updates. | 1640 | * so release them, and do the vma updates. |
| 1641 | * | ||
| 1642 | * Called with the mm semaphore held. | ||
| 1629 | */ | 1643 | */ |
| 1630 | static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 1644 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
| 1631 | { | 1645 | { |
| 1646 | /* Update high watermark before we lower total_vm */ | ||
| 1647 | update_hiwater_vm(mm); | ||
| 1632 | do { | 1648 | do { |
| 1633 | struct vm_area_struct *next = vma->vm_next; | 1649 | long nrpages = vma_pages(vma); |
| 1634 | unmap_vma(mm, vma); | 1650 | |
| 1635 | vma = next; | 1651 | mm->total_vm -= nrpages; |
| 1652 | if (vma->vm_flags & VM_LOCKED) | ||
| 1653 | mm->locked_vm -= nrpages; | ||
| 1654 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | ||
| 1655 | vma = remove_vma(vma); | ||
| 1636 | } while (vma); | 1656 | } while (vma); |
| 1637 | validate_mm(mm); | 1657 | validate_mm(mm); |
| 1638 | } | 1658 | } |
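remove_vma_list() above samples the vm high-water mark before lowering total_vm, and unmap_region()/exit_mmap() later do the equivalent for rss; together they replace the update_mem_hiwater() helper removed earlier in this diff. Roughly what such an update amounts to; the real kernel defines update_hiwater_vm() as a macro in linux/mm.h, so the inline-function form is illustrative only.

static inline void example_update_hiwater_vm(struct mm_struct *mm)
{
	if (mm->hiwater_vm < mm->total_vm)
		mm->hiwater_vm = mm->total_vm;	/* sample the peak before -= */
}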
| @@ -1651,14 +1671,13 @@ static void unmap_region(struct mm_struct *mm, | |||
| 1651 | unsigned long nr_accounted = 0; | 1671 | unsigned long nr_accounted = 0; |
| 1652 | 1672 | ||
| 1653 | lru_add_drain(); | 1673 | lru_add_drain(); |
| 1654 | spin_lock(&mm->page_table_lock); | ||
| 1655 | tlb = tlb_gather_mmu(mm, 0); | 1674 | tlb = tlb_gather_mmu(mm, 0); |
| 1656 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); | 1675 | update_hiwater_rss(mm); |
| 1676 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | ||
| 1657 | vm_unacct_memory(nr_accounted); | 1677 | vm_unacct_memory(nr_accounted); |
| 1658 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, | 1678 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, |
| 1659 | next? next->vm_start: 0); | 1679 | next? next->vm_start: 0); |
| 1660 | tlb_finish_mmu(tlb, start, end); | 1680 | tlb_finish_mmu(tlb, start, end); |
| 1661 | spin_unlock(&mm->page_table_lock); | ||
| 1662 | } | 1681 | } |
| 1663 | 1682 | ||
| 1664 | /* | 1683 | /* |
| @@ -1799,7 +1818,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
| 1799 | unmap_region(mm, vma, prev, start, end); | 1818 | unmap_region(mm, vma, prev, start, end); |
| 1800 | 1819 | ||
| 1801 | /* Fix up all other VM information */ | 1820 | /* Fix up all other VM information */ |
| 1802 | unmap_vma_list(mm, vma); | 1821 | remove_vma_list(mm, vma); |
| 1803 | 1822 | ||
| 1804 | return 0; | 1823 | return 0; |
| 1805 | } | 1824 | } |
| @@ -1933,34 +1952,21 @@ void exit_mmap(struct mm_struct *mm) | |||
| 1933 | unsigned long end; | 1952 | unsigned long end; |
| 1934 | 1953 | ||
| 1935 | lru_add_drain(); | 1954 | lru_add_drain(); |
| 1936 | |||
| 1937 | spin_lock(&mm->page_table_lock); | ||
| 1938 | |||
| 1939 | flush_cache_mm(mm); | 1955 | flush_cache_mm(mm); |
| 1940 | tlb = tlb_gather_mmu(mm, 1); | 1956 | tlb = tlb_gather_mmu(mm, 1); |
| 1957 | /* Don't update_hiwater_rss(mm) here, do_exit already did */ | ||
| 1941 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 1958 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
| 1942 | end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); | 1959 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
| 1943 | vm_unacct_memory(nr_accounted); | 1960 | vm_unacct_memory(nr_accounted); |
| 1944 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 1961 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); |
| 1945 | tlb_finish_mmu(tlb, 0, end); | 1962 | tlb_finish_mmu(tlb, 0, end); |
| 1946 | 1963 | ||
| 1947 | mm->mmap = mm->mmap_cache = NULL; | ||
| 1948 | mm->mm_rb = RB_ROOT; | ||
| 1949 | set_mm_counter(mm, rss, 0); | ||
| 1950 | mm->total_vm = 0; | ||
| 1951 | mm->locked_vm = 0; | ||
| 1952 | |||
| 1953 | spin_unlock(&mm->page_table_lock); | ||
| 1954 | |||
| 1955 | /* | 1964 | /* |
| 1956 | * Walk the list again, actually closing and freeing it | 1965 | * Walk the list again, actually closing and freeing it, |
| 1957 | * without holding any MM locks. | 1966 | * with preemption enabled, without holding any MM locks. |
| 1958 | */ | 1967 | */ |
| 1959 | while (vma) { | 1968 | while (vma) |
| 1960 | struct vm_area_struct *next = vma->vm_next; | 1969 | vma = remove_vma(vma); |
| 1961 | remove_vm_struct(vma); | ||
| 1962 | vma = next; | ||
| 1963 | } | ||
| 1964 | 1970 | ||
| 1965 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | 1971 | BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); |
| 1966 | } | 1972 | } |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 57577f63b305..17a2b52b753b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 29 | unsigned long addr, unsigned long end, pgprot_t newprot) | 29 | unsigned long addr, unsigned long end, pgprot_t newprot) |
| 30 | { | 30 | { |
| 31 | pte_t *pte; | 31 | pte_t *pte; |
| 32 | spinlock_t *ptl; | ||
| 32 | 33 | ||
| 33 | pte = pte_offset_map(pmd, addr); | 34 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
| 34 | do { | 35 | do { |
| 35 | if (pte_present(*pte)) { | 36 | if (pte_present(*pte)) { |
| 36 | pte_t ptent; | 37 | pte_t ptent; |
| @@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 44 | lazy_mmu_prot_update(ptent); | 45 | lazy_mmu_prot_update(ptent); |
| 45 | } | 46 | } |
| 46 | } while (pte++, addr += PAGE_SIZE, addr != end); | 47 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 47 | pte_unmap(pte - 1); | 48 | pte_unmap_unlock(pte - 1, ptl); |
| 48 | } | 49 | } |
| 49 | 50 | ||
| 50 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, | 51 | static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, |
| @@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma, | |||
| 88 | BUG_ON(addr >= end); | 89 | BUG_ON(addr >= end); |
| 89 | pgd = pgd_offset(mm, addr); | 90 | pgd = pgd_offset(mm, addr); |
| 90 | flush_cache_range(vma, addr, end); | 91 | flush_cache_range(vma, addr, end); |
| 91 | spin_lock(&mm->page_table_lock); | ||
| 92 | do { | 92 | do { |
| 93 | next = pgd_addr_end(addr, end); | 93 | next = pgd_addr_end(addr, end); |
| 94 | if (pgd_none_or_clear_bad(pgd)) | 94 | if (pgd_none_or_clear_bad(pgd)) |
| @@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma, | |||
| 96 | change_pud_range(mm, pgd, addr, next, newprot); | 96 | change_pud_range(mm, pgd, addr, next, newprot); |
| 97 | } while (pgd++, addr = next, addr != end); | 97 | } while (pgd++, addr = next, addr != end); |
| 98 | flush_tlb_range(vma, start, end); | 98 | flush_tlb_range(vma, start, end); |
| 99 | spin_unlock(&mm->page_table_lock); | ||
| 100 | } | 99 | } |
| 101 | 100 | ||
| 102 | static int | 101 | static int |
| @@ -125,6 +124,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
| 125 | * a MAP_NORESERVE private mapping to writable will now reserve. | 124 | * a MAP_NORESERVE private mapping to writable will now reserve. |
| 126 | */ | 125 | */ |
| 127 | if (newflags & VM_WRITE) { | 126 | if (newflags & VM_WRITE) { |
| 127 | if (oldflags & VM_RESERVED) { | ||
| 128 | BUG_ON(oldflags & VM_WRITE); | ||
| 129 | printk(KERN_WARNING "program %s is using MAP_PRIVATE, " | ||
| 130 | "PROT_WRITE mprotect of VM_RESERVED memory, " | ||
| 131 | "which is deprecated. Please report this to " | ||
| 132 | "linux-kernel@vger.kernel.org\n",current->comm); | ||
| 133 | return -EACCES; | ||
| 134 | } | ||
| 128 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { | 135 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { |
| 129 | charged = nrpages; | 136 | charged = nrpages; |
| 130 | if (security_vm_enough_memory(charged)) | 137 | if (security_vm_enough_memory(charged)) |
| @@ -168,8 +175,8 @@ success: | |||
| 168 | vma->vm_flags = newflags; | 175 | vma->vm_flags = newflags; |
| 169 | vma->vm_page_prot = newprot; | 176 | vma->vm_page_prot = newprot; |
| 170 | change_protection(vma, start, end, newprot); | 177 | change_protection(vma, start, end, newprot); |
| 171 | __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 178 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
| 172 | __vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 179 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
| 173 | return 0; | 180 | return 0; |
| 174 | 181 | ||
| 175 | fail: | 182 | fail: |
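The change_pte_range() hunk above swaps the mm-wide page_table_lock for the per-page-table lock returned by pte_offset_map_lock()/pte_unmap_unlock(), matching the new SPLIT_PTLOCK_CPUS option. A rough userspace model of split locking, assuming invented names (`table_lock`, `lockptr`, `change_range`) and pthread mutexes in place of spinlocks:

```c
#include <pthread.h>
#include <stdio.h>

#define NR_TABLES	4
#define PTRS_PER_TABLE	8

/* One lock per page-table page, standing in for pte_lockptr(). */
static pthread_mutex_t table_lock[NR_TABLES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static unsigned long table[NR_TABLES][PTRS_PER_TABLE];

static pthread_mutex_t *lockptr(int t)
{
	return &table_lock[t];
}

/* Analogue of change_pte_range(): take only the lock covering this table,
 * update every entry, drop the lock.  Other tables stay uncontended. */
static void change_range(int t, unsigned long newprot)
{
	pthread_mutex_t *ptl = lockptr(t);

	pthread_mutex_lock(ptl);
	for (int i = 0; i < PTRS_PER_TABLE; i++)
		table[t][i] = newprot;
	pthread_mutex_unlock(ptl);
}

int main(void)
{
	for (int t = 0; t < NR_TABLES; t++)
		change_range(t, 0x25);
	printf("entry[0][0] = %#lx\n", table[0][0]);
	return 0;
}
```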
diff --git a/mm/mremap.c b/mm/mremap.c index f343fc73a8bd..b535438c363c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -22,35 +22,7 @@ | |||
| 22 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
| 23 | #include <asm/tlbflush.h> | 23 | #include <asm/tlbflush.h> |
| 24 | 24 | ||
| 25 | static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) | 25 | static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) |
| 26 | { | ||
| 27 | pgd_t *pgd; | ||
| 28 | pud_t *pud; | ||
| 29 | pmd_t *pmd; | ||
| 30 | pte_t *pte = NULL; | ||
| 31 | |||
| 32 | pgd = pgd_offset(mm, addr); | ||
| 33 | if (pgd_none_or_clear_bad(pgd)) | ||
| 34 | goto end; | ||
| 35 | |||
| 36 | pud = pud_offset(pgd, addr); | ||
| 37 | if (pud_none_or_clear_bad(pud)) | ||
| 38 | goto end; | ||
| 39 | |||
| 40 | pmd = pmd_offset(pud, addr); | ||
| 41 | if (pmd_none_or_clear_bad(pmd)) | ||
| 42 | goto end; | ||
| 43 | |||
| 44 | pte = pte_offset_map_nested(pmd, addr); | ||
| 45 | if (pte_none(*pte)) { | ||
| 46 | pte_unmap_nested(pte); | ||
| 47 | pte = NULL; | ||
| 48 | } | ||
| 49 | end: | ||
| 50 | return pte; | ||
| 51 | } | ||
| 52 | |||
| 53 | static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) | ||
| 54 | { | 26 | { |
| 55 | pgd_t *pgd; | 27 | pgd_t *pgd; |
| 56 | pud_t *pud; | 28 | pud_t *pud; |
| @@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) | |||
| 68 | if (pmd_none_or_clear_bad(pmd)) | 40 | if (pmd_none_or_clear_bad(pmd)) |
| 69 | return NULL; | 41 | return NULL; |
| 70 | 42 | ||
| 71 | return pte_offset_map(pmd, addr); | 43 | return pmd; |
| 72 | } | 44 | } |
| 73 | 45 | ||
| 74 | static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) | 46 | static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) |
| 75 | { | 47 | { |
| 76 | pgd_t *pgd; | 48 | pgd_t *pgd; |
| 77 | pud_t *pud; | 49 | pud_t *pud; |
| 78 | pmd_t *pmd; | 50 | pmd_t *pmd; |
| 79 | pte_t *pte = NULL; | ||
| 80 | 51 | ||
| 81 | pgd = pgd_offset(mm, addr); | 52 | pgd = pgd_offset(mm, addr); |
| 82 | |||
| 83 | pud = pud_alloc(mm, pgd, addr); | 53 | pud = pud_alloc(mm, pgd, addr); |
| 84 | if (!pud) | 54 | if (!pud) |
| 85 | return NULL; | 55 | return NULL; |
| 56 | |||
| 86 | pmd = pmd_alloc(mm, pud, addr); | 57 | pmd = pmd_alloc(mm, pud, addr); |
| 87 | if (pmd) | 58 | if (!pmd) |
| 88 | pte = pte_alloc_map(mm, pmd, addr); | 59 | return NULL; |
| 89 | return pte; | 60 | |
| 61 | if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) | ||
| 62 | return NULL; | ||
| 63 | |||
| 64 | return pmd; | ||
| 90 | } | 65 | } |
| 91 | 66 | ||
| 92 | static int | 67 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
| 93 | move_one_page(struct vm_area_struct *vma, unsigned long old_addr, | 68 | unsigned long old_addr, unsigned long old_end, |
| 94 | struct vm_area_struct *new_vma, unsigned long new_addr) | 69 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
| 70 | unsigned long new_addr) | ||
| 95 | { | 71 | { |
| 96 | struct address_space *mapping = NULL; | 72 | struct address_space *mapping = NULL; |
| 97 | struct mm_struct *mm = vma->vm_mm; | 73 | struct mm_struct *mm = vma->vm_mm; |
| 98 | int error = 0; | 74 | pte_t *old_pte, *new_pte, pte; |
| 99 | pte_t *src, *dst; | 75 | spinlock_t *old_ptl, *new_ptl; |
| 100 | 76 | ||
| 101 | if (vma->vm_file) { | 77 | if (vma->vm_file) { |
| 102 | /* | 78 | /* |
| @@ -111,74 +87,69 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr, | |||
| 111 | new_vma->vm_truncate_count != vma->vm_truncate_count) | 87 | new_vma->vm_truncate_count != vma->vm_truncate_count) |
| 112 | new_vma->vm_truncate_count = 0; | 88 | new_vma->vm_truncate_count = 0; |
| 113 | } | 89 | } |
| 114 | spin_lock(&mm->page_table_lock); | ||
| 115 | 90 | ||
| 116 | src = get_one_pte_map_nested(mm, old_addr); | 91 | /* |
| 117 | if (src) { | 92 | * We don't have to worry about the ordering of src and dst |
| 118 | /* | 93 | * pte locks because exclusive mmap_sem prevents deadlock. |
| 119 | * Look to see whether alloc_one_pte_map needs to perform a | 94 | */ |
| 120 | * memory allocation. If it does then we need to drop the | 95 | old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); |
| 121 | * atomic kmap | 96 | new_pte = pte_offset_map_nested(new_pmd, new_addr); |
| 122 | */ | 97 | new_ptl = pte_lockptr(mm, new_pmd); |
| 123 | dst = get_one_pte_map(mm, new_addr); | 98 | if (new_ptl != old_ptl) |
| 124 | if (unlikely(!dst)) { | 99 | spin_lock(new_ptl); |
| 125 | pte_unmap_nested(src); | 100 | |
| 126 | if (mapping) | 101 | for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, |
| 127 | spin_unlock(&mapping->i_mmap_lock); | 102 | new_pte++, new_addr += PAGE_SIZE) { |
| 128 | dst = alloc_one_pte_map(mm, new_addr); | 103 | if (pte_none(*old_pte)) |
| 129 | if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { | 104 | continue; |
| 130 | spin_unlock(&mm->page_table_lock); | 105 | pte = ptep_clear_flush(vma, old_addr, old_pte); |
| 131 | spin_lock(&mapping->i_mmap_lock); | 106 | /* ZERO_PAGE can be dependent on virtual addr */ |
| 132 | spin_lock(&mm->page_table_lock); | 107 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); |
| 133 | } | 108 | set_pte_at(mm, new_addr, new_pte, pte); |
| 134 | src = get_one_pte_map_nested(mm, old_addr); | ||
| 135 | } | ||
| 136 | /* | ||
| 137 | * Since alloc_one_pte_map can drop and re-acquire | ||
| 138 | * page_table_lock, we should re-check the src entry... | ||
| 139 | */ | ||
| 140 | if (src) { | ||
| 141 | if (dst) { | ||
| 142 | pte_t pte; | ||
| 143 | pte = ptep_clear_flush(vma, old_addr, src); | ||
| 144 | |||
| 145 | /* ZERO_PAGE can be dependent on virtual addr */ | ||
| 146 | pte = move_pte(pte, new_vma->vm_page_prot, | ||
| 147 | old_addr, new_addr); | ||
| 148 | set_pte_at(mm, new_addr, dst, pte); | ||
| 149 | } else | ||
| 150 | error = -ENOMEM; | ||
| 151 | pte_unmap_nested(src); | ||
| 152 | } | ||
| 153 | if (dst) | ||
| 154 | pte_unmap(dst); | ||
| 155 | } | 109 | } |
| 156 | spin_unlock(&mm->page_table_lock); | 110 | |
| 111 | if (new_ptl != old_ptl) | ||
| 112 | spin_unlock(new_ptl); | ||
| 113 | pte_unmap_nested(new_pte - 1); | ||
| 114 | pte_unmap_unlock(old_pte - 1, old_ptl); | ||
| 157 | if (mapping) | 115 | if (mapping) |
| 158 | spin_unlock(&mapping->i_mmap_lock); | 116 | spin_unlock(&mapping->i_mmap_lock); |
| 159 | return error; | ||
| 160 | } | 117 | } |
| 161 | 118 | ||
| 119 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | ||
| 120 | |||
| 162 | static unsigned long move_page_tables(struct vm_area_struct *vma, | 121 | static unsigned long move_page_tables(struct vm_area_struct *vma, |
| 163 | unsigned long old_addr, struct vm_area_struct *new_vma, | 122 | unsigned long old_addr, struct vm_area_struct *new_vma, |
| 164 | unsigned long new_addr, unsigned long len) | 123 | unsigned long new_addr, unsigned long len) |
| 165 | { | 124 | { |
| 166 | unsigned long offset; | 125 | unsigned long extent, next, old_end; |
| 126 | pmd_t *old_pmd, *new_pmd; | ||
| 167 | 127 | ||
| 168 | flush_cache_range(vma, old_addr, old_addr + len); | 128 | old_end = old_addr + len; |
| 129 | flush_cache_range(vma, old_addr, old_end); | ||
| 169 | 130 | ||
| 170 | /* | 131 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
| 171 | * This is not the clever way to do this, but we're taking the | ||
| 172 | * easy way out on the assumption that most remappings will be | ||
| 173 | * only a few pages.. This also makes error recovery easier. | ||
| 174 | */ | ||
| 175 | for (offset = 0; offset < len; offset += PAGE_SIZE) { | ||
| 176 | if (move_one_page(vma, old_addr + offset, | ||
| 177 | new_vma, new_addr + offset) < 0) | ||
| 178 | break; | ||
| 179 | cond_resched(); | 132 | cond_resched(); |
| 133 | next = (old_addr + PMD_SIZE) & PMD_MASK; | ||
| 134 | if (next - 1 > old_end) | ||
| 135 | next = old_end; | ||
| 136 | extent = next - old_addr; | ||
| 137 | old_pmd = get_old_pmd(vma->vm_mm, old_addr); | ||
| 138 | if (!old_pmd) | ||
| 139 | continue; | ||
| 140 | new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); | ||
| 141 | if (!new_pmd) | ||
| 142 | break; | ||
| 143 | next = (new_addr + PMD_SIZE) & PMD_MASK; | ||
| 144 | if (extent > next - new_addr) | ||
| 145 | extent = next - new_addr; | ||
| 146 | if (extent > LATENCY_LIMIT) | ||
| 147 | extent = LATENCY_LIMIT; | ||
| 148 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | ||
| 149 | new_vma, new_pmd, new_addr); | ||
| 180 | } | 150 | } |
| 181 | return offset; | 151 | |
| 152 | return len + old_addr - old_end; /* how much done */ | ||
| 182 | } | 153 | } |
| 183 | 154 | ||
| 184 | static unsigned long move_vma(struct vm_area_struct *vma, | 155 | static unsigned long move_vma(struct vm_area_struct *vma, |
| @@ -191,6 +162,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 191 | unsigned long new_pgoff; | 162 | unsigned long new_pgoff; |
| 192 | unsigned long moved_len; | 163 | unsigned long moved_len; |
| 193 | unsigned long excess = 0; | 164 | unsigned long excess = 0; |
| 165 | unsigned long hiwater_vm; | ||
| 194 | int split = 0; | 166 | int split = 0; |
| 195 | 167 | ||
| 196 | /* | 168 | /* |
| @@ -229,17 +201,24 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 229 | } | 201 | } |
| 230 | 202 | ||
| 231 | /* | 203 | /* |
| 232 | * if we failed to move page tables we still do total_vm increment | 204 | * If we failed to move page tables we still do total_vm increment |
| 233 | * since do_munmap() will decrement it by old_len == new_len | 205 | * since do_munmap() will decrement it by old_len == new_len. |
| 206 | * | ||
| 207 | * Since total_vm is about to be raised artificially high for a | ||
| 208 | * moment, we need to restore high watermark afterwards: if stats | ||
| 209 | * are taken meanwhile, total_vm and hiwater_vm appear too high. | ||
| 210 | * If this were a serious issue, we'd add a flag to do_munmap(). | ||
| 234 | */ | 211 | */ |
| 212 | hiwater_vm = mm->hiwater_vm; | ||
| 235 | mm->total_vm += new_len >> PAGE_SHIFT; | 213 | mm->total_vm += new_len >> PAGE_SHIFT; |
| 236 | __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 214 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
| 237 | 215 | ||
| 238 | if (do_munmap(mm, old_addr, old_len) < 0) { | 216 | if (do_munmap(mm, old_addr, old_len) < 0) { |
| 239 | /* OOM: unable to split vma, just get accounts right */ | 217 | /* OOM: unable to split vma, just get accounts right */ |
| 240 | vm_unacct_memory(excess >> PAGE_SHIFT); | 218 | vm_unacct_memory(excess >> PAGE_SHIFT); |
| 241 | excess = 0; | 219 | excess = 0; |
| 242 | } | 220 | } |
| 221 | mm->hiwater_vm = hiwater_vm; | ||
| 243 | 222 | ||
| 244 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ | 223 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ |
| 245 | if (excess) { | 224 | if (excess) { |
| @@ -269,6 +248,7 @@ unsigned long do_mremap(unsigned long addr, | |||
| 269 | unsigned long old_len, unsigned long new_len, | 248 | unsigned long old_len, unsigned long new_len, |
| 270 | unsigned long flags, unsigned long new_addr) | 249 | unsigned long flags, unsigned long new_addr) |
| 271 | { | 250 | { |
| 251 | struct mm_struct *mm = current->mm; | ||
| 272 | struct vm_area_struct *vma; | 252 | struct vm_area_struct *vma; |
| 273 | unsigned long ret = -EINVAL; | 253 | unsigned long ret = -EINVAL; |
| 274 | unsigned long charged = 0; | 254 | unsigned long charged = 0; |
| @@ -309,7 +289,7 @@ unsigned long do_mremap(unsigned long addr, | |||
| 309 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | 289 | if ((addr <= new_addr) && (addr+old_len) > new_addr) |
| 310 | goto out; | 290 | goto out; |
| 311 | 291 | ||
| 312 | ret = do_munmap(current->mm, new_addr, new_len); | 292 | ret = do_munmap(mm, new_addr, new_len); |
| 313 | if (ret) | 293 | if (ret) |
| 314 | goto out; | 294 | goto out; |
| 315 | } | 295 | } |
| @@ -320,7 +300,7 @@ unsigned long do_mremap(unsigned long addr, | |||
| 320 | * do_munmap does all the needed commit accounting | 300 | * do_munmap does all the needed commit accounting |
| 321 | */ | 301 | */ |
| 322 | if (old_len >= new_len) { | 302 | if (old_len >= new_len) { |
| 323 | ret = do_munmap(current->mm, addr+new_len, old_len - new_len); | 303 | ret = do_munmap(mm, addr+new_len, old_len - new_len); |
| 324 | if (ret && old_len != new_len) | 304 | if (ret && old_len != new_len) |
| 325 | goto out; | 305 | goto out; |
| 326 | ret = addr; | 306 | ret = addr; |
| @@ -333,7 +313,7 @@ unsigned long do_mremap(unsigned long addr, | |||
| 333 | * Ok, we need to grow.. or relocate. | 313 | * Ok, we need to grow.. or relocate. |
| 334 | */ | 314 | */ |
| 335 | ret = -EFAULT; | 315 | ret = -EFAULT; |
| 336 | vma = find_vma(current->mm, addr); | 316 | vma = find_vma(mm, addr); |
| 337 | if (!vma || vma->vm_start > addr) | 317 | if (!vma || vma->vm_start > addr) |
| 338 | goto out; | 318 | goto out; |
| 339 | if (is_vm_hugetlb_page(vma)) { | 319 | if (is_vm_hugetlb_page(vma)) { |
| @@ -349,14 +329,14 @@ unsigned long do_mremap(unsigned long addr, | |||
| 349 | } | 329 | } |
| 350 | if (vma->vm_flags & VM_LOCKED) { | 330 | if (vma->vm_flags & VM_LOCKED) { |
| 351 | unsigned long locked, lock_limit; | 331 | unsigned long locked, lock_limit; |
| 352 | locked = current->mm->locked_vm << PAGE_SHIFT; | 332 | locked = mm->locked_vm << PAGE_SHIFT; |
| 353 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 333 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
| 354 | locked += new_len - old_len; | 334 | locked += new_len - old_len; |
| 355 | ret = -EAGAIN; | 335 | ret = -EAGAIN; |
| 356 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 336 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 357 | goto out; | 337 | goto out; |
| 358 | } | 338 | } |
| 359 | if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { | 339 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { |
| 360 | ret = -ENOMEM; | 340 | ret = -ENOMEM; |
| 361 | goto out; | 341 | goto out; |
| 362 | } | 342 | } |
| @@ -383,11 +363,10 @@ unsigned long do_mremap(unsigned long addr, | |||
| 383 | vma_adjust(vma, vma->vm_start, | 363 | vma_adjust(vma, vma->vm_start, |
| 384 | addr + new_len, vma->vm_pgoff, NULL); | 364 | addr + new_len, vma->vm_pgoff, NULL); |
| 385 | 365 | ||
| 386 | current->mm->total_vm += pages; | 366 | mm->total_vm += pages; |
| 387 | __vm_stat_account(vma->vm_mm, vma->vm_flags, | 367 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
| 388 | vma->vm_file, pages); | ||
| 389 | if (vma->vm_flags & VM_LOCKED) { | 368 | if (vma->vm_flags & VM_LOCKED) { |
| 390 | current->mm->locked_vm += pages; | 369 | mm->locked_vm += pages; |
| 391 | make_pages_present(addr + old_len, | 370 | make_pages_present(addr + old_len, |
| 392 | addr + new_len); | 371 | addr + new_len); |
| 393 | } | 372 | } |
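move_page_tables() now advances in PMD-sized extents: each step is clamped to the end of the source PMD, the end of the destination PMD, and LATENCY_LIMIT, and the function reports progress as `len + old_addr - old_end`. The arithmetic can be exercised on its own; the sketch below assumes an illustrative 4KB page and 2MB PMD and is not the kernel routine itself:

```c
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(2UL << 20)		/* illustrative 2MB PMD */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define LATENCY_LIMIT	(64 * PAGE_SIZE)

int main(void)
{
	unsigned long old_addr = 0x1ff000, new_addr = 0x400000;
	unsigned long len = 0x300000, old_end = old_addr + len;
	unsigned long extent, next;

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		/* Clamp to the end of the current source PMD (or old_end). */
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		if (next - 1 > old_end)
			next = old_end;
		extent = next - old_addr;

		/* Clamp to the end of the destination PMD as well. */
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;

		/* And never hold the pte locks for more than LATENCY_LIMIT. */
		if (extent > LATENCY_LIMIT)
			extent = LATENCY_LIMIT;

		printf("move %#lx -> %#lx, %lu pages\n",
		       old_addr, new_addr, extent / PAGE_SIZE);
	}
	/* "How much done": equals len once the loop covers the whole range. */
	printf("moved %lu bytes of %lu\n", len + old_addr - old_end, len);
	return 0;
}
```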
diff --git a/mm/msync.c b/mm/msync.c index d0f5a1bce7cb..0e040e9c39d8 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
| @@ -17,40 +17,48 @@ | |||
| 17 | #include <asm/pgtable.h> | 17 | #include <asm/pgtable.h> |
| 18 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
| 19 | 19 | ||
| 20 | /* | 20 | static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
| 21 | * Called with mm->page_table_lock held to protect against other | ||
| 22 | * threads/the swapper from ripping pte's out from under us. | ||
| 23 | */ | ||
| 24 | |||
| 25 | static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
| 26 | unsigned long addr, unsigned long end) | 21 | unsigned long addr, unsigned long end) |
| 27 | { | 22 | { |
| 28 | pte_t *pte; | 23 | pte_t *pte; |
| 24 | spinlock_t *ptl; | ||
| 25 | int progress = 0; | ||
| 29 | 26 | ||
| 30 | pte = pte_offset_map(pmd, addr); | 27 | again: |
| 28 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
| 31 | do { | 29 | do { |
| 32 | unsigned long pfn; | 30 | unsigned long pfn; |
| 33 | struct page *page; | 31 | struct page *page; |
| 34 | 32 | ||
| 33 | if (progress >= 64) { | ||
| 34 | progress = 0; | ||
| 35 | if (need_resched() || need_lockbreak(ptl)) | ||
| 36 | break; | ||
| 37 | } | ||
| 38 | progress++; | ||
| 35 | if (!pte_present(*pte)) | 39 | if (!pte_present(*pte)) |
| 36 | continue; | 40 | continue; |
| 37 | if (!pte_maybe_dirty(*pte)) | 41 | if (!pte_maybe_dirty(*pte)) |
| 38 | continue; | 42 | continue; |
| 39 | pfn = pte_pfn(*pte); | 43 | pfn = pte_pfn(*pte); |
| 40 | if (!pfn_valid(pfn)) | 44 | if (unlikely(!pfn_valid(pfn))) { |
| 45 | print_bad_pte(vma, *pte, addr); | ||
| 41 | continue; | 46 | continue; |
| 47 | } | ||
| 42 | page = pfn_to_page(pfn); | 48 | page = pfn_to_page(pfn); |
| 43 | if (PageReserved(page)) | ||
| 44 | continue; | ||
| 45 | 49 | ||
| 46 | if (ptep_clear_flush_dirty(vma, addr, pte) || | 50 | if (ptep_clear_flush_dirty(vma, addr, pte) || |
| 47 | page_test_and_clear_dirty(page)) | 51 | page_test_and_clear_dirty(page)) |
| 48 | set_page_dirty(page); | 52 | set_page_dirty(page); |
| 53 | progress += 3; | ||
| 49 | } while (pte++, addr += PAGE_SIZE, addr != end); | 54 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 50 | pte_unmap(pte - 1); | 55 | pte_unmap_unlock(pte - 1, ptl); |
| 56 | cond_resched(); | ||
| 57 | if (addr != end) | ||
| 58 | goto again; | ||
| 51 | } | 59 | } |
| 52 | 60 | ||
| 53 | static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 61 | static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| 54 | unsigned long addr, unsigned long end) | 62 | unsigned long addr, unsigned long end) |
| 55 | { | 63 | { |
| 56 | pmd_t *pmd; | 64 | pmd_t *pmd; |
| @@ -61,11 +69,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 61 | next = pmd_addr_end(addr, end); | 69 | next = pmd_addr_end(addr, end); |
| 62 | if (pmd_none_or_clear_bad(pmd)) | 70 | if (pmd_none_or_clear_bad(pmd)) |
| 63 | continue; | 71 | continue; |
| 64 | sync_pte_range(vma, pmd, addr, next); | 72 | msync_pte_range(vma, pmd, addr, next); |
| 65 | } while (pmd++, addr = next, addr != end); | 73 | } while (pmd++, addr = next, addr != end); |
| 66 | } | 74 | } |
| 67 | 75 | ||
| 68 | static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 76 | static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
| 69 | unsigned long addr, unsigned long end) | 77 | unsigned long addr, unsigned long end) |
| 70 | { | 78 | { |
| 71 | pud_t *pud; | 79 | pud_t *pud; |
| @@ -76,58 +84,34 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
| 76 | next = pud_addr_end(addr, end); | 84 | next = pud_addr_end(addr, end); |
| 77 | if (pud_none_or_clear_bad(pud)) | 85 | if (pud_none_or_clear_bad(pud)) |
| 78 | continue; | 86 | continue; |
| 79 | sync_pmd_range(vma, pud, addr, next); | 87 | msync_pmd_range(vma, pud, addr, next); |
| 80 | } while (pud++, addr = next, addr != end); | 88 | } while (pud++, addr = next, addr != end); |
| 81 | } | 89 | } |
| 82 | 90 | ||
| 83 | static void sync_page_range(struct vm_area_struct *vma, | 91 | static void msync_page_range(struct vm_area_struct *vma, |
| 84 | unsigned long addr, unsigned long end) | 92 | unsigned long addr, unsigned long end) |
| 85 | { | 93 | { |
| 86 | struct mm_struct *mm = vma->vm_mm; | ||
| 87 | pgd_t *pgd; | 94 | pgd_t *pgd; |
| 88 | unsigned long next; | 95 | unsigned long next; |
| 89 | 96 | ||
| 90 | /* For hugepages we can't go walking the page table normally, | 97 | /* For hugepages we can't go walking the page table normally, |
| 91 | * but that's ok, hugetlbfs is memory based, so we don't need | 98 | * but that's ok, hugetlbfs is memory based, so we don't need |
| 92 | * to do anything more on an msync() */ | 99 | * to do anything more on an msync(). |
| 93 | if (is_vm_hugetlb_page(vma)) | 100 | * Can't do anything with VM_RESERVED regions either. |
| 101 | */ | ||
| 102 | if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) | ||
| 94 | return; | 103 | return; |
| 95 | 104 | ||
| 96 | BUG_ON(addr >= end); | 105 | BUG_ON(addr >= end); |
| 97 | pgd = pgd_offset(mm, addr); | 106 | pgd = pgd_offset(vma->vm_mm, addr); |
| 98 | flush_cache_range(vma, addr, end); | 107 | flush_cache_range(vma, addr, end); |
| 99 | spin_lock(&mm->page_table_lock); | ||
| 100 | do { | 108 | do { |
| 101 | next = pgd_addr_end(addr, end); | 109 | next = pgd_addr_end(addr, end); |
| 102 | if (pgd_none_or_clear_bad(pgd)) | 110 | if (pgd_none_or_clear_bad(pgd)) |
| 103 | continue; | 111 | continue; |
| 104 | sync_pud_range(vma, pgd, addr, next); | 112 | msync_pud_range(vma, pgd, addr, next); |
| 105 | } while (pgd++, addr = next, addr != end); | 113 | } while (pgd++, addr = next, addr != end); |
| 106 | spin_unlock(&mm->page_table_lock); | ||
| 107 | } | ||
| 108 | |||
| 109 | #ifdef CONFIG_PREEMPT | ||
| 110 | static inline void filemap_sync(struct vm_area_struct *vma, | ||
| 111 | unsigned long addr, unsigned long end) | ||
| 112 | { | ||
| 113 | const size_t chunk = 64 * 1024; /* bytes */ | ||
| 114 | unsigned long next; | ||
| 115 | |||
| 116 | do { | ||
| 117 | next = addr + chunk; | ||
| 118 | if (next > end || next < addr) | ||
| 119 | next = end; | ||
| 120 | sync_page_range(vma, addr, next); | ||
| 121 | cond_resched(); | ||
| 122 | } while (addr = next, addr != end); | ||
| 123 | } | ||
| 124 | #else | ||
| 125 | static inline void filemap_sync(struct vm_area_struct *vma, | ||
| 126 | unsigned long addr, unsigned long end) | ||
| 127 | { | ||
| 128 | sync_page_range(vma, addr, end); | ||
| 129 | } | 114 | } |
| 130 | #endif | ||
| 131 | 115 | ||
| 132 | /* | 116 | /* |
| 133 | * MS_SYNC syncs the entire file - including mappings. | 117 | * MS_SYNC syncs the entire file - including mappings. |
| @@ -150,7 +134,7 @@ static int msync_interval(struct vm_area_struct *vma, | |||
| 150 | return -EBUSY; | 134 | return -EBUSY; |
| 151 | 135 | ||
| 152 | if (file && (vma->vm_flags & VM_SHARED)) { | 136 | if (file && (vma->vm_flags & VM_SHARED)) { |
| 153 | filemap_sync(vma, addr, end); | 137 | msync_page_range(vma, addr, end); |
| 154 | 138 | ||
| 155 | if (flags & MS_SYNC) { | 139 | if (flags & MS_SYNC) { |
| 156 | struct address_space *mapping = file->f_mapping; | 140 | struct address_space *mapping = file->f_mapping; |
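msync_pte_range() now holds the pte lock only for bounded bursts: a `progress` counter (dirty entries count as three) triggers a break, the lock is dropped, cond_resched() runs, and the walk resumes at the same address. A standalone model of that pattern, with sched_yield() standing in for cond_resched() and the burst ended unconditionally rather than only when a reschedule is actually pending:

```c
#include <sched.h>
#include <stdio.h>

#define NENTRIES 200

static int dirty[NENTRIES];

/* Walk the table in bounded bursts, yielding between them, the way the
 * rewritten msync_pte_range() drops the pte lock and cond_resched()s. */
static void sync_range(void)
{
	int i = 0, progress = 0, flushed = 0;

again:
	/* "take the lock" for this burst */
	for (; i < NENTRIES; i++) {
		if (progress >= 64) {
			progress = 0;
			break;		/* end the burst, drop the lock */
		}
		progress++;
		if (!dirty[i])
			continue;
		dirty[i] = 0;		/* like clearing and flushing pte dirty */
		flushed++;
		progress += 3;		/* dirty entries count as heavier work */
	}
	/* "lock dropped" here */
	sched_yield();			/* stand-in for cond_resched() */
	if (i != NENTRIES)
		goto again;		/* resume where the burst stopped */

	printf("flushed %d dirty entries\n", flushed);
}

int main(void)
{
	for (int i = 0; i < NENTRIES; i += 3)
		dirty[i] = 1;
	sync_range();
	return 0;
}
```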
diff --git a/mm/nommu.c b/mm/nommu.c index 0ef241ae3763..d1e076a487cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
| 931 | realalloc -= kobjsize(vml); | 931 | realalloc -= kobjsize(vml); |
| 932 | askedalloc -= sizeof(*vml); | 932 | askedalloc -= sizeof(*vml); |
| 933 | kfree(vml); | 933 | kfree(vml); |
| 934 | |||
| 935 | update_hiwater_vm(mm); | ||
| 934 | mm->total_vm -= len >> PAGE_SHIFT; | 936 | mm->total_vm -= len >> PAGE_SHIFT; |
| 935 | 937 | ||
| 936 | #ifdef DEBUG | 938 | #ifdef DEBUG |
| @@ -1047,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
| 1047 | 1049 | ||
| 1048 | EXPORT_SYMBOL(find_vma); | 1050 | EXPORT_SYMBOL(find_vma); |
| 1049 | 1051 | ||
| 1050 | struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) | 1052 | struct page *follow_page(struct mm_struct *mm, unsigned long address, |
| 1053 | unsigned int foll_flags) | ||
| 1051 | { | 1054 | { |
| 1052 | return NULL; | 1055 | return NULL; |
| 1053 | } | 1056 | } |
| @@ -1078,19 +1081,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | |||
| 1078 | { | 1081 | { |
| 1079 | } | 1082 | } |
| 1080 | 1083 | ||
| 1081 | void update_mem_hiwater(struct task_struct *tsk) | ||
| 1082 | { | ||
| 1083 | unsigned long rss; | ||
| 1084 | |||
| 1085 | if (likely(tsk->mm)) { | ||
| 1086 | rss = get_mm_counter(tsk->mm, rss); | ||
| 1087 | if (tsk->mm->hiwater_rss < rss) | ||
| 1088 | tsk->mm->hiwater_rss = rss; | ||
| 1089 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
| 1090 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
| 1091 | } | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | void unmap_mapping_range(struct address_space *mapping, | 1084 | void unmap_mapping_range(struct address_space *mapping, |
| 1095 | loff_t const holebegin, loff_t const holelen, | 1085 | loff_t const holebegin, loff_t const holelen, |
| 1096 | int even_cows) | 1086 | int even_cows) |
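With update_mem_hiwater() gone from nommu.c, peak tracking is done inline by helpers that bump the high-water mark just before a counter drops (update_hiwater_vm() here, update_hiwater_rss() in the rmap.c hunk below). A small sketch of that idiom over a stand-in stats struct, not the real mm_struct:

```c
#include <stdio.h>

struct mm_stats {
	unsigned long rss, hiwater_rss;
	unsigned long total_vm, hiwater_vm;
};

/* Record the peak just before the counter is lowered, in the spirit of
 * the update_hiwater_rss()/update_hiwater_vm() calls in this merge. */
static void update_hiwater_rss(struct mm_stats *mm)
{
	if (mm->hiwater_rss < mm->rss)
		mm->hiwater_rss = mm->rss;
}

static void update_hiwater_vm(struct mm_stats *mm)
{
	if (mm->hiwater_vm < mm->total_vm)
		mm->hiwater_vm = mm->total_vm;
}

int main(void)
{
	struct mm_stats mm = { .rss = 300, .total_vm = 1024 };

	/* About to unmap: snapshot the peaks, then drop the counters. */
	update_hiwater_rss(&mm);
	update_hiwater_vm(&mm);
	mm.rss -= 100;
	mm.total_vm -= 256;

	printf("rss=%lu (peak %lu), vm=%lu (peak %lu)\n",
	       mm.rss, mm.hiwater_rss, mm.total_vm, mm.hiwater_vm);
	return 0;
}
```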
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94c864eac9c4..2dbdd98426fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/sysctl.h> | 33 | #include <linux/sysctl.h> |
| 34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
| 35 | #include <linux/cpuset.h> | 35 | #include <linux/cpuset.h> |
| 36 | #include <linux/memory_hotplug.h> | ||
| 36 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> |
| 37 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
| 38 | 39 | ||
| @@ -78,21 +79,44 @@ int min_free_kbytes = 1024; | |||
| 78 | unsigned long __initdata nr_kernel_pages; | 79 | unsigned long __initdata nr_kernel_pages; |
| 79 | unsigned long __initdata nr_all_pages; | 80 | unsigned long __initdata nr_all_pages; |
| 80 | 81 | ||
| 82 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | ||
| 83 | { | ||
| 84 | int ret = 0; | ||
| 85 | unsigned seq; | ||
| 86 | unsigned long pfn = page_to_pfn(page); | ||
| 87 | |||
| 88 | do { | ||
| 89 | seq = zone_span_seqbegin(zone); | ||
| 90 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
| 91 | ret = 1; | ||
| 92 | else if (pfn < zone->zone_start_pfn) | ||
| 93 | ret = 1; | ||
| 94 | } while (zone_span_seqretry(zone, seq)); | ||
| 95 | |||
| 96 | return ret; | ||
| 97 | } | ||
| 98 | |||
| 99 | static int page_is_consistent(struct zone *zone, struct page *page) | ||
| 100 | { | ||
| 101 | #ifdef CONFIG_HOLES_IN_ZONE | ||
| 102 | if (!pfn_valid(page_to_pfn(page))) | ||
| 103 | return 0; | ||
| 104 | #endif | ||
| 105 | if (zone != page_zone(page)) | ||
| 106 | return 0; | ||
| 107 | |||
| 108 | return 1; | ||
| 109 | } | ||
| 81 | /* | 110 | /* |
| 82 | * Temporary debugging check for pages not lying within a given zone. | 111 | * Temporary debugging check for pages not lying within a given zone. |
| 83 | */ | 112 | */ |
| 84 | static int bad_range(struct zone *zone, struct page *page) | 113 | static int bad_range(struct zone *zone, struct page *page) |
| 85 | { | 114 | { |
| 86 | if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) | 115 | if (page_outside_zone_boundaries(zone, page)) |
| 87 | return 1; | 116 | return 1; |
| 88 | if (page_to_pfn(page) < zone->zone_start_pfn) | 117 | if (!page_is_consistent(zone, page)) |
| 89 | return 1; | ||
| 90 | #ifdef CONFIG_HOLES_IN_ZONE | ||
| 91 | if (!pfn_valid(page_to_pfn(page))) | ||
| 92 | return 1; | ||
| 93 | #endif | ||
| 94 | if (zone != page_zone(page)) | ||
| 95 | return 1; | 118 | return 1; |
| 119 | |||
| 96 | return 0; | 120 | return 0; |
| 97 | } | 121 | } |
| 98 | 122 | ||
| @@ -114,7 +138,8 @@ static void bad_page(const char *function, struct page *page) | |||
| 114 | 1 << PG_reclaim | | 138 | 1 << PG_reclaim | |
| 115 | 1 << PG_slab | | 139 | 1 << PG_slab | |
| 116 | 1 << PG_swapcache | | 140 | 1 << PG_swapcache | |
| 117 | 1 << PG_writeback); | 141 | 1 << PG_writeback | |
| 142 | 1 << PG_reserved ); | ||
| 118 | set_page_count(page, 0); | 143 | set_page_count(page, 0); |
| 119 | reset_page_mapcount(page); | 144 | reset_page_mapcount(page); |
| 120 | page->mapping = NULL; | 145 | page->mapping = NULL; |
| @@ -153,7 +178,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
| 153 | struct page *p = page + i; | 178 | struct page *p = page + i; |
| 154 | 179 | ||
| 155 | SetPageCompound(p); | 180 | SetPageCompound(p); |
| 156 | p->private = (unsigned long)page; | 181 | set_page_private(p, (unsigned long)page); |
| 157 | } | 182 | } |
| 158 | } | 183 | } |
| 159 | 184 | ||
| @@ -173,7 +198,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 173 | 198 | ||
| 174 | if (!PageCompound(p)) | 199 | if (!PageCompound(p)) |
| 175 | bad_page(__FUNCTION__, page); | 200 | bad_page(__FUNCTION__, page); |
| 176 | if (p->private != (unsigned long)page) | 201 | if (page_private(p) != (unsigned long)page) |
| 177 | bad_page(__FUNCTION__, page); | 202 | bad_page(__FUNCTION__, page); |
| 178 | ClearPageCompound(p); | 203 | ClearPageCompound(p); |
| 179 | } | 204 | } |
| @@ -186,18 +211,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 186 | * So, we don't need atomic page->flags operations here. | 211 | * So, we don't need atomic page->flags operations here. |
| 187 | */ | 212 | */ |
| 188 | static inline unsigned long page_order(struct page *page) { | 213 | static inline unsigned long page_order(struct page *page) { |
| 189 | return page->private; | 214 | return page_private(page); |
| 190 | } | 215 | } |
| 191 | 216 | ||
| 192 | static inline void set_page_order(struct page *page, int order) { | 217 | static inline void set_page_order(struct page *page, int order) { |
| 193 | page->private = order; | 218 | set_page_private(page, order); |
| 194 | __SetPagePrivate(page); | 219 | __SetPagePrivate(page); |
| 195 | } | 220 | } |
| 196 | 221 | ||
| 197 | static inline void rmv_page_order(struct page *page) | 222 | static inline void rmv_page_order(struct page *page) |
| 198 | { | 223 | { |
| 199 | __ClearPagePrivate(page); | 224 | __ClearPagePrivate(page); |
| 200 | page->private = 0; | 225 | set_page_private(page, 0); |
| 201 | } | 226 | } |
| 202 | 227 | ||
| 203 | /* | 228 | /* |
| @@ -237,14 +262,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
| 237 | * (a) the buddy is free && | 262 | * (a) the buddy is free && |
| 238 | * (b) the buddy is on the buddy system && | 263 | * (b) the buddy is on the buddy system && |
| 239 | * (c) a page and its buddy have the same order. | 264 | * (c) a page and its buddy have the same order. |
| 240 | * for recording page's order, we use page->private and PG_private. | 265 | * for recording page's order, we use page_private(page) and PG_private. |
| 241 | * | 266 | * |
| 242 | */ | 267 | */ |
| 243 | static inline int page_is_buddy(struct page *page, int order) | 268 | static inline int page_is_buddy(struct page *page, int order) |
| 244 | { | 269 | { |
| 245 | if (PagePrivate(page) && | 270 | if (PagePrivate(page) && |
| 246 | (page_order(page) == order) && | 271 | (page_order(page) == order) && |
| 247 | !PageReserved(page) && | ||
| 248 | page_count(page) == 0) | 272 | page_count(page) == 0) |
| 249 | return 1; | 273 | return 1; |
| 250 | return 0; | 274 | return 0; |
| @@ -264,7 +288,7 @@ static inline int page_is_buddy(struct page *page, int order) | |||
| 264 | * parts of the VM system. | 288 | * parts of the VM system. |
| 265 | * At each level, we keep a list of pages, which are heads of continuous | 289 | * At each level, we keep a list of pages, which are heads of continuous |
| 266 | * free pages of length of (1 << order) and marked with PG_Private.Page's | 290 | * free pages of length of (1 << order) and marked with PG_Private.Page's |
| 267 | * order is recorded in page->private field. | 291 | * order is recorded in page_private(page) field. |
| 268 | * So when we are allocating or freeing one, we can derive the state of the | 292 | * So when we are allocating or freeing one, we can derive the state of the |
| 269 | * other. That is, if we allocate a small block, and both were | 293 | * other. That is, if we allocate a small block, and both were |
| 270 | * free, the remainder of the region must be split into blocks. | 294 | * free, the remainder of the region must be split into blocks. |
| @@ -327,7 +351,8 @@ static inline void free_pages_check(const char *function, struct page *page) | |||
| 327 | 1 << PG_reclaim | | 351 | 1 << PG_reclaim | |
| 328 | 1 << PG_slab | | 352 | 1 << PG_slab | |
| 329 | 1 << PG_swapcache | | 353 | 1 << PG_swapcache | |
| 330 | 1 << PG_writeback ))) | 354 | 1 << PG_writeback | |
| 355 | 1 << PG_reserved ))) | ||
| 331 | bad_page(function, page); | 356 | bad_page(function, page); |
| 332 | if (PageDirty(page)) | 357 | if (PageDirty(page)) |
| 333 | __ClearPageDirty(page); | 358 | __ClearPageDirty(page); |
| @@ -455,13 +480,14 @@ static void prep_new_page(struct page *page, int order) | |||
| 455 | 1 << PG_reclaim | | 480 | 1 << PG_reclaim | |
| 456 | 1 << PG_slab | | 481 | 1 << PG_slab | |
| 457 | 1 << PG_swapcache | | 482 | 1 << PG_swapcache | |
| 458 | 1 << PG_writeback ))) | 483 | 1 << PG_writeback | |
| 484 | 1 << PG_reserved ))) | ||
| 459 | bad_page(__FUNCTION__, page); | 485 | bad_page(__FUNCTION__, page); |
| 460 | 486 | ||
| 461 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | | 487 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | |
| 462 | 1 << PG_referenced | 1 << PG_arch_1 | | 488 | 1 << PG_referenced | 1 << PG_arch_1 | |
| 463 | 1 << PG_checked | 1 << PG_mappedtodisk); | 489 | 1 << PG_checked | 1 << PG_mappedtodisk); |
| 464 | page->private = 0; | 490 | set_page_private(page, 0); |
| 465 | set_page_refs(page, order); | 491 | set_page_refs(page, order); |
| 466 | kernel_map_pages(page, 1 << order, 1); | 492 | kernel_map_pages(page, 1 << order, 1); |
| 467 | } | 493 | } |
| @@ -1016,7 +1042,7 @@ void __pagevec_free(struct pagevec *pvec) | |||
| 1016 | 1042 | ||
| 1017 | fastcall void __free_pages(struct page *page, unsigned int order) | 1043 | fastcall void __free_pages(struct page *page, unsigned int order) |
| 1018 | { | 1044 | { |
| 1019 | if (!PageReserved(page) && put_page_testzero(page)) { | 1045 | if (put_page_testzero(page)) { |
| 1020 | if (order == 0) | 1046 | if (order == 0) |
| 1021 | free_hot_page(page); | 1047 | free_hot_page(page); |
| 1022 | else | 1048 | else |
| @@ -1305,12 +1331,9 @@ void show_free_areas(void) | |||
| 1305 | } else | 1331 | } else |
| 1306 | printk("\n"); | 1332 | printk("\n"); |
| 1307 | 1333 | ||
| 1308 | for (cpu = 0; cpu < NR_CPUS; ++cpu) { | 1334 | for_each_cpu(cpu) { |
| 1309 | struct per_cpu_pageset *pageset; | 1335 | struct per_cpu_pageset *pageset; |
| 1310 | 1336 | ||
| 1311 | if (!cpu_possible(cpu)) | ||
| 1312 | continue; | ||
| 1313 | |||
| 1314 | pageset = zone_pcp(zone, cpu); | 1337 | pageset = zone_pcp(zone, cpu); |
| 1315 | 1338 | ||
| 1316 | for (temperature = 0; temperature < 2; temperature++) | 1339 | for (temperature = 0; temperature < 2; temperature++) |
| @@ -1660,7 +1683,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, | |||
| 1660 | * up by free_all_bootmem() once the early boot process is | 1683 | * up by free_all_bootmem() once the early boot process is |
| 1661 | * done. Non-atomic initialization, single-pass. | 1684 | * done. Non-atomic initialization, single-pass. |
| 1662 | */ | 1685 | */ |
| 1663 | void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1686 | void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
| 1664 | unsigned long start_pfn) | 1687 | unsigned long start_pfn) |
| 1665 | { | 1688 | { |
| 1666 | struct page *page; | 1689 | struct page *page; |
| @@ -1674,7 +1697,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 1674 | continue; | 1697 | continue; |
| 1675 | page = pfn_to_page(pfn); | 1698 | page = pfn_to_page(pfn); |
| 1676 | set_page_links(page, zone, nid, pfn); | 1699 | set_page_links(page, zone, nid, pfn); |
| 1677 | set_page_count(page, 0); | 1700 | set_page_count(page, 1); |
| 1678 | reset_page_mapcount(page); | 1701 | reset_page_mapcount(page); |
| 1679 | SetPageReserved(page); | 1702 | SetPageReserved(page); |
| 1680 | INIT_LIST_HEAD(&page->lru); | 1703 | INIT_LIST_HEAD(&page->lru); |
| @@ -1721,29 +1744,29 @@ static int __devinit zone_batchsize(struct zone *zone) | |||
| 1721 | 1744 | ||
| 1722 | /* | 1745 | /* |
| 1723 | * The per-cpu-pages pools are set to around 1000th of the | 1746 | * The per-cpu-pages pools are set to around 1000th of the |
| 1724 | * size of the zone. But no more than 1/4 of a meg - there's | 1747 | * size of the zone. But no more than 1/2 of a meg. |
| 1725 | * no point in going beyond the size of L2 cache. | ||
| 1726 | * | 1748 | * |
| 1727 | * OK, so we don't know how big the cache is. So guess. | 1749 | * OK, so we don't know how big the cache is. So guess. |
| 1728 | */ | 1750 | */ |
| 1729 | batch = zone->present_pages / 1024; | 1751 | batch = zone->present_pages / 1024; |
| 1730 | if (batch * PAGE_SIZE > 256 * 1024) | 1752 | if (batch * PAGE_SIZE > 512 * 1024) |
| 1731 | batch = (256 * 1024) / PAGE_SIZE; | 1753 | batch = (512 * 1024) / PAGE_SIZE; |
| 1732 | batch /= 4; /* We effectively *= 4 below */ | 1754 | batch /= 4; /* We effectively *= 4 below */ |
| 1733 | if (batch < 1) | 1755 | if (batch < 1) |
| 1734 | batch = 1; | 1756 | batch = 1; |
| 1735 | 1757 | ||
| 1736 | /* | 1758 | /* |
| 1737 | * Clamp the batch to a 2^n - 1 value. Having a power | 1759 | * We will be trying to allocate bigger chunks of contiguous |
| 1738 | * of 2 value was found to be more likely to have | 1760 | * memory of the order of fls(batch). This should result in |
| 1739 | * suboptimal cache aliasing properties in some cases. | 1761 | * better cache coloring. |
| 1740 | * | 1762 | * |
| 1741 | * For example if 2 tasks are alternately allocating | 1763 | * A sanity check also to ensure that batch is still in limits. |
| 1742 | * batches of pages, one task can end up with a lot | ||
| 1743 | * of pages of one half of the possible page colors | ||
| 1744 | * and the other with pages of the other colors. | ||
| 1745 | */ | 1764 | */ |
| 1746 | batch = (1 << fls(batch + batch/2)) - 1; | 1765 | batch = (1 << fls(batch + batch/2)); |
| 1766 | |||
| 1767 | if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) | ||
| 1768 | batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); | ||
| 1769 | |||
| 1747 | return batch; | 1770 | return batch; |
| 1748 | } | 1771 | } |
| 1749 | 1772 | ||
| @@ -1755,7 +1778,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
| 1755 | 1778 | ||
| 1756 | pcp = &p->pcp[0]; /* hot */ | 1779 | pcp = &p->pcp[0]; /* hot */ |
| 1757 | pcp->count = 0; | 1780 | pcp->count = 0; |
| 1758 | pcp->low = 2 * batch; | 1781 | pcp->low = 0; |
| 1759 | pcp->high = 6 * batch; | 1782 | pcp->high = 6 * batch; |
| 1760 | pcp->batch = max(1UL, 1 * batch); | 1783 | pcp->batch = max(1UL, 1 * batch); |
| 1761 | INIT_LIST_HEAD(&pcp->list); | 1784 | INIT_LIST_HEAD(&pcp->list); |
| @@ -1764,7 +1787,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
| 1764 | pcp->count = 0; | 1787 | pcp->count = 0; |
| 1765 | pcp->low = 0; | 1788 | pcp->low = 0; |
| 1766 | pcp->high = 2 * batch; | 1789 | pcp->high = 2 * batch; |
| 1767 | pcp->batch = max(1UL, 1 * batch); | 1790 | pcp->batch = max(1UL, batch/2); |
| 1768 | INIT_LIST_HEAD(&pcp->list); | 1791 | INIT_LIST_HEAD(&pcp->list); |
| 1769 | } | 1792 | } |
| 1770 | 1793 | ||
| @@ -1873,6 +1896,60 @@ void __init setup_per_cpu_pageset() | |||
| 1873 | 1896 | ||
| 1874 | #endif | 1897 | #endif |
| 1875 | 1898 | ||
| 1899 | static __devinit | ||
| 1900 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | ||
| 1901 | { | ||
| 1902 | int i; | ||
| 1903 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 1904 | |||
| 1905 | /* | ||
| 1906 | * The per-page waitqueue mechanism uses hashed waitqueues | ||
| 1907 | * per zone. | ||
| 1908 | */ | ||
| 1909 | zone->wait_table_size = wait_table_size(zone_size_pages); | ||
| 1910 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); | ||
| 1911 | zone->wait_table = (wait_queue_head_t *) | ||
| 1912 | alloc_bootmem_node(pgdat, zone->wait_table_size | ||
| 1913 | * sizeof(wait_queue_head_t)); | ||
| 1914 | |||
| 1915 | for(i = 0; i < zone->wait_table_size; ++i) | ||
| 1916 | init_waitqueue_head(zone->wait_table + i); | ||
| 1917 | } | ||
| 1918 | |||
| 1919 | static __devinit void zone_pcp_init(struct zone *zone) | ||
| 1920 | { | ||
| 1921 | int cpu; | ||
| 1922 | unsigned long batch = zone_batchsize(zone); | ||
| 1923 | |||
| 1924 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 1925 | #ifdef CONFIG_NUMA | ||
| 1926 | /* Early boot. Slab allocator not functional yet */ | ||
| 1927 | zone->pageset[cpu] = &boot_pageset[cpu]; | ||
| 1928 | setup_pageset(&boot_pageset[cpu],0); | ||
| 1929 | #else | ||
| 1930 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
| 1931 | #endif | ||
| 1932 | } | ||
| 1933 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | ||
| 1934 | zone->name, zone->present_pages, batch); | ||
| 1935 | } | ||
| 1936 | |||
| 1937 | static __devinit void init_currently_empty_zone(struct zone *zone, | ||
| 1938 | unsigned long zone_start_pfn, unsigned long size) | ||
| 1939 | { | ||
| 1940 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 1941 | |||
| 1942 | zone_wait_table_init(zone, size); | ||
| 1943 | pgdat->nr_zones = zone_idx(zone) + 1; | ||
| 1944 | |||
| 1945 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | ||
| 1946 | zone->zone_start_pfn = zone_start_pfn; | ||
| 1947 | |||
| 1948 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | ||
| 1949 | |||
| 1950 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | ||
| 1951 | } | ||
| 1952 | |||
| 1876 | /* | 1953 | /* |
| 1877 | * Set up the zone data structures: | 1954 | * Set up the zone data structures: |
| 1878 | * - mark all pages reserved | 1955 | * - mark all pages reserved |
| @@ -1882,10 +1959,11 @@ void __init setup_per_cpu_pageset() | |||
| 1882 | static void __init free_area_init_core(struct pglist_data *pgdat, | 1959 | static void __init free_area_init_core(struct pglist_data *pgdat, |
| 1883 | unsigned long *zones_size, unsigned long *zholes_size) | 1960 | unsigned long *zones_size, unsigned long *zholes_size) |
| 1884 | { | 1961 | { |
| 1885 | unsigned long i, j; | 1962 | unsigned long j; |
| 1886 | int cpu, nid = pgdat->node_id; | 1963 | int nid = pgdat->node_id; |
| 1887 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 1964 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
| 1888 | 1965 | ||
| 1966 | pgdat_resize_init(pgdat); | ||
| 1889 | pgdat->nr_zones = 0; | 1967 | pgdat->nr_zones = 0; |
| 1890 | init_waitqueue_head(&pgdat->kswapd_wait); | 1968 | init_waitqueue_head(&pgdat->kswapd_wait); |
| 1891 | pgdat->kswapd_max_order = 0; | 1969 | pgdat->kswapd_max_order = 0; |
| @@ -1893,7 +1971,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
| 1893 | for (j = 0; j < MAX_NR_ZONES; j++) { | 1971 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 1894 | struct zone *zone = pgdat->node_zones + j; | 1972 | struct zone *zone = pgdat->node_zones + j; |
| 1895 | unsigned long size, realsize; | 1973 | unsigned long size, realsize; |
| 1896 | unsigned long batch; | ||
| 1897 | 1974 | ||
| 1898 | realsize = size = zones_size[j]; | 1975 | realsize = size = zones_size[j]; |
| 1899 | if (zholes_size) | 1976 | if (zholes_size) |
| @@ -1908,24 +1985,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
| 1908 | zone->name = zone_names[j]; | 1985 | zone->name = zone_names[j]; |
| 1909 | spin_lock_init(&zone->lock); | 1986 | spin_lock_init(&zone->lock); |
| 1910 | spin_lock_init(&zone->lru_lock); | 1987 | spin_lock_init(&zone->lru_lock); |
| 1988 | zone_seqlock_init(zone); | ||
| 1911 | zone->zone_pgdat = pgdat; | 1989 | zone->zone_pgdat = pgdat; |
| 1912 | zone->free_pages = 0; | 1990 | zone->free_pages = 0; |
| 1913 | 1991 | ||
| 1914 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 1992 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; |
| 1915 | 1993 | ||
| 1916 | batch = zone_batchsize(zone); | 1994 | zone_pcp_init(zone); |
| 1917 | |||
| 1918 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 1919 | #ifdef CONFIG_NUMA | ||
| 1920 | /* Early boot. Slab allocator not functional yet */ | ||
| 1921 | zone->pageset[cpu] = &boot_pageset[cpu]; | ||
| 1922 | setup_pageset(&boot_pageset[cpu],0); | ||
| 1923 | #else | ||
| 1924 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
| 1925 | #endif | ||
| 1926 | } | ||
| 1927 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | ||
| 1928 | zone_names[j], realsize, batch); | ||
| 1929 | INIT_LIST_HEAD(&zone->active_list); | 1995 | INIT_LIST_HEAD(&zone->active_list); |
| 1930 | INIT_LIST_HEAD(&zone->inactive_list); | 1996 | INIT_LIST_HEAD(&zone->inactive_list); |
| 1931 | zone->nr_scan_active = 0; | 1997 | zone->nr_scan_active = 0; |
| @@ -1936,32 +2002,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
| 1936 | if (!size) | 2002 | if (!size) |
| 1937 | continue; | 2003 | continue; |
| 1938 | 2004 | ||
| 1939 | /* | ||
| 1940 | * The per-page waitqueue mechanism uses hashed waitqueues | ||
| 1941 | * per zone. | ||
| 1942 | */ | ||
| 1943 | zone->wait_table_size = wait_table_size(size); | ||
| 1944 | zone->wait_table_bits = | ||
| 1945 | wait_table_bits(zone->wait_table_size); | ||
| 1946 | zone->wait_table = (wait_queue_head_t *) | ||
| 1947 | alloc_bootmem_node(pgdat, zone->wait_table_size | ||
| 1948 | * sizeof(wait_queue_head_t)); | ||
| 1949 | |||
| 1950 | for(i = 0; i < zone->wait_table_size; ++i) | ||
| 1951 | init_waitqueue_head(zone->wait_table + i); | ||
| 1952 | |||
| 1953 | pgdat->nr_zones = j+1; | ||
| 1954 | |||
| 1955 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | ||
| 1956 | zone->zone_start_pfn = zone_start_pfn; | ||
| 1957 | |||
| 1958 | memmap_init(size, nid, j, zone_start_pfn); | ||
| 1959 | |||
| 1960 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 2005 | zonetable_add(zone, nid, j, zone_start_pfn, size); |
| 1961 | 2006 | init_currently_empty_zone(zone, zone_start_pfn, size); | |
| 1962 | zone_start_pfn += size; | 2007 | zone_start_pfn += size; |
| 1963 | |||
| 1964 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | ||
| 1965 | } | 2008 | } |
| 1966 | } | 2009 | } |
| 1967 | 2010 | ||
| @@ -2361,7 +2404,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
| 2361 | * that the pages_{min,low,high} values for each zone are set correctly | 2404 | * that the pages_{min,low,high} values for each zone are set correctly |
| 2362 | * with respect to min_free_kbytes. | 2405 | * with respect to min_free_kbytes. |
| 2363 | */ | 2406 | */ |
| 2364 | static void setup_per_zone_pages_min(void) | 2407 | void setup_per_zone_pages_min(void) |
| 2365 | { | 2408 | { |
| 2366 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 2409 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
| 2367 | unsigned long lowmem_pages = 0; | 2410 | unsigned long lowmem_pages = 0; |
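The reworked zone_batchsize() caps the per-cpu batch at 512KB worth of pages, divides by four, and rounds up to a power of two via fls() for better cache coloring, with a final sanity clamp. The arithmetic is reproduced below as a standalone program; `my_fls`, PAGE_SHIFT=12 and MAX_ORDER=11 are assumptions for illustration only:

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define MAX_ORDER  11

/* Portable stand-in for the kernel's fls(): 1-based index of the
 * highest set bit, 0 for an all-zero argument. */
static int my_fls(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Mirrors the zone_batchsize() arithmetic in the hunk above. */
static unsigned long zone_batchsize(unsigned long present_pages)
{
	unsigned long batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;	/* the pcp high-water marks scale it back up */
	if (batch < 1)
		batch = 1;

	/* Round up to a power of two for better cache coloring. */
	batch = (1 << my_fls(batch + batch / 2));

	/* Sanity clamp copied from the diff. */
	if (my_fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
		batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT) / 2);

	return batch;
}

int main(void)
{
	unsigned long sizes[] = { 4096, 65536, 262144 };	/* pages per zone */

	for (int i = 0; i < 3; i++)
		printf("%lu pages -> batch %lu\n",
		       sizes[i], zone_batchsize(sizes[i]));
	return 0;
}
```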
diff --git a/mm/page_io.c b/mm/page_io.c index 330e00d6db00..bb2b0d53889c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
| 91 | unlock_page(page); | 91 | unlock_page(page); |
| 92 | goto out; | 92 | goto out; |
| 93 | } | 93 | } |
| 94 | bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); | 94 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, |
| 95 | end_swap_bio_write); | ||
| 95 | if (bio == NULL) { | 96 | if (bio == NULL) { |
| 96 | set_page_dirty(page); | 97 | set_page_dirty(page); |
| 97 | unlock_page(page); | 98 | unlock_page(page); |
| @@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page) | |||
| 115 | 116 | ||
| 116 | BUG_ON(!PageLocked(page)); | 117 | BUG_ON(!PageLocked(page)); |
| 117 | ClearPageUptodate(page); | 118 | ClearPageUptodate(page); |
| 118 | bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); | 119 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, |
| 120 | end_swap_bio_read); | ||
| 119 | if (bio == NULL) { | 121 | if (bio == NULL) { |
| 120 | unlock_page(page); | 122 | unlock_page(page); |
| 121 | ret = -ENOMEM; | 123 | ret = -ENOMEM; |
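Both the page_alloc.c and page_io.c hunks stop touching page->private directly and go through page_private()/set_page_private(), since the field is about to be overloaded (buddy order, swap entry, or a pte spinlock with split ptlocks). A toy version of those accessors over an invented struct:

```c
#include <stdio.h>

/* Toy stand-in for struct page: the private field is multiplexed between
 * buddy order, swap entry and (with split ptlocks) a lock, so every
 * access goes through one pair of accessors. */
struct toy_page {
	unsigned long flags;
	unsigned long private;
};

#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))

int main(void)
{
	struct toy_page page = { 0, 0 };

	set_page_private(&page, 0x1234);	/* e.g. a swap entry */
	printf("swap entry from page_private(): %#lx\n", page_private(&page));

	set_page_private(&page, 3);		/* e.g. a buddy order */
	printf("buddy order: %lu\n", page_private(&page));
	return 0;
}
```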
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -32,7 +32,7 @@ | |||
| 32 | * page->flags PG_locked (lock_page) | 32 | * page->flags PG_locked (lock_page) |
| 33 | * mapping->i_mmap_lock | 33 | * mapping->i_mmap_lock |
| 34 | * anon_vma->lock | 34 | * anon_vma->lock |
| 35 | * mm->page_table_lock | 35 | * mm->page_table_lock or pte_lock |
| 36 | * zone->lru_lock (in mark_page_accessed) | 36 | * zone->lru_lock (in mark_page_accessed) |
| 37 | * swap_lock (in swap_duplicate, swap_info_get) | 37 | * swap_lock (in swap_duplicate, swap_info_get) |
| 38 | * mmlist_lock (in mmput, drain_mmlist and others) | 38 | * mmlist_lock (in mmput, drain_mmlist and others) |
| @@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 244 | /* | 244 | /* |
| 245 | * Check that @page is mapped at @address into @mm. | 245 | * Check that @page is mapped at @address into @mm. |
| 246 | * | 246 | * |
| 247 | * On success returns with mapped pte and locked mm->page_table_lock. | 247 | * On success returns with pte mapped and locked. |
| 248 | */ | 248 | */ |
| 249 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 249 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
| 250 | unsigned long address) | 250 | unsigned long address, spinlock_t **ptlp) |
| 251 | { | 251 | { |
| 252 | pgd_t *pgd; | 252 | pgd_t *pgd; |
| 253 | pud_t *pud; | 253 | pud_t *pud; |
| 254 | pmd_t *pmd; | 254 | pmd_t *pmd; |
| 255 | pte_t *pte; | 255 | pte_t *pte; |
| 256 | spinlock_t *ptl; | ||
| 256 | 257 | ||
| 257 | /* | ||
| 258 | * We need the page_table_lock to protect us from page faults, | ||
| 259 | * munmap, fork, etc... | ||
| 260 | */ | ||
| 261 | spin_lock(&mm->page_table_lock); | ||
| 262 | pgd = pgd_offset(mm, address); | 258 | pgd = pgd_offset(mm, address); |
| 263 | if (likely(pgd_present(*pgd))) { | 259 | if (!pgd_present(*pgd)) |
| 264 | pud = pud_offset(pgd, address); | 260 | return NULL; |
| 265 | if (likely(pud_present(*pud))) { | 261 | |
| 266 | pmd = pmd_offset(pud, address); | 262 | pud = pud_offset(pgd, address); |
| 267 | if (likely(pmd_present(*pmd))) { | 263 | if (!pud_present(*pud)) |
| 268 | pte = pte_offset_map(pmd, address); | 264 | return NULL; |
| 269 | if (likely(pte_present(*pte) && | 265 | |
| 270 | page_to_pfn(page) == pte_pfn(*pte))) | 266 | pmd = pmd_offset(pud, address); |
| 271 | return pte; | 267 | if (!pmd_present(*pmd)) |
| 272 | pte_unmap(pte); | 268 | return NULL; |
| 273 | } | 269 | |
| 274 | } | 270 | pte = pte_offset_map(pmd, address); |
| 271 | /* Make a quick check before getting the lock */ | ||
| 272 | if (!pte_present(*pte)) { | ||
| 273 | pte_unmap(pte); | ||
| 274 | return NULL; | ||
| 275 | } | ||
| 276 | |||
| 277 | ptl = pte_lockptr(mm, pmd); | ||
| 278 | spin_lock(ptl); | ||
| 279 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { | ||
| 280 | *ptlp = ptl; | ||
| 281 | return pte; | ||
| 275 | } | 282 | } |
| 276 | spin_unlock(&mm->page_table_lock); | 283 | pte_unmap_unlock(pte, ptl); |
| 277 | return ERR_PTR(-ENOENT); | 284 | return NULL; |
| 278 | } | 285 | } |
| 279 | 286 | ||
| 280 | /* | 287 | /* |
| @@ -287,24 +294,28 @@ static int page_referenced_one(struct page *page, | |||
| 287 | struct mm_struct *mm = vma->vm_mm; | 294 | struct mm_struct *mm = vma->vm_mm; |
| 288 | unsigned long address; | 295 | unsigned long address; |
| 289 | pte_t *pte; | 296 | pte_t *pte; |
| 297 | spinlock_t *ptl; | ||
| 290 | int referenced = 0; | 298 | int referenced = 0; |
| 291 | 299 | ||
| 292 | address = vma_address(page, vma); | 300 | address = vma_address(page, vma); |
| 293 | if (address == -EFAULT) | 301 | if (address == -EFAULT) |
| 294 | goto out; | 302 | goto out; |
| 295 | 303 | ||
| 296 | pte = page_check_address(page, mm, address); | 304 | pte = page_check_address(page, mm, address, &ptl); |
| 297 | if (!IS_ERR(pte)) { | 305 | if (!pte) |
| 298 | if (ptep_clear_flush_young(vma, address, pte)) | 306 | goto out; |
| 299 | referenced++; | ||
| 300 | 307 | ||
| 301 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) | 308 | if (ptep_clear_flush_young(vma, address, pte)) |
| 302 | referenced++; | 309 | referenced++; |
| 303 | 310 | ||
| 304 | (*mapcount)--; | 311 | /* Pretend the page is referenced if the task has the |
| 305 | pte_unmap(pte); | 312 | swap token and is in the middle of a page fault. */ |
| 306 | spin_unlock(&mm->page_table_lock); | 313 | if (mm != current->mm && !ignore_token && has_swap_token(mm) && |
| 307 | } | 314 | rwsem_is_locked(&mm->mmap_sem)) |
| 315 | referenced++; | ||
| 316 | |||
| 317 | (*mapcount)--; | ||
| 318 | pte_unmap_unlock(pte, ptl); | ||
| 308 | out: | 319 | out: |
| 309 | return referenced; | 320 | return referenced; |
| 310 | } | 321 | } |
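page_check_address() now returns a mapped pte with its lock already taken (handed back through a spinlock_t **) or plain NULL, so callers such as page_referenced_one() test the pointer and finish with pte_unmap_unlock() instead of IS_ERR(). A userspace analogue of that "lookup returns entry plus held lock" contract, with made-up names and pthread mutexes:

```c
#include <pthread.h>
#include <stdio.h>

#define NBUCKETS 4

static pthread_mutex_t bucket_lock[NBUCKETS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static int bucket[NBUCKETS] = { 0, 42, 0, 7 };

/* Return a pointer to the entry with its bucket lock held, or NULL if the
 * entry is empty -- the same contract the reworked page_check_address()
 * gives its callers via the spinlock_t **ptlp argument. */
static int *check_entry(int idx, pthread_mutex_t **lockp)
{
	pthread_mutex_t *l = &bucket_lock[idx % NBUCKETS];

	if (!bucket[idx % NBUCKETS])	/* quick check before taking the lock */
		return NULL;

	pthread_mutex_lock(l);
	if (!bucket[idx % NBUCKETS]) {	/* recheck under the lock */
		pthread_mutex_unlock(l);
		return NULL;
	}
	*lockp = l;
	return &bucket[idx % NBUCKETS];
}

int main(void)
{
	pthread_mutex_t *lock;
	int *entry = check_entry(3, &lock);

	if (entry) {			/* callers test the pointer, not IS_ERR() */
		printf("found %d\n", *entry);
		pthread_mutex_unlock(lock);	/* like pte_unmap_unlock() */
	}
	return 0;
}
```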
| @@ -434,15 +445,11 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) | |||
| 434 | * @vma: the vm area in which the mapping is added | 445 | * @vma: the vm area in which the mapping is added |
| 435 | * @address: the user virtual address mapped | 446 | * @address: the user virtual address mapped |
| 436 | * | 447 | * |
| 437 | * The caller needs to hold the mm->page_table_lock. | 448 | * The caller needs to hold the pte lock. |
| 438 | */ | 449 | */ |
| 439 | void page_add_anon_rmap(struct page *page, | 450 | void page_add_anon_rmap(struct page *page, |
| 440 | struct vm_area_struct *vma, unsigned long address) | 451 | struct vm_area_struct *vma, unsigned long address) |
| 441 | { | 452 | { |
| 442 | BUG_ON(PageReserved(page)); | ||
| 443 | |||
| 444 | inc_mm_counter(vma->vm_mm, anon_rss); | ||
| 445 | |||
| 446 | if (atomic_inc_and_test(&page->_mapcount)) { | 453 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 447 | struct anon_vma *anon_vma = vma->anon_vma; | 454 | struct anon_vma *anon_vma = vma->anon_vma; |
| 448 | 455 | ||
| @@ -461,13 +468,12 @@ void page_add_anon_rmap(struct page *page, | |||
| 461 | * page_add_file_rmap - add pte mapping to a file page | 468 | * page_add_file_rmap - add pte mapping to a file page |
| 462 | * @page: the page to add the mapping to | 469 | * @page: the page to add the mapping to |
| 463 | * | 470 | * |
| 464 | * The caller needs to hold the mm->page_table_lock. | 471 | * The caller needs to hold the pte lock. |
| 465 | */ | 472 | */ |
| 466 | void page_add_file_rmap(struct page *page) | 473 | void page_add_file_rmap(struct page *page) |
| 467 | { | 474 | { |
| 468 | BUG_ON(PageAnon(page)); | 475 | BUG_ON(PageAnon(page)); |
| 469 | if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) | 476 | BUG_ON(!pfn_valid(page_to_pfn(page))); |
| 470 | return; | ||
| 471 | 477 | ||
| 472 | if (atomic_inc_and_test(&page->_mapcount)) | 478 | if (atomic_inc_and_test(&page->_mapcount)) |
| 473 | inc_page_state(nr_mapped); | 479 | inc_page_state(nr_mapped); |
| @@ -477,12 +483,10 @@ void page_add_file_rmap(struct page *page) | |||
| 477 | * page_remove_rmap - take down pte mapping from a page | 483 | * page_remove_rmap - take down pte mapping from a page |
| 478 | * @page: page to remove mapping from | 484 | * @page: page to remove mapping from |
| 479 | * | 485 | * |
| 480 | * Caller needs to hold the mm->page_table_lock. | 486 | * The caller needs to hold the pte lock. |
| 481 | */ | 487 | */ |
| 482 | void page_remove_rmap(struct page *page) | 488 | void page_remove_rmap(struct page *page) |
| 483 | { | 489 | { |
| 484 | BUG_ON(PageReserved(page)); | ||
| 485 | |||
| 486 | if (atomic_add_negative(-1, &page->_mapcount)) { | 490 | if (atomic_add_negative(-1, &page->_mapcount)) { |
| 487 | BUG_ON(page_mapcount(page) < 0); | 491 | BUG_ON(page_mapcount(page) < 0); |
| 488 | /* | 492 | /* |
| @@ -510,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
| 510 | unsigned long address; | 514 | unsigned long address; |
| 511 | pte_t *pte; | 515 | pte_t *pte; |
| 512 | pte_t pteval; | 516 | pte_t pteval; |
| 517 | spinlock_t *ptl; | ||
| 513 | int ret = SWAP_AGAIN; | 518 | int ret = SWAP_AGAIN; |
| 514 | 519 | ||
| 515 | address = vma_address(page, vma); | 520 | address = vma_address(page, vma); |
| 516 | if (address == -EFAULT) | 521 | if (address == -EFAULT) |
| 517 | goto out; | 522 | goto out; |
| 518 | 523 | ||
| 519 | pte = page_check_address(page, mm, address); | 524 | pte = page_check_address(page, mm, address, &ptl); |
| 520 | if (IS_ERR(pte)) | 525 | if (!pte) |
| 521 | goto out; | 526 | goto out; |
| 522 | 527 | ||
| 523 | /* | 528 | /* |
| @@ -541,8 +546,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
| 541 | if (pte_dirty(pteval)) | 546 | if (pte_dirty(pteval)) |
| 542 | set_page_dirty(page); | 547 | set_page_dirty(page); |
| 543 | 548 | ||
| 549 | /* Update high watermark before we lower rss */ | ||
| 550 | update_hiwater_rss(mm); | ||
| 551 | |||
| 544 | if (PageAnon(page)) { | 552 | if (PageAnon(page)) { |
| 545 | swp_entry_t entry = { .val = page->private }; | 553 | swp_entry_t entry = { .val = page_private(page) }; |
| 546 | /* | 554 | /* |
| 547 | * Store the swap location in the pte. | 555 | * Store the swap location in the pte. |
| 548 | * See handle_pte_fault() ... | 556 | * See handle_pte_fault() ... |
| @@ -551,21 +559,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
| 551 | swap_duplicate(entry); | 559 | swap_duplicate(entry); |
| 552 | if (list_empty(&mm->mmlist)) { | 560 | if (list_empty(&mm->mmlist)) { |
| 553 | spin_lock(&mmlist_lock); | 561 | spin_lock(&mmlist_lock); |
| 554 | list_add(&mm->mmlist, &init_mm.mmlist); | 562 | if (list_empty(&mm->mmlist)) |
| 563 | list_add(&mm->mmlist, &init_mm.mmlist); | ||
| 555 | spin_unlock(&mmlist_lock); | 564 | spin_unlock(&mmlist_lock); |
| 556 | } | 565 | } |
| 557 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 566 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
| 558 | BUG_ON(pte_file(*pte)); | 567 | BUG_ON(pte_file(*pte)); |
| 559 | dec_mm_counter(mm, anon_rss); | 568 | dec_mm_counter(mm, anon_rss); |
| 560 | } | 569 | } else |
| 570 | dec_mm_counter(mm, file_rss); | ||
| 561 | 571 | ||
| 562 | dec_mm_counter(mm, rss); | ||
| 563 | page_remove_rmap(page); | 572 | page_remove_rmap(page); |
| 564 | page_cache_release(page); | 573 | page_cache_release(page); |
| 565 | 574 | ||
| 566 | out_unmap: | 575 | out_unmap: |
| 567 | pte_unmap(pte); | 576 | pte_unmap_unlock(pte, ptl); |
| 568 | spin_unlock(&mm->page_table_lock); | ||
| 569 | out: | 577 | out: |
| 570 | return ret; | 578 | return ret; |
| 571 | } | 579 | } |
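The hunk above re-tests list_empty() once mmlist_lock is held: with pte locks no longer nested under a single mm->page_table_lock, two unmap paths can race to add the same mm to init_mm.mmlist. The same check-lock-recheck shape in isolation, as a sketch; lazy_add_mm() is a hypothetical name, while mmlist_lock and init_mm are the symbols used in the hunk:

```c
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>

/*
 * Sketch of the double-checked insertion above: the unlocked
 * list_empty() test is only an optimisation and must be repeated
 * under mmlist_lock before the mm is actually linked in.
 */
static void lazy_add_mm(struct mm_struct *mm)
{
	if (list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		if (list_empty(&mm->mmlist))	/* recheck under the lock */
			list_add(&mm->mmlist, &init_mm.mmlist);
		spin_unlock(&mmlist_lock);
	}
}
```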
| @@ -599,19 +607,14 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 599 | pgd_t *pgd; | 607 | pgd_t *pgd; |
| 600 | pud_t *pud; | 608 | pud_t *pud; |
| 601 | pmd_t *pmd; | 609 | pmd_t *pmd; |
| 602 | pte_t *pte, *original_pte; | 610 | pte_t *pte; |
| 603 | pte_t pteval; | 611 | pte_t pteval; |
| 612 | spinlock_t *ptl; | ||
| 604 | struct page *page; | 613 | struct page *page; |
| 605 | unsigned long address; | 614 | unsigned long address; |
| 606 | unsigned long end; | 615 | unsigned long end; |
| 607 | unsigned long pfn; | 616 | unsigned long pfn; |
| 608 | 617 | ||
| 609 | /* | ||
| 610 | * We need the page_table_lock to protect us from page faults, | ||
| 611 | * munmap, fork, etc... | ||
| 612 | */ | ||
| 613 | spin_lock(&mm->page_table_lock); | ||
| 614 | |||
| 615 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 618 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
| 616 | end = address + CLUSTER_SIZE; | 619 | end = address + CLUSTER_SIZE; |
| 617 | if (address < vma->vm_start) | 620 | if (address < vma->vm_start) |
| @@ -621,30 +624,33 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 621 | 624 | ||
| 622 | pgd = pgd_offset(mm, address); | 625 | pgd = pgd_offset(mm, address); |
| 623 | if (!pgd_present(*pgd)) | 626 | if (!pgd_present(*pgd)) |
| 624 | goto out_unlock; | 627 | return; |
| 625 | 628 | ||
| 626 | pud = pud_offset(pgd, address); | 629 | pud = pud_offset(pgd, address); |
| 627 | if (!pud_present(*pud)) | 630 | if (!pud_present(*pud)) |
| 628 | goto out_unlock; | 631 | return; |
| 629 | 632 | ||
| 630 | pmd = pmd_offset(pud, address); | 633 | pmd = pmd_offset(pud, address); |
| 631 | if (!pmd_present(*pmd)) | 634 | if (!pmd_present(*pmd)) |
| 632 | goto out_unlock; | 635 | return; |
| 636 | |||
| 637 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 633 | 638 | ||
| 634 | for (original_pte = pte = pte_offset_map(pmd, address); | 639 | /* Update high watermark before we lower rss */ |
| 635 | address < end; pte++, address += PAGE_SIZE) { | 640 | update_hiwater_rss(mm); |
| 636 | 641 | ||
| 642 | for (; address < end; pte++, address += PAGE_SIZE) { | ||
| 637 | if (!pte_present(*pte)) | 643 | if (!pte_present(*pte)) |
| 638 | continue; | 644 | continue; |
| 639 | 645 | ||
| 640 | pfn = pte_pfn(*pte); | 646 | pfn = pte_pfn(*pte); |
| 641 | if (!pfn_valid(pfn)) | 647 | if (unlikely(!pfn_valid(pfn))) { |
| 648 | print_bad_pte(vma, *pte, address); | ||
| 642 | continue; | 649 | continue; |
| 650 | } | ||
| 643 | 651 | ||
| 644 | page = pfn_to_page(pfn); | 652 | page = pfn_to_page(pfn); |
| 645 | BUG_ON(PageAnon(page)); | 653 | BUG_ON(PageAnon(page)); |
| 646 | if (PageReserved(page)) | ||
| 647 | continue; | ||
| 648 | 654 | ||
| 649 | if (ptep_clear_flush_young(vma, address, pte)) | 655 | if (ptep_clear_flush_young(vma, address, pte)) |
| 650 | continue; | 656 | continue; |
| @@ -663,13 +669,10 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 663 | 669 | ||
| 664 | page_remove_rmap(page); | 670 | page_remove_rmap(page); |
| 665 | page_cache_release(page); | 671 | page_cache_release(page); |
| 666 | dec_mm_counter(mm, rss); | 672 | dec_mm_counter(mm, file_rss); |
| 667 | (*mapcount)--; | 673 | (*mapcount)--; |
| 668 | } | 674 | } |
| 669 | 675 | pte_unmap_unlock(pte - 1, ptl); | |
| 670 | pte_unmap(original_pte); | ||
| 671 | out_unlock: | ||
| 672 | spin_unlock(&mm->page_table_lock); | ||
| 673 | } | 676 | } |
| 674 | 677 | ||
| 675 | static int try_to_unmap_anon(struct page *page) | 678 | static int try_to_unmap_anon(struct page *page) |
| @@ -806,7 +809,6 @@ int try_to_unmap(struct page *page) | |||
| 806 | { | 809 | { |
| 807 | int ret; | 810 | int ret; |
| 808 | 811 | ||
| 809 | BUG_ON(PageReserved(page)); | ||
| 810 | BUG_ON(!PageLocked(page)); | 812 | BUG_ON(!PageLocked(page)); |
| 811 | 813 | ||
| 812 | if (PageAnon(page)) | 814 | if (PageAnon(page)) |
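Across mm/rmap.c the old sequence of spin_lock(&mm->page_table_lock) plus pte_offset_map() is replaced by pte_offset_map_lock(), which maps the pte page and takes whatever lock pte_lockptr() designates for that pmd. A generic range-walk sketch in that style, using only the interfaces this merge introduces; the function itself is illustrative and assumes the range stays within one pte page, as try_to_unmap_cluster() above does:

```c
#include <linux/mm.h>

/*
 * Illustrative walker: count present ptes in [addr, end) under @pmd,
 * using the split-ptlock API the hunks above convert rmap to.
 * pte_offset_map_lock() maps the pte page and locks it;
 * pte_unmap_unlock() undoes both.
 */
static int count_present_ptes(struct mm_struct *mm, pmd_t *pmd,
			      unsigned long addr, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *pte;
	int present = 0;

	/* assumes addr < end and [addr, end) lies within this pmd */
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	for (; addr < end; pte++, addr += PAGE_SIZE)
		if (pte_present(*pte))
			present++;
	pte_unmap_unlock(pte - 1, ptl);
	return present;
}
```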
diff --git a/mm/shmem.c b/mm/shmem.c index 55e04a0734c1..dc25565a61e9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -71,9 +71,6 @@ | |||
| 71 | /* Pretend that each entry is of this size in directory's i_size */ | 71 | /* Pretend that each entry is of this size in directory's i_size */ |
| 72 | #define BOGO_DIRENT_SIZE 20 | 72 | #define BOGO_DIRENT_SIZE 20 |
| 73 | 73 | ||
| 74 | /* Keep swapped page count in private field of indirect struct page */ | ||
| 75 | #define nr_swapped private | ||
| 76 | |||
| 77 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 74 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ |
| 78 | enum sgp_type { | 75 | enum sgp_type { |
| 79 | SGP_QUICK, /* don't try more than file page cache lookup */ | 76 | SGP_QUICK, /* don't try more than file page cache lookup */ |
| @@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns | |||
| 324 | 321 | ||
| 325 | entry->val = value; | 322 | entry->val = value; |
| 326 | info->swapped += incdec; | 323 | info->swapped += incdec; |
| 327 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) | 324 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { |
| 328 | kmap_atomic_to_page(entry)->nr_swapped += incdec; | 325 | struct page *page = kmap_atomic_to_page(entry); |
| 326 | set_page_private(page, page_private(page) + incdec); | ||
| 327 | } | ||
| 329 | } | 328 | } |
| 330 | 329 | ||
| 331 | /* | 330 | /* |
| @@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
| 368 | 367 | ||
| 369 | spin_unlock(&info->lock); | 368 | spin_unlock(&info->lock); |
| 370 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); | 369 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); |
| 371 | if (page) { | 370 | if (page) |
| 372 | page->nr_swapped = 0; | 371 | set_page_private(page, 0); |
| 373 | } | ||
| 374 | spin_lock(&info->lock); | 372 | spin_lock(&info->lock); |
| 375 | 373 | ||
| 376 | if (!page) { | 374 | if (!page) { |
| @@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode) | |||
| 561 | diroff = 0; | 559 | diroff = 0; |
| 562 | } | 560 | } |
| 563 | subdir = dir[diroff]; | 561 | subdir = dir[diroff]; |
| 564 | if (subdir && subdir->nr_swapped) { | 562 | if (subdir && page_private(subdir)) { |
| 565 | size = limit - idx; | 563 | size = limit - idx; |
| 566 | if (size > ENTRIES_PER_PAGE) | 564 | if (size > ENTRIES_PER_PAGE) |
| 567 | size = ENTRIES_PER_PAGE; | 565 | size = ENTRIES_PER_PAGE; |
| @@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode) | |||
| 572 | nr_swaps_freed += freed; | 570 | nr_swaps_freed += freed; |
| 573 | if (offset) | 571 | if (offset) |
| 574 | spin_lock(&info->lock); | 572 | spin_lock(&info->lock); |
| 575 | subdir->nr_swapped -= freed; | 573 | set_page_private(subdir, page_private(subdir) - freed); |
| 576 | if (offset) | 574 | if (offset) |
| 577 | spin_unlock(&info->lock); | 575 | spin_unlock(&info->lock); |
| 578 | BUG_ON(subdir->nr_swapped > offset); | 576 | BUG_ON(page_private(subdir) > offset); |
| 579 | } | 577 | } |
| 580 | if (offset) | 578 | if (offset) |
| 581 | offset = 0; | 579 | offset = 0; |
| @@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | |||
| 743 | dir = shmem_dir_map(subdir); | 741 | dir = shmem_dir_map(subdir); |
| 744 | } | 742 | } |
| 745 | subdir = *dir; | 743 | subdir = *dir; |
| 746 | if (subdir && subdir->nr_swapped) { | 744 | if (subdir && page_private(subdir)) { |
| 747 | ptr = shmem_swp_map(subdir); | 745 | ptr = shmem_swp_map(subdir); |
| 748 | size = limit - idx; | 746 | size = limit - idx; |
| 749 | if (size > ENTRIES_PER_PAGE) | 747 | if (size > ENTRIES_PER_PAGE) |
| @@ -1201,7 +1199,7 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
| 1201 | page_cache_release(page); | 1199 | page_cache_release(page); |
| 1202 | return err; | 1200 | return err; |
| 1203 | } | 1201 | } |
| 1204 | } else { | 1202 | } else if (vma->vm_flags & VM_NONLINEAR) { |
| 1205 | /* No page was found just because we can't read it in | 1203 | /* No page was found just because we can't read it in |
| 1206 | * now (being here implies nonblock != 0), but the page | 1204 | * now (being here implies nonblock != 0), but the page |
| 1207 | * may exist, so set the PTE to fault it in later. */ | 1205 | * may exist, so set the PTE to fault it in later. */ |
| @@ -1506,8 +1504,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
| 1506 | */ | 1504 | */ |
| 1507 | if (!offset) | 1505 | if (!offset) |
| 1508 | mark_page_accessed(page); | 1506 | mark_page_accessed(page); |
| 1509 | } else | 1507 | } else { |
| 1510 | page = ZERO_PAGE(0); | 1508 | page = ZERO_PAGE(0); |
| 1509 | page_cache_get(page); | ||
| 1510 | } | ||
| 1511 | 1511 | ||
| 1512 | /* | 1512 | /* |
| 1513 | * Ok, we have the page, and it's up-to-date, so | 1513 | * Ok, we have the page, and it's up-to-date, so |
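The shmem hunks drop the nr_swapped alias for page->private and go through page_private()/set_page_private() instead. Those accessors are defined in include/linux/mm.h and are not part of this diff; presumably they are thin wrappers over the private field, as noted in the comment below. A hypothetical helper for the adjustment shmem_swp_set() now performs:

```c
#include <linux/mm.h>

/*
 * Sketch only -- the real accessors are assumed to be simple wrappers,
 * roughly:
 *	#define page_private(page)		((page)->private)
 *	#define set_page_private(page, v)	((page)->private = (v))
 *
 * shmem keeps a count of swapped entries in the private field of each
 * indirect directory page; this hypothetical helper adjusts it the way
 * shmem_swp_set() above does.
 */
static void shmem_adjust_swapped(struct page *dir_page, long incdec)
{
	set_page_private(dir_page, page_private(dir_page) + incdec);
}
```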
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -2419,6 +2419,7 @@ retry: | |||
| 2419 | next = slab_bufctl(slabp)[slabp->free]; | 2419 | next = slab_bufctl(slabp)[slabp->free]; |
| 2420 | #if DEBUG | 2420 | #if DEBUG |
| 2421 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; | 2421 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; |
| 2422 | WARN_ON(numa_node_id() != slabp->nodeid); | ||
| 2422 | #endif | 2423 | #endif |
| 2423 | slabp->free = next; | 2424 | slabp->free = next; |
| 2424 | } | 2425 | } |
| @@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n | |||
| 2633 | check_spinlock_acquired_node(cachep, node); | 2634 | check_spinlock_acquired_node(cachep, node); |
| 2634 | check_slabp(cachep, slabp); | 2635 | check_slabp(cachep, slabp); |
| 2635 | 2636 | ||
| 2636 | |||
| 2637 | #if DEBUG | 2637 | #if DEBUG |
| 2638 | /* Verify that the slab belongs to the intended node */ | ||
| 2639 | WARN_ON(slabp->nodeid != node); | ||
| 2640 | |||
| 2638 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2641 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { |
| 2639 | printk(KERN_ERR "slab: double free detected in cache " | 2642 | printk(KERN_ERR "slab: double free detected in cache " |
| 2640 | "'%s', objp %p\n", cachep->name, objp); | 2643 | "'%s', objp %p\n", cachep->name, objp); |
diff --git a/mm/sparse.c b/mm/sparse.c index 347249a4917a..72079b538e2d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -5,8 +5,10 @@ | |||
| 5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
| 6 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
| 7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
| 8 | #include <linux/highmem.h> | ||
| 8 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 9 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
| 11 | #include <linux/vmalloc.h> | ||
| 10 | #include <asm/dma.h> | 12 | #include <asm/dma.h> |
| 11 | 13 | ||
| 12 | /* | 14 | /* |
| @@ -72,6 +74,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
| 72 | } | 74 | } |
| 73 | #endif | 75 | #endif |
| 74 | 76 | ||
| 77 | /* | ||
| 78 | * Although written for the SPARSEMEM_EXTREME case, this happens | ||
| 79 | * to also work for the flat array case because | ||
| 80 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | ||
| 81 | */ | ||
| 82 | int __section_nr(struct mem_section* ms) | ||
| 83 | { | ||
| 84 | unsigned long root_nr; | ||
| 85 | struct mem_section* root; | ||
| 86 | |||
| 87 | for (root_nr = 0; | ||
| 88 | root_nr < NR_MEM_SECTIONS; | ||
| 89 | root_nr += SECTIONS_PER_ROOT) { | ||
| 90 | root = __nr_to_section(root_nr); | ||
| 91 | |||
| 92 | if (!root) | ||
| 93 | continue; | ||
| 94 | |||
| 95 | if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) | ||
| 96 | break; | ||
| 97 | } | ||
| 98 | |||
| 99 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | ||
| 100 | } | ||
| 101 | |||
| 75 | /* Record a memory area against a node. */ | 102 | /* Record a memory area against a node. */ |
| 76 | void memory_present(int nid, unsigned long start, unsigned long end) | 103 | void memory_present(int nid, unsigned long start, unsigned long end) |
| 77 | { | 104 | { |
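__section_nr() added above is the inverse of __nr_to_section(): it scans the section roots to recover a section's index from a mem_section pointer. A hedged usage sketch; report_section_range() is hypothetical, and it assumes the pre-existing SPARSEMEM helpers section_nr_to_pfn() and PAGES_PER_SECTION, which are not shown in this diff:

```c
#include <linux/kernel.h>
#include <linux/mmzone.h>

/*
 * Hypothetical debug helper: print the pfn range covered by @ms.
 * Assumes __section_nr() above is visible (it is non-static) and that
 * section_nr_to_pfn()/PAGES_PER_SECTION exist as usual for SPARSEMEM.
 */
static void report_section_range(struct mem_section *ms)
{
	unsigned long section_nr = __section_nr(ms);
	unsigned long start_pfn = section_nr_to_pfn(section_nr);

	printk(KERN_DEBUG "section %lu: pfn %lu-%lu\n",
	       section_nr, start_pfn, start_pfn + PAGES_PER_SECTION - 1);
}
```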
| @@ -162,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | |||
| 162 | return NULL; | 189 | return NULL; |
| 163 | } | 190 | } |
| 164 | 191 | ||
| 192 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | ||
| 193 | { | ||
| 194 | struct page *page, *ret; | ||
| 195 | unsigned long memmap_size = sizeof(struct page) * nr_pages; | ||
| 196 | |||
| 197 | page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); | ||
| 198 | if (page) | ||
| 199 | goto got_map_page; | ||
| 200 | |||
| 201 | ret = vmalloc(memmap_size); | ||
| 202 | if (ret) | ||
| 203 | goto got_map_ptr; | ||
| 204 | |||
| 205 | return NULL; | ||
| 206 | got_map_page: | ||
| 207 | ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); | ||
| 208 | got_map_ptr: | ||
| 209 | memset(ret, 0, memmap_size); | ||
| 210 | |||
| 211 | return ret; | ||
| 212 | } | ||
| 213 | |||
| 214 | static int vaddr_in_vmalloc_area(void *addr) | ||
| 215 | { | ||
| 216 | if (addr >= (void *)VMALLOC_START && | ||
| 217 | addr < (void *)VMALLOC_END) | ||
| 218 | return 1; | ||
| 219 | return 0; | ||
| 220 | } | ||
| 221 | |||
| 222 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | ||
| 223 | { | ||
| 224 | if (vaddr_in_vmalloc_area(memmap)) | ||
| 225 | vfree(memmap); | ||
| 226 | else | ||
| 227 | free_pages((unsigned long)memmap, | ||
| 228 | get_order(sizeof(struct page) * nr_pages)); | ||
| 229 | } | ||
| 230 | |||
| 165 | /* | 231 | /* |
| 166 | * Allocate the accumulated non-linear sections, allocate a mem_map | 232 | * Allocate the accumulated non-linear sections, allocate a mem_map |
| 167 | * for each and record the physical to section mapping. | 233 | * for each and record the physical to section mapping. |
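__kmalloc_section_memmap() above tries the page allocator first and falls back to vmalloc for the per-section memmap, and __kfree_section_memmap() frees by checking which allocator the address came from. The same try-pages-then-vmalloc idiom in isolation; the function names here are hypothetical and, unlike the original, no zeroing is done:

```c
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <asm/pgtable.h>

/* Prefer physically contiguous pages, fall back to vmalloc. */
static void *alloc_big_buffer(unsigned long size)
{
	struct page *page = alloc_pages(GFP_KERNEL, get_order(size));

	if (page)
		return page_address(page);
	return vmalloc(size);
}

/* Free by checking the address range, as vaddr_in_vmalloc_area() does. */
static void free_big_buffer(void *addr, unsigned long size)
{
	if (addr >= (void *)VMALLOC_START && addr < (void *)VMALLOC_END)
		vfree(addr);
	else
		free_pages((unsigned long)addr, get_order(size));
}
```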
| @@ -187,14 +253,37 @@ void sparse_init(void) | |||
| 187 | * set. If this is <=0, then that means that the passed-in | 253 | * set. If this is <=0, then that means that the passed-in |
| 188 | * map was not consumed and must be freed. | 254 | * map was not consumed and must be freed. |
| 189 | */ | 255 | */ |
| 190 | int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) | 256 | int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
| 257 | int nr_pages) | ||
| 191 | { | 258 | { |
| 192 | struct mem_section *ms = __pfn_to_section(start_pfn); | 259 | unsigned long section_nr = pfn_to_section_nr(start_pfn); |
| 260 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 261 | struct mem_section *ms; | ||
| 262 | struct page *memmap; | ||
| 263 | unsigned long flags; | ||
| 264 | int ret; | ||
| 193 | 265 | ||
| 194 | if (ms->section_mem_map & SECTION_MARKED_PRESENT) | 266 | /* |
| 195 | return -EEXIST; | 267 | * no locking for this, because it does its own |
| 268 | * plus, it does a kmalloc | ||
| 269 | */ | ||
| 270 | sparse_index_init(section_nr, pgdat->node_id); | ||
| 271 | memmap = __kmalloc_section_memmap(nr_pages); | ||
| 272 | |||
| 273 | pgdat_resize_lock(pgdat, &flags); | ||
| 196 | 274 | ||
| 275 | ms = __pfn_to_section(start_pfn); | ||
| 276 | if (ms->section_mem_map & SECTION_MARKED_PRESENT) { | ||
| 277 | ret = -EEXIST; | ||
| 278 | goto out; | ||
| 279 | } | ||
| 197 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 280 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
| 198 | 281 | ||
| 199 | return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); | 282 | ret = sparse_init_one_section(ms, section_nr, memmap); |
| 283 | |||
| 284 | if (ret <= 0) | ||
| 285 | __kfree_section_memmap(memmap, nr_pages); | ||
| 286 | out: | ||
| 287 | pgdat_resize_unlock(pgdat, &flags); | ||
| 288 | return ret; | ||
| 200 | } | 289 | } |
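sparse_add_one_section() now takes the zone, allocates the memmap itself, and serialises with pgdat_resize_lock(). A hedged sketch of a hot-add caller under the new signature; add_one_section() is hypothetical, it assumes the prototype is visible to the hotplug code, and the real caller in mm/memory_hotplug.c (also touched by this merge but not shown here) may handle the return codes differently:

```c
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/errno.h>

/*
 * Hypothetical hot-add step: register one sparse section covering
 * @nr_pages starting at @start_pfn in @zone.  Treating an already
 * present section (-EEXIST) as success is an assumption of this
 * sketch, not taken from the diff.
 */
static int add_one_section(struct zone *zone, unsigned long start_pfn,
			   int nr_pages)
{
	int ret = sparse_add_one_section(zone, start_pfn, nr_pages);

	if (ret < 0 && ret != -EEXIST)
		return ret;
	return 0;
}
```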
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -39,7 +39,7 @@ int page_cluster; | |||
| 39 | void put_page(struct page *page) | 39 | void put_page(struct page *page) |
| 40 | { | 40 | { |
| 41 | if (unlikely(PageCompound(page))) { | 41 | if (unlikely(PageCompound(page))) { |
| 42 | page = (struct page *)page->private; | 42 | page = (struct page *)page_private(page); |
| 43 | if (put_page_testzero(page)) { | 43 | if (put_page_testzero(page)) { |
| 44 | void (*dtor)(struct page *page); | 44 | void (*dtor)(struct page *page); |
| 45 | 45 | ||
| @@ -48,7 +48,7 @@ void put_page(struct page *page) | |||
| 48 | } | 48 | } |
| 49 | return; | 49 | return; |
| 50 | } | 50 | } |
| 51 | if (!PageReserved(page) && put_page_testzero(page)) | 51 | if (put_page_testzero(page)) |
| 52 | __page_cache_release(page); | 52 | __page_cache_release(page); |
| 53 | } | 53 | } |
| 54 | EXPORT_SYMBOL(put_page); | 54 | EXPORT_SYMBOL(put_page); |
| @@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 215 | struct page *page = pages[i]; | 215 | struct page *page = pages[i]; |
| 216 | struct zone *pagezone; | 216 | struct zone *pagezone; |
| 217 | 217 | ||
| 218 | if (PageReserved(page) || !put_page_testzero(page)) | 218 | if (!put_page_testzero(page)) |
| 219 | continue; | 219 | continue; |
| 220 | 220 | ||
| 221 | pagezone = page_zone(page); | 221 | pagezone = page_zone(page); |
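put_page() above now follows a compound page to its head through page_private() and no longer special-cases PageReserved. A hedged helper making that head lookup explicit; the name is hypothetical:

```c
#include <linux/mm.h>

/*
 * Hypothetical helper: every subpage of a compound (e.g. hugetlb)
 * page keeps a pointer to the head page in its private field, which
 * is what put_page() above reads back via page_private().
 */
static struct page *compound_head_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		return (struct page *)page_private(page);
	return page;
}
```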
diff --git a/mm/swap_state.c b/mm/swap_state.c index 132164f7d0a7..dfd9a46755b8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
| 83 | page_cache_get(page); | 83 | page_cache_get(page); |
| 84 | SetPageLocked(page); | 84 | SetPageLocked(page); |
| 85 | SetPageSwapCache(page); | 85 | SetPageSwapCache(page); |
| 86 | page->private = entry.val; | 86 | set_page_private(page, entry.val); |
| 87 | total_swapcache_pages++; | 87 | total_swapcache_pages++; |
| 88 | pagecache_acct(1); | 88 | pagecache_acct(1); |
| 89 | } | 89 | } |
| @@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page) | |||
| 126 | BUG_ON(PageWriteback(page)); | 126 | BUG_ON(PageWriteback(page)); |
| 127 | BUG_ON(PagePrivate(page)); | 127 | BUG_ON(PagePrivate(page)); |
| 128 | 128 | ||
| 129 | radix_tree_delete(&swapper_space.page_tree, page->private); | 129 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); |
| 130 | page->private = 0; | 130 | set_page_private(page, 0); |
| 131 | ClearPageSwapCache(page); | 131 | ClearPageSwapCache(page); |
| 132 | total_swapcache_pages--; | 132 | total_swapcache_pages--; |
| 133 | pagecache_acct(-1); | 133 | pagecache_acct(-1); |
| @@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page) | |||
| 197 | { | 197 | { |
| 198 | swp_entry_t entry; | 198 | swp_entry_t entry; |
| 199 | 199 | ||
| 200 | entry.val = page->private; | 200 | entry.val = page_private(page); |
| 201 | 201 | ||
| 202 | write_lock_irq(&swapper_space.tree_lock); | 202 | write_lock_irq(&swapper_space.tree_lock); |
| 203 | __delete_from_swap_cache(page); | 203 | __delete_from_swap_cache(page); |
| @@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page) | |||
| 259 | 259 | ||
| 260 | /* | 260 | /* |
| 261 | * Perform a free_page(), also freeing any swap cache associated with | 261 | * Perform a free_page(), also freeing any swap cache associated with |
| 262 | * this page if it is the last user of the page. Can not do a lock_page, | 262 | * this page if it is the last user of the page. |
| 263 | * as we are holding the page_table_lock spinlock. | ||
| 264 | */ | 263 | */ |
| 265 | void free_page_and_swap_cache(struct page *page) | 264 | void free_page_and_swap_cache(struct page *page) |
| 266 | { | 265 | { |
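Swap-cache code now reads a page's swap entry through page_private() everywhere (the mm/swapfile.c hunks below make the same conversion). A small hypothetical helper capturing that idiom:

```c
#include <linux/mm.h>
#include <linux/swap.h>

/*
 * Hypothetical helper: the swp_entry_t of a PageSwapCache page lives
 * in its private field, read back with page_private() after this merge.
 */
static swp_entry_t page_swap_entry(struct page *page)
{
	swp_entry_t entry;

	BUG_ON(!PageSwapCache(page));
	entry.val = page_private(page);
	return entry;
}
```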
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1dcaeda039f4..8970c0b74194 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
| 61 | swp_entry_t entry; | 61 | swp_entry_t entry; |
| 62 | 62 | ||
| 63 | down_read(&swap_unplug_sem); | 63 | down_read(&swap_unplug_sem); |
| 64 | entry.val = page->private; | 64 | entry.val = page_private(page); |
| 65 | if (PageSwapCache(page)) { | 65 | if (PageSwapCache(page)) { |
| 66 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 66 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; |
| 67 | struct backing_dev_info *bdi; | 67 | struct backing_dev_info *bdi; |
| @@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
| 69 | /* | 69 | /* |
| 70 | * If the page is removed from swapcache from under us (with a | 70 | * If the page is removed from swapcache from under us (with a |
| 71 | * racy try_to_unuse/swapoff) we need an additional reference | 71 | * racy try_to_unuse/swapoff) we need an additional reference |
| 72 | * count to avoid reading garbage from page->private above. If | 72 | * count to avoid reading garbage from page_private(page) above. |
| 73 | * the WARN_ON triggers during a swapoff it may be the race | 73 | * If the WARN_ON triggers during a swapoff it may be the race |
| 74 | * condition and it's harmless. However if it triggers without | 74 | * condition and it's harmless. However if it triggers without |
| 75 | * swapoff it signals a problem. | 75 | * swapoff it signals a problem. |
| 76 | */ | 76 | */ |
| @@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page) | |||
| 294 | struct swap_info_struct *p; | 294 | struct swap_info_struct *p; |
| 295 | swp_entry_t entry; | 295 | swp_entry_t entry; |
| 296 | 296 | ||
| 297 | entry.val = page->private; | 297 | entry.val = page_private(page); |
| 298 | p = swap_info_get(entry); | 298 | p = swap_info_get(entry); |
| 299 | if (p) { | 299 | if (p) { |
| 300 | /* Subtract the 1 for the swap cache itself */ | 300 | /* Subtract the 1 for the swap cache itself */ |
| @@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page) | |||
| 339 | if (page_count(page) != 2) /* 2: us + cache */ | 339 | if (page_count(page) != 2) /* 2: us + cache */ |
| 340 | return 0; | 340 | return 0; |
| 341 | 341 | ||
| 342 | entry.val = page->private; | 342 | entry.val = page_private(page); |
| 343 | p = swap_info_get(entry); | 343 | p = swap_info_get(entry); |
| 344 | if (!p) | 344 | if (!p) |
| 345 | return 0; | 345 | return 0; |
| @@ -398,17 +398,14 @@ void free_swap_and_cache(swp_entry_t entry) | |||
| 398 | } | 398 | } |
| 399 | 399 | ||
| 400 | /* | 400 | /* |
| 401 | * Always set the resulting pte to be nowrite (the same as COW pages | 401 | * No need to decide whether this PTE shares the swap entry with others, |
| 402 | * after one process has exited). We don't know just how many PTEs will | 402 | * just let do_wp_page work it out if a write is requested later - to |
| 403 | * share this swap entry, so be cautious and let do_wp_page work out | 403 | * force COW, vm_page_prot omits write permission from any private vma. |
| 404 | * what to do if a write is requested later. | ||
| 405 | * | ||
| 406 | * vma->vm_mm->page_table_lock is held. | ||
| 407 | */ | 404 | */ |
| 408 | static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, | 405 | static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, |
| 409 | unsigned long addr, swp_entry_t entry, struct page *page) | 406 | unsigned long addr, swp_entry_t entry, struct page *page) |
| 410 | { | 407 | { |
| 411 | inc_mm_counter(vma->vm_mm, rss); | 408 | inc_mm_counter(vma->vm_mm, anon_rss); |
| 412 | get_page(page); | 409 | get_page(page); |
| 413 | set_pte_at(vma->vm_mm, addr, pte, | 410 | set_pte_at(vma->vm_mm, addr, pte, |
| 414 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 411 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
| @@ -425,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 425 | unsigned long addr, unsigned long end, | 422 | unsigned long addr, unsigned long end, |
| 426 | swp_entry_t entry, struct page *page) | 423 | swp_entry_t entry, struct page *page) |
| 427 | { | 424 | { |
| 428 | pte_t *pte; | ||
| 429 | pte_t swp_pte = swp_entry_to_pte(entry); | 425 | pte_t swp_pte = swp_entry_to_pte(entry); |
| 426 | pte_t *pte; | ||
| 427 | spinlock_t *ptl; | ||
| 428 | int found = 0; | ||
| 430 | 429 | ||
| 431 | pte = pte_offset_map(pmd, addr); | 430 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
| 432 | do { | 431 | do { |
| 433 | /* | 432 | /* |
| 434 | * swapoff spends a _lot_ of time in this loop! | 433 | * swapoff spends a _lot_ of time in this loop! |
| 435 | * Test inline before going to call unuse_pte. | 434 | * Test inline before going to call unuse_pte. |
| 436 | */ | 435 | */ |
| 437 | if (unlikely(pte_same(*pte, swp_pte))) { | 436 | if (unlikely(pte_same(*pte, swp_pte))) { |
| 438 | unuse_pte(vma, pte, addr, entry, page); | 437 | unuse_pte(vma, pte++, addr, entry, page); |
| 439 | pte_unmap(pte); | 438 | found = 1; |
| 440 | return 1; | 439 | break; |
| 441 | } | 440 | } |
| 442 | } while (pte++, addr += PAGE_SIZE, addr != end); | 441 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 443 | pte_unmap(pte - 1); | 442 | pte_unmap_unlock(pte - 1, ptl); |
| 444 | return 0; | 443 | return found; |
| 445 | } | 444 | } |
| 446 | 445 | ||
| 447 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 446 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| @@ -523,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm, | |||
| 523 | down_read(&mm->mmap_sem); | 522 | down_read(&mm->mmap_sem); |
| 524 | lock_page(page); | 523 | lock_page(page); |
| 525 | } | 524 | } |
| 526 | spin_lock(&mm->page_table_lock); | ||
| 527 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 525 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 528 | if (vma->anon_vma && unuse_vma(vma, entry, page)) | 526 | if (vma->anon_vma && unuse_vma(vma, entry, page)) |
| 529 | break; | 527 | break; |
| 530 | } | 528 | } |
| 531 | spin_unlock(&mm->page_table_lock); | ||
| 532 | up_read(&mm->mmap_sem); | 529 | up_read(&mm->mmap_sem); |
| 533 | /* | 530 | /* |
| 534 | * Currently unuse_mm cannot fail, but leave error handling | 531 | * Currently unuse_mm cannot fail, but leave error handling |
| @@ -1045,7 +1042,7 @@ int page_queue_congested(struct page *page) | |||
| 1045 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ | 1042 | BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ |
| 1046 | 1043 | ||
| 1047 | if (PageSwapCache(page)) { | 1044 | if (PageSwapCache(page)) { |
| 1048 | swp_entry_t entry = { .val = page->private }; | 1045 | swp_entry_t entry = { .val = page_private(page) }; |
| 1049 | struct swap_info_struct *sis; | 1046 | struct swap_info_struct *sis; |
| 1050 | 1047 | ||
| 1051 | sis = get_swap_info_struct(swp_type(entry)); | 1048 | sis = get_swap_info_struct(swp_type(entry)); |
diff --git a/mm/thrash.c b/mm/thrash.c index 11461f7ad830..eff3c18c33a1 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
| @@ -19,7 +19,7 @@ static unsigned long swap_token_check; | |||
| 19 | struct mm_struct * swap_token_mm = &init_mm; | 19 | struct mm_struct * swap_token_mm = &init_mm; |
| 20 | 20 | ||
| 21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) | 21 | #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) |
| 22 | #define SWAP_TOKEN_TIMEOUT 0 | 22 | #define SWAP_TOKEN_TIMEOUT (300 * HZ) |
| 23 | /* | 23 | /* |
| 24 | * Currently disabled; Needs further code to work at HZ * 300. | 24 | * Currently disabled; Needs further code to work at HZ * 300. |
| 25 | */ | 25 | */ |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1150229b6366..54a90e83cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 5 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
| 6 | * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 | 6 | * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 |
| 7 | * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 | 7 | * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 |
| 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | ||
| 8 | */ | 9 | */ |
| 9 | 10 | ||
| 10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
| @@ -88,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | |||
| 88 | { | 89 | { |
| 89 | pte_t *pte; | 90 | pte_t *pte; |
| 90 | 91 | ||
| 91 | pte = pte_alloc_kernel(&init_mm, pmd, addr); | 92 | pte = pte_alloc_kernel(pmd, addr); |
| 92 | if (!pte) | 93 | if (!pte) |
| 93 | return -ENOMEM; | 94 | return -ENOMEM; |
| 94 | do { | 95 | do { |
| @@ -146,20 +147,18 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | |||
| 146 | 147 | ||
| 147 | BUG_ON(addr >= end); | 148 | BUG_ON(addr >= end); |
| 148 | pgd = pgd_offset_k(addr); | 149 | pgd = pgd_offset_k(addr); |
| 149 | spin_lock(&init_mm.page_table_lock); | ||
| 150 | do { | 150 | do { |
| 151 | next = pgd_addr_end(addr, end); | 151 | next = pgd_addr_end(addr, end); |
| 152 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 152 | err = vmap_pud_range(pgd, addr, next, prot, pages); |
| 153 | if (err) | 153 | if (err) |
| 154 | break; | 154 | break; |
| 155 | } while (pgd++, addr = next, addr != end); | 155 | } while (pgd++, addr = next, addr != end); |
| 156 | spin_unlock(&init_mm.page_table_lock); | ||
| 157 | flush_cache_vmap((unsigned long) area->addr, end); | 156 | flush_cache_vmap((unsigned long) area->addr, end); |
| 158 | return err; | 157 | return err; |
| 159 | } | 158 | } |
| 160 | 159 | ||
| 161 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 160 | struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, |
| 162 | unsigned long start, unsigned long end) | 161 | unsigned long start, unsigned long end, int node) |
| 163 | { | 162 | { |
| 164 | struct vm_struct **p, *tmp, *area; | 163 | struct vm_struct **p, *tmp, *area; |
| 165 | unsigned long align = 1; | 164 | unsigned long align = 1; |
| @@ -178,7 +177,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | |||
| 178 | addr = ALIGN(start, align); | 177 | addr = ALIGN(start, align); |
| 179 | size = PAGE_ALIGN(size); | 178 | size = PAGE_ALIGN(size); |
| 180 | 179 | ||
| 181 | area = kmalloc(sizeof(*area), GFP_KERNEL); | 180 | area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); |
| 182 | if (unlikely(!area)) | 181 | if (unlikely(!area)) |
| 183 | return NULL; | 182 | return NULL; |
| 184 | 183 | ||
| @@ -231,6 +230,12 @@ out: | |||
| 231 | return NULL; | 230 | return NULL; |
| 232 | } | 231 | } |
| 233 | 232 | ||
| 233 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | ||
| 234 | unsigned long start, unsigned long end) | ||
| 235 | { | ||
| 236 | return __get_vm_area_node(size, flags, start, end, -1); | ||
| 237 | } | ||
| 238 | |||
| 234 | /** | 239 | /** |
| 235 | * get_vm_area - reserve a contiguous kernel virtual area | 240 | * get_vm_area - reserve a contiguous kernel virtual area |
| 236 | * | 241 | * |
| @@ -246,6 +251,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | |||
| 246 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); | 251 | return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); |
| 247 | } | 252 | } |
| 248 | 253 | ||
| 254 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) | ||
| 255 | { | ||
| 256 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); | ||
| 257 | } | ||
| 258 | |||
| 249 | /* Caller must hold vmlist_lock */ | 259 | /* Caller must hold vmlist_lock */ |
| 250 | struct vm_struct *__remove_vm_area(void *addr) | 260 | struct vm_struct *__remove_vm_area(void *addr) |
| 251 | { | 261 | { |
| @@ -342,7 +352,6 @@ void vfree(void *addr) | |||
| 342 | BUG_ON(in_interrupt()); | 352 | BUG_ON(in_interrupt()); |
| 343 | __vunmap(addr, 1); | 353 | __vunmap(addr, 1); |
| 344 | } | 354 | } |
| 345 | |||
| 346 | EXPORT_SYMBOL(vfree); | 355 | EXPORT_SYMBOL(vfree); |
| 347 | 356 | ||
| 348 | /** | 357 | /** |
| @@ -360,7 +369,6 @@ void vunmap(void *addr) | |||
| 360 | BUG_ON(in_interrupt()); | 369 | BUG_ON(in_interrupt()); |
| 361 | __vunmap(addr, 0); | 370 | __vunmap(addr, 0); |
| 362 | } | 371 | } |
| 363 | |||
| 364 | EXPORT_SYMBOL(vunmap); | 372 | EXPORT_SYMBOL(vunmap); |
| 365 | 373 | ||
| 366 | /** | 374 | /** |
| @@ -392,10 +400,10 @@ void *vmap(struct page **pages, unsigned int count, | |||
| 392 | 400 | ||
| 393 | return area->addr; | 401 | return area->addr; |
| 394 | } | 402 | } |
| 395 | |||
| 396 | EXPORT_SYMBOL(vmap); | 403 | EXPORT_SYMBOL(vmap); |
| 397 | 404 | ||
| 398 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | 405 | void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
| 406 | pgprot_t prot, int node) | ||
| 399 | { | 407 | { |
| 400 | struct page **pages; | 408 | struct page **pages; |
| 401 | unsigned int nr_pages, array_size, i; | 409 | unsigned int nr_pages, array_size, i; |
| @@ -406,9 +414,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 406 | area->nr_pages = nr_pages; | 414 | area->nr_pages = nr_pages; |
| 407 | /* Please note that the recursion is strictly bounded. */ | 415 | /* Please note that the recursion is strictly bounded. */ |
| 408 | if (array_size > PAGE_SIZE) | 416 | if (array_size > PAGE_SIZE) |
| 409 | pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); | 417 | pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); |
| 410 | else | 418 | else |
| 411 | pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); | 419 | pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); |
| 412 | area->pages = pages; | 420 | area->pages = pages; |
| 413 | if (!area->pages) { | 421 | if (!area->pages) { |
| 414 | remove_vm_area(area->addr); | 422 | remove_vm_area(area->addr); |
| @@ -418,7 +426,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 418 | memset(area->pages, 0, array_size); | 426 | memset(area->pages, 0, array_size); |
| 419 | 427 | ||
| 420 | for (i = 0; i < area->nr_pages; i++) { | 428 | for (i = 0; i < area->nr_pages; i++) { |
| 421 | area->pages[i] = alloc_page(gfp_mask); | 429 | if (node < 0) |
| 430 | area->pages[i] = alloc_page(gfp_mask); | ||
| 431 | else | ||
| 432 | area->pages[i] = alloc_pages_node(node, gfp_mask, 0); | ||
| 422 | if (unlikely(!area->pages[i])) { | 433 | if (unlikely(!area->pages[i])) { |
| 423 | /* Successfully allocated i pages, free them in __vunmap() */ | 434 | /* Successfully allocated i pages, free them in __vunmap() */ |
| 424 | area->nr_pages = i; | 435 | area->nr_pages = i; |
| @@ -435,18 +446,25 @@ fail: | |||
| 435 | return NULL; | 446 | return NULL; |
| 436 | } | 447 | } |
| 437 | 448 | ||
| 449 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | ||
| 450 | { | ||
| 451 | return __vmalloc_area_node(area, gfp_mask, prot, -1); | ||
| 452 | } | ||
| 453 | |||
| 438 | /** | 454 | /** |
| 439 | * __vmalloc - allocate virtually contiguous memory | 455 | * __vmalloc_node - allocate virtually contiguous memory |
| 440 | * | 456 | * |
| 441 | * @size: allocation size | 457 | * @size: allocation size |
| 442 | * @gfp_mask: flags for the page level allocator | 458 | * @gfp_mask: flags for the page level allocator |
| 443 | * @prot: protection mask for the allocated pages | 459 | * @prot: protection mask for the allocated pages |
| 460 | * @node: node to use for allocation or -1 | ||
| 444 | * | 461 | * |
| 445 | * Allocate enough pages to cover @size from the page level | 462 | * Allocate enough pages to cover @size from the page level |
| 446 | * allocator with @gfp_mask flags. Map them into contiguous | 463 | * allocator with @gfp_mask flags. Map them into contiguous |
| 447 | * kernel virtual space, using a pagetable protection of @prot. | 464 | * kernel virtual space, using a pagetable protection of @prot. |
| 448 | */ | 465 | */ |
| 449 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 466 | void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, |
| 467 | int node) | ||
| 450 | { | 468 | { |
| 451 | struct vm_struct *area; | 469 | struct vm_struct *area; |
| 452 | 470 | ||
| @@ -454,13 +472,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
| 454 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 472 | if (!size || (size >> PAGE_SHIFT) > num_physpages) |
| 455 | return NULL; | 473 | return NULL; |
| 456 | 474 | ||
| 457 | area = get_vm_area(size, VM_ALLOC); | 475 | area = get_vm_area_node(size, VM_ALLOC, node); |
| 458 | if (!area) | 476 | if (!area) |
| 459 | return NULL; | 477 | return NULL; |
| 460 | 478 | ||
| 461 | return __vmalloc_area(area, gfp_mask, prot); | 479 | return __vmalloc_area_node(area, gfp_mask, prot, node); |
| 462 | } | 480 | } |
| 481 | EXPORT_SYMBOL(__vmalloc_node); | ||
| 463 | 482 | ||
| 483 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | ||
| 484 | { | ||
| 485 | return __vmalloc_node(size, gfp_mask, prot, -1); | ||
| 486 | } | ||
| 464 | EXPORT_SYMBOL(__vmalloc); | 487 | EXPORT_SYMBOL(__vmalloc); |
| 465 | 488 | ||
| 466 | /** | 489 | /** |
| @@ -478,9 +501,26 @@ void *vmalloc(unsigned long size) | |||
| 478 | { | 501 | { |
| 479 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 502 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
| 480 | } | 503 | } |
| 481 | |||
| 482 | EXPORT_SYMBOL(vmalloc); | 504 | EXPORT_SYMBOL(vmalloc); |
| 483 | 505 | ||
| 506 | /** | ||
| 507 | * vmalloc_node - allocate memory on a specific node | ||
| 508 | * | ||
| 509 | * @size: allocation size | ||
| 510 | * @node: NUMA node | ||
| 511 | * | ||
| 512 | * Allocate enough pages to cover @size from the page level | ||
| 513 | * allocator and map them into contiguous kernel virtual space. | ||
| 514 | * | ||
| 515 | * For tight control over page level allocator and protection flags | ||
| 516 | * use __vmalloc() instead. | ||
| 517 | */ | ||
| 518 | void *vmalloc_node(unsigned long size, int node) | ||
| 519 | { | ||
| 520 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); | ||
| 521 | } | ||
| 522 | EXPORT_SYMBOL(vmalloc_node); | ||
| 523 | |||
| 484 | #ifndef PAGE_KERNEL_EXEC | 524 | #ifndef PAGE_KERNEL_EXEC |
| 485 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | 525 | # define PAGE_KERNEL_EXEC PAGE_KERNEL |
| 486 | #endif | 526 | #endif |
| @@ -515,7 +555,6 @@ void *vmalloc_32(unsigned long size) | |||
| 515 | { | 555 | { |
| 516 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); | 556 | return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); |
| 517 | } | 557 | } |
| 518 | |||
| 519 | EXPORT_SYMBOL(vmalloc_32); | 558 | EXPORT_SYMBOL(vmalloc_32); |
| 520 | 559 | ||
| 521 | long vread(char *buf, char *addr, unsigned long count) | 560 | long vread(char *buf, char *addr, unsigned long count) |
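The vmalloc changes add NUMA-aware entry points: __vmalloc_node(), vmalloc_node() and get_vm_area_node(), with -1 meaning no node preference. A brief usage sketch; the wrapper names are illustrative, and freeing with plain vfree() is simply the normal release path for any vmalloc'ed area:

```c
#include <linux/vmalloc.h>
#include <linux/types.h>

/* Allocate a scratch buffer preferably backed by pages from @node. */
static void *alloc_node_scratch(size_t bytes, int node)
{
	return vmalloc_node(bytes, node);	/* node == -1: no preference */
}

static void free_node_scratch(void *buf)
{
	vfree(buf);	/* node-aware allocations are freed normally */
}
```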
diff --git a/mm/vmscan.c b/mm/vmscan.c index 843c87d1e61f..135bf8ca96ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 417 | * Anonymous process memory has backing store? | 417 | * Anonymous process memory has backing store? |
| 418 | * Try to allocate it some swap space here. | 418 | * Try to allocate it some swap space here. |
| 419 | */ | 419 | */ |
| 420 | if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { | 420 | if (PageAnon(page) && !PageSwapCache(page)) { |
| 421 | if (!sc->may_swap) | ||
| 422 | goto keep_locked; | ||
| 421 | if (!add_to_swap(page)) | 423 | if (!add_to_swap(page)) |
| 422 | goto activate_locked; | 424 | goto activate_locked; |
| 423 | } | 425 | } |
| @@ -519,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 519 | 521 | ||
| 520 | #ifdef CONFIG_SWAP | 522 | #ifdef CONFIG_SWAP |
| 521 | if (PageSwapCache(page)) { | 523 | if (PageSwapCache(page)) { |
| 522 | swp_entry_t swap = { .val = page->private }; | 524 | swp_entry_t swap = { .val = page_private(page) }; |
| 523 | __delete_from_swap_cache(page); | 525 | __delete_from_swap_cache(page); |
| 524 | write_unlock_irq(&mapping->tree_lock); | 526 | write_unlock_irq(&mapping->tree_lock); |
| 525 | swap_free(swap); | 527 | swap_free(swap); |
