diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/bounce.c | 4 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 31 | ||||
-rw-r--r-- | mm/filemap_xip.c | 8 | ||||
-rw-r--r-- | mm/fremap.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 10 | ||||
-rw-r--r-- | mm/memory.c | 59 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/mincore.c | 183 | ||||
-rw-r--r-- | mm/mmap.c | 89 | ||||
-rw-r--r-- | mm/mremap.c | 1 | ||||
-rw-r--r-- | mm/nommu.c | 12 | ||||
-rw-r--r-- | mm/oom_kill.c | 21 | ||||
-rw-r--r-- | mm/page-writeback.c | 147 | ||||
-rw-r--r-- | mm/page_alloc.c | 137 | ||||
-rw-r--r-- | mm/readahead.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 36 | ||||
-rw-r--r-- | mm/shmem.c | 27 | ||||
-rw-r--r-- | mm/slab.c | 119 | ||||
-rw-r--r-- | mm/slob.c | 27 | ||||
-rw-r--r-- | mm/swapfile.c | 12 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 4 | ||||
-rw-r--r-- | mm/truncate.c | 49 | ||||
-rw-r--r-- | mm/vmscan.c | 47 |
25 files changed, 685 insertions, 358 deletions
diff --git a/mm/bounce.c b/mm/bounce.c index e4b62d2a4024..643efbe82402 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
237 | if (!bio) | 237 | if (!bio) |
238 | return; | 238 | return; |
239 | 239 | ||
240 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
241 | |||
240 | /* | 242 | /* |
241 | * at least one page was bounced, fill in possible non-highmem | 243 | * at least one page was bounced, fill in possible non-highmem |
242 | * pages | 244 | * pages |
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | |||
291 | pool = isa_page_pool; | 293 | pool = isa_page_pool; |
292 | } | 294 | } |
293 | 295 | ||
294 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
295 | |||
296 | /* | 296 | /* |
297 | * slow path | 297 | * slow path |
298 | */ | 298 | */ |
diff --git a/mm/fadvise.c b/mm/fadvise.c index 168c78a121bb..0df4c899e979 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
38 | if (!file) | 38 | if (!file) |
39 | return -EBADF; | 39 | return -EBADF; |
40 | 40 | ||
41 | if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { | 41 | if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { |
42 | ret = -ESPIPE; | 42 | ret = -ESPIPE; |
43 | goto out; | 43 | goto out; |
44 | } | 44 | } |
diff --git a/mm/filemap.c b/mm/filemap.c index af7e2f5caea9..f30ef28405d3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -606,26 +606,6 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) | |||
606 | EXPORT_SYMBOL(find_get_page); | 606 | EXPORT_SYMBOL(find_get_page); |
607 | 607 | ||
608 | /** | 608 | /** |
609 | * find_trylock_page - find and lock a page | ||
610 | * @mapping: the address_space to search | ||
611 | * @offset: the page index | ||
612 | * | ||
613 | * Same as find_get_page(), but trylock it instead of incrementing the count. | ||
614 | */ | ||
615 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) | ||
616 | { | ||
617 | struct page *page; | ||
618 | |||
619 | read_lock_irq(&mapping->tree_lock); | ||
620 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
621 | if (page && TestSetPageLocked(page)) | ||
622 | page = NULL; | ||
623 | read_unlock_irq(&mapping->tree_lock); | ||
624 | return page; | ||
625 | } | ||
626 | EXPORT_SYMBOL(find_trylock_page); | ||
627 | |||
628 | /** | ||
629 | * find_lock_page - locate, pin and lock a pagecache page | 609 | * find_lock_page - locate, pin and lock a pagecache page |
630 | * @mapping: the address_space to search | 610 | * @mapping: the address_space to search |
631 | * @offset: the page index | 611 | * @offset: the page index |
@@ -1181,8 +1161,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1181 | if (pos < size) { | 1161 | if (pos < size) { |
1182 | retval = generic_file_direct_IO(READ, iocb, | 1162 | retval = generic_file_direct_IO(READ, iocb, |
1183 | iov, pos, nr_segs); | 1163 | iov, pos, nr_segs); |
1184 | if (retval > 0 && !is_sync_kiocb(iocb)) | ||
1185 | retval = -EIOCBQUEUED; | ||
1186 | if (retval > 0) | 1164 | if (retval > 0) |
1187 | *ppos = pos + retval; | 1165 | *ppos = pos + retval; |
1188 | } | 1166 | } |
@@ -2047,15 +2025,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
2047 | * Sync the fs metadata but not the minor inode changes and | 2025 | * Sync the fs metadata but not the minor inode changes and |
2048 | * of course not the data as we did direct DMA for the IO. | 2026 | * of course not the data as we did direct DMA for the IO. |
2049 | * i_mutex is held, which protects generic_osync_inode() from | 2027 | * i_mutex is held, which protects generic_osync_inode() from |
2050 | * livelocking. | 2028 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. |
2051 | */ | 2029 | */ |
2052 | if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2030 | if ((written >= 0 || written == -EIOCBQUEUED) && |
2031 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
2053 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 2032 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
2054 | if (err < 0) | 2033 | if (err < 0) |
2055 | written = err; | 2034 | written = err; |
2056 | } | 2035 | } |
2057 | if (written == count && !is_sync_kiocb(iocb)) | ||
2058 | written = -EIOCBQUEUED; | ||
2059 | return written; | 2036 | return written; |
2060 | } | 2037 | } |
2061 | EXPORT_SYMBOL(generic_file_direct_write); | 2038 | EXPORT_SYMBOL(generic_file_direct_write); |
@@ -2269,7 +2246,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2269 | if (count == 0) | 2246 | if (count == 0) |
2270 | goto out; | 2247 | goto out; |
2271 | 2248 | ||
2272 | err = remove_suid(file->f_dentry); | 2249 | err = remove_suid(file->f_path.dentry); |
2273 | if (err) | 2250 | if (err) |
2274 | goto out; | 2251 | goto out; |
2275 | 2252 | ||
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b4fd0d7c9bfb..9dd9fbb75139 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -183,13 +183,13 @@ __xip_unmap (struct address_space * mapping, | |||
183 | address = vma->vm_start + | 183 | address = vma->vm_start + |
184 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 184 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 185 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
186 | page = ZERO_PAGE(address); | 186 | page = ZERO_PAGE(0); |
187 | pte = page_check_address(page, mm, address, &ptl); | 187 | pte = page_check_address(page, mm, address, &ptl); |
188 | if (pte) { | 188 | if (pte) { |
189 | /* Nuke the page table entry. */ | 189 | /* Nuke the page table entry. */ |
190 | flush_cache_page(vma, address, pte_pfn(*pte)); | 190 | flush_cache_page(vma, address, pte_pfn(*pte)); |
191 | pteval = ptep_clear_flush(vma, address, pte); | 191 | pteval = ptep_clear_flush(vma, address, pte); |
192 | page_remove_rmap(page); | 192 | page_remove_rmap(page, vma); |
193 | dec_mm_counter(mm, file_rss); | 193 | dec_mm_counter(mm, file_rss); |
194 | BUG_ON(pte_dirty(pteval)); | 194 | BUG_ON(pte_dirty(pteval)); |
195 | pte_unmap_unlock(pte, ptl); | 195 | pte_unmap_unlock(pte, ptl); |
@@ -246,7 +246,7 @@ xip_file_nopage(struct vm_area_struct * area, | |||
246 | __xip_unmap(mapping, pgoff); | 246 | __xip_unmap(mapping, pgoff); |
247 | } else { | 247 | } else { |
248 | /* not shared and writable, use ZERO_PAGE() */ | 248 | /* not shared and writable, use ZERO_PAGE() */ |
249 | page = ZERO_PAGE(address); | 249 | page = ZERO_PAGE(0); |
250 | } | 250 | } |
251 | 251 | ||
252 | out: | 252 | out: |
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
379 | if (count == 0) | 379 | if (count == 0) |
380 | goto out_backing; | 380 | goto out_backing; |
381 | 381 | ||
382 | ret = remove_suid(filp->f_dentry); | 382 | ret = remove_suid(filp->f_path.dentry); |
383 | if (ret) | 383 | if (ret) |
384 | goto out_backing; | 384 | goto out_backing; |
385 | 385 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index b77a002c3352..4e3f53dd5fd4 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
33 | if (page) { | 33 | if (page) { |
34 | if (pte_dirty(pte)) | 34 | if (pte_dirty(pte)) |
35 | set_page_dirty(page); | 35 | set_page_dirty(page); |
36 | page_remove_rmap(page); | 36 | page_remove_rmap(page, vma); |
37 | page_cache_release(page); | 37 | page_cache_release(page); |
38 | } | 38 | } |
39 | } else { | 39 | } else { |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ccc7f230252..36db012b38dd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr) | |||
44 | } | 44 | } |
45 | 45 | ||
46 | static void copy_huge_page(struct page *dst, struct page *src, | 46 | static void copy_huge_page(struct page *dst, struct page *src, |
47 | unsigned long addr) | 47 | unsigned long addr, struct vm_area_struct *vma) |
48 | { | 48 | { |
49 | int i; | 49 | int i; |
50 | 50 | ||
51 | might_sleep(); | 51 | might_sleep(); |
52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | 52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { |
53 | cond_resched(); | 53 | cond_resched(); |
54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); | 54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
55 | } | 55 | } |
56 | } | 56 | } |
57 | 57 | ||
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
73 | 73 | ||
74 | for (z = zonelist->zones; *z; z++) { | 74 | for (z = zonelist->zones; *z; z++) { |
75 | nid = zone_to_nid(*z); | 75 | nid = zone_to_nid(*z); |
76 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && | 76 | if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && |
77 | !list_empty(&hugepage_freelists[nid])) | 77 | !list_empty(&hugepage_freelists[nid])) |
78 | break; | 78 | break; |
79 | } | 79 | } |
@@ -389,6 +389,8 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
389 | continue; | 389 | continue; |
390 | 390 | ||
391 | page = pte_page(pte); | 391 | page = pte_page(pte); |
392 | if (pte_dirty(pte)) | ||
393 | set_page_dirty(page); | ||
392 | list_add(&page->lru, &page_list); | 394 | list_add(&page->lru, &page_list); |
393 | } | 395 | } |
394 | spin_unlock(&mm->page_table_lock); | 396 | spin_unlock(&mm->page_table_lock); |
@@ -442,7 +444,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
442 | } | 444 | } |
443 | 445 | ||
444 | spin_unlock(&mm->page_table_lock); | 446 | spin_unlock(&mm->page_table_lock); |
445 | copy_huge_page(new_page, old_page, address); | 447 | copy_huge_page(new_page, old_page, address, vma); |
446 | spin_lock(&mm->page_table_lock); | 448 | spin_lock(&mm->page_table_lock); |
447 | 449 | ||
448 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 450 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
diff --git a/mm/memory.c b/mm/memory.c index 4198df0dff1c..ef09f0acb1d8 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
681 | mark_page_accessed(page); | 681 | mark_page_accessed(page); |
682 | file_rss--; | 682 | file_rss--; |
683 | } | 683 | } |
684 | page_remove_rmap(page); | 684 | page_remove_rmap(page, vma); |
685 | tlb_remove_page(tlb, page); | 685 | tlb_remove_page(tlb, page); |
686 | continue; | 686 | continue; |
687 | } | 687 | } |
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1091 | if (pages) { | 1091 | if (pages) { |
1092 | pages[i] = page; | 1092 | pages[i] = page; |
1093 | 1093 | ||
1094 | flush_anon_page(page, start); | 1094 | flush_anon_page(vma, page, start); |
1095 | flush_dcache_page(page); | 1095 | flush_dcache_page(page); |
1096 | } | 1096 | } |
1097 | if (vmas) | 1097 | if (vmas) |
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
1110 | { | 1110 | { |
1111 | pte_t *pte; | 1111 | pte_t *pte; |
1112 | spinlock_t *ptl; | 1112 | spinlock_t *ptl; |
1113 | int err = 0; | ||
1113 | 1114 | ||
1114 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | 1115 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); |
1115 | if (!pte) | 1116 | if (!pte) |
1116 | return -ENOMEM; | 1117 | return -EAGAIN; |
1117 | arch_enter_lazy_mmu_mode(); | 1118 | arch_enter_lazy_mmu_mode(); |
1118 | do { | 1119 | do { |
1119 | struct page *page = ZERO_PAGE(addr); | 1120 | struct page *page = ZERO_PAGE(addr); |
1120 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | 1121 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); |
1122 | |||
1123 | if (unlikely(!pte_none(*pte))) { | ||
1124 | err = -EEXIST; | ||
1125 | pte++; | ||
1126 | break; | ||
1127 | } | ||
1121 | page_cache_get(page); | 1128 | page_cache_get(page); |
1122 | page_add_file_rmap(page); | 1129 | page_add_file_rmap(page); |
1123 | inc_mm_counter(mm, file_rss); | 1130 | inc_mm_counter(mm, file_rss); |
1124 | BUG_ON(!pte_none(*pte)); | ||
1125 | set_pte_at(mm, addr, pte, zero_pte); | 1131 | set_pte_at(mm, addr, pte, zero_pte); |
1126 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1132 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1127 | arch_leave_lazy_mmu_mode(); | 1133 | arch_leave_lazy_mmu_mode(); |
1128 | pte_unmap_unlock(pte - 1, ptl); | 1134 | pte_unmap_unlock(pte - 1, ptl); |
1129 | return 0; | 1135 | return err; |
1130 | } | 1136 | } |
1131 | 1137 | ||
1132 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | 1138 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, |
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
1134 | { | 1140 | { |
1135 | pmd_t *pmd; | 1141 | pmd_t *pmd; |
1136 | unsigned long next; | 1142 | unsigned long next; |
1143 | int err; | ||
1137 | 1144 | ||
1138 | pmd = pmd_alloc(mm, pud, addr); | 1145 | pmd = pmd_alloc(mm, pud, addr); |
1139 | if (!pmd) | 1146 | if (!pmd) |
1140 | return -ENOMEM; | 1147 | return -EAGAIN; |
1141 | do { | 1148 | do { |
1142 | next = pmd_addr_end(addr, end); | 1149 | next = pmd_addr_end(addr, end); |
1143 | if (zeromap_pte_range(mm, pmd, addr, next, prot)) | 1150 | err = zeromap_pte_range(mm, pmd, addr, next, prot); |
1144 | return -ENOMEM; | 1151 | if (err) |
1152 | break; | ||
1145 | } while (pmd++, addr = next, addr != end); | 1153 | } while (pmd++, addr = next, addr != end); |
1146 | return 0; | 1154 | return err; |
1147 | } | 1155 | } |
1148 | 1156 | ||
1149 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | 1157 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, |
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | |||
1151 | { | 1159 | { |
1152 | pud_t *pud; | 1160 | pud_t *pud; |
1153 | unsigned long next; | 1161 | unsigned long next; |
1162 | int err; | ||
1154 | 1163 | ||
1155 | pud = pud_alloc(mm, pgd, addr); | 1164 | pud = pud_alloc(mm, pgd, addr); |
1156 | if (!pud) | 1165 | if (!pud) |
1157 | return -ENOMEM; | 1166 | return -EAGAIN; |
1158 | do { | 1167 | do { |
1159 | next = pud_addr_end(addr, end); | 1168 | next = pud_addr_end(addr, end); |
1160 | if (zeromap_pmd_range(mm, pud, addr, next, prot)) | 1169 | err = zeromap_pmd_range(mm, pud, addr, next, prot); |
1161 | return -ENOMEM; | 1170 | if (err) |
1171 | break; | ||
1162 | } while (pud++, addr = next, addr != end); | 1172 | } while (pud++, addr = next, addr != end); |
1163 | return 0; | 1173 | return err; |
1164 | } | 1174 | } |
1165 | 1175 | ||
1166 | int zeromap_page_range(struct vm_area_struct *vma, | 1176 | int zeromap_page_range(struct vm_area_struct *vma, |
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
1431 | return pte; | 1441 | return pte; |
1432 | } | 1442 | } |
1433 | 1443 | ||
1434 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) | 1444 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
1435 | { | 1445 | { |
1436 | /* | 1446 | /* |
1437 | * If the source page was a PFN mapping, we don't have | 1447 | * If the source page was a PFN mapping, we don't have |
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo | |||
1454 | kunmap_atomic(kaddr, KM_USER0); | 1464 | kunmap_atomic(kaddr, KM_USER0); |
1455 | flush_dcache_page(dst); | 1465 | flush_dcache_page(dst); |
1456 | return; | 1466 | return; |
1457 | 1467 | ||
1458 | } | 1468 | } |
1459 | copy_user_highpage(dst, src, va); | 1469 | copy_user_highpage(dst, src, va, vma); |
1460 | } | 1470 | } |
1461 | 1471 | ||
1462 | /* | 1472 | /* |
@@ -1567,7 +1577,7 @@ gotten: | |||
1567 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 1577 | new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
1568 | if (!new_page) | 1578 | if (!new_page) |
1569 | goto oom; | 1579 | goto oom; |
1570 | cow_user_page(new_page, old_page, address); | 1580 | cow_user_page(new_page, old_page, address, vma); |
1571 | } | 1581 | } |
1572 | 1582 | ||
1573 | /* | 1583 | /* |
@@ -1576,7 +1586,7 @@ gotten: | |||
1576 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 1586 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
1577 | if (likely(pte_same(*page_table, orig_pte))) { | 1587 | if (likely(pte_same(*page_table, orig_pte))) { |
1578 | if (old_page) { | 1588 | if (old_page) { |
1579 | page_remove_rmap(old_page); | 1589 | page_remove_rmap(old_page, vma); |
1580 | if (!PageAnon(old_page)) { | 1590 | if (!PageAnon(old_page)) { |
1581 | dec_mm_counter(mm, file_rss); | 1591 | dec_mm_counter(mm, file_rss); |
1582 | inc_mm_counter(mm, anon_rss); | 1592 | inc_mm_counter(mm, anon_rss); |
@@ -2190,7 +2200,7 @@ retry: | |||
2190 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 2200 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
2191 | if (!page) | 2201 | if (!page) |
2192 | goto oom; | 2202 | goto oom; |
2193 | copy_user_highpage(page, new_page, address); | 2203 | copy_user_highpage(page, new_page, address, vma); |
2194 | page_cache_release(new_page); | 2204 | page_cache_release(new_page); |
2195 | new_page = page; | 2205 | new_page = page; |
2196 | anon = 1; | 2206 | anon = 1; |
@@ -2596,8 +2606,15 @@ static int __init gate_vma_init(void) | |||
2596 | gate_vma.vm_mm = NULL; | 2606 | gate_vma.vm_mm = NULL; |
2597 | gate_vma.vm_start = FIXADDR_USER_START; | 2607 | gate_vma.vm_start = FIXADDR_USER_START; |
2598 | gate_vma.vm_end = FIXADDR_USER_END; | 2608 | gate_vma.vm_end = FIXADDR_USER_END; |
2599 | gate_vma.vm_page_prot = PAGE_READONLY; | 2609 | gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; |
2600 | gate_vma.vm_flags = 0; | 2610 | gate_vma.vm_page_prot = __P101; |
2611 | /* | ||
2612 | * Make sure the vDSO gets into every core dump. | ||
2613 | * Dumping its contents makes post-mortem fully interpretable later | ||
2614 | * without matching up the same kernel and hardware config to see | ||
2615 | * what PC values meant. | ||
2616 | */ | ||
2617 | gate_vma.vm_flags |= VM_ALWAYSDUMP; | ||
2601 | return 0; | 2618 | return 0; |
2602 | } | 2619 | } |
2603 | __initcall(gate_vma_init); | 2620 | __initcall(gate_vma_init); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0c055a090f4d..84279127fcd3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -67,11 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
67 | zone_type = zone - pgdat->node_zones; | 67 | zone_type = zone - pgdat->node_zones; |
68 | if (!populated_zone(zone)) { | 68 | if (!populated_zone(zone)) { |
69 | int ret = 0; | 69 | int ret = 0; |
70 | ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); | 70 | ret = init_currently_empty_zone(zone, phys_start_pfn, |
71 | nr_pages, MEMMAP_HOTPLUG); | ||
71 | if (ret < 0) | 72 | if (ret < 0) |
72 | return ret; | 73 | return ret; |
73 | } | 74 | } |
74 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 75 | memmap_init_zone(nr_pages, nid, zone_type, |
76 | phys_start_pfn, MEMMAP_HOTPLUG); | ||
75 | return 0; | 77 | return 0; |
76 | } | 78 | } |
77 | 79 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b917d6fdc1bb..c2aec0e1090d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -884,6 +884,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, | |||
884 | err = get_nodes(&nodes, nmask, maxnode); | 884 | err = get_nodes(&nodes, nmask, maxnode); |
885 | if (err) | 885 | if (err) |
886 | return err; | 886 | return err; |
887 | #ifdef CONFIG_CPUSETS | ||
888 | /* Restrict the nodes to the allowed nodes in the cpuset */ | ||
889 | nodes_and(nodes, nodes, current->mems_allowed); | ||
890 | #endif | ||
887 | return do_mbind(start, len, mode, &nodes, flags); | 891 | return do_mbind(start, len, mode, &nodes, flags); |
888 | } | 892 | } |
889 | 893 | ||
@@ -1857,7 +1861,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1857 | 1861 | ||
1858 | if (file) { | 1862 | if (file) { |
1859 | seq_printf(m, " file="); | 1863 | seq_printf(m, " file="); |
1860 | seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); | 1864 | seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); |
1861 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1865 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1862 | seq_printf(m, " heap"); | 1866 | seq_printf(m, " heap"); |
1863 | } else if (vma->vm_start <= mm->start_stack && | 1867 | } else if (vma->vm_start <= mm->start_stack && |
diff --git a/mm/mincore.c b/mm/mincore.c index 72890780c1c9..8aca6f7167bb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * linux/mm/mincore.c | 2 | * linux/mm/mincore.c |
3 | * | 3 | * |
4 | * Copyright (C) 1994-1999 Linus Torvalds | 4 | * Copyright (C) 1994-2006 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma, | |||
38 | return present; | 38 | return present; |
39 | } | 39 | } |
40 | 40 | ||
41 | static long mincore_vma(struct vm_area_struct * vma, | 41 | /* |
42 | unsigned long start, unsigned long end, unsigned char __user * vec) | 42 | * Do a chunk of "sys_mincore()". We've already checked |
43 | * all the arguments, we hold the mmap semaphore: we should | ||
44 | * just return the amount of info we're asked for. | ||
45 | */ | ||
46 | static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) | ||
43 | { | 47 | { |
44 | long error, i, remaining; | 48 | unsigned long i, nr, pgoff; |
45 | unsigned char * tmp; | 49 | struct vm_area_struct *vma = find_vma(current->mm, addr); |
46 | |||
47 | error = -ENOMEM; | ||
48 | if (!vma->vm_file) | ||
49 | return error; | ||
50 | |||
51 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
52 | if (end > vma->vm_end) | ||
53 | end = vma->vm_end; | ||
54 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
55 | 50 | ||
56 | error = -EAGAIN; | 51 | /* |
57 | tmp = (unsigned char *) __get_free_page(GFP_KERNEL); | 52 | * find_vma() didn't find anything above us, or we're |
58 | if (!tmp) | 53 | * in an unmapped hole in the address space: ENOMEM. |
59 | return error; | 54 | */ |
55 | if (!vma || addr < vma->vm_start) | ||
56 | return -ENOMEM; | ||
60 | 57 | ||
61 | /* (end - start) is # of pages, and also # of bytes in "vec */ | 58 | /* |
62 | remaining = (end - start), | 59 | * Ok, got it. But check whether it's a segment we support |
60 | * mincore() on. Right now, we don't do any anonymous mappings. | ||
61 | * | ||
62 | * FIXME: This is just stupid. And returning ENOMEM is | ||
63 | * stupid too. We should just look at the page tables. But | ||
64 | * this is what we've traditionally done, so we'll just | ||
65 | * continue doing it. | ||
66 | */ | ||
67 | if (!vma->vm_file) | ||
68 | return -ENOMEM; | ||
63 | 69 | ||
64 | error = 0; | 70 | /* |
65 | for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) { | 71 | * Calculate how many pages there are left in the vma, and |
66 | int j = 0; | 72 | * what the pgoff is for our address. |
67 | long thispiece = (remaining < PAGE_SIZE) ? | 73 | */ |
68 | remaining : PAGE_SIZE; | 74 | nr = (vma->vm_end - addr) >> PAGE_SHIFT; |
75 | if (nr > pages) | ||
76 | nr = pages; | ||
69 | 77 | ||
70 | while (j < thispiece) | 78 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; |
71 | tmp[j++] = mincore_page(vma, start++); | 79 | pgoff += vma->vm_pgoff; |
72 | 80 | ||
73 | if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) { | 81 | /* And then we just fill the sucker in.. */ |
74 | error = -EFAULT; | 82 | for (i = 0 ; i < nr; i++, pgoff++) |
75 | break; | 83 | vec[i] = mincore_page(vma, pgoff); |
76 | } | ||
77 | } | ||
78 | 84 | ||
79 | free_page((unsigned long) tmp); | 85 | return nr; |
80 | return error; | ||
81 | } | 86 | } |
82 | 87 | ||
83 | /* | 88 | /* |
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma, | |||
107 | asmlinkage long sys_mincore(unsigned long start, size_t len, | 112 | asmlinkage long sys_mincore(unsigned long start, size_t len, |
108 | unsigned char __user * vec) | 113 | unsigned char __user * vec) |
109 | { | 114 | { |
110 | int index = 0; | 115 | long retval; |
111 | unsigned long end, limit; | 116 | unsigned long pages; |
112 | struct vm_area_struct * vma; | 117 | unsigned char *tmp; |
113 | size_t max; | ||
114 | int unmapped_error = 0; | ||
115 | long error; | ||
116 | |||
117 | /* check the arguments */ | ||
118 | if (start & ~PAGE_CACHE_MASK) | ||
119 | goto einval; | ||
120 | |||
121 | limit = TASK_SIZE; | ||
122 | if (start >= limit) | ||
123 | goto enomem; | ||
124 | |||
125 | if (!len) | ||
126 | return 0; | ||
127 | |||
128 | max = limit - start; | ||
129 | len = PAGE_CACHE_ALIGN(len); | ||
130 | if (len > max || !len) | ||
131 | goto enomem; | ||
132 | 118 | ||
133 | end = start + len; | 119 | /* Check the start address: needs to be page-aligned.. */ |
120 | if (start & ~PAGE_CACHE_MASK) | ||
121 | return -EINVAL; | ||
134 | 122 | ||
135 | /* check the output buffer whilst holding the lock */ | 123 | /* ..and we need to be passed a valid user-space range */ |
136 | error = -EFAULT; | 124 | if (!access_ok(VERIFY_READ, (void __user *) start, len)) |
137 | down_read(&current->mm->mmap_sem); | 125 | return -ENOMEM; |
138 | 126 | ||
139 | if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT)) | 127 | /* This also avoids any overflows on PAGE_CACHE_ALIGN */ |
140 | goto out; | 128 | pages = len >> PAGE_SHIFT; |
129 | pages += (len & ~PAGE_MASK) != 0; | ||
141 | 130 | ||
142 | /* | 131 | if (!access_ok(VERIFY_WRITE, vec, pages)) |
143 | * If the interval [start,end) covers some unmapped address | 132 | return -EFAULT; |
144 | * ranges, just ignore them, but return -ENOMEM at the end. | ||
145 | */ | ||
146 | error = 0; | ||
147 | |||
148 | vma = find_vma(current->mm, start); | ||
149 | while (vma) { | ||
150 | /* Here start < vma->vm_end. */ | ||
151 | if (start < vma->vm_start) { | ||
152 | unmapped_error = -ENOMEM; | ||
153 | start = vma->vm_start; | ||
154 | } | ||
155 | 133 | ||
156 | /* Here vma->vm_start <= start < vma->vm_end. */ | 134 | tmp = (void *) __get_free_page(GFP_USER); |
157 | if (end <= vma->vm_end) { | 135 | if (!tmp) |
158 | if (start < end) { | 136 | return -EAGAIN; |
159 | error = mincore_vma(vma, start, end, | 137 | |
160 | &vec[index]); | 138 | retval = 0; |
161 | if (error) | 139 | while (pages) { |
162 | goto out; | 140 | /* |
163 | } | 141 | * Do at most PAGE_SIZE entries per iteration, due to |
164 | error = unmapped_error; | 142 | * the temporary buffer size. |
165 | goto out; | 143 | */ |
144 | down_read(&current->mm->mmap_sem); | ||
145 | retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); | ||
146 | up_read(&current->mm->mmap_sem); | ||
147 | |||
148 | if (retval <= 0) | ||
149 | break; | ||
150 | if (copy_to_user(vec, tmp, retval)) { | ||
151 | retval = -EFAULT; | ||
152 | break; | ||
166 | } | 153 | } |
167 | 154 | pages -= retval; | |
168 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | 155 | vec += retval; |
169 | error = mincore_vma(vma, start, vma->vm_end, &vec[index]); | 156 | start += retval << PAGE_SHIFT; |
170 | if (error) | 157 | retval = 0; |
171 | goto out; | ||
172 | index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; | ||
173 | start = vma->vm_end; | ||
174 | vma = vma->vm_next; | ||
175 | } | 158 | } |
176 | 159 | free_page((unsigned long) tmp); | |
177 | /* we found a hole in the area queried if we arrive here */ | 160 | return retval; |
178 | error = -ENOMEM; | ||
179 | |||
180 | out: | ||
181 | up_read(&current->mm->mmap_sem); | ||
182 | return error; | ||
183 | |||
184 | einval: | ||
185 | return -EINVAL; | ||
186 | enomem: | ||
187 | return -ENOMEM; | ||
188 | } | 161 | } |
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
188 | struct file *file, struct address_space *mapping) | 188 | struct file *file, struct address_space *mapping) |
189 | { | 189 | { |
190 | if (vma->vm_flags & VM_DENYWRITE) | 190 | if (vma->vm_flags & VM_DENYWRITE) |
191 | atomic_inc(&file->f_dentry->d_inode->i_writecount); | 191 | atomic_inc(&file->f_path.dentry->d_inode->i_writecount); |
192 | if (vma->vm_flags & VM_SHARED) | 192 | if (vma->vm_flags & VM_SHARED) |
193 | mapping->i_mmap_writable--; | 193 | mapping->i_mmap_writable--; |
194 | 194 | ||
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma) | |||
399 | struct address_space *mapping = file->f_mapping; | 399 | struct address_space *mapping = file->f_mapping; |
400 | 400 | ||
401 | if (vma->vm_flags & VM_DENYWRITE) | 401 | if (vma->vm_flags & VM_DENYWRITE) |
402 | atomic_dec(&file->f_dentry->d_inode->i_writecount); | 402 | atomic_dec(&file->f_path.dentry->d_inode->i_writecount); |
403 | if (vma->vm_flags & VM_SHARED) | 403 | if (vma->vm_flags & VM_SHARED) |
404 | mapping->i_mmap_writable++; | 404 | mapping->i_mmap_writable++; |
405 | 405 | ||
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
907 | * mounted, in which case we dont add PROT_EXEC.) | 907 | * mounted, in which case we dont add PROT_EXEC.) |
908 | */ | 908 | */ |
909 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) | 909 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
910 | if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))) | 910 | if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) |
911 | prot |= PROT_EXEC; | 911 | prot |= PROT_EXEC; |
912 | 912 | ||
913 | if (!len) | 913 | if (!len) |
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
960 | return -EAGAIN; | 960 | return -EAGAIN; |
961 | } | 961 | } |
962 | 962 | ||
963 | inode = file ? file->f_dentry->d_inode : NULL; | 963 | inode = file ? file->f_path.dentry->d_inode : NULL; |
964 | 964 | ||
965 | if (file) { | 965 | if (file) { |
966 | switch (flags & MAP_TYPE) { | 966 | switch (flags & MAP_TYPE) { |
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
989 | case MAP_PRIVATE: | 989 | case MAP_PRIVATE: |
990 | if (!(file->f_mode & FMODE_READ)) | 990 | if (!(file->f_mode & FMODE_READ)) |
991 | return -EACCES; | 991 | return -EACCES; |
992 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | 992 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
993 | if (vm_flags & VM_EXEC) | 993 | if (vm_flags & VM_EXEC) |
994 | return -EPERM; | 994 | return -EPERM; |
995 | vm_flags &= ~VM_MAYEXEC; | 995 | vm_flags &= ~VM_MAYEXEC; |
@@ -1477,6 +1477,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
1477 | { | 1477 | { |
1478 | struct mm_struct *mm = vma->vm_mm; | 1478 | struct mm_struct *mm = vma->vm_mm; |
1479 | struct rlimit *rlim = current->signal->rlim; | 1479 | struct rlimit *rlim = current->signal->rlim; |
1480 | unsigned long new_start; | ||
1480 | 1481 | ||
1481 | /* address space limit tests */ | 1482 | /* address space limit tests */ |
1482 | if (!may_expand_vm(mm, grow)) | 1483 | if (!may_expand_vm(mm, grow)) |
@@ -1496,6 +1497,12 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
1496 | return -ENOMEM; | 1497 | return -ENOMEM; |
1497 | } | 1498 | } |
1498 | 1499 | ||
1500 | /* Check to ensure the stack will not grow into a hugetlb-only region */ | ||
1501 | new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : | ||
1502 | vma->vm_end - size; | ||
1503 | if (is_hugepage_only_range(vma->vm_mm, new_start, size)) | ||
1504 | return -EFAULT; | ||
1505 | |||
1499 | /* | 1506 | /* |
1500 | * Overcommit.. This must be the final test, as it will | 1507 | * Overcommit.. This must be the final test, as it will |
1501 | * update security statistics. | 1508 | * update security statistics. |
@@ -2094,3 +2101,75 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) | |||
2094 | return 0; | 2101 | return 0; |
2095 | return 1; | 2102 | return 1; |
2096 | } | 2103 | } |
2104 | |||
2105 | |||
2106 | static struct page *special_mapping_nopage(struct vm_area_struct *vma, | ||
2107 | unsigned long address, int *type) | ||
2108 | { | ||
2109 | struct page **pages; | ||
2110 | |||
2111 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
2112 | |||
2113 | address -= vma->vm_start; | ||
2114 | for (pages = vma->vm_private_data; address > 0 && *pages; ++pages) | ||
2115 | address -= PAGE_SIZE; | ||
2116 | |||
2117 | if (*pages) { | ||
2118 | struct page *page = *pages; | ||
2119 | get_page(page); | ||
2120 | return page; | ||
2121 | } | ||
2122 | |||
2123 | return NOPAGE_SIGBUS; | ||
2124 | } | ||
2125 | |||
2126 | /* | ||
2127 | * Having a close hook prevents vma merging regardless of flags. | ||
2128 | */ | ||
2129 | static void special_mapping_close(struct vm_area_struct *vma) | ||
2130 | { | ||
2131 | } | ||
2132 | |||
2133 | static struct vm_operations_struct special_mapping_vmops = { | ||
2134 | .close = special_mapping_close, | ||
2135 | .nopage = special_mapping_nopage, | ||
2136 | }; | ||
2137 | |||
2138 | /* | ||
2139 | * Called with mm->mmap_sem held for writing. | ||
2140 | * Insert a new vma covering the given region, with the given flags. | ||
2141 | * Its pages are supplied by the given array of struct page *. | ||
2142 | * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. | ||
2143 | * The region past the last page supplied will always produce SIGBUS. | ||
2144 | * The array pointer and the pages it points to are assumed to stay alive | ||
2145 | * for as long as this mapping might exist. | ||
2146 | */ | ||
2147 | int install_special_mapping(struct mm_struct *mm, | ||
2148 | unsigned long addr, unsigned long len, | ||
2149 | unsigned long vm_flags, struct page **pages) | ||
2150 | { | ||
2151 | struct vm_area_struct *vma; | ||
2152 | |||
2153 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | ||
2154 | if (unlikely(vma == NULL)) | ||
2155 | return -ENOMEM; | ||
2156 | |||
2157 | vma->vm_mm = mm; | ||
2158 | vma->vm_start = addr; | ||
2159 | vma->vm_end = addr + len; | ||
2160 | |||
2161 | vma->vm_flags = vm_flags | mm->def_flags; | ||
2162 | vma->vm_page_prot = protection_map[vma->vm_flags & 7]; | ||
2163 | |||
2164 | vma->vm_ops = &special_mapping_vmops; | ||
2165 | vma->vm_private_data = pages; | ||
2166 | |||
2167 | if (unlikely(insert_vm_struct(mm, vma))) { | ||
2168 | kmem_cache_free(vm_area_cachep, vma); | ||
2169 | return -ENOMEM; | ||
2170 | } | ||
2171 | |||
2172 | mm->total_vm += len >> PAGE_SHIFT; | ||
2173 | |||
2174 | return 0; | ||
2175 | } | ||
diff --git a/mm/mremap.c b/mm/mremap.c index 9c769fa29f32..5d4bd4f95b8e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -105,7 +105,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
105 | if (pte_none(*old_pte)) | 105 | if (pte_none(*old_pte)) |
106 | continue; | 106 | continue; |
107 | pte = ptep_clear_flush(vma, old_addr, old_pte); | 107 | pte = ptep_clear_flush(vma, old_addr, old_pte); |
108 | /* ZERO_PAGE can be dependant on virtual addr */ | ||
109 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); | 108 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); |
110 | set_pte_at(mm, new_addr, new_pte, pte); | 109 | set_pte_at(mm, new_addr, new_pte, pte); |
111 | } | 110 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index af874569d0f1..23fb033e596d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -523,7 +523,7 @@ static int validate_mmap_request(struct file *file, | |||
523 | */ | 523 | */ |
524 | mapping = file->f_mapping; | 524 | mapping = file->f_mapping; |
525 | if (!mapping) | 525 | if (!mapping) |
526 | mapping = file->f_dentry->d_inode->i_mapping; | 526 | mapping = file->f_path.dentry->d_inode->i_mapping; |
527 | 527 | ||
528 | capabilities = 0; | 528 | capabilities = 0; |
529 | if (mapping && mapping->backing_dev_info) | 529 | if (mapping && mapping->backing_dev_info) |
@@ -532,7 +532,7 @@ static int validate_mmap_request(struct file *file, | |||
532 | if (!capabilities) { | 532 | if (!capabilities) { |
533 | /* no explicit capabilities set, so assume some | 533 | /* no explicit capabilities set, so assume some |
534 | * defaults */ | 534 | * defaults */ |
535 | switch (file->f_dentry->d_inode->i_mode & S_IFMT) { | 535 | switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { |
536 | case S_IFREG: | 536 | case S_IFREG: |
537 | case S_IFBLK: | 537 | case S_IFBLK: |
538 | capabilities = BDI_CAP_MAP_COPY; | 538 | capabilities = BDI_CAP_MAP_COPY; |
@@ -563,11 +563,11 @@ static int validate_mmap_request(struct file *file, | |||
563 | !(file->f_mode & FMODE_WRITE)) | 563 | !(file->f_mode & FMODE_WRITE)) |
564 | return -EACCES; | 564 | return -EACCES; |
565 | 565 | ||
566 | if (IS_APPEND(file->f_dentry->d_inode) && | 566 | if (IS_APPEND(file->f_path.dentry->d_inode) && |
567 | (file->f_mode & FMODE_WRITE)) | 567 | (file->f_mode & FMODE_WRITE)) |
568 | return -EACCES; | 568 | return -EACCES; |
569 | 569 | ||
570 | if (locks_verify_locked(file->f_dentry->d_inode)) | 570 | if (locks_verify_locked(file->f_path.dentry->d_inode)) |
571 | return -EAGAIN; | 571 | return -EAGAIN; |
572 | 572 | ||
573 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 573 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) |
@@ -598,7 +598,7 @@ static int validate_mmap_request(struct file *file, | |||
598 | 598 | ||
599 | /* handle executable mappings and implied executable | 599 | /* handle executable mappings and implied executable |
600 | * mappings */ | 600 | * mappings */ |
601 | if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) { | 601 | if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { |
602 | if (prot & PROT_EXEC) | 602 | if (prot & PROT_EXEC) |
603 | return -EPERM; | 603 | return -EPERM; |
604 | } | 604 | } |
@@ -833,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
833 | continue; | 833 | continue; |
834 | 834 | ||
835 | /* search for overlapping mappings on the same file */ | 835 | /* search for overlapping mappings on the same file */ |
836 | if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode) | 836 | if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) |
837 | continue; | 837 | continue; |
838 | 838 | ||
839 | if (vma->vm_pgoff >= pgoff + pglen) | 839 | if (vma->vm_pgoff >= pgoff + pglen) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 223d9ccb7d64..b278b8d60eee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
61 | } | 61 | } |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * swapoff can easily use up all memory, so kill those first. | ||
65 | */ | ||
66 | if (p->flags & PF_SWAPOFF) | ||
67 | return ULONG_MAX; | ||
68 | |||
69 | /* | ||
70 | * The memory size of the process is the basis for the badness. | 64 | * The memory size of the process is the basis for the badness. |
71 | */ | 65 | */ |
72 | points = mm->total_vm; | 66 | points = mm->total_vm; |
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
77 | task_unlock(p); | 71 | task_unlock(p); |
78 | 72 | ||
79 | /* | 73 | /* |
74 | * swapoff can easily use up all memory, so kill those first. | ||
75 | */ | ||
76 | if (p->flags & PF_SWAPOFF) | ||
77 | return ULONG_MAX; | ||
78 | |||
79 | /* | ||
80 | * Processes which fork a lot of child processes are likely | 80 | * Processes which fork a lot of child processes are likely |
81 | * a good choice. We add half the vmsize of the children if they | 81 | * a good choice. We add half the vmsize of the children if they |
82 | * have an own mm. This prevents forking servers to flood the | 82 | * have an own mm. This prevents forking servers to flood the |
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
174 | { | 174 | { |
175 | #ifdef CONFIG_NUMA | 175 | #ifdef CONFIG_NUMA |
176 | struct zone **z; | 176 | struct zone **z; |
177 | nodemask_t nodes = node_online_map; | 177 | nodemask_t nodes; |
178 | int node; | ||
179 | /* node has memory ? */ | ||
180 | for_each_online_node(node) | ||
181 | if (NODE_DATA(node)->node_present_pages) | ||
182 | node_set(node, nodes); | ||
178 | 183 | ||
179 | for (z = zonelist->zones; *z; z++) | 184 | for (z = zonelist->zones; *z; z++) |
180 | if (cpuset_zone_allowed(*z, gfp_mask)) | 185 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) |
181 | node_clear(zone_to_nid(*z), nodes); | 186 | node_clear(zone_to_nid(*z), nodes); |
182 | else | 187 | else |
183 | return CONSTRAINT_CPUSET; | 188 | return CONSTRAINT_CPUSET; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8d9b19f239c3..be0efbde4994 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> |
24 | #include <linux/task_io_accounting_ops.h> | ||
24 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
25 | #include <linux/mpage.h> | 26 | #include <linux/mpage.h> |
26 | #include <linux/rmap.h> | 27 | #include <linux/rmap.h> |
@@ -132,11 +133,9 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
132 | 133 | ||
133 | #ifdef CONFIG_HIGHMEM | 134 | #ifdef CONFIG_HIGHMEM |
134 | /* | 135 | /* |
135 | * If this mapping can only allocate from low memory, | 136 | * We always exclude high memory from our count. |
136 | * we exclude high memory from our count. | ||
137 | */ | 137 | */ |
138 | if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM)) | 138 | available_memory -= totalhigh_pages; |
139 | available_memory -= totalhigh_pages; | ||
140 | #endif | 139 | #endif |
141 | 140 | ||
142 | 141 | ||
@@ -525,28 +524,25 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
525 | }; | 524 | }; |
526 | 525 | ||
527 | /* | 526 | /* |
528 | * If the machine has a large highmem:lowmem ratio then scale back the default | 527 | * Called early on to tune the page writeback dirty limits. |
529 | * dirty memory thresholds: allowing too much dirty highmem pins an excessive | 528 | * |
530 | * number of buffer_heads. | 529 | * We used to scale dirty pages according to how total memory |
530 | * related to pages that could be allocated for buffers (by | ||
531 | * comparing nr_free_buffer_pages() to vm_total_pages. | ||
532 | * | ||
533 | * However, that was when we used "dirty_ratio" to scale with | ||
534 | * all memory, and we don't do that any more. "dirty_ratio" | ||
535 | * is now applied to total non-HIGHPAGE memory (by subtracting | ||
536 | * totalhigh_pages from vm_total_pages), and as such we can't | ||
537 | * get into the old insane situation any more where we had | ||
538 | * large amounts of dirty pages compared to a small amount of | ||
539 | * non-HIGHMEM memory. | ||
540 | * | ||
541 | * But we might still want to scale the dirty_ratio by how | ||
542 | * much memory the box has.. | ||
531 | */ | 543 | */ |
532 | void __init page_writeback_init(void) | 544 | void __init page_writeback_init(void) |
533 | { | 545 | { |
534 | long buffer_pages = nr_free_buffer_pages(); | ||
535 | long correction; | ||
536 | |||
537 | correction = (100 * 4 * buffer_pages) / vm_total_pages; | ||
538 | |||
539 | if (correction < 100) { | ||
540 | dirty_background_ratio *= correction; | ||
541 | dirty_background_ratio /= 100; | ||
542 | vm_dirty_ratio *= correction; | ||
543 | vm_dirty_ratio /= 100; | ||
544 | |||
545 | if (dirty_background_ratio <= 0) | ||
546 | dirty_background_ratio = 1; | ||
547 | if (vm_dirty_ratio <= 0) | ||
548 | vm_dirty_ratio = 1; | ||
549 | } | ||
550 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 546 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
551 | writeback_set_ratelimit(); | 547 | writeback_set_ratelimit(); |
552 | register_cpu_notifier(&ratelimit_nb); | 548 | register_cpu_notifier(&ratelimit_nb); |
@@ -761,23 +757,24 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
761 | struct address_space *mapping = page_mapping(page); | 757 | struct address_space *mapping = page_mapping(page); |
762 | struct address_space *mapping2; | 758 | struct address_space *mapping2; |
763 | 759 | ||
764 | if (mapping) { | 760 | if (!mapping) |
765 | write_lock_irq(&mapping->tree_lock); | 761 | return 1; |
766 | mapping2 = page_mapping(page); | 762 | |
767 | if (mapping2) { /* Race with truncate? */ | 763 | write_lock_irq(&mapping->tree_lock); |
768 | BUG_ON(mapping2 != mapping); | 764 | mapping2 = page_mapping(page); |
769 | if (mapping_cap_account_dirty(mapping)) | 765 | if (mapping2) { /* Race with truncate? */ |
770 | __inc_zone_page_state(page, | 766 | BUG_ON(mapping2 != mapping); |
771 | NR_FILE_DIRTY); | 767 | if (mapping_cap_account_dirty(mapping)) { |
772 | radix_tree_tag_set(&mapping->page_tree, | 768 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
773 | page_index(page), PAGECACHE_TAG_DIRTY); | 769 | task_io_account_write(PAGE_CACHE_SIZE); |
774 | } | ||
775 | write_unlock_irq(&mapping->tree_lock); | ||
776 | if (mapping->host) { | ||
777 | /* !PageAnon && !swapper_space */ | ||
778 | __mark_inode_dirty(mapping->host, | ||
779 | I_DIRTY_PAGES); | ||
780 | } | 770 | } |
771 | radix_tree_tag_set(&mapping->page_tree, | ||
772 | page_index(page), PAGECACHE_TAG_DIRTY); | ||
773 | } | ||
774 | write_unlock_irq(&mapping->tree_lock); | ||
775 | if (mapping->host) { | ||
776 | /* !PageAnon && !swapper_space */ | ||
777 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | ||
781 | } | 778 | } |
782 | return 1; | 779 | return 1; |
783 | } | 780 | } |
@@ -843,39 +840,6 @@ int set_page_dirty_lock(struct page *page) | |||
843 | EXPORT_SYMBOL(set_page_dirty_lock); | 840 | EXPORT_SYMBOL(set_page_dirty_lock); |
844 | 841 | ||
845 | /* | 842 | /* |
846 | * Clear a page's dirty flag, while caring for dirty memory accounting. | ||
847 | * Returns true if the page was previously dirty. | ||
848 | */ | ||
849 | int test_clear_page_dirty(struct page *page) | ||
850 | { | ||
851 | struct address_space *mapping = page_mapping(page); | ||
852 | unsigned long flags; | ||
853 | |||
854 | if (mapping) { | ||
855 | write_lock_irqsave(&mapping->tree_lock, flags); | ||
856 | if (TestClearPageDirty(page)) { | ||
857 | radix_tree_tag_clear(&mapping->page_tree, | ||
858 | page_index(page), | ||
859 | PAGECACHE_TAG_DIRTY); | ||
860 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
861 | /* | ||
862 | * We can continue to use `mapping' here because the | ||
863 | * page is locked, which pins the address_space | ||
864 | */ | ||
865 | if (mapping_cap_account_dirty(mapping)) { | ||
866 | page_mkclean(page); | ||
867 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
868 | } | ||
869 | return 1; | ||
870 | } | ||
871 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
872 | return 0; | ||
873 | } | ||
874 | return TestClearPageDirty(page); | ||
875 | } | ||
876 | EXPORT_SYMBOL(test_clear_page_dirty); | ||
877 | |||
878 | /* | ||
879 | * Clear a page's dirty flag, while caring for dirty memory accounting. | 843 | * Clear a page's dirty flag, while caring for dirty memory accounting. |
880 | * Returns true if the page was previously dirty. | 844 | * Returns true if the page was previously dirty. |
881 | * | 845 | * |
@@ -893,12 +857,41 @@ int clear_page_dirty_for_io(struct page *page) | |||
893 | { | 857 | { |
894 | struct address_space *mapping = page_mapping(page); | 858 | struct address_space *mapping = page_mapping(page); |
895 | 859 | ||
896 | if (mapping) { | 860 | if (mapping && mapping_cap_account_dirty(mapping)) { |
861 | /* | ||
862 | * Yes, Virginia, this is indeed insane. | ||
863 | * | ||
864 | * We use this sequence to make sure that | ||
865 | * (a) we account for dirty stats properly | ||
866 | * (b) we tell the low-level filesystem to | ||
867 | * mark the whole page dirty if it was | ||
868 | * dirty in a pagetable. Only to then | ||
869 | * (c) clean the page again and return 1 to | ||
870 | * cause the writeback. | ||
871 | * | ||
872 | * This way we avoid all nasty races with the | ||
873 | * dirty bit in multiple places and clearing | ||
874 | * them concurrently from different threads. | ||
875 | * | ||
876 | * Note! Normally the "set_page_dirty(page)" | ||
877 | * has no effect on the actual dirty bit - since | ||
878 | * that will already usually be set. But we | ||
879 | * need the side effects, and it can help us | ||
880 | * avoid races. | ||
881 | * | ||
882 | * We basically use the page "master dirty bit" | ||
883 | * as a serialization point for all the different | ||
884 | * threads doing their things. | ||
885 | * | ||
886 | * FIXME! We still have a race here: if somebody | ||
887 | * adds the page back to the page tables in | ||
888 | * between the "page_mkclean()" and the "TestClearPageDirty()", | ||
889 | * we might have it mapped without the dirty bit set. | ||
890 | */ | ||
891 | if (page_mkclean(page)) | ||
892 | set_page_dirty(page); | ||
897 | if (TestClearPageDirty(page)) { | 893 | if (TestClearPageDirty(page)) { |
898 | if (mapping_cap_account_dirty(mapping)) { | 894 | dec_zone_page_state(page, NR_FILE_DIRTY); |
899 | page_mkclean(page); | ||
900 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
901 | } | ||
902 | return 1; | 895 | return 1; |
903 | } | 896 | } |
904 | return 0; | 897 | return 0; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cace22b3ac25..f12052dc23ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/sort.h> | 40 | #include <linux/sort.h> |
41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
43 | #include <linux/fault-inject.h> | ||
43 | 44 | ||
44 | #include <asm/tlbflush.h> | 45 | #include <asm/tlbflush.h> |
45 | #include <asm/div64.h> | 46 | #include <asm/div64.h> |
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu) | |||
710 | for_each_zone(zone) { | 711 | for_each_zone(zone) { |
711 | struct per_cpu_pageset *pset; | 712 | struct per_cpu_pageset *pset; |
712 | 713 | ||
714 | if (!populated_zone(zone)) | ||
715 | continue; | ||
716 | |||
713 | pset = zone_pcp(zone, cpu); | 717 | pset = zone_pcp(zone, cpu); |
714 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 718 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
715 | struct per_cpu_pages *pcp; | 719 | struct per_cpu_pages *pcp; |
@@ -892,6 +896,91 @@ failed: | |||
892 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 896 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
893 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 897 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
894 | 898 | ||
899 | #ifdef CONFIG_FAIL_PAGE_ALLOC | ||
900 | |||
901 | static struct fail_page_alloc_attr { | ||
902 | struct fault_attr attr; | ||
903 | |||
904 | u32 ignore_gfp_highmem; | ||
905 | u32 ignore_gfp_wait; | ||
906 | |||
907 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
908 | |||
909 | struct dentry *ignore_gfp_highmem_file; | ||
910 | struct dentry *ignore_gfp_wait_file; | ||
911 | |||
912 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
913 | |||
914 | } fail_page_alloc = { | ||
915 | .attr = FAULT_ATTR_INITIALIZER, | ||
916 | .ignore_gfp_wait = 1, | ||
917 | .ignore_gfp_highmem = 1, | ||
918 | }; | ||
919 | |||
920 | static int __init setup_fail_page_alloc(char *str) | ||
921 | { | ||
922 | return setup_fault_attr(&fail_page_alloc.attr, str); | ||
923 | } | ||
924 | __setup("fail_page_alloc=", setup_fail_page_alloc); | ||
925 | |||
926 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
927 | { | ||
928 | if (gfp_mask & __GFP_NOFAIL) | ||
929 | return 0; | ||
930 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | ||
931 | return 0; | ||
932 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | ||
933 | return 0; | ||
934 | |||
935 | return should_fail(&fail_page_alloc.attr, 1 << order); | ||
936 | } | ||
937 | |||
938 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
939 | |||
940 | static int __init fail_page_alloc_debugfs(void) | ||
941 | { | ||
942 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
943 | struct dentry *dir; | ||
944 | int err; | ||
945 | |||
946 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
947 | "fail_page_alloc"); | ||
948 | if (err) | ||
949 | return err; | ||
950 | dir = fail_page_alloc.attr.dentries.dir; | ||
951 | |||
952 | fail_page_alloc.ignore_gfp_wait_file = | ||
953 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
954 | &fail_page_alloc.ignore_gfp_wait); | ||
955 | |||
956 | fail_page_alloc.ignore_gfp_highmem_file = | ||
957 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
958 | &fail_page_alloc.ignore_gfp_highmem); | ||
959 | |||
960 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
961 | !fail_page_alloc.ignore_gfp_highmem_file) { | ||
962 | err = -ENOMEM; | ||
963 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
964 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
965 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
966 | } | ||
967 | |||
968 | return err; | ||
969 | } | ||
970 | |||
971 | late_initcall(fail_page_alloc_debugfs); | ||
972 | |||
973 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
974 | |||
975 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | ||
976 | |||
977 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | ||
978 | { | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | ||
983 | |||
895 | /* | 984 | /* |
896 | * Return 1 if free pages are above 'mark'. This takes into account the order | 985 | * Return 1 if free pages are above 'mark'. This takes into account the order |
897 | * of the allocation. | 986 | * of the allocation. |
@@ -900,8 +989,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
900 | int classzone_idx, int alloc_flags) | 989 | int classzone_idx, int alloc_flags) |
901 | { | 990 | { |
902 | /* free_pages my go negative - that's OK */ | 991 | /* free_pages my go negative - that's OK */ |
903 | unsigned long min = mark; | 992 | long min = mark, free_pages = z->free_pages - (1 << order) + 1; |
904 | long free_pages = z->free_pages - (1 << order) + 1; | ||
905 | int o; | 993 | int o; |
906 | 994 | ||
907 | if (alloc_flags & ALLOC_HIGH) | 995 | if (alloc_flags & ALLOC_HIGH) |
@@ -1076,7 +1164,7 @@ zonelist_scan: | |||
1076 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1164 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
1077 | break; | 1165 | break; |
1078 | if ((alloc_flags & ALLOC_CPUSET) && | 1166 | if ((alloc_flags & ALLOC_CPUSET) && |
1079 | !cpuset_zone_allowed(zone, gfp_mask)) | 1167 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1080 | goto try_next_zone; | 1168 | goto try_next_zone; |
1081 | 1169 | ||
1082 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1170 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
@@ -1136,6 +1224,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
1136 | 1224 | ||
1137 | might_sleep_if(wait); | 1225 | might_sleep_if(wait); |
1138 | 1226 | ||
1227 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1228 | return NULL; | ||
1229 | |||
1139 | restart: | 1230 | restart: |
1140 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1231 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
1141 | 1232 | ||
@@ -1488,8 +1579,8 @@ void show_free_areas(void) | |||
1488 | 1579 | ||
1489 | get_zone_counts(&active, &inactive, &free); | 1580 | get_zone_counts(&active, &inactive, &free); |
1490 | 1581 | ||
1491 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " | 1582 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
1492 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 1583 | " free:%u slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1493 | active, | 1584 | active, |
1494 | inactive, | 1585 | inactive, |
1495 | global_page_state(NR_FILE_DIRTY), | 1586 | global_page_state(NR_FILE_DIRTY), |
@@ -1499,7 +1590,8 @@ void show_free_areas(void) | |||
1499 | global_page_state(NR_SLAB_RECLAIMABLE) + | 1590 | global_page_state(NR_SLAB_RECLAIMABLE) + |
1500 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 1591 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
1501 | global_page_state(NR_FILE_MAPPED), | 1592 | global_page_state(NR_FILE_MAPPED), |
1502 | global_page_state(NR_PAGETABLE)); | 1593 | global_page_state(NR_PAGETABLE), |
1594 | global_page_state(NR_BOUNCE)); | ||
1503 | 1595 | ||
1504 | for_each_zone(zone) { | 1596 | for_each_zone(zone) { |
1505 | int i; | 1597 | int i; |
@@ -1864,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
1864 | * done. Non-atomic initialization, single-pass. | 1956 | * done. Non-atomic initialization, single-pass. |
1865 | */ | 1957 | */ |
1866 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1958 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
1867 | unsigned long start_pfn) | 1959 | unsigned long start_pfn, enum memmap_context context) |
1868 | { | 1960 | { |
1869 | struct page *page; | 1961 | struct page *page; |
1870 | unsigned long end_pfn = start_pfn + size; | 1962 | unsigned long end_pfn = start_pfn + size; |
1871 | unsigned long pfn; | 1963 | unsigned long pfn; |
1872 | 1964 | ||
1873 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1965 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1874 | if (!early_pfn_valid(pfn)) | 1966 | /* |
1875 | continue; | 1967 | * There can be holes in boot-time mem_map[]s |
1876 | if (!early_pfn_in_nid(pfn, nid)) | 1968 | * handed to this function. They do not |
1877 | continue; | 1969 | * exist on hotplugged memory. |
1970 | */ | ||
1971 | if (context == MEMMAP_EARLY) { | ||
1972 | if (!early_pfn_valid(pfn)) | ||
1973 | continue; | ||
1974 | if (!early_pfn_in_nid(pfn, nid)) | ||
1975 | continue; | ||
1976 | } | ||
1878 | page = pfn_to_page(pfn); | 1977 | page = pfn_to_page(pfn); |
1879 | set_page_links(page, zone, nid, pfn); | 1978 | set_page_links(page, zone, nid, pfn); |
1880 | init_page_count(page); | 1979 | init_page_count(page); |
@@ -1901,7 +2000,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1901 | 2000 | ||
1902 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 2001 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1903 | #define memmap_init(size, nid, zone, start_pfn) \ | 2002 | #define memmap_init(size, nid, zone, start_pfn) \ |
1904 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 2003 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
1905 | #endif | 2004 | #endif |
1906 | 2005 | ||
1907 | static int __cpuinit zone_batchsize(struct zone *zone) | 2006 | static int __cpuinit zone_batchsize(struct zone *zone) |
@@ -2147,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
2147 | 2246 | ||
2148 | __meminit int init_currently_empty_zone(struct zone *zone, | 2247 | __meminit int init_currently_empty_zone(struct zone *zone, |
2149 | unsigned long zone_start_pfn, | 2248 | unsigned long zone_start_pfn, |
2150 | unsigned long size) | 2249 | unsigned long size, |
2250 | enum memmap_context context) | ||
2151 | { | 2251 | { |
2152 | struct pglist_data *pgdat = zone->zone_pgdat; | 2252 | struct pglist_data *pgdat = zone->zone_pgdat; |
2153 | int ret; | 2253 | int ret; |
@@ -2591,7 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2591 | if (!size) | 2691 | if (!size) |
2592 | continue; | 2692 | continue; |
2593 | 2693 | ||
2594 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 2694 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
2695 | size, MEMMAP_EARLY); | ||
2595 | BUG_ON(ret); | 2696 | BUG_ON(ret); |
2596 | zone_start_pfn += size; | 2697 | zone_start_pfn += size; |
2597 | } | 2698 | } |
@@ -3232,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3232 | numentries >>= (scale - PAGE_SHIFT); | 3333 | numentries >>= (scale - PAGE_SHIFT); |
3233 | else | 3334 | else |
3234 | numentries <<= (PAGE_SHIFT - scale); | 3335 | numentries <<= (PAGE_SHIFT - scale); |
3336 | |||
3337 | /* Make sure we've got at least a 0-order allocation.. */ | ||
3338 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | ||
3339 | numentries = PAGE_SIZE / bucketsize; | ||
3235 | } | 3340 | } |
3236 | numentries = roundup_pow_of_two(numentries); | 3341 | numentries = roundup_pow_of_two(numentries); |
3237 | 3342 | ||
@@ -3244,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3244 | if (numentries > max) | 3349 | if (numentries > max) |
3245 | numentries = max; | 3350 | numentries = max; |
3246 | 3351 | ||
3247 | log2qty = long_log2(numentries); | 3352 | log2qty = ilog2(numentries); |
3248 | 3353 | ||
3249 | do { | 3354 | do { |
3250 | size = bucketsize << log2qty; | 3355 | size = bucketsize << log2qty; |
@@ -3266,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3266 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3371 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
3267 | tablename, | 3372 | tablename, |
3268 | (1U << log2qty), | 3373 | (1U << log2qty), |
3269 | long_log2(size) - PAGE_SHIFT, | 3374 | ilog2(size) - PAGE_SHIFT, |
3270 | size); | 3375 | size); |
3271 | 3376 | ||
3272 | if (_hash_shift) | 3377 | if (_hash_shift) |
diff --git a/mm/readahead.c b/mm/readahead.c index a386f2b6b335..0f539e8e827a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/blkdev.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/task_io_accounting_ops.h> | ||
16 | #include <linux/pagevec.h> | 17 | #include <linux/pagevec.h> |
17 | 18 | ||
18 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 19 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
@@ -151,6 +152,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
151 | put_pages_list(pages); | 152 | put_pages_list(pages); |
152 | break; | 153 | break; |
153 | } | 154 | } |
155 | task_io_account_read(PAGE_CACHE_SIZE); | ||
154 | } | 156 | } |
155 | pagevec_lru_add(&lru_pvec); | 157 | pagevec_lru_add(&lru_pvec); |
156 | return ret; | 158 | return ret; |
@@ -450,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, | |||
450 | * | 452 | * |
451 | * Note that @filp is purely used for passing on to the ->readpage[s]() | 453 | * Note that @filp is purely used for passing on to the ->readpage[s]() |
452 | * handler: it may refer to a different file from @mapping (so we may not use | 454 | * handler: it may refer to a different file from @mapping (so we may not use |
453 | * @filp->f_mapping or @filp->f_dentry->d_inode here). | 455 | * @filp->f_mapping or @filp->f_path.dentry->d_inode here). |
454 | * Also, @ra may not be equal to &@filp->f_ra. | 456 | * Also, @ra may not be equal to &@filp->f_ra. |
455 | * | 457 | * |
456 | */ | 458 | */ |
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | ||
50 | 51 | ||
51 | #include <asm/tlbflush.h> | 52 | #include <asm/tlbflush.h> |
52 | 53 | ||
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
432 | { | 433 | { |
433 | struct mm_struct *mm = vma->vm_mm; | 434 | struct mm_struct *mm = vma->vm_mm; |
434 | unsigned long address; | 435 | unsigned long address; |
435 | pte_t *pte, entry; | 436 | pte_t *pte; |
436 | spinlock_t *ptl; | 437 | spinlock_t *ptl; |
437 | int ret = 0; | 438 | int ret = 0; |
438 | 439 | ||
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
444 | if (!pte) | 445 | if (!pte) |
445 | goto out; | 446 | goto out; |
446 | 447 | ||
447 | if (!pte_dirty(*pte) && !pte_write(*pte)) | 448 | if (pte_dirty(*pte) || pte_write(*pte)) { |
448 | goto unlock; | 449 | pte_t entry; |
449 | 450 | ||
450 | entry = ptep_get_and_clear(mm, address, pte); | 451 | flush_cache_page(vma, address, pte_pfn(*pte)); |
451 | entry = pte_mkclean(entry); | 452 | entry = ptep_clear_flush(vma, address, pte); |
452 | entry = pte_wrprotect(entry); | 453 | entry = pte_wrprotect(entry); |
453 | ptep_establish(vma, address, pte, entry); | 454 | entry = pte_mkclean(entry); |
454 | lazy_mmu_prot_update(entry); | 455 | set_pte_at(mm, address, pte, entry); |
455 | ret = 1; | 456 | lazy_mmu_prot_update(entry); |
457 | ret = 1; | ||
458 | } | ||
456 | 459 | ||
457 | unlock: | ||
458 | pte_unmap_unlock(pte, ptl); | 460 | pte_unmap_unlock(pte, ptl); |
459 | out: | 461 | out: |
460 | return ret; | 462 | return ret; |
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page) | |||
489 | if (mapping) | 491 | if (mapping) |
490 | ret = page_mkclean_file(mapping, page); | 492 | ret = page_mkclean_file(mapping, page); |
491 | } | 493 | } |
494 | if (page_test_and_clear_dirty(page)) | ||
495 | ret = 1; | ||
492 | 496 | ||
493 | return ret; | 497 | return ret; |
494 | } | 498 | } |
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page) | |||
567 | * | 571 | * |
568 | * The caller needs to hold the pte lock. | 572 | * The caller needs to hold the pte lock. |
569 | */ | 573 | */ |
570 | void page_remove_rmap(struct page *page) | 574 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) |
571 | { | 575 | { |
572 | if (atomic_add_negative(-1, &page->_mapcount)) { | 576 | if (atomic_add_negative(-1, &page->_mapcount)) { |
573 | if (unlikely(page_mapcount(page) < 0)) { | 577 | if (unlikely(page_mapcount(page) < 0)) { |
574 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 578 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
579 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); | ||
575 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 580 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
576 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 581 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
577 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 582 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
583 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | ||
584 | if (vma->vm_ops) | ||
585 | print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage); | ||
586 | if (vma->vm_file && vma->vm_file->f_op) | ||
587 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); | ||
578 | BUG(); | 588 | BUG(); |
579 | } | 589 | } |
580 | 590 | ||
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
679 | dec_mm_counter(mm, file_rss); | 689 | dec_mm_counter(mm, file_rss); |
680 | 690 | ||
681 | 691 | ||
682 | page_remove_rmap(page); | 692 | page_remove_rmap(page, vma); |
683 | page_cache_release(page); | 693 | page_cache_release(page); |
684 | 694 | ||
685 | out_unmap: | 695 | out_unmap: |
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
769 | if (pte_dirty(pteval)) | 779 | if (pte_dirty(pteval)) |
770 | set_page_dirty(page); | 780 | set_page_dirty(page); |
771 | 781 | ||
772 | page_remove_rmap(page); | 782 | page_remove_rmap(page, vma); |
773 | page_cache_release(page); | 783 | page_cache_release(page); |
774 | dec_mm_counter(mm, file_rss); | 784 | dec_mm_counter(mm, file_rss); |
775 | (*mapcount)--; | 785 | (*mapcount)--; |
diff --git a/mm/shmem.c b/mm/shmem.c index c820b4f77b8d..70da7a0981bf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | |||
515 | size = SHMEM_NR_DIRECT; | 515 | size = SHMEM_NR_DIRECT; |
516 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); | 516 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); |
517 | } | 517 | } |
518 | if (!topdir) | 518 | |
519 | /* | ||
520 | * If there are no indirect blocks or we are punching a hole | ||
521 | * below indirect blocks, nothing to be done. | ||
522 | */ | ||
523 | if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT))) | ||
519 | goto done2; | 524 | goto done2; |
520 | 525 | ||
521 | BUG_ON(limit <= SHMEM_NR_DIRECT); | 526 | BUG_ON(limit <= SHMEM_NR_DIRECT); |
@@ -1225,7 +1230,7 @@ failed: | |||
1225 | 1230 | ||
1226 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) | 1231 | struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) |
1227 | { | 1232 | { |
1228 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1233 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1229 | struct page *page = NULL; | 1234 | struct page *page = NULL; |
1230 | unsigned long idx; | 1235 | unsigned long idx; |
1231 | int error; | 1236 | int error; |
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1248 | unsigned long addr, unsigned long len, | 1253 | unsigned long addr, unsigned long len, |
1249 | pgprot_t prot, unsigned long pgoff, int nonblock) | 1254 | pgprot_t prot, unsigned long pgoff, int nonblock) |
1250 | { | 1255 | { |
1251 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1256 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1252 | struct mm_struct *mm = vma->vm_mm; | 1257 | struct mm_struct *mm = vma->vm_mm; |
1253 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; | 1258 | enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; |
1254 | unsigned long size; | 1259 | unsigned long size; |
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma, | |||
1293 | #ifdef CONFIG_NUMA | 1298 | #ifdef CONFIG_NUMA |
1294 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1299 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
1295 | { | 1300 | { |
1296 | struct inode *i = vma->vm_file->f_dentry->d_inode; | 1301 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1297 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1302 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
1298 | } | 1303 | } |
1299 | 1304 | ||
1300 | struct mempolicy * | 1305 | struct mempolicy * |
1301 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1306 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) |
1302 | { | 1307 | { |
1303 | struct inode *i = vma->vm_file->f_dentry->d_inode; | 1308 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
1304 | unsigned long idx; | 1309 | unsigned long idx; |
1305 | 1310 | ||
1306 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1311 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | |||
1310 | 1315 | ||
1311 | int shmem_lock(struct file *file, int lock, struct user_struct *user) | 1316 | int shmem_lock(struct file *file, int lock, struct user_struct *user) |
1312 | { | 1317 | { |
1313 | struct inode *inode = file->f_dentry->d_inode; | 1318 | struct inode *inode = file->f_path.dentry->d_inode; |
1314 | struct shmem_inode_info *info = SHMEM_I(inode); | 1319 | struct shmem_inode_info *info = SHMEM_I(inode); |
1315 | int retval = -ENOMEM; | 1320 | int retval = -ENOMEM; |
1316 | 1321 | ||
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig | |||
1422 | static ssize_t | 1427 | static ssize_t |
1423 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) | 1428 | shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) |
1424 | { | 1429 | { |
1425 | struct inode *inode = file->f_dentry->d_inode; | 1430 | struct inode *inode = file->f_path.dentry->d_inode; |
1426 | loff_t pos; | 1431 | loff_t pos; |
1427 | unsigned long written; | 1432 | unsigned long written; |
1428 | ssize_t err; | 1433 | ssize_t err; |
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t | |||
1442 | if (err || !count) | 1447 | if (err || !count) |
1443 | goto out; | 1448 | goto out; |
1444 | 1449 | ||
1445 | err = remove_suid(file->f_dentry); | 1450 | err = remove_suid(file->f_path.dentry); |
1446 | if (err) | 1451 | if (err) |
1447 | goto out; | 1452 | goto out; |
1448 | 1453 | ||
@@ -1524,7 +1529,7 @@ out: | |||
1524 | 1529 | ||
1525 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) | 1530 | static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) |
1526 | { | 1531 | { |
1527 | struct inode *inode = filp->f_dentry->d_inode; | 1532 | struct inode *inode = filp->f_path.dentry->d_inode; |
1528 | struct address_space *mapping = inode->i_mapping; | 1533 | struct address_space *mapping = inode->i_mapping; |
1529 | unsigned long index, offset; | 1534 | unsigned long index, offset; |
1530 | 1535 | ||
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2493 | d_instantiate(dentry, inode); | 2498 | d_instantiate(dentry, inode); |
2494 | inode->i_size = size; | 2499 | inode->i_size = size; |
2495 | inode->i_nlink = 0; /* It is unlinked */ | 2500 | inode->i_nlink = 0; /* It is unlinked */ |
2496 | file->f_vfsmnt = mntget(shm_mnt); | 2501 | file->f_path.mnt = mntget(shm_mnt); |
2497 | file->f_dentry = dentry; | 2502 | file->f_path.dentry = dentry; |
2498 | file->f_mapping = inode->i_mapping; | 2503 | file->f_mapping = inode->i_mapping; |
2499 | file->f_op = &shmem_file_operations; | 2504 | file->f_op = &shmem_file_operations; |
2500 | file->f_mode = FMODE_WRITE | FMODE_READ; | 2505 | file->f_mode = FMODE_WRITE | FMODE_READ; |
@@ -107,7 +107,9 @@ | |||
107 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> |
108 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> |
109 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> |
110 | #include <linux/fault-inject.h> | ||
110 | #include <linux/rtmutex.h> | 111 | #include <linux/rtmutex.h> |
112 | #include <linux/reciprocal_div.h> | ||
111 | 113 | ||
112 | #include <asm/cacheflush.h> | 114 | #include <asm/cacheflush.h> |
113 | #include <asm/tlbflush.h> | 115 | #include <asm/tlbflush.h> |
@@ -385,6 +387,7 @@ struct kmem_cache { | |||
385 | unsigned int shared; | 387 | unsigned int shared; |
386 | 388 | ||
387 | unsigned int buffer_size; | 389 | unsigned int buffer_size; |
390 | u32 reciprocal_buffer_size; | ||
388 | /* 3) touched by every alloc & free from the backend */ | 391 | /* 3) touched by every alloc & free from the backend */ |
389 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 392 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
390 | 393 | ||
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, | |||
626 | return slab->s_mem + cache->buffer_size * idx; | 629 | return slab->s_mem + cache->buffer_size * idx; |
627 | } | 630 | } |
628 | 631 | ||
629 | static inline unsigned int obj_to_index(struct kmem_cache *cache, | 632 | /* |
630 | struct slab *slab, void *obj) | 633 | * We want to avoid an expensive divide : (offset / cache->buffer_size) |
634 | * Using the fact that buffer_size is a constant for a particular cache, | ||
635 | * we can replace (offset / cache->buffer_size) by | ||
636 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | ||
637 | */ | ||
638 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | ||
639 | const struct slab *slab, void *obj) | ||
631 | { | 640 | { |
632 | return (unsigned)(obj - slab->s_mem) / cache->buffer_size; | 641 | u32 offset = (obj - slab->s_mem); |
642 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | ||
633 | } | 643 | } |
634 | 644 | ||
635 | /* | 645 | /* |
@@ -945,7 +955,8 @@ static void __devinit start_cpu_timer(int cpu) | |||
945 | if (keventd_up() && reap_work->work.func == NULL) { | 955 | if (keventd_up() && reap_work->work.func == NULL) { |
946 | init_reap_node(cpu); | 956 | init_reap_node(cpu); |
947 | INIT_DELAYED_WORK(reap_work, cache_reap); | 957 | INIT_DELAYED_WORK(reap_work, cache_reap); |
948 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | 958 | schedule_delayed_work_on(cpu, reap_work, |
959 | __round_jiffies_relative(HZ, cpu)); | ||
949 | } | 960 | } |
950 | } | 961 | } |
951 | 962 | ||
@@ -1425,6 +1436,8 @@ void __init kmem_cache_init(void) | |||
1425 | 1436 | ||
1426 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, | 1437 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, |
1427 | cache_line_size()); | 1438 | cache_line_size()); |
1439 | cache_cache.reciprocal_buffer_size = | ||
1440 | reciprocal_value(cache_cache.buffer_size); | ||
1428 | 1441 | ||
1429 | for (order = 0; order < MAX_ORDER; order++) { | 1442 | for (order = 0; order < MAX_ORDER; order++) { |
1430 | cache_estimate(order, cache_cache.buffer_size, | 1443 | cache_estimate(order, cache_cache.buffer_size, |
@@ -2311,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2311 | if (flags & SLAB_CACHE_DMA) | 2324 | if (flags & SLAB_CACHE_DMA) |
2312 | cachep->gfpflags |= GFP_DMA; | 2325 | cachep->gfpflags |= GFP_DMA; |
2313 | cachep->buffer_size = size; | 2326 | cachep->buffer_size = size; |
2327 | cachep->reciprocal_buffer_size = reciprocal_value(size); | ||
2314 | 2328 | ||
2315 | if (flags & CFLGS_OFF_SLAB) { | 2329 | if (flags & CFLGS_OFF_SLAB) { |
2316 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); | 2330 | cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); |
@@ -3088,12 +3102,89 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3088 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) | 3102 | #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) |
3089 | #endif | 3103 | #endif |
3090 | 3104 | ||
3105 | #ifdef CONFIG_FAILSLAB | ||
3106 | |||
3107 | static struct failslab_attr { | ||
3108 | |||
3109 | struct fault_attr attr; | ||
3110 | |||
3111 | u32 ignore_gfp_wait; | ||
3112 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
3113 | struct dentry *ignore_gfp_wait_file; | ||
3114 | #endif | ||
3115 | |||
3116 | } failslab = { | ||
3117 | .attr = FAULT_ATTR_INITIALIZER, | ||
3118 | .ignore_gfp_wait = 1, | ||
3119 | }; | ||
3120 | |||
3121 | static int __init setup_failslab(char *str) | ||
3122 | { | ||
3123 | return setup_fault_attr(&failslab.attr, str); | ||
3124 | } | ||
3125 | __setup("failslab=", setup_failslab); | ||
3126 | |||
3127 | static int should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
3128 | { | ||
3129 | if (cachep == &cache_cache) | ||
3130 | return 0; | ||
3131 | if (flags & __GFP_NOFAIL) | ||
3132 | return 0; | ||
3133 | if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) | ||
3134 | return 0; | ||
3135 | |||
3136 | return should_fail(&failslab.attr, obj_size(cachep)); | ||
3137 | } | ||
3138 | |||
3139 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
3140 | |||
3141 | static int __init failslab_debugfs(void) | ||
3142 | { | ||
3143 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
3144 | struct dentry *dir; | ||
3145 | int err; | ||
3146 | |||
3147 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | ||
3148 | if (err) | ||
3149 | return err; | ||
3150 | dir = failslab.attr.dentries.dir; | ||
3151 | |||
3152 | failslab.ignore_gfp_wait_file = | ||
3153 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
3154 | &failslab.ignore_gfp_wait); | ||
3155 | |||
3156 | if (!failslab.ignore_gfp_wait_file) { | ||
3157 | err = -ENOMEM; | ||
3158 | debugfs_remove(failslab.ignore_gfp_wait_file); | ||
3159 | cleanup_fault_attr_dentries(&failslab.attr); | ||
3160 | } | ||
3161 | |||
3162 | return err; | ||
3163 | } | ||
3164 | |||
3165 | late_initcall(failslab_debugfs); | ||
3166 | |||
3167 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
3168 | |||
3169 | #else /* CONFIG_FAILSLAB */ | ||
3170 | |||
3171 | static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) | ||
3172 | { | ||
3173 | return 0; | ||
3174 | } | ||
3175 | |||
3176 | #endif /* CONFIG_FAILSLAB */ | ||
3177 | |||
3091 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3178 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3092 | { | 3179 | { |
3093 | void *objp; | 3180 | void *objp; |
3094 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3095 | 3182 | ||
3096 | check_irq_off(); | 3183 | check_irq_off(); |
3184 | |||
3185 | if (should_failslab(cachep, flags)) | ||
3186 | return NULL; | ||
3187 | |||
3097 | ac = cpu_cache_get(cachep); | 3188 | ac = cpu_cache_get(cachep); |
3098 | if (likely(ac->avail)) { | 3189 | if (likely(ac->avail)) { |
3099 | STATS_INC_ALLOCHIT(cachep); | 3190 | STATS_INC_ALLOCHIT(cachep); |
@@ -3173,6 +3264,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3173 | struct zone **z; | 3264 | struct zone **z; |
3174 | void *obj = NULL; | 3265 | void *obj = NULL; |
3175 | int nid; | 3266 | int nid; |
3267 | gfp_t local_flags = (flags & GFP_LEVEL_MASK); | ||
3176 | 3268 | ||
3177 | retry: | 3269 | retry: |
3178 | /* | 3270 | /* |
@@ -3182,21 +3274,26 @@ retry: | |||
3182 | for (z = zonelist->zones; *z && !obj; z++) { | 3274 | for (z = zonelist->zones; *z && !obj; z++) { |
3183 | nid = zone_to_nid(*z); | 3275 | nid = zone_to_nid(*z); |
3184 | 3276 | ||
3185 | if (cpuset_zone_allowed(*z, flags) && | 3277 | if (cpuset_zone_allowed_hardwall(*z, flags) && |
3186 | cache->nodelists[nid] && | 3278 | cache->nodelists[nid] && |
3187 | cache->nodelists[nid]->free_objects) | 3279 | cache->nodelists[nid]->free_objects) |
3188 | obj = ____cache_alloc_node(cache, | 3280 | obj = ____cache_alloc_node(cache, |
3189 | flags | GFP_THISNODE, nid); | 3281 | flags | GFP_THISNODE, nid); |
3190 | } | 3282 | } |
3191 | 3283 | ||
3192 | if (!obj) { | 3284 | if (!obj && !(flags & __GFP_NO_GROW)) { |
3193 | /* | 3285 | /* |
3194 | * This allocation will be performed within the constraints | 3286 | * This allocation will be performed within the constraints |
3195 | * of the current cpuset / memory policy requirements. | 3287 | * of the current cpuset / memory policy requirements. |
3196 | * We may trigger various forms of reclaim on the allowed | 3288 | * We may trigger various forms of reclaim on the allowed |
3197 | * set and go into memory reserves if necessary. | 3289 | * set and go into memory reserves if necessary. |
3198 | */ | 3290 | */ |
3291 | if (local_flags & __GFP_WAIT) | ||
3292 | local_irq_enable(); | ||
3293 | kmem_flagcheck(cache, flags); | ||
3199 | obj = kmem_getpages(cache, flags, -1); | 3294 | obj = kmem_getpages(cache, flags, -1); |
3295 | if (local_flags & __GFP_WAIT) | ||
3296 | local_irq_disable(); | ||
3200 | if (obj) { | 3297 | if (obj) { |
3201 | /* | 3298 | /* |
3202 | * Insert into the appropriate per node queues | 3299 | * Insert into the appropriate per node queues |
@@ -3213,7 +3310,7 @@ retry: | |||
3213 | */ | 3310 | */ |
3214 | goto retry; | 3311 | goto retry; |
3215 | } else { | 3312 | } else { |
3216 | kmem_freepages(cache, obj); | 3313 | /* cache_grow already freed obj */ |
3217 | obj = NULL; | 3314 | obj = NULL; |
3218 | } | 3315 | } |
3219 | } | 3316 | } |
@@ -3456,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc); | |||
3456 | * | 3553 | * |
3457 | * Currently only used for dentry validation. | 3554 | * Currently only used for dentry validation. |
3458 | */ | 3555 | */ |
3459 | int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) | 3556 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) |
3460 | { | 3557 | { |
3461 | unsigned long addr = (unsigned long)ptr; | 3558 | unsigned long addr = (unsigned long)ptr; |
3462 | unsigned long min_addr = PAGE_OFFSET; | 3559 | unsigned long min_addr = PAGE_OFFSET; |
@@ -3490,6 +3587,7 @@ out: | |||
3490 | * @cachep: The cache to allocate from. | 3587 | * @cachep: The cache to allocate from. |
3491 | * @flags: See kmalloc(). | 3588 | * @flags: See kmalloc(). |
3492 | * @nodeid: node number of the target node. | 3589 | * @nodeid: node number of the target node. |
3590 | * @caller: return address of caller, used for debug information | ||
3493 | * | 3591 | * |
3494 | * Identical to kmem_cache_alloc but it will allocate memory on the given | 3592 | * Identical to kmem_cache_alloc but it will allocate memory on the given |
3495 | * node, which can improve the performance for cpu bound structures. | 3593 | * node, which can improve the performance for cpu bound structures. |
@@ -3928,7 +4026,7 @@ static void cache_reap(struct work_struct *unused) | |||
3928 | if (!mutex_trylock(&cache_chain_mutex)) { | 4026 | if (!mutex_trylock(&cache_chain_mutex)) { |
3929 | /* Give up. Setup the next iteration. */ | 4027 | /* Give up. Setup the next iteration. */ |
3930 | schedule_delayed_work(&__get_cpu_var(reap_work), | 4028 | schedule_delayed_work(&__get_cpu_var(reap_work), |
3931 | REAPTIMEOUT_CPUC); | 4029 | round_jiffies_relative(REAPTIMEOUT_CPUC)); |
3932 | return; | 4030 | return; |
3933 | } | 4031 | } |
3934 | 4032 | ||
@@ -3974,7 +4072,8 @@ next: | |||
3974 | next_reap_node(); | 4072 | next_reap_node(); |
3975 | refresh_cpu_vm_stats(smp_processor_id()); | 4073 | refresh_cpu_vm_stats(smp_processor_id()); |
3976 | /* Set up the next iteration */ | 4074 | /* Set up the next iteration */ |
3977 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 4075 | schedule_delayed_work(&__get_cpu_var(reap_work), |
4076 | round_jiffies_relative(REAPTIMEOUT_CPUC)); | ||
3978 | } | 4077 | } |
3979 | 4078 | ||
3980 | #ifdef CONFIG_PROC_FS | 4079 | #ifdef CONFIG_PROC_FS |
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock); | |||
60 | static DEFINE_SPINLOCK(block_lock); | 60 | static DEFINE_SPINLOCK(block_lock); |
61 | 61 | ||
62 | static void slob_free(void *b, int size); | 62 | static void slob_free(void *b, int size); |
63 | static void slob_timer_cbk(void); | ||
64 | |||
63 | 65 | ||
64 | static void *slob_alloc(size_t size, gfp_t gfp, int align) | 66 | static void *slob_alloc(size_t size, gfp_t gfp, int align) |
65 | { | 67 | { |
@@ -157,7 +159,7 @@ static int fastcall find_order(int size) | |||
157 | return order; | 159 | return order; |
158 | } | 160 | } |
159 | 161 | ||
160 | void *kmalloc(size_t size, gfp_t gfp) | 162 | void *__kmalloc(size_t size, gfp_t gfp) |
161 | { | 163 | { |
162 | slob_t *m; | 164 | slob_t *m; |
163 | bigblock_t *bb; | 165 | bigblock_t *bb; |
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp) | |||
186 | slob_free(bb, sizeof(bigblock_t)); | 188 | slob_free(bb, sizeof(bigblock_t)); |
187 | return 0; | 189 | return 0; |
188 | } | 190 | } |
189 | 191 | EXPORT_SYMBOL(__kmalloc); | |
190 | EXPORT_SYMBOL(kmalloc); | ||
191 | 192 | ||
192 | void kfree(const void *block) | 193 | void kfree(const void *block) |
193 | { | 194 | { |
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c) | |||
327 | EXPORT_SYMBOL(kmem_cache_name); | 328 | EXPORT_SYMBOL(kmem_cache_name); |
328 | 329 | ||
329 | static struct timer_list slob_timer = TIMER_INITIALIZER( | 330 | static struct timer_list slob_timer = TIMER_INITIALIZER( |
330 | (void (*)(unsigned long))kmem_cache_init, 0, 0); | 331 | (void (*)(unsigned long))slob_timer_cbk, 0, 0); |
332 | |||
333 | int kmem_cache_shrink(struct kmem_cache *d) | ||
334 | { | ||
335 | return 0; | ||
336 | } | ||
337 | EXPORT_SYMBOL(kmem_cache_shrink); | ||
338 | |||
339 | int kmem_ptr_validate(struct kmem_cache *a, const void *b) | ||
340 | { | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | void __init kmem_cache_init(void) | ||
345 | { | ||
346 | slob_timer_cbk(); | ||
347 | } | ||
331 | 348 | ||
332 | void kmem_cache_init(void) | 349 | static void slob_timer_cbk(void) |
333 | { | 350 | { |
334 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); | 351 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); |
335 | 352 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index c5431072f422..a2d9bb4e80df 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -434,7 +434,7 @@ void free_swap_and_cache(swp_entry_t entry) | |||
434 | * | 434 | * |
435 | * This is needed for the suspend to disk (aka swsusp). | 435 | * This is needed for the suspend to disk (aka swsusp). |
436 | */ | 436 | */ |
437 | int swap_type_of(dev_t device, sector_t offset) | 437 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
438 | { | 438 | { |
439 | struct block_device *bdev = NULL; | 439 | struct block_device *bdev = NULL; |
440 | int i; | 440 | int i; |
@@ -450,6 +450,9 @@ int swap_type_of(dev_t device, sector_t offset) | |||
450 | continue; | 450 | continue; |
451 | 451 | ||
452 | if (!bdev) { | 452 | if (!bdev) { |
453 | if (bdev_p) | ||
454 | *bdev_p = sis->bdev; | ||
455 | |||
453 | spin_unlock(&swap_lock); | 456 | spin_unlock(&swap_lock); |
454 | return i; | 457 | return i; |
455 | } | 458 | } |
@@ -459,6 +462,9 @@ int swap_type_of(dev_t device, sector_t offset) | |||
459 | se = list_entry(sis->extent_list.next, | 462 | se = list_entry(sis->extent_list.next, |
460 | struct swap_extent, list); | 463 | struct swap_extent, list); |
461 | if (se->start_block == offset) { | 464 | if (se->start_block == offset) { |
465 | if (bdev_p) | ||
466 | *bdev_p = sis->bdev; | ||
467 | |||
462 | spin_unlock(&swap_lock); | 468 | spin_unlock(&swap_lock); |
463 | bdput(bdev); | 469 | bdput(bdev); |
464 | return i; | 470 | return i; |
@@ -1357,10 +1363,10 @@ static int swap_show(struct seq_file *swap, void *v) | |||
1357 | } | 1363 | } |
1358 | 1364 | ||
1359 | file = ptr->swap_file; | 1365 | file = ptr->swap_file; |
1360 | len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); | 1366 | len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\"); |
1361 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1367 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
1362 | len < 40 ? 40 - len : 1, " ", | 1368 | len < 40 ? 40 - len : 1, " ", |
1363 | S_ISBLK(file->f_dentry->d_inode->i_mode) ? | 1369 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
1364 | "partition" : "file\t", | 1370 | "partition" : "file\t", |
1365 | ptr->pages << (PAGE_SHIFT - 10), | 1371 | ptr->pages << (PAGE_SHIFT - 10), |
1366 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1372 | ptr->inuse_pages << (PAGE_SHIFT - 10), |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 5f2cbf0f153c..c7f6e1914bc4 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
79 | d_instantiate(dentry, inode); | 79 | d_instantiate(dentry, inode); |
80 | inode->i_nlink = 0; /* It is unlinked */ | 80 | inode->i_nlink = 0; /* It is unlinked */ |
81 | 81 | ||
82 | file->f_vfsmnt = mntget(shm_mnt); | 82 | file->f_path.mnt = mntget(shm_mnt); |
83 | file->f_dentry = dentry; | 83 | file->f_path.dentry = dentry; |
84 | file->f_mapping = inode->i_mapping; | 84 | file->f_mapping = inode->i_mapping; |
85 | file->f_op = &ramfs_file_operations; | 85 | file->f_op = &ramfs_file_operations; |
86 | file->f_mode = FMODE_WRITE | FMODE_READ; | 86 | file->f_mode = FMODE_WRITE | FMODE_READ; |
diff --git a/mm/truncate.c b/mm/truncate.c index e07b1e682c38..5df947de7654 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/pagevec.h> | 15 | #include <linux/pagevec.h> |
16 | #include <linux/task_io_accounting_ops.h> | ||
16 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 17 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
17 | do_invalidatepage */ | 18 | do_invalidatepage */ |
18 | 19 | ||
@@ -51,6 +52,33 @@ static inline void truncate_partial_page(struct page *page, unsigned partial) | |||
51 | } | 52 | } |
52 | 53 | ||
53 | /* | 54 | /* |
55 | * This cancels just the dirty bit on the kernel page itself, it | ||
56 | * does NOT actually remove dirty bits on any mmap's that may be | ||
57 | * around. It also leaves the page tagged dirty, so any sync | ||
58 | * activity will still find it on the dirty lists, and in particular, | ||
59 | * clear_page_dirty_for_io() will still look at the dirty bits in | ||
60 | * the VM. | ||
61 | * | ||
62 | * Doing this should *normally* only ever be done when a page | ||
63 | * is truncated, and is not actually mapped anywhere at all. However, | ||
64 | * fs/buffer.c does this when it notices that somebody has cleaned | ||
65 | * out all the buffers on a page without actually doing it through | ||
66 | * the VM. Can you say "ext3 is horribly ugly"? Thought you could. | ||
67 | */ | ||
68 | void cancel_dirty_page(struct page *page, unsigned int account_size) | ||
69 | { | ||
70 | if (TestClearPageDirty(page)) { | ||
71 | struct address_space *mapping = page->mapping; | ||
72 | if (mapping && mapping_cap_account_dirty(mapping)) { | ||
73 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
74 | if (account_size) | ||
75 | task_io_account_cancelled_write(account_size); | ||
76 | } | ||
77 | } | ||
78 | } | ||
79 | EXPORT_SYMBOL(cancel_dirty_page); | ||
80 | |||
81 | /* | ||
54 | * If truncate cannot remove the fs-private metadata from the page, the page | 82 | * If truncate cannot remove the fs-private metadata from the page, the page |
55 | * becomes anonymous. It will be left on the LRU and may even be mapped into | 83 | * becomes anonymous. It will be left on the LRU and may even be mapped into |
56 | * user pagetables if we're racing with filemap_nopage(). | 84 | * user pagetables if we're racing with filemap_nopage(). |
@@ -66,10 +94,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
66 | if (page->mapping != mapping) | 94 | if (page->mapping != mapping) |
67 | return; | 95 | return; |
68 | 96 | ||
97 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | ||
98 | |||
69 | if (PagePrivate(page)) | 99 | if (PagePrivate(page)) |
70 | do_invalidatepage(page, 0); | 100 | do_invalidatepage(page, 0); |
71 | 101 | ||
72 | clear_page_dirty(page); | ||
73 | ClearPageUptodate(page); | 102 | ClearPageUptodate(page); |
74 | ClearPageMappedToDisk(page); | 103 | ClearPageMappedToDisk(page); |
75 | remove_from_page_cache(page); | 104 | remove_from_page_cache(page); |
@@ -319,6 +348,15 @@ failed: | |||
319 | return 0; | 348 | return 0; |
320 | } | 349 | } |
321 | 350 | ||
351 | static int do_launder_page(struct address_space *mapping, struct page *page) | ||
352 | { | ||
353 | if (!PageDirty(page)) | ||
354 | return 0; | ||
355 | if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) | ||
356 | return 0; | ||
357 | return mapping->a_ops->launder_page(page); | ||
358 | } | ||
359 | |||
322 | /** | 360 | /** |
323 | * invalidate_inode_pages2_range - remove range of pages from an address_space | 361 | * invalidate_inode_pages2_range - remove range of pages from an address_space |
324 | * @mapping: the address_space | 362 | * @mapping: the address_space |
@@ -348,7 +386,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
348 | for (i = 0; !ret && i < pagevec_count(&pvec); i++) { | 386 | for (i = 0; !ret && i < pagevec_count(&pvec); i++) { |
349 | struct page *page = pvec.pages[i]; | 387 | struct page *page = pvec.pages[i]; |
350 | pgoff_t page_index; | 388 | pgoff_t page_index; |
351 | int was_dirty; | ||
352 | 389 | ||
353 | lock_page(page); | 390 | lock_page(page); |
354 | if (page->mapping != mapping) { | 391 | if (page->mapping != mapping) { |
@@ -384,18 +421,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
384 | PAGE_CACHE_SIZE, 0); | 421 | PAGE_CACHE_SIZE, 0); |
385 | } | 422 | } |
386 | } | 423 | } |
387 | was_dirty = test_clear_page_dirty(page); | 424 | ret = do_launder_page(mapping, page); |
388 | if (!invalidate_complete_page2(mapping, page)) { | 425 | if (ret == 0 && !invalidate_complete_page2(mapping, page)) |
389 | if (was_dirty) | ||
390 | set_page_dirty(page); | ||
391 | ret = -EIO; | 426 | ret = -EIO; |
392 | } | ||
393 | unlock_page(page); | 427 | unlock_page(page); |
394 | } | 428 | } |
395 | pagevec_release(&pvec); | 429 | pagevec_release(&pvec); |
396 | cond_resched(); | 430 | cond_resched(); |
397 | } | 431 | } |
398 | WARN_ON_ONCE(ret); | ||
399 | return ret; | 432 | return ret; |
400 | } | 433 | } |
401 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | 434 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 093f5fe6dd77..7430df68cb64 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -692,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
692 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 692 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
693 | } else | 693 | } else |
694 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 694 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
695 | __count_vm_events(PGACTIVATE, nr_freed); | 695 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
696 | 696 | ||
697 | if (nr_taken == 0) | 697 | if (nr_taken == 0) |
698 | goto done; | 698 | goto done; |
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
984 | if (!populated_zone(zone)) | 984 | if (!populated_zone(zone)) |
985 | continue; | 985 | continue; |
986 | 986 | ||
987 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 987 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
988 | continue; | 988 | continue; |
989 | 989 | ||
990 | note_zone_scanning_priority(zone, priority); | 990 | note_zone_scanning_priority(zone, priority); |
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1034 | for (i = 0; zones[i] != NULL; i++) { | 1034 | for (i = 0; zones[i] != NULL; i++) { |
1035 | struct zone *zone = zones[i]; | 1035 | struct zone *zone = zones[i]; |
1036 | 1036 | ||
1037 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1037 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1038 | continue; | 1038 | continue; |
1039 | 1039 | ||
1040 | lru_pages += zone->nr_active + zone->nr_inactive; | 1040 | lru_pages += zone->nr_active + zone->nr_inactive; |
@@ -1089,7 +1089,7 @@ out: | |||
1089 | for (i = 0; zones[i] != 0; i++) { | 1089 | for (i = 0; zones[i] != 0; i++) { |
1090 | struct zone *zone = zones[i]; | 1090 | struct zone *zone = zones[i]; |
1091 | 1091 | ||
1092 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1092 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1093 | continue; | 1093 | continue; |
1094 | 1094 | ||
1095 | zone->prev_priority = priority; | 1095 | zone->prev_priority = priority; |
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1354 | return; | 1354 | return; |
1355 | if (pgdat->kswapd_max_order < order) | 1355 | if (pgdat->kswapd_max_order < order) |
1356 | pgdat->kswapd_max_order = order; | 1356 | pgdat->kswapd_max_order = order; |
1357 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1357 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1358 | return; | 1358 | return; |
1359 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 1359 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
1360 | return; | 1360 | return; |
@@ -1369,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1369 | * | 1369 | * |
1370 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | 1370 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages |
1371 | */ | 1371 | */ |
1372 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | 1372 | static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, |
1373 | int prio, struct scan_control *sc) | 1373 | int pass, struct scan_control *sc) |
1374 | { | 1374 | { |
1375 | struct zone *zone; | 1375 | struct zone *zone; |
1376 | unsigned long nr_to_scan, ret = 0; | 1376 | unsigned long nr_to_scan, ret = 0; |
@@ -1406,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | |||
1406 | return ret; | 1406 | return ret; |
1407 | } | 1407 | } |
1408 | 1408 | ||
1409 | static unsigned long count_lru_pages(void) | ||
1410 | { | ||
1411 | struct zone *zone; | ||
1412 | unsigned long ret = 0; | ||
1413 | |||
1414 | for_each_zone(zone) | ||
1415 | ret += zone->nr_active + zone->nr_inactive; | ||
1416 | return ret; | ||
1417 | } | ||
1418 | |||
1409 | /* | 1419 | /* |
1410 | * Try to free `nr_pages' of memory, system-wide, and return the number of | 1420 | * Try to free `nr_pages' of memory, system-wide, and return the number of |
1411 | * freed pages. | 1421 | * freed pages. |
@@ -1420,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1420 | unsigned long ret = 0; | 1430 | unsigned long ret = 0; |
1421 | int pass; | 1431 | int pass; |
1422 | struct reclaim_state reclaim_state; | 1432 | struct reclaim_state reclaim_state; |
1423 | struct zone *zone; | ||
1424 | struct scan_control sc = { | 1433 | struct scan_control sc = { |
1425 | .gfp_mask = GFP_KERNEL, | 1434 | .gfp_mask = GFP_KERNEL, |
1426 | .may_swap = 0, | 1435 | .may_swap = 0, |
@@ -1431,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1431 | 1440 | ||
1432 | current->reclaim_state = &reclaim_state; | 1441 | current->reclaim_state = &reclaim_state; |
1433 | 1442 | ||
1434 | lru_pages = 0; | 1443 | lru_pages = count_lru_pages(); |
1435 | for_each_zone(zone) | ||
1436 | lru_pages += zone->nr_active + zone->nr_inactive; | ||
1437 | |||
1438 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 1444 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); |
1439 | /* If slab caches are huge, it's better to hit them first */ | 1445 | /* If slab caches are huge, it's better to hit them first */ |
1440 | while (nr_slab >= lru_pages) { | 1446 | while (nr_slab >= lru_pages) { |
@@ -1461,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1461 | for (pass = 0; pass < 5; pass++) { | 1467 | for (pass = 0; pass < 5; pass++) { |
1462 | int prio; | 1468 | int prio; |
1463 | 1469 | ||
1464 | /* Needed for shrinking slab caches later on */ | ||
1465 | if (!lru_pages) | ||
1466 | for_each_zone(zone) { | ||
1467 | lru_pages += zone->nr_active; | ||
1468 | lru_pages += zone->nr_inactive; | ||
1469 | } | ||
1470 | |||
1471 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | 1470 | /* Force reclaiming mapped pages in the passes #3 and #4 */ |
1472 | if (pass > 2) { | 1471 | if (pass > 2) { |
1473 | sc.may_swap = 1; | 1472 | sc.may_swap = 1; |
@@ -1483,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1483 | goto out; | 1482 | goto out; |
1484 | 1483 | ||
1485 | reclaim_state.reclaimed_slab = 0; | 1484 | reclaim_state.reclaimed_slab = 0; |
1486 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); | 1485 | shrink_slab(sc.nr_scanned, sc.gfp_mask, |
1486 | count_lru_pages()); | ||
1487 | ret += reclaim_state.reclaimed_slab; | 1487 | ret += reclaim_state.reclaimed_slab; |
1488 | if (ret >= nr_pages) | 1488 | if (ret >= nr_pages) |
1489 | goto out; | 1489 | goto out; |
@@ -1491,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
1491 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | 1491 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) |
1492 | congestion_wait(WRITE, HZ / 10); | 1492 | congestion_wait(WRITE, HZ / 10); |
1493 | } | 1493 | } |
1494 | |||
1495 | lru_pages = 0; | ||
1496 | } | 1494 | } |
1497 | 1495 | ||
1498 | /* | 1496 | /* |
1499 | * If ret = 0, we could not shrink LRUs, but there may be something | 1497 | * If ret = 0, we could not shrink LRUs, but there may be something |
1500 | * in slab caches | 1498 | * in slab caches |
1501 | */ | 1499 | */ |
1502 | if (!ret) | 1500 | if (!ret) { |
1503 | do { | 1501 | do { |
1504 | reclaim_state.reclaimed_slab = 0; | 1502 | reclaim_state.reclaimed_slab = 0; |
1505 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | 1503 | shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); |
1506 | ret += reclaim_state.reclaimed_slab; | 1504 | ret += reclaim_state.reclaimed_slab; |
1507 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | 1505 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); |
1506 | } | ||
1508 | 1507 | ||
1509 | out: | 1508 | out: |
1510 | current->reclaim_state = NULL; | 1509 | current->reclaim_state = NULL; |