author		Dmitry Torokhov <dtor@insightbb.com>	2007-02-10 01:26:32 -0500
committer	Dmitry Torokhov <dtor@insightbb.com>	2007-02-10 01:26:32 -0500
commit		b22364c8eec89e6b0c081a237f3b6348df87796f (patch)
tree		233a923281fb640106465d076997ff511efb6edf /mm
parent		2c8dc071517ec2843869024dc82be2e246f41064 (diff)
parent		66efc5a7e3061c3597ac43a8bb1026488d57e66b (diff)
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'mm')
-rw-r--r--	mm/bounce.c	4
-rw-r--r--	mm/fadvise.c	2
-rw-r--r--	mm/filemap.c	31
-rw-r--r--	mm/filemap_xip.c	8
-rw-r--r--	mm/fremap.c	2
-rw-r--r--	mm/hugetlb.c	10
-rw-r--r--	mm/memory.c	59
-rw-r--r--	mm/memory_hotplug.c	6
-rw-r--r--	mm/mempolicy.c	6
-rw-r--r--	mm/mincore.c	183
-rw-r--r--	mm/mmap.c	89
-rw-r--r--	mm/mremap.c	1
-rw-r--r--	mm/nommu.c	12
-rw-r--r--	mm/oom_kill.c	21
-rw-r--r--	mm/page-writeback.c	147
-rw-r--r--	mm/page_alloc.c	137
-rw-r--r--	mm/readahead.c	4
-rw-r--r--	mm/rmap.c	36
-rw-r--r--	mm/shmem.c	27
-rw-r--r--	mm/slab.c	119
-rw-r--r--	mm/slob.c	27
-rw-r--r--	mm/swapfile.c	12
-rw-r--r--	mm/tiny-shmem.c	4
-rw-r--r--	mm/truncate.c	49
-rw-r--r--	mm/vmscan.c	47
25 files changed, 685 insertions, 358 deletions
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a4024..643efbe82402 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
 	if (!bio)
 		return;
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
 	 * pages
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
 	/*
 	 * slow path
 	 */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121bb..0df4c899e979 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -38,7 +38,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	if (!file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
diff --git a/mm/filemap.c b/mm/filemap.c
index af7e2f5caea9..f30ef28405d3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -606,26 +606,6 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 EXPORT_SYMBOL(find_get_page);
 
 /**
- * find_trylock_page - find and lock a page
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * Same as find_get_page(), but trylock it instead of incrementing the count.
- */
-struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
-{
-	struct page *page;
-
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
-		page = NULL;
-	read_unlock_irq(&mapping->tree_lock);
-	return page;
-}
-EXPORT_SYMBOL(find_trylock_page);
-
-/**
  * find_lock_page - locate, pin and lock a pagecache page
  * @mapping: the address_space to search
  * @offset: the page index
@@ -1181,8 +1161,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	if (pos < size) {
 		retval = generic_file_direct_IO(READ, iocb,
 					iov, pos, nr_segs);
-		if (retval > 0 && !is_sync_kiocb(iocb))
-			retval = -EIOCBQUEUED;
 		if (retval > 0)
 			*ppos = pos + retval;
 	}
@@ -2047,15 +2025,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
 	 * i_mutex is held, which protects generic_osync_inode() from
-	 * livelocking.
+	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
 	 */
-	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if ((written >= 0 || written == -EIOCBQUEUED) &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (err < 0)
 			written = err;
 	}
-	if (written == count && !is_sync_kiocb(iocb))
-		written = -EIOCBQUEUED;
 	return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);
@@ -2269,7 +2246,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_dentry);
+	err = remove_suid(file->f_path.dentry);
 	if (err)
 		goto out;
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bfb..9dd9fbb75139 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,13 +183,13 @@ __xip_unmap (struct address_space * mapping,
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 		pte = page_check_address(page, mm, address, &ptl);
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
@@ -246,7 +246,7 @@ xip_file_nopage(struct vm_area_struct * area,
 		__xip_unmap(mapping, pgoff);
 	} else {
 		/* not shared and writable, use ZERO_PAGE() */
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 	}
 
 out:
@@ -379,7 +379,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
 	if (count == 0)
 		goto out_backing;
 
-	ret = remove_suid(filp->f_dentry);
+	ret = remove_suid(filp->f_path.dentry);
 	if (ret)
 		goto out_backing;
 
diff --git a/mm/fremap.c b/mm/fremap.c
index b77a002c3352..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page);
+			page_remove_rmap(page, vma);
 			page_cache_release(page);
 		}
 	} else {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ccc7f230252..36db012b38dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -44,14 +44,14 @@ static void clear_huge_page(struct page *page, unsigned long addr)
 }
 
 static void copy_huge_page(struct page *dst, struct page *src,
-			   unsigned long addr)
+			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 
 	might_sleep();
 	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
 		cond_resched();
-		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
@@ -389,6 +389,8 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			continue;
 
 		page = pte_page(pte);
+		if (pte_dirty(pte))
+			set_page_dirty(page);
 		list_add(&page->lru, &page_list);
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -442,7 +444,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	copy_huge_page(new_page, old_page, address);
+	copy_huge_page(new_page, old_page, address, vma);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
diff --git a/mm/memory.c b/mm/memory.c
index 4198df0dff1c..ef09f0acb1d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			mark_page_accessed(page);
 			file_rss--;
 		}
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		tlb_remove_page(tlb, page);
 		continue;
 	}
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages) {
 			pages[i] = page;
 
-			flush_anon_page(page, start);
+			flush_anon_page(vma, page, start);
 			flush_dcache_page(page);
 		}
 		if (vmas)
@@ -1110,23 +1110,29 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
 	pte_t *pte;
 	spinlock_t *ptl;
+	int err = 0;
 
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		return -ENOMEM;
+		return -EAGAIN;
 	arch_enter_lazy_mmu_mode();
 	do {
 		struct page *page = ZERO_PAGE(addr);
 		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+		if (unlikely(!pte_none(*pte))) {
+			err = -EEXIST;
+			pte++;
+			break;
+		}
 		page_cache_get(page);
 		page_add_file_rmap(page);
 		inc_mm_counter(mm, file_rss);
-		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1134,16 +1140,18 @@ static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int err;
 
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (zeromap_pte_range(mm, pmd, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pte_range(mm, pmd, addr, next, prot);
+		if (err)
+			break;
 	} while (pmd++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
@@ -1151,16 +1159,18 @@ static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int err;
 
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		return -ENOMEM;
+		return -EAGAIN;
 	do {
 		next = pud_addr_end(addr, end);
-		if (zeromap_pmd_range(mm, pud, addr, next, prot))
-			return -ENOMEM;
+		err = zeromap_pmd_range(mm, pud, addr, next, prot);
+		if (err)
+			break;
 	} while (pud++, addr = next, addr != end);
-	return 0;
+	return err;
 }
 
 int zeromap_page_range(struct vm_area_struct *vma,
@@ -1431,7 +1441,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
 	 * If the source page was a PFN mapping, we don't have
@@ -1454,9 +1464,9 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		kunmap_atomic(kaddr, KM_USER0);
 		flush_dcache_page(dst);
 		return;
 
 	}
-	copy_user_highpage(dst, src, va);
+	copy_user_highpage(dst, src, va, vma);
 }
 
 /*
@@ -1567,7 +1577,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, address);
+		cow_user_page(new_page, old_page, address, vma);
 	}
 
 	/*
@@ -1576,7 +1586,7 @@ gotten:
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
-			page_remove_rmap(old_page);
+			page_remove_rmap(old_page, vma);
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
@@ -2190,7 +2200,7 @@ retry:
 			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 			if (!page)
 				goto oom;
-			copy_user_highpage(page, new_page, address);
+			copy_user_highpage(page, new_page, address, vma);
 			page_cache_release(new_page);
 			new_page = page;
 			anon = 1;
@@ -2596,8 +2606,15 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_mm = NULL;
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
-	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = 0;
+	gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+	gate_vma.vm_page_prot = __P101;
+	/*
+	 * Make sure the vDSO gets into every core dump.
+	 * Dumping its contents makes post-mortem fully interpretable later
+	 * without matching up the same kernel and hardware config to see
+	 * what PC values meant.
+	 */
+	gate_vma.vm_flags |= VM_ALWAYSDUMP;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c055a090f4d..84279127fcd3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,11 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	zone_type = zone - pgdat->node_zones;
 	if (!populated_zone(zone)) {
 		int ret = 0;
-		ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+		ret = init_currently_empty_zone(zone, phys_start_pfn,
+						nr_pages, MEMMAP_HOTPLUG);
 		if (ret < 0)
 			return ret;
 	}
-	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+	memmap_init_zone(nr_pages, nid, zone_type,
+			 phys_start_pfn, MEMMAP_HOTPLUG);
 	return 0;
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b917d6fdc1bb..c2aec0e1090d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -884,6 +884,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 	err = get_nodes(&nodes, nmask, maxnode);
 	if (err)
 		return err;
+#ifdef CONFIG_CPUSETS
+	/* Restrict the nodes to the allowed nodes in the cpuset */
+	nodes_and(nodes, nodes, current->mems_allowed);
+#endif
 	return do_mbind(start, len, mode, &nodes, flags);
 }
 
@@ -1857,7 +1861,7 @@ int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_printf(m, " file=");
-		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+		seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
 /*
  * linux/mm/mincore.c
  *
- * Copyright (C) 1994-1999  Linus Torvalds
+ * Copyright (C) 1994-2006  Linus Torvalds
  */
 
 /*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
 	return present;
 }
 
-static long mincore_vma(struct vm_area_struct * vma,
-	unsigned long start, unsigned long end, unsigned char __user * vec)
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
 {
-	long error, i, remaining;
-	unsigned char * tmp;
-
-	error = -ENOMEM;
-	if (!vma->vm_file)
-		return error;
-
-	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	unsigned long i, nr, pgoff;
+	struct vm_area_struct *vma = find_vma(current->mm, addr);
 
-	error = -EAGAIN;
-	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
-	if (!tmp)
-		return error;
+	/*
+	 * find_vma() didn't find anything above us, or we're
+	 * in an unmapped hole in the address space: ENOMEM.
+	 */
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
 
-	/* (end - start) is # of pages, and also # of bytes in "vec */
-	remaining = (end - start),
+	/*
+	 * Ok, got it. But check whether it's a segment we support
+	 * mincore() on. Right now, we don't do any anonymous mappings.
+	 *
+	 * FIXME: This is just stupid. And returning ENOMEM is
+	 * stupid too. We should just look at the page tables. But
+	 * this is what we've traditionally done, so we'll just
+	 * continue doing it.
+	 */
+	if (!vma->vm_file)
+		return -ENOMEM;
 
-	error = 0;
-	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
-		int j = 0;
-		long thispiece = (remaining < PAGE_SIZE) ?
-					remaining : PAGE_SIZE;
+	/*
+	 * Calculate how many pages there are left in the vma, and
+	 * what the pgoff is for our address.
+	 */
+	nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+	if (nr > pages)
+		nr = pages;
 
-		while (j < thispiece)
-			tmp[j++] = mincore_page(vma, start++);
+	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgoff += vma->vm_pgoff;
 
-		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
-			error = -EFAULT;
-			break;
-		}
-	}
+	/* And then we just fill the sucker in.. */
+	for (i = 0 ; i < nr; i++, pgoff++)
+		vec[i] = mincore_page(vma, pgoff);
 
-	free_page((unsigned long) tmp);
-	return error;
+	return nr;
 }
 
 /*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
 asmlinkage long sys_mincore(unsigned long start, size_t len,
 	unsigned char __user * vec)
 {
-	int index = 0;
-	unsigned long end, limit;
-	struct vm_area_struct * vma;
-	size_t max;
-	int unmapped_error = 0;
-	long error;
-
-	/* check the arguments */
-	if (start & ~PAGE_CACHE_MASK)
-		goto einval;
-
-	limit = TASK_SIZE;
-	if (start >= limit)
-		goto enomem;
-
-	if (!len)
-		return 0;
-
-	max = limit - start;
-	len = PAGE_CACHE_ALIGN(len);
-	if (len > max || !len)
-		goto enomem;
+	long retval;
+	unsigned long pages;
+	unsigned char *tmp;
 
-	end = start + len;
+	/* Check the start address: needs to be page-aligned.. */
+	if (start & ~PAGE_CACHE_MASK)
+		return -EINVAL;
 
-	/* check the output buffer whilst holding the lock */
-	error = -EFAULT;
-	down_read(&current->mm->mmap_sem);
+	/* ..and we need to be passed a valid user-space range */
+	if (!access_ok(VERIFY_READ, (void __user *) start, len))
+		return -ENOMEM;
 
-	if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
-		goto out;
+	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
+	pages = len >> PAGE_SHIFT;
+	pages += (len & ~PAGE_MASK) != 0;
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 */
-	error = 0;
-
-	vma = find_vma(current->mm, start);
-	while (vma) {
-		/* Here start < vma->vm_end. */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-		}
+	if (!access_ok(VERIFY_WRITE, vec, pages))
+		return -EFAULT;
 
-		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = mincore_vma(vma, start, end,
-							&vec[index]);
-				if (error)
-					goto out;
-			}
-			error = unmapped_error;
-			goto out;
+	tmp = (void *) __get_free_page(GFP_USER);
+	if (!tmp)
+		return -EAGAIN;
+
+	retval = 0;
+	while (pages) {
+		/*
+		 * Do at most PAGE_SIZE entries per iteration, due to
+		 * the temporary buffer size.
+		 */
+		down_read(&current->mm->mmap_sem);
+		retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+		up_read(&current->mm->mmap_sem);
+
+		if (retval <= 0)
+			break;
+		if (copy_to_user(vec, tmp, retval)) {
+			retval = -EFAULT;
+			break;
 		}
-
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
-		if (error)
-			goto out;
-		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
-		start = vma->vm_end;
-		vma = vma->vm_next;
+		pages -= retval;
+		vec += retval;
+		start += retval << PAGE_SHIFT;
+		retval = 0;
 	}
-
-	/* we found a hole in the area queried if we arrive here */
-	error = -ENOMEM;
-
-out:
-	up_read(&current->mm->mmap_sem);
-	return error;
-
-einval:
-	return -EINVAL;
-enomem:
-	return -ENOMEM;
+	free_page((unsigned long) tmp);
+	return retval;
 }
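For reference, a minimal userspace sketch (not part of this commit) of the mincore() call whose kernel side is rewritten above. It exercises the behaviour the new do_mincore()/sys_mincore() path keeps: a page-aligned start address, one residency byte per page in vec, and -ENOMEM for unmapped holes. The file name "some_file" is an arbitrary placeholder.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("some_file", O_RDONLY);
	struct stat st;
	unsigned char *vec;
	void *map;
	size_t i, pages;

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	pages = (st.st_size + page - 1) / page;
	map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	vec = malloc(pages);
	if (map == MAP_FAILED || !vec)
		return 1;

	/* start must be page-aligned; vec receives one status byte per page */
	if (mincore(map, st.st_size, vec) == 0)
		for (i = 0; i < pages; i++)
			printf("page %zu: %s\n", i,
			       (vec[i] & 1) ? "resident" : "not resident");

	free(vec);
	munmap(map, st.st_size);
	close(fd);
	return 0;
}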
diff --git a/mm/mmap.c b/mm/mmap.c
index 7be110e98d4c..eb509ae76553 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,7 +188,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 			struct file *file, struct address_space *mapping)
 {
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+		atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable--;
 
@@ -399,7 +399,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
 	struct address_space *mapping = file->f_mapping;
 
 	if (vma->vm_flags & VM_DENYWRITE)
-		atomic_dec(&file->f_dentry->d_inode->i_writecount);
+		atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
 	if (vma->vm_flags & VM_SHARED)
 		mapping->i_mmap_writable++;
 
@@ -907,7 +907,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	 * mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
 	if (!len)
@@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		return -EAGAIN;
 	}
 
-	inode = file ? file->f_dentry->d_inode : NULL;
+	inode = file ? file->f_path.dentry->d_inode : NULL;
 
 	if (file) {
 		switch (flags & MAP_TYPE) {
@@ -989,7 +989,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
@@ -1477,6 +1477,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct rlimit *rlim = current->signal->rlim;
+	unsigned long new_start;
 
 	/* address space limit tests */
 	if (!may_expand_vm(mm, grow))
@@ -1496,6 +1497,12 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 		return -ENOMEM;
 	}
 
+	/* Check to ensure the stack will not grow into a hugetlb-only region */
+	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+			vma->vm_end - size;
+	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+		return -EFAULT;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
@@ -2094,3 +2101,75 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
 		return 0;
 	return 1;
 }
+
+
+static struct page *special_mapping_nopage(struct vm_area_struct *vma,
+					   unsigned long address, int *type)
+{
+	struct page **pages;
+
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
+	address -= vma->vm_start;
+	for (pages = vma->vm_private_data; address > 0 && *pages; ++pages)
+		address -= PAGE_SIZE;
+
+	if (*pages) {
+		struct page *page = *pages;
+		get_page(page);
+		return page;
+	}
+
+	return NOPAGE_SIGBUS;
+}
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct special_mapping_vmops = {
+	.close = special_mapping_close,
+	.nopage	= special_mapping_nopage,
+};
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+int install_special_mapping(struct mm_struct *mm,
+			    unsigned long addr, unsigned long len,
+			    unsigned long vm_flags, struct page **pages)
+{
+	struct vm_area_struct *vma;
+
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (unlikely(vma == NULL))
+		return -ENOMEM;
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+
+	vma->vm_flags = vm_flags | mm->def_flags;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+
+	vma->vm_ops = &special_mapping_vmops;
+	vma->vm_private_data = pages;
+
+	if (unlikely(insert_vm_struct(mm, vma))) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return -ENOMEM;
+	}
+
+	mm->total_vm += len >> PAGE_SHIFT;
+
+	return 0;
+}
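As a hedged aside (not code from this tree), here is a sketch of how an architecture might call the new install_special_mapping() helper above to map a single vDSO-style page at exec time. The arch_setup_additional_pages() signature, the vdso_pages array, the chosen address and the flag set are illustrative assumptions; only install_special_mapping() itself comes from this merge.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/binfmts.h>
#include <linux/err.h>

/* One page of vDSO text, initialised elsewhere; NULL-terminated array. */
static struct page *vdso_pages[2];

int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr;
	int ret = 0;

	down_write(&mm->mmap_sem);
	addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	/*
	 * VM_ALWAYSDUMP mirrors what gate_vma_init() does in this merge,
	 * so the page shows up in core dumps.
	 */
	ret = install_special_mapping(mm, addr, PAGE_SIZE,
				      VM_READ | VM_EXEC |
				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
				      VM_ALWAYSDUMP,
				      vdso_pages);
up_fail:
	up_write(&mm->mmap_sem);
	return ret;
}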
diff --git a/mm/mremap.c b/mm/mremap.c
index 9c769fa29f32..5d4bd4f95b8e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -105,7 +105,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		if (pte_none(*old_pte))
 			continue;
 		pte = ptep_clear_flush(vma, old_addr, old_pte);
-		/* ZERO_PAGE can be dependant on virtual addr */
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index af874569d0f1..23fb033e596d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -523,7 +523,7 @@ static int validate_mmap_request(struct file *file,
 	 */
 	mapping = file->f_mapping;
 	if (!mapping)
-		mapping = file->f_dentry->d_inode->i_mapping;
+		mapping = file->f_path.dentry->d_inode->i_mapping;
 
 	capabilities = 0;
 	if (mapping && mapping->backing_dev_info)
@@ -532,7 +532,7 @@ static int validate_mmap_request(struct file *file,
 	if (!capabilities) {
 		/* no explicit capabilities set, so assume some
 		 * defaults */
-		switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+		switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
 		case S_IFREG:
 		case S_IFBLK:
 			capabilities = BDI_CAP_MAP_COPY;
@@ -563,11 +563,11 @@ static int validate_mmap_request(struct file *file,
 		    !(file->f_mode & FMODE_WRITE))
 			return -EACCES;
 
-		if (IS_APPEND(file->f_dentry->d_inode) &&
+		if (IS_APPEND(file->f_path.dentry->d_inode) &&
 		    (file->f_mode & FMODE_WRITE))
 			return -EACCES;
 
-		if (locks_verify_locked(file->f_dentry->d_inode))
+		if (locks_verify_locked(file->f_path.dentry->d_inode))
 			return -EAGAIN;
 
 		if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -598,7 +598,7 @@ static int validate_mmap_request(struct file *file,
 
 		/* handle executable mappings and implied executable
 		 * mappings */
-		if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
 			if (prot & PROT_EXEC)
 				return -EPERM;
 		}
@@ -833,7 +833,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 			continue;
 
 		/* search for overlapping mappings on the same file */
-		if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+		if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode)
 			continue;
 
 		if (vma->vm_pgoff >= pgoff + pglen)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 223d9ccb7d64..b278b8d60eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	}
 
 	/*
-	 * swapoff can easily use up all memory, so kill those first.
-	 */
-	if (p->flags & PF_SWAPOFF)
-		return ULONG_MAX;
-
-	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = mm->total_vm;
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	task_unlock(p);
 
 	/*
+	 * swapoff can easily use up all memory, so kill those first.
+	 */
+	if (p->flags & PF_SWAPOFF)
+		return ULONG_MAX;
+
+	/*
 	 * Processes which fork a lot of child processes are likely
 	 * a good choice. We add half the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
@@ -174,10 +174,15 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
-	nodemask_t nodes = node_online_map;
+	nodemask_t nodes;
+	int node;
+	/* node has memory ? */
+	for_each_online_node(node)
+		if (NODE_DATA(node)->node_present_pages)
+			node_set(node, nodes);
 
 	for (z = zonelist->zones; *z; z++)
-		if (cpuset_zone_allowed(*z, gfp_mask))
+		if (cpuset_zone_allowed_softwall(*z, gfp_mask))
 			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8d9b19f239c3..be0efbde4994 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/init.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/rmap.h>
@@ -132,11 +133,9 @@ get_dirty_limits(long *pbackground, long *pdirty,
 
 #ifdef CONFIG_HIGHMEM
 	/*
-	 * If this mapping can only allocate from low memory,
-	 * we exclude high memory from our count.
+	 * We always exclude high memory from our count.
 	 */
-	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
-		available_memory -= totalhigh_pages;
+	available_memory -= totalhigh_pages;
 #endif
 
 
@@ -525,28 +524,25 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
 };
 
 /*
- * If the machine has a large highmem:lowmem ratio then scale back the default
- * dirty memory thresholds: allowing too much dirty highmem pins an excessive
- * number of buffer_heads.
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages.
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
  */
 void __init page_writeback_init(void)
 {
-	long buffer_pages = nr_free_buffer_pages();
-	long correction;
-
-	correction = (100 * 4 * buffer_pages) / vm_total_pages;
-
-	if (correction < 100) {
-		dirty_background_ratio *= correction;
-		dirty_background_ratio /= 100;
-		vm_dirty_ratio *= correction;
-		vm_dirty_ratio /= 100;
-
-		if (dirty_background_ratio <= 0)
-			dirty_background_ratio = 1;
-		if (vm_dirty_ratio <= 0)
-			vm_dirty_ratio = 1;
-	}
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
@@ -761,23 +757,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
 
-		if (mapping) {
-			write_lock_irq(&mapping->tree_lock);
-			mapping2 = page_mapping(page);
-			if (mapping2) { /* Race with truncate? */
-				BUG_ON(mapping2 != mapping);
-				if (mapping_cap_account_dirty(mapping))
-					__inc_zone_page_state(page,
-							NR_FILE_DIRTY);
-				radix_tree_tag_set(&mapping->page_tree,
-					page_index(page), PAGECACHE_TAG_DIRTY);
-			}
-			write_unlock_irq(&mapping->tree_lock);
-			if (mapping->host) {
-				/* !PageAnon && !swapper_space */
-				__mark_inode_dirty(mapping->host,
-							I_DIRTY_PAGES);
-			}
+		if (!mapping)
+			return 1;
+
+		write_lock_irq(&mapping->tree_lock);
+		mapping2 = page_mapping(page);
+		if (mapping2) { /* Race with truncate? */
+			BUG_ON(mapping2 != mapping);
+			if (mapping_cap_account_dirty(mapping)) {
+				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				task_io_account_write(PAGE_CACHE_SIZE);
 			}
+			radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		write_unlock_irq(&mapping->tree_lock);
+		if (mapping->host) {
+			/* !PageAnon && !swapper_space */
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 		return 1;
 	}
@@ -843,39 +840,6 @@ int set_page_dirty_lock(struct page *page)
 EXPORT_SYMBOL(set_page_dirty_lock);
 
 /*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-	unsigned long flags;
-
-	if (mapping) {
-		write_lock_irqsave(&mapping->tree_lock, flags);
-		if (TestClearPageDirty(page)) {
-			radix_tree_tag_clear(&mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			/*
-			 * We can continue to use `mapping' here because the
-			 * page is locked, which pins the address_space
-			 */
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
-			return 1;
-		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-		return 0;
-	}
-	return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
  *
@@ -893,12 +857,41 @@ int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
-	if (mapping) {
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		/*
+		 * Yes, Virginia, this is indeed insane.
+		 *
+		 * We use this sequence to make sure that
+		 *  (a) we account for dirty stats properly
+		 *  (b) we tell the low-level filesystem to
+		 *      mark the whole page dirty if it was
+		 *      dirty in a pagetable. Only to then
+		 *  (c) clean the page again and return 1 to
+		 *      cause the writeback.
+		 *
		 * This way we avoid all nasty races with the
+		 * dirty bit in multiple places and clearing
+		 * them concurrently from different threads.
+		 *
+		 * Note! Normally the "set_page_dirty(page)"
+		 * has no effect on the actual dirty bit - since
+		 * that will already usually be set. But we
+		 * need the side effects, and it can help us
+		 * avoid races.
+		 *
+		 * We basically use the page "master dirty bit"
+		 * as a serialization point for all the different
+		 * threads doing their things.
+		 *
+		 * FIXME! We still have a race here: if somebody
+		 * adds the page back to the page tables in
+		 * between the "page_mkclean()" and the "TestClearPageDirty()",
+		 * we might have it mapped without the dirty bit set.
+		 */
+		if (page_mkclean(page))
+			set_page_dirty(page);
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping)) {
-				page_mkclean(page);
-				dec_zone_page_state(page, NR_FILE_DIRTY);
-			}
+			dec_zone_page_state(page, NR_FILE_DIRTY);
 			return 1;
 		}
 		return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cace22b3ac25..f12052dc23ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -40,6 +40,7 @@
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -710,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
+		if (!populated_zone(zone))
+			continue;
+
 		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
@@ -892,6 +896,91 @@ failed:
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+	struct fault_attr attr;
+
+	u32 ignore_gfp_highmem;
+	u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+	struct dentry *ignore_gfp_highmem_file;
+	struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+	.ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+	return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	if (gfp_mask & __GFP_NOFAIL)
+		return 0;
+	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+		return 0;
+	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+		return 0;
+
+	return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&fail_page_alloc.attr,
+				       "fail_page_alloc");
+	if (err)
+		return err;
+	dir = fail_page_alloc.attr.dentries.dir;
+
+	fail_page_alloc.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &fail_page_alloc.ignore_gfp_wait);
+
+	fail_page_alloc.ignore_gfp_highmem_file =
+		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+				      &fail_page_alloc.ignore_gfp_highmem);
+
+	if (!fail_page_alloc.ignore_gfp_wait_file ||
+	    !fail_page_alloc.ignore_gfp_highmem_file) {
+		err = -ENOMEM;
+		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+	}
+
+	return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
@@ -900,8 +989,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
-	unsigned long min = mark;
-	long free_pages = z->free_pages - (1 << order) + 1;
+	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1076,7 +1164,7 @@ zonelist_scan:
 			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1136,6 +1224,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	might_sleep_if(wait);
 
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
 restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -1488,8 +1579,8 @@ void show_free_areas(void)
 
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
-		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+		" free:%u slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
 		active,
 		inactive,
 		global_page_state(NR_FILE_DIRTY),
@@ -1499,7 +1590,8 @@ void show_free_areas(void)
 		global_page_state(NR_SLAB_RECLAIMABLE) +
 			global_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_page_state(NR_FILE_MAPPED),
-		global_page_state(NR_PAGETABLE));
+		global_page_state(NR_PAGETABLE),
+		global_page_state(NR_BOUNCE));
 
 	for_each_zone(zone) {
 		int i;
@@ -1864,17 +1956,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
 * done. Non-atomic initialization, single-pass.
 */
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
-		unsigned long start_pfn)
+		unsigned long start_pfn, enum memmap_context context)
 {
 	struct page *page;
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!early_pfn_valid(pfn))
-			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
+		/*
+		 * There can be holes in boot-time mem_map[]s
+		 * handed to this function.  They do not
+		 * exist on hotplugged memory.
+		 */
+		if (context == MEMMAP_EARLY) {
+			if (!early_pfn_valid(pfn))
+				continue;
+			if (!early_pfn_in_nid(pfn, nid))
+				continue;
+		}
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
@@ -1901,7 +2000,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
-	memmap_init_zone((size), (nid), (zone), (start_pfn))
+	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
 static int __cpuinit zone_batchsize(struct zone *zone)
@@ -2147,7 +2246,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
 
 __meminit int init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
-					unsigned long size)
+					unsigned long size,
+					enum memmap_context context)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int ret;
@@ -2591,7 +2691,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
-		ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+		ret = init_currently_empty_zone(zone, zone_start_pfn,
+						size, MEMMAP_EARLY);
 		BUG_ON(ret);
 		zone_start_pfn += size;
 	}
@@ -3232,6 +3333,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 			numentries >>= (scale - PAGE_SHIFT);
 		else
 			numentries <<= (PAGE_SHIFT - scale);
+
+		/* Make sure we've got at least a 0-order allocation.. */
+		if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+			numentries = PAGE_SIZE / bucketsize;
 	}
 	numentries = roundup_pow_of_two(numentries);
 
@@ -3244,7 +3349,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (numentries > max)
 		numentries = max;
 
-	log2qty = long_log2(numentries);
+	log2qty = ilog2(numentries);
 
 	do {
 		size = bucketsize << log2qty;
@@ -3266,7 +3371,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
 	       tablename,
 	       (1U << log2qty),
-	       long_log2(size) - PAGE_SHIFT,
+	       ilog2(size) - PAGE_SHIFT,
 	       size);
 
 	if (_hash_shift)
diff --git a/mm/readahead.c b/mm/readahead.c
index a386f2b6b335..0f539e8e827a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/pagevec.h>
 
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
@@ -151,6 +152,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 			put_pages_list(pages);
 			break;
 		}
+		task_io_account_read(PAGE_CACHE_SIZE);
 	}
 	pagevec_lru_add(&lru_pvec);
 	return ret;
@@ -450,7 +452,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
 *
 * Note that @filp is purely used for passing on to the ->readpage[s]()
 * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_dentry->d_inode here).
+ * @filp->f_mapping or @filp->f_path.dentry->d_inode here).
 * Also, @ra may not be equal to &@filp->f_ra.
 *
 */
diff --git a/mm/rmap.c b/mm/rmap.c
index d8a842a586db..669acb22b572 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
+#include <linux/kallsyms.h>
 
 #include <asm/tlbflush.h>
 
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
-	pte_t *pte, entry;
+	pte_t *pte;
 	spinlock_t *ptl;
 	int ret = 0;
 
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
 	if (!pte)
 		goto out;
 
-	if (!pte_dirty(*pte) && !pte_write(*pte))
-		goto unlock;
+	if (pte_dirty(*pte) || pte_write(*pte)) {
+		pte_t entry;
 
-	entry = ptep_get_and_clear(mm, address, pte);
-	entry = pte_mkclean(entry);
-	entry = pte_wrprotect(entry);
-	ptep_establish(vma, address, pte, entry);
-	lazy_mmu_prot_update(entry);
-	ret = 1;
+		flush_cache_page(vma, address, pte_pfn(*pte));
+		entry = ptep_clear_flush(vma, address, pte);
+		entry = pte_wrprotect(entry);
+		entry = pte_mkclean(entry);
+		set_pte_at(mm, address, pte, entry);
+		lazy_mmu_prot_update(entry);
+		ret = 1;
+	}
 
-unlock:
 	pte_unmap_unlock(pte, ptl);
 out:
 	return ret;
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page)
 		if (mapping)
 			ret = page_mkclean_file(mapping, page);
 	}
+	if (page_test_and_clear_dirty(page))
+		ret = 1;
 
 	return ret;
 }
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page)
 *
 * The caller needs to hold the pte lock.
 */
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		if (unlikely(page_mapcount(page) < 0)) {
 			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
 			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
+			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
+			if (vma->vm_ops)
+				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+			if (vma->vm_file && vma->vm_file->f_op)
+				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 			BUG();
 		}
 
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		dec_mm_counter(mm, file_rss);
 
 
-	page_remove_rmap(page);
+	page_remove_rmap(page, vma);
 	page_cache_release(page);
 
 out_unmap:
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		if (pte_dirty(pteval))
 			set_page_dirty(page);
 
-		page_remove_rmap(page);
+		page_remove_rmap(page, vma);
 		page_cache_release(page);
 		dec_mm_counter(mm, file_rss);
775 (*mapcount)--; 785 (*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index c820b4f77b8d..70da7a0981bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
515 size = SHMEM_NR_DIRECT; 515 size = SHMEM_NR_DIRECT;
516 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size); 516 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
517 } 517 }
518 if (!topdir) 518
519 /*
520 * If there are no indirect blocks or we are punching a hole
521 * below indirect blocks, nothing to be done.
522 */
523 if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
519 goto done2; 524 goto done2;
520 525
521 BUG_ON(limit <= SHMEM_NR_DIRECT); 526 BUG_ON(limit <= SHMEM_NR_DIRECT);
@@ -1225,7 +1230,7 @@ failed:
1225 1230
1226struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type) 1231struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1227{ 1232{
1228 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1233 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1229 struct page *page = NULL; 1234 struct page *page = NULL;
1230 unsigned long idx; 1235 unsigned long idx;
1231 int error; 1236 int error;
@@ -1248,7 +1253,7 @@ static int shmem_populate(struct vm_area_struct *vma,
1248 unsigned long addr, unsigned long len, 1253 unsigned long addr, unsigned long len,
1249 pgprot_t prot, unsigned long pgoff, int nonblock) 1254 pgprot_t prot, unsigned long pgoff, int nonblock)
1250{ 1255{
1251 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1256 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1252 struct mm_struct *mm = vma->vm_mm; 1257 struct mm_struct *mm = vma->vm_mm;
1253 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE; 1258 enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1254 unsigned long size; 1259 unsigned long size;
@@ -1293,14 +1298,14 @@ static int shmem_populate(struct vm_area_struct *vma,
1293#ifdef CONFIG_NUMA 1298#ifdef CONFIG_NUMA
1294int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1299int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1295{ 1300{
1296 struct inode *i = vma->vm_file->f_dentry->d_inode; 1301 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1297 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1302 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1298} 1303}
1299 1304
1300struct mempolicy * 1305struct mempolicy *
1301shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1306shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1302{ 1307{
1303 struct inode *i = vma->vm_file->f_dentry->d_inode; 1308 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1304 unsigned long idx; 1309 unsigned long idx;
1305 1310
1306 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1311 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1310,7 +1315,7 @@ shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1310 1315
1311int shmem_lock(struct file *file, int lock, struct user_struct *user) 1316int shmem_lock(struct file *file, int lock, struct user_struct *user)
1312{ 1317{
1313 struct inode *inode = file->f_dentry->d_inode; 1318 struct inode *inode = file->f_path.dentry->d_inode;
1314 struct shmem_inode_info *info = SHMEM_I(inode); 1319 struct shmem_inode_info *info = SHMEM_I(inode);
1315 int retval = -ENOMEM; 1320 int retval = -ENOMEM;
1316 1321
@@ -1422,7 +1427,7 @@ shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsig
1422static ssize_t 1427static ssize_t
1423shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 1428shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1424{ 1429{
1425 struct inode *inode = file->f_dentry->d_inode; 1430 struct inode *inode = file->f_path.dentry->d_inode;
1426 loff_t pos; 1431 loff_t pos;
1427 unsigned long written; 1432 unsigned long written;
1428 ssize_t err; 1433 ssize_t err;
@@ -1442,7 +1447,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1442 if (err || !count) 1447 if (err || !count)
1443 goto out; 1448 goto out;
1444 1449
1445 err = remove_suid(file->f_dentry); 1450 err = remove_suid(file->f_path.dentry);
1446 if (err) 1451 if (err)
1447 goto out; 1452 goto out;
1448 1453
@@ -1524,7 +1529,7 @@ out:
1524 1529
1525static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1530static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1526{ 1531{
1527 struct inode *inode = filp->f_dentry->d_inode; 1532 struct inode *inode = filp->f_path.dentry->d_inode;
1528 struct address_space *mapping = inode->i_mapping; 1533 struct address_space *mapping = inode->i_mapping;
1529 unsigned long index, offset; 1534 unsigned long index, offset;
1530 1535
@@ -2493,8 +2498,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2493 d_instantiate(dentry, inode); 2498 d_instantiate(dentry, inode);
2494 inode->i_size = size; 2499 inode->i_size = size;
2495 inode->i_nlink = 0; /* It is unlinked */ 2500 inode->i_nlink = 0; /* It is unlinked */
2496 file->f_vfsmnt = mntget(shm_mnt); 2501 file->f_path.mnt = mntget(shm_mnt);
2497 file->f_dentry = dentry; 2502 file->f_path.dentry = dentry;
2498 file->f_mapping = inode->i_mapping; 2503 file->f_mapping = inode->i_mapping;
2499 file->f_op = &shmem_file_operations; 2504 file->f_op = &shmem_file_operations;
2500 file->f_mode = FMODE_WRITE | FMODE_READ; 2505 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/slab.c b/mm/slab.c
index 068cb4503c15..c6100628a6ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,7 +107,9 @@
107#include <linux/nodemask.h> 107#include <linux/nodemask.h>
108#include <linux/mempolicy.h> 108#include <linux/mempolicy.h>
109#include <linux/mutex.h> 109#include <linux/mutex.h>
110#include <linux/fault-inject.h>
110#include <linux/rtmutex.h> 111#include <linux/rtmutex.h>
112#include <linux/reciprocal_div.h>
111 113
112#include <asm/cacheflush.h> 114#include <asm/cacheflush.h>
113#include <asm/tlbflush.h> 115#include <asm/tlbflush.h>
@@ -385,6 +387,7 @@ struct kmem_cache {
385 unsigned int shared; 387 unsigned int shared;
386 388
387 unsigned int buffer_size; 389 unsigned int buffer_size;
390 u32 reciprocal_buffer_size;
388/* 3) touched by every alloc & free from the backend */ 391/* 3) touched by every alloc & free from the backend */
389 struct kmem_list3 *nodelists[MAX_NUMNODES]; 392 struct kmem_list3 *nodelists[MAX_NUMNODES];
390 393
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
626 return slab->s_mem + cache->buffer_size * idx; 629 return slab->s_mem + cache->buffer_size * idx;
627} 630}
628 631
629static inline unsigned int obj_to_index(struct kmem_cache *cache, 632/*
630 struct slab *slab, void *obj) 633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 * Using the fact that buffer_size is a constant for a particular cache,
635 * we can replace (offset / cache->buffer_size) by
636 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639 const struct slab *slab, void *obj)
631{ 640{
632 return (unsigned)(obj - slab->s_mem) / cache->buffer_size; 641 u32 offset = (obj - slab->s_mem);
642 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
633} 643}
634 644
635/* 645/*
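
The new obj_to_index() trades the divide for a multiply-and-shift, using the reciprocal_buffer_size value precomputed in kmem_cache_init()/kmem_cache_create() further down in this diff. The trick is exact here because the offset passed in is always a whole multiple of buffer_size. A self-contained userspace check of that math; the two helpers mirror what the lib/reciprocal_div helpers compute, and the object sizes are arbitrary examples:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* R = ceil(2^32 / B); then, for A a multiple of B (A < 2^32), A/B == (A*R) >> 32 */
static uint32_t reciprocal_value(uint32_t b)
{
	uint64_t val = (1ULL << 32) + (b - 1);
	return (uint32_t)(val / b);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	/* a few plausible object sizes (illustrative, not a kernel list) */
	uint32_t sizes[] = { 32, 96, 192, 256, 1000, 4096 };
	unsigned int i;
	uint32_t idx;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		uint32_t buffer_size = sizes[i];
		uint32_t r = reciprocal_value(buffer_size);

		/* offsets are always idx * buffer_size, as in obj_to_index() */
		for (idx = 0; idx < 10000; idx++) {
			uint32_t offset = idx * buffer_size;
			assert(reciprocal_divide(offset, r) == offset / buffer_size);
		}
	}
	printf("reciprocal divide matches plain divide for multiples of B\n");
	return 0;
}

The point of the precomputation is that the per-cache reciprocal is calculated once, while obj_to_index() runs on every alloc/free debug check.
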
@@ -945,7 +955,8 @@ static void __devinit start_cpu_timer(int cpu)
945 if (keventd_up() && reap_work->work.func == NULL) { 955 if (keventd_up() && reap_work->work.func == NULL) {
946 init_reap_node(cpu); 956 init_reap_node(cpu);
947 INIT_DELAYED_WORK(reap_work, cache_reap); 957 INIT_DELAYED_WORK(reap_work, cache_reap);
948 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); 958 schedule_delayed_work_on(cpu, reap_work,
959 __round_jiffies_relative(HZ, cpu));
949 } 960 }
950} 961}
951 962
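
start_cpu_timer() (and cache_reap() later in this file) now round their delays with __round_jiffies_relative()/round_jiffies_relative(), so the per-cpu reap timers tend to expire near the same whole-second boundary, with a small per-cpu skew, instead of waking the CPU at arbitrary offsets. A simplified userspace model of that rounding; the real kernel helpers differ in detail, and HZ and the skew factor here are assumptions for illustration:

#include <stdio.h>

#define HZ 250	/* illustrative tick rate */

/*
 * Round an absolute expiry (in jiffies) to a whole-second boundary,
 * skewed slightly per cpu so that all cpus do not fire on the exact
 * same tick.  Simplified model of what __round_jiffies() does.
 */
static unsigned long model_round_jiffies(unsigned long j, int cpu)
{
	unsigned long skew = cpu * 3;
	unsigned long rem;

	j += skew;
	rem = j % HZ;
	if (rem < HZ / 4)
		j -= rem;		/* round down to the second */
	else
		j += HZ - rem;		/* round up to the next second */
	return j - skew;
}

static unsigned long model_round_jiffies_relative(unsigned long delay,
						  unsigned long now, int cpu)
{
	return model_round_jiffies(now + delay, cpu) - now;
}

int main(void)
{
	unsigned long now = 100003;	/* arbitrary current jiffies value */
	int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%d: raw delay %d -> rounded delay %lu\n",
		       cpu, HZ + 3 * cpu,
		       model_round_jiffies_relative(HZ + 3 * cpu, now, cpu));
	return 0;
}

Batching the expirations this way lets an otherwise idle system stay in deeper sleep states between the grouped wakeups.
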
@@ -1425,6 +1436,8 @@ void __init kmem_cache_init(void)
1425 1436
1426 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1437 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1427 cache_line_size()); 1438 cache_line_size());
1439 cache_cache.reciprocal_buffer_size =
1440 reciprocal_value(cache_cache.buffer_size);
1428 1441
1429 for (order = 0; order < MAX_ORDER; order++) { 1442 for (order = 0; order < MAX_ORDER; order++) {
1430 cache_estimate(order, cache_cache.buffer_size, 1443 cache_estimate(order, cache_cache.buffer_size,
@@ -2311,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2311 if (flags & SLAB_CACHE_DMA) 2324 if (flags & SLAB_CACHE_DMA)
2312 cachep->gfpflags |= GFP_DMA; 2325 cachep->gfpflags |= GFP_DMA;
2313 cachep->buffer_size = size; 2326 cachep->buffer_size = size;
2327 cachep->reciprocal_buffer_size = reciprocal_value(size);
2314 2328
2315 if (flags & CFLGS_OFF_SLAB) { 2329 if (flags & CFLGS_OFF_SLAB) {
2316 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2330 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -3088,12 +3102,89 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3088#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3102#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3089#endif 3103#endif
3090 3104
3105#ifdef CONFIG_FAILSLAB
3106
3107static struct failslab_attr {
3108
3109 struct fault_attr attr;
3110
3111 u32 ignore_gfp_wait;
3112#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3113 struct dentry *ignore_gfp_wait_file;
3114#endif
3115
3116} failslab = {
3117 .attr = FAULT_ATTR_INITIALIZER,
3118 .ignore_gfp_wait = 1,
3119};
3120
3121static int __init setup_failslab(char *str)
3122{
3123 return setup_fault_attr(&failslab.attr, str);
3124}
3125__setup("failslab=", setup_failslab);
3126
3127static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3128{
3129 if (cachep == &cache_cache)
3130 return 0;
3131 if (flags & __GFP_NOFAIL)
3132 return 0;
3133 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3134 return 0;
3135
3136 return should_fail(&failslab.attr, obj_size(cachep));
3137}
3138
3139#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3140
3141static int __init failslab_debugfs(void)
3142{
3143 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3144 struct dentry *dir;
3145 int err;
3146
3147 err = init_fault_attr_dentries(&failslab.attr, "failslab");
3148 if (err)
3149 return err;
3150 dir = failslab.attr.dentries.dir;
3151
3152 failslab.ignore_gfp_wait_file =
3153 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3154 &failslab.ignore_gfp_wait);
3155
3156 if (!failslab.ignore_gfp_wait_file) {
3157 err = -ENOMEM;
3158 debugfs_remove(failslab.ignore_gfp_wait_file);
3159 cleanup_fault_attr_dentries(&failslab.attr);
3160 }
3161
3162 return err;
3163}
3164
3165late_initcall(failslab_debugfs);
3166
3167#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3168
3169#else /* CONFIG_FAILSLAB */
3170
3171static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3172{
3173 return 0;
3174}
3175
3176#endif /* CONFIG_FAILSLAB */
3177
3091static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3178static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3092{ 3179{
3093 void *objp; 3180 void *objp;
3094 struct array_cache *ac; 3181 struct array_cache *ac;
3095 3182
3096 check_irq_off(); 3183 check_irq_off();
3184
3185 if (should_failslab(cachep, flags))
3186 return NULL;
3187
3097 ac = cpu_cache_get(cachep); 3188 ac = cpu_cache_get(cachep);
3098 if (likely(ac->avail)) { 3189 if (likely(ac->avail)) {
3099 STATS_INC_ALLOCHIT(cachep); 3190 STATS_INC_ALLOCHIT(cachep);
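
The CONFIG_FAILSLAB block above hooks a generic fault_attr into ____cache_alloc(): should_failslab() consults tunable probability/interval/times knobs and occasionally forces an allocation to fail so that callers' error paths get exercised. A toy userspace version of the same gate; the fields mimic the kernel's fault_attr, but this struct, the random source, and checked_malloc() are my own simplification:

#include <stdio.h>
#include <stdlib.h>

/* simplified stand-in for the kernel's struct fault_attr */
struct fault_attr {
	unsigned long probability;	/* percent of eligible calls to fail */
	unsigned long interval;		/* only every Nth call is eligible */
	long times;			/* failures left to inject; -1 = unlimited */
	unsigned long count;		/* calls seen so far */
};

static struct fault_attr failslab = {
	.probability = 10,
	.interval = 1,
	.times = 5,
};

static int should_fail(struct fault_attr *attr, size_t size)
{
	(void)size;

	attr->count++;
	if (attr->times == 0)
		return 0;
	if (attr->interval > 1 && attr->count % attr->interval)
		return 0;
	if ((unsigned long)(rand() % 100) >= attr->probability)
		return 0;
	if (attr->times > 0)
		attr->times--;
	return 1;
}

/* allocation wrapper with the injection hook, analogous to ____cache_alloc() */
static void *checked_malloc(size_t size)
{
	if (should_fail(&failslab, size))
		return NULL;	/* pretend the allocator ran out of memory */
	return malloc(size);
}

int main(void)
{
	int i, failures = 0;

	srand(42);
	for (i = 0; i < 1000; i++) {
		void *p = checked_malloc(128);
		if (!p)
			failures++;
		free(p);
	}
	printf("injected %d failures out of 1000 allocations\n", failures);
	return 0;
}

In the kernel the same knobs are driven from the failslab= boot parameter parsed by setup_failslab() above, or from the debugfs entries created when CONFIG_FAULT_INJECTION_DEBUG_FS is enabled.
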
@@ -3173,6 +3264,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3173 struct zone **z; 3264 struct zone **z;
3174 void *obj = NULL; 3265 void *obj = NULL;
3175 int nid; 3266 int nid;
3267 gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3176 3268
3177retry: 3269retry:
3178 /* 3270 /*
@@ -3182,21 +3274,26 @@ retry:
3182 for (z = zonelist->zones; *z && !obj; z++) { 3274 for (z = zonelist->zones; *z && !obj; z++) {
3183 nid = zone_to_nid(*z); 3275 nid = zone_to_nid(*z);
3184 3276
3185 if (cpuset_zone_allowed(*z, flags) && 3277 if (cpuset_zone_allowed_hardwall(*z, flags) &&
3186 cache->nodelists[nid] && 3278 cache->nodelists[nid] &&
3187 cache->nodelists[nid]->free_objects) 3279 cache->nodelists[nid]->free_objects)
3188 obj = ____cache_alloc_node(cache, 3280 obj = ____cache_alloc_node(cache,
3189 flags | GFP_THISNODE, nid); 3281 flags | GFP_THISNODE, nid);
3190 } 3282 }
3191 3283
3192 if (!obj) { 3284 if (!obj && !(flags & __GFP_NO_GROW)) {
3193 /* 3285 /*
3194 * This allocation will be performed within the constraints 3286 * This allocation will be performed within the constraints
3195 * of the current cpuset / memory policy requirements. 3287 * of the current cpuset / memory policy requirements.
3196 * We may trigger various forms of reclaim on the allowed 3288 * We may trigger various forms of reclaim on the allowed
3197 * set and go into memory reserves if necessary. 3289 * set and go into memory reserves if necessary.
3198 */ 3290 */
3291 if (local_flags & __GFP_WAIT)
3292 local_irq_enable();
3293 kmem_flagcheck(cache, flags);
3199 obj = kmem_getpages(cache, flags, -1); 3294 obj = kmem_getpages(cache, flags, -1);
3295 if (local_flags & __GFP_WAIT)
3296 local_irq_disable();
3200 if (obj) { 3297 if (obj) {
3201 /* 3298 /*
3202 * Insert into the appropriate per node queues 3299 * Insert into the appropriate per node queues
@@ -3213,7 +3310,7 @@ retry:
3213 */ 3310 */
3214 goto retry; 3311 goto retry;
3215 } else { 3312 } else {
3216 kmem_freepages(cache, obj); 3313 /* cache_grow already freed obj */
3217 obj = NULL; 3314 obj = NULL;
3218 } 3315 }
3219 } 3316 }
@@ -3456,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
3456 * 3553 *
3457 * Currently only used for dentry validation. 3554 * Currently only used for dentry validation.
3458 */ 3555 */
3459int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) 3556int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3460{ 3557{
3461 unsigned long addr = (unsigned long)ptr; 3558 unsigned long addr = (unsigned long)ptr;
3462 unsigned long min_addr = PAGE_OFFSET; 3559 unsigned long min_addr = PAGE_OFFSET;
@@ -3490,6 +3587,7 @@ out:
3490 * @cachep: The cache to allocate from. 3587 * @cachep: The cache to allocate from.
3491 * @flags: See kmalloc(). 3588 * @flags: See kmalloc().
3492 * @nodeid: node number of the target node. 3589 * @nodeid: node number of the target node.
3590 * @caller: return address of caller, used for debug information
3493 * 3591 *
3494 * Identical to kmem_cache_alloc but it will allocate memory on the given 3592 * Identical to kmem_cache_alloc but it will allocate memory on the given
3495 * node, which can improve the performance for cpu bound structures. 3593 * node, which can improve the performance for cpu bound structures.
@@ -3928,7 +4026,7 @@ static void cache_reap(struct work_struct *unused)
3928 if (!mutex_trylock(&cache_chain_mutex)) { 4026 if (!mutex_trylock(&cache_chain_mutex)) {
3929 /* Give up. Setup the next iteration. */ 4027 /* Give up. Setup the next iteration. */
3930 schedule_delayed_work(&__get_cpu_var(reap_work), 4028 schedule_delayed_work(&__get_cpu_var(reap_work),
3931 REAPTIMEOUT_CPUC); 4029 round_jiffies_relative(REAPTIMEOUT_CPUC));
3932 return; 4030 return;
3933 } 4031 }
3934 4032
@@ -3974,7 +4072,8 @@ next:
3974 next_reap_node(); 4072 next_reap_node();
3975 refresh_cpu_vm_stats(smp_processor_id()); 4073 refresh_cpu_vm_stats(smp_processor_id());
3976 /* Set up the next iteration */ 4074 /* Set up the next iteration */
3977 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 4075 schedule_delayed_work(&__get_cpu_var(reap_work),
4076 round_jiffies_relative(REAPTIMEOUT_CPUC));
3978} 4077}
3979 4078
3980#ifdef CONFIG_PROC_FS 4079#ifdef CONFIG_PROC_FS
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a58..5adc29cb58dd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
60static DEFINE_SPINLOCK(block_lock); 60static DEFINE_SPINLOCK(block_lock);
61 61
62static void slob_free(void *b, int size); 62static void slob_free(void *b, int size);
63static void slob_timer_cbk(void);
64
63 65
64static void *slob_alloc(size_t size, gfp_t gfp, int align) 66static void *slob_alloc(size_t size, gfp_t gfp, int align)
65{ 67{
@@ -157,7 +159,7 @@ static int fastcall find_order(int size)
157 return order; 159 return order;
158} 160}
159 161
160void *kmalloc(size_t size, gfp_t gfp) 162void *__kmalloc(size_t size, gfp_t gfp)
161{ 163{
162 slob_t *m; 164 slob_t *m;
163 bigblock_t *bb; 165 bigblock_t *bb;
@@ -186,8 +188,7 @@ void *kmalloc(size_t size, gfp_t gfp)
186 slob_free(bb, sizeof(bigblock_t)); 188 slob_free(bb, sizeof(bigblock_t));
187 return 0; 189 return 0;
188} 190}
189 191EXPORT_SYMBOL(__kmalloc);
190EXPORT_SYMBOL(kmalloc);
191 192
192void kfree(const void *block) 193void kfree(const void *block)
193{ 194{
@@ -327,9 +328,25 @@ const char *kmem_cache_name(struct kmem_cache *c)
327EXPORT_SYMBOL(kmem_cache_name); 328EXPORT_SYMBOL(kmem_cache_name);
328 329
329static struct timer_list slob_timer = TIMER_INITIALIZER( 330static struct timer_list slob_timer = TIMER_INITIALIZER(
330 (void (*)(unsigned long))kmem_cache_init, 0, 0); 331 (void (*)(unsigned long))slob_timer_cbk, 0, 0);
332
333int kmem_cache_shrink(struct kmem_cache *d)
334{
335 return 0;
336}
337EXPORT_SYMBOL(kmem_cache_shrink);
338
339int kmem_ptr_validate(struct kmem_cache *a, const void *b)
340{
341 return 0;
342}
343
344void __init kmem_cache_init(void)
345{
346 slob_timer_cbk();
347}
331 348
332void kmem_cache_init(void) 349static void slob_timer_cbk(void)
333{ 350{
334 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); 351 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
335 352
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c5431072f422..a2d9bb4e80df 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -434,7 +434,7 @@ void free_swap_and_cache(swp_entry_t entry)
434 * 434 *
435 * This is needed for the suspend to disk (aka swsusp). 435 * This is needed for the suspend to disk (aka swsusp).
436 */ 436 */
437int swap_type_of(dev_t device, sector_t offset) 437int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
438{ 438{
439 struct block_device *bdev = NULL; 439 struct block_device *bdev = NULL;
440 int i; 440 int i;
@@ -450,6 +450,9 @@ int swap_type_of(dev_t device, sector_t offset)
450 continue; 450 continue;
451 451
452 if (!bdev) { 452 if (!bdev) {
453 if (bdev_p)
454 *bdev_p = sis->bdev;
455
453 spin_unlock(&swap_lock); 456 spin_unlock(&swap_lock);
454 return i; 457 return i;
455 } 458 }
@@ -459,6 +462,9 @@ int swap_type_of(dev_t device, sector_t offset)
459 se = list_entry(sis->extent_list.next, 462 se = list_entry(sis->extent_list.next,
460 struct swap_extent, list); 463 struct swap_extent, list);
461 if (se->start_block == offset) { 464 if (se->start_block == offset) {
465 if (bdev_p)
466 *bdev_p = sis->bdev;
467
462 spin_unlock(&swap_lock); 468 spin_unlock(&swap_lock);
463 bdput(bdev); 469 bdput(bdev);
464 return i; 470 return i;
@@ -1357,10 +1363,10 @@ static int swap_show(struct seq_file *swap, void *v)
1357 } 1363 }
1358 1364
1359 file = ptr->swap_file; 1365 file = ptr->swap_file;
1360 len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\"); 1366 len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
1361 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1367 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1362 len < 40 ? 40 - len : 1, " ", 1368 len < 40 ? 40 - len : 1, " ",
1363 S_ISBLK(file->f_dentry->d_inode->i_mode) ? 1369 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1364 "partition" : "file\t", 1370 "partition" : "file\t",
1365 ptr->pages << (PAGE_SHIFT - 10), 1371 ptr->pages << (PAGE_SHIFT - 10),
1366 ptr->inuse_pages << (PAGE_SHIFT - 10), 1372 ptr->inuse_pages << (PAGE_SHIFT - 10),
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 5f2cbf0f153c..c7f6e1914bc4 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -79,8 +79,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
79 d_instantiate(dentry, inode); 79 d_instantiate(dentry, inode);
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 81
82 file->f_vfsmnt = mntget(shm_mnt); 82 file->f_path.mnt = mntget(shm_mnt);
83 file->f_dentry = dentry; 83 file->f_path.dentry = dentry;
84 file->f_mapping = inode->i_mapping; 84 file->f_mapping = inode->i_mapping;
85 file->f_op = &ramfs_file_operations; 85 file->f_op = &ramfs_file_operations;
86 file->f_mode = FMODE_WRITE | FMODE_READ; 86 file->f_mode = FMODE_WRITE | FMODE_READ;
diff --git a/mm/truncate.c b/mm/truncate.c
index e07b1e682c38..5df947de7654 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/task_io_accounting_ops.h>
16#include <linux/buffer_head.h> /* grr. try_to_release_page, 17#include <linux/buffer_head.h> /* grr. try_to_release_page,
17 do_invalidatepage */ 18 do_invalidatepage */
18 19
@@ -51,6 +52,33 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
51} 52}
52 53
53/* 54/*
55 * This cancels just the dirty bit on the kernel page itself, it
56 * does NOT actually remove dirty bits on any mmap's that may be
57 * around. It also leaves the page tagged dirty, so any sync
58 * activity will still find it on the dirty lists, and in particular,
59 * clear_page_dirty_for_io() will still look at the dirty bits in
60 * the VM.
61 *
62 * Doing this should *normally* only ever be done when a page
63 * is truncated, and is not actually mapped anywhere at all. However,
64 * fs/buffer.c does this when it notices that somebody has cleaned
65 * out all the buffers on a page without actually doing it through
66 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
67 */
68void cancel_dirty_page(struct page *page, unsigned int account_size)
69{
70 if (TestClearPageDirty(page)) {
71 struct address_space *mapping = page->mapping;
72 if (mapping && mapping_cap_account_dirty(mapping)) {
73 dec_zone_page_state(page, NR_FILE_DIRTY);
74 if (account_size)
75 task_io_account_cancelled_write(account_size);
76 }
77 }
78}
79EXPORT_SYMBOL(cancel_dirty_page);
80
81/*
54 * If truncate cannot remove the fs-private metadata from the page, the page 82 * If truncate cannot remove the fs-private metadata from the page, the page
55 * becomes anonymous. It will be left on the LRU and may even be mapped into 83 * becomes anonymous. It will be left on the LRU and may even be mapped into
56 * user pagetables if we're racing with filemap_nopage(). 84 * user pagetables if we're racing with filemap_nopage().
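
A toy userspace model of the bookkeeping cancel_dirty_page() describes: clear the page's own dirty bit and back out the dirty-page and per-task write accounting, without touching any mappings or writing anything back. Names follow the kernel's, but the structures are my own simplification and ignore the radix-tree dirty tag the comment mentions:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_CACHE_SIZE 4096UL

/* toy page and counters; the real kernel uses page flags and zone counters */
struct toy_page {
	bool dirty;
	bool accounted;		/* mapping_cap_account_dirty() analog */
};

static unsigned long nr_file_dirty;		/* NR_FILE_DIRTY analog */
static unsigned long cancelled_write_bytes;	/* per-task accounting analog */

static void set_page_dirty(struct toy_page *page)
{
	if (!page->dirty) {
		page->dirty = true;
		if (page->accounted)
			nr_file_dirty++;
	}
}

/* analog of cancel_dirty_page(): undo the dirty accounting, write nothing */
static void cancel_dirty_page(struct toy_page *page, unsigned long account_size)
{
	if (page->dirty) {
		page->dirty = false;
		if (page->accounted) {
			nr_file_dirty--;
			if (account_size)
				cancelled_write_bytes += account_size;
		}
	}
}

int main(void)
{
	struct toy_page page = { .accounted = true };

	set_page_dirty(&page);
	printf("dirty pages before truncate: %lu\n", nr_file_dirty);

	/* truncation throws the data away, so the pending write is cancelled */
	cancel_dirty_page(&page, PAGE_CACHE_SIZE);
	printf("dirty pages after truncate: %lu, cancelled bytes: %lu\n",
	       nr_file_dirty, cancelled_write_bytes);
	return 0;
}

The next hunk shows the intended caller: truncate_complete_page() cancels the dirty state up front instead of calling clear_page_dirty() after invalidating the page's private data.
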
@@ -66,10 +94,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
66 if (page->mapping != mapping) 94 if (page->mapping != mapping)
67 return; 95 return;
68 96
97 cancel_dirty_page(page, PAGE_CACHE_SIZE);
98
69 if (PagePrivate(page)) 99 if (PagePrivate(page))
70 do_invalidatepage(page, 0); 100 do_invalidatepage(page, 0);
71 101
72 clear_page_dirty(page);
73 ClearPageUptodate(page); 102 ClearPageUptodate(page);
74 ClearPageMappedToDisk(page); 103 ClearPageMappedToDisk(page);
75 remove_from_page_cache(page); 104 remove_from_page_cache(page);
@@ -319,6 +348,15 @@ failed:
319 return 0; 348 return 0;
320} 349}
321 350
351static int do_launder_page(struct address_space *mapping, struct page *page)
352{
353 if (!PageDirty(page))
354 return 0;
355 if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
356 return 0;
357 return mapping->a_ops->launder_page(page);
358}
359
322/** 360/**
323 * invalidate_inode_pages2_range - remove range of pages from an address_space 361 * invalidate_inode_pages2_range - remove range of pages from an address_space
324 * @mapping: the address_space 362 * @mapping: the address_space
@@ -348,7 +386,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
348 for (i = 0; !ret && i < pagevec_count(&pvec); i++) { 386 for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
349 struct page *page = pvec.pages[i]; 387 struct page *page = pvec.pages[i];
350 pgoff_t page_index; 388 pgoff_t page_index;
351 int was_dirty;
352 389
353 lock_page(page); 390 lock_page(page);
354 if (page->mapping != mapping) { 391 if (page->mapping != mapping) {
@@ -384,18 +421,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
384 PAGE_CACHE_SIZE, 0); 421 PAGE_CACHE_SIZE, 0);
385 } 422 }
386 } 423 }
387 was_dirty = test_clear_page_dirty(page); 424 ret = do_launder_page(mapping, page);
388 if (!invalidate_complete_page2(mapping, page)) { 425 if (ret == 0 && !invalidate_complete_page2(mapping, page))
389 if (was_dirty)
390 set_page_dirty(page);
391 ret = -EIO; 426 ret = -EIO;
392 }
393 unlock_page(page); 427 unlock_page(page);
394 } 428 }
395 pagevec_release(&pvec); 429 pagevec_release(&pvec);
396 cond_resched(); 430 cond_resched();
397 } 431 }
398 WARN_ON_ONCE(ret);
399 return ret; 432 return ret;
400} 433}
401EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 434EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 093f5fe6dd77..7430df68cb64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -692,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
692 __count_vm_events(KSWAPD_STEAL, nr_freed); 692 __count_vm_events(KSWAPD_STEAL, nr_freed);
693 } else 693 } else
694 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 694 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
695 __count_vm_events(PGACTIVATE, nr_freed); 695 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
696 696
697 if (nr_taken == 0) 697 if (nr_taken == 0)
698 goto done; 698 goto done;
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
984 if (!populated_zone(zone)) 984 if (!populated_zone(zone))
985 continue; 985 continue;
986 986
987 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 987 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
988 continue; 988 continue;
989 989
990 note_zone_scanning_priority(zone, priority); 990 note_zone_scanning_priority(zone, priority);
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1034 for (i = 0; zones[i] != NULL; i++) { 1034 for (i = 0; zones[i] != NULL; i++) {
1035 struct zone *zone = zones[i]; 1035 struct zone *zone = zones[i];
1036 1036
1037 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1037 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1038 continue; 1038 continue;
1039 1039
1040 lru_pages += zone->nr_active + zone->nr_inactive; 1040 lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1089,7 +1089,7 @@ out:
1089 for (i = 0; zones[i] != 0; i++) { 1089 for (i = 0; zones[i] != 0; i++) {
1090 struct zone *zone = zones[i]; 1090 struct zone *zone = zones[i];
1091 1091
1092 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1092 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1093 continue; 1093 continue;
1094 1094
1095 zone->prev_priority = priority; 1095 zone->prev_priority = priority;
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1354 return; 1354 return;
1355 if (pgdat->kswapd_max_order < order) 1355 if (pgdat->kswapd_max_order < order)
1356 pgdat->kswapd_max_order = order; 1356 pgdat->kswapd_max_order = order;
1357 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1357 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1358 return; 1358 return;
1359 if (!waitqueue_active(&pgdat->kswapd_wait)) 1359 if (!waitqueue_active(&pgdat->kswapd_wait))
1360 return; 1360 return;
@@ -1369,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
1369 * 1369 *
1370 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1370 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1371 */ 1371 */
1372static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, 1372static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1373 int prio, struct scan_control *sc) 1373 int pass, struct scan_control *sc)
1374{ 1374{
1375 struct zone *zone; 1375 struct zone *zone;
1376 unsigned long nr_to_scan, ret = 0; 1376 unsigned long nr_to_scan, ret = 0;
@@ -1406,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1406 return ret; 1406 return ret;
1407} 1407}
1408 1408
1409static unsigned long count_lru_pages(void)
1410{
1411 struct zone *zone;
1412 unsigned long ret = 0;
1413
1414 for_each_zone(zone)
1415 ret += zone->nr_active + zone->nr_inactive;
1416 return ret;
1417}
1418
1409/* 1419/*
1410 * Try to free `nr_pages' of memory, system-wide, and return the number of 1420 * Try to free `nr_pages' of memory, system-wide, and return the number of
1411 * freed pages. 1421 * freed pages.
@@ -1420,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1420 unsigned long ret = 0; 1430 unsigned long ret = 0;
1421 int pass; 1431 int pass;
1422 struct reclaim_state reclaim_state; 1432 struct reclaim_state reclaim_state;
1423 struct zone *zone;
1424 struct scan_control sc = { 1433 struct scan_control sc = {
1425 .gfp_mask = GFP_KERNEL, 1434 .gfp_mask = GFP_KERNEL,
1426 .may_swap = 0, 1435 .may_swap = 0,
@@ -1431,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1431 1440
1432 current->reclaim_state = &reclaim_state; 1441 current->reclaim_state = &reclaim_state;
1433 1442
1434 lru_pages = 0; 1443 lru_pages = count_lru_pages();
1435 for_each_zone(zone)
1436 lru_pages += zone->nr_active + zone->nr_inactive;
1437
1438 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 1444 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1439 /* If slab caches are huge, it's better to hit them first */ 1445 /* If slab caches are huge, it's better to hit them first */
1440 while (nr_slab >= lru_pages) { 1446 while (nr_slab >= lru_pages) {
@@ -1461,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1461 for (pass = 0; pass < 5; pass++) { 1467 for (pass = 0; pass < 5; pass++) {
1462 int prio; 1468 int prio;
1463 1469
1464 /* Needed for shrinking slab caches later on */
1465 if (!lru_pages)
1466 for_each_zone(zone) {
1467 lru_pages += zone->nr_active;
1468 lru_pages += zone->nr_inactive;
1469 }
1470
1471 /* Force reclaiming mapped pages in the passes #3 and #4 */ 1470 /* Force reclaiming mapped pages in the passes #3 and #4 */
1472 if (pass > 2) { 1471 if (pass > 2) {
1473 sc.may_swap = 1; 1472 sc.may_swap = 1;
@@ -1483,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1483 goto out; 1482 goto out;
1484 1483
1485 reclaim_state.reclaimed_slab = 0; 1484 reclaim_state.reclaimed_slab = 0;
1486 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); 1485 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1486 count_lru_pages());
1487 ret += reclaim_state.reclaimed_slab; 1487 ret += reclaim_state.reclaimed_slab;
1488 if (ret >= nr_pages) 1488 if (ret >= nr_pages)
1489 goto out; 1489 goto out;
@@ -1491,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1491 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1491 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1492 congestion_wait(WRITE, HZ / 10); 1492 congestion_wait(WRITE, HZ / 10);
1493 } 1493 }
1494
1495 lru_pages = 0;
1496 } 1494 }
1497 1495
1498 /* 1496 /*
1499 * If ret = 0, we could not shrink LRUs, but there may be something 1497 * If ret = 0, we could not shrink LRUs, but there may be something
1500 * in slab caches 1498 * in slab caches
1501 */ 1499 */
1502 if (!ret) 1500 if (!ret) {
1503 do { 1501 do {
1504 reclaim_state.reclaimed_slab = 0; 1502 reclaim_state.reclaimed_slab = 0;
1505 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1503 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
1506 ret += reclaim_state.reclaimed_slab; 1504 ret += reclaim_state.reclaimed_slab;
1507 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1505 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1506 }
1508 1507
1509out: 1508out:
1510 current->reclaim_state = NULL; 1509 current->reclaim_state = NULL;