aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Nick Piggin <npiggin@suse.de> 2007-10-16 04:24:40 -0400
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-16 12:42:53 -0400
commit 557ed1fa2620dc119adb86b34c614e152a629a80 (patch)
tree d00b31a7f197583c2bd8fffa1fd135fbbb5d6abc
parent aadb4bc4a1f9108c1d0fbd121827c936c2ed4217 (diff)
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note: "A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and count towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue." And indeed this cacheline bouncing has shown up on large SGI systems. There was a situation where an Altix system was essentially livelocked tearing down ZERO_PAGE pagetables when an HPC app aborted during startup. This situation can be avoided in userspace, but it does highlight the potential scalability problem with refcounting ZERO_PAGE, and corner cases where it can really hurt (we don't want the system to livelock!). There are several broad ways to fix this problem: 1. add back some special casing to avoid refcounting ZERO_PAGE; 2. per-node or per-cpu ZERO_PAGES; 3. remove the ZERO_PAGE completely. I will argue for 3. The others should also fix the problem, but they result in more complex code than does 3, with little or no real benefit that I can see. Why? Inserting a ZERO_PAGE for anonymous read faults appears to be a false optimisation: if an application is performance critical, it would not be doing many read faults of new memory, or at least it could be expected to write to that memory soon afterwards. If cache or memory use is critical, it should not be working with a significant number of ZERO_PAGEs anyway (a more compact representation of zeroes should be used). As a sanity check -- measuring on my desktop system, there are never many mappings to the ZERO_PAGE (eg. 2 or 3), thus memory usage here should not increase much without it. When running a make -j4 kernel compile on my dual core system, there are about 1,000 mappings to the ZERO_PAGE created per second, but also about 1,000 ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second is torn down without being COWed). 
So removing ZERO_PAGE will save 1,000 page faults per second when running kbuild, while keeping it only saves less than 1 page clearing operation per second. 1 page clear is cheaper than a thousand faults, presumably, so there isn't an obvious loss. Neither the logical argument nor these basic tests give a guarantee of no regressions. However, this is a reasonable opportunity to try to remove the ZERO_PAGE from the pagefault path. If it is found to cause regressions, we can reintroduce it and just avoid refcounting it. The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see much use to them except on benchmarks. All other users of ZERO_PAGE are converted just to use ZERO_PAGE(0) for simplicity. We can look at replacing them all and maybe ripping out ZERO_PAGE completely when we are more satisfied with this solution. Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- drivers/char/mem.c 125
-rw-r--r-- fs/binfmt_elf.c 2
-rw-r--r-- fs/binfmt_elf_fdpic.c 2
-rw-r--r-- fs/direct-io.c 4
-rw-r--r-- include/linux/mm.h 2
-rw-r--r-- mm/memory.c 151
6 files changed, 42 insertions, 244 deletions
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index bbee97ff355f..64551ab6be03 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -625,65 +625,10 @@ static ssize_t splice_write_null(struct pipe_inode_info *pipe,struct file *out,
625 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null); 625 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
626} 626}
627 627
628#ifdef CONFIG_MMU
629/*
630 * For fun, we are using the MMU for this.
631 */
632static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
633{
634 struct mm_struct *mm;
635 struct vm_area_struct * vma;
636 unsigned long addr=(unsigned long)buf;
637
638 mm = current->mm;
639 /* Oops, this was forgotten before. -ben */
640 down_read(&mm->mmap_sem);
641
642 /* For private mappings, just map in zero pages. */
643 for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
644 unsigned long count;
645
646 if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
647 goto out_up;
648 if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
649 break;
650 count = vma->vm_end - addr;
651 if (count > size)
652 count = size;
653
654 zap_page_range(vma, addr, count, NULL);
655 if (zeromap_page_range(vma, addr, count, PAGE_COPY))
656 break;
657
658 size -= count;
659 buf += count;
660 addr += count;
661 if (size == 0)
662 goto out_up;
663 }
664
665 up_read(&mm->mmap_sem);
666
667 /* The shared case is hard. Let's do the conventional zeroing. */
668 do {
669 unsigned long unwritten = clear_user(buf, PAGE_SIZE);
670 if (unwritten)
671 return size + unwritten - PAGE_SIZE;
672 cond_resched();
673 buf += PAGE_SIZE;
674 size -= PAGE_SIZE;
675 } while (size);
676
677 return size;
678out_up:
679 up_read(&mm->mmap_sem);
680 return size;
681}
682
683static ssize_t read_zero(struct file * file, char __user * buf, 628static ssize_t read_zero(struct file * file, char __user * buf,
684 size_t count, loff_t *ppos) 629 size_t count, loff_t *ppos)
685{ 630{
686 unsigned long left, unwritten, written = 0; 631 size_t written;
687 632
688 if (!count) 633 if (!count)
689 return 0; 634 return 0;
@@ -691,69 +636,33 @@ static ssize_t read_zero(struct file * file, char __user * buf,
691 if (!access_ok(VERIFY_WRITE, buf, count)) 636 if (!access_ok(VERIFY_WRITE, buf, count))
692 return -EFAULT; 637 return -EFAULT;
693 638
694 left = count; 639 written = 0;
695 640 while (count) {
696 /* do we want to be clever? Arbitrary cut-off */ 641 unsigned long unwritten;
697 if (count >= PAGE_SIZE*4) { 642 size_t chunk = count;
698 unsigned long partial;
699 643
700 /* How much left of the page? */ 644 if (chunk > PAGE_SIZE)
701 partial = (PAGE_SIZE-1) & -(unsigned long) buf; 645 chunk = PAGE_SIZE; /* Just for latency reasons */
702 unwritten = clear_user(buf, partial); 646 unwritten = clear_user(buf, chunk);
703 written = partial - unwritten; 647 written += chunk - unwritten;
704 if (unwritten)
705 goto out;
706 left -= partial;
707 buf += partial;
708 unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
709 written += (left & PAGE_MASK) - unwritten;
710 if (unwritten) 648 if (unwritten)
711 goto out; 649 break;
712 buf += left & PAGE_MASK;
713 left &= ~PAGE_MASK;
714 }
715 unwritten = clear_user(buf, left);
716 written += left - unwritten;
717out:
718 return written ? written : -EFAULT;
719}
720
721static int mmap_zero(struct file * file, struct vm_area_struct * vma)
722{
723 int err;
724
725 if (vma->vm_flags & VM_SHARED)
726 return shmem_zero_setup(vma);
727 err = zeromap_page_range(vma, vma->vm_start,
728 vma->vm_end - vma->vm_start, vma->vm_page_prot);
729 BUG_ON(err == -EEXIST);
730 return err;
731}
732#else /* CONFIG_MMU */
733static ssize_t read_zero(struct file * file, char * buf,
734 size_t count, loff_t *ppos)
735{
736 size_t todo = count;
737
738 while (todo) {
739 size_t chunk = todo;
740
741 if (chunk > 4096)
742 chunk = 4096; /* Just for latency reasons */
743 if (clear_user(buf, chunk))
744 return -EFAULT;
745 buf += chunk; 650 buf += chunk;
746 todo -= chunk; 651 count -= chunk;
747 cond_resched(); 652 cond_resched();
748 } 653 }
749 return count; 654 return written ? written : -EFAULT;
750} 655}
751 656
752static int mmap_zero(struct file * file, struct vm_area_struct * vma) 657static int mmap_zero(struct file * file, struct vm_area_struct * vma)
753{ 658{
659#ifndef CONFIG_MMU
754 return -ENOSYS; 660 return -ENOSYS;
661#endif
662 if (vma->vm_flags & VM_SHARED)
663 return shmem_zero_setup(vma);
664 return 0;
755} 665}
756#endif /* CONFIG_MMU */
757 666
758static ssize_t write_full(struct file * file, const char __user * buf, 667static ssize_t write_full(struct file * file, const char __user * buf,
759 size_t count, loff_t *ppos) 668 size_t count, loff_t *ppos)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b1013f34085d..f3037c645ca9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1725,7 +1725,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1725 &page, &vma) <= 0) { 1725 &page, &vma) <= 0) {
1726 DUMP_SEEK(PAGE_SIZE); 1726 DUMP_SEEK(PAGE_SIZE);
1727 } else { 1727 } else {
1728 if (page == ZERO_PAGE(addr)) { 1728 if (page == ZERO_PAGE(0)) {
1729 if (!dump_seek(file, PAGE_SIZE)) { 1729 if (!dump_seek(file, PAGE_SIZE)) {
1730 page_cache_release(page); 1730 page_cache_release(page);
1731 goto end_coredump; 1731 goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2f5d8dbe676d..c5ca2f0aca7f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1488,7 +1488,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1488 &page, &vma) <= 0) { 1488 &page, &vma) <= 0) {
1489 DUMP_SEEK(file->f_pos + PAGE_SIZE); 1489 DUMP_SEEK(file->f_pos + PAGE_SIZE);
1490 } 1490 }
1491 else if (page == ZERO_PAGE(addr)) { 1491 else if (page == ZERO_PAGE(0)) {
1492 page_cache_release(page); 1492 page_cache_release(page);
1493 DUMP_SEEK(file->f_pos + PAGE_SIZE); 1493 DUMP_SEEK(file->f_pos + PAGE_SIZE);
1494 } 1494 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b5928a7b6a5a..acf0da1bd257 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -163,7 +163,7 @@ static int dio_refill_pages(struct dio *dio)
163 up_read(&current->mm->mmap_sem); 163 up_read(&current->mm->mmap_sem);
164 164
165 if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { 165 if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
166 struct page *page = ZERO_PAGE(dio->curr_user_address); 166 struct page *page = ZERO_PAGE(0);
167 /* 167 /*
168 * A memory fault, but the filesystem has some outstanding 168 * A memory fault, but the filesystem has some outstanding
169 * mapped blocks. We need to use those blocks up to avoid 169 * mapped blocks. We need to use those blocks up to avoid
@@ -763,7 +763,7 @@ static void dio_zero_block(struct dio *dio, int end)
763 763
764 this_chunk_bytes = this_chunk_blocks << dio->blkbits; 764 this_chunk_bytes = this_chunk_blocks << dio->blkbits;
765 765
766 page = ZERO_PAGE(dio->curr_user_address); 766 page = ZERO_PAGE(0);
767 if (submit_page_section(dio, page, 0, this_chunk_bytes, 767 if (submit_page_section(dio, page, 0, this_chunk_bytes,
768 dio->next_block_for_io)) 768 dio->next_block_for_io))
769 return; 769 return;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 291c4cc06ea7..fbbc29a29dff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,8 +779,6 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
779 unsigned long floor, unsigned long ceiling); 779 unsigned long floor, unsigned long ceiling);
780int copy_page_range(struct mm_struct *dst, struct mm_struct *src, 780int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
781 struct vm_area_struct *vma); 781 struct vm_area_struct *vma);
782int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
783 unsigned long size, pgprot_t prot);
784void unmap_mapping_range(struct address_space *mapping, 782void unmap_mapping_range(struct address_space *mapping,
785 loff_t const holebegin, loff_t const holelen, int even_cows); 783 loff_t const holebegin, loff_t const holelen, int even_cows);
786 784
diff --git a/mm/memory.c b/mm/memory.c
index f82b359b2745..2a8430844b6d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
966 * has touched so far, we don't want to allocate page tables. 966 * has touched so far, we don't want to allocate page tables.
967 */ 967 */
968 if (flags & FOLL_ANON) { 968 if (flags & FOLL_ANON) {
969 page = ZERO_PAGE(address); 969 page = ZERO_PAGE(0);
970 if (flags & FOLL_GET) 970 if (flags & FOLL_GET)
971 get_page(page); 971 get_page(page);
972 BUG_ON(flags & FOLL_WRITE); 972 BUG_ON(flags & FOLL_WRITE);
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1111} 1111}
1112EXPORT_SYMBOL(get_user_pages); 1112EXPORT_SYMBOL(get_user_pages);
1113 1113
1114static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1115 unsigned long addr, unsigned long end, pgprot_t prot)
1116{
1117 pte_t *pte;
1118 spinlock_t *ptl;
1119 int err = 0;
1120
1121 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1122 if (!pte)
1123 return -EAGAIN;
1124 arch_enter_lazy_mmu_mode();
1125 do {
1126 struct page *page = ZERO_PAGE(addr);
1127 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1128
1129 if (unlikely(!pte_none(*pte))) {
1130 err = -EEXIST;
1131 pte++;
1132 break;
1133 }
1134 page_cache_get(page);
1135 page_add_file_rmap(page);
1136 inc_mm_counter(mm, file_rss);
1137 set_pte_at(mm, addr, pte, zero_pte);
1138 } while (pte++, addr += PAGE_SIZE, addr != end);
1139 arch_leave_lazy_mmu_mode();
1140 pte_unmap_unlock(pte - 1, ptl);
1141 return err;
1142}
1143
1144static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1145 unsigned long addr, unsigned long end, pgprot_t prot)
1146{
1147 pmd_t *pmd;
1148 unsigned long next;
1149 int err;
1150
1151 pmd = pmd_alloc(mm, pud, addr);
1152 if (!pmd)
1153 return -EAGAIN;
1154 do {
1155 next = pmd_addr_end(addr, end);
1156 err = zeromap_pte_range(mm, pmd, addr, next, prot);
1157 if (err)
1158 break;
1159 } while (pmd++, addr = next, addr != end);
1160 return err;
1161}
1162
1163static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1164 unsigned long addr, unsigned long end, pgprot_t prot)
1165{
1166 pud_t *pud;
1167 unsigned long next;
1168 int err;
1169
1170 pud = pud_alloc(mm, pgd, addr);
1171 if (!pud)
1172 return -EAGAIN;
1173 do {
1174 next = pud_addr_end(addr, end);
1175 err = zeromap_pmd_range(mm, pud, addr, next, prot);
1176 if (err)
1177 break;
1178 } while (pud++, addr = next, addr != end);
1179 return err;
1180}
1181
1182int zeromap_page_range(struct vm_area_struct *vma,
1183 unsigned long addr, unsigned long size, pgprot_t prot)
1184{
1185 pgd_t *pgd;
1186 unsigned long next;
1187 unsigned long end = addr + size;
1188 struct mm_struct *mm = vma->vm_mm;
1189 int err;
1190
1191 BUG_ON(addr >= end);
1192 pgd = pgd_offset(mm, addr);
1193 flush_cache_range(vma, addr, end);
1194 do {
1195 next = pgd_addr_end(addr, end);
1196 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1197 if (err)
1198 break;
1199 } while (pgd++, addr = next, addr != end);
1200 return err;
1201}
1202
1203pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1114pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1204{ 1115{
1205 pgd_t * pgd = pgd_offset(mm, addr); 1116 pgd_t * pgd = pgd_offset(mm, addr);
@@ -1717,16 +1628,11 @@ gotten:
1717 1628
1718 if (unlikely(anon_vma_prepare(vma))) 1629 if (unlikely(anon_vma_prepare(vma)))
1719 goto oom; 1630 goto oom;
1720 if (old_page == ZERO_PAGE(address)) { 1631 VM_BUG_ON(old_page == ZERO_PAGE(0));
1721 new_page = alloc_zeroed_user_highpage_movable(vma, address); 1632 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1722 if (!new_page) 1633 if (!new_page)
1723 goto oom; 1634 goto oom;
1724 } else { 1635 cow_user_page(new_page, old_page, address, vma);
1725 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1726 if (!new_page)
1727 goto oom;
1728 cow_user_page(new_page, old_page, address, vma);
1729 }
1730 1636
1731 /* 1637 /*
1732 * Re-check the pte - we dropped the lock 1638 * Re-check the pte - we dropped the lock
@@ -2252,39 +2158,24 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2252 spinlock_t *ptl; 2158 spinlock_t *ptl;
2253 pte_t entry; 2159 pte_t entry;
2254 2160
2255 if (write_access) { 2161 /* Allocate our own private page. */
2256 /* Allocate our own private page. */ 2162 pte_unmap(page_table);
2257 pte_unmap(page_table);
2258
2259 if (unlikely(anon_vma_prepare(vma)))
2260 goto oom;
2261 page = alloc_zeroed_user_highpage_movable(vma, address);
2262 if (!page)
2263 goto oom;
2264
2265 entry = mk_pte(page, vma->vm_page_prot);
2266 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2267 2163
2268 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2164 if (unlikely(anon_vma_prepare(vma)))
2269 if (!pte_none(*page_table)) 2165 goto oom;
2270 goto release; 2166 page = alloc_zeroed_user_highpage_movable(vma, address);
2271 inc_mm_counter(mm, anon_rss); 2167 if (!page)
2272 lru_cache_add_active(page); 2168 goto oom;
2273 page_add_new_anon_rmap(page, vma, address);
2274 } else {
2275 /* Map the ZERO_PAGE - vm_page_prot is readonly */
2276 page = ZERO_PAGE(address);
2277 page_cache_get(page);
2278 entry = mk_pte(page, vma->vm_page_prot);
2279 2169
2280 ptl = pte_lockptr(mm, pmd); 2170 entry = mk_pte(page, vma->vm_page_prot);
2281 spin_lock(ptl); 2171 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2282 if (!pte_none(*page_table))
2283 goto release;
2284 inc_mm_counter(mm, file_rss);
2285 page_add_file_rmap(page);
2286 }
2287 2172
2173 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2174 if (!pte_none(*page_table))
2175 goto release;
2176 inc_mm_counter(mm, anon_rss);
2177 lru_cache_add_active(page);
2178 page_add_new_anon_rmap(page, vma, address);
2288 set_pte_at(mm, address, page_table, entry); 2179 set_pte_at(mm, address, page_table, entry);
2289 2180
2290 /* No need to invalidate - it was non-present before */ 2181 /* No need to invalidate - it was non-present before */