From 9b1a4d38373a5581a4e01032a3ccdd94cd93477b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine: Wean existing callers off stop_machine_run() Signed-off-by: Rusty Russell --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6da667274df5..3cf3d05b6bd4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2372,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -/* return values int ....just for stop_machine_run() */ +/* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; @@ -2397,7 +2397,7 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.2 From 14fcc23fdc78e9d32372553ccf21758a9bd56fa1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 28 Jul 2008 15:46:19 -0700 Subject: tmpfs: fix kernel BUG in shmem_delete_inode SuSE's insserve initscript ordering program hits kernel BUG at mm/shmem.c:814 on 2.6.26. It's using posix_fadvise on directories, and the shmem_readpage method added in 2.6.23 is letting POSIX_FADV_WILLNEED allocate useless pages to a tmpfs directory, incrementing i_blocks count but never decrementing it. Fix this by assigning shmem_aops (pointing to readpage and writepage and set_page_dirty) only when it's needed, on a regular file or a long symlink. Many thanks to Kel for outstanding bugreport and steps to reproduce it. Reported-by: Kel Modderman Tested-by: Kel Modderman Signed-off-by: Hugh Dickins Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 952d361774bb..c1e5a3b4f758 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1513,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_blocks = 0; - inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); @@ -1528,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) init_special_inode(inode, mode, dev); break; case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, @@ -1929,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return error; } unlock_page(page); + inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); -- cgit v1.2.2 From 7906d00cd1f687268f0a3599442d113767795ae6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:26 -0700 Subject: mmu-notifiers: add mm_take_all_locks() operation mm_take_all_locks holds off reclaim from an entire mm_struct. This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm. 
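As a rough usage sketch (not from the patch itself; the consumer and helper names here are hypothetical, and the real caller added later in this series is do_mmu_notifier_register() in mm/mmu_notifier.c), a registration-style update on an mm would be wrapped like this:

	/* Sketch only: my_hook / attach_hook are hypothetical. */
	int register_secondary_mmu_hook(struct mm_struct *mm, struct my_hook *hook)
	{
		int ret;

		down_write(&mm->mmap_sem);	/* must be held across the whole sequence */
		ret = mm_take_all_locks(mm);	/* returns -EINTR if interrupted by a signal */
		if (!ret) {
			/* no pte/vma/mm operation can run on "mm" here */
			attach_hook(mm, hook);	/* hypothetical update, now race-free */
			mm_drop_all_locks(mm);
		}
		up_write(&mm->mmap_sem);
		return ret;
	}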
This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults. The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns. mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without need of altering the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to avoid new anon_vmas to be associated with existing vmas. A single task can't take more than one mm_take_all_locks() in a row or it would deadlock. mm_take_all_locks() and mm_drop_all_locks are expensive operations that may have to take thousand of locks. mm_take_all_locks() can fail if it's interrupted by signals. When mmu_notifier_register returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation but mmu_notifier_register can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). Same problem for rmap paths. And we've to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks when vmtruncate frees the page, kvm would have no way to notice that it mapped into sptes a page that is going into the freelist without a chance of any further mmu_notifier notification. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Linus Torvalds Cc: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5e0cc99e9cd5..e5f9cb83d6d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2268,3 +2268,161 @@ int install_special_mapping(struct mm_struct *mm, return 0; } + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + spin_lock(&anon_vma->lock); + /* + * We can safely modify head.next after taking the + * anon_vma->lock. If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->lock. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + } +} + +static void vm_lock_mapping(struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. 
+ * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + spin_lock(&mapping->i_mmap_lock); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_sem in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_sem until mm_drop_all_locks() returns. + * + * mmap_sem in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout (for example populate_range() with + * nonlinear vmas). It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We can take all the locks in random order because the VM code + * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * takes more than one of them in a row. Secondly we're protected + * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = -EINTR; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + + mutex_lock(&mm_all_locks_mutex); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_lock_mapping(vma->vm_file->f_mapping); + } + ret = 0; + +out_unlock: + if (ret) + mm_drop_all_locks(mm); + + return ret; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->head will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->lock. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + spin_unlock(&anon_vma->lock); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + spin_unlock(&mapping->i_mmap_lock); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_sem cannot be released by the caller until + * mm_drop_all_locks() returns. 
+ */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma) + vm_unlock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} -- cgit v1.2.2 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). 
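To make the callback shape concrete, here is a hedged sketch of a GRU-style consumer (pure secondary tlb, no spte state). The callback signatures follow the mmu_notifier_ops invocations visible in mm/mmu_notifier.c below; everything prefixed my_ is hypothetical and not part of this patch:

	/* my_* helpers are hypothetical placeholders for driver code. */
	static void my_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
				       unsigned long address)
	{
		/* drop the secondary tlb entry before the page can be freed/reused */
		my_flush_secondary_tlb(mn, address, address + PAGE_SIZE);
	}

	static void my_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm,
					      unsigned long start, unsigned long end)
	{
		/* stop secondary faults from mapping this range, then flush it */
		my_block_secondary_faults(mn, start, end);
		my_flush_secondary_tlb(mn, start, end);
	}

	static void my_invalidate_range_end(struct mmu_notifier *mn, struct mm_struct *mm,
					    unsigned long start, unsigned long end)
	{
		/* primary ptes are consistent again: allow secondary faults */
		my_allow_secondary_faults(mn, start, end);
	}

	static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/* the address space is going away: tear down every secondary mapping */
		my_flush_secondary_tlb(mn, 0, ~0UL);
	}

	static const struct mmu_notifier_ops my_mmu_notifier_ops = {
		.release		= my_release,
		.invalidate_page	= my_invalidate_page,
		.invalidate_range_start	= my_invalidate_range_start,
		.invalidate_range_end	= my_invalidate_range_end,
	};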
At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably.

Dependencies:

1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track of whether the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter increased in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference while the VM is in the middle of range_begin/end, as any page returned by get_user_pages in that critical section could later be freed immediately without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites, the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too.

2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only be enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows compiling a KVM external module against a kernel with mmu notifiers enabled, and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can also enable MMU_NOTIFIER in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM, GRU and XPMEM are all =n.

The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_register is called at driver startup, a failure can be handled gracefully. Here is an example of the change applied to kvm to register the mmu notifiers. Usually when a driver starts up, other allocations are required anyway and -ENOMEM failure paths already exist.

 struct kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+	int err;
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+	if (err) {
+		kfree(kvm);
+		return ERR_PTR(err);
+	}
+
 	return kvm;
 }

mmu_notifier_unregister returns void and it's reliable.

The patch also adds a few needed but missing includes that would prevent the kernel from compiling after these changes on non-x86 archs (x86 didn't need them, by luck).

[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E.
McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 3 + mm/Makefile | 1 + mm/filemap_xip.c | 3 +- mm/fremap.c | 3 + mm/hugetlb.c | 3 + mm/memory.c | 35 +++++-- mm/mmap.c | 2 + mm/mmu_notifier.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 3 + mm/mremap.c | 6 ++ mm/rmap.c | 13 +-- 11 files changed, 336 insertions(+), 13 deletions(-) create mode 100644 mm/mmu_notifier.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index efee5d379df4..446c6588c753 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -208,3 +208,6 @@ config NR_QUICK config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config MMU_NOTIFIER + bool diff --git a/mm/Makefile b/mm/Makefile index 06ca2381fef1..da4ccf015aea 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 98a3f31ccd6a..380ab402d711 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping, if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); diff --git a/mm/fremap.c b/mm/fremap.c index 07a9c82ce1a3..7881638e4a12 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, spin_unlock(&mapping->i_mmap_lock); } + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); + mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { if (unlikely(has_write_lock)) { downgrade_write(&mm->mmap_sem); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3be79dc18c5c..80eb0d31d0d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1672,6 +1673,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + mmu_notifier_invalidate_range_start(mm, start, end); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); @@ -1713,6 +1715,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); diff --git a/mm/memory.c b/mm/memory.c index a8ca04faaea6..67f0ab9077d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -652,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned 
long end = vma->vm_end; + int ret; /* * Don't copy ptes where a page fault will fill them correctly. @@ -667,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + + ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) - return -ENOMEM; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - return 0; + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return ret; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -881,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -946,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } } out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -1616,10 +1637,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long end = addr + size; + unsigned long start = addr, end = addr + size; int err; BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1627,6 +1649,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1839,7 +1862,7 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush(vma, address, page_table); + ptep_clear_flush_notify(vma, address, page_table); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); diff --git a/mm/mmap.c b/mm/mmap.c index e5f9cb83d6d4..245c3d69067b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -2061,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ arch_exit_mmap(mm); + mmu_notifier_release(mm); lru_add_drain(); flush_cache_mm(mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000000..5f4ef0250bee --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,277 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * Copyright (C) 2008 SGI + * Christoph Lameter + * + * This work is licensed under the terms of the GNU GPL, version 2. 
See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * This function can't run concurrently against mmu_notifier_register + * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap + * runs with mm_users == 0. Other tasks may still invoke mmu notifiers + * in parallel despite there being no task using this mm any more, + * through the vmas outside of the exit_mmap context, such as with + * vmtruncate. This serializes against mmu_notifier_unregister with + * the mmu_notifier_mm->lock in addition to RCU and it serializes + * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * can't go away from under us as exit_mmap holds an mm_count pin + * itself. + */ +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + + spin_lock(&mm->mmu_notifier_mm->lock); + while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { + mn = hlist_entry(mm->mmu_notifier_mm->list.first, + struct mmu_notifier, + hlist); + /* + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than + * to wait ->release to finish and + * mmu_notifier_unregister to return. + */ + hlist_del_init_rcu(&mn->hlist); + /* + * RCU here will block mmu_notifier_unregister until + * ->release returns. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * if ->release runs before mmu_notifier_unregister it + * must be handled as it's the only way for the driver + * to flush all existing sptes and stop the driver + * from establishing any more sptes before all the + * pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + spin_lock(&mm->mmu_notifier_mm->lock); + } + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * synchronize_rcu here prevents mmu_notifier_release to + * return to exit_mmap (which would proceed freeing all pages + * in the mm) until the ->release method returns, if it was + * invoked by mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one + * mm_count is hold by exit_mmap. + */ + synchronize_rcu(); +} + +/* + * If no young bitflag is supported by the hardware, ->clear_flush_young can + * unmap the address and return 1 or 0 depending if the mapping previously + * existed or not. 
+ */ +int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->clear_flush_young) + young |= mn->ops->clear_flush_young(mn, mm, address); + } + rcu_read_unlock(); + + return young; +} + +void __mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_start) + mn->ops->invalidate_range_start(mn, mm, start, end); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_end) + mn->ops->invalidate_range_end(mn, mm, start, end); + } + rcu_read_unlock(); +} + +static int do_mmu_notifier_register(struct mmu_notifier *mn, + struct mm_struct *mm, + int take_mmap_sem) +{ + struct mmu_notifier_mm *mmu_notifier_mm; + int ret; + + BUG_ON(atomic_read(&mm->mm_users) <= 0); + + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + + if (take_mmap_sem) + down_write(&mm->mmap_sem); + ret = mm_take_all_locks(mm); + if (unlikely(ret)) + goto out_cleanup; + + if (!mm_has_notifiers(mm)) { + INIT_HLIST_HEAD(&mmu_notifier_mm->list); + spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; + } + atomic_inc(&mm->mm_count); + + /* + * Serialize the update against mmu_notifier_unregister. A + * side note: mmu_notifier_release can't run concurrently with + * us because we hold the mm_users pin (either implicitly as + * current->mm or explicitly with get_task_mm() or similar). + * We can't race against any other mmu notifier method either + * thanks to mm_take_all_locks(). + */ + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + + mm_drop_all_locks(mm); +out_cleanup: + if (take_mmap_sem) + up_write(&mm->mmap_sem); + /* kfree() does nothing if mmu_notifier_mm is NULL */ + kfree(mmu_notifier_mm); +out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); + return ret; +} + +/* + * Must not hold mmap_sem nor any other VM related lock when calling + * this registration function. Must also ensure mm_users can't go down + * to zero while this runs to avoid races with mmu_notifier_release, + * so mm has to be current->mm or the mm should be pinned safely such + * as with get_task_mm(). If the mm is not current->mm, the mm_users + * pin should be released by calling mmput after mmu_notifier_register + * returns. mmu_notifier_unregister must be always called to + * unregister the notifier. 
mm_count is automatically pinned to allow + * mmu_notifier_unregister to safely run at any time later, before or + * after exit_mmap. ->release will always be called before exit_mmap + * frees the pages. + */ +int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 1); +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * Same as mmu_notifier_register but here the caller must hold the + * mmap_sem in write mode. + */ +int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 0); +} +EXPORT_SYMBOL_GPL(__mmu_notifier_register); + +/* this is called after the last mmu_notifier_unregister() returned */ +void __mmu_notifier_mm_destroy(struct mm_struct *mm) +{ + BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + kfree(mm->mmu_notifier_mm); + mm->mmu_notifier_mm = LIST_POISON1; /* debug */ +} + +/* + * This releases the mm_count pin automatically and frees the mm + * structure if it was the last user of it. It serializes against + * running mmu notifiers with RCU and against mmu_notifier_unregister + * with the unregister lock + RCU. All sptes must be dropped before + * calling mmu_notifier_unregister. ->release or any other notifier + * method may be invoked concurrently with mmu_notifier_unregister, + * and only after mmu_notifier_unregister returned we're guaranteed + * that ->release or any other method can't run anymore. + */ +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +{ + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + spin_lock(&mm->mmu_notifier_mm->lock); + if (!hlist_unhashed(&mn->hlist)) { + hlist_del_rcu(&mn->hlist); + + /* + * RCU here will force exit_mmap to wait ->release to finish + * before freeing the pages. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * exit_mmap will block in mmu_notifier_release to + * guarantee ->release is called before freeing the + * pages. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + } else + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * Wait any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. 
+ */ + synchronize_rcu(); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); diff --git a/mm/mprotect.c b/mm/mprotect.c index abd645a3b0a0..fded06f923f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -203,10 +204,12 @@ success: dirty_accountable = 1; } + mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f2bd15..1a7743923c8c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + unsigned long old_start; + old_start = old_addr; + mmu_notifier_invalidate_range_start(vma->vm_mm, + old_start, old_end); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before @@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); + mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } #define LATENCY_LIMIT (64 * PAGE_SIZE) diff --git a/mm/rmap.c b/mm/rmap.c index 39ae5a9bf382..99bc3f9cd796 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page, if (vma->vm_flags & VM_LOCKED) { referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young(vma, address, pte)) + } else if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); + entry = ptep_clear_flush_notify(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -705,14 +706,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young(vma, address, pte)))) { + (ptep_clear_flush_young_notify(vma, address, pte)))) { ret = SWAP_FAIL; goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -837,12 +838,12 @@ static void try_to_unmap_cluster(unsigned long cursor, page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); - if (ptep_clear_flush_young(vma, address, pte)) + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; /* Nuke the page table entry. 
 */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) -- cgit v1.2.2

From 78a34ae29bf1c9df62a5bd0f0798b6c62a54d520 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 28 Jul 2008 15:46:30 -0700 Subject: mm/hugetlb.c must #include <asm/io.h>

This patch fixes the following build error on sh caused by commit aa888a74977a8f2120ae9332376e179c39a6b07d ("hugetlb: support larger than MAX_ORDER"):

mm/hugetlb.c: In function 'alloc_bootmem_huge_page': mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys'

Signed-off-by: Adrian Bunk Cc: Hirokazu Takata Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 80eb0d31d0d3..254ce2b90158 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -20,6 +20,7 @@ #include #include +#include <asm/io.h> #include #include "internal.h" -- cgit v1.2.2

From 8ab22b9abb5c55413802e4adc9aa6223324547c3 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Jul 2008 15:46:36 -0700 Subject: vfs: pagecache usage optimization for pagesize!=blocksize

When we read some part of a file through the pagecache, if there is a pagecache page at the corresponding index but the page is not uptodate, read IO is issued and the page becomes uptodate.

I think this is good for the pagesize == blocksize environment, but there is room for improvement in the pagesize != blocksize environment. In that case a page can have multiple buffers, and even if the page is not uptodate, some of its buffers can be uptodate.

So I suggest that when all the buffers which correspond to the part of the file we want to read are uptodate, we use this pagecache page and copy the data from it to the user buffer even though the page itself is not uptodate. This can reduce read IO and improve system throughput.

I wrote a benchmark program and got the result numbers below with it. This benchmark does:

 1: mount and open a test file.
 2: create a 512MB file.
 3: close the file and umount.
 4: mount and open the test file again.
 5: pwrite randomly 300000 times on the test file; offsets are aligned to the IO size (1024 bytes).
 6: measure the time of 100000 random preads on the test file.

The result was:

 2.6.26          330 sec
 2.6.26-patched  226 sec

 Arch: i386, Filesystem: ext3, Blocksize: 1024 bytes, Memory: 1GB

On ext3/4, a file is written through buffers/blocks, so random read/write mixed workloads, or random reads after random writes, are optimized by this patch in the pagesize != blocksize environment. This test result shows this.
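The read path below only asks the filesystem a yes/no question through the new ->is_partially_uptodate address_space operation; the helper itself lives on the fs/ side and is outside this mm-limited diff. For a buffer-head based filesystem it plausibly looks like the following sketch (names are illustrative, the signature matches the call in the mm/filemap.c hunk; the page is already locked by the caller):

	/* Sketch: the page is "good enough" when every buffer covering the
	 * requested byte range is uptodate, even though !PageUptodate(page). */
	static int sketch_is_partially_uptodate(struct page *page,
						read_descriptor_t *desc,
						unsigned long from)
	{
		struct buffer_head *bh, *head;
		unsigned int block_start, block_end, blocksize, to;
		int ret = 1;

		/* pages without buffers can't be partially uptodate */
		if (!page_has_buffers(page))
			return 0;

		head = page_buffers(page);
		blocksize = head->b_size;

		/* bytes of this read that fall inside the page: [from, to) */
		to = from + min_t(unsigned int, PAGE_CACHE_SIZE - from, desc->count);

		bh = head;
		block_start = 0;
		do {
			block_end = block_start + blocksize;
			/* every buffer overlapping the read must be uptodate */
			if (block_end > from && block_start < to &&
			    !buffer_uptodate(bh)) {
				ret = 0;
				break;
			}
			block_start = block_end;
			bh = bh->b_this_page;
		} while (bh != head);

		return ret;
	}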
The benchmark program is as follows:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>

#define LEN 1024
#define LOOP 1024*512 /* 512MB */

int main(void)
{
	unsigned long i, offset, filesize;
	int fd;
	char buf[LEN];
	time_t t1, t2;

	if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
		perror("cannot mount\n");
		exit(1);
	}
	memset(buf, 0, LEN);
	fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC, 0644);
	if (fd < 0) {
		perror("cannot open file\n");
		exit(1);
	}
	for (i = 0; i < LOOP; i++)
		write(fd, buf, LEN);
	close(fd);
	if (umount("/root/test1/") < 0) {
		perror("cannot umount\n");
		exit(1);
	}
	if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) {
		perror("cannot mount\n");
		exit(1);
	}
	fd = open("/root/test1/testfile", O_RDWR);
	if (fd < 0) {
		perror("cannot open file\n");
		exit(1);
	}
	filesize = LEN * LOOP;
	for (i = 0; i < 300000; i++) {
		offset = (random() % filesize) & (~(LEN - 1));
		pwrite(fd, buf, LEN, offset);
	}
	printf("start test\n");
	time(&t1);
	for (i = 0; i < 100000; i++) {
		offset = (random() % filesize) & (~(LEN - 1));
		pread(fd, buf, LEN, offset);
	}
	time(&t2);
	printf("%ld sec\n", t2-t1);
	close(fd);
	if (umount("/root/test1/") < 0) {
		perror("cannot umount\n");
		exit(1);
	}
	return 0;
}

Signed-off-by: Hisashi Hifumi Cc: Nick Piggin Cc: Christoph Hellwig Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 5de7633e1dbe..42bbc6909ba4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1023,8 +1023,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (TestSetPageLocked(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -1094,6 +1103,7 @@ page_not_up_to_date: if (lock_page_killable(page)) goto readpage_eio; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); -- cgit v1.2.2
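The hunk above only consults mapping->a_ops->is_partially_uptodate, so a filesystem has to opt in by setting the new method in its address_space_operations. That wiring happens on the fs/ side, outside this mm-limited view; assuming a buffer-head based filesystem and the sketch shown earlier, it would amount to something like:

	/* example_* methods are hypothetical; only the new field matters here. */
	static const struct address_space_operations example_aops = {
		.readpage		= example_readpage,	/* hypothetical */
		.writepage		= example_writepage,	/* hypothetical */
		/* new hook: lets the generic read path accept a !PageUptodate
		 * page when all buffers covering the requested range are valid */
		.is_partially_uptodate	= sketch_is_partially_uptodate,
	};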