mm: mmu_gather rework

Rework the existing mmu_gather infrastructure. The direct purpose of these patches was to allow preemptible mmu_gather, but even without that I think these patches provide an improvement to the status quo. The first 9 patches rework the mmu_gather infrastructure. For review purposes I've split them into generic and per-arch patches with the last of those a generic cleanup. The next patch provides generic RCU page-table freeing, and the followup is a patch converting s390 to use this. I've also got 4 patches from DaveM lined up (not included in this series) that use this to implement gup_fast() for sparc64. Then there is one patch that extends the generic mmu_gather batching. After that follow the mm preemptibility patches, these make part of the mm a lot more preemptible. It converts i_mmap_lock and anon_vma->lock to mutexes which together with the mmu_gather rework makes mmu_gather preemptible as well. Making i_mmap_lock a mutex also enables a clean-up of the truncate code. This also allows for preemptible mmu_notifiers, something that XPMEM I think wants. Furthermore, it removes the new and universally detested unmap_mutex. This patch: Remove the first obstacle towards a fully preemptible mmu_gather. The current scheme assumes mmu_gather is always done with preemption disabled and uses per-cpu storage for the page batches. Change this to try and allocate a page for batching and in case of failure, use a small on-stack array to make some progress. Preemptible mmu_gather is desired in general and usable once i_mmap_lock becomes a mutex. Doing it before the mutex conversion saves us from having to rework the code by moving the mmu_gather bits inside the pte_lock. Also avoid flushing the tlb batches from under the pte lock, this is useful even without the i_mmap_lock conversion as it significantly reduces pte lock hold times. 
[akpm@linux-foundation.org: fix comment tpyo] Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Miller <davem@davemloft.net> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Jeff Dike <jdike@addtoit.com> Cc: Richard Weinberger <richard@nod.at> Cc: Tony Luck <tony.luck@intel.com> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Hugh Dickins <hughd@google.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Namhyung Kim <namhyung@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2011-05-24 20:11:45 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-05-25 11:39:12 -0400
commit: d16dfc550f5326a4000f3322582a7c05dec91d7a (patch)
tree: 8ee963542705cbf2187777f1d3f2b209cbda827a
parent: d05f3169c0fbca16132ec7c2be71685c6de638b5 (diff)
5 files changed, 107 insertions, 65 deletions
diff --git a/fs/exec.c b/fs/exec.c
index e276d5e0abb9..936f5776655c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -600,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        BUG_ON(new_start > new_end);
@@ -626,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                return -ENOMEM;
        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
+        tlb_gather_mmu(&tlb, mm, 0);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
-                free_pgd_range(tlb, new_end, old_end, new_end,
+                free_pgd_range(&tlb, new_end, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        } else {
                /*
@@ -640,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others its just a little faster.
                 */
-                free_pgd_range(tlb, old_start, old_end, new_end,
+                free_pgd_range(&tlb, old_start, old_end, new_end,
                        vma->vm_next ? vma->vm_next->vm_start : 0);
        }
-        tlb_finish_mmu(tlb, new_end, old_end);
+        tlb_finish_mmu(&tlb, new_end, old_end);
        /*
         * Shrink the vma to just the new range.  Always succeeds.
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index e43f9766259f..2d3547c84235 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -5,6 +5,8 @@
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
 *
+ * Copyright 2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
@@ -22,51 +24,71 @@
 * and page free order so much..
 */
 #ifdef CONFIG_SMP
-  #ifdef ARCH_FREE_PTR_NR
-    #define FREE_PTR_NR   ARCH_FREE_PTR_NR
-  #else
-    #define FREE_PTE_NR 506
-  #endif
  #define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
 #else
-  #define FREE_PTE_NR   1
  #define tlb_fast_mode(tlb) 1
 #endif
+/*
+ * If we can't allocate a page to make a big batch of page pointers
+ * to work on, then just handle a few from the on-stack structure.
+ */
+#define MMU_GATHER_BUNDLE       8
 /* struct mmu_gather is an opaque type used by the mm code for passing around
 * any data needed by arch specific code for tlb_remove_page.
 */
 struct mmu_gather {
        struct mm_struct        *mm;
        unsigned int            nr;     /* set to ~0U means fast mode */
+        unsigned int            max;    /* nr < max */
        unsigned int            need_flush;/* Really unmapped some ptes? */
        unsigned int            fullmm; /* non-zero means full mm flush */
-        struct page *           pages[FREE_PTE_NR];
+#ifdef HAVE_ARCH_MMU_GATHER
+        struct arch_mmu_gather  arch;
+#endif
+        struct page             **pages;
+        struct page             *local[MMU_GATHER_BUNDLE];
 };
-/* Users of the generic TLB shootdown code must declare this storage space. */
+static inline void __tlb_alloc_page(struct mmu_gather *tlb)
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+{
+        unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+        if (addr) {
+                tlb->pages = (void *)addr;
+                tlb->max = PAGE_SIZE / sizeof(struct page *);
+        }
+}
 /* tlb_gather_mmu
- *      Return a pointer to an initialized struct mmu_gather.
+ *      Called to initialize an (on-stack) mmu_gather structure for page-table
+ *      tear-down from @mm. The @fullmm argument is used when @mm is without
+ *      users and we're going to destroy the full address space (exit/execve).
 */
-static inline struct mmu_gather *
+static inline void
-tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 {
-        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
        tlb->mm = mm;
-        /* Use fast mode if only one CPU is online */
+        tlb->max = ARRAY_SIZE(tlb->local);
-        tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
+        tlb->pages = tlb->local;
+        if (num_online_cpus() > 1) {
+                tlb->nr = 0;
+                __tlb_alloc_page(tlb);
+        } else /* Use fast mode if only one CPU is online */
+                tlb->nr = ~0U;
-        tlb->fullmm = full_mm_flush;
+        tlb->fullmm = fullmm;
-        return tlb;
+#ifdef HAVE_ARCH_MMU_GATHER
+        tlb->arch = ARCH_MMU_GATHER_INIT;
+#endif
 }
 static inline void
-tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+tlb_flush_mmu(struct mmu_gather *tlb)
 {
        if (!tlb->need_flush)
                return;
@@ -75,6 +97,13 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
        if (!tlb_fast_mode(tlb)) {
                free_pages_and_swap_cache(tlb->pages, tlb->nr);
                tlb->nr = 0;
+                /*
+                 * If we are using the local on-stack array of pages for MMU
+                 * gather, try allocating an off-stack array again as we have
+                 * recently freed pages.
+                 */
+                if (tlb->pages == tlb->local)
+                        __tlb_alloc_page(tlb);
        }
 }
@@ -85,29 +114,42 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-        tlb_flush_mmu(tlb, start, end);
+        tlb_flush_mmu(tlb);
        /* keep the page table cache within bounds */
        check_pgt_cache();
-        put_cpu_var(mmu_gathers);
+        if (tlb->pages != tlb->local)
+                free_pages((unsigned long)tlb->pages, 0);
 }
-/* tlb_remove_page
+/* __tlb_remove_page
 *      Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
 *      handling the additional races in SMP caused by other CPUs caching valid
- *      mappings in their TLBs.
+ *      mappings in their TLBs. Returns the number of free page slots left.
+ *      When out of page slots we must call tlb_flush_mmu().
 */
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
        tlb->need_flush = 1;
        if (tlb_fast_mode(tlb)) {
                free_page_and_swap_cache(page);
-                return;
+                return 1; /* avoid calling tlb_flush_mmu() */
        }
        tlb->pages[tlb->nr++] = page;
-        if (tlb->nr >= FREE_PTE_NR)
+        VM_BUG_ON(tlb->nr > tlb->max);
-                tlb_flush_mmu(tlb, 0, 0);
+        return tlb->max - tlb->nr;
+}
+/* tlb_remove_page
+ *      Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
+ *      required.
+ */
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+        if (!__tlb_remove_page(tlb, page))
+                tlb_flush_mmu(tlb);
 }
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2948af126ca..ffcce9bf2b54 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -906,7 +906,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
                struct vm_area_struct *start_vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *);
diff --git a/mm/memory.c b/mm/memory.c
index 4c6ea10f3d18..19b2d44de9f0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -912,12 +912,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                long *zap_work, struct zap_details *details)
 {
        struct mm_struct *mm = tlb->mm;
+        int force_flush = 0;
        pte_t *pte;
        spinlock_t *ptl;
        int rss[NR_MM_COUNTERS];
        init_rss_vec(rss);
+again:
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
        do {
@@ -974,7 +975,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        page_remove_rmap(page);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
-                        tlb_remove_page(tlb, page);
+                        force_flush = !__tlb_remove_page(tlb, page);
+                        if (force_flush)
+                                break;
                        continue;
                }
                /*
@@ -1001,6 +1004,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
+        /*
+         * mmu_gather ran out of room to batch pages, we break out of
+         * the PTE lock to avoid doing the potential expensive TLB invalidate
+         * and page-free while holding it.
+         */
+        if (force_flush) {
+                force_flush = 0;
+                tlb_flush_mmu(tlb);
+                if (addr != end)
+                        goto again;
+        }
        return addr;
 }
@@ -1121,17 +1136,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+unsigned long unmap_vmas(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *details)
 {
        long zap_work = ZAP_BLOCK_SIZE;
-        unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
-        int tlb_start_valid = 0;
        unsigned long start = start_addr;
        spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
-        int fullmm = (*tlbp)->fullmm;
        struct mm_struct *mm = vma->vm_mm;
        mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1164,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                        untrack_pfn_vma(vma, 0, 0);
                while (start != end) {
-                        if (!tlb_start_valid) {
-                                tlb_start = start;
-                                tlb_start_valid = 1;
-                        }
                        if (unlikely(is_vm_hugetlb_page(vma))) {
                                /*
                                 * It is undesirable to test vma->vm_file as it
@@ -1177,7 +1184,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                                start = end;
                        } else
-                                start = unmap_page_range(*tlbp, vma,
+                                start = unmap_page_range(tlb, vma,
                                                start, end, &zap_work, details);
                        if (zap_work > 0) {
@@ -1185,19 +1192,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                                break;
                        }
-                        tlb_finish_mmu(*tlbp, tlb_start, start);
                        if (need_resched() ||
                                (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-                                if (i_mmap_lock) {
+                                if (i_mmap_lock)
-                                        *tlbp = NULL;
                                        goto out;
-                                }
                                cond_resched();
                        }
-                        *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
-                        tlb_start_valid = 0;
                        zap_work = ZAP_BLOCK_SIZE;
                }
        }
@@ -1217,16 +1218,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
 {
        struct mm_struct *mm = vma->vm_mm;
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        unsigned long end = address + size;
        unsigned long nr_accounted = 0;
        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
+        tlb_gather_mmu(&tlb, mm, 0);
        update_hiwater_rss(mm);
        end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
-        if (tlb)
+        tlb_finish_mmu(&tlb, address, end);
-                tlb_finish_mmu(tlb, address, end);
        return end;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index adb12527fd0e..40d49986e714 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1903,17 +1903,17 @@ static void unmap_region(struct mm_struct *mm,
                unsigned long start, unsigned long end)
 {
        struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        unsigned long nr_accounted = 0;
        lru_add_drain();
-        tlb = tlb_gather_mmu(mm, 0);
+        tlb_gather_mmu(&tlb, mm, 0);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-        free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+        free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
-                                 next? next->vm_start: 0);
+                                 next ? next->vm_start : 0);
-        tlb_finish_mmu(tlb, start, end);
+        tlb_finish_mmu(&tlb, start, end);
 }
 /*
@@ -2255,7 +2255,7 @@ EXPORT_SYMBOL(do_brk);
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
-        struct mmu_gather *tlb;
+        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        unsigned long end;
@@ -2280,14 +2280,14 @@ void exit_mmap(struct mm_struct *mm)
        lru_add_drain();
        flush_cache_mm(mm);
-        tlb = tlb_gather_mmu(mm, 1);
+        tlb_gather_mmu(&tlb, mm, 1);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use -1 here to ensure all VMAs in the mm are unmapped */
        end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-        free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
+        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
-        tlb_finish_mmu(tlb, 0, end);
+        tlb_finish_mmu(&tlb, 0, end);
        /*
         * Walk the list again, actually closing and freeing it,
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-05-24 20:11:45 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-05-25 11:39:12 -0400
commit	d16dfc550f5326a4000f3322582a7c05dec91d7a (patch)
tree	8ee963542705cbf2187777f1d3f2b209cbda827a
parent	d05f3169c0fbca16132ec7c2be71685c6de638b5 (diff)

diff --git a/fs/exec.c b/fs/exec.c index e276d5e0abb9..936f5776655c 100644 --- a/fs/exec.c +++ b/fs/exec.c
@@ -600,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
600	unsigned long length = old_end - old_start;	600	unsigned long length = old_end - old_start;
601	unsigned long new_start = old_start - shift;	601	unsigned long new_start = old_start - shift;
602	unsigned long new_end = old_end - shift;	602	unsigned long new_end = old_end - shift;
603	struct mmu_gather *tlb;	603	struct mmu_gather tlb;
604		604
605	BUG_ON(new_start > new_end);	605	BUG_ON(new_start > new_end);
606		606
@@ -626,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
626	return -ENOMEM;	626	return -ENOMEM;
627		627
628	lru_add_drain();	628	lru_add_drain();
629	tlb = tlb_gather_mmu(mm, 0);	629	tlb_gather_mmu(&tlb, mm, 0);
630	if (new_end > old_start) {	630	if (new_end > old_start) {
631	/*	631	/*
632	* when the old and new regions overlap clear from new_end.	632	* when the old and new regions overlap clear from new_end.
633	*/	633	*/
634	free_pgd_range(tlb, new_end, old_end, new_end,	634	free_pgd_range(&tlb, new_end, old_end, new_end,
635	vma->vm_next ? vma->vm_next->vm_start : 0);	635	vma->vm_next ? vma->vm_next->vm_start : 0);
636	} else {	636	} else {
637	/*	637	/*
@@ -640,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
640	* have constraints on va-space that make this illegal (IA64) -	640	* have constraints on va-space that make this illegal (IA64) -
641	* for the others its just a little faster.	641	* for the others its just a little faster.
642	*/	642	*/
643	free_pgd_range(tlb, old_start, old_end, new_end,	643	free_pgd_range(&tlb, old_start, old_end, new_end,
644	vma->vm_next ? vma->vm_next->vm_start : 0);	644	vma->vm_next ? vma->vm_next->vm_start : 0);
645	}	645	}
646	tlb_finish_mmu(tlb, new_end, old_end);	646	tlb_finish_mmu(&tlb, new_end, old_end);
647		647
648	/*	648	/*
649	* Shrink the vma to just the new range. Always succeeds.	649	* Shrink the vma to just the new range. Always succeeds.


diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index e43f9766259f..2d3547c84235 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h
@@ -5,6 +5,8 @@
5	* Copyright 2001 Red Hat, Inc.	5	* Copyright 2001 Red Hat, Inc.
6	* Based on code from mm/memory.c Copyright Linus Torvalds and others.	6	* Based on code from mm/memory.c Copyright Linus Torvalds and others.
7	*	7	*
		8	* Copyright 2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
		9	*
8	* This program is free software; you can redistribute it and/or	10	* This program is free software; you can redistribute it and/or
9	* modify it under the terms of the GNU General Public License	11	* modify it under the terms of the GNU General Public License
10	* as published by the Free Software Foundation; either version	12	* as published by the Free Software Foundation; either version
@@ -22,51 +24,71 @@
22	* and page free order so much..	24	* and page free order so much..
23	*/	25	*/
24	#ifdef CONFIG_SMP	26	#ifdef CONFIG_SMP
25	#ifdef ARCH_FREE_PTR_NR
26	#define FREE_PTR_NR ARCH_FREE_PTR_NR
27	#else
28	#define FREE_PTE_NR 506
29	#endif
30	#define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)	27	#define tlb_fast_mode(tlb) ((tlb)->nr == ~0U)
31	#else	28	#else
32	#define FREE_PTE_NR 1
33	#define tlb_fast_mode(tlb) 1	29	#define tlb_fast_mode(tlb) 1
34	#endif	30	#endif
35		31
		32	/*
		33	* If we can't allocate a page to make a big batch of page pointers
		34	* to work on, then just handle a few from the on-stack structure.
		35	*/
		36	#define MMU_GATHER_BUNDLE 8
		37
36	/* struct mmu_gather is an opaque type used by the mm code for passing around	38	/* struct mmu_gather is an opaque type used by the mm code for passing around
37	* any data needed by arch specific code for tlb_remove_page.	39	* any data needed by arch specific code for tlb_remove_page.
38	*/	40	*/
39	struct mmu_gather {	41	struct mmu_gather {
40	struct mm_struct *mm;	42	struct mm_struct *mm;
41	unsigned int nr; /* set to ~0U means fast mode */	43	unsigned int nr; /* set to ~0U means fast mode */
		44	unsigned int max; /* nr < max */
42	unsigned int need_flush;/* Really unmapped some ptes? */	45	unsigned int need_flush;/* Really unmapped some ptes? */
43	unsigned int fullmm; /* non-zero means full mm flush */	46	unsigned int fullmm; /* non-zero means full mm flush */
44	struct page * pages[FREE_PTE_NR];	47	#ifdef HAVE_ARCH_MMU_GATHER
		48	struct arch_mmu_gather arch;
		49	#endif
		50	struct page **pages;
		51	struct page *local[MMU_GATHER_BUNDLE];
45	};	52	};
46		53
47	/* Users of the generic TLB shootdown code must declare this storage space. */	54	static inline void __tlb_alloc_page(struct mmu_gather *tlb)
48	DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);	55	{
		56	unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
		57
		58	if (addr) {
		59	tlb->pages = (void *)addr;
		60	tlb->max = PAGE_SIZE / sizeof(struct page *);
		61	}
		62	}
49		63
50	/* tlb_gather_mmu	64	/* tlb_gather_mmu
51	* Return a pointer to an initialized struct mmu_gather.	65	* Called to initialize an (on-stack) mmu_gather structure for page-table
		66	* tear-down from @mm. The @fullmm argument is used when @mm is without
		67	* users and we're going to destroy the full address space (exit/execve).
52	*/	68	*/
53	static inline struct mmu_gather *	69	static inline void
54	tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)	70	tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
55	{	71	{
56	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
57
58	tlb->mm = mm;	72	tlb->mm = mm;
59		73
60	/* Use fast mode if only one CPU is online */	74	tlb->max = ARRAY_SIZE(tlb->local);
61	tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;	75	tlb->pages = tlb->local;
		76
		77	if (num_online_cpus() > 1) {
		78	tlb->nr = 0;
		79	__tlb_alloc_page(tlb);
		80	} else /* Use fast mode if only one CPU is online */
		81	tlb->nr = ~0U;
62		82
63	tlb->fullmm = full_mm_flush;	83	tlb->fullmm = fullmm;
64		84
65	return tlb;	85	#ifdef HAVE_ARCH_MMU_GATHER
		86	tlb->arch = ARCH_MMU_GATHER_INIT;
		87	#endif
66	}	88	}
67		89
68	static inline void	90	static inline void
69	tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)	91	tlb_flush_mmu(struct mmu_gather *tlb)
70	{	92	{
71	if (!tlb->need_flush)	93	if (!tlb->need_flush)
72	return;	94	return;
@@ -75,6 +97,13 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
75	if (!tlb_fast_mode(tlb)) {	97	if (!tlb_fast_mode(tlb)) {
76	free_pages_and_swap_cache(tlb->pages, tlb->nr);	98	free_pages_and_swap_cache(tlb->pages, tlb->nr);
77	tlb->nr = 0;	99	tlb->nr = 0;
		100	/*
		101	* If we are using the local on-stack array of pages for MMU
		102	* gather, try allocating an off-stack array again as we have
		103	* recently freed pages.
		104	*/
		105	if (tlb->pages == tlb->local)
		106	__tlb_alloc_page(tlb);
78	}	107	}
79	}	108	}
80		109
@@ -85,29 +114,42 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
85	static inline void	114	static inline void
86	tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)	115	tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
87	{	116	{
88	tlb_flush_mmu(tlb, start, end);	117	tlb_flush_mmu(tlb);
89		118
90	/* keep the page table cache within bounds */	119	/* keep the page table cache within bounds */
91	check_pgt_cache();	120	check_pgt_cache();
92		121
93	put_cpu_var(mmu_gathers);	122	if (tlb->pages != tlb->local)
		123	free_pages((unsigned long)tlb->pages, 0);
94	}	124	}
95		125
96	/* tlb_remove_page	126	/* __tlb_remove_page
97	* Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while	127	* Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
98	* handling the additional races in SMP caused by other CPUs caching valid	128	* handling the additional races in SMP caused by other CPUs caching valid
99	* mappings in their TLBs.	129	* mappings in their TLBs. Returns the number of free page slots left.
		130	* When out of page slots we must call tlb_flush_mmu().
100	*/	131	*/
101	static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)	132	static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
102	{	133	{
103	tlb->need_flush = 1;	134	tlb->need_flush = 1;
104	if (tlb_fast_mode(tlb)) {	135	if (tlb_fast_mode(tlb)) {
105	free_page_and_swap_cache(page);	136	free_page_and_swap_cache(page);
106	return;	137	return 1; /* avoid calling tlb_flush_mmu() */
107	}	138	}
108	tlb->pages[tlb->nr++] = page;	139	tlb->pages[tlb->nr++] = page;
109	if (tlb->nr >= FREE_PTE_NR)	140	VM_BUG_ON(tlb->nr > tlb->max);
110	tlb_flush_mmu(tlb, 0, 0);	141
		142	return tlb->max - tlb->nr;
		143	}
		144
		145	/* tlb_remove_page
		146	* Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
		147	* required.
		148	*/
		149	static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
		150	{
		151	if (!__tlb_remove_page(tlb, page))
		152	tlb_flush_mmu(tlb);
111	}	153	}
112		154
113	/**	155	/**


diff --git a/include/linux/mm.h b/include/linux/mm.h index d2948af126ca..ffcce9bf2b54 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h
@@ -906,7 +906,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
906	unsigned long size);	906	unsigned long size);
907	unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,	907	unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
908	unsigned long size, struct zap_details *);	908	unsigned long size, struct zap_details *);
909	unsigned long unmap_vmas(struct mmu_gather **tlb,	909	unsigned long unmap_vmas(struct mmu_gather *tlb,
910	struct vm_area_struct *start_vma, unsigned long start_addr,	910	struct vm_area_struct *start_vma, unsigned long start_addr,
911	unsigned long end_addr, unsigned long *nr_accounted,	911	unsigned long end_addr, unsigned long *nr_accounted,
912	struct zap_details *);	912	struct zap_details *);


diff --git a/mm/memory.c b/mm/memory.c index 4c6ea10f3d18..19b2d44de9f0 100644 --- a/mm/memory.c +++ b/mm/memory.c
@@ -912,12 +912,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
912	long *zap_work, struct zap_details *details)	912	long *zap_work, struct zap_details *details)
913	{	913	{
914	struct mm_struct *mm = tlb->mm;	914	struct mm_struct *mm = tlb->mm;
		915	int force_flush = 0;
915	pte_t *pte;	916	pte_t *pte;
916	spinlock_t *ptl;	917	spinlock_t *ptl;
917	int rss[NR_MM_COUNTERS];	918	int rss[NR_MM_COUNTERS];
918		919
919	init_rss_vec(rss);	920	init_rss_vec(rss);
920		921	again:
921	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);	922	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
922	arch_enter_lazy_mmu_mode();	923	arch_enter_lazy_mmu_mode();
923	do {	924	do {
@@ -974,7 +975,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
974	page_remove_rmap(page);	975	page_remove_rmap(page);
975	if (unlikely(page_mapcount(page) < 0))	976	if (unlikely(page_mapcount(page) < 0))
976	print_bad_pte(vma, addr, ptent, page);	977	print_bad_pte(vma, addr, ptent, page);
977	tlb_remove_page(tlb, page);	978	force_flush = !__tlb_remove_page(tlb, page);
		979	if (force_flush)
		980	break;
978	continue;	981	continue;
979	}	982	}
980	/*	983	/*
@@ -1001,6 +1004,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1001	arch_leave_lazy_mmu_mode();	1004	arch_leave_lazy_mmu_mode();
1002	pte_unmap_unlock(pte - 1, ptl);	1005	pte_unmap_unlock(pte - 1, ptl);
1003		1006
		1007	/*
		1008	* mmu_gather ran out of room to batch pages, we break out of
		1009	* the PTE lock to avoid doing the potential expensive TLB invalidate
		1010	* and page-free while holding it.
		1011	*/
		1012	if (force_flush) {
		1013	force_flush = 0;
		1014	tlb_flush_mmu(tlb);
		1015	if (addr != end)
		1016	goto again;
		1017	}
		1018
1004	return addr;	1019	return addr;
1005	}	1020	}
1006		1021
@@ -1121,17 +1136,14 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1121	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()	1136	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1122	* drops the lock and schedules.	1137	* drops the lock and schedules.
1123	*/	1138	*/
1124	unsigned long unmap_vmas(struct mmu_gather **tlbp,	1139	unsigned long unmap_vmas(struct mmu_gather *tlb,
1125	struct vm_area_struct *vma, unsigned long start_addr,	1140	struct vm_area_struct *vma, unsigned long start_addr,
1126	unsigned long end_addr, unsigned long *nr_accounted,	1141	unsigned long end_addr, unsigned long *nr_accounted,
1127	struct zap_details *details)	1142	struct zap_details *details)
1128	{	1143	{
1129	long zap_work = ZAP_BLOCK_SIZE;	1144	long zap_work = ZAP_BLOCK_SIZE;
1130	unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1131	int tlb_start_valid = 0;
1132	unsigned long start = start_addr;	1145	unsigned long start = start_addr;
1133	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;	1146	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1134	int fullmm = (*tlbp)->fullmm;
1135	struct mm_struct *mm = vma->vm_mm;	1147	struct mm_struct *mm = vma->vm_mm;
1136		1148
1137	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);	1149	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1164,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1152	untrack_pfn_vma(vma, 0, 0);	1164	untrack_pfn_vma(vma, 0, 0);
1153		1165
1154	while (start != end) {	1166	while (start != end) {
1155	if (!tlb_start_valid) {
1156	tlb_start = start;
1157	tlb_start_valid = 1;
1158	}
1159
1160	if (unlikely(is_vm_hugetlb_page(vma))) {	1167	if (unlikely(is_vm_hugetlb_page(vma))) {
1161	/*	1168	/*
1162	* It is undesirable to test vma->vm_file as it	1169	* It is undesirable to test vma->vm_file as it
@@ -1177,7 +1184,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1177		1184
1178	start = end;	1185	start = end;
1179	} else	1186	} else
1180	start = unmap_page_range(*tlbp, vma,	1187	start = unmap_page_range(tlb, vma,
1181	start, end, &zap_work, details);	1188	start, end, &zap_work, details);
1182		1189
1183	if (zap_work > 0) {	1190	if (zap_work > 0) {
@@ -1185,19 +1192,13 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1185	break;	1192	break;
1186	}	1193	}
1187		1194
1188	tlb_finish_mmu(*tlbp, tlb_start, start);
1189
1190	if (need_resched() ||	1195	if (need_resched() ||
1191	(i_mmap_lock && spin_needbreak(i_mmap_lock))) {	1196	(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1192	if (i_mmap_lock) {	1197	if (i_mmap_lock)
1193	*tlbp = NULL;
1194	goto out;	1198	goto out;
1195	}
1196	cond_resched();	1199	cond_resched();
1197	}	1200	}
1198		1201
1199	*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1200	tlb_start_valid = 0;
1201	zap_work = ZAP_BLOCK_SIZE;	1202	zap_work = ZAP_BLOCK_SIZE;
1202	}	1203	}
1203	}	1204	}
@@ -1217,16 +1218,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1217	unsigned long size, struct zap_details *details)	1218	unsigned long size, struct zap_details *details)
1218	{	1219	{
1219	struct mm_struct *mm = vma->vm_mm;	1220	struct mm_struct *mm = vma->vm_mm;
1220	struct mmu_gather *tlb;	1221	struct mmu_gather tlb;
1221	unsigned long end = address + size;	1222	unsigned long end = address + size;
1222	unsigned long nr_accounted = 0;	1223	unsigned long nr_accounted = 0;
1223		1224
1224	lru_add_drain();	1225	lru_add_drain();
1225	tlb = tlb_gather_mmu(mm, 0);	1226	tlb_gather_mmu(&tlb, mm, 0);
1226	update_hiwater_rss(mm);	1227	update_hiwater_rss(mm);
1227	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);	1228	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1228	if (tlb)	1229	tlb_finish_mmu(&tlb, address, end);
1229	tlb_finish_mmu(tlb, address, end);
1230	return end;	1230	return end;
1231	}	1231	}
1232		1232


diff --git a/mm/mmap.c b/mm/mmap.c
index adb12527fd0e..40d49986e714 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1903,17 +1903,17 @@ static void unmap_region(struct mm_struct *mm,
1903	unsigned long start, unsigned long end)	1903	unsigned long start, unsigned long end)
1904	{	1904	{
1905	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;	1905	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1906	struct mmu_gather *tlb;	1906	struct mmu_gather tlb;
1907	unsigned long nr_accounted = 0;	1907	unsigned long nr_accounted = 0;
1908		1908
1909	lru_add_drain();	1909	lru_add_drain();
1910	tlb = tlb_gather_mmu(mm, 0);	1910	tlb_gather_mmu(&tlb, mm, 0);
1911	update_hiwater_rss(mm);	1911	update_hiwater_rss(mm);
1912	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);	1912	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1913	vm_unacct_memory(nr_accounted);	1913	vm_unacct_memory(nr_accounted);
1914	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,	1914	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1915	next? next->vm_start: 0);	1915	next ? next->vm_start : 0);
1916	tlb_finish_mmu(tlb, start, end);	1916	tlb_finish_mmu(&tlb, start, end);
1917	}	1917	}
1918		1918
1919	/*	1919	/*
@@ -2255,7 +2255,7 @@ EXPORT_SYMBOL(do_brk);
2255	/* Release all mmaps. */	2255	/* Release all mmaps. */
2256	void exit_mmap(struct mm_struct *mm)	2256	void exit_mmap(struct mm_struct *mm)
2257	{	2257	{
2258	struct mmu_gather *tlb;	2258	struct mmu_gather tlb;
2259	struct vm_area_struct *vma;	2259	struct vm_area_struct *vma;
2260	unsigned long nr_accounted = 0;	2260	unsigned long nr_accounted = 0;
2261	unsigned long end;	2261	unsigned long end;
@@ -2280,14 +2280,14 @@ void exit_mmap(struct mm_struct *mm)
2280		2280
2281	lru_add_drain();	2281	lru_add_drain();
2282	flush_cache_mm(mm);	2282	flush_cache_mm(mm);
2283	tlb = tlb_gather_mmu(mm, 1);	2283	tlb_gather_mmu(&tlb, mm, 1);
2284	/* update_hiwater_rss(mm) here? but nobody should be looking */	2284	/* update_hiwater_rss(mm) here? but nobody should be looking */
2285	/* Use -1 here to ensure all VMAs in the mm are unmapped */	2285	/* Use -1 here to ensure all VMAs in the mm are unmapped */
2286	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);	2286	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2287	vm_unacct_memory(nr_accounted);	2287	vm_unacct_memory(nr_accounted);
2288		2288
2289	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);	2289	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2290	tlb_finish_mmu(tlb, 0, end);	2290	tlb_finish_mmu(&tlb, 0, end);
2291		2291
2292	/*	2292	/*
2293	* Walk the list again, actually closing and freeing it,	2293	* Walk the list again, actually closing and freeing it,