Diffstat (limited to 'mm/mmap.c')
-rw-r--r--   mm/mmap.c | 180
1 file changed, 176 insertions, 4 deletions
@@ -26,12 +26,15 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
 
+#include "internal.h"
+
 #ifndef arch_mmap_check
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
@@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
 		if (vma_tmp->vm_end > addr) {
 			vma = vma_tmp;
 			if (vma_tmp->vm_start <= addr)
-				return vma;
+				break;
 			__rb_link = &__rb_parent->rb_left;
 		} else {
 			rb_prev = __rb_parent;
@@ -1108,6 +1111,9 @@ munmap_back:
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
+	if (flags & MAP_NORESERVE)
+		vm_flags |= VM_NORESERVE;
+
 	if (accountable && (!(flags & MAP_NORESERVE) ||
 			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
 		if (vm_flags & VM_SHARED) {
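The new lines above propagate MAP_NORESERVE from the mmap() flags into the vma's vm_flags as VM_NORESERVE. A minimal userspace sketch of a caller that requests such a mapping (the 64 MB size is arbitrary and only illustrative); note that, per the unchanged check below the added lines, the flag still does not skip accounting when overcommit is set to OVERCOMMIT_NEVER:

    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
            size_t len = 64UL << 20;	/* 64 MB, arbitrary */

            /* Anonymous mapping with no swap-space reservation requested. */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            munmap(p, len);
            return 0;
    }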
@@ -1763,7 +1769,7 @@ static void unmap_region(struct mm_struct *mm,
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
+	free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
 }
@@ -1807,7 +1813,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	struct mempolicy *pol;
 	struct vm_area_struct *new;
 
-	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+	if (is_vm_hugetlb_page(vma) && (addr &
+					~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
 
 	if (mm->map_count >= sysctl_max_map_count)
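The split_vma() change above replaces the single global HPAGE_MASK with the mask of the hstate backing this particular vma, since hugetlbfs now supports more than one huge page size. A standalone sketch of the same alignment arithmetic, with a 2 MB huge page assumed purely for illustration (the kernel obtains the mask via huge_page_mask(hstate_vma(vma))):

    #include <stdio.h>

    /* ~(size - 1): the per-size equivalent of the old global HPAGE_MASK. */
    static unsigned long hpage_mask(unsigned long hpage_size)
    {
            return ~(hpage_size - 1);
    }

    int main(void)
    {
            unsigned long mask = hpage_mask(2UL << 20);	/* 2 MB assumed */
            unsigned long addr = 0x40100000UL;		/* 1 MB into a huge page */

            /* split_vma() returns -EINVAL for such an unaligned split point. */
            printf("addr %#lx is %shuge-page aligned\n", addr,
                   (addr & ~mask) ? "not " : "");
            return 0;
    }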
@@ -2055,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	/* mm's last user has gone, and its about to be pulled down */
 	arch_exit_mmap(mm);
+	mmu_notifier_release(mm);
 
 	lru_add_drain();
 	flush_cache_mm(mm);
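exit_mmap() now calls mmu_notifier_release() right after arch_exit_mmap(), so any notifier attached to the mm gets its ->release() callback before the address space is torn down. A hedged sketch of a minimal in-kernel user of the notifier interface added elsewhere in this series (my_release/my_attach are made-up names; in this series, registration takes mmap_sem and mm_take_all_locks() internally):

    #include <linux/kernel.h>
    #include <linux/mm_types.h>
    #include <linux/mmu_notifier.h>

    /* Hypothetical notifier: log when the mm is about to be pulled down. */
    static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
    {
            /* Runs from exit_mmap() via mmu_notifier_release(mm). */
            printk(KERN_INFO "mm %p is going away\n", mm);
    }

    static const struct mmu_notifier_ops my_ops = {
            .release = my_release,
    };

    static struct mmu_notifier my_mn = {
            .ops = &my_ops,
    };

    static int my_attach(struct mm_struct *mm)
    {
            return mmu_notifier_register(&my_mn, mm);
    }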
@@ -2063,7 +2071,7 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
 	/*
@@ -2262,3 +2270,167 @@ int install_special_mapping(struct mm_struct *mm,
 
 	return 0;
 }
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+	if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+		/*
+		 * The LSB of head.next can't change from under us
+		 * because we hold the mm_all_locks_mutex.
+		 */
+		spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+		/*
+		 * We can safely modify head.next after taking the
+		 * anon_vma->lock. If some other vma in this mm shares
+		 * the same anon_vma we won't take it again.
+		 *
+		 * No need of atomic instructions here, head.next
+		 * can't change from under us thanks to the
+		 * anon_vma->lock.
+		 */
+		if (__test_and_set_bit(0, (unsigned long *)
+				       &anon_vma->head.next))
+			BUG();
+	}
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+		/*
+		 * AS_MM_ALL_LOCKS can't change from under us because
+		 * we hold the mm_all_locks_mutex.
+		 *
+		 * Operations on ->flags have to be atomic because
+		 * even if AS_MM_ALL_LOCKS is stable thanks to the
+		 * mm_all_locks_mutex, there may be other cpus
+		 * changing other bitflags in parallel to us.
+		 */
+		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+			BUG();
+		spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
+	}
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid to take the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks are expensive operations
+ * that may have to take thousand of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	int ret = -EINTR;
+
+	BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+	mutex_lock(&mm_all_locks_mutex);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (signal_pending(current))
+			goto out_unlock;
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			vm_lock_mapping(mm, vma->vm_file->f_mapping);
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (signal_pending(current))
+			goto out_unlock;
+		if (vma->anon_vma)
+			vm_lock_anon_vma(mm, vma->anon_vma);
+	}
+
+	ret = 0;
+
+out_unlock:
+	if (ret)
+		mm_drop_all_locks(mm);
+
+	return ret;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+	if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+		/*
+		 * The LSB of head.next can't change to 0 from under
+		 * us because we hold the mm_all_locks_mutex.
+		 *
+		 * We must however clear the bitflag before unlocking
+		 * the vma so the users using the anon_vma->head will
+		 * never see our bitflag.
+		 *
+		 * No need of atomic instructions here, head.next
+		 * can't change from under us until we release the
+		 * anon_vma->lock.
+		 */
+		if (!__test_and_clear_bit(0, (unsigned long *)
+					  &anon_vma->head.next))
+			BUG();
+		spin_unlock(&anon_vma->lock);
+	}
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+		/*
+		 * AS_MM_ALL_LOCKS can't change to 0 from under us
+		 * because we hold the mm_all_locks_mutex.
+		 */
+		spin_unlock(&mapping->i_mmap_lock);
+		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+					&mapping->flags))
+			BUG();
+	}
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	BUG_ON(down_read_trylock(&mm->mmap_sem));
+	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->anon_vma)
+			vm_unlock_anon_vma(vma->anon_vma);
+		if (vma->vm_file && vma->vm_file->f_mapping)
+			vm_unlock_mapping(vma->vm_file->f_mapping);
+	}
+
+	mutex_unlock(&mm_all_locks_mutex);
+}
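Taken together with the comment above mm_take_all_locks(), the intended calling pattern looks roughly like the sketch below (with_all_locks() and update_mm are hypothetical names; the mmu notifier registration path added elsewhere in this series is the real caller). Note that the failure path of mm_take_all_locks() already drops whatever locks it managed to take, so the caller only calls mm_drop_all_locks() on success:

    /* Sketch: mmap_sem must be held for writing across the whole section. */
    static int with_all_locks(struct mm_struct *mm,
                              void (*update_mm)(struct mm_struct *))
    {
            int ret;

            down_write(&mm->mmap_sem);
            ret = mm_take_all_locks(mm);	/* -EINTR if a signal is pending */
            if (!ret) {
                    /*
                     * Every i_mmap_lock and anon_vma->lock in this mm is
                     * held, so no pte/vma operation can run concurrently
                     * with update_mm().
                     */
                    update_mm(mm);
                    mm_drop_all_locks(mm);
            }
            up_write(&mm->mmap_sem);

            return ret;
    }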