author     Hugh Dickins <hugh@veritas.com>          2005-10-29 21:16:36 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>    2005-10-30 00:40:42 -0400
commit     69b0475456ff7ef520e16f69d7a15c0d68b74e64 (patch)
tree       3e70d47f16437254eff3b3cca4aa275be1b5e275
parent     60ec5585496871345c1a8113d7b60ed9d9474866 (diff)
[PATCH] mm: arm ready for split ptlock
Prepare arm for the split page_table_lock: three issues.

Signal handling's preserve and restore of iwmmxt context currently involves reading and writing that context to and from user space, while holding page_table_lock to secure the user page(s) against kswapd. If we split the lock, then the structure might span two pages, secured by different locks. That technique would no longer work cleanly, so instead read into and write from a kernel stack buffer, copying that out and in without locking (the structure is 160 bytes in size, and here we're near the top of the kernel stack). Or would the overhead be noticeable?

arm_syscall's cmpxchg emulation now uses pte_offset_map_lock instead of pte_offset_map and the mm-wide page_table_lock; and strictly, it should now also take mmap_sem before descending to pmd, to guard against another thread munmapping and the page table being pulled out beneath this thread.

Updated two comments in fault-armv.c. adjust_pte is interesting, since its modification of a pte in one part of the mm depends on the lock held when calling update_mmu_cache for a pte in some other part of that mm. This can't be done with a split page_table_lock (and we've already taken the lowest lock in the hierarchy here): so we'll have to disable split on arm, unless CONFIG_CPU_CACHE_VIPT ensures adjust_pte is never used.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
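The locking pattern the second paragraph describes is worth spelling out. The sketch below is not part of the patch: it is a hypothetical helper (example_set_word, with invented names throughout) showing the sequence the traps.c hunk adopts, assuming a 2.6.14-era ARM mm where pmd_offset() takes the pgd directly. Hold mmap_sem for read so a concurrent munmap cannot free the page table, then let pte_offset_map_lock() take whichever lock covers that page table, split or mm-wide.

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical example, not kernel code: write val at a user address. */
static int example_set_word(struct mm_struct *mm, unsigned long addr,
			    unsigned long val)
{
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = -EFAULT;

	down_read(&mm->mmap_sem);	/* guard against munmap freeing the table */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;
	pmd = pmd_offset(pgd, addr);
	if (!pmd_present(*pmd))
		goto out;
	/* map the pte and take the lock covering its page table */
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (pte_present(*pte) && pte_write(*pte)) {
		*(unsigned long *)addr = val;	/* pte lock keeps the page mapped */
		ret = 0;
	}
	pte_unmap_unlock(pte, ptl);
out:
	up_read(&mm->mmap_sem);
	return ret;
}

With the mm-wide page_table_lock this collapses to the old spin_lock/spin_unlock pair; with split ptlock it takes only the lock for the one page table touched, which is the point of the conversion.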
-rw-r--r--  arch/arm/kernel/signal.c  |  96
-rw-r--r--  arch/arm/kernel/traps.c   |  14
-rw-r--r--  arch/arm/mm/fault-armv.c  |   7
3 files changed, 33 insertions, 84 deletions
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index a94d75fef598..a917e3dd3666 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -139,93 +139,33 @@ struct iwmmxt_sigframe {
 	unsigned long	storage[0x98/4];
 };
 
-static int page_present(struct mm_struct *mm, void __user *uptr, int wr)
-{
-	unsigned long addr = (unsigned long)uptr;
-	pgd_t *pgd = pgd_offset(mm, addr);
-	if (pgd_present(*pgd)) {
-		pmd_t *pmd = pmd_offset(pgd, addr);
-		if (pmd_present(*pmd)) {
-			pte_t *pte = pte_offset_map(pmd, addr);
-			return (pte_present(*pte) && (!wr || pte_write(*pte)));
-		}
-	}
-	return 0;
-}
-
-static int copy_locked(void __user *uptr, void *kptr, size_t size, int write,
-		       void (*copyfn)(void *, void __user *))
-{
-	unsigned char v, __user *userptr = uptr;
-	int err = 0;
-
-	do {
-		struct mm_struct *mm;
-
-		if (write) {
-			__put_user_error(0, userptr, err);
-			__put_user_error(0, userptr + size - 1, err);
-		} else {
-			__get_user_error(v, userptr, err);
-			__get_user_error(v, userptr + size - 1, err);
-		}
-
-		if (err)
-			break;
-
-		mm = current->mm;
-		spin_lock(&mm->page_table_lock);
-		if (page_present(mm, userptr, write) &&
-		    page_present(mm, userptr + size - 1, write)) {
-			copyfn(kptr, uptr);
-		} else
-			err = 1;
-		spin_unlock(&mm->page_table_lock);
-	} while (err);
-
-	return err;
-}
-
 static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	int err = 0;
+	char kbuf[sizeof(*frame) + 8];
+	struct iwmmxt_sigframe *kframe;
 
 	/* the iWMMXt context must be 64 bit aligned */
-	WARN_ON((unsigned long)frame & 7);
-
-	__put_user_error(IWMMXT_MAGIC0, &frame->magic0, err);
-	__put_user_error(IWMMXT_MAGIC1, &frame->magic1, err);
-
-	/*
-	 * iwmmxt_task_copy() doesn't check user permissions.
-	 * Let's do a dummy write on the upper boundary to ensure
-	 * access to user mem is OK all way up.
-	 */
-	err |= copy_locked(&frame->storage, current_thread_info(),
-			   sizeof(frame->storage), 1, iwmmxt_task_copy);
-	return err;
+	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+	kframe->magic0 = IWMMXT_MAGIC0;
+	kframe->magic1 = IWMMXT_MAGIC1;
+	iwmmxt_task_copy(current_thread_info(), &kframe->storage);
+	return __copy_to_user(frame, kframe, sizeof(*frame));
 }
 
 static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame)
 {
-	unsigned long magic0, magic1;
-	int err = 0;
+	char kbuf[sizeof(*frame) + 8];
+	struct iwmmxt_sigframe *kframe;
 
-	/* the iWMMXt context is 64 bit aligned */
-	WARN_ON((unsigned long)frame & 7);
-
-	/*
-	 * Validate iWMMXt context signature.
-	 * Also, iwmmxt_task_restore() doesn't check user permissions.
-	 * Let's do a dummy write on the upper boundary to ensure
-	 * access to user mem is OK all way up.
-	 */
-	__get_user_error(magic0, &frame->magic0, err);
-	__get_user_error(magic1, &frame->magic1, err);
-	if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1)
-		err = copy_locked(&frame->storage, current_thread_info(),
-				  sizeof(frame->storage), 0, iwmmxt_task_restore);
-	return err;
+	/* the iWMMXt context must be 64 bit aligned */
+	kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7);
+	if (__copy_from_user(kframe, frame, sizeof(*frame)))
+		return -1;
+	if (kframe->magic0 != IWMMXT_MAGIC0 ||
+	    kframe->magic1 != IWMMXT_MAGIC1)
+		return -1;
+	iwmmxt_task_restore(current_thread_info(), &kframe->storage);
+	return 0;
 }
 
 #endif
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index baa09601a64e..66e5a0516f23 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		unsigned long addr = regs->ARM_r2;
 		struct mm_struct *mm = current->mm;
 		pgd_t *pgd; pmd_t *pmd; pte_t *pte;
+		spinlock_t *ptl;
 
 		regs->ARM_cpsr &= ~PSR_C_BIT;
-		spin_lock(&mm->page_table_lock);
+		down_read(&mm->mmap_sem);
 		pgd = pgd_offset(mm, addr);
 		if (!pgd_present(*pgd))
 			goto bad_access;
 		pmd = pmd_offset(pgd, addr);
 		if (!pmd_present(*pmd))
 			goto bad_access;
-		pte = pte_offset_map(pmd, addr);
-		if (!pte_present(*pte) || !pte_write(*pte))
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		if (!pte_present(*pte) || !pte_write(*pte)) {
+			pte_unmap_unlock(pte, ptl);
 			goto bad_access;
+		}
 		val = *(unsigned long *)addr;
 		val -= regs->ARM_r0;
 		if (val == 0) {
 			*(unsigned long *)addr = regs->ARM_r1;
 			regs->ARM_cpsr |= PSR_C_BIT;
 		}
-		spin_unlock(&mm->page_table_lock);
+		pte_unmap_unlock(pte, ptl);
+		up_read(&mm->mmap_sem);
 		return val;
 
 		bad_access:
-		spin_unlock(&mm->page_table_lock);
+		up_read(&mm->mmap_sem);
 		/* simulate a write access fault */
 		do_DataAbort(addr, 15 + (1 << 11), regs);
 		return -1;
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index be4ab3d73c91..7fc1b35a6746 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE;
 /*
  * We take the easy way out of this problem - we make the
  * PTE uncacheable. However, we leave the write buffer on.
+ *
+ * Note that the pte lock held when calling update_mmu_cache must also
+ * guard the pte (somewhere else in the same mm) that we modify here.
+ * Therefore those configurations which might call adjust_pte (those
+ * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock.
  */
 static int adjust_pte(struct vm_area_struct *vma, unsigned long address)
 {
@@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page);
  * 2. If we have multiple shared mappings of the same space in
  *    an object, we need to deal with the cache aliasing issues.
  *
- * Note that the page_table_lock will be held.
+ * Note that the pte lock will be held.
  */
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {