Diffstat (limited to 'arch/cris/arch-v32/lib/usercopy.c')
-rw-r--r-- | arch/cris/arch-v32/lib/usercopy.c | 470
1 file changed, 470 insertions, 0 deletions
diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c
new file mode 100644
index 000000000000..f0b08460c1be
--- /dev/null
+++ b/arch/cris/arch-v32/lib/usercopy.c
@@ -0,0 +1,470 @@ | |||
1 | /* | ||
2 | * User address space access functions. | ||
3 | * The non-inlined parts of asm-cris/uaccess.h are here. | ||
4 | * | ||
5 | * Copyright (C) 2000, 2003 Axis Communications AB. | ||
6 | * | ||
7 | * Written by Hans-Peter Nilsson. | ||
8 | * Pieces used from memcpy, originally by Kenny Ranerup long time ago. | ||
9 | */ | ||
10 | |||
11 | #include <asm/uaccess.h> | ||
12 | |||
13 | /* The asm statements have been tweaked (within the domain of correctness) | ||
14 | to give satisfactory results for "gcc version 3.2.1 Axis release R53/1.53-v32". | ||
15 | |||
16 | Check regularly... | ||
17 | |||
18 | Note that for CRISv32, the PC saved at a bus-fault is the address | ||
19 | *at* the faulting instruction, with a special case for instructions | ||
20 | in delay slots: then it's the address of the branch. Note also that | ||
21 | in contrast to v10, a postincrement in the instruction is *not* | ||
22 | performed at a bus-fault; the register is seen having the original | ||
23 | value in fault handlers. */ | ||
24 | |||
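/* Illustrative sketch, not part of this file: the "__ex_table" sections
   emitted by the asm blocks below are pairs of addresses that the fault
   handler searches when a user access faults.  The struct name and exact
   layout here are assumptions based on the classic kernel scheme; the
   real definition lives in the uaccess headers.  */
#if 0	/* sketch only, never compiled */
struct exception_table_entry {
	unsigned long insn;	/* address of the instruction that may fault */
	unsigned long fixup;	/* address to resume at if it does fault */
};
#endif
/* Each ".dword 1b,4b"-style directive below emits one such pair: if the
   access at local label 1 faults, execution continues at label 4, with the
   registers still holding their pre-fault values (see the note above).  */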
25 | |||
26 | /* Copy to userspace. This is based on the memcpy used for | ||
27 | kernel-to-kernel copying; see "string.c". */ | ||
28 | |||
29 | unsigned long | ||
30 | __copy_user (void __user *pdst, const void *psrc, unsigned long pn) | ||
31 | { | ||
32 | /* We want the parameters put in special registers. | ||
33 | Make sure the compiler is able to make something useful of this. | ||
34 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | ||
35 | |||
36 | FIXME: Comment for old gcc version. Check. | ||
37 | If gcc were all right, it really would need no temporaries, and no | ||
38 | stack space to save stuff on. */ | ||
39 | |||
40 | register char *dst __asm__ ("r13") = pdst; | ||
41 | register const char *src __asm__ ("r11") = psrc; | ||
42 | register int n __asm__ ("r12") = pn; | ||
43 | register int retn __asm__ ("r10") = 0; | ||
44 | |||
45 | |||
46 | /* When src is aligned but not dst, this costs a few needless extra | ||
47 | cycles. I believe it would take as many to check that the | ||
48 | re-alignment was unnecessary. */ | ||
49 | if (((unsigned long) dst & 3) != 0 | ||
50 | /* Don't align if we wouldn't copy more than a few bytes, so that we | ||
51 | don't have to check further for overflows. */ | ||
52 | && n >= 3) | ||
53 | { | ||
54 | if ((unsigned long) dst & 1) | ||
55 | { | ||
56 | __asm_copy_to_user_1 (dst, src, retn); | ||
57 | n--; | ||
58 | } | ||
59 | |||
60 | if ((unsigned long) dst & 2) | ||
61 | { | ||
62 | __asm_copy_to_user_2 (dst, src, retn); | ||
63 | n -= 2; | ||
64 | } | ||
65 | } | ||
66 | |||
67 | /* Movem is dirt cheap. The overhead is low enough to always use the | ||
68 | minimum possible block size (11 registers = 44 bytes) as the threshold. */ | ||
69 | if (n >= 44) | ||
70 | { | ||
71 | /* For large copies we use 'movem'. */ | ||
72 | |||
73 | /* It is not optimal to tell the compiler about clobbering any | ||
74 | registers; that will move the saving/restoring of those registers | ||
75 | to the function prologue/epilogue, and make non-movem sizes | ||
76 | suboptimal. */ | ||
77 | __asm__ volatile ("\ | ||
78 | ;; Check that the register asm declarations came out right.	 \n\ | ||
79 | ;; The GCC manual explicitly says TRT (the right thing) will happen. \n\ | ||
80 | .ifnc %0%1%2%3,$r13$r11$r12$r10 \n\ | ||
81 | .err \n\ | ||
82 | .endif \n\ | ||
83 | \n\ | ||
84 | ;; Save the registers we'll use in the movem process \n\ | ||
85 | ;; on the stack. \n\ | ||
86 | subq 11*4,$sp \n\ | ||
87 | movem $r10,[$sp] \n\ | ||
88 | \n\ | ||
89 | ;; Now we've got this: \n\ | ||
90 | ;; r11 - src \n\ | ||
91 | ;; r13 - dst \n\ | ||
92 | ;; r12 - n \n\ | ||
93 | \n\ | ||
94 | ;; Update n for the first loop \n\ | ||
95 | subq 44,$r12 \n\ | ||
96 | 0: \n\ | ||
97 | movem [$r11+],$r10 \n\ | ||
98 | subq 44,$r12 \n\ | ||
99 | 1: bge 0b \n\ | ||
100 | movem $r10,[$r13+] \n\ | ||
101 | 3: \n\ | ||
102 | addq 44,$r12 ;; compensate for last loop underflowing n \n\ | ||
103 | \n\ | ||
104 | ;; Restore registers from stack \n\ | ||
105 | movem [$sp+],$r10 \n\ | ||
106 | 2: \n\ | ||
107 | .section .fixup,\"ax\" \n\ | ||
108 | 4: \n\ | ||
109 | ; When failing on any of the 1..44 bytes in a chunk, we adjust back the \n\ | ||
110 | ; source pointer and just drop through to the by-16 and by-4 loops to \n\ | ||
111 | ; get the correct number of failing bytes. This necessarily means a \n\ | ||
112 | ; few extra exceptions, but invalid user pointers shouldn't happen in \n\ | ||
113 | ; time-critical code anyway. \n\ | ||
114 | jump 3b \n\ | ||
115 | subq 44,$r11 \n\ | ||
116 | \n\ | ||
117 | .previous \n\ | ||
118 | .section __ex_table,\"a\" \n\ | ||
119 | .dword 1b,4b \n\ | ||
120 | .previous" | ||
121 | |||
122 | /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn) | ||
123 | /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn)); | ||
124 | |||
125 | } | ||
126 | |||
127 | while (n >= 16) | ||
128 | { | ||
129 | __asm_copy_to_user_16 (dst, src, retn); | ||
130 | n -= 16; | ||
131 | } | ||
132 | |||
133 | /* Having a separate by-four loop cuts down on cache footprint. | ||
134 | FIXME: Test with and without; increase the switch to cover 0..15. */ | ||
135 | while (n >= 4) | ||
136 | { | ||
137 | __asm_copy_to_user_4 (dst, src, retn); | ||
138 | n -= 4; | ||
139 | } | ||
140 | |||
141 | switch (n) | ||
142 | { | ||
143 | case 0: | ||
144 | break; | ||
145 | case 1: | ||
146 | __asm_copy_to_user_1 (dst, src, retn); | ||
147 | break; | ||
148 | case 2: | ||
149 | __asm_copy_to_user_2 (dst, src, retn); | ||
150 | break; | ||
151 | case 3: | ||
152 | __asm_copy_to_user_3 (dst, src, retn); | ||
153 | break; | ||
154 | } | ||
155 | |||
156 | return retn; | ||
157 | } | ||
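/* Usage sketch, not part of this file: callers do not invoke __copy_user
   directly.  The inline wrapper (assumed here; the real one lives in
   asm-cris/uaccess.h and may differ in detail) range-checks the user
   pointer first and only then falls through to this routine.  */
#if 0	/* sketch only, never compiled */
static inline unsigned long
copy_to_user (void __user *to, const void *from, unsigned long n)
{
	if (access_ok (VERIFY_WRITE, to, n))
		return __copy_user (to, from, n);
	return n;		/* Nothing copied; report every byte as failed.  */
}
#endif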
158 | |||
159 | /* Copy from user to kernel, zeroing the bytes that were inaccessible in | ||
160 | userland. The return-value is the number of bytes that were | ||
161 | inaccessible. */ | ||
162 | |||
163 | unsigned long | ||
164 | __copy_user_zeroing (void __user *pdst, const void *psrc, unsigned long pn) | ||
165 | { | ||
166 | /* We want the parameters put in special registers. | ||
167 | Make sure the compiler is able to make something useful of this. | ||
168 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | ||
169 | |||
170 | FIXME: Comment for old gcc version. Check. | ||
171 | If gcc were all right, it really would need no temporaries, and no | ||
172 | stack space to save stuff on. */ | ||
173 | |||
174 | register char *dst __asm__ ("r13") = pdst; | ||
175 | register const char *src __asm__ ("r11") = psrc; | ||
176 | register int n __asm__ ("r12") = pn; | ||
177 | register int retn __asm__ ("r10") = 0; | ||
178 | |||
179 | /* The best reason to align src is that we then know that a read-fault | ||
180 | was for aligned bytes; there are no 1..3 remaining good bytes to | ||
181 | pickle. */ | ||
182 | if (((unsigned long) src & 3) != 0) | ||
183 | { | ||
184 | if (((unsigned long) src & 1) && n != 0) | ||
185 | { | ||
186 | __asm_copy_from_user_1 (dst, src, retn); | ||
187 | n--; | ||
188 | } | ||
189 | |||
190 | if (((unsigned long) src & 2) && n >= 2) | ||
191 | { | ||
192 | __asm_copy_from_user_2 (dst, src, retn); | ||
193 | n -= 2; | ||
194 | } | ||
195 | |||
196 | /* We only need one check after the unalignment-adjustments, because | ||
197 | if both adjustments were done, either both or neither reference | ||
198 | had an exception. */ | ||
199 | if (retn != 0) | ||
200 | goto copy_exception_bytes; | ||
201 | } | ||
202 | |||
203 | /* Movem is dirt cheap. The overhead is low enough to always use the | ||
204 | minimum possible block size as the threshold. */ | ||
205 | if (n >= 44) | ||
206 | { | ||
207 | /* It is not optimal to tell the compiler about clobbering any | ||
208 | registers; that will move the saving/restoring of those registers | ||
209 | to the function prologue/epilogue, and make non-movem sizes | ||
210 | suboptimal. */ | ||
211 | __asm__ volatile ("\ | ||
212 | .ifnc %0%1%2%3,$r13$r11$r12$r10 \n\ | ||
213 | .err \n\ | ||
214 | .endif \n\ | ||
215 | \n\ | ||
216 | ;; Save the registers we'll use in the movem process \n\ | ||
217 | ;; on the stack. \n\ | ||
218 | subq 11*4,$sp \n\ | ||
219 | movem $r10,[$sp] \n\ | ||
220 | \n\ | ||
221 | ;; Now we've got this: \n\ | ||
222 | ;; r11 - src \n\ | ||
223 | ;; r13 - dst \n\ | ||
224 | ;; r12 - n \n\ | ||
225 | \n\ | ||
226 | ;; Update n for the first loop \n\ | ||
227 | subq 44,$r12 \n\ | ||
228 | 0: \n\ | ||
229 | movem [$r11+],$r10 \n\ | ||
230 | \n\ | ||
231 | subq 44,$r12 \n\ | ||
232 | bge 0b \n\ | ||
233 | movem $r10,[$r13+] \n\ | ||
234 | \n\ | ||
235 | 4: \n\ | ||
236 | addq 44,$r12 ;; compensate for last loop underflowing n \n\ | ||
237 | \n\ | ||
238 | ;; Restore registers from stack \n\ | ||
239 | movem [$sp+],$r10 \n\ | ||
240 | .section .fixup,\"ax\" \n\ | ||
241 | \n\ | ||
242 | ;; Do not jump back into the loop if we fail. For some uses, we get a \n\ | ||
243 | ;; page fault somewhere on the line. Without checking for page limits, \n\ | ||
244 | ;; we don't know where, but we need to copy accurately and keep an \n\ | ||
245 | ;; accurate count; not just clear the whole line. To do that, we fall \n\ | ||
246 | ;; down in the code below, proceeding with smaller amounts. It should \n\ | ||
247 | ;; be kept in mind that we have to cater to code like what at one time \n\ | ||
248 | ;; was in fs/super.c: \n\ | ||
249 | ;; i = size - copy_from_user((void *)page, data, size); \n\ | ||
250 | ;; which would cause repeated faults while clearing the remainder of \n\ | ||
251 | ;; the SIZE bytes at PAGE after the first fault. \n\ | ||
252 | ;; A caveat here is that we must not fall through from a failing page \n\ | ||
253 | ;; to a valid page. \n\ | ||
254 | \n\ | ||
255 | 3: \n\ | ||
256 | jump 4b ;; Fall through, pretending the fault didn't happen. \n\ | ||
257 | nop \n\ | ||
258 | \n\ | ||
259 | .previous \n\ | ||
260 | .section __ex_table,\"a\" \n\ | ||
261 | .dword 0b,3b \n\ | ||
262 | .previous" | ||
263 | |||
264 | /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn) | ||
265 | /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn)); | ||
266 | } | ||
267 | |||
268 | /* Either we directly start copying here, using dword copying in a loop, | ||
269 | or we copy as much as possible with 'movem' and then the last block | ||
270 | (<44 bytes) is copied here. This will work since 'movem' will have | ||
271 | updated src, dst and n. (Except with failing src.) | ||
272 | |||
273 | Since we want to keep src accurate, we can't use | ||
274 | __asm_copy_from_user_N with N != (1, 2, 4); it updates dst and | ||
275 | retn, but not src (by design; its value is ignored elsewhere). */ | ||
276 | |||
277 | while (n >= 4) | ||
278 | { | ||
279 | __asm_copy_from_user_4 (dst, src, retn); | ||
280 | n -= 4; | ||
281 | |||
282 | if (retn) | ||
283 | goto copy_exception_bytes; | ||
284 | } | ||
285 | |||
286 | /* If we get here, there were no memory read faults. */ | ||
287 | switch (n) | ||
288 | { | ||
289 | /* These copies are at least "naturally aligned" (so we don't have | ||
290 | to check each byte), due to the src alignment code before the | ||
291 | movem loop. The *_3 case *will* get the correct count for retn. */ | ||
292 | case 0: | ||
293 | /* This case is deliberately left in (if you have doubts, check the | ||
294 | generated assembly code). */ | ||
295 | break; | ||
296 | case 1: | ||
297 | __asm_copy_from_user_1 (dst, src, retn); | ||
298 | break; | ||
299 | case 2: | ||
300 | __asm_copy_from_user_2 (dst, src, retn); | ||
301 | break; | ||
302 | case 3: | ||
303 | __asm_copy_from_user_3 (dst, src, retn); | ||
304 | break; | ||
305 | } | ||
306 | |||
307 | /* If we get here, retn correctly reflects the number of failing | ||
308 | bytes. */ | ||
309 | return retn; | ||
310 | |||
311 | copy_exception_bytes: | ||
312 | /* We already have "retn" bytes cleared, and need to clear the | ||
313 | remaining "n" bytes. A simple, non-optimized byte-for-byte in-line | ||
314 | memset is preferred here, since this isn't speed-critical code and | ||
315 | we'd rather keep this a leaf function than call memset. */ | ||
316 | { | ||
317 | char *endp; | ||
318 | for (endp = dst + n; dst < endp; dst++) | ||
319 | *dst = 0; | ||
320 | } | ||
321 | |||
322 | return retn + n; | ||
323 | } | ||
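/* Semantics sketch, not part of this file: the zeroed tail plus the exact
   return value are what make the idiom quoted in the fixup comment above
   work.  The variable names (page, data, size) are taken from that quote;
   copy_from_user() is assumed to land in __copy_user_zeroing() after its
   access_ok check.  */
#if 0	/* sketch only, never compiled */
	unsigned long got;

	/* If 'data' becomes inaccessible part-way through, the tail of
	   'page' is zeroed and the return value counts the missing bytes,
	   so 'got' is exactly the number of bytes that really arrived and
	   the whole buffer is still fully initialized.  */
	got = size - copy_from_user ((void *) page, data, size);
#endif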
324 | |||
325 | /* Zero userspace. */ | ||
326 | |||
327 | unsigned long | ||
328 | __do_clear_user (void __user *pto, unsigned long pn) | ||
329 | { | ||
330 | /* We want the parameters put in special registers. | ||
331 | Make sure the compiler is able to make something useful of this. | ||
332 | As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). | ||
333 | |||
334 | FIXME: Comment for old gcc version. Check. | ||
335 | If gcc were all right, it really would need no temporaries, and no | ||
336 | stack space to save stuff on. */ | ||
337 | |||
338 | register char *dst __asm__ ("r13") = pto; | ||
339 | register int n __asm__ ("r12") = pn; | ||
340 | register int retn __asm__ ("r10") = 0; | ||
341 | |||
342 | |||
343 | if (((unsigned long) dst & 3) != 0 | ||
344 | /* Don't align if we wouldn't copy more than a few bytes. */ | ||
345 | && n >= 3) | ||
346 | { | ||
347 | if ((unsigned long) dst & 1) | ||
348 | { | ||
349 | __asm_clear_1 (dst, retn); | ||
350 | n--; | ||
351 | } | ||
352 | |||
353 | if ((unsigned long) dst & 2) | ||
354 | { | ||
355 | __asm_clear_2 (dst, retn); | ||
356 | n -= 2; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | /* Decide which copying method to use. | ||
361 | FIXME: This number is from the "ordinary" kernel memset. */ | ||
362 | if (n >= 48) | ||
363 | { | ||
364 | /* For large clears we use 'movem' */ | ||
365 | |||
366 | /* It is not optimal to tell the compiler about clobbering any | ||
367 | call-saved registers; that will move the saving/restoring of | ||
368 | those registers to the function prologue/epilogue, and make | ||
369 | non-movem sizes suboptimal. | ||
370 | |||
371 | This method is not foolproof; it assumes that the "asm reg" | ||
372 | declarations at the beginning of the function really are used | ||
373 | here (beware: they may be moved to temporary registers). | ||
374 | This way, we do not have to save/move the registers around into | ||
375 | temporaries; we can safely use them straight away. | ||
376 | |||
377 | If you want to check that the allocation was right, then | ||
378 | check the equalities in the first comment. It should say | ||
379 | something like "r13=r13, r11=r11, r12=r12". */ | ||
380 | __asm__ volatile ("\ | ||
381 | .ifnc %0%1%2,$r13$r12$r10 \n\ | ||
382 | .err \n\ | ||
383 | .endif \n\ | ||
384 | \n\ | ||
385 | ;; Save the registers we'll clobber in the movem process \n\ | ||
386 | ;; on the stack. Don't mention them to gcc, it will only be \n\ | ||
387 | ;; upset. \n\ | ||
388 | subq 11*4,$sp \n\ | ||
389 | movem $r10,[$sp] \n\ | ||
390 | \n\ | ||
391 | clear.d $r0 \n\ | ||
392 | clear.d $r1 \n\ | ||
393 | clear.d $r2 \n\ | ||
394 | clear.d $r3 \n\ | ||
395 | clear.d $r4 \n\ | ||
396 | clear.d $r5 \n\ | ||
397 | clear.d $r6 \n\ | ||
398 | clear.d $r7 \n\ | ||
399 | clear.d $r8 \n\ | ||
400 | clear.d $r9 \n\ | ||
401 | clear.d $r10 \n\ | ||
402 | clear.d $r11 \n\ | ||
403 | \n\ | ||
404 | ;; Now we've got this: \n\ | ||
405 | ;; r13 - dst \n\ | ||
406 | ;; r12 - n \n\ | ||
407 | \n\ | ||
408 | ;; Update n for the first loop \n\ | ||
409 | subq 12*4,$r12 \n\ | ||
410 | 0: \n\ | ||
411 | subq 12*4,$r12 \n\ | ||
412 | 1: \n\ | ||
413 | bge 0b \n\ | ||
414 | movem $r11,[$r13+] \n\ | ||
415 | \n\ | ||
416 | addq 12*4,$r12 ;; compensate for last loop underflowing n \n\ | ||
417 | \n\ | ||
418 | ;; Restore registers from stack \n\ | ||
419 | movem [$sp+],$r10 \n\ | ||
420 | 2: \n\ | ||
421 | .section .fixup,\"ax\" \n\ | ||
422 | 3: \n\ | ||
423 | movem [$sp],$r10 \n\ | ||
424 | addq 12*4,$r10 \n\ | ||
425 | addq 12*4,$r13 \n\ | ||
426 | movem $r10,[$sp] \n\ | ||
427 | jump 0b \n\ | ||
428 | clear.d $r10 \n\ | ||
429 | \n\ | ||
430 | .previous \n\ | ||
431 | .section __ex_table,\"a\" \n\ | ||
432 | .dword 1b,3b \n\ | ||
433 | .previous" | ||
434 | |||
435 | /* Outputs */ : "=r" (dst), "=r" (n), "=r" (retn) | ||
436 | /* Inputs */ : "0" (dst), "1" (n), "2" (retn) | ||
437 | /* Clobber */ : "r11"); | ||
438 | } | ||
439 | |||
440 | while (n >= 16) | ||
441 | { | ||
442 | __asm_clear_16 (dst, retn); | ||
443 | n -= 16; | ||
444 | } | ||
445 | |||
446 | /* Having a separate by-four loop cuts down on cache footprint. | ||
447 | FIXME: Test with and without; increase the switch to cover 0..15. */ | ||
448 | while (n >= 4) | ||
449 | { | ||
450 | __asm_clear_4 (dst, retn); | ||
451 | n -= 4; | ||
452 | } | ||
453 | |||
454 | switch (n) | ||
455 | { | ||
456 | case 0: | ||
457 | break; | ||
458 | case 1: | ||
459 | __asm_clear_1 (dst, retn); | ||
460 | break; | ||
461 | case 2: | ||
462 | __asm_clear_2 (dst, retn); | ||
463 | break; | ||
464 | case 3: | ||
465 | __asm_clear_3 (dst, retn); | ||
466 | break; | ||
467 | } | ||
468 | |||
469 | return retn; | ||
470 | } | ||
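/* Usage sketch, not part of this file: as with the copy routines, callers
   reach __do_clear_user() through a clear_user() wrapper (assumed here;
   the real one lives in asm-cris/uaccess.h) that range-checks the pointer
   first.  */
#if 0	/* sketch only, never compiled */
static inline unsigned long
clear_user (void __user *to, unsigned long n)
{
	if (access_ok (VERIFY_WRITE, to, n))
		return __do_clear_user (to, n);
	return n;		/* Nothing cleared; report every byte as failed.  */
}
#endif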