15 files changed, 5788 insertions, 0 deletions
diff --git a/arch/m68k/math-emu/Makefile b/arch/m68k/math-emu/Makefile
new file mode 100644
index 000000000000..539940401814
--- /dev/null
+++ b/arch/m68k/math-emu/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the m68k floating-point emulator (arch/m68k/math-emu).
+#
+# Extra flag handed to the assembler-source builds.
+EXTRA_AFLAGS := -traditional
+# Uncomment the two lines below to define FPU_EMU_DEBUG for the
+# assembler and C sources (fp_cond.S, for one, tests this symbol).
+#EXTRA_AFLAGS += -DFPU_EMU_DEBUG
+#EXTRA_CFLAGS += -DFPU_EMU_DEBUG
+# Objects that make up the emulator.
+obj-y           := fp_entry.o fp_scan.o fp_util.o fp_move.o fp_movem.o \
+                        fp_cond.o fp_arith.o fp_log.o fp_trig.o
diff --git a/arch/m68k/math-emu/fp_arith.c b/arch/m68k/math-emu/fp_arith.c
new file mode 100644
index 000000000000..08f286db3c5a
--- /dev/null
+++ b/arch/m68k/math-emu/fp_arith.c
@@ -0,0 +1,701 @@
+/*
+   fp_arith.c: floating-point math routines for the Linux-m68k
+   floating point emulator.
+   Copyright (c) 1998-1999 David Huggins-Daines.
+   Somewhat based on the AlphaLinux floating point emulator, by David
+   Mosberger-Tang.
+   You may copy, modify, and redistribute this file under the terms of
+   the GNU General Public License, version 2, or any later version, at
+   your convenience.
+ */
+#include "fp_emu.h"
+#include "multi_arith.h"
+#include "fp_arith.h"
+/* NaN template: exponent 0x7fff with every mantissa bit set; the
+   members not named here are implicitly zero. */
+const struct fp_ext fp_QNaN =
+{
+        .exp = 0x7fff,
+        .mant = { .m64 = ~0 }
+};
+/* Infinity template: exponent 0x7fff; all other members, the mantissa
+   included, are implicitly zero. */
+const struct fp_ext fp_Inf =
+{
+        .exp = 0x7fff,
+};
+/* let's start with the easy ones */
+/* fp_fabs: absolute value.
+   fp_monadic_check() is defined outside this file; presumably it
+   loads src into dest and may leave the function early for special
+   operands (TODO confirm in fp_emu.h).  Afterwards the sign of dest
+   is cleared and dest is returned. */
+struct fp_ext *
+fp_fabs(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fabs\n");
+        fp_monadic_check(dest, src);
+        dest->sign = 0;
+        return dest;
+}
+/* fp_fneg: negation.
+   Same shape as fp_fabs(): after fp_monadic_check() (defined outside
+   this file; presumably it loads src into dest - TODO confirm) the
+   sign of dest is logically inverted and dest is returned. */
+struct fp_ext *
+fp_fneg(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fneg\n");
+        fp_monadic_check(dest, src);
+        dest->sign = !dest->sign;
+        return dest;
+}
+/* Now, the slightly harder ones */
+/* fp_fadd: common addition kernel, dest = dest + src.
+   Used directly for the add instructions and, through fp_fsub() and
+   fp_fcmp() which invert the sign of src first, for subtraction and
+   comparison as well.  Both operands may be modified.  Returns dest. */
+struct fp_ext *
+fp_fadd(struct fp_ext *dest, struct fp_ext *src)
+{
+        int diff;
+        dprint(PINSTR, "fadd\n");
+        fp_dyadic_check(dest, src);
+        /* an infinite destination is kept, except that
+           infinity - infinity == NaN */
+        if (IS_INF(dest)) {
+                if (IS_INF(src) && (dest->sign != src->sign))
+                        fp_set_nan(dest);
+                return dest;
+        }
+        /* finite + infinity == that infinity */
+        if (IS_INF(src)) {
+                fp_copy_ext(dest, src);
+                return dest;
+        }
+        if (IS_ZERO(dest)) {
+                if (IS_ZERO(src)) {
+                        /* zeroes of opposite sign: the sum is -0 only
+                           when rounding towards minus infinity */
+                        if (dest->sign != src->sign)
+                                dest->sign = (FPDATA->rnd == FPCR_ROUND_RM) ? 1 : 0;
+                        return dest;
+                }
+                /* 0 + x == x */
+                fp_copy_ext(dest, src);
+                return dest;
+        }
+        src->lowmant = 0;
+        dest->lowmant = 0;
+        /* bring the operand with the smaller exponent up to the
+           exponent of the other one */
+        diff = dest->exp - src->exp;
+        if (diff > 0)
+                fp_denormalize(src, diff);
+        else if (diff < 0)
+                fp_denormalize(dest, -diff);
+        if (dest->sign != src->sign) {
+                /* opposite signs: subtract the smaller mantissa from
+                   the larger one; the sign flips when src was larger */
+                if (dest->mant.m64 < src->mant.m64) {
+                        fp_submant(dest, src, dest);
+                        dest->sign = !dest->sign;
+                } else {
+                        fp_submant(dest, dest, src);
+                }
+                return dest;
+        }
+        /* equal signs: add the mantissas and let fp_addcarry() deal
+           with a carry out; dest is returned whatever it reports */
+        if (fp_addmant(dest, src))
+                (void)fp_addcarry(dest);
+        return dest;
+}
+/* fp_fsub: Implements the kernel of the FSUB, FSSUB, and FDSUB
+   instructions.
+   Remember that the arguments are in assembler-syntax order!
+   The subtraction is done as dest + (-src): the sign of src is
+   inverted in place (so the caller's src is modified) and the rest is
+   left to fp_fadd().  The trace string carries no newline; the trace
+   printed by fp_fadd() ends the line. */
+struct fp_ext *
+fp_fsub(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fsub ");
+        src->sign = !src->sign;
+        return fp_fadd(dest, src);
+}
+/* fp_fcmp: comparison, computed as dest - src without touching dest.
+   dest is copied into the scratch slot FPDATA->temp[1], the sign of
+   src is inverted in place, and fp_fadd() adds the two.  The returned
+   pointer refers to the scratch slot holding the difference. */
+struct fp_ext *
+fp_fcmp(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fcmp ");
+        FPDATA->temp[1] = *dest;
+        src->sign = !src->sign;
+        return fp_fadd(&FPDATA->temp[1], src);
+}
+/* fp_ftst: test an operand.
+   No arithmetic is done: dest is deliberately unused and src is
+   returned unchanged (presumably the caller derives the condition
+   codes from the returned value - confirm against the dispatcher). */
+struct fp_ext *
+fp_ftst(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "ftst\n");
+        (void)dest;
+        return src;
+}
+/* fp_fmul: multiplication kernel, dest = dest * src.
+   Arguments are in assembler-syntax order.  Both operands may be
+   modified (a mantissa whose top bit is clear is left-justified in
+   place).  Returns dest.
+   Special cases: infinity * zero gives NaN; infinity * anything else
+   gives infinity and zero * anything finite gives zero, both carrying
+   the XOR of the operand signs. */
+struct fp_ext *
+fp_fmul(struct fp_ext *dest, struct fp_ext *src)
+{
+        union fp_mant128 temp;
+        int exp, sign;
+        dprint(PINSTR, "fmul\n");
+        fp_dyadic_check(dest, src);
+        /* calculate the correct sign now, as it's necessary for infinities */
+        dest->sign = src->sign ^ dest->sign;
+        /* Handle infinities */
+        if (IS_INF(dest)) {
+                if (IS_ZERO(src))
+                        fp_set_nan(dest);
+                return dest;
+        }
+        if (IS_INF(src)) {
+                if (IS_ZERO(dest))
+                        fp_set_nan(dest);
+                else {
+                        /* fp_copy_ext() brings the sign of src along,
+                           which would discard the product sign computed
+                           above, so put that sign back afterwards */
+                        sign = dest->sign;
+                        fp_copy_ext(dest, src);
+                        dest->sign = sign;
+                }
+                return dest;
+        }
+        /* Of course, as we all know, zero * anything = zero.  You may
+           not have known that it might be a positive or negative
+           zero... */
+        if (IS_ZERO(dest) || IS_ZERO(src)) {
+                dest->exp = 0;
+                dest->mant.m64 = 0;
+                dest->lowmant = 0;
+                return dest;
+        }
+        exp = dest->exp + src->exp - 0x3ffe;
+        /* shift up the mantissa for denormalized numbers,
+           so that the highest bit is set, this makes the
+           shift of the result below easier */
+        if ((long)dest->mant.m32[0] >= 0)
+                exp -= fp_overnormalize(dest);
+        if ((long)src->mant.m32[0] >= 0)
+                exp -= fp_overnormalize(src);
+        /* now, do a 64-bit multiply with expansion */
+        fp_multiplymant(&temp, dest, src);
+        /* normalize it back to 64 bits and stuff it back into the
+           destination struct; when the top bit of the product is clear
+           it is taken one bit further down and the exponent drops */
+        if ((long)temp.m32[0] > 0) {
+                exp--;
+                fp_putmant128(dest, &temp, 1);
+        } else
+                fp_putmant128(dest, &temp, 0);
+        if (exp >= 0x7fff) {
+                fp_set_ovrflw(dest);
+                return dest;
+        }
+        dest->exp = exp;
+        if (exp < 0) {
+                /* result too small: flag underflow and denormalize */
+                fp_set_sr(FPSR_EXC_UNFL);
+                fp_denormalize(dest, -exp);
+        }
+        return dest;
+}
+/* fp_fdiv: division kernel (fp_fsgldiv() below is a separate
+   single-precision variant).
+   Note that the order of the operands is counter-intuitive: instead
+   of src / dest, the result is actually dest / src.
+   Both operands may be modified.  Returns dest.
+   Special cases: infinity / infinity and zero / zero give NaN;
+   x / infinity gives zero; x / zero sets FPSR_EXC_DZ and gives
+   infinity.  The non-NaN results carry the XOR of the operand signs. */
+struct fp_ext *
+fp_fdiv(struct fp_ext *dest, struct fp_ext *src)
+{
+        union fp_mant128 temp;
+        int exp;
+        dprint(PINSTR, "fdiv\n");
+        fp_dyadic_check(dest, src);
+        /* calculate the correct sign now, as it's necessary for infinities */
+        dest->sign = src->sign ^ dest->sign;
+        /* Handle infinities */
+        if (IS_INF(dest)) {
+                /* infinity / infinity = NaN (quiet, as always) */
+                if (IS_INF(src))
+                        fp_set_nan(dest);
+                /* infinity / anything else = infinity (with appropriate sign) */
+                return dest;
+        }
+        if (IS_INF(src)) {
+                /* anything / infinity = zero (with appropriate sign) */
+                dest->exp = 0;
+                dest->mant.m64 = 0;
+                dest->lowmant = 0;
+                return dest;
+        }
+        /* zeroes */
+        if (IS_ZERO(dest)) {
+                /* zero / zero = NaN */
+                if (IS_ZERO(src))
+                        fp_set_nan(dest);
+                /* zero / anything else = zero */
+                return dest;
+        }
+        if (IS_ZERO(src)) {
+                /* anything / zero = infinity (with appropriate sign) */
+                fp_set_sr(FPSR_EXC_DZ);
+                dest->exp = 0x7fff;
+                dest->mant.m64 = 0;
+                return dest;
+        }
+        exp = dest->exp - src->exp + 0x3fff;
+        /* shift up the mantissa for denormalized numbers,
+           so that the highest bit is set, this makes lots
+           of things below easier.
+           Left-justifying the dividend lowers its effective exponent,
+           so the shift count comes off the result exponent;
+           left-justifying the divisor lowers the divisor's exponent,
+           which raises the quotient, so that count is added. */
+        if ((long)dest->mant.m32[0] >= 0)
+                exp -= fp_overnormalize(dest);
+        if ((long)src->mant.m32[0] >= 0)
+                exp += fp_overnormalize(src);
+        /* now, do the 64-bit divide */
+        fp_dividemant(&temp, dest, src);
+        /* normalize it back to 64 bits and stuff it back into the
+           destination struct */
+        if (!temp.m32[0]) {
+                exp--;
+                fp_putmant128(dest, &temp, 32);
+        } else
+                fp_putmant128(dest, &temp, 31);
+        if (exp >= 0x7fff) {
+                fp_set_ovrflw(dest);
+                return dest;
+        }
+        dest->exp = exp;
+        if (exp < 0) {
+                /* result too small: flag underflow and denormalize */
+                fp_set_sr(FPSR_EXC_UNFL);
+                fp_denormalize(dest, -exp);
+        }
+        return dest;
+}
+/* fp_fsglmul: single-precision multiplication kernel,
+   dest = dest * src.
+   Special cases are handled exactly as in fp_fmul().  For the
+   multiplication itself only the upper mantissa word of each operand
+   is used, with its low 8 bits masked off; the 64-bit product is
+   written back into the mantissa of dest.  Returns dest. */
+struct fp_ext *
+fp_fsglmul(struct fp_ext *dest, struct fp_ext *src)
+{
+        int exp, sign;
+        dprint(PINSTR, "fsglmul\n");
+        fp_dyadic_check(dest, src);
+        /* calculate the correct sign now, as it's necessary for infinities */
+        dest->sign = src->sign ^ dest->sign;
+        /* Handle infinities */
+        if (IS_INF(dest)) {
+                if (IS_ZERO(src))
+                        fp_set_nan(dest);
+                return dest;
+        }
+        if (IS_INF(src)) {
+                if (IS_ZERO(dest))
+                        fp_set_nan(dest);
+                else {
+                        /* fp_copy_ext() brings the sign of src along,
+                           which would discard the product sign computed
+                           above, so put that sign back afterwards */
+                        sign = dest->sign;
+                        fp_copy_ext(dest, src);
+                        dest->sign = sign;
+                }
+                return dest;
+        }
+        /* Of course, as we all know, zero * anything = zero.  You may
+           not have known that it might be a positive or negative
+           zero... */
+        if (IS_ZERO(dest) || IS_ZERO(src)) {
+                dest->exp = 0;
+                dest->mant.m64 = 0;
+                dest->lowmant = 0;
+                return dest;
+        }
+        exp = dest->exp + src->exp - 0x3ffe;
+        /* do a 32-bit multiply */
+        fp_mul64(dest->mant.m32[0], dest->mant.m32[1],
+                 dest->mant.m32[0] & 0xffffff00,
+                 src->mant.m32[0] & 0xffffff00);
+        if (exp >= 0x7fff) {
+                fp_set_ovrflw(dest);
+                return dest;
+        }
+        dest->exp = exp;
+        if (exp < 0) {
+                /* result too small: flag underflow and denormalize */
+                fp_set_sr(FPSR_EXC_UNFL);
+                fp_denormalize(dest, -exp);
+        }
+        return dest;
+}
+/* fp_fsgldiv: single-precision division kernel, dest = dest / src.
+   Special cases are handled exactly as in fp_fdiv().  For the
+   division itself the upper mantissa word of each operand is masked
+   to its top 24 bits (in place, so src is modified too) and a 32-bit
+   quotient is produced; the remainder is kept in the lower mantissa
+   word purely as rounding information.  Returns dest. */
+struct fp_ext *
+fp_fsgldiv(struct fp_ext *dest, struct fp_ext *src)
+{
+        int exp;
+        unsigned long quot, rem;
+        dprint(PINSTR, "fsgldiv\n");
+        fp_dyadic_check(dest, src);
+        /* calculate the correct sign now, as it's necessary for infinities */
+        dest->sign = src->sign ^ dest->sign;
+        /* Handle infinities */
+        if (IS_INF(dest)) {
+                /* infinity / infinity = NaN (quiet, as always) */
+                if (IS_INF(src))
+                        fp_set_nan(dest);
+                /* infinity / anything else = infinity (with appropriate sign) */
+                return dest;
+        }
+        if (IS_INF(src)) {
+                /* anything / infinity = zero (with appropriate sign) */
+                dest->exp = 0;
+                dest->mant.m64 = 0;
+                dest->lowmant = 0;
+                return dest;
+        }
+        /* zeroes */
+        if (IS_ZERO(dest)) {
+                /* zero / zero = NaN */
+                if (IS_ZERO(src))
+                        fp_set_nan(dest);
+                /* zero / anything else = zero */
+                return dest;
+        }
+        if (IS_ZERO(src)) {
+                /* anything / zero = infinity (with appropriate sign) */
+                fp_set_sr(FPSR_EXC_DZ);
+                dest->exp = 0x7fff;
+                dest->mant.m64 = 0;
+                return dest;
+        }
+        exp = dest->exp - src->exp + 0x3fff;
+        /* keep only the top 24 mantissa bits of either operand */
+        dest->mant.m32[0] &= 0xffffff00;
+        src->mant.m32[0] &= 0xffffff00;
+        /* do the 32-bit divide */
+        if (dest->mant.m32[0] >= src->mant.m32[0]) {
+                /* dividend mantissa not smaller than the divisor: take
+                   the divisor off once first, then put the leading one
+                   bit in front of the quotient by hand */
+                fp_sub64(dest->mant, src->mant);
+                fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]);
+                dest->mant.m32[0] = 0x80000000 | (quot >> 1);
+                dest->mant.m32[1] = (quot & 1) | rem;   /* only for rounding */
+        } else {
+                /* dividend mantissa smaller: the quotient is used as it
+                   comes out and the exponent drops by one */
+                fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]);
+                dest->mant.m32[0] = quot;
+                dest->mant.m32[1] = rem;                /* only for rounding */
+                exp--;
+        }
+        if (exp >= 0x7fff) {
+                fp_set_ovrflw(dest);
+                return dest;
+        }
+        dest->exp = exp;
+        if (exp < 0) {
+                /* result too small: flag underflow and denormalize */
+                fp_set_sr(FPSR_EXC_UNFL);
+                fp_denormalize(dest, -exp);
+        }
+        return dest;
+}
+/* fp_roundint: Internal rounding function for use by several of these
+   emulated instructions.
+   This one rounds off the fractional part using the rounding mode
+   specified.
+   dest is rounded to an integral value in place; mode is one of the
+   FPCR_ROUND_* values.  FPSR_EXC_INEX2 is set whenever fraction bits
+   are discarded.  Zeroes, infinities, operands rejected by
+   fp_normalize_ext() and values without fraction bits are left
+   untouched. */
+static void fp_roundint(struct fp_ext *dest, int mode)
+{
+        union fp_mant64 oldmant;
+        unsigned long mask;
+        if (!fp_normalize_ext(dest))
+                return;
+        /* infinities and zeroes */
+        if (IS_INF(dest) || IS_ZERO(dest))
+                return;
+        /* first truncate the lower bits */
+        oldmant = dest->mant;
+        switch (dest->exp) {
+        case 0 ... 0x3ffe:
+                /* magnitude below 1.0: every mantissa bit is fraction */
+                dest->mant.m64 = 0;
+                break;
+        case 0x3fff ... 0x401e:
+                /* the integer part ends inside the upper word */
+                dest->mant.m32[0] &= 0xffffffffU << (0x401e - dest->exp);
+                dest->mant.m32[1] = 0;
+                if (oldmant.m64 == dest->mant.m64)
+                        return;
+                break;
+        case 0x401f ... 0x403e:
+                /* the integer part ends inside the lower word */
+                dest->mant.m32[1] &= 0xffffffffU << (0x403e - dest->exp);
+                if (oldmant.m32[1] == dest->mant.m32[1])
+                        return;
+                break;
+        default:
+                /* no fraction bits left at this magnitude */
+                return;
+        }
+        fp_set_sr(FPSR_EXC_INEX2);
+        /* We might want to normalize upwards here... however, since
+           we know that this is only called on the output of fp_fdiv,
+           or with the input to fp_fint or fp_fintrz, and the inputs
+           to all these functions are either normal or denormalized
+           (no subnormals allowed!), there's really no need.
+           In the case of fp_fdiv, observe that 0x80000000 / 0xffff =
+           0xffff8000, and the same holds for 128-bit / 64-bit. (i.e. the
+           smallest possible normal dividend and the largest possible normal
+           divisor will still produce a normal quotient, therefore, (normal
+           << 64) / normal is normal in all cases) */
+        /* Decide whether the truncated value has to be bumped by one:
+           "break" out of this switch means round up, "return" means
+           keep the truncated value. */
+        switch (mode) {
+        case FPCR_ROUND_RN:
+                /* Round to nearest, ties to even.  In each range below:
+                   keep if the 0.5 bit is clear, round up if the lowest
+                   integer bit is odd, and otherwise round up only when
+                   some bit below the 0.5 bit is set. */
+                switch (dest->exp) {
+                case 0 ... 0x3ffd:
+                        return;
+                case 0x3ffe:
+                        /* As noted above, the input is always normal, so the
+                           guard bit (bit 63) is always set.  therefore, the
+                           only case in which we will NOT round to 1.0 is when
+                           the input is exactly 0.5. */
+                        if (oldmant.m64 == (1ULL << 63))
+                                return;
+                        break;
+                case 0x3fff ... 0x401d:
+                        /* 0.5 bit and lowest integer bit both live in
+                           the upper word */
+                        mask = 1UL << (0x401d - dest->exp);
+                        if (!(oldmant.m32[0] & mask))
+                                return;
+                        if (oldmant.m32[0] & (mask << 1))
+                                break;
+                        /* bits below the 0.5 bit, selected with a mask
+                           so that no shift count reaches the word width */
+                        if (!(oldmant.m32[0] & (mask - 1)) &&
+                                        !oldmant.m32[1])
+                                return;
+                        break;
+                case 0x401e:
+                        /* the 0.5 bit is the top bit of the lower word;
+                           m32[] is unsigned, so test the bit itself
+                           instead of comparing against zero */
+                        if (!(oldmant.m32[1] & 0x80000000UL))
+                                return;
+                        if (oldmant.m32[0] & 1)
+                                break;
+                        if (!(oldmant.m32[1] << 1))
+                                return;
+                        break;
+                case 0x401f ... 0x403d:
+                        /* 0.5 bit and lowest integer bit both live in
+                           the lower word */
+                        mask = 1UL << (0x403d - dest->exp);
+                        if (!(oldmant.m32[1] & mask))
+                                return;
+                        if (oldmant.m32[1] & (mask << 1))
+                                break;
+                        if (!(oldmant.m32[1] & (mask - 1)))
+                                return;
+                        break;
+                default:
+                        return;
+                }
+                break;
+        case FPCR_ROUND_RZ:
+                /* truncation is already the final answer */
+                return;
+        default:
+                /* directed rounding: bump only when the expression
+                   below is non-zero for this sign and mode */
+                if (dest->sign ^ (mode - FPCR_ROUND_RM))
+                        break;
+                return;
+        }
+        /* add one unit in the last integer place, renormalizing when
+           the mantissa wraps around */
+        switch (dest->exp) {
+        case 0 ... 0x3ffe:
+                /* anything below 1.0 rounds up to exactly 1.0 */
+                dest->exp = 0x3fff;
+                dest->mant.m64 = 1ULL << 63;
+                break;
+        case 0x3fff ... 0x401e:
+                /* unsigned constant: the shift count can reach 31 */
+                mask = 1UL << (0x401e - dest->exp);
+                if (dest->mant.m32[0] += mask)
+                        break;
+                dest->mant.m32[0] = 0x80000000;
+                dest->exp++;
+                break;
+        case 0x401f ... 0x403e:
+                mask = 1UL << (0x403e - dest->exp);
+                if (dest->mant.m32[1] += mask)
+                        break;
+                if (dest->mant.m32[0] += 1)
+                        break;
+                dest->mant.m32[0] = 0x80000000;
+                dest->exp++;
+                break;
+        }
+}
+/* modrem_kernel: Implementation of the FREM and FMOD instructions
+   (which are exactly the same, except for the rounding used on the
+   intermediate value).
+   Computes dest = dest - src * roundint(dest / src), where mode is the
+   rounding mode handed to fp_roundint() for the quotient.  Returns
+   dest.  An infinite dest or a zero src gives NaN; a zero dest or an
+   infinite src leaves dest as it is.
+   NOTE(review): the quotient byte is assembled from the low mantissa
+   bits and the sign of dest after dest already holds the remainder;
+   check this against the FPSR quotient-byte definition. */
+static struct fp_ext *
+modrem_kernel(struct fp_ext *dest, struct fp_ext *src, int mode)
+{
+        struct fp_ext tmp;
+        fp_dyadic_check(dest, src);
+        /* Infinities and zeros */
+        if (IS_INF(dest) || IS_ZERO(src)) {
+                fp_set_nan(dest);
+                return dest;
+        }
+        if (IS_ZERO(dest) || IS_INF(src))
+                return dest;
+        /* FIXME: there is almost certainly a smarter way to do this */
+        /* tmp = round(dest / src) * src, then dest -= tmp */
+        fp_copy_ext(&tmp, dest);
+        fp_fdiv(&tmp, src);             /* NOTE: src might be modified */
+        fp_roundint(&tmp, mode);
+        fp_fmul(&tmp, src);
+        fp_fsub(dest, &tmp);
+        /* set the quotient byte */
+        fp_set_quotient((dest->mant.m64 & 0x7f) | (dest->sign << 7));
+        return dest;
+}
+/* fp_fmod: Implements the kernel of the FMOD instruction.
+   Again, the argument order is backwards.  The result is:
+   fmod(src,dest) = (dest - (src * trunc(dest / src)))
+   The quotient is rounded with FPCR_ROUND_RZ, i.e. its fraction is
+   simply cut off (towards zero). */
+struct fp_ext *
+fp_fmod(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fmod\n");
+        return modrem_kernel(dest, src, FPCR_ROUND_RZ);
+}
+/* fp_frem: Implements the kernel of the FREM instruction.
+   frem(src,dest) = (dest - (src * round(dest / src)))
+   The quotient is rounded with FPCR_ROUND_RN (round to nearest).
+ */
+struct fp_ext *
+fp_frem(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "frem\n");
+        return modrem_kernel(dest, src, FPCR_ROUND_RN);
+}
+/* fp_fint: round src to an integral value using the rounding mode
+   currently held in FPDATA->rnd.  src is copied into dest first and
+   is not modified; dest is returned. */
+struct fp_ext *
+fp_fint(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fint\n");
+        fp_copy_ext(dest, src);
+        fp_roundint(dest, FPDATA->rnd);
+        return dest;
+}
+/* fp_fintrz: like fp_fint(), but always rounds with FPCR_ROUND_RZ
+   (fraction cut off) regardless of the mode in FPDATA->rnd. */
+struct fp_ext *
+fp_fintrz(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fintrz\n");
+        fp_copy_ext(dest, src);
+        fp_roundint(dest, FPCR_ROUND_RZ);
+        return dest;
+}
+/* fp_fscale: scale dest by a power of two.  src is converted to an
+   integer (fraction cut off) and added to the exponent of dest.
+   Returns dest.
+   An infinite src gives NaN; an infinite dest, a zero src or a zero
+   dest leave dest unchanged.
+   NOTE(review): a src exponent of 0x400c or more is reported as
+   overflow whatever the sign of src; a large negative scale factor
+   should probably underflow instead - confirm against the manual. */
+struct fp_ext *
+fp_fscale(struct fp_ext *dest, struct fp_ext *src)
+{
+        int scale, oldround;
+        dprint(PINSTR, "fscale\n");
+        fp_dyadic_check(dest, src);
+        /* Infinities */
+        if (IS_INF(src)) {
+                fp_set_nan(dest);
+                return dest;
+        }
+        if (IS_INF(dest))
+                return dest;
+        /* zeroes */
+        if (IS_ZERO(src) || IS_ZERO(dest))
+                return dest;
+        /* Source exponent out of range */
+        if (src->exp >= 0x400c) {
+                fp_set_ovrflw(dest);
+                return dest;
+        }
+        /* src must be rounded with round to zero. */
+        oldround = FPDATA->rnd;
+        FPDATA->rnd = FPCR_ROUND_RZ;
+        scale = fp_conv_ext2long(src);
+        FPDATA->rnd = oldround;
+        /* new exponent */
+        scale += dest->exp;
+        if (scale >= 0x7fff) {
+                fp_set_ovrflw(dest);
+        } else if (scale <= 0) {
+                fp_set_sr(FPSR_EXC_UNFL);
+                /* Store the scaled exponent before denormalizing, as
+                   the multiply and divide kernels do: fp_denormalize()
+                   shifts the mantissa down and moves the exponent up
+                   by the same count, so starting from the old exponent
+                   would hand back the unscaled value. */
+                dest->exp = scale;
+                fp_denormalize(dest, -scale);
+        } else
+                dest->exp = scale;
+        return dest;
+}
diff --git a/arch/m68k/math-emu/fp_arith.h b/arch/m68k/math-emu/fp_arith.h
new file mode 100644
index 000000000000..2cc3f846c393
--- /dev/null
+++ b/arch/m68k/math-emu/fp_arith.h
@@ -0,0 +1,52 @@
+/*
+   fp_arith.h: floating-point math routines for the Linux-m68k
+   floating point emulator.
+   Copyright (c) 1998 David Huggins-Daines.
+   Somewhat based on the AlphaLinux floating point emulator, by David
+   Mosberger-Tang.
+   You may copy, modify, and redistribute this file under the terms of
+   the GNU General Public License, version 2, or any later version, at
+   your convenience.
+ */
+#ifndef FP_ARITH_H
+#define FP_ARITH_H
+/* Arithmetic kernels implemented in fp_arith.c.  Every routine takes
+   its operands in assembler-syntax order (dest first, then src) and
+   returns a pointer to the value holding the result. */
+/* easy ones */
+struct fp_ext *
+fp_fabs(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fneg(struct fp_ext *dest, struct fp_ext *src);
+/* straightforward arithmetic */
+struct fp_ext *
+fp_fadd(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fsub(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fcmp(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_ftst(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fmul(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fdiv(struct fp_ext *dest, struct fp_ext *src);
+/* single-precision multiply and divide */
+struct fp_ext *
+fp_fsglmul(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fsgldiv(struct fp_ext *dest, struct fp_ext *src);
+/* ones that do rounding and integer conversions */
+struct fp_ext *
+fp_fmod(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_frem(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fint(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fintrz(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fscale(struct fp_ext *dest, struct fp_ext *src);
+#endif  /* FP_ARITH_H */
diff --git a/arch/m68k/math-emu/fp_cond.S b/arch/m68k/math-emu/fp_cond.S
new file mode 100644
index 000000000000..ddae8b1b8b83
--- /dev/null
+++ b/arch/m68k/math-emu/fp_cond.S
@@ -0,0 +1,334 @@
+/*
+ * fp_cond.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "fp_emu.h"
+#include "fp_decode.h"
+| Entry points exported from this file.
+        .globl  fp_fscc, fp_fbccw, fp_fbccl
+| With FPU_EMU_DEBUG defined, fp_fnop is a real routine that prints a
+| trace line and then jumps to fp_end; without it, fp_fnop is only
+| another name for fp_end.
+#ifdef FPU_EMU_DEBUG
+fp_fnop:
+        printf  PDECODE,"fnop\n"
+        jra     fp_end
+#else
+#define fp_fnop fp_end
+#endif
+| fp_fbccw: conditional branch, word displacement.
+| %d2.w holds the displacement; a zero displacement is dispatched to
+| fp_fnop.  Otherwise the target pc - 2 + displacement is formed in
+| %a0 and control joins the common tail inside fp_fbccl.
+fp_fbccw:
+        tst.w   %d2
+        jeq     fp_fnop
+        printf  PDECODE,"fbccw "
+        fp_get_pc %a0
+        lea     (-2,%a0,%d2.w),%a0
+        jra     1f
+| fp_fbccl: conditional branch, long displacement.
+| The low word of %d2 is swapped into the upper half of %d0 and
+| fp_get_instr_word is asked for one more word (presumably it lands
+| in the low half of %d0, with fp_err_ua1 as the fault exit - confirm
+| against the macro).  The target is pc - 2 + %d0.
+fp_fbccl:
+        printf  PDECODE,"fbccl "
+        fp_get_pc %a0
+        move.l  %d2,%d0
+        swap    %d0
+        fp_get_instr_word %d0,fp_err_ua1
+        lea     (-2,%a0,%d0.l),%a0
+| Common tail: the upper word of %d2 is brought into the low half of
+| %d0 and handed to fp_compute_cond.  A non-zero result stores the
+| target in %a0 as the new pc; zero leaves the pc alone.
+1:      printf  PDECODE,"%x",1,%a0
+        move.l  %d2,%d0
+        swap    %d0
+        jsr     fp_compute_cond
+        tst.l   %d0
+        jeq     1f
+        fp_put_pc %a0,1
+1:      printf  PDECODE,"\n"
+        jra     fp_end
+| fp_fdbcc: decrement and branch on condition.
+| Reached through the second entry of the mode table in fp_fscc, so
+| the upper word of %d1 is expected to still hold the condition
+| result computed there (assumes the decode macros preserve it).
+|  - %a1 = pc plus the 16-bit word fetched by fp_get_instr_word
+|  - condition true (word non-zero): nothing else happens
+|  - condition false: the low word of the data register is
+|    decremented; the branch to %a1 is taken unless the subtraction
+|    borrowed, and the register is written back either way
+| fp_decode_addr_reg presumably leaves the register number in %d0.
+fp_fdbcc:
+        printf  PDECODE,"fdbcc "
+        fp_get_pc %a1                           | calculate new pc
+        fp_get_instr_word %d0,fp_err_ua1
+        add.w   %d0,%a1
+        fp_decode_addr_reg
+        printf  PDECODE,"d%d,%x\n",2,%d0,%a1
+        swap    %d1                             | test condition in %d1
+        tst.w   %d1
+        jne     2f
+        move.l  %d0,%d1
+        jsr     fp_get_data_reg
+        subq.w  #1,%d0
+        jcs     1f
+        fp_put_pc %a1,1
+1:      jsr     fp_put_data_reg
+2:      jra     fp_end
+| set flags for decode macros for fs<cc>
+do_fscc=1
+do_no_pc_mode=1
+| fp_fscc: set a byte according to a condition.
+| The condition selector in %d2 is evaluated by fp_compute_cond and
+| the result (0 or -1) is parked in the upper word of %d1.  The
+| addressing mode is then decoded; fp_decode_addr_mode presumably
+| dispatches through the eight-entry table that follows it.  The
+| second entry leads to fp_fdbcc instead of a store.
+fp_fscc:
+        printf  PDECODE,"fscc "
+        move.l  %d2,%d0
+        jsr     fp_compute_cond
+        move.w  %d0,%d1
+        swap    %d1
+        | decode addressing mode
+        fp_decode_addr_mode
+        .long   fp_data, fp_fdbcc
+        .long   fp_indirect, fp_postinc
+        .long   fp_predecr, fp_disp16
+        .long   fp_extmode0, fp_extmode1
+        | addressing mode: data register direct
+| The register number (left in %d0 by fp_mode_data_direct, as the
+| next instruction suggests) is saved in the low word of %d1, the
+| register is read, its low byte is replaced by the condition byte
+| kept in the upper word of %d1, and the register is written back.
+fp_data:
+        fp_mode_data_direct
+        move.w  %d0,%d1                 | save register nr
+        jsr     fp_get_data_reg
+        swap    %d1
+        move.b  %d1,%d0
+        swap    %d1
+        jsr     fp_put_data_reg
+        printf  PDECODE,"\n"
+        jra     fp_end
+| Memory addressing modes.  Each stub runs the matching decode macro
+| (presumably leaving the effective address in %a0, which is what
+| fp_do_scc stores through) and then jumps to the common store.
+fp_indirect:
+        fp_mode_addr_indirect
+        jra     fp_do_scc
+fp_postinc:
+        fp_mode_addr_indirect_postinc
+        jra     fp_do_scc
+fp_predecr:
+        fp_mode_addr_indirect_predec
+        jra     fp_do_scc
+fp_disp16:
+        fp_mode_addr_indirect_disp16
+        jra     fp_do_scc
+fp_extmode0:
+        fp_mode_addr_indirect_extmode0
+        jra     fp_do_scc
+| fp_extmode1: second-level dispatch.  The 3-bit field at bit-field
+| offset 13 of %d2 indexes the table below; only the first two
+| entries (absolute short and absolute long) are implemented, all
+| others end in fp_ill.
+fp_extmode1:
+        bfextu  %d2{#13,#3},%d0
+        jmp     ([0f:w,%pc,%d0*4])
+        .align  4
+0:
+        .long   fp_absolute_short, fp_absolute_long
+        .long   fp_ill, fp_ill          | NOTE: jump here to ftrap.x
+        .long   fp_ill, fp_ill
+        .long   fp_ill, fp_ill
+| Absolute addressing modes.  fp_absolute_long needs no jump because
+| it falls straight through into fp_do_scc.
+fp_absolute_short:
+        fp_mode_abs_short
+        jra     fp_do_scc
+fp_absolute_long:
+        fp_mode_abs_long
+|       jra     fp_do_scc
+| fp_do_scc: common store.  The swap brings the condition result back
+| into the low word of %d1; its low byte is written through %a0 with
+| putuser (presumably a user-space store with fp_err_ua1 as the fault
+| exit - confirm against the macro).
+fp_do_scc:
+        swap    %d1
+        putuser.b %d1,(%a0),fp_err_ua1,%a0
+        printf  PDECODE,"\n"
+        jra     fp_end
+| Bit tests on the FPSR copy held in %d1: bit 24 (NAN), bit 26 (Z)
+| and bit 27 (N).  After each test the zero flag is set when the
+| tested bit is clear.
+#define tst_NAN btst #24,%d1
+#define tst_Z   btst #26,%d1
+#define tst_N   btst #27,%d1
+| fp_compute_cond: evaluate a condition.
+| In:  %d0 = condition selector; the low four bits pick the
+|      predicate, bit 4 asks for the extra NAN handling below.
+| Out: %d0 = -1 when the condition holds, 0 otherwise.
+|      %d1 = the FPSR value that was tested.
+| When bit 4 of the selector and the NAN bit are both set, bits 15
+| and 7 of the FPSR are set and the FPSR is written back (presumably
+| the exception and accrued-exception bits for an unordered compare;
+| confirm against the FPSR layout).
+fp_compute_cond:
+        move.l  (FPD_FPSR,FPDATA),%d1
+        btst    #4,%d0
+        jeq     1f
+        tst_NAN
+        jeq     1f
+        bset    #15,%d1
+        bset    #7,%d1
+        move.l  %d1,(FPD_FPSR,FPDATA)
+1:      and.w   #0xf,%d0
+        jmp     ([0f:w,%pc,%d0.w*4])
+        .align  4
+0:
+        .long   fp_f  , fp_eq , fp_ogt, fp_oge
+        .long   fp_olt, fp_ole, fp_ogl, fp_or
+        .long   fp_un , fp_ueq, fp_ugt, fp_uge
+        .long   fp_ult, fp_ule, fp_ne , fp_t
+| Predicate routines reached through the table in fp_compute_cond.
+| Each one looks at the NAN, Z and N bits of the FPSR copy in %d1 and
+| returns -1 (true) or 0 (false) in %d0.
+| fp_f: always false.
+fp_f:
+        moveq   #0,%d0
+        rts
+| fp_eq: true when Z is set.
+fp_eq:
+        moveq   #0,%d0
+        tst_Z
+        jeq     1f
+        moveq   #-1,%d0
+1:      rts
+| fp_ogt: true when NAN, Z and N are all clear.
+fp_ogt:
+        moveq   #0,%d0
+        tst_NAN
+        jne     1f
+        tst_Z
+        jne     1f
+        tst_N
+        jne     1f
+        moveq   #-1,%d0
+1:      rts
+| fp_oge: true when Z is set, or when NAN and N are both clear.
+fp_oge:
+        moveq   #-1,%d0
+        tst_Z
+        jne     2f
+        tst_NAN
+        jne     1f
+        tst_N
+        jeq     2f
+1:      moveq   #0,%d0
+2:      rts
+| fp_olt: true when N is set while NAN and Z are clear.
+fp_olt:
+        moveq   #0,%d0
+        tst_NAN
+        jne     1f
+        tst_Z
+        jne     1f
+        tst_N
+        jeq     1f
+        moveq   #-1,%d0
+1:      rts
+| fp_ole: true when Z is set, or when N is set and NAN is clear.
+fp_ole:
+        moveq   #-1,%d0
+        tst_Z
+        jne     2f
+        tst_NAN
+        jne     1f
+        tst_N
+        jne     2f
+1:      moveq   #0,%d0
+2:      rts
+| fp_ogl: true when NAN and Z are both clear.
+fp_ogl:
+        moveq   #0,%d0
+        tst_NAN
+        jne     1f
+        tst_Z
+        jne     1f
+        moveq   #-1,%d0
+1:      rts
+| fp_or: true when NAN is clear.
+fp_or:
+        moveq   #0,%d0
+        tst_NAN
+        jne     1f
+        moveq   #-1,%d0
+1:      rts
+| fp_un: true (-1 in %d0) when the NAN bit of the FPSR copy in %d1 is
+| set, false (0) otherwise.  The routine carries its own local exit
+| label, like every other predicate here, so that the forward branch
+| does not depend on a label inside whatever routine follows.
+fp_un:
+        moveq   #0,%d0
+        tst_NAN
+        jeq     1f
+        moveq   #-1,%d0
+1:      rts
+| Remaining predicate routines; same convention as the others: the
+| NAN, Z and N bits of the FPSR copy in %d1 are tested and %d0 comes
+| back as -1 (true) or 0 (false).
+| fp_ueq: true when NAN or Z is set.
+fp_ueq:
+        moveq   #-1,%d0
+        tst_NAN
+        jne     1f
+        tst_Z
+        jne     1f
+        moveq   #0,%d0
+1:      rts
+| fp_ugt: true when NAN is set, or when N and Z are both clear.
+fp_ugt:
+        moveq   #-1,%d0
+        tst_NAN
+        jne     2f
+        tst_N
+        jne     1f
+        tst_Z
+        jeq     2f
+1:      moveq   #0,%d0
+2:      rts
+| fp_uge: true when NAN or Z is set, or when N is clear.
+fp_uge:
+        moveq   #-1,%d0
+        tst_NAN
+        jne     1f
+        tst_Z
+        jne     1f
+        tst_N
+        jeq     1f
+        moveq   #0,%d0
+1:      rts
+| fp_ult: true when NAN is set, or when N is set and Z is clear.
+fp_ult:
+        moveq   #-1,%d0
+        tst_NAN
+        jne     2f
+        tst_Z
+        jne     1f
+        tst_N
+        jne     2f
+1:      moveq   #0,%d0
+2:      rts
+| fp_ule: true when any of NAN, Z or N is set.
+fp_ule:
+        moveq   #-1,%d0
+        tst_NAN
+        jne     1f
+        tst_Z
+        jne     1f
+        tst_N
+        jne     1f
+        moveq   #0,%d0
+1:      rts
+| fp_ne: true when Z is clear.
+fp_ne:
+        moveq   #0,%d0
+        tst_Z
+        jne     1f
+        moveq   #-1,%d0
+1:      rts
+| fp_t: always true.
+fp_t:
+        moveq   #-1,%d0
+        rts
diff --git a/arch/m68k/math-emu/fp_decode.h b/arch/m68k/math-emu/fp_decode.h
new file mode 100644
index 000000000000..759679d9ab96
--- /dev/null
+++ b/arch/m68k/math-emu/fp_decode.h
@@ -0,0 +1,417 @@
+/*
+ * fp_decode.h
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _FP_DECODE_H
+#define _FP_DECODE_H
+/* These macros do the dirty work of the instr decoding, several variables
+ * can be defined in the source file to modify the work of these macros,
+ * currently the following variables are used:
+ * ...
+ * The register usage:
+ * d0 - will contain source operand for data direct mode,
+ *      otherwise scratch register
+ * d1 - upper 16bit are reserved for caller
+ *      lower 16bit may contain further arguments,
+ *      is destroyed during decoding
+ * d2 - contains first two instruction words,
+ *      first word will be used for extension word
+ * a0 - will point to source/dest operand for any indirect mode
+ *      otherwise scratch register
+ * a1 - scratch register
+ * a2 - base addr to the task structure
+ *
+ * the current implementation doesn't check for every disallowed
+ * addressing mode (e.g. pc relative modes as destination), as long
+ * as it only means a new addressing mode, which should not appear
+ * in a program and that doesn't crash the emulation, I think it's
+ * not a problem to allow these modes.
+ */
+| Default values of the assembly-time switches tested by the macros in
+| this header.  A source file may assign a different value after the
+| include.
+| do_fmovem:     nonzero makes the postincrement/predecrement macros step
+|                the address register by a multiple of %d1 instead of by
+|                an fp_datasize entry, and disables fp_test_sp_byte_move
+| do_fmovem_cr:  with do_fmovem set, selects a step of 4 bytes per count
+|                in %d1 instead of 12
+| do_no_pc_mode: nonzero removes the pc-relative branch from the
+|                disp16 and extmode0 macros
+| do_fscc:       nonzero makes fp_test_sp_byte_move load 6 into %d1
+|                itself instead of comparing the incoming %d1 with 6
+do_fmovem=0
+do_fmovem_cr=0
+do_no_pc_mode=0
+do_fscc=0
+| first decoding of the instr type
+| this separates the conditional instr
+| Extracts the 2 bit field at bit-field offset 8 of %d2 into %d0 and
+| jumps through entry %d0 of the table at label 0.  The macro ends right
+| at that label, so the invoking code has to supply the four .long
+| entries sketched in the commented lines below.
+.macro  fp_decode_cond_instr_type
+        bfextu  %d2{#8,#2},%d0
+        jmp     ([0f:w,%pc,%d0*4])
+        .align  4
+0:
+|       .long   "f<op>","fscc/fdbcc"
+|       .long   "fbccw","fbccl"
+.endm
+| second decoding of the instr type
+| this separates most move instr
+| Extracts the 3 bit field at bit-field offset 16 of %d2 into %d0 and
+| jumps through entry %d0 of the table at label 0.  The invoking code
+| has to supply the eight .long entries sketched below.
+.macro  fp_decode_move_instr_type
+        bfextu  %d2{#16,#3},%d0
+        jmp     ([0f:w,%pc,%d0*4])
+        .align  4
+0:
+|       .long   "f<op> fpx,fpx","invalid instr"
+|       .long   "f<op> <ea>,fpx","fmove fpx,<ea>"
+|       .long   "fmovem <ea>,fpcr","fmovem <ea>,fpx"
+|       .long   "fmovem fpcr,<ea>","fmovem fpx,<ea>"
+.endm
+| extract the source specifier, specifies
+| either source fp register or data format
+| result: %d0 = 3 bit field at bit-field offset 19 of %d2
+.macro  fp_decode_sourcespec
+        bfextu  %d2{#19,#3},%d0
+.endm
+| decode destination format for fmove reg,ea
+| result: %d0 = 3 bit field at bit-field offset 19 of %d2
+| (the same field that fp_decode_sourcespec extracts)
+.macro  fp_decode_dest_format
+        bfextu  %d2{#19,#3},%d0
+.endm
+| decode source register for fmove reg,ea
+| result: %d0 = 3 bit field at bit-field offset 22 of %d2
+.macro  fp_decode_src_reg
+        bfextu  %d2{#22,#3},%d0
+.endm
+| extract the addressing mode
+| it depends on the instr which of the modes is valid
+| Extracts the 3 bit field at bit-field offset 10 of %d2 into %d0 and
+| jumps through entry %d0 of the table at label 0.  The invoking code
+| has to supply the eight .long entries sketched below.
+.macro  fp_decode_addr_mode
+        bfextu  %d2{#10,#3},%d0
+        jmp     ([0f:w,%pc,%d0*4])
+        .align  4
+0:
+|       .long   "data register direct","addr register direct"
+|       .long   "addr register indirect"
+|       .long   "addr register indirect postincrement"
+|       .long   "addr register indirect predecrement"
+|       .long   "addr register + index16"
+|       .long   "extension mode1","extension mode2"
+.endm
+| extract the register for the addressing mode
+| result: %d0 = 3 bit field at bit-field offset 13 of %d2
+.macro  fp_decode_addr_reg
+        bfextu  %d2{#13,#3},%d0
+.endm
+| decode the 8bit displacement from the brief extension word
+| result: low word of %d0 = low byte of %d2, sign extended to 16 bit
+| (the upper word of %d0 is left as it was)
+.macro  fp_decode_disp8
+        move.b  %d2,%d0
+        ext.w   %d0
+.endm
+| decode the index of the brief/full extension word
+| in:     %d2 = extension word in the low 16 bit
+| out:    %d0 = value of the index register, sign extended from 16 bit
+|         when bit 11 of %d2 is clear, shifted left by the scale field
+| uses:   %d1 (scale), %a0 (address register fetch and debug output)
+| NOTE(review): lines prefixed with debug are presumably assembled only
+| in FPU_EMU_DEBUG builds; the debug macro is not visible here.
+.macro  fp_decode_index
+        bfextu  %d2{#17,#3},%d0         | get the register nr
+        btst    #15,%d2                 | test for data/addr register
+        jne     1\@f
+        printf  PDECODE,"d%d",1,%d0
+        jsr     fp_get_data_reg
+        jra     2\@f
+1\@:    printf  PDECODE,"a%d",1,%d0
+        jsr     fp_get_addr_reg
+        move.l  %a0,%d0
+2\@:
+debug   lea     "'l'.w,%a0"
+        btst    #11,%d2                 | 16/32 bit size?
+        jne     3\@f
+debug   lea     "'w'.w,%a0"
+        ext.l   %d0
+3\@:    printf  PDECODE,":%c",1,%a0
+        move.w  %d2,%d1                 | scale factor
+        | rotating left by 7 brings bits 9 and 10 down to bits 0 and 1
+        rol.w   #7,%d1
+        and.w   #3,%d1
+debug   move.l  "%d1,-(%sp)"
+debug   ext.l   "%d1"
+        printf  PDECODE,":%d",1,%d1
+debug   move.l  "(%sp)+,%d1"
+        lsl.l   %d1,%d0
+.endm
+| decode the base displacement size
+| Extracts the 2 bit field at bit-field offset 26 of %d2 into %d0 and
+| jumps through entry %d0 of the table at label 0.  The invoking code
+| has to supply the four .long entries sketched below.
+.macro  fp_decode_basedisp
+        bfextu  %d2{#26,#2},%d0
+        jmp     ([0f:w,%pc,%d0*4])
+        .align  4
+0:
+|       .long   "reserved","null displacement"
+|       .long   "word displacement","long displacement"
+.endm
+| decode the outer displacement size
+| Extracts the 2 bit field at bit-field offset 30 of %d2 into %d0 and
+| jumps through entry %d0 of the table at label 0.  The invoking code
+| has to supply the four .long entries sketched below.
+.macro  fp_decode_outerdisp
+        bfextu  %d2{#30,#2},%d0
+        jmp     ([0f:w,%pc,%d0*4])
+        .align  4
+0:
+|       .long   "no memory indirect action/reserved","null outer displacement"
+|       .long   "word outer displacement","long outer displacement"
+.endm
+| get the extension word and test for brief or full extension type
+| Fetches the next instruction word into %d2 (fp_err_ua1 is passed as
+| the error target of the fetch) and jumps to \label when bit 8 of the
+| word is set; falls through when it is clear.
+.macro  fp_get_test_extword label
+        fp_get_instr_word %d2,fp_err_ua1
+        btst    #8,%d2
+        jne     \label
+.endm
+| test if %pc is the base register for the indirect addr mode
+| jumps to \label when bit 20 of %d2 is clear; the fall-through path
+| is the one the callers treat as pc relative
+.macro  fp_test_basereg_d16     label
+        btst    #20,%d2
+        jeq     \label
+.endm
+| test if %pc is the base register for one of the extended modes
+| jumps to \label when bit 19 of %d2 is clear; the fall-through path
+| is the one the callers treat as pc relative
+.macro  fp_test_basereg_ext     label
+        btst    #19,%d2
+        jeq     \label
+.endm
+| test for a suppressed index register
+| jumps to \label when bit 6 of the extension word in %d2 is set
+.macro  fp_test_suppr_index label
+        btst    #6,%d2
+        jne     \label
+.endm
+| addressing mode: data register direct
+| only leaves the register number in %d0 (and prints it in debug
+| builds); fetching the register contents is left to the caller
+.macro  fp_mode_data_direct
+        fp_decode_addr_reg
+        printf  PDECODE,"d%d",1,%d0
+.endm
+| addressing mode: address register indirect
+| decodes the register number into %d0 and calls fp_get_addr_reg,
+| which returns the contents of that address register in %a0
+.macro  fp_mode_addr_indirect
+        fp_decode_addr_reg
+        printf  PDECODE,"(a%d)",1,%d0
+        jsr     fp_get_addr_reg
+.endm
+| adjust stack for byte moves from/to stack
+| Assembles to nothing when do_fmovem is set.  Otherwise: with do_fscc
+| set %d1 is first loaded with 6; then, if the register number in %d0
+| is 7 and %d1 is 6, %d1 is replaced by 4.  %d1 is used afterwards as
+| index into fp_datasize by the postincrement/predecrement macros.
+| NOTE(review): presumably index 6 is the byte format and index 4 a
+| two byte format, which keeps the stack pointer even - confirm
+| against the fp_datasize table.
+.macro  fp_test_sp_byte_move
+        .if     !do_fmovem
+        .if     do_fscc
+        move.w  #6,%d1
+        .endif
+        cmp.w   #7,%d0
+        jne     1\@f
+        .if     !do_fscc
+        cmp.w   #6,%d1
+        jne     1\@f
+        .endif
+        move.w  #4,%d1
+1\@:
+        .endif
+.endm
+| addressing mode: address register indirect with postincrement
+| out:  %a0 = address held by the register before the increment
+| The register is written back through fp_put_addr_reg after being
+| advanced by
+|   - the fp_datasize entry selected by %d1       (do_fmovem clear)
+|   - %d1 * 4 bytes                               (do_fmovem_cr set)
+|   - %d1 * 4 + %d1 * 8 = %d1 * 12 bytes          (do_fmovem only)
+| uses: %a1 to keep the original address, %d0 (register number)
+.macro  fp_mode_addr_indirect_postinc
+        fp_decode_addr_reg
+        printf  PDECODE,"(a%d)+",1,%d0
+        fp_test_sp_byte_move
+        jsr     fp_get_addr_reg
+        move.l  %a0,%a1                 | save addr
+        .if     do_fmovem
+        lea     (%a0,%d1.w*4),%a0
+        .if     !do_fmovem_cr
+        lea     (%a0,%d1.w*8),%a0
+        .endif
+        .else
+        add.w   (fp_datasize,%d1.w*2),%a0
+        .endif
+        jsr     fp_put_addr_reg
+        move.l  %a1,%a0
+.endm
+| addressing mode: address register indirect with predecrement
+| The register is decremented and written back through fp_put_addr_reg:
+|   - do_fmovem without do_fmovem_cr: by %d1 * 12 bytes (%d1 is negated,
+|     applied scaled by 4, doubled and applied scaled by 4 again);
+|     %a0 is returned as the old address minus 12
+|   - do_fmovem with do_fmovem_cr: by %d1 * 4 bytes; %a0 is the new
+|     address
+|   - otherwise: by the fp_datasize entry selected by %d1; %a0 is the
+|     new address
+| %d1 is left negated (and doubled in the first case) on the fmovem
+| paths.
+.macro  fp_mode_addr_indirect_predec
+        fp_decode_addr_reg
+        printf  PDECODE,"-(a%d)",1,%d0
+        fp_test_sp_byte_move
+        jsr     fp_get_addr_reg
+        .if     do_fmovem
+        .if     !do_fmovem_cr
+        lea     (-12,%a0),%a1           | setup to addr of 1st reg to move
+        neg.w   %d1
+        lea     (%a0,%d1.w*4),%a0
+        add.w   %d1,%d1
+        lea     (%a0,%d1.w*4),%a0
+        jsr     fp_put_addr_reg
+        move.l  %a1,%a0
+        .else
+        neg.w   %d1
+        lea     (%a0,%d1.w*4),%a0
+        jsr     fp_put_addr_reg
+        .endif
+        .else
+        sub.w   (fp_datasize,%d1.w*2),%a0
+        jsr     fp_put_addr_reg
+        .endif
+.endm
+| addressing mode: address register/program counter indirect
+|                  with 16bit displacement
+| out:  %a0 = base (pc or address register) + displacement word
+| The displacement is fetched from the instruction stream into %a1.
+| With do_no_pc_mode set the pc branch is not assembled and the base
+| always comes from fp_get_addr_reg.
+.macro  fp_mode_addr_indirect_disp16
+        .if     !do_no_pc_mode
+        fp_test_basereg_d16 1f
+        printf  PDECODE,"pc"
+        fp_get_pc %a0
+        jra     2f
+        .endif
+1:      fp_decode_addr_reg
+        printf  PDECODE,"a%d",1,%d0
+        jsr     fp_get_addr_reg
+2:      fp_get_instr_word %a1,fp_err_ua1
+        printf  PDECODE,"@(%x)",1,%a1
+        add.l   %a1,%a0
+.endm
+| perform preindex (if I/IS == 0xx and xx != 00)
+| When the low two bits of %d2 are not both zero and bit 2 is clear,
+| %a1 is replaced by the long word read from user space at (%a1);
+| fp_err_ua1 is passed as the error target of that read.  Otherwise
+| %a1 is left alone.  Uses %d0 as scratch.
+.macro  fp_do_preindex
+        moveq   #3,%d0
+        and.w   %d2,%d0
+        jeq     1f
+        btst    #2,%d2
+        jne     1f
+        printf  PDECODE,")@("
+        getuser.l (%a1),%a1,fp_err_ua1,%a1
+debug   jra     "2f"
+1:      printf  PDECODE,","
+2:
+.endm
+| perform postindex (if I/IS == 1xx)
+| When bit 2 of %d2 is set, %a1 is replaced by the long word read from
+| user space at (%a1); fp_err_ua1 is passed as the error target of that
+| read.  Otherwise %a1 is left alone.
+.macro  fp_do_postindex
+        btst    #2,%d2
+        jeq     1f
+        printf  PDECODE,")@("
+        getuser.l (%a1),%a1,fp_err_ua1,%a1
+debug   jra     "2f"
+1:      printf  PDECODE,","
+2:
+.endm
+| all other indirect addressing modes will finally end up here
+| out:  %a0 = effective address
+| The address is accumulated in %a1.  %d2 is swapped on entry so that
+| its low word can receive the extension word, and swapped back on
+| exit.  Uses %d0, %d1 and %a0 as scratch.  The numeric labels are
+| reused several times, each reference binds to the nearest one.
+.macro  fp_mode_addr_indirect_extmode0
+        .if     !do_no_pc_mode
+        fp_test_basereg_ext 1f
+        printf  PDECODE,"pc"
+        fp_get_pc %a0
+        jra     2f
+        .endif
+1:      fp_decode_addr_reg
+        printf  PDECODE,"a%d",1,%d0
+        jsr     fp_get_addr_reg
+2:      move.l  %a0,%a1
+        swap    %d2
+        fp_get_test_extword 3f
+        | addressing mode: address register/program counter indirect
+        |                  with index and 8bit displacement
+        fp_decode_disp8
+debug   ext.l   "%d0"
+        printf  PDECODE,"@(%x,",1,%d0
+        add.w   %d0,%a1
+        fp_decode_index
+        add.l   %d0,%a1
+        printf  PDECODE,")"
+        jra     9f
+3:      | addressing mode: address register/program counter memory indirect
+        |                  with base and/or outer displacement
+        btst    #7,%d2                  | base register suppressed?
+        jeq     1f
+        printf  PDECODE,"!"
+        sub.l   %a1,%a1
+1:      printf  PDECODE,"@("
+        | jump table for fp_decode_basedisp; the null displacement entry
+        | lands directly behind the add when the debug code is absent
+        fp_decode_basedisp
+        .long   fp_ill,1f
+        .long   2f,3f
+#ifdef FPU_EMU_DEBUG
+1:      printf  PDECODE,"0"             | null base displacement
+        jra     1f
+#endif
+2:      fp_get_instr_word %a0,fp_err_ua1 | 16bit base displacement
+        printf  PDECODE,"%x:w",1,%a0
+        jra     4f
+3:      fp_get_instr_long %a0,fp_err_ua1 | 32bit base displacement
+        printf  PDECODE,"%x:l",1,%a0
+4:      add.l   %a0,%a1
+1:
+        | postindexed: fetch the pointer before the index is added
+        fp_do_postindex
+        fp_test_suppr_index 1f
+        fp_decode_index
+        add.l   %d0,%a1
+        | preindexed: fetch the pointer after the index was added
+1:      fp_do_preindex
+        | jump table for fp_decode_outerdisp
+        fp_decode_outerdisp
+        .long   5f,1f
+        .long   2f,3f
+#ifdef FPU_EMU_DEBUG
+1:      printf  PDECODE,"0"             | null outer displacement
+        jra     1f
+#endif
+2:      fp_get_instr_word %a0,fp_err_ua1 | 16bit outer displacement
+        printf  PDECODE,"%x:w",1,%a0
+        jra     4f
+3:      fp_get_instr_long %a0,fp_err_ua1 | 32bit outer displacement
+        printf  PDECODE,"%x:l",1,%a0
+4:      add.l   %a0,%a1
+1:
+5:      printf  PDECODE,")"
+9:      move.l  %a1,%a0
+        swap    %d2
+.endm
+| get the absolute short address from user space
+| out:  %a0 = next instruction word (fp_err_ua1 is passed as the
+|       error target of the fetch)
+.macro  fp_mode_abs_short
+        fp_get_instr_word %a0,fp_err_ua1
+        printf  PDECODE,"%x.w",1,%a0
+.endm
+| get the absolute long address from user space
+| out:  %a0 = next instruction long word (fp_err_ua1 is passed as the
+|       error target of the fetch)
+.macro  fp_mode_abs_long
+        fp_get_instr_long %a0,fp_err_ua1
+        printf  PDECODE,"%x.l",1,%a0
+.endm
+#endif /* _FP_DECODE_H */
diff --git a/arch/m68k/math-emu/fp_emu.h b/arch/m68k/math-emu/fp_emu.h
new file mode 100644
index 000000000000..1d6edc975d89
--- /dev/null
+++ b/arch/m68k/math-emu/fp_emu.h
@@ -0,0 +1,146 @@
+/*
+ * fp_emu.h
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _FP_EMU_H
+#define _FP_EMU_H
+#ifdef __ASSEMBLY__
+#include <asm/offsets.h>
+#endif
+#include <asm/math-emu.h>
+#ifndef __ASSEMBLY__
+/*
+ * True when the exponent field is all ones.  The mantissa is not examined,
+ * so this also matches any other value that carries exponent 0x7fff.
+ */
+#define IS_INF(a) ((a)->exp == 0x7fff)
+/*
+ * True when the 64 bit mantissa is zero.  The exponent is not examined, so
+ * a value with exponent 0x7fff and a zero mantissa satisfies this as well;
+ * test IS_INF() first where the difference matters.
+ */
+#define IS_ZERO(a) ((a)->mant.m64 == 0)
+/* Set bit number 'bit' in the emulated status register FPDATA->fpsr. */
+#define fp_set_sr(bit) ({                                       \
+        FPDATA->fpsr = FPDATA->fpsr | (1 << (bit));             \
+})
+/*
+ * Replace bits 16-23 of FPDATA->fpsr with the low eight bits of 'quotient';
+ * all other status bits are preserved.
+ */
+#define fp_set_quotient(quotient) ({                            \
+        FPDATA->fpsr = (FPDATA->fpsr & 0xff00ffff) |            \
+                       (((quotient) & 0xff) << 16);             \
+})
+/* linkage for several useful functions */
+/*
+ * Normalize the extended struct, return 0 for a NaN.
+ *
+ * Calls the assembly routine fp_conv_ext2ext with the operand pointer in a0
+ * and hands back what the routine leaves in d0.  a1, d1, d2 and memory are
+ * declared clobbered.  The locals carry a double underscore prefix so that
+ * an argument expression named "reg" or "res" cannot be captured.
+ */
+#define fp_normalize_ext(fpreg) ({                              \
+        register struct fp_ext *__reg asm ("a0") = (fpreg);     \
+        register int __res asm ("d0");                          \
+                                                                \
+        asm volatile ("jsr fp_conv_ext2ext"                     \
+                        : "=d" (__res) : "a" (__reg)            \
+                        : "a1", "d1", "d2", "memory");          \
+        __res;                                                  \
+})
+/*
+ * Copy one extended value over another by structure assignment.  Both
+ * arguments are pointers; they are parenthesized so that any pointer
+ * expression may be passed.
+ */
+#define fp_copy_ext(dest, src) ({                               \
+        *(dest) = *(src);                                       \
+})
+/*
+ * Common prologue of the one-operand routines: copy src to dest and
+ * normalize dest.  NOTE: when fp_normalize_ext() yields 0 (documented there
+ * as the NaN case) this macro executes "return dest" in the CALLING
+ * function, so it may only be used inside functions returning
+ * struct fp_ext *.
+ */
+#define fp_monadic_check(dest, src) ({                          \
+        fp_copy_ext(dest, src);                                 \
+        if (!fp_normalize_ext(dest))                            \
+                return dest;                                    \
+})
+/*
+ * Common prologue of the two-operand routines: normalize dest, then src
+ * (src is normalized in place).  NOTE: this macro returns from the CALLING
+ * function when either normalization yields 0 (documented at
+ * fp_normalize_ext() as the NaN case): dest is returned as is in the first
+ * case, and src is copied to dest before returning in the second.
+ */
+#define fp_dyadic_check(dest, src) ({                           \
+        if (!fp_normalize_ext(dest))                            \
+                return dest;                                    \
+        if (!fp_normalize_ext(src)) {                           \
+                fp_copy_ext(dest, src);                         \
+                return dest;                                    \
+        }                                                       \
+})
+/* Constant operands defined in fp_arith.c. */
+extern const struct fp_ext fp_QNaN;
+extern const struct fp_ext fp_Inf;
+/*
+ * Flag an operand error (FPSR_EXC_OPERR) in the status register and
+ * overwrite *dest with the quiet NaN constant.  The argument is
+ * parenthesized so that any pointer expression may be passed.
+ */
+#define fp_set_nan(dest) ({                                     \
+        fp_set_sr(FPSR_EXC_OPERR);                              \
+        *(dest) = fp_QNaN;                                      \
+})
+/*
+ * Flag an overflow (FPSR_EXC_OVFL) and turn *dest into an infinity by
+ * setting the exponent to 0x7fff and clearing the mantissa; the sign field
+ * is left untouched.  The argument is parenthesized so that any pointer
+ * expression may be passed.
+ */
+/* TODO check rounding mode? */
+#define fp_set_ovrflw(dest) ({                                  \
+        fp_set_sr(FPSR_EXC_OVFL);                               \
+        (dest)->exp = 0x7fff;                                   \
+        (dest)->mant.m64 = 0;                                   \
+})
+/*
+ * Call the assembly routine fp_conv_ext2long with the operand pointer in a0
+ * and yield what the routine leaves in d0 as an int.  a1, d1, d2 and memory
+ * are declared clobbered.
+ */
+#define fp_conv_ext2long(src) ({                                \
+        register struct fp_ext *__src asm ("a0") = src;         \
+        register int __res asm ("d0");                          \
+                                                                \
+        asm volatile ("jsr fp_conv_ext2long"                    \
+                        : "=d" (__res) : "a" (__src)            \
+                        : "a1", "d1", "d2", "memory");          \
+        __res;                                                  \
+})
+/*
+ * Convert the integer 'src' to extended format and store it at 'dest'.
+ * Calls the assembly routine fp_conv_long2ext with the integer in d0 and
+ * the destination pointer in a0.  (The previous version jumped to
+ * fp_conv_ext2long, i.e. the conversion in the opposite direction.)
+ * a1, d1, d2 and memory are declared clobbered.
+ */
+#define fp_conv_long2ext(dest, src) ({                          \
+        register struct fp_ext *__dest asm ("a0") = dest;       \
+        register int __src asm ("d0") = src;                    \
+                                                                \
+        asm volatile ("jsr fp_conv_long2ext"                    \
+                        : : "d" (__src), "a" (__dest)           \
+                        : "a1", "d1", "d2", "memory");          \
+})
+#else /* __ASSEMBLY__ */
+/*
+ * set, reset or clear a bit in the fp status register
+ *
+ * The bit instructions work on a single byte: bit number \bit of the
+ * 32 bit status word lives in the byte at offset 3-(\bit/8) from
+ * FPD_FPSR, at position \bit&7 within that byte.
+ */
+.macro  fp_set_sr       bit
+        bset    #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
+.endm
+| clear bit number \bit of the status word at FPD_FPSR; same byte and
+| bit addressing as fp_set_sr
+.macro  fp_clr_sr       bit
+        bclr    #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
+.endm
+| test bit number \bit of the status word at FPD_FPSR; btst leaves the
+| Z condition code set when the bit is clear
+.macro  fp_tst_sr       bit
+        btst    #(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
+.endm
+#endif /* __ASSEMBLY__ */
+#endif /* _FP_EMU_H */
diff --git a/arch/m68k/math-emu/fp_entry.S b/arch/m68k/math-emu/fp_entry.S
new file mode 100644
index 000000000000..5ec2d9101ea3
--- /dev/null
+++ b/arch/m68k/math-emu/fp_entry.S
@@ -0,0 +1,325 @@
+/*
+ * fp_entry.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/entry.h>
+#include "fp_emu.h"
+        .globl  fpu_emu
+        .globl  fp_debugprint
+        .globl  fp_err_ua1,fp_err_ua2
+        .text
+| fpu_emu: entry point of the emulator.  Saves the registers, lets
+| fp_scan emulate the instruction and leaves through
+| ret_from_exception.
+fpu_emu:
+        SAVE_ALL_INT
+        GET_CURRENT(%d0)
+        | when both cpu families are configured the frame fixup below is
+        | skipped at run time unless m68k_is040or060 is nonzero
+#if defined(CPU_M68020_OR_M68030) && defined(CPU_M68040_OR_M68060)
+        tst.l   m68k_is040or060
+        jeq     1f
+#endif
+#if defined(CPU_M68040_OR_M68060)
+        | copy the frame slot FPS_PC2 over the slot FPS_PC
+        move.l  (FPS_PC2,%sp),(FPS_PC,%sp)
+#endif
+1:
+        | emulate the instruction
+        jsr     fp_scan
+#if defined(CONFIG_M68060)
+#if !defined(CPU_M68060_ONLY)
+        btst    #3,m68k_cputype+3
+        jeq     1f
+#endif
+        | bit 7 of the byte at FPS_SR set: deliver a trace signal
+        btst    #7,(FPS_SR,%sp)
+        jne     fp_sendtrace060
+#endif
+1:
+        | emulation successful?
+        tst.l   %d0
+        jeq     ret_from_exception
+        | send some signal to program here
+        | (not implemented: both outcomes currently take the same exit)
+        jra     ret_from_exception
+        | we jump here after an access error while trying to access
+        | user space, we correct stackpointer and send a SIGSEGV to
+        | the user process
+        | fp_err_ua2 drops two long words from the stack, fp_err_ua1 one,
+        | before fpemu_signal is called with the arguments SIGSEGV,
+        | SEGV_MAPERR and the contents of %a0 (pushed in reverse order).
+        | NOTE(review): %a0 presumably holds the faulting user address and
+        | the dropped long words are return addresses - confirm at the
+        | places that pass these labels as error targets.
+fp_err_ua2:
+        addq.l  #4,%sp
+fp_err_ua1:
+        addq.l  #4,%sp
+        move.l  %a0,-(%sp)
+        pea     SEGV_MAPERR
+        pea     SIGSEGV
+        jsr     fpemu_signal
+        add.w   #12,%sp                 | remove the three arguments
+        jra     ret_from_exception
+#if defined(CONFIG_M68060)
+        | send a trace signal if we are debugged
+        | it does not really belong here, but...
+        | calls fpemu_signal with the arguments SIGTRAP, TRAP_TRACE and
+        | the frame slot FPS_PC (pushed in reverse order), then leaves
+        | through ret_from_exception
+fp_sendtrace060:
+        move.l  (FPS_PC,%sp),-(%sp)
+        pea     TRAP_TRACE
+        pea     SIGTRAP
+        jsr     fpemu_signal
+        add.w   #12,%sp                 | remove the three arguments
+        jra     ret_from_exception
+#endif
+        .globl  fp_get_data_reg, fp_put_data_reg
+        .globl  fp_get_addr_reg, fp_put_addr_reg
+        | Entry points to get/put a register. Some of them can be get/put
+        | directly, others are on the stack, as we read/write the stack
+        | directly here, these function may only be called from within
+        | instruction decoding, otherwise the stack pointer is incorrect
+        | and the stack gets corrupted.
+        |
+        | fp_get_data_reg: in %d0 = data register number (0-7),
+        | out %d0 = value of that register.  d0-d2 are read from the saved
+        | frame, d3-d7 from the live registers.
+        | NOTE(review): the +8 in the frame offsets presumably skips two
+        | return addresses lying above the saved registers - confirm.
+fp_get_data_reg:
+        jmp     ([0f:w,%pc,%d0.w*4])
+        .align  4
+0:
+        .long   fp_get_d0, fp_get_d1
+        .long   fp_get_d2, fp_get_d3
+        .long   fp_get_d4, fp_get_d5
+        .long   fp_get_d6, fp_get_d7
+fp_get_d0:
+        move.l  (PT_D0+8,%sp),%d0
+        printf  PREGISTER,"{d0->%08x}",1,%d0
+        rts
+fp_get_d1:
+        move.l  (PT_D1+8,%sp),%d0
+        printf  PREGISTER,"{d1->%08x}",1,%d0
+        rts
+fp_get_d2:
+        move.l  (PT_D2+8,%sp),%d0
+        printf  PREGISTER,"{d2->%08x}",1,%d0
+        rts
+fp_get_d3:
+        move.l  %d3,%d0
+        printf  PREGISTER,"{d3->%08x}",1,%d0
+        rts
+fp_get_d4:
+        move.l  %d4,%d0
+        printf  PREGISTER,"{d4->%08x}",1,%d0
+        rts
+fp_get_d5:
+        move.l  %d5,%d0
+        printf  PREGISTER,"{d5->%08x}",1,%d0
+        rts
+fp_get_d6:
+        move.l  %d6,%d0
+        printf  PREGISTER,"{d6->%08x}",1,%d0
+        rts
+fp_get_d7:
+        move.l  %d7,%d0
+        printf  PREGISTER,"{d7->%08x}",1,%d0
+        rts
+        | fp_put_data_reg: in %d1 = data register number (0-7),
+        | %d0 = value to store.  The register number travels in %d1 here
+        | because %d0 carries the value.  d0-d5 are written to the saved
+        | frame, d6 and d7 to the live registers.
+        | NOTE(review): fp_get_d3..d5 read the live registers while the
+        | put side stores into the saved frame (the direct moves are
+        | commented out) - confirm this asymmetry against the frame
+        | layout created by SAVE_ALL_INT.
+fp_put_data_reg:
+        jmp     ([0f:w,%pc,%d1.w*4])
+        .align  4
+0:
+        .long   fp_put_d0, fp_put_d1
+        .long   fp_put_d2, fp_put_d3
+        .long   fp_put_d4, fp_put_d5
+        .long   fp_put_d6, fp_put_d7
+fp_put_d0:
+        printf  PREGISTER,"{d0<-%08x}",1,%d0
+        move.l  %d0,(PT_D0+8,%sp)
+        rts
+fp_put_d1:
+        printf  PREGISTER,"{d1<-%08x}",1,%d0
+        move.l  %d0,(PT_D1+8,%sp)
+        rts
+fp_put_d2:
+        printf  PREGISTER,"{d2<-%08x}",1,%d0
+        move.l  %d0,(PT_D2+8,%sp)
+        rts
+fp_put_d3:
+        printf  PREGISTER,"{d3<-%08x}",1,%d0
+|       move.l  %d0,%d3
+        move.l  %d0,(PT_D3+8,%sp)
+        rts
+fp_put_d4:
+        printf  PREGISTER,"{d4<-%08x}",1,%d0
+|       move.l  %d0,%d4
+        move.l  %d0,(PT_D4+8,%sp)
+        rts
+fp_put_d5:
+        printf  PREGISTER,"{d5<-%08x}",1,%d0
+|       move.l  %d0,%d5
+        move.l  %d0,(PT_D5+8,%sp)
+        rts
+fp_put_d6:
+        printf  PREGISTER,"{d6<-%08x}",1,%d0
+        move.l  %d0,%d6
+        rts
+fp_put_d7:
+        printf  PREGISTER,"{d7<-%08x}",1,%d0
+        move.l  %d0,%d7
+        rts
+        | fp_get_addr_reg: in %d0 = address register number (0-7),
+        | out %a0 = value of that register.  a0-a2 are read from the saved
+        | frame, a3-a6 from the live registers and a7 from the user stack
+        | pointer (%usp).
+fp_get_addr_reg:
+        jmp     ([0f:w,%pc,%d0.w*4])
+        .align  4
+0:
+        .long   fp_get_a0, fp_get_a1
+        .long   fp_get_a2, fp_get_a3
+        .long   fp_get_a4, fp_get_a5
+        .long   fp_get_a6, fp_get_a7
+fp_get_a0:
+        move.l  (PT_A0+8,%sp),%a0
+        printf  PREGISTER,"{a0->%08x}",1,%a0
+        rts
+fp_get_a1:
+        move.l  (PT_A1+8,%sp),%a0
+        printf  PREGISTER,"{a1->%08x}",1,%a0
+        rts
+fp_get_a2:
+        move.l  (PT_A2+8,%sp),%a0
+        printf  PREGISTER,"{a2->%08x}",1,%a0
+        rts
+fp_get_a3:
+        move.l  %a3,%a0
+        printf  PREGISTER,"{a3->%08x}",1,%a0
+        rts
+fp_get_a4:
+        move.l  %a4,%a0
+        printf  PREGISTER,"{a4->%08x}",1,%a0
+        rts
+fp_get_a5:
+        move.l  %a5,%a0
+        printf  PREGISTER,"{a5->%08x}",1,%a0
+        rts
+fp_get_a6:
+        move.l  %a6,%a0
+        printf  PREGISTER,"{a6->%08x}",1,%a0
+        rts
+fp_get_a7:
+        move.l  %usp,%a0
+        printf  PREGISTER,"{a7->%08x}",1,%a0
+        rts
+        | fp_put_addr_reg: in %d0 = address register number (0-7),
+        | %a0 = value to store.  a0-a2 are written to the saved frame,
+        | a3-a6 to the live registers and a7 to the user stack pointer
+        | (%usp).
+fp_put_addr_reg:
+        jmp     ([0f:w,%pc,%d0.w*4])
+        .align  4
+0:
+        .long   fp_put_a0, fp_put_a1
+        .long   fp_put_a2, fp_put_a3
+        .long   fp_put_a4, fp_put_a5
+        .long   fp_put_a6, fp_put_a7
+fp_put_a0:
+        printf  PREGISTER,"{a0<-%08x}",1,%a0
+        move.l  %a0,(PT_A0+8,%sp)
+        rts
+fp_put_a1:
+        printf  PREGISTER,"{a1<-%08x}",1,%a0
+        move.l  %a0,(PT_A1+8,%sp)
+        rts
+fp_put_a2:
+        printf  PREGISTER,"{a2<-%08x}",1,%a0
+        move.l  %a0,(PT_A2+8,%sp)
+        rts
+fp_put_a3:
+        printf  PREGISTER,"{a3<-%08x}",1,%a0
+        move.l  %a0,%a3
+        rts
+fp_put_a4:
+        printf  PREGISTER,"{a4<-%08x}",1,%a0
+        move.l  %a0,%a4
+        rts
+fp_put_a5:
+        printf  PREGISTER,"{a5<-%08x}",1,%a0
+        move.l  %a0,%a5
+        rts
+fp_put_a6:
+        printf  PREGISTER,"{a6<-%08x}",1,%a0
+        move.l  %a0,%a6
+        rts
+fp_put_a7:
+        printf  PREGISTER,"{a7<-%08x}",1,%a0
+        move.l  %a0,%usp
+        rts
+        | fp_debugprint: one long word holding the sum of the PM* masks
+        | that are enabled.  The commented lines are alternative settings.
+        | NOTE(review): presumably consulted by the printf debug macro to
+        | select the message classes to print - the macro is not visible
+        | here.
+        .data
+        .align  4
+fp_debugprint:
+|       .long   PMDECODE
+        .long   PMINSTR+PMDECODE+PMCONV+PMNORM
+|       .long   PMCONV+PMNORM+PMINSTR
+|       .long   0
diff --git a/arch/m68k/math-emu/fp_log.c b/arch/m68k/math-emu/fp_log.c
new file mode 100644
index 000000000000..87b4f0158560
--- /dev/null
+++ b/arch/m68k/math-emu/fp_log.c
@@ -0,0 +1,223 @@
+/*
+  fp_log.c: floating-point math routines for the Linux-m68k
+  floating point emulator.
+  Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel.
+  I hereby give permission, free of charge, to copy, modify, and
+  redistribute this software, in source or binary form, provided that
+  the above copyright notice and the following disclaimer are included
+  in all such copies.
+  THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
+  OR IMPLIED.
+*/
+#include "fp_emu.h"
+/*
+ * The constant 1.0: biased exponent 0x3fff and a mantissa whose only set
+ * bit is the most significant one.  Leaving the mantissa at zero would make
+ * the constant satisfy IS_ZERO(), which tests the mantissa alone.
+ */
+static const struct fp_ext fp_one =
+{
+        .exp = 0x3fff,
+        .mant = { .m64 = 0x8000000000000000ULL },
+};
+/*
+ * Arithmetic kernels implemented in fp_arith.c; each one combines src into
+ * dest and returns dest.
+ * NOTE(review): src is declared const here, whereas the definition of
+ * fp_fadd() in fp_arith.c takes a plain struct fp_ext * - confirm which
+ * of the two is intended.
+ */
+extern struct fp_ext *fp_fadd(struct fp_ext *dest, const struct fp_ext *src);
+extern struct fp_ext *fp_fdiv(struct fp_ext *dest, const struct fp_ext *src);
+extern struct fp_ext *fp_fmul(struct fp_ext *dest, const struct fp_ext *src);
+/*
+ * fp_fsqrt: dest = sqrt(src).
+ *
+ * Special cases, after the operand has been copied and normalized:
+ *   - infinity: +inf is returned unchanged, -inf yields a NaN with the
+ *     operand error bit set
+ *   - zero (either sign) is returned unchanged
+ *   - any other negative operand yields a NaN with the operand error bit set
+ * Everything else is computed with a Newton iteration on the mantissa.
+ * Returns dest.
+ */
+struct fp_ext *
+fp_fsqrt(struct fp_ext *dest, struct fp_ext *src)
+{
+        struct fp_ext tmp, src2;
+        int i, exp;
+        dprint(PINSTR, "fsqrt\n");
+        fp_monadic_check(dest, src);
+        /*
+         * IS_ZERO() looks at the mantissa only, so an infinity with a zero
+         * mantissa would pass it.  Infinities are therefore handled first.
+         */
+        if (IS_INF(dest)) {
+                if (dest->sign)
+                        fp_set_nan(dest);
+                return dest;
+        }
+        if (IS_ZERO(dest))
+                return dest;
+        if (dest->sign) {
+                fp_set_nan(dest);
+                return dest;
+        }
+        /*
+         *               sqrt(m) * 2^(p)        , if e = 2*p
+         * sqrt(m*2^e) =
+         *               sqrt(2*m) * 2^(p)      , if e = 2*p + 1
+         *
+         * So we use the last bit of the unbiased exponent e to decide
+         * whether to use m or 2*m.
+         *
+         * The operand is rescaled by overwriting its exponent: the bias
+         * 0x3FFF leaves m as it is, 0x3FFF + 1 doubles it.
+         */
+        exp = dest->exp - 0x3FFF;       /* unbiased exponent e */
+        dest->exp = 0x3FFF;
+        if (exp & 1) {                  /* e is odd: use 2*m ... */
+                dest->exp++;
+                exp--;                  /* ... and make e equal to 2*p */
+        }
+        fp_copy_ext(&src2, dest);
+        /*
+         * The Taylor series around a for sqrt(x) is:
+         *      sqrt(x) = sqrt(a) + 1/(2*sqrt(a))*(x-a) + R
+         * With a=1 this gives:
+         *      sqrt(x) = 1 + 1/2*(x-1)
+         *              = 1/2*(1+x)
+         */
+        fp_fadd(dest, &fp_one);
+        dest->exp--;            /* * 1/2 */
+        /*
+         * We now apply Newton's method to the function
+         *      f(x) := x^2 - r
+         * which has a root at x = sqrt(r).
+         *
+         * It gives:
+         *      x' := x - f(x)/f'(x)
+         *          = x - (x^2 -r)/(2*x)
+         *          = x - (x - r/x)/2
+         *          = (2*x - x + r/x)/2
+         *          = (x + r/x)/2
+         */
+        for (i = 0; i < 9; i++) {
+                fp_copy_ext(&tmp, &src2);
+                fp_fdiv(&tmp, dest);
+                fp_fadd(dest, &tmp);
+                dest->exp--;
+        }
+        /*
+         * exp is even at this point, so the division is exact.  Dividing
+         * the original odd exponent would truncate towards zero and give a
+         * result twice too large for negative odd exponents.
+         */
+        dest->exp += exp / 2;
+        return dest;
+}
+/*
+ * fp_fetoxm1: placeholder for FETOXM1.  No exponential is computed: the
+ * source operand is copied to dest, normalized and returned.
+ * NOTE(review): uprint() presumably reports the missing implementation -
+ * the macro is not visible here.
+ */
+struct fp_ext *
+fp_fetoxm1(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("fetoxm1\n");
+        fp_monadic_check(dest, src);
+        /* zero and non-zero operands are both returned unchanged */
+        return dest;
+}
+/*
+ * fp_fetox: placeholder for FETOX.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_fetox(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("fetox\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_ftwotox: placeholder for FTWOTOX.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_ftwotox(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("ftwotox\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_ftentox: placeholder for FTENTOX.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_ftentox(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("ftentox\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_flogn: placeholder for FLOGN.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_flogn(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("flogn\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_flognp1: placeholder for FLOGNP1.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_flognp1(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("flognp1\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_flog10: placeholder for FLOG10.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_flog10(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("flog10\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_flog2: placeholder for FLOG2.  Nothing is computed: the source
+ * operand is copied to dest, normalized and returned.
+ */
+struct fp_ext *fp_flog2(struct fp_ext *dest, struct fp_ext *src)
+{
+        uprint("flog2\n");
+        fp_monadic_check(dest, src);
+        return dest;
+}
+/*
+ * fp_fgetexp: dest = unbiased exponent of src, as an extended value.
+ * An infinite operand yields a NaN with the operand error bit set; a zero
+ * operand is returned unchanged.  Returns dest.
+ */
+struct fp_ext *fp_fgetexp(struct fp_ext *dest, struct fp_ext *src)
+{
+        int unbiased;
+        dprint(PINSTR, "fgetexp\n");
+        fp_monadic_check(dest, src);
+        /* the infinity test has to come before the zero test */
+        if (IS_INF(dest)) {
+                fp_set_nan(dest);
+                return dest;
+        }
+        if (!IS_ZERO(dest)) {
+                unbiased = (int)dest->exp - 0x3FFF;
+                fp_conv_long2ext(dest, unbiased);
+                fp_normalize_ext(dest);
+        }
+        return dest;
+}
+/*
+ * fp_fgetman: dest = mantissa of src, i.e. src with its exponent replaced
+ * by the bias 0x3FFF; the sign is kept.
+ * An infinite operand yields a NaN with the operand error bit set, as in
+ * fp_fgetexp(); a zero operand is returned unchanged.  Returns dest.
+ */
+struct fp_ext *
+fp_fgetman(struct fp_ext *dest, struct fp_ext *src)
+{
+        dprint(PINSTR, "fgetman\n");
+        fp_monadic_check(dest, src);
+        /*
+         * IS_ZERO() looks at the mantissa only, so an infinity with a zero
+         * mantissa would pass it.  Infinities are therefore handled first.
+         */
+        if (IS_INF(dest)) {
+                fp_set_nan(dest);
+                return dest;
+        }
+        if (IS_ZERO(dest))
+                return dest;
+        dest->exp = 0x3FFF;
+        return dest;
+}
diff --git a/arch/m68k/math-emu/fp_move.S b/arch/m68k/math-emu/fp_move.S
new file mode 100644
index 000000000000..71bdf83ba61a
--- /dev/null
+++ b/arch/m68k/math-emu/fp_move.S
@@ -0,0 +1,244 @@
+/*
+ * fp_move.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "fp_emu.h"
+#include "fp_decode.h"
+do_no_pc_mode=1                         | flag for the fp_decode.h macros; presumably rejects pc-relative modes - confirm
+        .globl  fp_fmove_fp2mem
+fp_fmove_fp2mem:                        | fmove from an fp register to a data register or to memory
+        clr.b   (2+FPD_FPSR,FPDATA)     | clear the exception byte
+        fp_decode_dest_format           | presumably leaves the destination format number in %d0
+        move.w  %d0,%d1                 | store data size twice in %d1
+        swap    %d1                     | one can be trashed below
+        move.w  %d0,%d1
+#ifdef FPU_EMU_DEBUG
+        lea     0f,%a0                  | table of format letters defined below
+        clr.l   %d0
+        move.b  (%a0,%d1.w),%d0         | letter that belongs to this format
+        printf  PDECODE,"fmove.%c ",1,%d0
+        fp_decode_src_reg
+        printf  PDECODE,"fp%d,",1,%d0
+        .data
+0:      .byte   'l','s','x','p','w','d','b','p'
+        .previous
+#endif
+        | decode addressing mode for dest
+        fp_decode_addr_mode             | presumably jumps through the table that follows
+        .long   fp_data, fp_ill         | data register direct / not allowed
+        .long   fp_indirect, fp_postinc
+        .long   fp_predecr, fp_disp16
+        .long   fp_extmode0, fp_extmode1
+        | addressing mode: data register direct
+fp_data:
+        fp_mode_data_direct             | presumably leaves the data register number in %d0
+        move.w  %d0,%d1                 | park it in the low word of %d1 (the size copy there is given up)
+        fp_decode_src_reg
+        fp_get_fp_reg                   | presumably %a0 = address of the source fp register
+        lea     (FPD_TEMPFP1,FPDATA),%a1
+        move.l  (%a0)+,(%a1)+           | copy the 12-byte register into the temp slot
+        move.l  (%a0)+,(%a1)+
+        move.l  (%a0),(%a1)
+        lea     (-8,%a1),%a0            | %a0 = start of the temp copy
+        swap    %d1                     | low word = data size, high word = register number
+        move.l  %d1,%d2                 | the handlers below take both from %d2
+        printf  PDECODE,"\n"
+        jmp     ([0f:w,%pc,%d1.w*4])    | dispatch on the data size
+        .align  4
+0:
+        .long   fp_data_long, fp_data_single
+        .long   fp_ill, fp_ill          | extended and packed cannot go to a data register
+        .long   fp_data_word, fp_ill    | neither can double
+        .long   fp_data_byte, fp_ill
+fp_data_byte:                           | store the value as a byte into a data register
+        jsr     fp_normalize_ext
+        jsr     fp_conv_ext2byte
+        move.l  %d0,%d1                 | keep the converted value
+        swap    %d2                     | low word of %d2 = register number
+        move.w  %d2,%d0
+        jsr     fp_get_data_reg         | current register contents, presumably returned in %d0
+        move.b  %d1,%d0                 | replace only the low byte
+        move.w  %d2,%d1                 | register number for fp_put_data_reg
+        jsr     fp_put_data_reg
+        jra     fp_final
+fp_data_word:                           | store the value as a word into a data register
+        jsr     fp_normalize_ext
+        jsr     fp_conv_ext2short
+        move.l  %d0,%d1                 | keep the converted value
+        swap    %d2                     | low word of %d2 = register number
+        move.w  %d2,%d0
+        jsr     fp_get_data_reg         | current register contents, presumably returned in %d0
+        move.w  %d1,%d0                 | replace only the low word
+        move.l  %d2,%d1                 | register number for fp_put_data_reg
+        jsr     fp_put_data_reg
+        jra     fp_final
+fp_data_long:                           | store the value as a long into a data register
+        jsr     fp_normalize_ext
+        jsr     fp_conv_ext2long        | converted value presumably comes back in %d0
+        swap    %d2                     | low word of %d2 = register number
+        move.w  %d2,%d1                 | register number for fp_put_data_reg
+        jsr     fp_put_data_reg         | the whole register is overwritten
+        jra     fp_final
+fp_data_single:                         | store the value in single format into a data register
+        jsr     fp_normalize_ext
+        jsr     fp_conv_ext2single      | converted value presumably comes back in %d0
+        swap    %d2                     | low word of %d2 = register number
+        move.w  %d2,%d1                 | register number for fp_put_data_reg
+        jsr     fp_put_data_reg         | the whole register is overwritten
+        jra     fp_final
+        | addressing mode: address register indirect
+fp_indirect:
+        fp_mode_addr_indirect           | presumably leaves the effective address in %a0
+        jra     fp_putdest              | fp_putdest takes the destination address from %a0
+        | addressing mode: address register indirect with postincrement
+fp_postinc:
+        fp_mode_addr_indirect_postinc
+        jra     fp_putdest
+        | addressing mode: address register indirect with predecrement
+fp_predecr:
+        fp_mode_addr_indirect_predec
+        jra     fp_putdest
+        | addressing mode: address register indirect with 16bit displacement
+fp_disp16:
+        fp_mode_addr_indirect_disp16
+        jra     fp_putdest
+fp_extmode0:                            | remaining address register indirect modes (decoded by the macro)
+        fp_mode_addr_indirect_extmode0
+        jra     fp_putdest
+fp_extmode1:                            | modes selected by the register field; only the absolute ones are valid here
+        fp_decode_addr_reg              | presumably leaves the register field in %d0
+        jmp     ([0f:w,%pc,%d0*4])      | dispatch on it
+        .align  4
+0:
+        .long   fp_abs_short, fp_abs_long
+        .long   fp_ill, fp_ill          | everything else is rejected for a destination
+        .long   fp_ill, fp_ill
+        .long   fp_ill, fp_ill
+fp_abs_short:
+        fp_mode_abs_short
+        jra     fp_putdest
+fp_abs_long:
+        fp_mode_abs_long
+        jra     fp_putdest
+fp_putdest:
+        move.l  %a0,%a1                 | %a1 = destination address
+        fp_decode_src_reg
+        move.l  %d1,%d2                 | save size
+        fp_get_fp_reg                   | presumably %a0 = address of the source fp register
+        printf  PDECODE,"\n"
+        addq.l  #8,%a0                  | point at the last longword of the register
+        move.l  (%a0),-(%sp)            | push the 12 bytes last longword first,
+        move.l  -(%a0),-(%sp)           | so the stack copy has the same layout
+        move.l  -(%a0),-(%sp)
+        move.l  %sp,%a0                 | convert the copy, not the register itself
+        jsr     fp_normalize_ext
+        swap    %d2                     | bring the preserved copy of the size into the low word
+        jmp     ([0f:w,%pc,%d2.w*4])    | dispatch on the destination format
+        .align  4
+0:
+        .long   fp_format_long, fp_format_single
+        .long   fp_format_extended, fp_format_packed
+        .long   fp_format_word, fp_format_double
+        .long   fp_format_byte, fp_format_packed
+fp_format_long:
+        jsr     fp_conv_ext2long        | converted value presumably comes back in %d0
+        putuser.l %d0,(%a1),fp_err_ua1,%a1      | store at the destination; presumably goes to fp_err_ua1 on a fault
+        jra     fp_finish_move
+fp_format_single:
+        jsr     fp_conv_ext2single      | converted value presumably comes back in %d0
+        putuser.l %d0,(%a1),fp_err_ua1,%a1      | store at the destination; presumably goes to fp_err_ua1 on a fault
+        jra     fp_finish_move
+fp_format_extended:                     | write all three longwords, repacking only the first
+        move.l  (%a0)+,%d0              | first longword of the internal form
+        lsl.w   #1,%d0                  | low word up one bit, then the whole long up 15:
+        lsl.l   #7,%d0                  | bit 16 ends in bit 31 and bits 14-0 in bits 30-16
+        lsl.l   #8,%d0                  | (presumably sign and exponent); the low word is now zero
+        putuser.l %d0,(%a1)+,fp_err_ua1,%a1
+        move.l  (%a0)+,%d0              | second longword is stored unchanged
+        putuser.l %d0,(%a1)+,fp_err_ua1,%a1
+        move.l  (%a0),%d0               | third longword is stored unchanged
+        putuser.l %d0,(%a1),fp_err_ua1,%a1
+        jra     fp_finish_move
+fp_format_packed:
+        /* not supported yet */
+        lea     (12,%sp),%sp            | drop the 12-byte copy pushed in fp_putdest before bailing out
+        jra     fp_ill
+fp_format_word:
+        jsr     fp_conv_ext2short       | converted value presumably comes back in %d0
+        putuser.w %d0,(%a1),fp_err_ua1,%a1      | store at the destination; presumably goes to fp_err_ua1 on a fault
+        jra     fp_finish_move
+fp_format_double:
+        jsr     fp_conv_ext2double      | no store here: the routine presumably writes the 8 bytes via %a1 itself - confirm
+        jra     fp_finish_move
+fp_format_byte:
+        jsr     fp_conv_ext2byte        | converted value presumably comes back in %d0
+        putuser.b %d0,(%a1),fp_err_ua1,%a1      | store at the destination; presumably goes to fp_err_ua1 on a fault
+| falls through to fp_finish_move
+fp_finish_move:
+        lea     (12,%sp),%sp            | drop the 12-byte copy pushed in fp_putdest
+        jra     fp_final
diff --git a/arch/m68k/math-emu/fp_movem.S b/arch/m68k/math-emu/fp_movem.S
new file mode 100644
index 000000000000..8354d39e6c47
--- /dev/null
+++ b/arch/m68k/math-emu/fp_movem.S
@@ -0,0 +1,368 @@
+/*
+ * fp_movem.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "fp_emu.h"
+#include "fp_decode.h"
+| set flags for decode macros for fmovem
+do_fmovem=1
+        .globl  fp_fmovem_fp, fp_fmovem_cr
+| %d1 contains the mask and count of the register list
+| for other register usage see fp_decode.h
+fp_fmovem_fp:                           | fmovem of fp data registers: fetch the list, count it, decode the address
+        printf  PDECODE,"fmovem.x "
+        btst    #11,%d2                 | get register list and count them; bit 11 set = list is in a data register
+        jne     1f
+        bfextu  %d2{#24,#8},%d0         | static register list
+        jra     2f
+1:      bfextu  %d2{#25,#3},%d0         | dynamic register list
+        jsr     fp_get_data_reg
+        and.l   #0xff,%d0               | only the low byte is the list; upper bits would otherwise seed the count
+2:      move.l  %d0,%d1
+        swap    %d1                     | list in the high word, count in the low word starts at 0
+        jra     2f
+1:      addq.w  #1,%d1                  | count the # of registers in
+2:      lsr.b   #1,%d0                  | register list and keep it in %d1
+        jcs     1b                      | bit shifted out was set: count it
+        jne     2b                      | more bits left: keep shifting
+        printf  PDECODE,"#%08x",1,%d1
+#ifdef FPU_EMU_DEBUG
+        btst    #12,%d2
+        jne     1f
+        printf  PDECODE,"-"             | decremental move
+        jra     2f
+1:      printf  PDECODE,"+"             | incremental move
+2:      btst    #13,%d2
+        jeq     1f
+        printf  PDECODE,"->"            | fpu -> cpu
+        jra     2f
+1:      printf  PDECODE,"<-"            | fpu <- cpu
+2:
+#endif
+        | decode address mode
+        fp_decode_addr_mode             | presumably jumps through the table that follows
+        .long   fp_ill, fp_ill          | register direct modes are not valid for a register list
+        .long   fpr_indirect, fpr_postinc
+        .long   fpr_predecr, fpr_disp16
+        .long   fpr_extmode0, fpr_extmode1
+        | addressing mode: address register indirect
+fpr_indirect:
+        fp_mode_addr_indirect           | presumably leaves the effective address in %a0
+        jra     fpr_do_movem            | fpr_do_movem transfers through %a0
+        | addressing mode: address register indirect with postincrement
+fpr_postinc:
+        fp_mode_addr_indirect_postinc
+        jra     fpr_do_movem
+fpr_predecr:                            | addressing mode: address register indirect with predecrement
+        fp_mode_addr_indirect_predec
+        jra     fpr_do_movem
+        | addressing mode: address register/program counter indirect
+        |                  with 16bit displacement
+fpr_disp16:
+        fp_mode_addr_indirect_disp16
+        jra     fpr_do_movem
+fpr_extmode0:                           | remaining indirect modes (decoded by the macro)
+        fp_mode_addr_indirect_extmode0
+        jra     fpr_do_movem
+fpr_extmode1:                           | modes selected by the register field
+        fp_decode_addr_reg              | presumably leaves the register field in %d0
+        jmp     ([0f:w,%pc,%d0*4])      | dispatch on it
+        .align  4
+0:
+        .long   fpr_absolute_short, fpr_absolute_long
+        .long   fpr_disp16, fpr_extmode0        | pc-relative forms reuse the indirect handlers
+        .long   fp_ill, fp_ill
+        .long   fp_ill, fp_ill
+fpr_absolute_short:
+        fp_mode_abs_short
+        jra     fpr_do_movem
+fpr_absolute_long:
+        fp_mode_abs_long
+| falls through to fpr_do_movem
+fpr_do_movem:                           | transfer the listed fp registers between memory at %a0 and the register array
+        swap    %d1                     | get fpu register list
+        lea     (FPD_FPREG,FPDATA),%a1  | start of the emulated fp register array
+        moveq   #12,%d0                 | step: 12 bytes per register
+        btst    #12,%d2
+        jne     1f                      | bit 12 set: walk upwards from the first register
+        lea     (-12,%a1,%d0*8),%a1     | bit 12 clear: start 84 bytes in, at the eighth register,
+        neg.l   %d0                     | and step backwards
+1:      btst    #13,%d2
+        jne     4f                      | bit 13 set: fpu -> memory loop further down
+        | move register from memory into fpu
+        jra     3f
+1:      printf  PMOVEM,"(%p>%p)",2,%a0,%a1
+        getuser.l (%a0)+,%d2,fp_err_ua1,%a0     | %d2 is free now: both direction bits were tested above
+        lsr.l   #8,%d2                  | whole long down 15, then the low word down one more:
+        lsr.l   #7,%d2                  | bit 31 ends in bit 16 and bits 30-16 in bits 14-0
+        lsr.w   #1,%d2                  | (the reverse of the shifts used when storing)
+        move.l  %d2,(%a1)+
+        getuser.l (%a0)+,%d2,fp_err_ua1,%a0     | second longword is copied unchanged
+        move.l  %d2,(%a1)+
+        getuser.l (%a0),%d2,fp_err_ua1,%a0      | third longword is copied unchanged
+        move.l  %d2,(%a1)
+        subq.l  #8,%a0                  | back to the start of this 12-byte item
+        subq.l  #8,%a1
+        add.l   %d0,%a0                 | memory pointer moves only when a register was transferred
+2:      add.l   %d0,%a1                 | register pointer moves for every list bit
+3:      lsl.b   #1,%d1                  | next list bit into carry
+        jcs     1b                      | set: transfer this register
+        jne     2b                      | clear but bits remain: skip this register
+        jra     5f                      | list exhausted
+        | move register from fpu into memory
+1:      printf  PMOVEM,"(%p>%p)",2,%a1,%a0
+        move.l  (%a1)+,%d2
+        lsl.w   #1,%d2                  | low word up one bit, then the whole long up 15:
+        lsl.l   #7,%d2                  | bit 16 ends in bit 31 and bits 14-0 in bits 30-16,
+        lsl.l   #8,%d2                  | the low word is now zero
+        putuser.l %d2,(%a0)+,fp_err_ua1,%a0
+        move.l  (%a1)+,%d2              | second longword is copied unchanged
+        putuser.l %d2,(%a0)+,fp_err_ua1,%a0
+        move.l  (%a1),%d2               | third longword is copied unchanged
+        putuser.l %d2,(%a0),fp_err_ua1,%a0
+        subq.l  #8,%a1                  | back to the start of this 12-byte item
+        subq.l  #8,%a0
+        add.l   %d0,%a0                 | memory pointer moves only when a register was transferred
+2:      add.l   %d0,%a1                 | register pointer moves for every list bit
+4:      lsl.b   #1,%d1                  | next list bit into carry
+        jcs     1b                      | set: transfer this register
+        jne     2b                      | clear but bits remain: skip this register
+5:
+        printf  PDECODE,"\n"
+#if 0
+        lea     (FPD_FPREG,FPDATA),%a0
+        printf  PMOVEM,"fp:"
+        printx  PMOVEM,%a0@(0)
+        printx  PMOVEM,%a0@(12)
+        printf  PMOVEM,"\n   "
+        printx  PMOVEM,%a0@(24)
+        printx  PMOVEM,%a0@(36)
+        printf  PMOVEM,"\n   "
+        printx  PMOVEM,%a0@(48)
+        printx  PMOVEM,%a0@(60)
+        printf  PMOVEM,"\n   "
+        printx  PMOVEM,%a0@(72)
+        printx  PMOVEM,%a0@(84)
+        printf  PMOVEM,"\n"
+#endif
+        jra     fp_end
+| set flags for decode macros for fmovem control register
+do_fmovem=1
+do_fmovem_cr=1
+fp_fmovem_cr:                           | fmovem of the control registers: fetch the list, count it, decode the address
+        printf  PDECODE,"fmovem.cr "
+        | get register list and count them
+        bfextu  %d2{#19,#3},%d0         | 3-bit register select, zero extended
+        move.l  %d0,%d1
+        swap    %d1                     | list in the high word, count in the low word starts at 0
+        jra     2f
+1:      addq.w  #1,%d1                  | one more register selected
+2:      lsr.l   #1,%d0
+        jcs     1b                      | bit shifted out was set: count it
+        jne     2b                      | more bits left: keep shifting
+        printf  PDECODE,"#%08x",1,%d1
+#ifdef FPU_EMU_DEBUG
+        btst    #13,%d2
+        jeq     1f
+        printf  PDECODE,"->"            | fpu -> cpu
+        jra     2f
+1:      printf  PDECODE,"<-"            | fpu <- cpu
+2:
+#endif
+        | decode address mode
+        fp_decode_addr_mode             | presumably jumps through the table that follows
+        .long   fpc_data, fpc_addr
+        .long   fpc_indirect, fpc_postinc
+        .long   fpc_predecr, fpc_disp16
+        .long   fpc_extmode0, fpc_extmode1
+fpc_data:                               | one control register <-> data register
+        fp_mode_data_direct             | presumably leaves the data register number in %d0
+        move.w  %d0,%d1                 | keep it in %d1
+        bfffo   %d2{#19,#3},%d0         | bit offset of the first set bit in the register select
+        sub.w   #19,%d0                 | 0, 1 or 2; NOTE(review): 3 when no bit is set - confirm that cannot reach here
+        lea     (FPD_FPCR,FPDATA,%d0.w*4),%a1   | slot of that register, 4 bytes each from FPD_FPCR on
+        btst    #13,%d2
+        jne     1f                      | bit 13 set: fpu -> cpu
+        move.w  %d1,%d0
+        jsr     fp_get_data_reg         | register contents presumably come back in %d0
+        move.l  %d0,(%a1)
+        jra     fpc_movem_fin
+1:      move.l  (%a1),%d0               | value in %d0, data register number still in %d1
+        jsr     fp_put_data_reg
+        jra     fpc_movem_fin
+fpc_addr:                               | address register direct: this path only ever moves FPD_FPIAR
+        fp_decode_addr_reg              | presumably leaves the address register number in %d0
+        printf  PDECODE,"a%d",1,%d0
+        btst    #13,%d2
+        jne     1f                      | bit 13 set: fpu -> cpu
+        jsr     fp_get_addr_reg         | register contents presumably come back in %a0
+        move.l  %a0,(FPD_FPIAR,FPDATA)
+        jra     fpc_movem_fin
+1:      move.l  (FPD_FPIAR,FPDATA),%a0
+        jsr     fp_put_addr_reg
+        jra     fpc_movem_fin
+fpc_indirect:                           | addressing mode: address register indirect
+        fp_mode_addr_indirect           | presumably leaves the effective address in %a0
+        jra     fpc_do_movem            | fpc_do_movem transfers through %a0
+fpc_postinc:                            | addressing mode: address register indirect with postincrement
+        fp_mode_addr_indirect_postinc
+        jra     fpc_do_movem
+fpc_predecr:                            | addressing mode: address register indirect with predecrement
+        fp_mode_addr_indirect_predec
+        jra     fpc_do_movem
+fpc_disp16:                             | addressing mode: indirect with 16bit displacement
+        fp_mode_addr_indirect_disp16
+        jra     fpc_do_movem
+fpc_extmode0:                           | remaining indirect modes (decoded by the macro)
+        fp_mode_addr_indirect_extmode0
+        jra     fpc_do_movem
+fpc_extmode1:                           | modes selected by the register field
+        fp_decode_addr_reg              | presumably leaves the register field in %d0
+        jmp     ([0f:w,%pc,%d0*4])      | dispatch on it
+        .align  4
+0:
+        .long   fpc_absolute_short, fpc_absolute_long
+        .long   fpc_disp16, fpc_extmode0        | pc-relative forms reuse the indirect handlers
+        .long   fpc_immediate, fp_ill
+        .long   fp_ill, fp_ill
+fpc_absolute_short:
+        fp_mode_abs_short
+        jra     fpc_do_movem
+fpc_absolute_long:
+        fp_mode_abs_long
+        jra     fpc_do_movem
+fpc_immediate:                          | immediate data: the values sit in the instruction stream
+        fp_get_pc %a0                   | %a0 = address of the first immediate longword
+        lea     (%a0,%d1.w*4),%a1       | skip 4 bytes per counted register
+        fp_put_pc %a1
+        printf  PDECODE,"#imm"
+| falls through to fpc_do_movem (the block below is compiled out)
+#if 0
+        swap    %d1
+        lsl.l   #5,%d1
+        lea     (FPD_FPCR,FPDATA),%a0
+        jra     3f
+1:      move.l  %d0,(%a0)
+2:      addq.l  #4,%a0
+3:      lsl.b   #1,%d1
+        jcs     1b
+        jne     2b
+        jra     fpc_movem_fin
+#endif
+fpc_do_movem:                           | transfer the selected control registers between memory at %a0 and FPDATA
+        swap    %d1                     | get fpu register list
+        lsl.l   #5,%d1                  | move the 3-bit list to the top of the low byte
+        lea     (FPD_FPCR,FPDATA),%a1   | first control register slot
+1:      btst    #13,%d2
+        jne     4f                      | bit 13 set: fpu -> memory loop further down
+        | move register from memory into fpu
+        jra     3f
+1:      printf  PMOVEM,"(%p>%p)",2,%a0,%a1
+        getuser.l (%a0)+,%d0,fp_err_ua1,%a0
+        move.l  %d0,(%a1)
+2:      addq.l  #4,%a1                  | next slot, whether or not this one was moved
+3:      lsl.b   #1,%d1                  | next list bit into carry
+        jcs     1b                      | set: transfer this register
+        jne     2b                      | clear but bits remain: skip this register
+        jra     fpc_movem_fin
+        | move register from fpu into memory
+1:      printf  PMOVEM,"(%p>%p)",2,%a1,%a0
+        move.l  (%a1),%d0
+        putuser.l %d0,(%a0)+,fp_err_ua1,%a0
+2:      addq.l  #4,%a1                  | next slot, whether or not this one was moved
+4:      lsl.b   #1,%d1                  | next list bit into carry
+        jcs     1b                      | set: transfer this register
+        jne     2b                      | clear but bits remain: skip this register
+fpc_movem_fin:                          | sanitise the control registers and refresh the cached mode fields
+        and.l   #0x0000fff0,(FPD_FPCR,FPDATA)   | keep only bits 15-4 of FPD_FPCR
+        and.l   #0x0ffffff8,(FPD_FPSR,FPDATA)   | clear bits 31-28 and 2-0 of FPD_FPSR
+        move.l  (FPD_FPCR,FPDATA),%d0
+        lsr.l   #4,%d0
+        moveq   #3,%d1
+        and.l   %d0,%d1                 | bits 5-4 of FPD_FPCR
+        move.w  %d1,(FPD_RND,FPDATA)
+        lsr.l   #2,%d0
+        moveq   #3,%d1
+        and.l   %d0,%d1                 | bits 7-6 of FPD_FPCR
+        move.w  %d1,(FPD_PREC,FPDATA)
+        printf  PDECODE,"\n"
+#if 0
+        printf  PMOVEM,"fpcr : %08x\n",1,FPDATA@(FPD_FPCR)
+        printf  PMOVEM,"fpsr : %08x\n",1,FPDATA@(FPD_FPSR)
+        printf  PMOVEM,"fpiar: %08x\n",1,FPDATA@(FPD_FPIAR)
+        clr.l   %d0
+        move.w  (FPD_PREC,FPDATA),%d0
+        printf  PMOVEM,"prec : %04x\n",1,%d0
+        move.w  (FPD_RND,FPDATA),%d0
+        printf  PMOVEM,"rnd  : %04x\n",1,%d0
+#endif
+        jra     fp_end
diff --git a/arch/m68k/math-emu/fp_scan.S b/arch/m68k/math-emu/fp_scan.S
new file mode 100644
index 000000000000..e4146ed574db
--- /dev/null
+++ b/arch/m68k/math-emu/fp_scan.S
@@ -0,0 +1,478 @@
+/*
+ * fp_scan.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "fp_emu.h"
+#include "fp_decode.h"
+        .globl  fp_scan, fp_datasize
+        .data
+| %d2 - first two instr words
+| %d1 - operand size
+/* operand formats are:
+        Long = 0,               i.e. fmove.l
+        Single,                 i.e. fmove.s
+        Extended,               i.e. fmove.x
+        Packed-BCD,             i.e. fmove.p
+        Word,                   i.e. fmove.w
+        Double,                 i.e. fmove.d
+        Byte,                   i.e. fmove.b */
+        .text
+| On entry:
+| FPDATA - base of emulated FPU registers
+fp_scan:
+| normal fpu instruction? (this excludes fsave/frestore)
+        fp_get_pc %a0                   | %a0 = address of the instruction to emulate
+        printf  PDECODE,"%08x: ",1,%a0
+        getuser.b (%a0),%d0,fp_err_ua1,%a0      | first opcode byte; presumably goes to fp_err_ua1 on a fault
+#if 1
+        cmp.b   #0xf2,%d0               | cpid = 1
+#else
+        cmp.b   #0xfc,%d0               | cpid = 6
+#endif
+        jne     fp_nonstd               | anything else is reported by fp_nonstd
+| first two instruction words are kept in %d2
+        getuser.l (%a0)+,%d2,fp_err_ua1,%a0
+        fp_put_pc %a0                   | pc now points past those two words
+fp_decode_cond:                         | separate conditional instr
+        fp_decode_cond_instr_type       | presumably jumps through the table that follows
+        .long   fp_decode_move, fp_fscc
+        .long   fp_fbccw, fp_fbccl
+fp_decode_move:                         | separate move instr
+        fp_decode_move_instr_type       | presumably jumps through the table that follows
+        .long   fp_fgen_fp, fp_ill
+        .long   fp_fgen_ea, fp_fmove_fp2mem
+        .long   fp_fmovem_cr, fp_fmovem_cr
+        .long   fp_fmovem_fp, fp_fmovem_fp
+| now all arithmetic instr and a few move instr are left
+fp_fgen_fp:                             | source is a fpu register
+        clr.b   (FPD_FPSR+2,FPDATA)     | clear the exception byte
+        fp_decode_sourcespec            | presumably leaves the source register number in %d0
+        printf  PDECODE,"f<op>.x fp%d",1,%d0
+        fp_get_fp_reg                   | presumably %a0 = address of that fp register
+        lea     (FPD_TEMPFP1,FPDATA),%a1 | copy src into a temp location
+        move.l  (%a0)+,(%a1)+
+        move.l  (%a0)+,(%a1)+
+        move.l  (%a0),(%a1)
+        lea     (-8,%a1),%a0            | %a0 = start of the temp copy
+        jra     fp_getdest
+fp_fgen_ea:                             | source is <ea>
+        clr.b   (FPD_FPSR+2,FPDATA)     | clear the exception byte
+        | sort out fmovecr, keep data size in %d1
+        fp_decode_sourcespec            | presumably leaves the source format number in %d0
+        cmp.w   #7,%d0                  | format 7 is not a data format here: it selects fmovecr
+        jeq     fp_fmovecr
+        move.w  %d0,%d1                 | store data size twice in %d1
+        swap    %d1                     | one can be trashed below
+        move.w  %d0,%d1
+#ifdef FPU_EMU_DEBUG
+        lea     0f,%a0                  | table of format letters defined below
+        clr.l   %d0
+        move.b  (%a0,%d1.w),%d0         | letter that belongs to this format
+        printf  PDECODE,"f<op>.%c ",1,%d0
+        .data
+0:      .byte   'l','s','x','p','w','d','b',0
+        .previous
+#endif
+/*
+        fp_getsource, fp_getdest
+        basically, we end up with a pointer to the source operand in
+        %a1, and a pointer to the destination operand in %a0.  both
+        are, of course, 96-bit extended floating point numbers.
+*/
+fp_getsource:
+        | decode addressing mode for source
+        fp_decode_addr_mode             | presumably jumps through the table that follows
+        .long   fp_data, fp_ill         | data register direct / not allowed
+        .long   fp_indirect, fp_postinc
+        .long   fp_predecr, fp_disp16
+        .long   fp_extmode0, fp_extmode1
+        | addressing mode: data register direct
+fp_data:
+        fp_mode_data_direct             | presumably leaves the data register number in %d0
+        jsr     fp_get_data_reg         | register contents presumably come back in %d0
+        lea     (FPD_TEMPFP1,FPDATA),%a0        | the converted operand is built in the temp slot
+        jmp     ([0f:w,%pc,%d1.w*4])    | dispatch on the data size
+        .align  4
+0:
+        .long   fp_data_long, fp_data_single
+        .long   fp_ill, fp_ill          | extended and packed do not fit a data register
+        .long   fp_data_word, fp_ill    | neither does double
+        .long   fp_data_byte, fp_ill
+        | data types that fit in an integer data register
+fp_data_byte:
+        extb.l  %d0                     | sign extend the byte to a long
+        jra     fp_data_long
+fp_data_word:
+        ext.l   %d0                     | sign extend the word, then fall through
+fp_data_long:
+        jsr     fp_conv_long2ext
+        jra     fp_getdest
+fp_data_single:
+        jsr     fp_conv_single2ext
+        jra     fp_getdest
+        | addressing mode: address register indirect
+fp_indirect:
+        fp_mode_addr_indirect           | presumably leaves the effective address in %a0
+        jra     fp_fetchsource          | fp_fetchsource takes the operand address from %a0
+        | addressing mode: address register indirect with postincrement
+fp_postinc:
+        fp_mode_addr_indirect_postinc
+        jra     fp_fetchsource
+        | addressing mode: address register indirect with predecrement
+fp_predecr:
+        fp_mode_addr_indirect_predec
+        jra     fp_fetchsource
+        | addressing mode: address register/program counter indirect
+        |                  with 16bit displacement
+fp_disp16:
+        fp_mode_addr_indirect_disp16
+        jra     fp_fetchsource
+        | all other indirect addressing modes will finally end up here
+fp_extmode0:
+        fp_mode_addr_indirect_extmode0
+        jra     fp_fetchsource
+| all pc relative addressing modes and immediate/absolute modes end up here
+| the first ones are sent to fp_extmode0 or fp_disp16
+| and only the latter are handled here
+fp_extmode1:
+        fp_decode_addr_reg              | presumably leaves the register field in %d0
+        jmp     ([0f:w,%pc,%d0*4])      | dispatch on it
+        .align  4
+0:
+        .long   fp_abs_short, fp_abs_long
+        .long   fp_disp16, fp_extmode0
+        .long   fp_immediate, fp_ill
+        .long   fp_ill, fp_ill
+        | addressing mode: absolute short
+fp_abs_short:
+        fp_mode_abs_short
+        jra     fp_fetchsource
+        | addressing mode: absolute long
+fp_abs_long:
+        fp_mode_abs_long
+        jra     fp_fetchsource
+        | addressing mode: immediate data
+fp_immediate:
+        printf  PDECODE,"#"
+        fp_get_pc %a0                   | the operand sits in the instruction stream at the current pc
+        move.w  (fp_datasize,%d1.w*2),%d0       | operand size in bytes for this format
+        addq.w  #1,%d0                  | round the size up
+        and.w   #-2,%d0                 | to an even number of bytes
+#ifdef FPU_EMU_DEBUG
+        movem.l %d0/%d1,-(%sp)
+        movel   %a0,%a1
+        clr.l   %d1
+        jra     2f
+1:      getuser.b (%a1)+,%d1,fp_err_ua1,%a1    | dump the operand one byte at a time
+        printf  PDECODE,"%02x",1,%d1
+2:      dbra    %d0,1b
+        movem.l (%sp)+,%d0/%d1
+#endif
+        lea     (%a0,%d0.w),%a1         | step the pc over the operand
+        fp_put_pc %a1
+| falls through to fp_fetchsource; NOTE(review): a byte operand is padded to a word yet fp_byte reads the byte at %a0 - confirm
+fp_fetchsource:
+        move.l  %a0,%a1                 | %a1 = address of the operand
+        swap    %d1                     | bring the preserved copy of the size into the low word
+        lea     (FPD_TEMPFP1,FPDATA),%a0        | the converted operand is built in the temp slot
+        jmp     ([0f:w,%pc,%d1.w*4])    | dispatch on the operand format
+        .align  4
+0:      .long   fp_long, fp_single
+        .long   fp_ext, fp_pack
+        .long   fp_word, fp_double
+        .long   fp_byte, fp_ill
+fp_long:
+        getuser.l (%a1),%d0,fp_err_ua1,%a1      | fetch the operand; presumably goes to fp_err_ua1 on a fault
+        jsr     fp_conv_long2ext
+        jra     fp_getdest
+fp_single:
+        getuser.l (%a1),%d0,fp_err_ua1,%a1
+        jsr     fp_conv_single2ext
+        jra     fp_getdest
+fp_ext:                                 | read all three longwords, repacking only the first
+        getuser.l (%a1)+,%d0,fp_err_ua1,%a1
+        lsr.l   #8,%d0                  | whole long down 15, then the low word down one more:
+        lsr.l   #7,%d0                  | bit 31 ends in bit 16 and bits 30-16 in bits 14-0
+        lsr.w   #1,%d0                  | (presumably sign and exponent)
+        move.l  %d0,(%a0)+
+        getuser.l (%a1)+,%d0,fp_err_ua1,%a1     | second longword is copied unchanged
+        move.l  %d0,(%a0)+
+        getuser.l (%a1),%d0,fp_err_ua1,%a1      | third longword is copied unchanged
+        move.l  %d0,(%a0)
+        subq.l  #8,%a0                  | %a0 back to the start of the temp slot
+        jra     fp_getdest
+fp_pack:
+        /* not supported yet */
+        jra     fp_ill
+fp_word:
+        getuser.w (%a1),%d0,fp_err_ua1,%a1
+        ext.l   %d0                     | sign extend the word to a long
+        jsr     fp_conv_long2ext
+        jra     fp_getdest
+fp_double:
+        jsr     fp_conv_double2ext      | no fetch here: the routine presumably reads the 8 bytes via %a1 itself - confirm
+        jra     fp_getdest
+fp_byte:
+        getuser.b (%a1),%d0,fp_err_ua1,%a1
+        extb.l  %d0                     | sign extend the byte to a long
+        jsr     fp_conv_long2ext
+| falls through to fp_getdest
+fp_getdest:
+        move.l  %a0,%a1                 | %a1 = source operand
+        bfextu  %d2{#22,#3},%d0         | 3-bit destination register number
+        printf  PDECODE,",fp%d\n",1,%d0
+        fp_get_fp_reg                   | presumably %a0 = address of the destination fp register
+        movem.l %a0/%a1,-(%sp)          | push both pointers; %a0 ends up on top, presumably as (dest, src) arguments
+        pea     fp_finalrounding        | return address: the handler returns into fp_finalrounding
+        bfextu  %d2{#25,#7},%d0         | 7-bit operation code
+        jmp     ([0f:w,%pc,%d0*4])      | dispatch on it; one table entry per code
+        .align  4
+0:
+        .long   fp_fmove_mem2fp, fp_fint, fp_fsinh, fp_fintrz   | 0x00
+        .long   fp_fsqrt, fp_ill, fp_flognp1, fp_ill            | 0x04
+        .long   fp_fetoxm1, fp_ftanh, fp_fatan, fp_ill          | 0x08
+        .long   fp_fasin, fp_fatanh, fp_fsin, fp_ftan           | 0x0c
+        .long   fp_fetox, fp_ftwotox, fp_ftentox, fp_ill        | 0x10
+        .long   fp_flogn, fp_flog10, fp_flog2, fp_ill           | 0x14
+        .long   fp_fabs, fp_fcosh, fp_fneg, fp_ill              | 0x18
+        .long   fp_facos, fp_fcos, fp_fgetexp, fp_fgetman       | 0x1c
+        .long   fp_fdiv, fp_fmod, fp_fadd, fp_fmul              | 0x20
+        .long   fpa_fsgldiv, fp_frem, fp_fscale, fpa_fsglmul    | 0x24
+        .long   fp_fsub, fp_ill, fp_ill, fp_ill                 | 0x28
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x2c
+        .long   fp_fsincos0, fp_fsincos1, fp_fsincos2, fp_fsincos3      | 0x30
+        .long   fp_fsincos4, fp_fsincos5, fp_fsincos6, fp_fsincos7      | 0x34
+        .long   fp_fcmp, fp_ill, fp_ftst, fp_ill                | 0x38
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x3c
+        .long   fp_fsmove, fp_fssqrt, fp_ill, fp_ill            | 0x40
+        .long   fp_fdmove, fp_fdsqrt, fp_ill, fp_ill            | 0x44
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x48
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x4c
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x50
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x54
+        .long   fp_fsabs, fp_ill, fp_fsneg, fp_ill              | 0x58
+        .long   fp_fdabs, fp_ill, fp_fdneg, fp_ill              | 0x5c
+        .long   fp_fsdiv, fp_ill, fp_fsadd, fp_fsmul            | 0x60
+        .long   fp_fddiv, fp_ill, fp_fdadd, fp_fdmul            | 0x64
+        .long   fp_fssub, fp_ill, fp_ill, fp_ill                | 0x68
+        .long   fp_fdsub, fp_ill, fp_ill, fp_ill                | 0x6c
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x70
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x74
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x78
+        .long   fp_ill, fp_ill, fp_ill, fp_ill                  | 0x7c
+        | Instructions follow
+        | Move an (emulated) ROM constant
+fp_fmovecr:
+        bfextu  %d2{#27,#5},%d0         | low 5 bits of the instruction: index into fp_constants
+        printf  PINSTR,"fp_fmovecr #%d",1,%d0
+        move.l  %d0,%d1                 | keep the index
+        add.l   %d0,%d0
+        add.l   %d1,%d0                 | %d0 = index * 3
+        lea     (fp_constants,%d0*4),%a0        | 12 bytes per entry; %a0 = source operand
+        move.l  #0x801cc0ff,%d0         | one flag bit per entry, entry 0 in bit 31
+        addq.l  #1,%d1
+        lsl.l   %d1,%d0                 | shift the flag of this entry out into carry
+        jcc     1f                      | flag clear: no exception to record
+        fp_set_sr FPSR_EXC_INEX2                        | INEX2 exception
+1:      moveq   #-128,%d0                               | continue with fmove
+        and.l   %d0,%d2                 | clear the 7-bit operation code, so fp_getdest dispatches to entry 0
+        jra     fp_getdest
+        .data
+        .align  4
+fp_constants:                           | three longwords per entry, indexed by fp_fmovecr
+        .long   0x00004000,0xc90fdaa2,0x2168c235        | pi
+        .extend 0,0,0,0,0,0,0,0,0,0                     | ten zero entries; presumably fill indexes 1-10
+        .long   0x00003ffd,0x9a209a84,0xfbcff798        | log10(2)
+        .long   0x00004000,0xadf85458,0xa2bb4a9a        | e
+        .long   0x00003fff,0xb8aa3b29,0x5c17f0bc        | log2(e)
+        .long   0x00003ffd,0xde5bd8a9,0x37287195        | log10(e)
+        .long   0x00000000,0x00000000,0x00000000        | 0.0
+        .long   0x00003ffe,0xb17217f7,0xd1cf79ac        | ln(2)
+        .long   0x00004000,0x935d8ddd,0xaaa8ac17        | ln(10)
+        | read this as "1.0 * 2^0" - note the high bit in the mantissa
+        .long   0x00003fff,0x80000000,0x00000000        | 10^0
+        .long   0x00004002,0xa0000000,0x00000000        | 10^1
+        .long   0x00004005,0xc8000000,0x00000000        | 10^2
+        .long   0x0000400c,0x9c400000,0x00000000        | 10^4
+        .long   0x00004019,0xbebc2000,0x00000000        | 10^8
+        .long   0x00004034,0x8e1bc9bf,0x04000000        | 10^16
+        .long   0x00004069,0x9dc5ada8,0x2b70b59e        | 10^32
+        .long   0x000040d3,0xc2781f49,0xffcfa6d5        | 10^64
+        .long   0x000041a8,0x93ba47c9,0x80e98ce0        | 10^128
+        .long   0x00004351,0xaa7eebfb,0x9df9de8e        | 10^256
+        .long   0x000046a3,0xe319a0ae,0xa60e91c7        | 10^512
+        .long   0x00004d48,0xc9767586,0x81750c17        | 10^1024
+        .long   0x00005a92,0x9e8b3b5d,0xc53d5de5        | 10^2048
+        .long   0x00007525,0xc4605202,0x8a20979b        | 10^4096
+        .previous
+fp_fmove_mem2fp:                        | copy the 12-byte operand at %a1 over the one at %a0
+        printf  PINSTR,"fmove %p,%p\n",2,%a0,%a1
+        move.l  (%a1)+,(%a0)+
+        move.l  (%a1)+,(%a0)+
+        move.l  (%a1),(%a0)
+        subq.l  #8,%a0                  | %a0 back to the start of the destination
+        rts                             | returns to the address pushed by fp_getdest
+fpa_fsglmul:
+        move.l  #fp_finalrounding_single_fast,(%sp)     | replace the return address pushed by fp_getdest
+        jra     fp_fsglmul
+fpa_fsgldiv:
+        move.l  #fp_finalrounding_single_fast,(%sp)     | replace the return address pushed by fp_getdest
+        jra     fp_fsgldiv
+.macro  fp_dosingleprec instr
+        printf  PINSTR,"single "
+        move.l  #fp_finalrounding_single,(%sp)  | replace the return address pushed by fp_getdest
+        jra     \instr                  | then run the ordinary handler
+.endm
+.macro  fp_dodoubleprec instr
+        printf  PINSTR,"double "
+        move.l  #fp_finalrounding_double,(%sp)  | replace the return address pushed by fp_getdest
+        jra     \instr                  | then run the ordinary handler
+.endm
+fp_fsmove:                              | each entry below reuses the ordinary handler with another rounding routine
+        fp_dosingleprec fp_fmove_mem2fp
+fp_fssqrt:
+        fp_dosingleprec fp_fsqrt
+fp_fdmove:
+        fp_dodoubleprec fp_fmove_mem2fp
+fp_fdsqrt:
+        fp_dodoubleprec fp_fsqrt
+fp_fsabs:
+        fp_dosingleprec fp_fabs
+fp_fsneg:
+        fp_dosingleprec fp_fneg
+fp_fdabs:
+        fp_dodoubleprec fp_fabs
+fp_fdneg:
+        fp_dodoubleprec fp_fneg
+fp_fsdiv:
+        fp_dosingleprec fp_fdiv
+fp_fsadd:
+        fp_dosingleprec fp_fadd
+fp_fsmul:
+        fp_dosingleprec fp_fmul
+fp_fddiv:
+        fp_dodoubleprec fp_fdiv
+fp_fdadd:
+        fp_dodoubleprec fp_fadd
+fp_fdmul:
+        fp_dodoubleprec fp_fmul
+fp_fssub:
+        fp_dosingleprec fp_fsub
+fp_fdsub:
+        fp_dodoubleprec fp_fsub
+fp_nonstd:                              | reached from fp_scan when the first opcode byte does not match
+        fp_get_pc %a0
+        getuser.l (%a0),%d0,fp_err_ua1,%a0      | fetch the instruction words for the message
+        printf  ,"nonstd ((%08x)=%08x)\n",2,%a0,%d0
+        moveq   #-1,%d0                 | return -1 in %d0
+        rts
+        .data
+        .align  4
+        | data sizes corresponding to the operand formats
+fp_datasize:
+        .word   4, 4, 12, 12, 2, 8, 1, 0        | bytes for long, single, extended, packed, word, double, byte, unused
diff --git a/arch/m68k/math-emu/fp_trig.c b/arch/m68k/math-emu/fp_trig.c
new file mode 100644
index 000000000000..6361d0784df2
--- /dev/null
+++ b/arch/m68k/math-emu/fp_trig.c
@@ -0,0 +1,183 @@
+/*
+  fp_trig.c: floating-point math routines for the Linux-m68k
+  floating point emulator.
+  Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel.
+  I hereby give permission, free of charge, to copy, modify, and
+  redistribute this software, in source or binary form, provided that
+  the above copyright notice and the following disclaimer are included
+  in all such copies.
+  THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
+  OR IMPLIED.
+*/
+#include "fp_emu.h"
+#include "fp_trig.h"
+struct fp_ext *
+fp_fsin(struct fp_ext *dest, struct fp_ext *src)       /* FSIN placeholder: no sine is computed here */
+{
+        uprint("fsin\n");               /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fcos(struct fp_ext *dest, struct fp_ext *src)       /* FCOS placeholder: no cosine is computed here */
+{
+        uprint("fcos\n");               /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_ftan(struct fp_ext *dest, struct fp_ext *src)       /* FTAN placeholder: no tangent is computed here */
+{
+        uprint("ftan\n");               /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fasin(struct fp_ext *dest, struct fp_ext *src)      /* FASIN placeholder: no arc sine is computed here */
+{
+        uprint("fasin\n");              /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_facos(struct fp_ext *dest, struct fp_ext *src)      /* FACOS placeholder: no arc cosine is computed here */
+{
+        uprint("facos\n");              /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fatan(struct fp_ext *dest, struct fp_ext *src)      /* FATAN placeholder: no arc tangent is computed here */
+{
+        uprint("fatan\n");              /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fsinh(struct fp_ext *dest, struct fp_ext *src)      /* FSINH placeholder: no hyperbolic sine is computed here */
+{
+        uprint("fsinh\n");              /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fcosh(struct fp_ext *dest, struct fp_ext *src)      /* FCOSH placeholder: no hyperbolic cosine is computed here */
+{
+        uprint("fcosh\n");              /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_ftanh(struct fp_ext *dest, struct fp_ext *src)      /* FTANH placeholder: no hyperbolic tangent is computed here */
+{
+        uprint("ftanh\n");              /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fatanh(struct fp_ext *dest, struct fp_ext *src)     /* FATANH placeholder: no hyperbolic arc tangent is computed here */
+{
+        uprint("fatanh\n");             /* report the mnemonic (uprint is defined outside this file) */
+        fp_monadic_check(dest, src);    /* shared monadic-operand macro, defined outside this file */
+        return dest;                    /* dest as fp_monadic_check left it */
+}
+struct fp_ext *
+fp_fsincos0(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 0: src is not read */
+{
+        uprint("fsincos0\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos1(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 1: src is not read */
+{
+        uprint("fsincos1\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos2(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 2: src is not read */
+{
+        uprint("fsincos2\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos3(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 3: src is not read */
+{
+        uprint("fsincos3\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos4(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 4: src is not read */
+{
+        uprint("fsincos4\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos5(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 5: src is not read */
+{
+        uprint("fsincos5\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos6(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 6: src is not read */
+{
+        uprint("fsincos6\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
+struct fp_ext *
+fp_fsincos7(struct fp_ext *dest, struct fp_ext *src)   /* FSINCOS placeholder, variant 7: src is not read */
+{
+        uprint("fsincos7\n");           /* report the mnemonic (uprint is defined outside this file) */
+        return dest;                    /* dest is returned untouched */
+}
diff --git a/arch/m68k/math-emu/fp_trig.h b/arch/m68k/math-emu/fp_trig.h
new file mode 100644
index 000000000000..af8b247e9c98
--- /dev/null
+++ b/arch/m68k/math-emu/fp_trig.h
@@ -0,0 +1,32 @@
+/*
+  fp_trig.h: floating-point math routines for the Linux-m68k
+  floating point emulator.
+  Copyright (c) 1998 David Huggins-Daines.
+  I hereby give permission, free of charge, to copy, modify, and
+  redistribute this software, in source or binary form, provided that
+  the above copyright notice and the following disclaimer are included
+  in all such copies.
+  THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
+  OR IMPLIED.
+*/
+#ifndef FP_TRIG_H
+#define FP_TRIG_H
+#include "fp_emu.h"
+/* floating point trigonometric instructions:
+   the arguments to these are in the "internal" extended format, that
+   is, an "exploded" version of the 96-bit extended fp format used by
+   the 68881.
+   NOTE(review): the definitions in fp_trig.c return the dest pointer,
+   not a status code; no prototypes are declared in this header.  */
+#endif /* FP_TRIG_H */
diff --git a/arch/m68k/math-emu/fp_util.S b/arch/m68k/math-emu/fp_util.S
new file mode 100644
index 000000000000..a9f7f0129067
--- /dev/null
+++ b/arch/m68k/math-emu/fp_util.S
@@ -0,0 +1,1455 @@
+/*
+ * fp_util.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/config.h>
+#include "fp_emu.h"
+/*
+ * Here are lots of conversion and normalization functions, mainly
+ * used by fp_scan.S.
+ * Note that these functions are optimized for "normal" numbers:
+ * these are handled first and exit as fast as possible. This is
+ * especially important for fp_normalize_ext/fp_conv_ext2ext, as
+ * they are called very often.
+ * The register usage is tailored to fp_scan.S and to whichever
+ * registers are unused at the call site, so be careful if you change
+ * something here. %d0 and %d1 are always usable, sometimes %d2 (or
+ * only its lower half). Most functions have to return %a0
+ * unmodified, so that the caller can immediately reuse it.
+ */
+        .globl  fp_ill, fp_end
+        | exits from fp_scan:
+        | illegal instruction
+fp_ill:                                 | exit from fp_scan for an illegal instruction
+        printf  ,"fp_illegal\n"         | log only; %d0 is not set here
+        rts
+        | completed instruction: returns 0 in %d0
+fp_end:
+        tst.l   (TASK_MM-8,%a2)         | report if any of the three lwords ending
+        jmi     1f                      | at TASK_MM(%a2) is negative
+        tst.l   (TASK_MM-4,%a2)
+        jmi     1f
+        tst.l   (TASK_MM,%a2)
+        jpl     2f                      | none negative: skip the report
+1:      printf  ,"oops:%p,%p,%p\n",3,%a2@(TASK_MM-8),%a2@(TASK_MM-4),%a2@(TASK_MM)
+2:      clr.l   %d0                     | return value: 0
+        rts
+        .globl  fp_conv_long2ext, fp_conv_single2ext
+        .globl  fp_conv_double2ext, fp_conv_ext2ext
+        .globl  fp_normalize_ext, fp_normalize_double
+        .globl  fp_normalize_single, fp_normalize_single_fast
+        .globl  fp_conv_ext2double, fp_conv_ext2single
+        .globl  fp_conv_ext2long, fp_conv_ext2short
+        .globl  fp_conv_ext2byte
+        .globl  fp_finalrounding_single, fp_finalrounding_single_fast
+        .globl  fp_finalrounding_double
+        .globl  fp_finalrounding, fp_finaltest, fp_final
+/*
+ * First, several conversion functions from a source operand
+ * into the extended format. Note that only fp_conv_ext2ext
+ * normalizes the number; it is always called after the other
+ * conversion functions, which only move the information into
+ * the fp_ext structure.
+ */
+        | fp_conv_long2ext: store a signed 32-bit integer into a struct fp_ext
+        | as sign, exponent 0x3fff+31 and an unnormalized mantissa
+        | args: %d0 = source (32-bit long)
+        |       %a0 = destination (ptr to struct fp_ext), returned unmodified
+fp_conv_long2ext:
+        printf  PCONV,"l2e: %p -> %p(",2,%d0,%a0
+        clr.l   %d1                     | sign defaults to zero
+        tst.l   %d0
+        jeq     fp_l2e_zero             | is source zero?
+        jpl     1f                      | positive?
+        moveq   #1,%d1                  | negative: sign = 1 ...
+        neg.l   %d0                     | ... and the mantissa is the magnitude
+1:      swap    %d1                     | sign goes into the upper word
+        move.w  #0x3fff+31,%d1          | exponent for an integer held in the high lword
+        move.l  %d1,(%a0)+              | set sign / exp
+        move.l  %d0,(%a0)+              | set mantissa
+        clr.l   (%a0)                   | low lword of mantissa = 0
+        subq.l  #8,%a0                  | restore %a0
+        printx  PCONV,%a0@
+        printf  PCONV,")\n"
+        rts
+        | source is zero
+fp_l2e_zero:
+        clr.l   (%a0)+                  | sign / exp = 0
+        clr.l   (%a0)+                  | high lword of mantissa = 0
+        clr.l   (%a0)                   | low lword of mantissa = 0
+        subq.l  #8,%a0                  | restore %a0
+        printx  PCONV,%a0@
+        printf  PCONV,")\n"
+        rts
+        | fp_conv_single2ext: unpack a single into a struct fp_ext (not normalized)
+        | args: %d0 = source (single-precision fp value)
+        |       %a0 = dest (struct fp_ext *), returned unmodified
+fp_conv_single2ext:
+        printf  PCONV,"s2e: %p -> %p(",2,%d0,%a0
+        move.l  %d0,%d1
+        lsl.l   #8,%d0                  | shift mantissa
+        lsr.l   #8,%d1                  | exponent / sign
+        lsr.l   #7,%d1                  | 15 bits total: sign at bit 16, exponent in bits 15-8
+        lsr.w   #8,%d1                  | word shift: exponent to bits 7-0, sign untouched
+        jeq     fp_s2e_small            | zero / denormal?
+        cmp.w   #0xff,%d1               | NaN / Inf?
+        jeq     fp_s2e_large
+        bset    #31,%d0                 | set explicit bit
+        add.w   #0x3fff-0x7f,%d1        | re-bias the exponent.
+9:      move.l  %d1,(%a0)+              | fp_ext.sign, fp_ext.exp
+        move.l  %d0,(%a0)+              | high lword of fp_ext.mant
+        clr.l   (%a0)                   | low lword = 0
+        subq.l  #8,%a0                  | restore %a0
+        printx  PCONV,%a0@
+        printf  PCONV,")\n"
+        rts
+        | zeros and denormalized
+fp_s2e_small:
+        | exponent is zero, so explicit bit is already zero too
+        tst.l   %d0
+        jeq     9b                      | true zero: keep exponent 0
+        move.w  #0x4000-0x7f,%d1        | denormal: fixed exponent, mantissa left unnormalized
+        jra     9b
+        | infinities and NAN
+fp_s2e_large:
+        bclr    #31,%d0                 | clear explicit bit
+        move.w  #0x7fff,%d1             | maximum extended exponent
+        jra     9b
+fp_conv_double2ext:                     | unpack the double read via getuser from (%a1) into fp_ext at %a0; %a1 advances by 8
+#ifdef FPU_EMU_DEBUG
+        getuser.l %a1@(0),%d0,fp_err_ua2,%a1
+        getuser.l %a1@(4),%d1,fp_err_ua2,%a1
+        printf  PCONV,"d2e: %p%p -> %p(",3,%d0,%d1,%a0
+#endif
+        getuser.l (%a1)+,%d0,fp_err_ua2,%a1     | high lword: sign, exponent, top 20 mantissa bits
+        move.l  %d0,%d1
+        lsl.l   #8,%d0                  | shift high mantissa
+        lsl.l   #3,%d0                  | 11 bits total
+        lsr.l   #8,%d1                  | exponent / sign
+        lsr.l   #7,%d1                  | 15 bits total: sign at bit 16
+        lsr.w   #5,%d1                  | word shift: 11-bit exponent to bits 10-0
+        jeq     fp_d2e_small            | zero / denormal?
+        cmp.w   #0x7ff,%d1              | NaN / Inf?
+        jeq     fp_d2e_large
+        bset    #31,%d0                 | set explicit bit
+        add.w   #0x3fff-0x3ff,%d1       | re-bias the exponent.
+9:      move.l  %d1,(%a0)+              | fp_ext.sign, fp_ext.exp
+        move.l  %d0,(%a0)+              | high lword of mantissa (low 11 bits still zero)
+        getuser.l (%a1)+,%d0,fp_err_ua2,%a1     | low lword of the double
+        move.l  %d0,%d1
+        lsl.l   #8,%d0
+        lsl.l   #3,%d0                  | low 21 bits become the top of the low mantissa lword
+        move.l  %d0,(%a0)
+        moveq   #21,%d0
+        lsr.l   %d0,%d1                 | top 11 bits of the low lword ...
+        or.l    %d1,-(%a0)              | ... fill the bottom of the high mantissa lword
+        subq.l  #4,%a0                  | restore %a0
+        printx  PCONV,%a0@
+        printf  PCONV,")\n"
+        rts
+        | zeros and denormalized
+fp_d2e_small:
+        | exponent is zero, so explicit bit is already zero too
+        tst.l   %d0                     | only the top 20 mantissa bits are visible here
+        jeq     9b                      | NOTE(review): a denormal with bits only in the low lword keeps exponent 0 - intended?
+        move.w  #0x4000-0x3ff,%d1       | denormal: fixed exponent, mantissa left unnormalized
+        jra     9b
+        | infinities and NAN
+fp_d2e_large:
+        bclr    #31,%d0                 | clear explicit bit
+        move.w  #0x7fff,%d1             | maximum extended exponent
+        jra     9b
+        | fp_conv_ext2ext:
+        | originally used to get longdouble from userspace, now it's
+        | called before arithmetic operations to make sure the number
+        | is normalized [maybe rename it?].
+        | args: %a0 = dest (struct fp_ext *), normalized in place
+        | returns 0 in %d0 for a NaN, otherwise 1
+fp_conv_ext2ext:
+        printf  PCONV,"e2e: %p(",1,%a0
+        printx  PCONV,%a0@
+        printf  PCONV,"), "
+        move.l  (%a0)+,%d0              | sign / exp lword
+        cmp.w   #0x7fff,%d0             | Inf / NaN?
+        jeq     fp_e2e_large
+        move.l  (%a0),%d0               | high lword of mantissa
+        jpl     fp_e2e_small            | zero / denorm?
+        | The high bit is set, so normalization is irrelevant.
+fp_e2e_checkround:
+        subq.l  #4,%a0                  | back to the start of the struct
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+        move.b  (%a0),%d0               | byte 0: extra mantissa bits below the 64-bit mantissa
+        jne     fp_e2e_round            | non-zero: must be rounded away
+#endif
+        printf  PCONV,"%p(",1,%a0
+        printx  PCONV,%a0@
+        printf  PCONV,")\n"
+        moveq   #1,%d0                  | not a NaN
+        rts
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_e2e_round:                           | round the extra byte (in %d0) into the 64-bit mantissa
+        fp_set_sr FPSR_EXC_INEX2
+        clr.b   (%a0)                   | extra bits are consumed
+        move.w  (FPD_RND,FPDATA),%d2
+        jne     fp_e2e_roundother       | %d2 == 0, round to nearest
+        tst.b   %d0                     | test guard bit
+        jpl     9f                      | zero is closer
+        btst    #0,(11,%a0)             | test lsb bit
+        jne     fp_e2e_doroundup        | round to infinity
+        lsl.b   #1,%d0                  | check low bits
+        jeq     9f                      | round to zero
+fp_e2e_doroundup:
+        addq.l  #1,(8,%a0)              | increment the low mantissa lword
+        jcc     9f
+        addq.l  #1,(4,%a0)              | carry into the high mantissa lword
+        jcc     9f
+        move.w  #0x8000,(4,%a0)         | mantissa wrapped: restore the top bit ...
+        addq.w  #1,(2,%a0)              | ... and bump the exponent
+9:      printf  PNORM,"%p(",1,%a0       | NOTE(review): %d0 is not set to 1 on this exit, unlike fp_e2e_checkround
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+fp_e2e_roundother:
+        subq.w  #2,%d2
+        jcs     9b                      | %d2 < 2, round to zero
+        jhi     1f                      | %d2 > 2, round to +infinity
+        tst.b   (1,%a0)                 | to -inf
+        jne     fp_e2e_doroundup        | negative, round to infinity
+        jra     9b                      | positive, round to zero
+1:      tst.b   (1,%a0)                 | to +inf
+        jeq     fp_e2e_doroundup        | positive, round to infinity
+        jra     9b                      | negative, round to zero
+#endif
+        | zeros and subnormals:
+        | try to normalize these anyway.
+fp_e2e_small:                           | entered with flags from the load of the high mantissa lword
+        jne     fp_e2e_small1           | high lword zero?
+        move.l  (4,%a0),%d0             | low mantissa lword
+        jne     fp_e2e_small2
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+        clr.l   %d0
+        move.b  (-4,%a0),%d0            | extra mantissa byte (byte 0 of the struct)
+        jne     fp_e2e_small3
+#endif
+        | Genuine zero.
+        clr.w   -(%a0)                  | force the exponent to zero
+        subq.l  #2,%a0                  | restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        moveq   #1,%d0                  | not a NaN
+        rts
+        | definitely subnormal, need to shift all 64 bits
+fp_e2e_small1:
+        bfffo   %d0{#0,#32},%d1         | %d1 = leading zero count of the high lword
+        move.w  -(%a0),%d2              | %d2 = exponent
+        sub.w   %d1,%d2                 | exponent after a full normalizing shift
+        jcc     1f
+        | Pathologically small, denormalize.
+        add.w   %d2,%d1                 | shift only by the old exponent ...
+        clr.w   %d2                     | ... and clamp the exponent at zero
+1:      move.w  %d2,(%a0)+              | store the new exponent
+        move.w  %d1,%d2                 | %d2 = shift count
+        jeq     fp_e2e_checkround       | nothing to shift
+        | fancy 64-bit double-shift begins here
+        lsl.l   %d2,%d0
+        move.l  %d0,(%a0)+              | shifted high lword (low bits still missing)
+        move.l  (%a0),%d0
+        move.l  %d0,%d1                 | keep a copy of the low lword
+        lsl.l   %d2,%d0
+        move.l  %d0,(%a0)               | shifted low lword
+        neg.w   %d2
+        and.w   #0x1f,%d2               | %d2 = 32 - shift
+        lsr.l   %d2,%d1                 | bits that move from the low into the high lword
+        or.l    %d1,-(%a0)
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_e2e_extra1:                          | merge the extra byte; expects %d2 = 32 - shift
+        clr.l   %d0
+        move.b  (-4,%a0),%d0            | extra mantissa byte
+        neg.w   %d2
+        add.w   #24,%d2
+        jcc     1f
+        clr.b   (-4,%a0)                | the whole extra byte moves into the low lword
+        lsl.l   %d2,%d0
+        or.l    %d0,(4,%a0)
+        jra     fp_e2e_checkround
+1:      addq.w  #8,%d2
+        lsl.l   %d2,%d0
+        move.b  %d0,(-4,%a0)            | bits that stay in the extra byte
+        lsr.l   #8,%d0
+        or.l    %d0,(4,%a0)             | bits that move into the low lword
+#endif
+        jra     fp_e2e_checkround
+        | pathologically small subnormal: high mantissa lword is zero, low lword is in %d0
+fp_e2e_small2:
+        bfffo   %d0{#0,#32},%d1         | %d1 = leading zero count of the low lword
+        add.w   #32,%d1                 | plus the 32 zero bits of the high lword
+        move.w  -(%a0),%d2              | %d2 = exponent
+        sub.w   %d1,%d2                 | exponent after a full normalizing shift
+        jcc     1f
+        | Beyond pathologically small, denormalize.
+        add.w   %d2,%d1                 | shift only by the old exponent ...
+        clr.w   %d2                     | ... and clamp the exponent at zero
+1:      move.w  %d2,(%a0)+              | store the new exponent
+        ext.l   %d1
+        jeq     fp_e2e_checkround       | nothing to shift
+        clr.l   (4,%a0)                 | low lword becomes zero, its old value is in %d0
+        sub.w   #32,%d1                 | must be %d1 (shift count) as in fp_ne_small2; %d2 holds the exponent
+        jcs     1f                      | shift < 32: result straddles both lwords
+        lsl.l   %d1,%d0                 | lower lword needs only to be shifted
+        move.l  %d0,(%a0)               | into the higher lword
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+        clr.l   %d0
+        move.b  (-4,%a0),%d0            | extra mantissa byte
+        clr.b   (-4,%a0)
+        neg.w   %d1
+        add.w   #32,%d1                 | bit offset of the extra byte after the shift
+        bfins   %d0,(%a0){%d1,#8}
+#endif
+        jra     fp_e2e_checkround
+1:      neg.w   %d1                     | lower lword is split between
+        bfins   %d0,(%a0){%d1,#32}      | higher and lower lword
+#ifndef CONFIG_M68KFPU_EMU_EXTRAPREC
+        jra     fp_e2e_checkround
+#else
+        move.w  %d1,%d2                 | %d2 = 32 - shift, as fp_e2e_extra1 expects
+        jra     fp_e2e_extra1
+        | These are extremely small numbers, that will mostly end up as zero
+        | anyway, so this is only important for correct rounding.
+fp_e2e_small3:                          | only the extra byte (in %d0) is non-zero
+        bfffo   %d0{#24,#8},%d1         | first set bit within the low byte of %d0
+        add.w   #40,%d1
+        move.w  -(%a0),%d2              | %d2 = exponent
+        sub.w   %d1,%d2
+        jcc     1f
+        | Pathologically small, denormalize.
+        add.w   %d2,%d1
+        clr.w   %d2
+1:      move.w  %d2,(%a0)+              | store the new exponent
+        ext.l   %d1
+        jeq     fp_e2e_checkround       | nothing to shift
+        cmp.w   #8,%d1
+        jcs     2f                      | shift < 8: bits stay partly in the extra byte
+1:      clr.b   (-4,%a0)
+        sub.w   #64,%d1
+        jcs     1f
+        add.w   #24,%d1
+        lsl.l   %d1,%d0
+        move.l  %d0,(%a0)
+        jra     fp_e2e_checkround
+1:      neg.w   %d1
+        bfins   %d0,(%a0){%d1,#8}
+        jra     fp_e2e_checkround
+2:      lsl.l   %d1,%d0
+        move.b  %d0,(-4,%a0)            | bits that stay in the extra byte
+        lsr.l   #8,%d0
+        move.b  %d0,(7,%a0)             | bits that move into the lowest mantissa byte
+        jra     fp_e2e_checkround
+#endif
+1:      move.l  %d0,%d1                 | lower lword is split between
+        lsl.l   %d2,%d0                 | higher and lower lword
+        move.l  %d0,(%a0)               | NOTE(review): no visible 1f reference reaches this label;
+        move.l  %d1,%d0                 | this tail looks unreachable in both configurations
+        neg.w   %d2
+        add.w   #32,%d2
+        lsr.l   %d2,%d0
+        move.l  %d0,-(%a0)
+        jra     fp_e2e_checkround
+        | Infinities and NaNs: returns 1 in %d0 for an infinity, 0 for a NaN
+fp_e2e_large:
+        move.l  (%a0)+,%d0              | high lword of mantissa
+        jne     3f
+1:      tst.l   (%a0)                   | low lword of mantissa
+        jne     4f
+        moveq   #1,%d0                  | mantissa is zero: infinity
+2:      subq.l  #8,%a0                  | restore %a0
+        printf  PCONV,"%p(",1,%a0
+        printx  PCONV,%a0@
+        printf  PCONV,")\n"
+        rts
+        | we have maybe a NaN, shift off the highest bit
+3:      lsl.l   #1,%d0
+        jeq     1b                      | only the top bit was set: check the low lword
+        | we have a NaN, clear the return value
+4:      clrl    %d0
+        jra     2b
+/*
+ * Normalization functions.  Call these on the output of general
+ * FP operators, and before any conversion into the destination
+ * formats. fp_normalize_ext must always be called first; the
+ * following conversion functions expect an already normalized
+ * number.
+ */
+        | fp_normalize_ext:
+        | normalize an extended in extended (unpacked) format, basically
+        | it does the same as fp_conv_ext2ext, additionally it also does
+        | the necessary postprocessing checks (sets UNFL / SNAN in the status).
+        | args: %a0 (struct fp_ext *), normalized in place
+        | NOTE: it does _not_ modify %a0/%a1 and the upper word of %d2
+fp_normalize_ext:
+        printf  PNORM,"ne: %p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,"), "
+        move.l  (%a0)+,%d0              | sign / exp lword
+        cmp.w   #0x7fff,%d0             | Inf / NaN?
+        jeq     fp_ne_large
+        move.l  (%a0),%d0               | high lword of mantissa
+        jpl     fp_ne_small             | zero / denorm?
+        | The high bit is set, so normalization is irrelevant.
+fp_ne_checkround:
+        subq.l  #4,%a0                  | back to the start of the struct
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+        move.b  (%a0),%d0               | byte 0: extra mantissa bits below the 64-bit mantissa
+        jne     fp_ne_round             | non-zero: must be rounded away
+#endif
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_ne_round:                            | round the extra byte (in %d0) into the 64-bit mantissa
+        fp_set_sr FPSR_EXC_INEX2
+        clr.b   (%a0)                   | extra bits are consumed
+        move.w  (FPD_RND,FPDATA),%d2
+        jne     fp_ne_roundother        | %d2 == 0, round to nearest
+        tst.b   %d0                     | test guard bit
+        jpl     9f                      | zero is closer
+        btst    #0,(11,%a0)             | test lsb bit
+        jne     fp_ne_doroundup         | round to infinity
+        lsl.b   #1,%d0                  | check low bits
+        jeq     9f                      | round to zero
+fp_ne_doroundup:
+        addq.l  #1,(8,%a0)              | increment the low mantissa lword
+        jcc     9f
+        addq.l  #1,(4,%a0)              | carry into the high mantissa lword
+        jcc     9f
+        addq.w  #1,(2,%a0)              | mantissa wrapped: bump the exponent ...
+        move.w  #0x8000,(4,%a0)         | ... and restore the top bit
+9:      printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+fp_ne_roundother:
+        subq.w  #2,%d2
+        jcs     9b                      | %d2 < 2, round to zero
+        jhi     1f                      | %d2 > 2, round to +infinity
+        tst.b   (1,%a0)                 | to -inf
+        jne     fp_ne_doroundup         | negative, round to infinity
+        jra     9b                      | positive, round to zero
+1:      tst.b   (1,%a0)                 | to +inf
+        jeq     fp_ne_doroundup         | positive, round to infinity
+        jra     9b                      | negative, round to zero
+#endif
+        | Zeros and subnormal numbers
+        | These are probably merely subnormal, rather than "denormalized"
+        |  numbers, so we will try to make them normal again.
+fp_ne_small:                            | entered with flags from the load of the high mantissa lword
+        jne     fp_ne_small1            | high lword zero?
+        move.l  (4,%a0),%d0             | low mantissa lword
+        jne     fp_ne_small2
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+        clr.l   %d0
+        move.b  (-4,%a0),%d0            | extra mantissa byte (byte 0 of the struct)
+        jne     fp_ne_small3
+#endif
+        | Genuine zero.
+        clr.w   -(%a0)                  | force the exponent to zero
+        subq.l  #2,%a0                  | restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | Subnormal: high mantissa lword (in %d0) is non-zero but its top bit is clear.
+fp_ne_small1:
+        bfffo   %d0{#0,#32},%d1         | %d1 = leading zero count of the high lword
+        move.w  -(%a0),%d2              | %d2 = exponent
+        sub.w   %d1,%d2                 | exponent after a full normalizing shift
+        jcc     1f
+        | Pathologically small, denormalize.
+        add.w   %d2,%d1                 | shift only by the old exponent ...
+        clr.w   %d2                     | ... and clamp the exponent at zero
+        fp_set_sr FPSR_EXC_UNFL
+1:      move.w  %d2,(%a0)+              | store the new exponent
+        move.w  %d1,%d2                 | %d2 = shift count
+        jeq     fp_ne_checkround        | nothing to shift
+        | This is exactly the same 64-bit double shift as seen above.
+        lsl.l   %d2,%d0
+        move.l  %d0,(%a0)+              | shifted high lword (low bits still missing)
+        move.l  (%a0),%d0
+        move.l  %d0,%d1                 | keep a copy of the low lword
+        lsl.l   %d2,%d0
+        move.l  %d0,(%a0)               | shifted low lword
+        neg.w   %d2
+        and.w   #0x1f,%d2               | %d2 = 32 - shift
+        lsr.l   %d2,%d1                 | bits that move from the low into the high lword
+        or.l    %d1,-(%a0)
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_ne_extra1:                           | merge the extra byte; expects %d2 = 32 - shift
+        clr.l   %d0
+        move.b  (-4,%a0),%d0            | extra mantissa byte
+        neg.w   %d2
+        add.w   #24,%d2
+        jcc     1f
+        clr.b   (-4,%a0)                | the whole extra byte moves into the low lword
+        lsl.l   %d2,%d0
+        or.l    %d0,(4,%a0)
+        jra     fp_ne_checkround
+1:      addq.w  #8,%d2
+        lsl.l   %d2,%d0
+        move.b  %d0,(-4,%a0)            | bits that stay in the extra byte
+        lsr.l   #8,%d0
+        or.l    %d0,(4,%a0)             | bits that move into the low lword
+#endif
+        jra     fp_ne_checkround
+        | May or may not be subnormal, if so, only 32 bits to shift.
+fp_ne_small2:                           | high mantissa lword is zero, low lword is in %d0
+        bfffo   %d0{#0,#32},%d1         | %d1 = leading zero count of the low lword
+        add.w   #32,%d1                 | plus the 32 zero bits of the high lword
+        move.w  -(%a0),%d2              | %d2 = exponent
+        sub.w   %d1,%d2                 | exponent after a full normalizing shift
+        jcc     1f
+        | Beyond pathologically small, denormalize.
+        add.w   %d2,%d1                 | shift only by the old exponent ...
+        clr.w   %d2                     | ... and clamp the exponent at zero
+        fp_set_sr FPSR_EXC_UNFL
+1:      move.w  %d2,(%a0)+              | store the new exponent
+        ext.l   %d1
+        jeq     fp_ne_checkround        | nothing to shift
+        clr.l   (4,%a0)                 | low lword becomes zero, its old value is in %d0
+        sub.w   #32,%d1                 | shift count relative to the high lword
+        jcs     1f                      | shift < 32: result straddles both lwords
+        lsl.l   %d1,%d0                 | lower lword needs only to be shifted
+        move.l  %d0,(%a0)               | into the higher lword
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+        clr.l   %d0
+        move.b  (-4,%a0),%d0            | extra mantissa byte
+        clr.b   (-4,%a0)
+        neg.w   %d1
+        add.w   #32,%d1                 | bit offset of the extra byte after the shift
+        bfins   %d0,(%a0){%d1,#8}
+#endif
+        jra     fp_ne_checkround
+1:      neg.w   %d1                     | lower lword is split between
+        bfins   %d0,(%a0){%d1,#32}      | higher and lower lword
+#ifndef CONFIG_M68KFPU_EMU_EXTRAPREC
+        jra     fp_ne_checkround
+#else
+        move.w  %d1,%d2                 | %d2 = 32 - shift, as fp_ne_extra1 expects
+        jra     fp_ne_extra1
+        | These are extremely small numbers, that will mostly end up as zero
+        | anyway, so this is only important for correct rounding.
+fp_ne_small3:                           | only the extra byte (in %d0) is non-zero
+        bfffo   %d0{#24,#8},%d1         | first set bit within the low byte of %d0
+        add.w   #40,%d1
+        move.w  -(%a0),%d2              | %d2 = exponent
+        sub.w   %d1,%d2
+        jcc     1f
+        | Pathologically small, denormalize.
+        add.w   %d2,%d1
+        clr.w   %d2                     | NOTE(review): UNFL is not set here, unlike fp_ne_small1/2
+1:      move.w  %d2,(%a0)+              | store the new exponent
+        ext.l   %d1
+        jeq     fp_ne_checkround        | nothing to shift
+        cmp.w   #8,%d1
+        jcs     2f                      | shift < 8: bits stay partly in the extra byte
+1:      clr.b   (-4,%a0)
+        sub.w   #64,%d1
+        jcs     1f
+        add.w   #24,%d1
+        lsl.l   %d1,%d0
+        move.l  %d0,(%a0)
+        jra     fp_ne_checkround
+1:      neg.w   %d1
+        bfins   %d0,(%a0){%d1,#8}
+        jra     fp_ne_checkround
+2:      lsl.l   %d1,%d0
+        move.b  %d0,(-4,%a0)            | bits that stay in the extra byte
+        lsr.l   #8,%d0
+        move.b  %d0,(7,%a0)             | bits that move into the lowest mantissa byte
+        jra     fp_ne_checkround
+#endif
+        | Infinities and NaNs, again, same as above.
+fp_ne_large:
+        move.l  (%a0)+,%d0              | high lword of mantissa
+        jne     3f
+1:      tst.l   (%a0)                   | low lword of mantissa
+        jne     4f
+2:      subq.l  #8,%a0                  | restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | we have maybe a NaN, shift off the highest bit
+3:      move.l  %d0,%d1
+        lsl.l   #1,%d1
+        jne     4f
+        clr.l   (-4,%a0)                | only the top bit was set: clear it in memory
+        jra     1b                      | and decide on the low lword
+        | we have a NaN, test if it is signaling
+4:      bset    #30,%d0                 | Z reflects the old value of bit 30
+        jne     2b                      | bit 30 was already set: nothing to change
+        fp_set_sr FPSR_EXC_SNAN
+        move.l  %d0,(-4,%a0)            | store the lword with bit 30 now set
+        jra     2b
+        | these next two do rounding as per the IEEE standard.
+        | values for the rounding modes appear to be:
+        | 0:    Round to nearest
+        | 1:    Round to zero
+        | 2:    Round to -Infinity
+        | 3:    Round to +Infinity
+        | both functions expect that fp_normalize was already
+        | called (and the extended argument is already normalized
+        | as far as possible); they are used if a different
+        | rounding precision is selected and before converting
+        | into single/double
+        | fp_normalize_double:
+        | normalize an extended with double (52-bit) precision
+        | args:  %a0 (struct fp_ext *)
+fp_normalize_double:
+        printf  PNORM,"nd: %p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,"), "
+        move.l  (%a0)+,%d2              | sign / exp lword; sign stays in the upper word of %d2
+        tst.w   %d2
+        jeq     fp_nd_zero              | zero / denormalized
+        cmp.w   #0x7fff,%d2
+        jeq     fp_nd_huge              | NaN / infinity.
+        sub.w   #0x4000-0x3ff,%d2       | will the exponent fit?
+        jcs     fp_nd_small             | too small.
+        cmp.w   #0x7fe,%d2
+        jcc     fp_nd_large             | too big.
+        addq.l  #4,%a0                  | %a0 -> low lword of mantissa
+        move.l  (%a0),%d0               | low lword of mantissa
+        | now, round off the low 11 bits.
+fp_nd_round:                            | expects %a0 -> low mantissa lword, %d0 = its value (plus sticky bit)
+        moveq   #21,%d1
+        lsl.l   %d1,%d0                 | keep 11 low bits.
+        jne     fp_nd_checkround        | Are they non-zero?
+        | nothing to do here
+9:      subq.l  #8,%a0                  | restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | Be careful with the X bit! It contains the lsb
+        | from the shift above, it is needed for round to nearest.
+fp_nd_checkround:
+        fp_set_sr FPSR_EXC_INEX2        | INEX2 bit
+        and.w   #0xf800,(2,%a0)         | clear bits 0-10
+        move.w  (FPD_RND,FPDATA),%d2    | rounding mode
+        jne     2f                      | %d2 == 0, round to nearest
+        tst.l   %d0                     | test guard bit
+        jpl     9b                      | zero is closer
+        | here we test the X bit by adding it to %d2
+        clr.w   %d2                     | first set z bit, addx only clears it
+        addx.w  %d2,%d2                 | test lsb bit
+        | IEEE754-specified "round to even" behaviour.  The guard bit
+        | is set at this point.  If the lsb (from X) is set, the kept
+        | mantissa is odd and we always round up.  Otherwise round up
+        | only if some bit below the guard bit is set; an exact tie
+        | stays at the even value, so as not to produce an odd
+        | number.  This is strange, but it is what the standard says.
+        jne     fp_nd_doroundup         | round to infinity
+        lsl.l   #1,%d0                  | check low bits
+        jeq     9b                      | round to zero
+fp_nd_doroundup:
+        | round (the mantissa, that is) towards infinity
+        add.l   #0x800,(%a0)            | add one unit at bit 11 of the low lword
+        jcc     9b                      | no overflow, good.
+        addq.l  #1,-(%a0)               | extend to high lword
+        jcc     1f                      | no overflow, good.
+        | Yow! we have managed to overflow the mantissa.  Since this
+        | only happens when %d1 was 0xfffff800, it is now zero, so
+        | reset the high bit, and increment the exponent.
+        move.w  #0x8000,(%a0)
+        addq.w  #1,-(%a0)
+        cmp.w   #0x43ff,(%a0)+          | exponent now overflown?
+        jeq     fp_nd_large             | yes, so make it infinity.
+1:      subq.l  #4,%a0                  | restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+2:      subq.w  #2,%d2
+        jcs     9b                      | %d2 < 2, round to zero
+        jhi     3f                      | %d2 > 2, round to +infinity
+        | Round to +Inf or -Inf.  High word of %d2 contains the
+        | sign of the number, by the way.
+        swap    %d2                     | to -inf
+        tst.b   %d2
+        jne     fp_nd_doroundup         | negative, round to infinity
+        jra     9b                      | positive, round to zero
+3:      swap    %d2                     | to +inf
+        tst.b   %d2
+        jeq     fp_nd_doroundup         | positive, round to infinity
+        jra     9b                      | negative, round to zero
+        | Exponent underflow.  Try to make a denormal, and set it to
+        | the smallest possible fraction if this fails.
+fp_nd_small:                            | expects %a0 -> high mantissa lword, %d2 = exp - 0x3c01 (negative)
+        fp_set_sr FPSR_EXC_UNFL         | set UNFL bit
+        move.w  #0x3c01,(-2,%a0)        | 2**-1022
+        neg.w   %d2                     | degree of underflow
+        cmp.w   #32,%d2                 | single or double shift?
+        jcc     1f
+        | Again, another 64-bit double shift.
+        move.l  (%a0),%d0
+        move.l  %d0,%d1                 | keep a copy of the high lword
+        lsr.l   %d2,%d0
+        move.l  %d0,(%a0)+              | shifted high lword
+        move.l  (%a0),%d0
+        lsr.l   %d2,%d0
+        neg.w   %d2
+        add.w   #32,%d2                 | %d2 = 32 - shift
+        lsl.l   %d2,%d1                 | bits that move from the high into the low lword
+        or.l    %d1,%d0
+        move.l  (%a0),%d1               | original low lword
+        move.l  %d0,(%a0)               | shifted low lword
+        | Check to see if we shifted off any significant bits
+        lsl.l   %d2,%d1
+        jeq     fp_nd_round             | Nope, round.
+        bset    #0,%d0                  | Yes, so set the "sticky bit".
+        jra     fp_nd_round             | Now, round.
+        | Another 64-bit single shift and store
+1:      sub.w   #32,%d2                 | shift of 32 or more: the high lword becomes zero
+        cmp.w   #32,%d2                 | Do we really need to shift?
+        jcc     2f                      | No, the number is too small.
+        move.l  (%a0),%d0
+        clr.l   (%a0)+
+        move.l  %d0,%d1                 | keep a copy of the old high lword
+        lsr.l   %d2,%d0
+        neg.w   %d2
+        add.w   #32,%d2                 | %d2 = 32 - remaining shift
+        | Again, check to see if we shifted off any significant bits.
+        tst.l   (%a0)                   | the old low lword is dropped entirely
+        jeq     1f
+        bset    #0,%d0                  | Sticky bit.
+1:      move.l  %d0,(%a0)
+        lsl.l   %d2,%d1                 | bits of the old high lword shifted off the bottom
+        jeq     fp_nd_round
+        bset    #0,%d0                  | sticky bit, in %d0 only
+        jra     fp_nd_round
+        | Sorry, the number is just too small.
+2:      clr.l   (%a0)+
+        clr.l   (%a0)
+        moveq   #1,%d0                  | Smallest possible fraction,
+        jra     fp_nd_round             | round as desired.
+        | zero and denormalized
+fp_nd_zero:
+        tst.l   (%a0)+
+        jne     1f
+        tst.l   (%a0)
+        jne     1f
+        subq.l  #8,%a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts                             | zero.  nothing to do.
+        | These are not merely subnormal numbers, but true denormals,
+        | i.e. pathologically small (exponent is 2**-16383) numbers.
+        | It is clearly impossible for even a normal extended number
+        | with that exponent to fit into double precision, so just
+        | write these ones off as "too darn small".
+1:      fp_set_sr FPSR_EXC_UNFL         | Set UNFL bit
+        clr.l   (%a0)
+        clr.l   -(%a0)
+        move.w  #0x3c01,-(%a0)          | i.e. 2**-1022
+        addq.l  #6,%a0
+        moveq   #1,%d0
+        jra     fp_nd_round             | round.
+        | Exponent overflow.  Just call it infinity.
+fp_nd_large:
+        move.w  #0x7ff,%d0
+        and.w   (6,%a0),%d0
+        jeq     1f
+        fp_set_sr FPSR_EXC_INEX2
+1:      fp_set_sr FPSR_EXC_OVFL
+        move.w  (FPD_RND,FPDATA),%d2
+        jne     3f                      | %d2 = 0 round to nearest
+1:      move.w  #0x7fff,(-2,%a0)
+        clr.l   (%a0)+
+        clr.l   (%a0)
+2:      subq.l  #8,%a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+3:      subq.w  #2,%d2
+        jcs     5f                      | %d2 < 2, round to zero
+        jhi     4f                      | %d2 > 2, round to +infinity
+        tst.b   (-3,%a0)                | to -inf
+        jne     1b
+        jra     5f
+4:      tst.b   (-3,%a0)                | to +inf
+        jeq     1b
+5:      move.w  #0x43fe,(-2,%a0)
+        moveq   #-1,%d0
+        move.l  %d0,(%a0)+
+        move.w  #0xf800,%d0
+        move.l  %d0,(%a0)
+        jra     2b
+        | Infinities or NaNs
+fp_nd_huge:
+        subq.l  #4,%a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | fp_normalize_single:
+        | round an extended to single precision (24 significant bits), in place
+        | args:  %a0 (struct fp_ext *), restored on every exit; works in %d0-%d2
+fp_normalize_single:
+        printf  PNORM,"ns: %p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,") "
+        addq.l  #2,%a0                  | step over the two bytes before the exponent
+        move.w  (%a0)+,%d2              | exponent, %a0 now at the high mantissa lword
+        jeq     fp_ns_zero              | zero / denormalized
+        cmp.w   #0x7fff,%d2
+        jeq     fp_ns_huge              | NaN / infinity.
+        sub.w   #0x4000-0x7f,%d2        | will the exponent fit?
+        jcs     fp_ns_small             | too small.
+        cmp.w   #0xfe,%d2
+        jcc     fp_ns_large             | too big.
+        move.l  (%a0)+,%d0              | get high lword of mantissa
+fp_ns_round:
+        tst.l   (%a0)                   | check the low lword
+        jeq     1f
+        | Set a sticky bit if it is non-zero.  This should only
+        | affect the rounding in what would otherwise be equal-
+        | distance situations, which is what we want it to do.
+        bset    #0,%d0
+1:      clr.l   (%a0)                   | zap it from memory.
+        | now, round off the low 8 bits of the hi lword.
+        tst.b   %d0                     | 8 low bits.
+        jne     fp_ns_checkround        | Are they non-zero?
+        | nothing to do here
+        subq.l  #8,%a0                  | back to the start of the struct
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+fp_ns_checkround:
+        fp_set_sr FPSR_EXC_INEX2        | INEX2 bit
+        clr.b   -(%a0)                  | clear low byte of high lword
+        subq.l  #3,%a0                  | %a0 back at the high mantissa lword
+        move.w  (FPD_RND,FPDATA),%d2    | rounding mode
+        jne     2f                      | %d2 == 0, round to nearest
+        tst.b   %d0                     | test guard bit
+        jpl     9f                      | zero is closer
+        btst    #8,%d0                  | test lsb bit
+        | round to even behaviour, see above.
+        jne     fp_ns_doroundup         | round to infinity
+        lsl.b   #1,%d0                  | check low bits
+        jeq     9f                      | round to zero
+fp_ns_doroundup:
+        | round (the mantissa, that is) towards infinity
+        add.l   #0x100,(%a0)
+        jcc     9f                      | no overflow, good.
+        | Overflow.  This means that the high lword was 0xffffff00, so
+        | it is now zero.  We will set the mantissa to reflect this, and
+        | increment the exponent (checking for overflow there too)
+        move.w  #0x8000,(%a0)
+        addq.w  #1,-(%a0)
+        cmp.w   #0x407f,(%a0)+          | exponent now overflown?
+        jeq     fp_ns_large             | yes, so make it infinity.
+9:      subq.l  #4,%a0                  | back to the start of the struct
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | check nondefault rounding modes
+2:      subq.w  #2,%d2
+        jcs     9b                      | %d2 < 2, round to zero
+        jhi     3f                      | %d2 > 2, round to +infinity
+        tst.b   (-3,%a0)                | to -inf (byte at struct offset 1 is the sign)
+        jne     fp_ns_doroundup         | negative, round to infinity
+        jra     9b                      | positive, round to zero
+3:      tst.b   (-3,%a0)                | to +inf
+        jeq     fp_ns_doroundup         | positive, round to infinity
+        jra     9b                      | negative, round to zero
+        | Exponent underflow.  Try to make a denormal, and set it to
+        | the smallest possible fraction if this fails.
+fp_ns_small:
+        fp_set_sr FPSR_EXC_UNFL         | set UNFL bit
+        move.w  #0x3f81,(-2,%a0)        | 2**-126
+        neg.w   %d2                     | degree of underflow
+        cmp.w   #32,%d2                 | single or double shift?
+        jcc     2f
+        | a 32-bit shift.
+        move.l  (%a0),%d0
+        move.l  %d0,%d1                 | keep a copy to find the lost bits
+        lsr.l   %d2,%d0
+        move.l  %d0,(%a0)+              | %a0 now at the low mantissa lword
+        | Check to see if we shifted off any significant bits.
+        neg.w   %d2
+        add.w   #32,%d2                 | %d2 = 32 - shift count
+        lsl.l   %d2,%d1                 | leaves only the bits that fell off
+        jeq     1f
+        bset    #0,%d0                  | Sticky bit.
+        | Check the lower lword
+1:      tst.l   (%a0)
+        jeq     fp_ns_round
+        clr.l   (%a0)                   | whole lword, as everywhere else here
+        bset    #0,%d0                  | Sticky bit.
+        jra     fp_ns_round
+        | Sorry, the number is just too small.
+2:      clr.l   (%a0)+
+        clr.l   (%a0)
+        moveq   #1,%d0                  | Smallest possible fraction,
+        jra     fp_ns_round             | round as desired.
+        | Exponent overflow.  Just call it infinity.
+fp_ns_large:
+        tst.b   (3,%a0)                 | low byte of the high mantissa lword
+        jeq     1f
+        fp_set_sr FPSR_EXC_INEX2
+1:      fp_set_sr FPSR_EXC_OVFL
+        move.w  (FPD_RND,FPDATA),%d2
+        jne     3f                      | %d2 = 0 round to nearest
+1:      move.w  #0x7fff,(-2,%a0)        | infinity: max exponent, zero mantissa
+        clr.l   (%a0)+
+        clr.l   (%a0)
+2:      subq.l  #8,%a0                  | back to the start of the struct
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+3:      subq.w  #2,%d2
+        jcs     5f                      | %d2 < 2, round to zero
+        jhi     4f                      | %d2 > 2, round to +infinity
+        tst.b   (-3,%a0)                | to -inf
+        jne     1b                      | negative, so infinity it is
+        jra     5f
+4:      tst.b   (-3,%a0)                | to +inf
+        jeq     1b                      | positive, so infinity it is
+5:      move.w  #0x407e,(-2,%a0)        | otherwise the largest finite single
+        move.l  #0xffffff00,(%a0)+
+        clr.l   (%a0)
+        jra     2b
+        | zero and denormalized
+fp_ns_zero:
+        tst.l   (%a0)+
+        jne     1f
+        tst.l   (%a0)
+        jne     1f
+        subq.l  #8,%a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts                             | zero.  nothing to do.
+        | These are not merely subnormal numbers, but true denormals,
+        | i.e. pathologically small (exponent is 2**-16383) numbers.
+        | It is clearly impossible for even a normal extended number
+        | with that exponent to fit into single precision, so just
+        | write these ones off as "too darn small".
+1:      fp_set_sr FPSR_EXC_UNFL         | Set UNFL bit
+        clr.l   (%a0)                   | low mantissa lword
+        clr.l   -(%a0)                  | high mantissa lword
+        move.w  #0x3f81,-(%a0)          | i.e. 2**-126
+        addq.l  #6,%a0                  | %a0 at the low lword, as fp_ns_round expects
+        moveq   #1,%d0                  | only a sticky bit is left
+        jra     fp_ns_round             | round.
+        | Infinities or NaNs
+fp_ns_huge:
+        subq.l  #4,%a0                  | nothing to round, just restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | fp_normalize_single_fast:
+        | normalize an extended with single (23-bit) precision
+        | this is only used by fsgldiv/fsglmul, where the
+        | operand is not completely normalized.
+        | args:  %a0 (struct fp_ext *), restored on every exit
+fp_normalize_single_fast:
+        printf  PNORM,"nsf: %p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,") "
+        addq.l  #2,%a0                  | step over the two bytes before the exponent
+        move.w  (%a0)+,%d2              | exponent (no zero or underflow check here)
+        cmp.w   #0x7fff,%d2
+        jeq     fp_nsf_huge             | NaN / infinity.
+        move.l  (%a0)+,%d0              | get high lword of mantissa
+fp_nsf_round:
+        tst.l   (%a0)                   | check the low lword
+        jeq     1f
+        | Set a sticky bit if it is non-zero.  This should only
+        | affect the rounding in what would otherwise be equal-
+        | distance situations, which is what we want it to do.
+        bset    #0,%d0
+1:      clr.l   (%a0)                   | zap it from memory.
+        | now, round off the low 8 bits of the hi lword.
+        tst.b   %d0                     | 8 low bits.
+        jne     fp_nsf_checkround       | Are they non-zero?
+        | nothing to do here
+        subq.l  #8,%a0                  | back to the start of the struct
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+fp_nsf_checkround:
+        fp_set_sr FPSR_EXC_INEX2        | INEX2 bit
+        clr.b   -(%a0)                  | clear low byte of high lword
+        subq.l  #3,%a0                  | %a0 back at the high mantissa lword
+        move.w  (FPD_RND,FPDATA),%d2    | rounding mode
+        jne     2f                      | %d2 == 0, round to nearest
+        tst.b   %d0                     | test guard bit
+        jpl     9f                      | zero is closer
+        btst    #8,%d0                  | test lsb bit
+        | round to even behaviour, see above.
+        jne     fp_nsf_doroundup                | round to infinity
+        lsl.b   #1,%d0                  | check low bits
+        jeq     9f                      | round to zero
+fp_nsf_doroundup:
+        | round (the mantissa, that is) towards infinity
+        add.l   #0x100,(%a0)
+        jcc     9f                      | no overflow, good.
+        | Overflow.  This means that the high lword was 0xffffff00, so
+        | it is now zero.  We will set the mantissa to reflect this, and
+        | increment the exponent (checking for overflow there too)
+        move.w  #0x8000,(%a0)
+        addq.w  #1,-(%a0)
+        cmp.w   #0x407f,(%a0)+          | exponent now overflown?
+        jeq     fp_nsf_large            | yes, so make it infinity.
+9:      subq.l  #4,%a0                  | back to the start of the struct
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | check nondefault rounding modes
+2:      subq.w  #2,%d2
+        jcs     9b                      | %d2 < 2, round to zero
+        jhi     3f                      | %d2 > 2, round to +infinity
+        tst.b   (-3,%a0)                | to -inf (byte at struct offset 1 is the sign)
+        jne     fp_nsf_doroundup        | negative, round to infinity
+        jra     9b                      | positive, round to zero
+3:      tst.b   (-3,%a0)                | to +inf
+        jeq     fp_nsf_doroundup                | positive, round to infinity
+        jra     9b                      | negative, round to zero
+        | Exponent overflow.  Just call it infinity.
+fp_nsf_large:
+        tst.b   (3,%a0)                 | low byte of the high mantissa lword
+        jeq     1f
+        fp_set_sr FPSR_EXC_INEX2
+1:      fp_set_sr FPSR_EXC_OVFL
+        move.w  (FPD_RND,FPDATA),%d2
+        jne     3f                      | %d2 = 0 round to nearest
+1:      move.w  #0x7fff,(-2,%a0)        | infinity: max exponent, zero mantissa
+        clr.l   (%a0)+
+        clr.l   (%a0)
+2:      subq.l  #8,%a0                  | back to the start of the struct
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+3:      subq.w  #2,%d2
+        jcs     5f                      | %d2 < 2, round to zero
+        jhi     4f                      | %d2 > 2, round to +infinity
+        tst.b   (-3,%a0)                | to -inf
+        jne     1b                      | negative, so infinity it is
+        jra     5f
+4:      tst.b   (-3,%a0)                | to +inf
+        jeq     1b                      | positive, so infinity it is
+5:      move.w  #0x407e,(-2,%a0)        | otherwise the largest finite single
+        move.l  #0xffffff00,(%a0)+
+        clr.l   (%a0)
+        jra     2b
+        | Infinities or NaNs
+fp_nsf_huge:
+        subq.l  #4,%a0                  | nothing to round, just restore %a0
+        printf  PNORM,"%p(",1,%a0
+        printx  PNORM,%a0@
+        printf  PNORM,")\n"
+        rts
+        | conv_ext2int (macro):
+        | Generates a subroutine that converts an extended value to an
+        | integer of a given size, again, with the appropriate type of
+        | rounding.
+        | Macro arguments:
+        | s:    size, as given in an assembly instruction.
+        | b:    number of bits in that size.
+        | Subroutine arguments:
+        | %a0:  source (struct fp_ext *)
+        | Returns the integer in %d0 (like it should)
+.macro conv_ext2int s,b
+        .set    inf,(1<<(\b-1))-1       | i.e. MAXINT
+        printf  PCONV,"e2i%d: %p(",2,#\b,%a0
+        printx  PCONV,%a0@
+        printf  PCONV,") "
+        addq.l  #2,%a0                  | step over the two bytes before the exponent
+        move.w  (%a0)+,%d2              | exponent
+        jeq     fp_e2i_zero\b           | zero / denorm (== 0, here)
+        cmp.w   #0x7fff,%d2
+        jeq     fp_e2i_huge\b           | Inf / NaN
+        sub.w   #0x3ffe,%d2             | %d2 = number of integer bits
+        jcs     fp_e2i_small\b          | magnitude below one half
+        cmp.w   #\b,%d2
+        jhi     fp_e2i_large\b          | more integer bits than the target holds
+        move.l  (%a0),%d0               | high mantissa lword
+        move.l  %d0,%d1
+        lsl.l   %d2,%d1                 | %d1 = fraction bits of the high lword
+        jne     fp_e2i_round\b
+        tst.l   (4,%a0)                 | fraction bits in the low lword?
+        jne     fp_e2i_round\b
+        neg.w   %d2
+        add.w   #32,%d2                 | %d2 = 32 - integer bits
+        lsr.l   %d2,%d0                 | exact: keep the integer bits only
+9:      tst.w   (-4,%a0)                | word at struct offset 0, sign in its low byte
+        jne     1f
+        tst.\s  %d0
+        jmi     fp_e2i_large\b          | positive value ran into the sign bit
+        printf  PCONV,"-> %p\n",1,%d0
+        rts
+1:      neg.\s  %d0
+        jeq     1f
+        jpl     fp_e2i_large\b          | negation did not come out negative
+1:      printf  PCONV,"-> %p\n",1,%d0
+        rts
+fp_e2i_round\b:
+        fp_set_sr FPSR_EXC_INEX2        | INEX2 bit
+        neg.w   %d2
+        add.w   #32,%d2                 | %d2 = 32 - integer bits
+        .if     \b>16
+        jeq     5f                      | all 32 bits are integer bits
+        .endif
+        lsr.l   %d2,%d0
+        move.w  (FPD_RND,FPDATA),%d2    | rounding mode
+        jne     2f                      | %d2 == 0, round to nearest
+        tst.l   %d1                     | test guard bit
+        jpl     9b                      | zero is closer
+        btst    %d2,%d0                 | test lsb bit (%d2 still 0)
+        jne     fp_e2i_doroundup\b
+        lsl.l   #1,%d1                  | check low bits
+        jne     fp_e2i_doroundup\b
+        tst.l   (4,%a0)
+        jeq     9b                      | exact tie with even lsb: stay put
+fp_e2i_doroundup\b:
+        addq.l  #1,%d0
+        jra     9b
+        | check nondefault rounding modes
+2:      subq.w  #2,%d2
+        jcs     9b                      | %d2 < 2, round to zero
+        jhi     3f                      | %d2 > 2, round to +infinity
+        tst.w   (-4,%a0)                | to -inf
+        jne     fp_e2i_doroundup\b      | negative, round to infinity
+        jra     9b                      | positive, round to zero
+3:      tst.w   (-4,%a0)                | to +inf
+        jeq     fp_e2i_doroundup\b      | positive, round to infinity
+        jra     9b      | negative, round to zero
+        | we only want -2**31 to get correctly rounded here,
+        | since the guard bit is in the lower lword.
+        | everything else ends up anyway as overflow.
+        .if     \b>16
+5:      move.w  (FPD_RND,FPDATA),%d2    | rounding mode
+        jne     2b                      | %d2 == 0, round to nearest
+        move.l  (4,%a0),%d1             | test guard bit
+        jpl     9b                      | zero is closer
+        lsl.l   #1,%d1                  | check low bits
+        jne     fp_e2i_doroundup\b
+        jra     9b
+        .endif
+fp_e2i_zero\b:
+        clr.l   %d0
+        tst.l   (%a0)+
+        jne     1f
+        tst.l   (%a0)
+        jeq     3f                      | true zero, result is 0
+1:      subq.l  #4,%a0                  | %a0 back at the high mantissa lword
+        fp_clr_sr FPSR_EXC_UNFL         | fp_normalize_ext has set this bit
+fp_e2i_small\b:
+        fp_set_sr FPSR_EXC_INEX2
+        clr.l   %d0
+        move.w  (FPD_RND,FPDATA),%d2    | rounding mode
+        subq.w  #2,%d2
+        jcs     3f                      | %d2 < 2, round to nearest/zero
+        jhi     2f                      | %d2 > 2, round to +infinity
+        tst.w   (-4,%a0)                | to -inf
+        jeq     3f
+        subq.\s #1,%d0                  | tiny negative rounds down to -1
+        jra     3f
+2:      tst.w   (-4,%a0)                | to +inf
+        jne     3f
+        addq.\s #1,%d0                  | tiny positive rounds up to 1
+3:      printf  PCONV,"-> %p\n",1,%d0
+        rts
+fp_e2i_large\b:
+        fp_set_sr FPSR_EXC_OPERR
+        move.\s #inf,%d0                | MAXINT for a positive overflow
+        tst.w   (-4,%a0)
+        jeq     1f
+        addq.\s #1,%d0                  | wraps to the most negative integer
+1:      printf  PCONV,"-> %p\n",1,%d0
+        rts
+fp_e2i_huge\b:
+        move.\s (%a0),%d0               | result taken from the top of the mantissa
+        tst.l   (%a0)                   | high mantissa lword
+        jne     1f
+        tst.l   (4,%a0)                 | low mantissa lword
+        jeq     fp_e2i_large\b          | all-zero mantissa: infinity
+        | fp_normalize_ext has set this bit already
+        | and made the number nonsignaling
+1:      fp_tst_sr FPSR_EXC_SNAN
+        jne     1f
+        fp_set_sr FPSR_EXC_OPERR
+1:      printf  PCONV,"-> %p\n",1,%d0
+        rts
+.endm
+fp_conv_ext2long:                       | %a0: struct fp_ext *, 32-bit result in %d0
+        conv_ext2int l,32
+fp_conv_ext2short:                      | same, 16-bit result
+        conv_ext2int w,16
+fp_conv_ext2byte:                       | same, 8-bit result
+        conv_ext2int b,8
+fp_conv_ext2double:                     | %a0: struct fp_ext *, %a1: where the two lwords go
+        jsr     fp_normalize_double     | round to double precision first
+        printf  PCONV,"e2d: %p(",1,%a0
+        printx  PCONV,%a0@
+        printf  PCONV,"), "
+        move.l  (%a0)+,%d2              | sign byte in the high word, exponent in the low
+        cmp.w   #0x7fff,%d2
+        jne     1f
+        move.w  #0x7ff,%d2              | Inf / NaN: all-ones double exponent
+        move.l  (%a0)+,%d0
+        jra     2f
+1:      sub.w   #0x3fff-0x3ff,%d2       | rebias the exponent for double
+        move.l  (%a0)+,%d0              | high mantissa lword
+        jmi     2f                      | integer bit set: normal number
+        clr.w   %d2                     | zero or denormal: exponent field 0
+2:      lsl.w   #5,%d2                  | 11-bit exponent to the top of the low word
+        lsl.l   #7,%d2                  | immediate shifts stop at 8, so 15 = 7 + 8
+        lsl.l   #8,%d2                  | sign now in bit 31, exponent in bits 30-20
+        move.l  %d0,%d1
+        lsl.l   #1,%d0                  | drop the explicit integer bit
+        lsr.l   #4,%d0
+        lsr.l   #8,%d0                  | top 20 fraction bits in bits 19-0
+        or.l    %d2,%d0
+        putuser.l %d0,(%a1)+,fp_err_ua2,%a1
+        moveq   #21,%d0
+        lsl.l   %d0,%d1                 | low 11 bits of the high lword to bits 31-21
+        move.l  (%a0),%d0               | low mantissa lword
+        lsr.l   #4,%d0
+        lsr.l   #7,%d0                  | its top 21 bits to bits 20-0
+        or.l    %d1,%d0
+        putuser.l %d0,(%a1),fp_err_ua2,%a1
+#ifdef FPU_EMU_DEBUG
+        getuser.l %a1@(-4),%d0,fp_err_ua2,%a1
+        getuser.l %a1@(0),%d1,fp_err_ua2,%a1
+        printf  PCONV,"%p(%08x%08x)\n",3,%a1,%d0,%d1
+#endif
+        rts                             | note: %a1 is left advanced by 4
+fp_conv_ext2single:                     | %a0: struct fp_ext *, single returned in %d0
+        jsr     fp_normalize_single     | round to single precision first
+        printf  PCONV,"e2s: %p(",1,%a0
+        printx  PCONV,%a0@
+        printf  PCONV,"), "
+        move.l  (%a0)+,%d1              | sign byte in the high word, exponent in the low
+        cmp.w   #0x7fff,%d1
+        jne     1f
+        move.w  #0xff,%d1               | Inf / NaN: all-ones single exponent
+        move.l  (%a0)+,%d0
+        jra     2f
+1:      sub.w   #0x3fff-0x7f,%d1        | rebias the exponent for single
+        move.l  (%a0)+,%d0              | high mantissa lword
+        jmi     2f                      | integer bit set: normal number
+        clr.w   %d1                     | zero or denormal: exponent field 0
+2:      lsl.w   #8,%d1                  | 8-bit exponent to the top of the low word
+        lsl.l   #7,%d1                  | immediate shifts stop at 8, so 15 = 7 + 8
+        lsl.l   #8,%d1                  | sign now in bit 31, exponent in bits 30-23
+        bclr    #31,%d0                 | drop the explicit integer bit
+        lsr.l   #8,%d0                  | 23 fraction bits in bits 22-0
+        or.l    %d1,%d0
+        printf  PCONV,"%08x\n",1,%d0
+        rts
+        | special return addresses for instructions that
+        | encode the rounding precision in the opcode
+        | (e.g. fsmove, fdmove)
+fp_finalrounding_single:
+        addq.l  #8,%sp                  | drop 8 bytes of stack (TODO confirm what the caller left there)
+        jsr     fp_normalize_ext
+        jsr     fp_normalize_single     | always single, whatever the control register says
+        jra     fp_finaltest
+fp_finalrounding_single_fast:           | as fp_finalrounding_single, using the _fast rounder
+        addq.l  #8,%sp                  | drop 8 bytes of stack (TODO confirm what the caller left there)
+        jsr     fp_normalize_ext
+        jsr     fp_normalize_single_fast
+        jra     fp_finaltest
+fp_finalrounding_double:                | as fp_finalrounding_single, but to double
+        addq.l  #8,%sp                  | drop 8 bytes of stack (TODO confirm what the caller left there)
+        jsr     fp_normalize_ext
+        jsr     fp_normalize_double     | always double, whatever the control register says
+        jra     fp_finaltest
+        | fp_finalrounding:
+        | round the result in %a0 to the precision stored at FPD_PREC,
+        | then fall into fp_finaltest to set the emulated status register.
+fp_finalrounding:
+        addq.l  #8,%sp                  | drop 8 bytes of stack (TODO confirm what the caller left there)
+|       printf  ,"f: %p\n",1,%a0
+        jsr     fp_normalize_ext
+        move.w  (FPD_PREC,FPDATA),%d0   | 0: leave as extended, 1: single, else double
+        subq.w  #1,%d0
+        jcs     fp_finaltest            | was 0, no further rounding
+        jne     1f
+        jsr     fp_normalize_single
+        jra     2f
+1:      jsr     fp_normalize_double
+2:|     printf  ,"f: %p\n",1,%a0
+fp_finaltest:
+        | First, we do some of the obvious tests for the exception
+        | status byte and condition code bytes of fp_sr here, so that
+        | they do not have to be handled individually by every
+        | emulated instruction.
+        clr.l   %d0                     | condition code byte is built up in %d0
+        addq.l  #1,%a0                  | %a0 (struct fp_ext *) is walked, not restored
+        tst.b   (%a0)+                  | sign
+        jeq     1f
+        bset    #FPSR_CC_NEG-24,%d0     | N bit
+1:      cmp.w   #0x7fff,(%a0)+          | exponent
+        jeq     2f
+        | test for zero
+        moveq   #FPSR_CC_Z-24,%d1
+        tst.l   (%a0)+
+        jne     9f                      | nonzero mantissa: neither Z, INF nor NAN
+        tst.l   (%a0)
+        jne     9f
+        jra     8f
+        | infinity and NaN
+2:      moveq   #FPSR_CC_NAN-24,%d1
+        move.l  (%a0)+,%d2
+        lsl.l   #1,%d2                  | ignore high bit
+        jne     8f                      | mantissa bits set: NaN
+        tst.l   (%a0)
+        jne     8f
+        moveq   #FPSR_CC_INF-24,%d1     | empty mantissa: infinity
+8:      bset    %d1,%d0
+9:      move.b  %d0,(FPD_FPSR+0,FPDATA) | set condition test result
+        | move instructions enter here
+        | Here, we test things in the exception status byte, and set
+        | other things in the accrued exception byte accordingly.
+        | Emulated instructions can set various things in the former,
+        | as defined in fp_emu.h.
+fp_final:
+        move.l  (FPD_FPSR,FPDATA),%d0
+#if 0
+        btst    #FPSR_EXC_SNAN,%d0      | EXC_SNAN
+        jne     1f
+        btst    #FPSR_EXC_OPERR,%d0     | EXC_OPERR
+        jeq     2f
+1:      bset    #FPSR_AEXC_IOP,%d0      | set IOP bit
+2:      btst    #FPSR_EXC_OVFL,%d0      | EXC_OVFL
+        jeq     1f
+        bset    #FPSR_AEXC_OVFL,%d0     | set OVFL bit
+1:      btst    #FPSR_EXC_UNFL,%d0      | EXC_UNFL
+        jeq     1f
+        btst    #FPSR_EXC_INEX2,%d0     | EXC_INEX2
+        jeq     1f
+        bset    #FPSR_AEXC_UNFL,%d0     | set UNFL bit
+1:      btst    #FPSR_EXC_DZ,%d0        | EXC_DZ
+        jeq     1f
+        bset    #FPSR_AEXC_DZ,%d0       | set DZ bit
+1:      btst    #FPSR_EXC_OVFL,%d0      | EXC_OVFL
+        jne     1f
+        btst    #FPSR_EXC_INEX2,%d0     | EXC_INEX2
+        jne     1f
+        btst    #FPSR_EXC_INEX1,%d0     | EXC_INEX1
+        jeq     2f
+1:      bset    #FPSR_AEXC_INEX,%d0     | set INEX bit
+2:      move.l  %d0,(FPD_FPSR,FPDATA)
+#else
+        | same as above, greatly optimized, but untested (yet)
+        move.l  %d0,%d2                 | %d2 keeps the unmodified status word
+        lsr.l   #5,%d0
+        move.l  %d0,%d1
+        lsr.l   #4,%d1
+        or.l    %d0,%d1                 | bit 3 = status bit 8 or status bit 12
+        and.b   #0x08,%d1               | (INEX1 or OVFL if the 68881 layout holds: confirm in fp_emu.h)
+        move.l  %d2,%d0
+        lsr.l   #6,%d0                  | status bits 13..9 land on bits 7..3
+        or.l    %d1,%d0
+        move.l  %d2,%d1
+        lsr.l   #4,%d1
+        or.b    #0xdf,%d1               | all ones except bit 5 = status bit 9
+        and.b   %d1,%d0                 | bit 5 survives only if status bit 9 is set
+        move.l  %d2,%d1
+        lsr.l   #7,%d1
+        and.b   #0x80,%d1               | status bit 14 lands on bit 7
+        or.b    %d1,%d0
+        and.b   #0xf8,%d0               | bits 2-0 are left alone
+        or.b    %d0,%d2                 | accrued bits only ever get set here
+        move.l  %d2,(FPD_FPSR,FPDATA)
+#endif
+        move.b  (FPD_FPSR+2,FPDATA),%d0 | exception status byte
+        and.b   (FPD_FPCR+2,FPDATA),%d0 | masked with the same byte of the control register
+        jeq     1f
+        printf  ,"send signal!!!\n"
+1:      jra     fp_end
diff --git a/arch/m68k/math-emu/multi_arith.h b/arch/m68k/math-emu/multi_arith.h
new file mode 100644
index 000000000000..02251e5afd89
--- /dev/null
+++ b/arch/m68k/math-emu/multi_arith.h
@@ -0,0 +1,819 @@
+/* multi_arith.h: multi-precision integer arithmetic functions, needed
+   to do extended-precision floating point.
+   (c) 1998 David Huggins-Daines.
+   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
+   David Mosberger-Tang.
+   You may copy, modify, and redistribute this file under the terms of
+   the GNU General Public License, version 2, or any later version, at
+   your convenience. */
+/* Note:
+   These are not general multi-precision math routines.  Rather, they
+   implement the subset of integer arithmetic that we need in order to
+   multiply, divide, and normalize 128-bit unsigned mantissae.  */
+#ifndef MULTI_ARITH_H
+#define MULTI_ARITH_H
+#if 0   /* old code... */
+/* Unsigned only, because we don't need signs to multiply and divide. */
+typedef unsigned int int128[4];         /* four words, indexed by the enum below */
+/* Word order: index 0 is the most significant word */
+enum {
+        MSW128,                         /* most significant word */
+        NMSW128,                        /* next most significant word */
+        NLSW128,                        /* next least significant word */
+        LSW128                          /* least significant word */
+};
+/* big-endian: the first word of an in-memory 64-bit value is its high half */
+#define LO_WORD(ll) (((unsigned int *) &ll)[1])
+#define HI_WORD(ll) (((unsigned int *) &ll)[0])
+/* Convenience functions to stuff various integer values into int128s */
+static inline void zero128(int128 a)    /* clear all four words of a */
+{
+        a[MSW128] = a[NMSW128] = a[NLSW128] = a[LSW128] = 0;
+}
+/* Load a from four words given most significant first (i3 ... i0) */
+static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
+                          unsigned int i0, int128 a)
+{
+        a[MSW128] = i3;
+        a[NMSW128] = i2;
+        a[NLSW128] = i1;
+        a[LSW128] = i0;
+}
+/* Convenience functions (for testing as well): zero-extend src into dest */
+static inline void int64_to_128(unsigned long long src, int128 dest)
+{
+        dest[MSW128] = dest[NMSW128] = 0;
+        dest[NLSW128] = (unsigned int) (src >> 32);
+        dest[LSW128] = (unsigned int) (src & 0xffffffffULL);
+}
+static inline void int128_to_64(const int128 src, unsigned long long *dest)
+{       /* low 64 bits only; unsigned cast so that bit 63 is not a signed overflow */
+        *dest = ((unsigned long long) src[NLSW128] << 32) | src[LSW128];
+}
+static inline void put_i128(const int128 a)     /* debug dump, MSW first */
+{
+        printk("%08x %08x %08x %08x\n",
+               a[MSW128], a[NMSW128], a[NLSW128], a[LSW128]);
+}
+/* Internal shifters:
+   Only valid for 0 < count < 32; each word takes its new low bits from
+   the top of its less significant neighbour (the next higher index). */
+static inline void _lsl128(unsigned int count, int128 a)
+{
+        int i;
+        for (i = MSW128; i < LSW128; i++)
+                a[i] = (a[i] << count) | (a[i + 1] >> (32 - count));
+        a[LSW128] <<= count;
+}
+static inline void _lsr128(unsigned int count, int128 a)       /* 0 < count < 32 only */
+{
+        int i;
+        for (i = LSW128; i > MSW128; i--)
+                a[i] = (a[i] >> count) | (a[i - 1] << (32 - count));
+        a[MSW128] >>= count;
+}
+/* Should be faster, one would hope */
+static inline void lslone128(int128 a)
+{
+        /*
+         * Shift the whole 128-bit value left by one bit.
+         * lsl on the least significant word leaves the bit that fell
+         * off its top in the X flag; each roxl then rotates X into
+         * bit 0 of the next more significant word and picks up that
+         * word's old bit 31 for the following step.
+         * All four words are read-write data register operands.
+         */
+        asm volatile ("lsl.l #1,%0\n"
+                      "roxl.l #1,%1\n"
+                      "roxl.l #1,%2\n"
+                      "roxl.l #1,%3\n"
+                      : "+d" (a[LSW128]), "+d" (a[NLSW128]),
+                        "+d" (a[NMSW128]), "+d" (a[MSW128]));
+}
+static inline void lsrone128(int128 a)
+{
+        /*
+         * Shift the whole 128-bit value right by one bit.
+         * lsr on the most significant word leaves its old bit 0 in
+         * the X flag; each roxr then rotates X into bit 31 of the
+         * next less significant word and picks up that word's old
+         * bit 0 for the following step.
+         * All four words are read-write data register operands.
+         */
+        asm volatile ("lsr.l #1,%0\n"
+                      "roxr.l #1,%1\n"
+                      "roxr.l #1,%2\n"
+                      "roxr.l #1,%3\n"
+                      : "+d" (a[MSW128]), "+d" (a[NMSW128]),
+                        "+d" (a[NLSW128]), "+d" (a[LSW128]));
+}
+/* Generalized 128-bit shifters:
+   These bit-shift by count % 32, then move whole longwords.  */
+static inline void lsl128(unsigned int count, int128 a)
+{
+        int wordcount = count / 32, i;
+        if (count % 32)
+                _lsl128(count % 32, a);
+        if (wordcount > 4)      /* everything shifts out; never index below a[0] */
+                wordcount = 4;
+        /* word 0 is the most significant, so words move to lower indices */
+        for (i = 0; i < 4 - wordcount; i++) {
+                a[i] = a[i + wordcount];
+        }
+        for (i = 4 - wordcount; i < 4; i++) {
+                a[i] = 0;
+        }
+}
+static inline void lsr128(unsigned int count, int128 a)
+{
+        int wordcount = count / 32, i;
+        if (count % 32)
+                _lsr128(count % 32, a);
+        if (wordcount > 4)      /* everything shifts out; never index past a[3] */
+                wordcount = 4;
+        for (i = 3; i >= wordcount; --i) {      /* words move to higher indices */
+                a[i] = a[i - wordcount];
+        }
+        for (i = 0; i < wordcount; i++) {       /* vacated top words become 0 */
+                a[i] = 0;
+        }
+}
+static inline int orl128(int a, int128 b)       /* OR a into the least significant word */
+{
+        return b[LSW128] |= a;  /* was missing: hand back the updated word */
+}
+static inline int btsthi128(const int128 a)     /* nonzero iff bit 127 is set */
+{
+        return 0x80000000 & a[MSW128];
+}
+/* test bits (numbered from 0 = LSB) up to and including "top", 0 <= top <= 127 */
+static inline int bftestlo128(int top, const int128 a)
+{
+        unsigned int r = 0;
+        if (top > 31)
+                r |= a[LSW128];
+        if (top > 63)
+                r |= a[NLSW128];
+        if (top > 95)
+                r |= a[NMSW128];
+        r |= a[3 - (top / 32)] & ((2u << (top % 32)) - 1); /* no 1 << 32 at bit 31 */
+        return (r != 0);
+}
+/* Aargh.  We need these because GCC is broken */
+/* FIXME: do them in assembly, for goodness' sake! */
+static inline void mask64(int pos, unsigned long long *mask)   /* low pos bits set, pos >= 0 */
+{
+        *mask = 0;
+        if (pos < 32) {
+                LO_WORD(*mask) = (1u << pos) - 1;       /* unsigned: pos 31 is fine */
+                return;
+        }
+        LO_WORD(*mask) = -1;
+        HI_WORD(*mask) = pos < 64 ? (1u << (pos - 32)) - 1 : ~0u;
+}
+static inline void bset64(int pos, unsigned long long *dest)   /* set bit pos, 0..63 */
+{
+        /* "+d": bset on memory is a byte op (bit number mod 8) and reads its operand */
+        if (pos < 32)
+                asm volatile ("bset %1,%0":"+d"
+                              (LO_WORD(*dest)):"id"(pos));
+        else
+                asm volatile ("bset %1,%0":"+d"
+                              (HI_WORD(*dest)):"id"(pos - 32));
+}
+static inline int btst64(int pos, unsigned long long dest)     /* 1 iff bit pos (0..63) is set */
+{
+        if (pos < 32)
+                return (0 != (LO_WORD(dest) & (1u << pos)));
+        else    /* 1u: bit 31 of either half must not be a signed shift */
+                return (0 != (HI_WORD(dest) & (1u << (pos - 32))));
+}
+static inline void lsl64(int count, unsigned long long *dest)  /* *dest <<= count, count < 64 */
+{
+        /* the bits carried into the high word come from the TOP of the low word */
+        if (count > 0 && count < 32) {
+                HI_WORD(*dest) = (HI_WORD(*dest) << count)
+                    | (LO_WORD(*dest) >> (32 - count));
+                LO_WORD(*dest) <<= count;
+        } else if (count >= 32) {
+                HI_WORD(*dest) = LO_WORD(*dest) << (count - 32);
+                LO_WORD(*dest) = 0;
+        }       /* count <= 0: nothing to do, and no shift by 32 */
+}
+static inline void lsr64(int count, unsigned long long *dest)  /* *dest >>= count, count < 64 */
+{
+        /* count == 0 must be a no-op: "<< (32 - 0)" is undefined in C */
+        if (count > 0 && count < 32) {
+                LO_WORD(*dest) = (LO_WORD(*dest) >> count)
+                    | (HI_WORD(*dest) << (32 - count));
+                HI_WORD(*dest) >>= count;
+        } else if (count >= 32) {
+                LO_WORD(*dest) = HI_WORD(*dest) >> (count - 32);
+                HI_WORD(*dest) = 0;
+        }       /* count <= 0: nothing to do */
+}
+#endif
+static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)        /* mantissa >>= cnt, exp += cnt */
+{
+        reg->exp += cnt;        /* keeps the represented value unchanged */
+        switch (cnt) {  /* one case per position of the shifted-out bits; lowmant collects them */
+        case 0 ... 8:   /* NOTE(review): cnt == 0 shifts m32[0] by 32 below - undefined in ISO C, confirm callers */
+                reg->lowmant = reg->mant.m32[1] << (8 - cnt);
+                reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
+                                   (reg->mant.m32[0] << (32 - cnt));
+                reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
+                break;
+        case 9 ... 32:  /* NOTE(review): cnt == 32 shifts right by 32 below - same concern */
+                reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
+                if (reg->mant.m32[1] << (40 - cnt))     /* anything lost below lowmant? */
+                        reg->lowmant |= 1;              /* then record a sticky bit */
+                reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
+                                   (reg->mant.m32[0] << (32 - cnt));
+                reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
+                break;
+        case 33 ... 39: /* the 8 bits wanted straddle m32[0] and m32[1] */
+                asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
+                        : "m" (reg->mant.m32[0]), "d" (64 - cnt));     /* 8-bit field at bit offset 64 - cnt */
+                if (reg->mant.m32[1] << (40 - cnt))
+                        reg->lowmant |= 1;              /* sticky bit */
+                reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
+                reg->mant.m32[0] = 0;
+                break;
+        case 40 ... 71: /* NOTE(review): cnt == 40 and cnt >= 64 give shift counts of 32 or more below */
+                reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
+                if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
+                        reg->lowmant |= 1;              /* sticky bit */
+                reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
+                reg->mant.m32[0] = 0;
+                break;
+        default:        /* cnt >= 72: nothing survives except a possible sticky bit */
+                reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
+                reg->mant.m32[0] = 0;
+                reg->mant.m32[1] = 0;
+                break;
+        }
+}
+static inline int fp_overnormalize(struct fp_ext *reg)
+{       /* Shift the mantissa left so its first set bit reaches the top; returns the shift count (reg->exp is not touched here). */
+        int shift;
+        if (reg->mant.m32[0]) { /* first set bit is in the high word */
+                asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));   /* shift = offset of the first set bit, counted from the top */
+                reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift));    /* NOTE(review): shift == 0 gives a shift by 32 - confirm callers never pass a normalized value */
+                reg->mant.m32[1] = (reg->mant.m32[1] << shift);
+        } else {                /* high word empty: the low word becomes the high word */
+                asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
+                reg->mant.m32[0] = (reg->mant.m32[1] << shift);
+                reg->mant.m32[1] = 0;
+                shift += 32;    /* account for the whole-word move */
+        }
+        return shift;
+}
+static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
+{       /* dest mantissa (lowmant, m32[1], m32[0]) += src mantissa; returns the carry out of the top word. */
+        int carry;
+        /* we assume here that gcc only inserts move and clr instructions between the asm statements, so the X flag survives */
+        asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
+                : "g,d" (src->lowmant), "0,0" (dest->lowmant));
+        asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])        /* low word plus carry from lowmant */
+                : "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
+        asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])        /* high word plus carry from the low word */
+                : "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
+        asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));        /* 0 + 0 + X: materialize the final carry */
+        return carry;
+}
+static inline int fp_addcarry(struct fp_ext *reg)
+{       /* Absorb a carry out of the mantissa: bump the exponent and shift right by one. Returns 0 on overflow, 1 otherwise. */
+        if (++reg->exp == 0x7fff) {     /* exponent hit the maximum: report overflow */
+                if (reg->mant.m64)
+                        fp_set_sr(FPSR_EXC_INEX2);      /* nonzero mantissa bits are being dropped */
+                reg->mant.m64 = 0;
+                fp_set_sr(FPSR_EXC_OVFL);
+                return 0;
+        }
+        reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);       /* shifted-out bit on top, old lowmant collapsed to a sticky bit */
+        reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
+                           (reg->mant.m32[0] << 31);
+        reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;        /* the carry becomes the new top bit */
+        return 1;
+}
+static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
+                              struct fp_ext *src2)
+{       /* dest mantissa = src1 mantissa - src2 mantissa, over lowmant, m32[1], m32[0]; the final borrow is not returned. */
+        /* we assume here that gcc only inserts move and clr instructions between the asm statements, so the X flag survives */
+        asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
+                : "g,d" (src2->lowmant), "0,0" (src1->lowmant));
+        asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])        /* low word minus borrow from lowmant */
+                : "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
+        asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])        /* high word minus borrow from the low word */
+                : "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
+}
+#define fp_mul64(desth, destl, src1, src2) ({ /* desth:destl = src1 * src2, unsigned 32x32 -> 64 */ \
+        asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)             \
+                : "g" (src1), "0" (src2));                              \
+})
+#define fp_div64(quot, rem, srch, srcl, div) /* srch:srcl / div -> quot, remainder -> rem (unsigned 64/32) */ \
+        asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)                \
+                : "dm" (div), "1" (srch), "0" (srcl))
+#define fp_add64(dest1, dest2, src1, src2) ({ /* dest1:dest2 += src1:src2 (dest1/src1 are the high words) */ \
+        asm ("add.l %1,%0" : "=d,dm" (dest2)                            \
+                : "dm,d" (src2), "0,0" (dest2));                        \
+        asm ("addx.l %1,%0" : "=d" (dest1)      /* uses X from the add.l above */ \
+                : "d" (src1), "0" (dest1));                             \
+})
+#define fp_addx96(dest, src) ({                                         \
+        /* dest->m32[0..2] += src (64 bits, low word at m32[2]); gcc must only put move/clr between the asms so X survives */ \
+        asm volatile ("add.l %1,%0" : "=d,g" ((dest)->m32[2])           \
+                : "g,d" ((src).m32[1]), "0,0" ((dest)->m32[2]));        \
+        asm volatile ("addx.l %1,%0" : "=d" ((dest)->m32[1])            \
+                : "d" ((src).m32[0]), "0" ((dest)->m32[1]));            \
+        asm volatile ("addx.l %1,%0" : "=d" ((dest)->m32[0])            \
+                : "d" (0), "0" ((dest)->m32[0]));                       \
+})
+#define fp_sub64(dest, src) ({ /* dest -= src, both with m32[0] as high word and m32[1] as low word */ \
+        asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])                      \
+                : "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));            \
+        asm ("subx.l %1,%0" : "=d" (dest.m32[0])        /* uses X from the sub.l above */ \
+                : "d" (src.m32[0]), "0" (dest.m32[0]));                 \
+})
+#define fp_sub96c(dest, srch, srcm, srcl) ({ /* dest.m32[0..2] -= srch:srcm:srcl; the value is nonzero iff the subtraction borrowed */ \
+        char carry;                                                     \
+        asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])                      \
+                : "dm,d" (srcl), "0,0" (dest.m32[2]));                  \
+        asm ("subx.l %1,%0" : "=d" (dest.m32[1])                        \
+                : "d" (srcm), "0" (dest.m32[1]));                       \
+        asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])  /* scs captures the final borrow */ \
+                : "d" (srch), "1" (dest.m32[0]));                       \
+        carry;                                                          \
+})
+static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
+                                   struct fp_ext *src2)
+{       /* dest = src1->mant * src2->mant as a 128-bit product, built from four 32x32 partial products. */
+        union fp_mant64 temp;
+        fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);     /* high * high -> top 64 bits */
+        fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);     /* low * low -> bottom 64 bits */
+        fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);       /* cross product, added in at m32[1..2] */
+        fp_addx96(dest, temp);
+        fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);       /* the other cross product */
+        fp_addx96(dest, temp);
+}
+static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
+                                 struct fp_ext *div)
+{       /* Divide src->mant by div->mant: dest->m32[0] gets the integer bit, m32[1..3] three 32-bit quotient words. src->mant is clobbered (it ends up holding the remainder). */
+        union fp_mant128 tmp;
+        union fp_mant64 tmp64;
+        unsigned long *mantp = dest->m32;
+        unsigned long fix, rem, first, dummy;
+        int i;
+        /* the algorithm below requires src to be smaller than div,
+           but both have the high bit set */
+        if (src->mant.m64 >= div->mant.m64) {
+                fp_sub64(src->mant, div->mant);
+                *mantp = 1;     /* integer part of the quotient */
+        } else
+                *mantp = 0;
+        mantp++;
+        /* basic idea behind this algorithm: we can't divide two 64bit numbers
+           (AB/CD) directly, but we can calculate AB/C0, but this means this
+           quotient is off by C0/CD, so we have to multiply the first result
+           to fix the result, after that we have nearly the correct result
+           and only a few corrections are needed. */
+        /* C0/CD can be precalculated, but it's a 64bit division again, but
+           we can make it a bit easier, by dividing first through C so we get
+           10/1D and now only a single shift and the value fits into 32bit. */
+        fix = 0x80000000;
+        dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
+        dummy = (dummy >> 1) | fix;
+        fp_div64(fix, dummy, fix, 0, dummy);
+        fix--;
+        for (i = 0; i < 3; i++, mantp++) {      /* one 32-bit quotient word per pass */
+                if (src->mant.m32[0] == div->mant.m32[0]) {     /* presumably avoids a quotient that does not fit in 32 bits - TODO confirm */
+                        fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);
+                        fp_mul64(*mantp, dummy, first, fix);
+                        *mantp += fix;
+                } else {
+                        fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);     /* first = estimate using only the divisor's high word */
+                        fp_mul64(*mantp, dummy, first, fix);    /* scale the estimate by fix; keep the high word */
+                }
+                fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);     /* rebuild the remainder for the corrected quotient word */
+                fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
+                tmp.m32[2] = 0;
+                fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
+                fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);  /* take off the part contributed by the divisor's low word */
+                src->mant.m32[0] = tmp.m32[1];
+                src->mant.m32[1] = tmp.m32[2];
+                while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {        /* while div still fits (no borrow): commit and bump the quotient word */
+                        src->mant.m32[0] = tmp.m32[1];
+                        src->mant.m32[1] = tmp.m32[2];
+                        *mantp += 1;
+                }
+        }
+}
+#if 0
+static inline unsigned int fp_fls128(union fp_mant128 *src)
+{       /* Offset of the first set bit of the 128-bit value, counted from the top of m32[0]; 128 if the value is zero. */
+        unsigned long data;
+        unsigned int res, off;
+        if ((data = src->m32[0]))       /* pick the first nonzero word and remember its bit offset */
+                off = 0;
+        else if ((data = src->m32[1]))
+                off = 32;
+        else if ((data = src->m32[2]))
+                off = 64;
+        else if ((data = src->m32[3]))
+                off = 96;
+        else
+                return 128;
+        asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));        /* res = offset of the first set bit within that word */
+        return res + off;
+}
+static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
+{       /* Shift the 128-bit value in place: shift > 0 moves left, shift < 0 moves right and ORs a sticky bit into bit 0 of m32[3]. */
+        unsigned long sticky;
+        switch (shift) {
+        case 0:
+                return;
+        case 1:         /* single-bit left shift, carried from word to word through X */
+                asm volatile ("lsl.l #1,%0"
+                        : "=d" (src->m32[3]) : "0" (src->m32[3]));
+                asm volatile ("roxl.l #1,%0"
+                        : "=d" (src->m32[2]) : "0" (src->m32[2]));
+                asm volatile ("roxl.l #1,%0"
+                        : "=d" (src->m32[1]) : "0" (src->m32[1]));
+                asm volatile ("roxl.l #1,%0"
+                        : "=d" (src->m32[0]) : "0" (src->m32[0]));
+                return;
+        case 2 ... 31:  /* left shift within word boundaries */
+                src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
+                src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
+                src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
+                src->m32[3] = (src->m32[3] << shift);
+                return;
+        case 32 ... 63: /* left by one whole word plus shift - 32 bits */
+                shift -= 32;    /* NOTE(review): a remainder of 0 makes the ">> (32 - shift)" terms shift by 32 - confirm */
+                src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
+                src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
+                src->m32[2] = (src->m32[3] << shift);
+                src->m32[3] = 0;
+                return;
+        case 64 ... 95: /* left by two whole words plus the rest */
+                shift -= 64;
+                src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
+                src->m32[1] = (src->m32[3] << shift);
+                src->m32[2] = src->m32[3] = 0;
+                return;
+        case 96 ... 127:        /* only m32[3] survives, moved to the top */
+                shift -= 96;
+                src->m32[0] = (src->m32[3] << shift);
+                src->m32[1] = src->m32[2] = src->m32[3] = 0;
+                return;
+        case -31 ... -1:        /* right shift within word boundaries */
+                shift = -shift;
+                sticky = 0;
+                if (src->m32[3] << (32 - shift))        /* bits about to fall off the bottom */
+                        sticky = 1;
+                src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
+                src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
+                src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
+                src->m32[0] = (src->m32[0] >> shift);
+                return;
+        case -63 ... -32:       /* right by one whole word plus the rest; all of m32[3] feeds the sticky bit */
+                shift = -shift - 32;
+                sticky = 0;
+                if ((src->m32[2] << (32 - shift)) || src->m32[3])
+                        sticky = 1;
+                src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
+                src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
+                src->m32[1] = (src->m32[0] >> shift);
+                src->m32[0] = 0;
+                return;
+        case -95 ... -64:       /* right by two whole words plus the rest */
+                shift = -shift - 64;
+                sticky = 0;
+                if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
+                        sticky = 1;
+                src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
+                src->m32[2] = (src->m32[0] >> shift);
+                src->m32[1] = src->m32[0] = 0;
+                return;
+        case -127 ... -96:      /* only the top of m32[0] survives, in m32[3] */
+                shift = -shift - 96;
+                sticky = 0;
+                if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
+                        sticky = 1;
+                src->m32[3] = (src->m32[0] >> shift) | sticky;
+                src->m32[2] = src->m32[1] = src->m32[0] = 0;
+                return;
+        }
+        if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))    /* |shift| >= 128: right shifts keep only a sticky bit */
+                src->m32[3] = 1;
+        else
+                src->m32[3] = 0;
+        src->m32[2] = 0;
+        src->m32[1] = 0;
+        src->m32[0] = 0;
+}
+#endif
+static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
+                                 int shift)
+{       /* Copy 64 bits of src, starting shift bits below its top, into dest->mant; the next 8 bits plus a sticky bit go to dest->lowmant. Only shift 0, 1, 31 and 32 are handled; other values store nothing. */
+        unsigned long tmp;
+        switch (shift) {
+        case 0:         /* top 64 bits as they are */
+                dest->mant.m64 = src->m64[0];
+                dest->lowmant = src->m32[2] >> 24;
+                if (src->m32[3] || (src->m32[2] << 8))  /* anything below the 8 lowmant bits -> sticky */
+                        dest->lowmant |= 1;
+                break;
+        case 1:         /* left by one bit, carried from word to word through X */
+                asm volatile ("lsl.l #1,%0"
+                        : "=d" (tmp) : "0" (src->m32[2]));
+                asm volatile ("roxl.l #1,%0"
+                        : "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
+                asm volatile ("roxl.l #1,%0"
+                        : "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
+                dest->lowmant = tmp >> 24;
+                if (src->m32[3] || (tmp << 8))
+                        dest->lowmant |= 1;
+                break;
+        case 31:        /* done as "one word down, then right by one bit" */
+                asm volatile ("lsr.l #1,%1; roxr.l #1,%0"      /* low bit of m32[0] enters the top of the result through X */
+                        : "=d" (dest->mant.m32[0])
+                        : "d" (src->m32[0]), "0" (src->m32[1]));
+                asm volatile ("roxr.l #1,%0"
+                        : "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
+                asm volatile ("roxr.l #1,%0"
+                        : "=d" (tmp) : "0" (src->m32[3]));
+                dest->lowmant = tmp >> 24;
+                if (src->m32[3] << 7)   /* bits of m32[3] below those kept in lowmant */
+                        dest->lowmant |= 1;
+                break;
+        case 32:        /* exactly one word down */
+                dest->mant.m32[0] = src->m32[1];
+                dest->mant.m32[1] = src->m32[2];
+                dest->lowmant = src->m32[3] >> 24;
+                if (src->m32[3] << 8)
+                        dest->lowmant |= 1;
+                break;
+        }
+}
+#if 0 /* old code... */
+static inline int fls(unsigned int a)
+{       /* Returns what bfffo reports for a: the offset of its first set bit, counted from the most significant bit. */
+        int r;
+        asm volatile ("bfffo %1{#0,#32},%0"
+                      : "=d" (r) : "md" (a));
+        return r;
+}
+/* fls = "find last set" (cf. ffs(3)); note that bfffo numbers bits from the most significant end */
+static inline int fls128(const int128 a)
+{       /* fls() of the first nonzero word, plus 32 for every word skipped; -1 if all four words are zero. */
+        if (a[MSW128])
+                return fls(a[MSW128]);
+        if (a[NMSW128])
+                return fls(a[NMSW128]) + 32;
+        /* XXX: it probably never gets beyond this point in actual
+           use, but that's indicative of a more general problem in the
+           algorithm (i.e. as per the actual 68881 implementation, we
+           really only need at most 67 bits of precision [plus
+           overflow]) so I'm not going to fix it. */
+        if (a[NLSW128])
+                return fls(a[NLSW128]) + 64;
+        if (a[LSW128])
+                return fls(a[LSW128]) + 96;
+        else
+                return -1;
+}
+static inline int zerop128(const int128 a)
+{       /* 1 when all four words are zero, else 0 */
+        return (a[MSW128] | a[NMSW128] | a[NLSW128] | a[LSW128]) == 0;
+}
+static inline int nonzerop128(const int128 a)
+{       /* OR of all four words: nonzero when any word is nonzero */
+        return (a[MSW128] | a[NMSW128] | a[NLSW128] | a[LSW128]);
+}
+/* Addition and subtraction */
+/* Do these in "pure" assembly, because "extended" asm is unmanageable
+   here */
+static inline void add128(const int128 a, int128 b)
+{
+        /* b += a.  64-bit running sum; bits 32 and up carry into the next word. */
+        unsigned long long acc;
+        acc = (unsigned long long) a[LSW128] + b[LSW128];
+        b[LSW128] = acc;
+        acc = (acc >> 32) + a[NLSW128] + b[NLSW128];
+        b[NLSW128] = acc;
+        acc = (acc >> 32) + a[NMSW128] + b[NMSW128];
+        b[NMSW128] = acc;
+        b[MSW128] = a[MSW128] + b[MSW128] + (acc >> 32);
+}
+/* Note: assembler semantics: "b -= a" */
+static inline void sub128(const int128 a, int128 b)
+{
+        /* b -= a.  64-bit running difference; bit 32 is set iff the word borrowed. */
+        unsigned long long acc;
+        acc = (unsigned long long) b[LSW128] - a[LSW128];
+        b[LSW128] = acc;
+        acc = (unsigned long long) b[NLSW128] - a[NLSW128] - ((acc >> 32) & 1);
+        b[NLSW128] = acc;
+        acc = (unsigned long long) b[NMSW128] - a[NMSW128] - ((acc >> 32) & 1);
+        b[NMSW128] = acc;
+        b[MSW128] = b[MSW128] - a[MSW128] - ((acc >> 32) & 1);
+}
+/* Poor man's 64-bit expanding multiply */
+static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
+{       /* c = a * b as a 128-bit product, built from four 32x32 partial products. */
+        unsigned long long acc;
+        int128 acc128;
+        zero128(acc128);
+        zero128(c);
+        /* first the low words (products are unsigned: a signed 64-bit product can overflow) */
+        if (LO_WORD(a) && LO_WORD(b)) {
+                acc = (unsigned long long) LO_WORD(a) * LO_WORD(b);
+                c[NLSW128] = HI_WORD(acc);
+                c[LSW128] = LO_WORD(acc);
+        }
+        /* Next the high words */
+        if (HI_WORD(a) && HI_WORD(b)) {
+                acc = (unsigned long long) HI_WORD(a) * HI_WORD(b);
+                c[MSW128] = HI_WORD(acc);
+                c[NMSW128] = LO_WORD(acc);
+        }
+        /* The middle words */
+        if (LO_WORD(a) && HI_WORD(b)) {
+                acc = (unsigned long long) LO_WORD(a) * HI_WORD(b);
+                acc128[NMSW128] = HI_WORD(acc);
+                acc128[NLSW128] = LO_WORD(acc);
+                add128(acc128, c);      /* cross product sits 32 bits up */
+        }
+        /* The first and last words */
+        if (HI_WORD(a) && LO_WORD(b)) {
+                acc = (unsigned long long) HI_WORD(a) * LO_WORD(b);
+                acc128[NMSW128] = HI_WORD(acc);
+                acc128[NLSW128] = LO_WORD(acc);
+                add128(acc128, c);
+        }
+}
+/* Note: unsigned */
+static inline int cmp128(int128 a, int128 b)
+{       /* Unsigned compare, most significant word first: negative if a < b, 0 if equal, positive if a > b. */
+        if (a[MSW128] < b[MSW128])
+                return -1;
+        if (a[MSW128] > b[MSW128])
+                return 1;
+        if (a[NMSW128] < b[NMSW128])
+                return -1;
+        if (a[NMSW128] > b[NMSW128])
+                return 1;
+        if (a[NLSW128] < b[NLSW128])
+                return -1;
+        if (a[NLSW128] > b[NLSW128])
+                return 1;
+        return (a[LSW128] > b[LSW128]) - (a[LSW128] < b[LSW128]);       /* a plain difference has the wrong sign once the words differ by 2^31 or more */
+}
+inline void div128(int128 a, int128 b, int128 c)
+{       /* Shift-and-subtract division: c = a / b, the remainder is left in a; b is modified along the way. */
+        int128 mask;
+        /* Algorithm:
+           Shift the divisor until it's at least as big as the
+           dividend, keeping track of the position to which we've
+           shifted it, i.e. the power of 2 which we've multiplied it
+           by.
+           Then, for this power of 2 (the mask), and every one smaller
+           than it, subtract the shifted divisor from the dividend and
+           add the mask to the quotient whenever the dividend is at least
+           as big as the shifted divisor.  Then divide the divisor and the
+           mask by 2 (i.e. shift one place to the right).  Lather, rinse,
+           and repeat, until there are no more powers of 2 left. */
+        /* FIXME: needless to say, there's room for improvement here too. */
+        /* Shift up */
+        /* XXX: since it just has to be "at least as big", we can
+           probably eliminate this horribly wasteful loop.  I will
+           have to prove this first, though */
+        set128(0, 0, 0, 1, mask);
+        while (cmp128(b, a) < 0 && !btsthi128(b)) {     /* stop once b >= a or b's top bit is set */
+                lslone128(b);
+                lslone128(mask);
+        }
+        /* Shift down */
+        zero128(c);
+        do {
+                if (cmp128(a, b) >= 0) {        /* shifted divisor fits: take it off, set this quotient bit */
+                        sub128(b, a);
+                        add128(mask, c);
+                }
+                lsrone128(mask);
+                lsrone128(b);
+        } while (nonzerop128(mask));
+        /* The remainder is in a... */
+}
+#endif
+#endif  /* MULTI_ARITH_H */