diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-08-04 18:31:14 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-08-04 18:31:14 -0400 |
| commit | b067c9045af4791a5836042f743d12477131f7b5 (patch) | |
| tree | 3f6b853e563afdbd802c5582e418e8e43b1463f4 /arch/parisc | |
| parent | 8e7106a60748e74f4c76b2204e83f14e4dc041cc (diff) | |
| parent | a549c45a22de800e00045959b2d4f1b15b9f815a (diff) | |
Merge branch 'parisc-4.8-1' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux
Pull parisc updates from Helge Deller:
- added an optimized hash implementation for parisc (George Spelvin)
- C99 style cleanups in iomap.c (Amitoj Kaur Chawla)
- added breaks to switch statement in PDC function (noticed by Dan
Carpenter)
* 'parisc-4.8-1' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux:
parisc: Change structure intialisation to C99 style in iomap.c
parisc: Add break statements to pdc_pat_io_pci_cfg_read()
parisc: Add <asm/hash.h>
Diffstat (limited to 'arch/parisc')
| -rw-r--r-- | arch/parisc/Kconfig | 1 | ||||
| -rw-r--r-- | arch/parisc/include/asm/hash.h | 146 | ||||
| -rw-r--r-- | arch/parisc/kernel/firmware.c | 6 | ||||
| -rw-r--r-- | arch/parisc/lib/iomap.c | 64 |
4 files changed, 182 insertions, 35 deletions
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index dc117385ce2e..cd8778103165 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig | |||
| @@ -31,6 +31,7 @@ config PARISC | |||
| 31 | select TTY # Needed for pdc_cons.c | 31 | select TTY # Needed for pdc_cons.c |
| 32 | select HAVE_DEBUG_STACKOVERFLOW | 32 | select HAVE_DEBUG_STACKOVERFLOW |
| 33 | select HAVE_ARCH_AUDITSYSCALL | 33 | select HAVE_ARCH_AUDITSYSCALL |
| 34 | select HAVE_ARCH_HASH | ||
| 34 | select HAVE_ARCH_SECCOMP_FILTER | 35 | select HAVE_ARCH_SECCOMP_FILTER |
| 35 | select HAVE_ARCH_TRACEHOOK | 36 | select HAVE_ARCH_TRACEHOOK |
| 36 | select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT) | 37 | select HAVE_UNSTABLE_SCHED_CLOCK if (SMP || !64BIT) |
diff --git a/arch/parisc/include/asm/hash.h b/arch/parisc/include/asm/hash.h new file mode 100644 index 000000000000..dbe93311aa26 --- /dev/null +++ b/arch/parisc/include/asm/hash.h | |||
| @@ -0,0 +1,146 @@ | |||
| 1 | #ifndef _ASM_HASH_H | ||
| 2 | #define _ASM_HASH_H | ||
| 3 | |||
| 4 | /* | ||
| 5 | * HP-PA only implements integer multiply in the FPU. However, for | ||
| 6 | * integer multiplies by constant, it has a number of shift-and-add | ||
| 7 | * (but no shift-and-subtract, sigh!) instructions that a compiler | ||
| 8 | * can synthesize a code sequence with. | ||
| 9 | * | ||
| 10 | * Unfortunately, GCC isn't very efficient at using them. For example | ||
| 11 | * it uses three instructions for "x *= 21" when only two are needed. | ||
| 12 | * But we can find a sequence manually. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #define HAVE_ARCH__HASH_32 1 | ||
| 16 | |||
| 17 | /* | ||
| 18 | * This is a multiply by GOLDEN_RATIO_32 = 0x61C88647 optimized for the | ||
| 19 | * PA7100 pairing rules. This is an in-order 2-way superscalar processor. | ||
| 20 | * Only one instruction in a pair may be a shift (by more than 3 bits), | ||
| 21 | * but other than that, simple ALU ops (including shift-and-add by up | ||
| 22 | * to 3 bits) may be paired arbitrarily. | ||
| 23 | * | ||
| 24 | * PA8xxx processors also dual-issue ALU instructions, although with | ||
| 25 | * fewer constraints, so this schedule is good for them, too. | ||
| 26 | * | ||
| 27 | * This 6-step sequence was found by Yevgen Voronenko's implementation | ||
| 28 | * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html. | ||
| 29 | */ | ||
| 30 | static inline u32 __attribute_const__ __hash_32(u32 x) | ||
| 31 | { | ||
| 32 | u32 a, b, c; | ||
| 33 | |||
| 34 | /* | ||
| 35 | * Phase 1: Compute a = (x << 19) + x, | ||
| 36 | * b = (x << 9) + a, c = (x << 23) + b. | ||
| 37 | */ | ||
| 38 | a = x << 19; /* Two shifts can't be paired */ | ||
| 39 | b = x << 9; a += x; | ||
| 40 | c = x << 23; b += a; | ||
| 41 | c += b; | ||
| 42 | /* Phase 2: Return (b<<11) + (c<<6) + (a<<3) - c */ | ||
| 43 | b <<= 11; | ||
| 44 | a += c << 3; b -= c; | ||
| 45 | return (a << 3) + b; | ||
| 46 | } | ||
| 47 | |||
| 48 | #if BITS_PER_LONG == 64 | ||
| 49 | |||
| 50 | #define HAVE_ARCH_HASH_64 1 | ||
| 51 | |||
| 52 | /* | ||
| 53 | * Finding a good shift-and-add chain for GOLDEN_RATIO_64 is tricky, | ||
| 54 | * because available software for the purpose chokes on constants this | ||
| 55 | * large. (It's mostly designed for compiling FIR filter coefficients | ||
| 56 | * into FPGAs.) | ||
| 57 | * | ||
| 58 | * However, Jason Thong pointed out a work-around. The Hcub software | ||
| 59 | * (http://spiral.ece.cmu.edu/mcm/gen.html) is designed for *multiple* | ||
| 60 | * constant multiplication, and is good at finding shift-and-add chains | ||
| 61 | * which share common terms. | ||
| 62 | * | ||
| 63 | * Looking at 0x61C8864680B583EB in binary: | ||
| 64 | * 0110000111001000100001100100011010000000101101011000001111101011 | ||
| 65 | * \______________/ \__________/ \_______/ \________/ | ||
| 66 | * \____________________________/ \____________________/ | ||
| 67 | * you can see the non-zero bits are divided into several well-separated | ||
| 68 | * blocks. Hcub can find algorithms for those terms separately, which | ||
| 69 | * can then be shifted and added together. | ||
| 70 | * | ||
| 71 | * Dividing the input into 2, 3 or 4 blocks, Hcub can find solutions | ||
| 72 | * with 10, 9 or 8 adds, respectively, making a total of 11 for the | ||
| 73 | * whole number. | ||
| 74 | * | ||
| 75 | * Using just two large blocks, 0xC3910C8D << 31 in the high bits, | ||
| 76 | * and 0xB583EB in the low bits, produces as good an algorithm as any, | ||
| 77 | * and with one more small shift than alternatives. | ||
| 78 | * | ||
| 79 | * The high bits are a larger number and more work to compute, as well | ||
| 80 | * as needing one extra cycle to shift left 31 bits before the final | ||
| 81 | * addition, so they are the critical path for scheduling. The low bits | ||
| 82 | * can fit into the scheduling slots left over. | ||
| 83 | */ | ||
| 84 | |||
| 85 | |||
| 86 | /* | ||
| 87 | * This _ASSIGN(dst, src) macro performs "dst = src", but prevents GCC | ||
| 88 | * from inferring anything about the value assigned to "dst". | ||
| 89 | * | ||
| 90 | * This prevents it from mis-optimizing certain sequences. | ||
| 91 | * In particular, gcc is annoyingly eager to combine consecutive shifts. | ||
| 92 | * Given "x <<= 19; y += x; z += x << 1;", GCC will turn this into | ||
| 93 | * "y += x << 19; z += x << 20;" even though the latter sequence needs | ||
| 94 | * an additional instruction and temporary register. | ||
| 95 | * | ||
| 96 | * Because no actual assembly code is generated, this construct is | ||
| 97 | * usefully portable across all GCC platforms, and so can be test-compiled | ||
| 98 | * on non-PA systems. | ||
| 99 | * | ||
| 100 | * In two places, additional unused input dependencies are added. This | ||
| 101 | * forces GCC's scheduling so it does not rearrange instructions too much. | ||
| 102 | * Because the PA-8xxx is out of order, I'm not sure how much this matters, | ||
| 103 | * but why make it more difficult for the processor than necessary? | ||
| 104 | */ | ||
| 105 | #define _ASSIGN(dst, src, ...) asm("" : "=r" (dst) : "0" (src), ##__VA_ARGS__) | ||
| 106 | |||
| 107 | /* | ||
| 108 | * Multiply by GOLDEN_RATIO_64 = 0x61C8864680B583EB using a heavily | ||
| 109 | * optimized shift-and-add sequence. | ||
| 110 | * | ||
| 111 | * Without the final shift, the multiply proper is 19 instructions, | ||
| 112 | * 10 cycles and uses only 4 temporaries. Whew! | ||
| 113 | * | ||
| 114 | * You are not expected to understand this. | ||
| 115 | */ | ||
| 116 | static __always_inline u32 __attribute_const__ | ||
| 117 | hash_64(u64 a, unsigned int bits) | ||
| 118 | { | ||
| 119 | u64 b, c, d; | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Encourage GCC to move a dynamic shift to %sar early, | ||
| 123 | * thereby freeing up an additional temporary register. | ||
| 124 | */ | ||
| 125 | if (!__builtin_constant_p(bits)) | ||
| 126 | asm("" : "=q" (bits) : "0" (64 - bits)); | ||
| 127 | else | ||
| 128 | bits = 64 - bits; | ||
| 129 | |||
| 130 | _ASSIGN(b, a*5); c = a << 13; | ||
| 131 | b = (b << 2) + a; _ASSIGN(d, a << 17); | ||
| 132 | a = b + (a << 1); c += d; | ||
| 133 | d = a << 10; _ASSIGN(a, a << 19); | ||
| 134 | d = a - d; _ASSIGN(a, a << 4, "X" (d)); | ||
| 135 | c += b; a += b; | ||
| 136 | d -= c; c += a << 1; | ||
| 137 | a += c << 3; _ASSIGN(b, b << (7+31), "X" (c), "X" (d)); | ||
| 138 | a <<= 31; b += d; | ||
| 139 | a += b; | ||
| 140 | return a >> bits; | ||
| 141 | } | ||
| 142 | #undef _ASSIGN /* We're a widely-used header file, so don't litter! */ | ||
| 143 | |||
| 144 | #endif /* BITS_PER_LONG == 64 */ | ||
| 145 | |||
| 146 | #endif /* _ASM_HASH_H */ | ||
diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 22395901d47b..e5d71905cad5 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c | |||
| @@ -1354,9 +1354,9 @@ int pdc_pat_io_pci_cfg_read(unsigned long pci_addr, int pci_size, u32 *mem_addr) | |||
| 1354 | retval = mem_pdc_call(PDC_PAT_IO, PDC_PAT_IO_PCI_CONFIG_READ, | 1354 | retval = mem_pdc_call(PDC_PAT_IO, PDC_PAT_IO_PCI_CONFIG_READ, |
| 1355 | __pa(pdc_result), pci_addr, pci_size); | 1355 | __pa(pdc_result), pci_addr, pci_size); |
| 1356 | switch(pci_size) { | 1356 | switch(pci_size) { |
| 1357 | case 1: *(u8 *) mem_addr = (u8) pdc_result[0]; | 1357 | case 1: *(u8 *) mem_addr = (u8) pdc_result[0]; break; |
| 1358 | case 2: *(u16 *)mem_addr = (u16) pdc_result[0]; | 1358 | case 2: *(u16 *)mem_addr = (u16) pdc_result[0]; break; |
| 1359 | case 4: *(u32 *)mem_addr = (u32) pdc_result[0]; | 1359 | case 4: *(u32 *)mem_addr = (u32) pdc_result[0]; break; |
| 1360 | } | 1360 | } |
| 1361 | spin_unlock_irqrestore(&pdc_lock, flags); | 1361 | spin_unlock_irqrestore(&pdc_lock, flags); |
| 1362 | 1362 | ||
diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index fb8e10a4fb39..eaffbb90aa14 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c | |||
| @@ -125,22 +125,22 @@ static void ioport_write32r(void __iomem *addr, const void *s, unsigned long n) | |||
| 125 | } | 125 | } |
| 126 | 126 | ||
| 127 | static const struct iomap_ops ioport_ops = { | 127 | static const struct iomap_ops ioport_ops = { |
| 128 | ioport_read8, | 128 | .read8 = ioport_read8, |
| 129 | ioport_read16, | 129 | .read16 = ioport_read16, |
| 130 | ioport_read16, | 130 | .read16be = ioport_read16, |
| 131 | ioport_read32, | 131 | .read32 = ioport_read32, |
| 132 | ioport_read32, | 132 | .read32be = ioport_read32, |
| 133 | ioport_write8, | 133 | .write8 = ioport_write8, |
| 134 | ioport_write16, | 134 | .write16 = ioport_write16, |
| 135 | ioport_write16, | 135 | .write16be = ioport_write16, |
| 136 | ioport_write32, | 136 | .write32 = ioport_write32, |
| 137 | ioport_write32, | 137 | .write32be = ioport_write32, |
| 138 | ioport_read8r, | 138 | .read8r = ioport_read8r, |
| 139 | ioport_read16r, | 139 | .read16r = ioport_read16r, |
| 140 | ioport_read32r, | 140 | .read32r = ioport_read32r, |
| 141 | ioport_write8r, | 141 | .write8r = ioport_write8r, |
| 142 | ioport_write16r, | 142 | .write16r = ioport_write16r, |
| 143 | ioport_write32r, | 143 | .write32r = ioport_write32r, |
| 144 | }; | 144 | }; |
| 145 | 145 | ||
| 146 | /* Legacy I/O memory ops */ | 146 | /* Legacy I/O memory ops */ |
| @@ -244,22 +244,22 @@ static void iomem_write32r(void __iomem *addr, const void *s, unsigned long n) | |||
| 244 | } | 244 | } |
| 245 | 245 | ||
| 246 | static const struct iomap_ops iomem_ops = { | 246 | static const struct iomap_ops iomem_ops = { |
| 247 | iomem_read8, | 247 | .read8 = iomem_read8, |
| 248 | iomem_read16, | 248 | .read16 = iomem_read16, |
| 249 | iomem_read16be, | 249 | .read16be = iomem_read16be, |
| 250 | iomem_read32, | 250 | .read32 = iomem_read32, |
| 251 | iomem_read32be, | 251 | .read32be = iomem_read32be, |
| 252 | iomem_write8, | 252 | .write8 = iomem_write8, |
| 253 | iomem_write16, | 253 | .write16 = iomem_write16, |
| 254 | iomem_write16be, | 254 | .write16be = iomem_write16be, |
| 255 | iomem_write32, | 255 | .write32 = iomem_write32, |
| 256 | iomem_write32be, | 256 | .write32be = iomem_write32be, |
| 257 | iomem_read8r, | 257 | .read8r = iomem_read8r, |
| 258 | iomem_read16r, | 258 | .read16r = iomem_read16r, |
| 259 | iomem_read32r, | 259 | .read32r = iomem_read32r, |
| 260 | iomem_write8r, | 260 | .write8r = iomem_write8r, |
| 261 | iomem_write16r, | 261 | .write16r = iomem_write16r, |
| 262 | iomem_write32r, | 262 | .write32r = iomem_write32r, |
| 263 | }; | 263 | }; |
| 264 | 264 | ||
| 265 | static const struct iomap_ops *iomap_ops[8] = { | 265 | static const struct iomap_ops *iomap_ops[8] = { |
