aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
author David S. Miller <davem@davemloft.net> 2012-08-30 10:51:32 -0400
committer David S. Miller <davem@davemloft.net> 2012-08-30 10:51:32 -0400
commit 03d168ad122d6e622ad00490211704c4f2994976 (patch)
tree d180d1bee9fbe6b85d27c5908f532252cbddf775 /arch
parent 9fd130ecbe97f3440d14d3d0c6660413e69ac532 (diff)
sparc64: Unroll ECB encryption loops in AES driver.
The AES opcodes have a 3 cycle latency, so by doing 32-bytes at a time we avoid a pipeline bubble in between every round.

For the 256-bit key case, it looks like we're doing more work in order to reload the KEY registers during the loop to make space for scarce temporaries. But the load dual issues with the AES operations so we get the KEY reloads essentially for free.

Before:

testing speed of ecb(aes) encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 264 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 231 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 329 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 715 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 4248 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 221 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 234 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 359 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 803 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 5366 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 209 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 255 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 379 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 938 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 6041 cycles (8192 bytes)

After:

testing speed of ecb(aes) encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 266 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 256 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 305 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 676 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 3981 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 210 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 233 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 340 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 766 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 5136 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 206 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 268 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 368 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 890 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 5718 cycles (8192 bytes)

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch')
-rw-r--r-- arch/sparc/crypto/aes_asm.S 166
1 files changed, 148 insertions, 18 deletions
diff --git a/arch/sparc/crypto/aes_asm.S b/arch/sparc/crypto/aes_asm.S
index 7a975d689919..33d59c66f1e2 100644
--- a/arch/sparc/crypto/aes_asm.S
+++ b/arch/sparc/crypto/aes_asm.S
@@ -48,6 +48,10 @@
48 .word 0x81b0230d; 48 .word 0x81b0230d;
49#define MOVXTOD_O5_F2 \ 49#define MOVXTOD_O5_F2 \
50 .word 0x85b0230d; 50 .word 0x85b0230d;
51#define MOVXTOD_G3_F60 \
52 .word 0xbbb02303;
53#define MOVXTOD_G7_F62 \
54 .word 0xbfb02307;
51 55
52#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \ 56#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
53 AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \ 57 AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \
@@ -55,12 +59,32 @@
55 AES_EROUND01(KEY_BASE + 4, T0, T1, I0) \ 59 AES_EROUND01(KEY_BASE + 4, T0, T1, I0) \
56 AES_EROUND23(KEY_BASE + 6, T0, T1, I1) 60 AES_EROUND23(KEY_BASE + 6, T0, T1, I1)
57 61
62#define ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
63 AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \
64 AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \
65 AES_EROUND01(KEY_BASE + 0, I2, I3, T2) \
66 AES_EROUND23(KEY_BASE + 2, I2, I3, T3) \
67 AES_EROUND01(KEY_BASE + 4, T0, T1, I0) \
68 AES_EROUND23(KEY_BASE + 6, T0, T1, I1) \
69 AES_EROUND01(KEY_BASE + 4, T2, T3, I2) \
70 AES_EROUND23(KEY_BASE + 6, T2, T3, I3)
71
58#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \ 72#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \
59 AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \ 73 AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \
60 AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \ 74 AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \
61 AES_EROUND01_L(KEY_BASE + 4, T0, T1, I0) \ 75 AES_EROUND01_L(KEY_BASE + 4, T0, T1, I0) \
62 AES_EROUND23_L(KEY_BASE + 6, T0, T1, I1) 76 AES_EROUND23_L(KEY_BASE + 6, T0, T1, I1)
63 77
78#define ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
79 AES_EROUND01(KEY_BASE + 0, I0, I1, T0) \
80 AES_EROUND23(KEY_BASE + 2, I0, I1, T1) \
81 AES_EROUND01(KEY_BASE + 0, I2, I3, T2) \
82 AES_EROUND23(KEY_BASE + 2, I2, I3, T3) \
83 AES_EROUND01_L(KEY_BASE + 4, T0, T1, I0) \
84 AES_EROUND23_L(KEY_BASE + 6, T0, T1, I1) \
85 AES_EROUND01_L(KEY_BASE + 4, T2, T3, I2) \
86 AES_EROUND23_L(KEY_BASE + 6, T2, T3, I3)
87
64 /* 10 rounds */ 88 /* 10 rounds */
65#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \ 89#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \
66 ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ 90 ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \
@@ -69,6 +93,13 @@
69 ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \ 93 ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
70 ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1) 94 ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
71 95
96#define ENCRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
97 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, T0, T1, T2, T3) \
98 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, T0, T1, T2, T3) \
99 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
100 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
101 ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
102
72 /* 12 rounds */ 103 /* 12 rounds */
73#define ENCRYPT_192(KEY_BASE, I0, I1, T0, T1) \ 104#define ENCRYPT_192(KEY_BASE, I0, I1, T0, T1) \
74 ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ 105 ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \
@@ -78,6 +109,14 @@
78 ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \ 109 ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
79 ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1) 110 ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
80 111
112#define ENCRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
113 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, T0, T1, T2, T3) \
114 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, T0, T1, T2, T3) \
115 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
116 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
117 ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \
118 ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
119
81 /* 14 rounds */ 120 /* 14 rounds */
82#define ENCRYPT_256(KEY_BASE, I0, I1, T0, T1) \ 121#define ENCRYPT_256(KEY_BASE, I0, I1, T0, T1) \
83 ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \ 122 ENCRYPT_TWO_ROUNDS(KEY_BASE + 0, I0, I1, T0, T1) \
@@ -88,6 +127,34 @@
88 ENCRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \ 127 ENCRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \
89 ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1) 128 ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
90 129
130#define ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \
131 ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \
132 TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6)
133
134#define ENCRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \
135 ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 0, I0, I1, I2, I3, KEY_BASE + 48) \
136 ldd [%o0 + 0xd0], %f56; \
137 ldd [%o0 + 0xd8], %f58; \
138 ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 8, I0, I1, I2, I3, KEY_BASE + 0) \
139 ldd [%o0 + 0xe0], %f60; \
140 ldd [%o0 + 0xe8], %f62; \
141 ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE + 0) \
142 ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE + 0) \
143 ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE + 0) \
144 ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE + 0) \
145 AES_EROUND01(KEY_BASE + 48, I0, I1, KEY_BASE + 0) \
146 AES_EROUND23(KEY_BASE + 50, I0, I1, KEY_BASE + 2) \
147 AES_EROUND01(KEY_BASE + 48, I2, I3, KEY_BASE + 4) \
148 AES_EROUND23(KEY_BASE + 50, I2, I3, KEY_BASE + 6) \
149 AES_EROUND01_L(KEY_BASE + 52, KEY_BASE + 0, KEY_BASE + 2, I0) \
150 AES_EROUND23_L(KEY_BASE + 54, KEY_BASE + 0, KEY_BASE + 2, I1) \
151 ldd [%o0 + 0x10], %f8; \
152 ldd [%o0 + 0x18], %f10; \
153 AES_EROUND01_L(KEY_BASE + 52, KEY_BASE + 4, KEY_BASE + 6, I2) \
154 AES_EROUND23_L(KEY_BASE + 54, KEY_BASE + 4, KEY_BASE + 6, I3) \
155 ldd [%o0 + 0x20], %f12; \
156 ldd [%o0 + 0x28], %f14;
157
91#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \ 158#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
92 AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \ 159 AES_DROUND23(KEY_BASE + 0, I0, I1, T1) \
93 AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \ 160 AES_DROUND01(KEY_BASE + 2, I0, I1, T0) \
@@ -832,10 +899,34 @@ ENDPROC(aes_sparc64_load_decrypt_keys_256)
832ENTRY(aes_sparc64_ecb_encrypt_128) 899ENTRY(aes_sparc64_ecb_encrypt_128)
833 /* %o0=key, %o1=input, %o2=output, %o3=len */ 900 /* %o0=key, %o1=input, %o2=output, %o3=len */
834 ldx [%o0 + 0x00], %g1 901 ldx [%o0 + 0x00], %g1
835 ldx [%o0 + 0x08], %g2 902 subcc %o3, 0x10, %o3
903 be 10f
904 ldx [%o0 + 0x08], %g2
8361: ldx [%o1 + 0x00], %g3 9051: ldx [%o1 + 0x00], %g3
837 ldx [%o1 + 0x08], %g7 906 ldx [%o1 + 0x08], %g7
838 add %o1, 0x10, %o1 907 ldx [%o1 + 0x10], %o4
908 ldx [%o1 + 0x18], %o5
909 xor %g1, %g3, %g3
910 xor %g2, %g7, %g7
911 MOVXTOD_G3_F4
912 MOVXTOD_G7_F6
913 xor %g1, %o4, %g3
914 xor %g2, %o5, %g7
915 MOVXTOD_G3_F60
916 MOVXTOD_G7_F62
917 ENCRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
918 std %f4, [%o2 + 0x00]
919 std %f6, [%o2 + 0x08]
920 std %f60, [%o2 + 0x10]
921 std %f62, [%o2 + 0x18]
922 sub %o3, 0x20, %o3
923 add %o1, 0x20, %o1
924 brgz %o3, 1b
925 add %o2, 0x20, %o2
926 brlz,pt %o3, 11f
927 nop
92810: ldx [%o1 + 0x00], %g3
929 ldx [%o1 + 0x08], %g7
839 xor %g1, %g3, %g3 930 xor %g1, %g3, %g3
840 xor %g2, %g7, %g7 931 xor %g2, %g7, %g7
841 MOVXTOD_G3_F4 932 MOVXTOD_G3_F4
@@ -843,10 +934,7 @@ ENTRY(aes_sparc64_ecb_encrypt_128)
843 ENCRYPT_128(8, 4, 6, 0, 2) 934 ENCRYPT_128(8, 4, 6, 0, 2)
844 std %f4, [%o2 + 0x00] 935 std %f4, [%o2 + 0x00]
845 std %f6, [%o2 + 0x08] 936 std %f6, [%o2 + 0x08]
846 subcc %o3, 0x10, %o3 93711: retl
847 bne,pt %xcc, 1b
848 add %o2, 0x10, %o2
849 retl
850 nop 938 nop
851ENDPROC(aes_sparc64_ecb_encrypt_128) 939ENDPROC(aes_sparc64_ecb_encrypt_128)
852 940
@@ -854,10 +942,34 @@ ENDPROC(aes_sparc64_ecb_encrypt_128)
854ENTRY(aes_sparc64_ecb_encrypt_192) 942ENTRY(aes_sparc64_ecb_encrypt_192)
855 /* %o0=key, %o1=input, %o2=output, %o3=len */ 943 /* %o0=key, %o1=input, %o2=output, %o3=len */
856 ldx [%o0 + 0x00], %g1 944 ldx [%o0 + 0x00], %g1
857 ldx [%o0 + 0x08], %g2 945 subcc %o3, 0x10, %o3
946 be 10f
947 ldx [%o0 + 0x08], %g2
8581: ldx [%o1 + 0x00], %g3 9481: ldx [%o1 + 0x00], %g3
859 ldx [%o1 + 0x08], %g7 949 ldx [%o1 + 0x08], %g7
860 add %o1, 0x10, %o1 950 ldx [%o1 + 0x10], %o4
951 ldx [%o1 + 0x18], %o5
952 xor %g1, %g3, %g3
953 xor %g2, %g7, %g7
954 MOVXTOD_G3_F4
955 MOVXTOD_G7_F6
956 xor %g1, %o4, %g3
957 xor %g2, %o5, %g7
958 MOVXTOD_G3_F60
959 MOVXTOD_G7_F62
960 ENCRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
961 std %f4, [%o2 + 0x00]
962 std %f6, [%o2 + 0x08]
963 std %f60, [%o2 + 0x10]
964 std %f62, [%o2 + 0x18]
965 sub %o3, 0x20, %o3
966 add %o1, 0x20, %o1
967 brgz %o3, 1b
968 add %o2, 0x20, %o2
969 brlz,pt %o3, 11f
970 nop
97110: ldx [%o1 + 0x00], %g3
972 ldx [%o1 + 0x08], %g7
861 xor %g1, %g3, %g3 973 xor %g1, %g3, %g3
862 xor %g2, %g7, %g7 974 xor %g2, %g7, %g7
863 MOVXTOD_G3_F4 975 MOVXTOD_G3_F4
@@ -865,10 +977,7 @@ ENTRY(aes_sparc64_ecb_encrypt_192)
865 ENCRYPT_192(8, 4, 6, 0, 2) 977 ENCRYPT_192(8, 4, 6, 0, 2)
866 std %f4, [%o2 + 0x00] 978 std %f4, [%o2 + 0x00]
867 std %f6, [%o2 + 0x08] 979 std %f6, [%o2 + 0x08]
868 subcc %o3, 0x10, %o3 98011: retl
869 bne,pt %xcc, 1b
870 add %o2, 0x10, %o2
871 retl
872 nop 981 nop
873ENDPROC(aes_sparc64_ecb_encrypt_192) 982ENDPROC(aes_sparc64_ecb_encrypt_192)
874 983
@@ -876,10 +985,34 @@ ENDPROC(aes_sparc64_ecb_encrypt_192)
876ENTRY(aes_sparc64_ecb_encrypt_256) 985ENTRY(aes_sparc64_ecb_encrypt_256)
877 /* %o0=key, %o1=input, %o2=output, %o3=len */ 986 /* %o0=key, %o1=input, %o2=output, %o3=len */
878 ldx [%o0 + 0x00], %g1 987 ldx [%o0 + 0x00], %g1
879 ldx [%o0 + 0x08], %g2 988 subcc %o3, 0x10, %o3
989 be 10f
990 ldx [%o0 + 0x08], %g2
8801: ldx [%o1 + 0x00], %g3 9911: ldx [%o1 + 0x00], %g3
881 ldx [%o1 + 0x08], %g7 992 ldx [%o1 + 0x08], %g7
882 add %o1, 0x10, %o1 993 ldx [%o1 + 0x10], %o4
994 ldx [%o1 + 0x18], %o5
995 xor %g1, %g3, %g3
996 xor %g2, %g7, %g7
997 MOVXTOD_G3_F4
998 MOVXTOD_G7_F6
999 xor %g1, %o4, %g3
1000 xor %g2, %o5, %g7
1001 MOVXTOD_G3_F0
1002 MOVXTOD_G7_F2
1003 ENCRYPT_256_2(8, 4, 6, 0, 2)
1004 std %f4, [%o2 + 0x00]
1005 std %f6, [%o2 + 0x08]
1006 std %f0, [%o2 + 0x10]
1007 std %f2, [%o2 + 0x18]
1008 sub %o3, 0x20, %o3
1009 add %o1, 0x20, %o1
1010 brgz %o3, 1b
1011 add %o2, 0x20, %o2
1012 brlz,pt %o3, 11f
1013 nop
101410: ldx [%o1 + 0x00], %g3
1015 ldx [%o1 + 0x08], %g7
883 xor %g1, %g3, %g3 1016 xor %g1, %g3, %g3
884 xor %g2, %g7, %g7 1017 xor %g2, %g7, %g7
885 MOVXTOD_G3_F4 1018 MOVXTOD_G3_F4
@@ -887,10 +1020,7 @@ ENTRY(aes_sparc64_ecb_encrypt_256)
887 ENCRYPT_256(8, 4, 6, 0, 2) 1020 ENCRYPT_256(8, 4, 6, 0, 2)
888 std %f4, [%o2 + 0x00] 1021 std %f4, [%o2 + 0x00]
889 std %f6, [%o2 + 0x08] 1022 std %f6, [%o2 + 0x08]
890 subcc %o3, 0x10, %o3 102311: retl
891 bne,pt %xcc, 1b
892 add %o2, 0x10, %o2
893 retl
894 nop 1024 nop
895ENDPROC(aes_sparc64_ecb_encrypt_256) 1025ENDPROC(aes_sparc64_ecb_encrypt_256)
896 1026