include/linux/coff.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351

/* This file is derived from the GAS 2.1.4 assembler control file.
   The GAS product is under the GNU General Public License, version 2 or later.
   As such, this file is also under that license.

   If the file format changes in the COFF object, this file should be
   subsequently updated to reflect the changes.

   The actual loader module only uses a few of these structures. The full
   set is documented here because the full set is what I received. If you
   want more information about COFF, O'Reilly publishes an excellent book.
*/

/*
 * Array-size constants without the COFF_ prefix.  The structure declarations
 * in this file size their arrays with these names (e_name, x_fname, x_dimen).
 * COFF_E_SYMNMLEN, COFF_E_FILNMLEN and COFF_E_DIMNUM, defined in the SYMBOLS
 * section, carry the same values.
 */
#define  E_SYMNMLEN  8   /* Number of characters in a symbol name         */
#define  E_FILNMLEN 14   /* Number of characters in a file name           */
#define  E_DIMNUM    4   /* Number of array dimensions in auxiliary entry */

/*
 * These definitions are byte-order independent. The file format permits no
 * alignment padding between the fields of a structure, so every field is
 * declared as a character array and its value is assembled from the
 * individual bytes. This also keeps the code independent of the host's
 * "endianness".
 */
 
/*
 * Load a 16-bit little-endian value from the byte array `ps`
 * (ps[0] is the low byte, ps[1] the high byte).  The result has type short.
 * The argument is parenthesized so that pointer expressions such as
 * `buf + 2` are indexed correctly; it is evaluated twice, so it must not
 * have side effects.
 */
#define COFF_SHORT_L(ps) ((short)(((unsigned short)((unsigned char)(ps)[1])<<8)|\
				  ((unsigned short)((unsigned char)(ps)[0]))))

/*
 * Load a 32-bit little-endian value from the byte array `ps`
 * (ps[0] is the least significant byte, ps[3] the most significant).
 * The result has type long.  The argument is parenthesized so that pointer
 * expressions such as `buf + 4` are indexed correctly; it is evaluated four
 * times, so it must not have side effects.
 */
#define COFF_LONG_L(ps) (((long)(((unsigned long)((unsigned char)(ps)[3])<<24) |\
				 ((unsigned long)((unsigned char)(ps)[2])<<16) |\
				 ((unsigned long)((unsigned char)(ps)[1])<<8)  |\
				 ((unsigned long)((unsigned char)(ps)[0])))))
 
/*
 * Load a 16-bit big-endian value from the byte array `ps`
 * (ps[0] is the high byte, ps[1] the low byte).  The result has type short.
 * The argument is parenthesized so that pointer expressions such as
 * `buf + 2` are indexed correctly; it is evaluated twice, so it must not
 * have side effects.
 */
#define COFF_SHORT_H(ps) ((short)(((unsigned short)((unsigned char)(ps)[0])<<8)|\
				  ((unsigned short)((unsigned char)(ps)[1]))))

/*
 * Load a 32-bit big-endian value from the byte array `ps`
 * (ps[0] is the most significant byte, ps[3] the least significant).
 * The result has type long.  The argument is parenthesized so that pointer
 * expressions such as `buf + 4` are indexed correctly; it is evaluated four
 * times, so it must not have side effects.
 */
#define COFF_LONG_H(ps) (((long)(((unsigned long)((unsigned char)(ps)[0])<<24) |\
				 ((unsigned long)((unsigned char)(ps)[1])<<16) |\
				 ((unsigned long)((unsigned char)(ps)[2])<<8)  |\
				 ((unsigned long)((unsigned char)(ps)[3])))))

/* These may be overridden later by brain dead implementations which generate
   a big-endian header with little-endian data. In that case, generate a
   replacement macro which tests a flag and uses either of the two above
   as appropriate. */

/* Default field accessors: both select the little-endian loaders. */
#define COFF_LONG(v)   COFF_LONG_L(v)
#define COFF_SHORT(v)  COFF_SHORT_L(v)

/*** coff information for Intel 386/486.  */

/********************** FILE HEADER **********************/

/*
 * COFF file header.  Every field is a raw byte array (20 bytes in total);
 * read 2-byte fields with COFF_SHORT() and 4-byte fields with COFF_LONG().
 */
struct COFF_filehdr {
	char f_magic[2];	/* magic number			*/
	char f_nscns[2];	/* number of sections		*/
	char f_timdat[4];	/* time & date stamp		*/
	char f_symptr[4];	/* file pointer to symtab	*/
	char f_nsyms[4];	/* number of symtab entries	*/
	char f_opthdr[2];	/* sizeof(optional hdr)		*/
	char f_flags[2];	/* flags			*/
};

/*
 *   Bits for f_flags:
 *
 *	F_RELFLG	relocation info stripped from file
 *	F_EXEC		file is executable  (i.e. no unresolved external
 *			references)
 *	F_LNNO		line numbers stripped from file
 *	F_LSYMS		local symbols stripped from file
 *	F_MINMAL	this is a minimal object file (".m") output of fextract
 *	F_UPDATE	this is a fully bound update file, output of ogen
 *	F_SWABD		this file has had its bytes swabbed (in names)
 *	F_AR16WR	this file has the byte ordering of an AR16WR
 *			(e.g. 11/70) machine
 *	F_AR32WR	this file has the byte ordering of an AR32WR machine
 *			(e.g. vax and iNTEL 386)
 *	F_AR32W		this file has the byte ordering of an AR32W machine
 *			(e.g. 3b,maxi)
 *	F_PATCH		file contains "patch" list in optional header
 *	F_NODF		(minimal file only) no decision functions for
 *			replaced functions
 */

/*
 * Bit values for f_flags, written in octal (leading 0).
 * NOTE(review): COFF_F_PATCH and COFF_F_NODF are defined with the same
 * value (0002000), so the two flags cannot be told apart by a bit test --
 * confirm against the COFF specification whether this is intended.
 */
#define  COFF_F_RELFLG		0000001
#define  COFF_F_EXEC		0000002
#define  COFF_F_LNNO		0000004
#define  COFF_F_LSYMS		0000010
#define  COFF_F_MINMAL		0000020
#define  COFF_F_UPDATE		0000040
#define  COFF_F_SWABD		0000100
#define  COFF_F_AR16WR		0000200
#define  COFF_F_AR32WR		0000400
#define  COFF_F_AR32W		0001000
#define  COFF_F_PATCH		0002000
#define  COFF_F_NODF		0002000

/* Value expected in f_magic for an i386 COFF file. */
#define	COFF_I386MAGIC	        0x14c   /* Linux's system    */

/*
 * COFF_I386BADMAG(x) takes a struct COFF_filehdr (not a pointer) and is
 * non-zero when its f_magic field is not an accepted magic number.  The
 * variant that also accepts the PTX and AIX magic numbers is compiled out.
 */
#if 0   /* Perhaps, someday, these formats may be used.      */
#define COFF_I386PTXMAGIC	0x154
#define COFF_I386AIXMAGIC	0x175   /* IBM's AIX system  */
#define COFF_I386BADMAG(x) ((COFF_SHORT((x).f_magic) != COFF_I386MAGIC) \
			  && COFF_SHORT((x).f_magic) != COFF_I386PTXMAGIC \
			  && COFF_SHORT((x).f_magic) != COFF_I386AIXMAGIC)
#else
#define COFF_I386BADMAG(x) (COFF_SHORT((x).f_magic) != COFF_I386MAGIC)
#endif

/* Shorthand for the file header type and its size in bytes. */
#define	COFF_FILHDR	struct COFF_filehdr
#define	COFF_FILHSZ	sizeof(COFF_FILHDR)

/********************** AOUT "OPTIONAL HEADER" **********************/

/* Linux COFF must have this "optional" header. Standard COFF has no entry
   location for the "entry" point. They normally would start with the first
   location of the .text section. This is not a good idea for linux. So,
   the use of this "optional" header is not optional. It is required.

   Do not be tempted to assume that the size of the optional header is
   a constant and simply index the next byte by the size of this structure.
   Use the 'f_opthdr' field in the main coff header for the size of the
   structure actually written to the file!!
*/

/*
 * a.out-style "optional" header.  Every field is a raw byte array (28 bytes
 * in total); read 2-byte fields with COFF_SHORT() and 4-byte fields with
 * COFF_LONG().
 */
typedef struct 
{
  char 	magic[2];		/* type of file				 */
  char	vstamp[2];		/* version stamp			 */
  char	tsize[4];		/* text size in bytes, padded to FW bdry */
  char	dsize[4];		/* initialized   data "   "		 */
  char	bsize[4];		/* uninitialized data "   "		 */
  char	entry[4];		/* entry pt.				 */
  char 	text_start[4];		/* base of text used for this file       */
  char 	data_start[4];		/* base of data used for this file       */
}
COFF_AOUTHDR;

/* Size of the structure above; the size actually written to a file is f_opthdr. */
#define COFF_AOUTSZ (sizeof(COFF_AOUTHDR))

/*
 * Magic numbers, written in octal (leading 0).
 * NOTE(review): presumably the values stored in COFF_AOUTHDR.magic -- confirm.
 */
#define COFF_STMAGIC	0401
#define COFF_OMAGIC     0404
#define COFF_JMAGIC     0407    /* dirty text and data image, can't share  */
#define COFF_DMAGIC     0410    /* dirty text segment, data aligned        */
#define COFF_ZMAGIC     0413    /* The proper magic number for executables */
#define COFF_SHMAGIC	0443	/* shared library header                   */

/********************** SECTION HEADER **********************/

/*
 * Section header.  Every field is a raw byte array (40 bytes in total);
 * read 2-byte fields with COFF_SHORT() and 4-byte fields with COFF_LONG().
 */
struct COFF_scnhdr {
  char		s_name[8];	/* section name			    */
  char		s_paddr[4];	/* physical address, aliased s_nlib */
  char		s_vaddr[4];	/* virtual address		    */
  char		s_size[4];	/* section size			    */
  char		s_scnptr[4];	/* file ptr to raw data for section */
  char		s_relptr[4];	/* file ptr to relocation	    */
  char		s_lnnoptr[4];	/* file ptr to line numbers	    */
  char		s_nreloc[2];	/* number of relocation entries	    */
  char		s_nlnno[2];	/* number of line number entries    */
  char		s_flags[4];	/* flags			    */
};

/* Shorthand for the section header type and its size in bytes. */
#define	COFF_SCNHDR	struct COFF_scnhdr
#define	COFF_SCNHSZ	sizeof(COFF_SCNHDR)

/*
 * names of "special" sections
 */

/* Section name strings. */
#define COFF_TEXT	".text"
#define COFF_DATA	".data"
#define COFF_BSS	".bss"
#define COFF_COMMENT    ".comment"
#define COFF_LIB        ".lib"

/* NOTE(review): presumably indices into the loader's section table -- confirm. */
#define COFF_SECT_TEXT  0      /* Section for instruction code             */
#define COFF_SECT_DATA  1      /* Section for initialized globals          */
#define COFF_SECT_BSS   2      /* Section for un-initialized globals       */
#define COFF_SECT_REQD  3      /* Minimum number of sections for good file */

/*
 * Section type values.
 * NOTE(review): presumably bit flags stored in s_flags of COFF_scnhdr -- confirm.
 */
#define COFF_STYP_REG     0x00 /* regular segment                          */
#define COFF_STYP_DSECT   0x01 /* dummy segment                            */
#define COFF_STYP_NOLOAD  0x02 /* no-load segment                          */
#define COFF_STYP_GROUP   0x04 /* group segment                            */
#define COFF_STYP_PAD     0x08 /* .pad segment                             */
#define COFF_STYP_COPY    0x10 /* copy section                             */
#define COFF_STYP_TEXT    0x20 /* .text segment                            */
#define COFF_STYP_DATA    0x40 /* .data segment                            */
#define COFF_STYP_BSS     0x80 /* .bss segment                             */
#define COFF_STYP_INFO   0x200 /* .comment section                         */
#define COFF_STYP_OVER   0x400 /* overlay section                          */
#define COFF_STYP_LIB    0x800 /* library section                          */

/*
 * Shared libraries have the following section header in the data field for
 * each library.
 */

/* Shared-library entry header: two 4-byte fields, read with COFF_LONG(). */
struct COFF_slib {
  char		sl_entsz[4];	/* Size of this entry               */
  char		sl_pathndx[4];	/* size of the header field         */
};

/* Shorthand for the entry header type and its size in bytes. */
#define	COFF_SLIBHD	struct COFF_slib
#define	COFF_SLIBSZ	sizeof(COFF_SLIBHD)

/********************** LINE NUMBERS **********************/

/* 1 line number entry for every "breakpointable" source line in a section.
 * Line numbers are grouped on a per function basis; first entry in a function
 * grouping will have l_lnno = 0 and in place of physical address will be the
 * symbol table index of the function name.
 */

/*
 * Line number entry: a 4-byte field (symbol index or address, selected by
 * l_lnno) followed by a 2-byte line number.
 */
struct COFF_lineno {
  union {
    char l_symndx[4];	/* function name symbol index, iff l_lnno == 0*/
    char l_paddr[4];	/* (physical) address of line number	*/
  } l_addr;
  char l_lnno[2];	/* line number		*/
};

#define	COFF_LINENO	struct COFF_lineno
/* Entry size spelled as a literal (4 + 2 bytes) rather than with sizeof. */
#define	COFF_LINESZ	6

/********************** SYMBOLS **********************/

/*
 * COFF_-prefixed size constants.  They have the same values as E_SYMNMLEN,
 * E_FILNMLEN and E_DIMNUM near the top of this file; the structures below
 * use the un-prefixed names.
 */
#define COFF_E_SYMNMLEN	 8	/* # characters in a short symbol name	*/
#define COFF_E_FILNMLEN	14	/* # characters in a file name		*/
#define COFF_E_DIMNUM	 4	/* # array dimensions in auxiliary entry */

/*
 *  All symbols and sections have the following definition
 */

/*
 * Symbol table entry (18 bytes, matching COFF_SYMESZ).  The leading 8 bytes
 * are either the name itself (e.e_name) or a 4-byte zero field followed by a
 * 4-byte offset (e.e.e_zeroes / e.e.e_offset); note that both the union and
 * the struct inside it are named `e`.
 */
struct COFF_syment 
{
  union {
    char e_name[E_SYMNMLEN];    /* Symbol name (first 8 characters) */
    struct {
      char e_zeroes[4];         /* Leading zeros */
      char e_offset[4];         /* Offset if this is a header section */
    } e;
  } e;

  char e_value[4];              /* Value (address) of the segment */
  char e_scnum[2];              /* Section number */
  char e_type[2];               /* Type of section */
  char e_sclass[1];             /* Loader class */
  char e_numaux[1];             /* Number of auxiliary entries which follow */
};

/*
 * Masks and shift counts for decoding a packed type value.
 * NOTE(review): presumably applied to the e_type field of COFF_syment --
 * confirm against the COFF specification.
 */
#define COFF_N_BTMASK	(0xf)   /* Mask for important class bits */
#define COFF_N_TMASK	(0x30)  /* Mask for important type bits  */
#define COFF_N_BTSHFT	(4)     /* # bits to shift class field   */
#define COFF_N_TSHIFT	(2)     /* # bits to shift type field    */

/*
 *  Auxiliary entries because the main table is too limiting.
 */
  
/*
 * Auxiliary symbol table entry.  The four members overlay the same storage;
 * the largest, x_sym, is 18 bytes (4 + 4 + 8 + 2), matching COFF_AUXESZ.
 * As elsewhere in this file, every field is a raw byte array.
 */
union COFF_auxent {

/*
 *  Debugger information
 */

  struct {
    char x_tagndx[4];	        /* str, un, or enum tag indx */
    union {
      struct {
	char  x_lnno[2];        /* declaration line number */
	char  x_size[2];        /* str/union/array size */
      } x_lnsz;
      char x_fsize[4];	        /* size of function */
    } x_misc;

    union {
      struct {		        /* if ISFCN, tag, or .bb */
	char x_lnnoptr[4];	/* ptr to fcn line # */
	char x_endndx[4];	/* entry ndx past block end */
      } x_fcn;

      struct {		        /* if ISARY, up to 4 dimen. */
	char x_dimen[E_DIMNUM][2];
      } x_ary;
    } x_fcnary;

    char x_tvndx[2];	/* tv index */
  } x_sym;

/*
 *   Source file names (debugger information)
 */

  union {
    char x_fname[E_FILNMLEN];
    struct {
      char x_zeroes[4];
      char x_offset[4];
    } x_n;
  } x_file;

/*
 *   Section information
 */

  struct {
    char x_scnlen[4];	/* section length */
    char x_nreloc[2];	/* # relocation entries */
    char x_nlinno[2];	/* # line numbers */
  } x_scn;

/*
 *   Transfer vector (branch table)
 */
  
  struct {
    char x_tvfill[4];	/* tv fill value */
    char x_tvlen[2];	/* length of .tv */
    char x_tvran[2][2];	/* tv range */
  } x_tv;		/* info about .tv section (in auxent of symbol .tv) */
};

#define	COFF_SYMENT	struct COFF_syment
#define	COFF_SYMESZ	18	
#define	COFF_AUXENT	union COFF_auxent
#define	COFF_AUXESZ	18

#define COFF_ETEXT	"etext"

/********************** RELOCATION DIRECTIVES **********************/

/*
 * Relocation entry: two 4-byte fields and one 2-byte field, all stored as
 * raw byte arrays (10 bytes in total).
 */
struct COFF_reloc {
  char r_vaddr[4];        /* Virtual address of item    */
  char r_symndx[4];       /* Symbol index in the symtab */
  char r_type[2];         /* Relocation type            */
};

#define COFF_RELOC struct COFF_reloc
/* Entry size spelled as a literal (4 + 4 + 2 bytes) rather than with sizeof. */
#define COFF_RELSZ 10

/*
 * Default section alignments; all four are 4.
 * NOTE(review): the unit (presumably bytes) is not stated here -- confirm.
 */
#define COFF_DEF_DATA_SECTION_ALIGNMENT  4
#define COFF_DEF_BSS_SECTION_ALIGNMENT   4
#define COFF_DEF_TEXT_SECTION_ALIGNMENT  4

/* For new sections we haven't heard of before */
#define COFF_DEF_SECTION_ALIGNMENT       4
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
# granted.
# ====================================================================

# Bit-sliced AES for ARM NEON
#
# February 2012.
#
# This implementation is a direct adaptation of the bsaes-x86_64 module
# for ARM NEON, except that this module is endian-neutral [in the sense
# that it can be compiled for either endianness] by courtesy of vld1.8's
# neutrality. The initial version doesn't implement an interface to
# OpenSSL, only low-level primitives and unsupported entry points, just
# enough to collect performance results, which for a Cortex-A8 core are:
#
# encrypt	19.5 cycles per byte processed with 128-bit key
# decrypt	22.1 cycles per byte processed with 128-bit key
# key conv.	440  cycles per 128-bit key/0.18 of 8x block
#
# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
# which is [much] worse than anticipated (for further details see
# http://www.openssl.org/~appro/Snapdragon-S4.html).
#
# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
# manages in 20.0 cycles].
#
# When comparing to x86_64 results keep in mind that NEON unit is
# [mostly] single-issue and thus can't [fully] benefit from
# instruction-level parallelism. And when comparing to aes-armv4
# results keep in mind key schedule conversion overhead (see
# bsaes-x86_64.pl for further details)...
#
#						<appro@openssl.org>

# April-August 2013
#
# Add CBC, CTR and XTS subroutines, adapt for kernel use.
#
#					<ard.biesheuvel@linaro.org>

# Scan the command line for the first argument that looks like an output
# file name ("name.ext"); every argument in front of it is discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Send the generated code to that file.  When no such argument was given
# $output is false and the code keeps going to the inherited STDOUT.  A file
# that cannot be opened is a fatal error rather than a silent fallback to
# the terminal.
if ($output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register names of the public entry points' arguments, and the names of the
# sixteen NEON quad registers q0..q15.
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
my @XMM=map("q$_",(0..15));

{
my ($key,$rounds,$const)=("r4","r5","r6");

# Dlo("qN") returns "d(2N)", the name of the D register that aliases the low
# half of NEON register qN, or "" when the argument contains no q-register
# name.  The sub takes one argument, so it is declared without the empty
# prototype and can be called as Dlo(...) as well as &Dlo(...).
sub Dlo     { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
# Dhi("qN") returns "d(2N+1)", the name of the D register that aliases the
# high half of NEON register qN, or "" when the argument contains no
# q-register name.  The sub takes one argument, so it is declared without the
# empty prototype and can be called as Dhi(...) as well as &Dhi(...).
sub Dhi     { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub Sbox {
# Emit the S-box for eight bit-sliced registers: input basis change,
# GF(2^8) inversion, output basis change.
#
# Arguments: 8 data registers, 4 "t" temporaries, 4 "s" temporaries.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
	my @bits  = @_[0 .. 7];
	my @temps = @_[8 .. 11];
	my @spare = @_[12 .. 15];

	InBasisChange(@bits);
	# Each stage receives the registers in the order the previous one
	# left them.
	Inv_GF256(@bits[6, 5, 0, 3, 7, 1, 4, 2], @temps, @spare);
	OutBasisChange(@bits[7, 1, 4, 2, 6, 5, 0, 3]);
}

sub InBasisChange {
# Append to $code the 13-instruction veor (XOR) network that Sbox runs as
# its first step.  The eight register names passed in @_[0..7] are updated
# in place; no temporaries are used.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	veor	@b[2], @b[2], @b[1]
	veor	@b[5], @b[5], @b[6]
	veor	@b[3], @b[3], @b[0]
	veor	@b[6], @b[6], @b[2]
	veor	@b[5], @b[5], @b[0]

	veor	@b[6], @b[6], @b[3]
	veor	@b[3], @b[3], @b[7]
	veor	@b[7], @b[7], @b[5]
	veor	@b[3], @b[3], @b[4]
	veor	@b[4], @b[4], @b[5]

	veor	@b[2], @b[2], @b[7]
	veor	@b[3], @b[3], @b[1]
	veor	@b[1], @b[1], @b[5]
___
}

sub OutBasisChange {
# Append to $code the 11-instruction veor (XOR) network that Sbox runs as
# its last step.  The eight register names passed in @_[0..7] are updated
# in place; no temporaries are used.
#
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	veor	@b[0], @b[0], @b[6]
	veor	@b[1], @b[1], @b[4]
	veor	@b[4], @b[4], @b[6]
	veor	@b[2], @b[2], @b[0]
	veor	@b[6], @b[6], @b[1]

	veor	@b[1], @b[1], @b[5]
	veor	@b[5], @b[5], @b[3]
	veor	@b[3], @b[3], @b[7]
	veor	@b[7], @b[7], @b[5]
	veor	@b[2], @b[2], @b[5]

	veor	@b[4], @b[4], @b[7]
___
}

sub InvSbox {
# Emit the inverse S-box for eight bit-sliced registers: inverse input
# basis change, GF(2^8) inversion, inverse output basis change.
#
# Arguments: 8 data registers, 4 "t" temporaries, 4 "s" temporaries.
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
	my @bits  = @_[0 .. 7];
	my @temps = @_[8 .. 11];
	my @spare = @_[12 .. 15];

	InvInBasisChange(@bits);
	# Each stage receives the registers in the order the previous one
	# left them.
	Inv_GF256(@bits[5, 1, 2, 6, 3, 7, 0, 4], @temps, @spare);
	InvOutBasisChange(@bits[3, 7, 0, 4, 5, 1, 2, 6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
# Append to $code the 12-instruction veor (XOR) network that InvSbox runs as
# its first step.  The eight register names in @_ are first permuted into
# @b, then updated in place; no temporaries are used.
my @b=@_[5,1,2,6,3,7,0,4];
# The statement is terminated with ';' like every other emitter in this
# file, so that code can safely be appended after the template.
$code.=<<___;
	 veor	@b[1], @b[1], @b[7]
	veor	@b[4], @b[4], @b[7]

	veor	@b[7], @b[7], @b[5]
	 veor	@b[1], @b[1], @b[3]
	veor	@b[2], @b[2], @b[5]
	veor	@b[3], @b[3], @b[7]

	veor	@b[6], @b[6], @b[1]
	veor	@b[2], @b[2], @b[0]
	 veor	@b[5], @b[5], @b[3]
	veor	@b[4], @b[4], @b[6]
	veor	@b[0], @b[0], @b[6]
	veor	@b[1], @b[1], @b[4]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
# Append to $code the 13-instruction veor (XOR) network that InvSbox runs as
# its last step.  The eight register names in @_ are first permuted into
# @b, then updated in place; no temporaries are used.
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	veor	@b[1], @b[1], @b[5]
	veor	@b[2], @b[2], @b[7]

	veor	@b[3], @b[3], @b[1]
	veor	@b[4], @b[4], @b[5]
	veor	@b[7], @b[7], @b[5]
	veor	@b[3], @b[3], @b[4]
	 veor 	@b[5], @b[5], @b[0]
	veor	@b[3], @b[3], @b[7]
	 veor	@b[6], @b[6], @b[2]
	 veor	@b[2], @b[2], @b[1]
	veor	@b[6], @b[6], @b[3]

	veor	@b[3], @b[3], @b[0]
	veor	@b[5], @b[5], @b[6]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0,t1       *
#;*************************************************************
# Appends 7 instructions to $code.  $x0 and $x1 are overwritten with the
# result, $y0 and $y1 are only read, and both $t0 and $t1 are clobbered.
my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
$code.=<<___;
	veor 	$t0, $y0, $y1
	vand	$t0, $t0, $x0
	veor	$x0, $x0, $x1
	vand	$t1, $x1, $y0
	vand	$x0, $x0, $y1
	veor	$x1, $t1, $t0
	veor	$x0, $x0, $t1
___
}

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
#
# Appends 7 instructions to $code.  $x0 and $x1 are overwritten with the
# result, $y0 and $y1 are only read, and $t0 is clobbered.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	veor	$t0, $y0, $y1
	vand	$t0, $t0, $x0
	veor	$x0, $x0, $x1
	vand	$x1, $x1, $y0
	vand	$x0, $x0, $y1
	veor	$x1, $x1, $x0
	veor	$x0, $x0, $t0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
#
# Appends 14 instructions to $code, alternating between two independent
# streams:
#   - ($x0,$x1,$y0,$y1,$t0): the same instruction sequence as Mul_GF4_N;
#   - ($x2,$x3,$y2,$y3,$t1): a GF4 multiply that needs only one temporary.
#     Its instructions are the ones indented by an extra space.
# $x0-$x3 are overwritten, $y0-$y3 are only read, $t0 and $t1 are clobbered.
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	veor	$t0, $y0, $y1
	 veor 	$t1, $y2, $y3
	vand	$t0, $t0, $x0
	 vand	$t1, $t1, $x2
	veor	$x0, $x0, $x1
	 veor	$x2, $x2, $x3
	vand	$x1, $x1, $y0
	 vand	$x3, $x3, $y2
	vand	$x0, $x0, $y1
	 vand	$x2, $x2, $y3
	veor	$x1, $x1, $x0
	 veor	$x2, $x2, $x3
	veor	$x0, $x0, $t0
	 veor	$x3, $x3, $t1
___
}
sub Mul_GF16_2 {
# Emit two multiplications that share one multiplier: @x[0..3] and @x[4..7]
# are each multiplied by @y[0..3], and the products replace @x.
#
# Arguments: 8 "x" registers, 4 "y" registers, 4 "t" temporaries.
# @y[0] and @y[1] are XORed with @y[2] and @y[3] twice, so @y holds its
# original value again on exit.  All four temporaries are clobbered.
my @x = @_[0 .. 7];
my @y = @_[8 .. 11];
my @t = @_[12 .. 15];

# First product: @x[0..3].
$code.=<<___;
	veor	@t[0], @x[0], @x[2]
	veor	@t[1], @x[1], @x[3]
___
	Mul_GF4($x[0], $x[1], $y[0], $y[1], @t[2, 3]);
$code.=<<___;
	veor	@y[0], @y[0], @y[2]
	veor	@y[1], @y[1], @y[3]
___
	Mul_GF4_N_GF4($t[0], $t[1], $y[0], $y[1], $t[3],
		      $x[2], $x[3], $y[2], $y[3], $t[2]);
$code.=<<___;
	veor	@x[0], @x[0], @t[0]
	veor	@x[2], @x[2], @t[0]
	veor	@x[1], @x[1], @t[1]
	veor	@x[3], @x[3], @t[1]

	veor	@t[0], @x[4], @x[6]
	veor	@t[1], @x[5], @x[7]
___
	# Second product: @x[4..7], same steps in the opposite order.
	Mul_GF4_N_GF4($t[0], $t[1], $y[0], $y[1], $t[3],
		      $x[6], $x[7], $y[2], $y[3], $t[2]);
$code.=<<___;
	veor	@y[0], @y[0], @y[2]
	veor	@y[1], @y[1], @y[3]
___
	Mul_GF4($x[4], $x[5], $y[0], $y[1], @t[2, 3]);
$code.=<<___;
	veor	@x[4], @x[4], @t[0]
	veor	@x[6], @x[6], @t[0]
	veor	@x[5], @x[5], @t[1]
	veor	@x[7], @x[7], @t[1]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	veor	@t[3], @x[4], @x[6]
	veor	@t[2], @x[5], @x[7]
	veor	@t[1], @x[1], @x[3]
	veor	@s[1], @x[7], @x[6]
	 vmov	@t[0], @t[2]
	veor	@s[0], @x[0], @x[2]

	vorr	@t[2], @t[2], @t[1]
	veor	@s[3], @t[3], @t[0]
	vand	@s[2], @t[3], @s[0]
	vorr	@t[3], @t[3], @s[0]
	veor	@s[0], @s[0], @t[1]
	vand	@t[0], @t[0], @t[1]
	veor	@t[1], @x[3], @x[2]
	vand	@s[3], @s[3], @s[0]
	vand	@s[1], @s[1], @t[1]
	veor	@t[1], @x[4], @x[5]
	veor	@s[0], @x[1], @x[0]
	veor	@t[3], @t[3], @s[1]
	veor	@t[2], @t[2], @s[1]
	vand	@s[1], @t[1], @s[0]
	vorr	@t[1], @t[1], @s[0]
	veor	@t[3], @t[3], @s[3]
	veor	@t[0], @t[0], @s[1]
	veor	@t[2], @t[2], @s[2]
	veor	@t[1], @t[1], @s[3]
	veor	@t[0], @t[0], @s[2]
	vand	@s[0], @x[7], @x[3]
	veor	@t[1], @t[1], @s[2]
	vand	@s[1], @x[6], @x[2]
	vand	@s[2], @x[5], @x[1]
	vorr	@s[3], @x[4], @x[0]
	veor	@t[3], @t[3], @s[0]
	veor	@t[1], @t[1], @s[2]
	veor	@t[0], @t[0], @s[3]
	veor	@t[2], @t[2], @s[1]

	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	@ new smaller inversion

	vand	@s[2], @t[3], @t[1]
	vmov	@s[0], @t[0]

	veor	@s[1], @t[2], @s[2]
	veor	@s[3], @t[0], @s[2]
	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]

	vbsl	@s[1], @t[1], @t[0]
	vbsl	@s[3], @t[3], @t[2]
	veor	@t[3], @t[3], @t[2]

	vbsl	@s[0], @s[1], @s[2]