/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19
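
	// Register roles: SHASH holds the hash key H; in the p64 code path
	// HH, HH3 and HH4 hold further powers of H (H^2, H^3, H^4), which
	// the C glue code is expected to store right after H in the key
	// structure, and SHASH2/HH34 hold the folded (hi ^ lo) halves used
	// for the Karatsuba middle products. XL, XM and XH accumulate the
	// low, middle and high partial products of each GF(2^128) multiply.
	// The XL2..HH34 aliases reuse v8-v19, which otherwise serve as the
	// k*/t*/perm*/sh*/ss* scratch registers of the p8 fallback only.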

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

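	// Fallback 64x64 -> 128 bit carryless multiply for CPUs that only
	// implement the 8x8 -> 16 bit variant of PMULL: each 64-bit operand
	// is treated as eight 8-bit limbs, and byte-rotated copies of the
	// operands (A1..A3, B1..B4) let all partial products be formed with
	// full-width pmull instructions before being recombined below.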
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

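	// Recombine the byte-wise partial products computed above: sum them
	// pairwise (L, M, N), mask off the parts that must not carry across
	// the 64-bit lane boundary, shift each group into position with ext,
	// and fold the result into \rq.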
	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

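	// Preparation for the p64 code path: load the remaining key powers
	// (which the C glue code is expected to store directly after H in
	// the key structure), fold the high/low halves of H,H^2 and H^3,H^4
	// into SHASH2 and HH34 for the Karatsuba middle products, and set up
	// MASK with the GHASH reduction constant.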
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

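	// Preparation for the p8 fallback: derive the Karatsuba constant
	// SHASH2, the lane masks used when recombining partial products, the
	// byte-rotation tables perm1..perm3, and the pre-rotated copies of
	// the key (sh1..sh4, ss1..ss4) that serve as B1..B4 above.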
	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
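	// The Karatsuba partial products in XL/XM/XH are combined and
	// reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1,
	// using two multiplications by the constant held in MASK (0xe1
	// shifted to the top of each 64-bit lane).
	//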
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
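	// Same combine-and-reduce step as above, but with the two
	// multiplications by the reduction constant replaced by a sequence
	// of shifts and exclusive-ors (shl #57/#62/#63 and ushr #1/#6), so
	// that only the 8x8 bit PMULL variant is needed elsewhere.
	//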
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

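	// Main GHASH update loop, parameterised by the multiply flavour
	// (p64 or p8). Register use follows pmull_ghash_update() below:
	// w0 = number of blocks, x1 = digest, x2 = source, x3 = key,
	// x4 = optional partial head block. The p64 path consumes four
	// blocks per iteration, multiplying them by H^4..H and performing
	// a single aggregated reduction; blocks are handled one at a time
	// until the remaining count is a multiple of four, and always in
	// the p8 path.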
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
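	// Both entry points expand the common __pmull_ghash macro above and
	// differ only in whether the 64x64 bit or the 8x8 bit PMULL variant
	// is used for the GF(2^128) multiplications.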
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS0		.req	v12
	KS1		.req	v13
	INP0		.req	v14
	INP1		.req	v15

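	// The GCM helpers below keep the AES round keys in v17-v31; the key
	// schedule is loaded so that its final rounds always end up in
	// v29-v31, which lets enc_block run the last rounds unconditionally.
	// KS0/KS1 hold the two keystream blocks and INP0/INP1 the two blocks
	// of input currently being processed.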
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b
	eor		\state\().16b, \state\().16b, v31.16b
	.endm

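	// Combined GCM en-/decryption: each iteration generates two
	// keystream blocks with AES-CTR while folding two blocks of
	// ciphertext into the GHASH state, interleaving the aese/aesmc pairs
	// with the pmull arithmetic so that the AES and multiply pipelines
	// stay busy. The lower 64 bits of the counter live in x8 and advance
	// by two per iteration; on entry, a non-NULL x6 requests (re)loading
	// of the AES round keys.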
	.macro		pmull_gcm_do_crypt, enc
	ld1		{SHASH.2d}, [x4], #16
	ld1		{HH.2d}, [x4]
	ld1		{XL.2d}, [x1]
	ldr		x8, [x5, #8]			// load lower counter

	movi		MASK.16b, #0xe1
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
CPU_LE(	rev		x8, x8		)
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	.if		\enc == 1
	ldr		x10, [sp]
	ld1		{KS0.16b-KS1.16b}, [x10]
	.endif

	cbnz		x6, 4f

0:	ld1		{INP0.16b-INP1.16b}, [x3], #32

	rev		x9, x8
	add		x11, x8, #1
	add		x8, x8, #2

	.if		\enc == 1
	eor		INP0.16b, INP0.16b, KS0.16b	// encrypt input
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	ld1		{KS0.8b}, [x5]			// load upper counter
	rev		x11, x11
	sub		w0, w0, #2
	mov		KS1.8b, KS0.8b
	ins		KS0.d[1], x9			// set lower counter
	ins		KS1.d[1], x11

	rev64		T1.16b, INP1.16b

	cmp		w7, #12
	b.ge		2f				// AES-192/256?

1:	enc_round	KS0, v21
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	KS1, v21
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1

	enc_round	KS0, v22
	eor		T1.16b, T1.16b, IN1.16b

	enc_round	KS1, v22
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0

	enc_round	KS0, v23
	pmull		XM2.1q, SHASH2.1d, T1.1d	// (a1 + a0)(b1 + b0)

	enc_round	KS1, v23
	rev64		T1.16b, INP0.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	enc_round	KS0, v24
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b

	enc_round	KS1, v24
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	KS0, v25
	eor		T1.16b, T1.16b, XL.16b

	enc_round	KS1, v25
	pmull2		XH.1q, HH.2d, XL.2d		// a1 * b1

	enc_round	KS0, v26
	pmull		XL.1q, HH.1d, XL.1d		// a0 * b0

	enc_round	KS1, v26
	pmull2		XM.1q, SHASH2.2d, T1.2d		// (a1 + a0)(b1 + b0)

	enc_round	KS0, v27
	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b

	enc_round	KS1, v27
	eor		XM.16b, XM.16b, XM2.16b
	ext		T1.16b, XL.16b, XH.16b, #8

	enc_round	KS0, v28
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	KS1, v28
	eor		XM.16b, XM.16b, T2.16b

	enc_round	KS0, v29
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	KS1, v29
	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	aese		KS0.16b, v30.16b
	eor		XL.16b, XM.16b, T2.16b

	aese		KS1.16b, v30.16b
	ext		T2.16b, XL.16b, XL.16b, #8

	eor		KS0.16b, KS0.16b, v31.16b
	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS1.16b, KS1.16b, v31.16b
	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	.endif

	st1		{INP0.16b-INP1.16b}, [x2], #32

	cbnz		w0, 0b

CPU_LE(	rev		x8, x8		)
	st1		{XL.2d}, [x1]
	str		x8, [x5, #8]			// store lower counter

	.if		\enc == 1
	st1		{KS0.16b-KS1.16b}, [x10]
	.endif

	ret

2:	b.eq		3f				// AES-192?
	enc_round	KS0, v17
	enc_round	KS1, v17
	enc_round	KS0, v18
	enc_round	KS1, v18
3:	enc_round	KS0, v19
	enc_round	KS1, v19
	enc_round	KS0, v20
	enc_round	KS1, v20
	b		1b

4:	load_round_keys	w7, x6
	b		0b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[], u32 const rk[],
	 *			  int rounds, u8 ks[])
	 */
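	// x0-x5 carry blocks/dg/dst/src/k/ctr, x6 the optional expanded key
	// (pass NULL if the round keys are already loaded in v17-v31), w7
	// the number of rounds, and the pointer to the ks[] keystream buffer
	// is taken from the stack.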
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[], u32 const rk[],
	 *			  int rounds)
	 */
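	// Same register assignment as pmull_gcm_encrypt; no ks[] buffer is
	// passed, so the .if \enc sections that load and store KS0/KS1 are
	// omitted.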
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 */
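	// Encrypts a single block with the key schedule in v17-v31;
	// x0 = dst, x1 = src, x2 = rk, w3 = rounds. A NULL rk skips
	// load_round_keys, i.e. the round keys are assumed to be present
	// from an earlier call.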
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)