Adding in curl and openssl repos

2025-08-14 12:09:30 -04:00
parent af2117b574
commit 0ace93e303
21174 changed files with 3607720 additions and 2 deletions

File diff suppressed because it is too large


@@ -0,0 +1,812 @@
.text
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x
.align 32
.Loop6x:
addl $100663296,%ebx
jc .Lhandle_ctr32
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11
.Lresume_ctr32:
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
xorq %r12,%r12
cmpq %r14,%r15
vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
movbeq 88(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 80(%r14),%r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14
vmovups 48-128(%rcx),%xmm15
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm3,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
vaesenc %xmm15,%xmm11,%xmm11
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
vmovdqu 80+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vpxor %xmm1,%xmm4,%xmm4
vmovdqu 64-32(%r9),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vmovups 64-128(%rcx),%xmm15
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
vaesenc %xmm15,%xmm10,%xmm10
movbeq 72(%r14),%r13
vpxor %xmm5,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
movbeq 64(%r14),%r12
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
vmovdqu 96+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,48+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,56+8(%rsp)
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 96-32(%r9),%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vmovups 80-128(%rcx),%xmm15
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
vaesenc %xmm15,%xmm10,%xmm10
movbeq 56(%r14),%r13
vpxor %xmm1,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
vpxor 112+8(%rsp),%xmm8,%xmm8
vaesenc %xmm15,%xmm11,%xmm11
movbeq 48(%r14),%r12
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,64+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,72+8(%rsp)
vpxor %xmm3,%xmm4,%xmm4
vmovdqu 112-32(%r9),%xmm3
vaesenc %xmm15,%xmm14,%xmm14
vmovups 96-128(%rcx),%xmm15
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
vaesenc %xmm15,%xmm10,%xmm10
movbeq 40(%r14),%r13
vpxor %xmm2,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
movbeq 32(%r14),%r12
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,80+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,88+8(%rsp)
vpxor %xmm5,%xmm6,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor %xmm1,%xmm6,%xmm6
vmovups 112-128(%rcx),%xmm15
vpslldq $8,%xmm6,%xmm5
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 16(%r11),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm8,%xmm7,%xmm7
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm5,%xmm4,%xmm4
movbeq 24(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 16(%r14),%r12
vpalignr $8,%xmm4,%xmm4,%xmm0
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
movq %r13,96+8(%rsp)
vaesenc %xmm15,%xmm12,%xmm12
movq %r12,104+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
vmovups 128-128(%rcx),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vmovups 144-128(%rcx),%xmm15
vaesenc %xmm1,%xmm10,%xmm10
vpsrldq $8,%xmm6,%xmm6
vaesenc %xmm1,%xmm11,%xmm11
vpxor %xmm6,%xmm7,%xmm7
vaesenc %xmm1,%xmm12,%xmm12
vpxor %xmm0,%xmm4,%xmm4
movbeq 8(%r14),%r13
vaesenc %xmm1,%xmm13,%xmm13
movbeq 0(%r14),%r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
cmpl $11,%ebp
jb .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
je .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 208-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 224-128(%rcx),%xmm1
jmp .Lenc_tail
.align 32
.Lhandle_ctr32:
vmovdqu (%r11),%xmm0
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vmovdqu 0-32(%r9),%xmm3
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm15,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm15,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpshufb %xmm0,%xmm14,%xmm14
vpshufb %xmm0,%xmm1,%xmm1
jmp .Lresume_ctr32
.align 32
.Lenc_tail:
vaesenc %xmm15,%xmm9,%xmm9
vmovdqu %xmm7,16+8(%rsp)
vpalignr $8,%xmm4,%xmm4,%xmm8
vaesenc %xmm15,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
vpxor 0(%rdi),%xmm1,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
vpxor 16(%rdi),%xmm1,%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vpxor 32(%rdi),%xmm1,%xmm5
vaesenc %xmm15,%xmm13,%xmm13
vpxor 48(%rdi),%xmm1,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor 64(%rdi),%xmm1,%xmm7
vpxor 80(%rdi),%xmm1,%xmm3
vmovdqu (%r8),%xmm1
vaesenclast %xmm2,%xmm9,%xmm9
vmovdqu 32(%r11),%xmm2
vaesenclast %xmm0,%xmm10,%xmm10
vpaddb %xmm2,%xmm1,%xmm0
movq %r13,112+8(%rsp)
leaq 96(%rdi),%rdi
vaesenclast %xmm5,%xmm11,%xmm11
vpaddb %xmm2,%xmm0,%xmm5
movq %r12,120+8(%rsp)
leaq 96(%rsi),%rsi
vmovdqu 0-128(%rcx),%xmm15
vaesenclast %xmm6,%xmm12,%xmm12
vpaddb %xmm2,%xmm5,%xmm6
vaesenclast %xmm7,%xmm13,%xmm13
vpaddb %xmm2,%xmm6,%xmm7
vaesenclast %xmm3,%xmm14,%xmm14
vpaddb %xmm2,%xmm7,%xmm3
addq $0x60,%r10
subq $0x6,%rdx
jc .L6x_done
vmovups %xmm9,-96(%rsi)
vpxor %xmm15,%xmm1,%xmm9
vmovups %xmm10,-80(%rsi)
vmovdqa %xmm0,%xmm10
vmovups %xmm11,-64(%rsi)
vmovdqa %xmm5,%xmm11
vmovups %xmm12,-48(%rsi)
vmovdqa %xmm6,%xmm12
vmovups %xmm13,-32(%rsi)
vmovdqa %xmm7,%xmm13
vmovups %xmm14,-16(%rsi)
vmovdqa %xmm3,%xmm14
vmovdqu 32+8(%rsp),%xmm7
jmp .Loop6x
.L6x_done:
vpxor 16+8(%rsp),%xmm8,%xmm8
vpxor %xmm4,%xmm8,%xmm8
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
.align 32
aesni_gcm_decrypt:
.cfi_startproc
xorq %r10,%r10
cmpq $0x60,%rdx
jb .Lgcm_dec_abort
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
vmovdqu (%r9),%xmm8
andq $-128,%rsp
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
leaq 32+32(%r9),%r9
movl 240-128(%rcx),%ebp
vpshufb %xmm0,%xmm8,%xmm8
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Ldec_no_key_aliasing
cmpq $768,%r15
jnc .Ldec_no_key_aliasing
subq %r15,%rsp
.Ldec_no_key_aliasing:
vmovdqu 80(%rdi),%xmm7
leaq (%rdi),%r14
vmovdqu 64(%rdi),%xmm4
leaq -192(%rdi,%rdx,1),%r15
vmovdqu 48(%rdi),%xmm5
shrq $4,%rdx
xorq %r10,%r10
vmovdqu 32(%rdi),%xmm6
vpshufb %xmm0,%xmm7,%xmm7
vmovdqu 16(%rdi),%xmm2
vpshufb %xmm0,%xmm4,%xmm4
vmovdqu (%rdi),%xmm3
vpshufb %xmm0,%xmm5,%xmm5
vmovdqu %xmm4,48(%rsp)
vpshufb %xmm0,%xmm6,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm2,%xmm2
vmovdqu %xmm6,80(%rsp)
vpshufb %xmm0,%xmm3,%xmm3
vmovdqu %xmm2,96(%rsp)
vmovdqu %xmm3,112(%rsp)
call _aesni_ctr32_ghash_6x
vmovups %xmm9,-96(%rsi)
vmovups %xmm10,-80(%rsi)
vmovups %xmm11,-64(%rsi)
vmovups %xmm12,-48(%rsi)
vmovups %xmm13,-32(%rsi)
vmovups %xmm14,-16(%rsi)
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
movq %r10,%rax
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
.cfi_startproc
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
leaq -1(%rbp),%r13
vmovups 16-128(%rcx),%xmm15
leaq 32-128(%rcx),%r12
vpxor %xmm4,%xmm1,%xmm9
addl $100663296,%ebx
jc .Lhandle_ctr32_2
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddb %xmm2,%xmm11,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddb %xmm2,%xmm12,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.align 16
.Loop_ctr32:
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vmovups (%r12),%xmm15
leaq 16(%r12),%r12
decl %r13d
jnz .Loop_ctr32
vmovdqu (%r12),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor 0(%rdi),%xmm3,%xmm4
vaesenc %xmm15,%xmm10,%xmm10
vpxor 16(%rdi),%xmm3,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
vpxor 32(%rdi),%xmm3,%xmm6
vaesenc %xmm15,%xmm12,%xmm12
vpxor 48(%rdi),%xmm3,%xmm8
vaesenc %xmm15,%xmm13,%xmm13
vpxor 64(%rdi),%xmm3,%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vpxor 80(%rdi),%xmm3,%xmm3
leaq 96(%rdi),%rdi
vaesenclast %xmm4,%xmm9,%xmm9
vaesenclast %xmm5,%xmm10,%xmm10
vaesenclast %xmm6,%xmm11,%xmm11
vaesenclast %xmm8,%xmm12,%xmm12
vaesenclast %xmm2,%xmm13,%xmm13
vaesenclast %xmm3,%xmm14,%xmm14
vmovups %xmm9,0(%rsi)
vmovups %xmm10,16(%rsi)
vmovups %xmm11,32(%rsi)
vmovups %xmm12,48(%rsi)
vmovups %xmm13,64(%rsi)
vmovups %xmm14,80(%rsi)
leaq 96(%rsi),%rsi
.byte 0xf3,0xc3 # rep ret
.align 32
.Lhandle_ctr32_2:
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpshufb %xmm0,%xmm14,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpshufb %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
.align 32
aesni_gcm_encrypt:
.cfi_startproc
xorq %r10,%r10
cmpq $288,%rdx
jb .Lgcm_enc_abort
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
leaq 128(%rcx),%rcx
vmovdqu (%r11),%xmm0
andq $-128,%rsp
movl 240-128(%rcx),%ebp
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Lenc_no_key_aliasing
cmpq $768,%r15
jnc .Lenc_no_key_aliasing
subq %r15,%rsp
.Lenc_no_key_aliasing:
leaq (%rsi),%r14
leaq -192(%rsi,%rdx,1),%r15
shrq $4,%rdx
call _aesni_ctr32_6x
vpshufb %xmm0,%xmm9,%xmm8
vpshufb %xmm0,%xmm10,%xmm2
vmovdqu %xmm8,112(%rsp)
vpshufb %xmm0,%xmm11,%xmm4
vmovdqu %xmm2,96(%rsp)
vpshufb %xmm0,%xmm12,%xmm5
vmovdqu %xmm4,80(%rsp)
vpshufb %xmm0,%xmm13,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm14,%xmm7
vmovdqu %xmm6,48(%rsp)
call _aesni_ctr32_6x
vmovdqu (%r9),%xmm8
leaq 32+32(%r9),%r9
subq $12,%rdx
movq $192,%r10
vpshufb %xmm0,%xmm8,%xmm8
call _aesni_ctr32_ghash_6x
vmovdqu 32(%rsp),%xmm7
vmovdqu (%r11),%xmm0
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm7,%xmm7,%xmm1
vmovdqu 32-32(%r9),%xmm15
vmovups %xmm9,-96(%rsi)
vpshufb %xmm0,%xmm9,%xmm9
vpxor %xmm7,%xmm1,%xmm1
vmovups %xmm10,-80(%rsi)
vpshufb %xmm0,%xmm10,%xmm10
vmovups %xmm11,-64(%rsi)
vpshufb %xmm0,%xmm11,%xmm11
vmovups %xmm12,-48(%rsi)
vpshufb %xmm0,%xmm12,%xmm12
vmovups %xmm13,-32(%rsi)
vpshufb %xmm0,%xmm13,%xmm13
vmovups %xmm14,-16(%rsi)
vpshufb %xmm0,%xmm14,%xmm14
vmovdqu %xmm9,16(%rsp)
vmovdqu 48(%rsp),%xmm6
vmovdqu 16-32(%r9),%xmm0
vpunpckhqdq %xmm6,%xmm6,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
vpxor %xmm6,%xmm2,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vmovdqu 64(%rsp),%xmm9
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm9,%xmm9,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
vpxor %xmm9,%xmm5,%xmm5
vpxor %xmm7,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vmovdqu 80(%rsp),%xmm1
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm4,%xmm7,%xmm7
vpunpckhqdq %xmm1,%xmm1,%xmm4
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpxor %xmm6,%xmm9,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 96(%rsp),%xmm2
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm7,%xmm6,%xmm6
vpunpckhqdq %xmm2,%xmm2,%xmm7
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
vpxor %xmm2,%xmm7,%xmm7
vpxor %xmm9,%xmm1,%xmm1
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm5,%xmm4,%xmm4
vpxor 112(%rsp),%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
vmovdqu 112-32(%r9),%xmm0
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpxor %xmm6,%xmm5,%xmm5
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
vpxor %xmm4,%xmm7,%xmm4
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm1
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
vpxor %xmm14,%xmm1,%xmm1
vpxor %xmm5,%xmm6,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
vmovdqu 32-32(%r9),%xmm15
vpxor %xmm2,%xmm8,%xmm7
vpxor %xmm4,%xmm9,%xmm6
vmovdqu 16-32(%r9),%xmm0
vpxor %xmm5,%xmm7,%xmm9
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
vpxor %xmm9,%xmm6,%xmm6
vpunpckhqdq %xmm13,%xmm13,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
vpxor %xmm13,%xmm2,%xmm2
vpslldq $8,%xmm6,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vpxor %xmm9,%xmm5,%xmm8
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm6,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm12,%xmm12,%xmm9
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
vpxor %xmm12,%xmm9,%xmm9
vpxor %xmm14,%xmm13,%xmm13
vpalignr $8,%xmm8,%xmm8,%xmm14
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm11,%xmm11,%xmm1
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm13,%xmm12,%xmm12
vxorps 16(%rsp),%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm9,%xmm9
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm10,%xmm10,%xmm2
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
vpxor %xmm10,%xmm2,%xmm2
vpalignr $8,%xmm8,%xmm8,%xmm14
vpxor %xmm12,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm9,%xmm1,%xmm1
vxorps %xmm7,%xmm14,%xmm14
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
vmovdqu 112-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm11,%xmm10,%xmm10
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
vpxor %xmm4,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
vpxor %xmm10,%xmm7,%xmm7
vpxor %xmm2,%xmm6,%xmm6
vpxor %xmm5,%xmm7,%xmm4
vpxor %xmm4,%xmm6,%xmm6
vpslldq $8,%xmm6,%xmm1
vmovdqu 16(%r11),%xmm3
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm1,%xmm5,%xmm8
vpxor %xmm6,%xmm7,%xmm7
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm2,%xmm8,%xmm8
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm7,%xmm2,%xmm2
vpxor %xmm2,%xmm8,%xmm8
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
movq %r10,%rax
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
.section .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 # "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.previous
.align 64
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4:

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,975 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES Block Cipher extension ('Zvkned')
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)
# Reference: https://github.com/riscv/riscv-crypto/issues/192#issuecomment-1270447575
#
# Assume we have 12 GCM blocks and we parallelize the GCM computation four
# blocks at a time.
# Tag = M0*H^12 + M1*H^11 + M2*H^10 + M3*H^9 +
# M4*H^8 + M5*H^7 + M6*H^6 + M7*H^5 +
# M8*H^4 + M9*H^3 + M10*H^2 + M11*H^1
# We could rewrite the formula into:
# T0 = 0
# T1 = (T0+M0)*H^4   T2 = (T0+M1)*H^4   T3 = (T0+M2)*H^4   T4 = (T0+M3)*H^4
# T5 = (T1+M4)*H^4   T6 = (T2+M5)*H^4   T7 = (T3+M6)*H^4   T8 = (T4+M7)*H^4
# T9 = (T5+M8)*H^4   T10 = (T6+M9)*H^3  T11 = (T7+M10)*H^2 T12 = (T8+M11)*H^1
# Tag = T9 + T10 + T11 + T12
#
# We multiply by [H^4, H^4, H^4, H^4] in each step except the last iteration.
# The last iteration multiplies by [H^4, H^3, H^2, H^1].
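#
# A minimal standalone sanity check of the rescheduling above (illustration
# only; it does not affect the generated code). Plain integer arithmetic is
# used as a stand-in for GF(2^128), since the identity only relies on
# distributivity and associativity; in the real code "+" is XOR and "*" is a
# carry-less multiply reduced by the GHASH polynomial.
{
    my @M = map { $_ + 1 } 0 .. 11;                  # 12 dummy blocks M0..M11
    my $H = 3;                                       # dummy hash key
    my $direct = 0;
    $direct += $M[$_] * $H ** (12 - $_) for 0 .. 11; # M0*H^12 + ... + M11*H^1
    my @T = (0, 0, 0, 0);                            # T0 for each of the 4 lanes
    for my $step (0 .. 2) {                          # three passes of 4 blocks
        for my $lane (0 .. 3) {
            # Every pass multiplies by H^4, except the last one which uses
            # H^4, H^3, H^2, H^1 across the lanes.
            my $exp = $step < 2 ? 4 : 4 - $lane;
            $T[$lane] = ($T[$lane] + $M[4 * $step + $lane]) * $H ** $exp;
        }
    }
    my $resched = $T[0] + $T[1] + $T[2] + $T[3];     # Tag = T9 + T10 + T11 + T12
    die "rescheduled GHASH identity broken" if $direct != $resched;
}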
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
{
my ($INP, $OUTP, $LEN, $KEYP, $IVP, $XIP) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($PADDING_LEN32) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($FULL_BLOCK_LEN32) = ("a6");
my ($ORIGINAL_LEN32) = ("a7");
my ($PROCESSED_LEN) = ("a0");
my ($CTR_MASK) = ("v0");
my ($INPUT_PADDING_MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
$V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
$V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
$V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
# Do aes-128 enc.
sub aes_128_cipher_body {
my $code=<<___;
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
# Do aes-192 enc.
sub aes_192_cipher_body {
my $TMP_REG = shift;
my $code=<<___;
# Load key 4
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 48
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V11]}
# Load key 8
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 112
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V11]}
# Load key 13
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 192
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
# Do aes-256 enc.
sub aes_256_cipher_body {
my $TMP_REG = shift;
my $code=<<___;
# Load key 3
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 32
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V11]}
# Load key 6
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 80
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V11]}
# Load key 9
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 128
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V11]}
# Load key 12
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 176
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V11]}
# Load key 15
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 224
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
sub handle_padding_in_first_round {
my $TMP_REG = shift;
my $code=<<___;
bnez $PADDING_LEN32, 1f
## without padding
# Store ciphertext/plaintext
@{[vse32_v $V28, $OUTP]}
j 2f
## with padding
1:
# Store ciphertext/plaintext using mask
@{[vse32_v $V28, $OUTP, $INPUT_PADDING_MASK]}
# Fill zero for the padding blocks
@{[vsetvli "zero", $PADDING_LEN32, "e32", "m4", "tu", "ma"]}
@{[vmv_v_i $V28, 0]}
# We used the mask register for `INPUT_PADDING_MASK` above, so we need to
# set up the ctr mask again.
# ctr mask : [000100010001....]
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
li $TMP_REG, 0b10001000
@{[vmv_v_x $CTR_MASK, $TMP_REG]}
2:
___
return $code;
}
# Do aes-128 enc for first round.
sub aes_128_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# Load all 11 aes round keys to v1-v11 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_128_cipher_body]}
# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}
@{[handle_padding_in_first_round $TMP_REG]}
add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}
# Do aes-192 enc for first round.
sub aes_192_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# We run out of 32 vector registers, so we just preserve some round keys
# and load the remaining round keys inside the aes body.
# We keep the round keys for:
# 1, 2, 3, 5, 6, 7, 9, 10, 11 and 12th keys.
# The following keys will be loaded in the aes body:
# 4, 8 and 13th keys.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# key 1
@{[vle32_v $V1, $KEYP]}
# key 2
addi $TMP_REG, $KEYP, 16
@{[vle32_v $V2, $TMP_REG]}
# key 3
addi $TMP_REG, $KEYP, 32
@{[vle32_v $V3, $TMP_REG]}
# key 5
addi $TMP_REG, $KEYP, 64
@{[vle32_v $V4, $TMP_REG]}
# key 6
addi $TMP_REG, $KEYP, 80
@{[vle32_v $V5, $TMP_REG]}
# key 7
addi $TMP_REG, $KEYP, 96
@{[vle32_v $V6, $TMP_REG]}
# key 9
addi $TMP_REG, $KEYP, 128
@{[vle32_v $V7, $TMP_REG]}
# key 10
addi $TMP_REG, $KEYP, 144
@{[vle32_v $V8, $TMP_REG]}
# key 11
addi $TMP_REG, $KEYP, 160
@{[vle32_v $V9, $TMP_REG]}
# key 12
addi $TMP_REG, $KEYP, 176
@{[vle32_v $V10, $TMP_REG]}
# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_192_cipher_body $TMP_REG]}
# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}
@{[handle_padding_in_first_round $TMP_REG]}
add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}
# Do aes-256 enc for first round.
sub aes_256_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# We run out of 32 vector registers, so we just preserve some round keys
# and load the remaining round keys inside the aes body.
# We keep the round keys for:
# 1, 2, 4, 5, 7, 8, 10, 11, 13 and 14th keys.
# The following keys will be loaded in the aes body:
# 3, 6, 9, 12 and 15th keys.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# key 1
@{[vle32_v $V1, $KEYP]}
# key 2
addi $TMP_REG, $KEYP, 16
@{[vle32_v $V2, $TMP_REG]}
# key 4
addi $TMP_REG, $KEYP, 48
@{[vle32_v $V3, $TMP_REG]}
# key 5
addi $TMP_REG, $KEYP, 64
@{[vle32_v $V4, $TMP_REG]}
# key 7
addi $TMP_REG, $KEYP, 96
@{[vle32_v $V5, $TMP_REG]}
# key 8
addi $TMP_REG, $KEYP, 112
@{[vle32_v $V6, $TMP_REG]}
# key 10
addi $TMP_REG, $KEYP, 144
@{[vle32_v $V7, $TMP_REG]}
# key 11
addi $TMP_REG, $KEYP, 160
@{[vle32_v $V8, $TMP_REG]}
# key 13
addi $TMP_REG, $KEYP, 192
@{[vle32_v $V9, $TMP_REG]}
# key 14
addi $TMP_REG, $KEYP, 208
@{[vle32_v $V10, $TMP_REG]}
# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_256_cipher_body $TMP_REG]}
# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}
@{[handle_padding_in_first_round $TMP_REG]}
add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}
sub aes_gcm_init {
my $code=<<___;
# Compute the AES-GCM full-block e32 length for `LMUL=4`. We handle
# multiple AES-GCM blocks at the same time within an `LMUL=4` register group.
# The AES-GCM's SEW is e32 and EGW is 128 bits.
# FULL_BLOCK_LEN32 = (VLEN*LMUL)/(EGW) * (EGW/SEW) = (VLEN*4)/(32*4) * 4
# = (VLEN*4)/32
# We could get the block_num using the VL value of `vsetvli with e32, m4`.
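# For example, with VLEN=128 this gives FULL_BLOCK_LEN32 = (128*4)/32 = 16
# e32 elements, i.e. 4 GCM blocks per LMUL=4 register group.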
@{[vsetvli $FULL_BLOCK_LEN32, "zero", "e32", "m4", "ta", "ma"]}
# If `LEN32 % FULL_BLOCK_LEN32` is not zero, we fill in zero padding so that
# we can always handle FULL_BLOCK_LEN32 elements in every iteration.
## Prepare the H^n multiplier in v16 for GCM multiplier. The `n` is the gcm
## block number in a LMUL=4 register group.
## n = ((VLEN*LMUL)/(32*4)) = ((VLEN*4)/(32*4))
## = (VLEN/32)
## We could use vsetvli with `e32, m1` to compute the `n` number.
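## For the same VLEN=128 example, n = 128/32 = 4, so v16 below ends up
## holding [H^4, H^4, H^4, H^4].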
@{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}
# The H is at `gcm128_context.Htable[0]`(addr(Xi)+16*2).
addi $T1, $XIP, 32
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $T1]}
# Compute the H^n
li $T1, 1
1:
@{[vgmul_vv $V31, $V31]}
slli $T1, $T1, 1
bltu $T1, $T0, 1b
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
@{[vaesz_vs $V16, $V31]}
#### Load plaintext into v24 and handle padding. We also load the init tag
#### data into v20 and prepare the AES ctr input data into v12 and v28.
@{[vmv_v_i $V20, 0]}
## Prepare the AES ctr input data into v12.
# Setup ctr input mask.
# ctr mask : [000100010001....]
# Note: The actual vl should be `FULL_BLOCK_LEN32/4 * 2`, but we just use
# `FULL_BLOCK_LEN32` here.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
li $T0, 0b10001000
@{[vmv_v_x $CTR_MASK, $T0]}
# Load IV.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $IVP]}
# Convert the big-endian counter into little-endian.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
@{[vrev8_v $V31, $V31, $CTR_MASK]}
# Splat the `single block of IV` to v12
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V12, 0]}
@{[vaesz_vs $V12, $V31]}
# Prepare the ctr counter into v8
# v8: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
@{[viota_m $V8, $CTR_MASK, $CTR_MASK]}
# Merge IV and ctr counter into v12.
# v12:[x, x, x, count+0, x, x, x, count+1, ...]
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vadd_vv $V12, $V12, $V8, $CTR_MASK]}
li $PADDING_LEN32, 0
# Get the SEW32 size in the first round.
# If `LEN32 & (FULL_BLOCK_LEN32-1)` is non-zero, the first round will have
# leading zero padding.
addi $T0, $FULL_BLOCK_LEN32, -1
and $T0, $T0, $LEN32
beqz $T0, 1f
## with padding
sub $LEN32, $LEN32, $T0
sub $PADDING_LEN32, $FULL_BLOCK_LEN32, $T0
# padding block size
srli $T1, $PADDING_LEN32, 2
# padding byte size
slli $T2, $PADDING_LEN32, 2
# Adjust the ctr counter to make the counter start from `counter+0` for the
# first non-padding block.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vsub_vx $V12, $V12, $T1, $CTR_MASK]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vmv_v_v $V28, $V12]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
# Prepare the mask for input loading in the first round. We use
# `VL=FULL_BLOCK_LEN32` with the mask in the first round.
# Adjust input ptr.
sub $INP, $INP, $T2
# Adjust output ptr.
sub $OUTP, $OUTP, $T2
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e16", "m2", "ta", "ma"]}
@{[vid_v $V2]}
# We don't use the pseudo instruction `vmsgeu` here. Use `vmsgtu` instead.
# The original code is:
# vmsgeu.vx $INPUT_PADDING_MASK, $V2, $PADDING_LEN32
addi $T0, $PADDING_LEN32, -1
@{[vmsgtu_vx $INPUT_PADDING_MASK, $V2, $T0]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V24, 0]}
# Load the input for length FULL_BLOCK_LEN32 with mask.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vle32_v $V24, $INP, $INPUT_PADDING_MASK]}
# Load the init `Xi` data to v20 with preceding zero padding.
# Adjust Xi ptr.
sub $T0, $XIP, $T2
# Load for length `zero-padding-e32-length + 4`.
addi $T1, $PADDING_LEN32, 4
@{[vsetvli "zero", $T1, "e32", "m4", "tu", "mu"]}
@{[vle32_v $V20, $T0, $INPUT_PADDING_MASK]}
j 2f
1:
## without padding
sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INP]}
# Load the init Xi data to v20.
@{[vsetivli "zero", 4, "e32", "m1", "tu", "ma"]}
@{[vle32_v $V20, $XIP]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vmv_v_v $V28, $V12]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
2:
___
return $code;
}
sub prepare_input_and_ctr {
my $PTR_OFFSET_REG = shift;
my $code=<<___;
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v12.
@{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
# Load plaintext into v24
@{[vsetvli "zero", "zero", "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INP]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vmv_v_v $V28, $V12]}
add $INP, $INP, $PTR_OFFSET_REG
@{[vsetvli "zero", "zero", "e32", "m4", "ta", "mu"]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
___
return $code;
}
# Store the current CTR back to IV buffer.
sub store_current_ctr {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# Update current ctr value to v12
@{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
# Convert ctr to big-endian counter.
@{[vrev8_v $V12, $V12, $CTR_MASK]}
@{[vse32_v $V12, $IVP, $CTR_MASK]}
___
return $code;
}
# Compute the final tag into v0 from the partial tag v20.
sub compute_final_tag {
my $TMP_REG = shift;
my $code=<<___;
# The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
# Load H to v1
addi $TMP_REG, $XIP, 32
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $TMP_REG]}
# Multiply H for each partial tag and XOR them together.
# Handle 1st partial tag
@{[vmv_v_v $V0, $V20]}
@{[vgmul_vv $V0, $V1]}
# Handle 2nd to N-th partial tags
li $TMP_REG, 4
1:
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
@{[vslidedown_vx $V4, $V20, $TMP_REG]}
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vghsh_vv $V0, $V1, $V4]}
addi $TMP_REG, $TMP_REG, 4
blt $TMP_REG, $FULL_BLOCK_LEN32, 1b
___
return $code;
}
################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt(const unsigned char *in,
# unsigned char *out, size_t len,
# const void *key,
# unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt:
srli $T0, $LEN, 4
beqz $T0, .Lenc_end
slli $LEN32, $T0, 2
mv $ORIGINAL_LEN32, $LEN32
@{[aes_gcm_init]}
# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10
beq $T0, $T1, aes_gcm_enc_blocks_256
beq $T0, $T2, aes_gcm_enc_blocks_192
beq $T0, $T3, aes_gcm_enc_blocks_128
.Lenc_end:
li $PROCESSED_LEN, 0
ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
___
$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_128:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_128_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_128:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_128_end
@{[vghsh_vv $V20, $V16, $V28]}
@{[prepare_input_and_ctr $T0]}
@{[aes_128_cipher_body]}
# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Lenc_blocks_128
.Lenc_blocks_128_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_128,.-aes_gcm_enc_blocks_128
___
$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_192:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_192_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_192:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_192_end
@{[vghsh_vv $V20, $V16, $V28]}
@{[prepare_input_and_ctr $T0]}
@{[aes_192_cipher_body $T1]}
# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Lenc_blocks_192
.Lenc_blocks_192_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_192,.-aes_gcm_enc_blocks_192
___
$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_256:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_256_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_256:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_256_end
@{[vghsh_vv $V20, $V16, $V28]}
@{[prepare_input_and_ctr $T0]}
@{[aes_256_cipher_body $T1]}
# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Lenc_blocks_256
.Lenc_blocks_256_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_256,.-aes_gcm_enc_blocks_256
___
}
################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt(const unsigned char *in,
# unsigned char *out, size_t len,
# const void *key,
# unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt:
srli $T0, $LEN, 4
beqz $T0, .Ldec_end
slli $LEN32, $T0, 2
mv $ORIGINAL_LEN32, $LEN32
@{[aes_gcm_init]}
# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10
beq $T0, $T1, aes_gcm_dec_blocks_256
beq $T0, $T2, aes_gcm_dec_blocks_192
beq $T0, $T3, aes_gcm_dec_blocks_128
.Ldec_end:
li $PROCESSED_LEN, 0
ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
___
$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_128:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_128_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_128:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_128_end
@{[vghsh_vv $V20, $V16, $V24]}
@{[prepare_input_and_ctr $T0]}
@{[aes_128_cipher_body]}
# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Ldec_blocks_128
.Ldec_blocks_128_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_128,.-aes_gcm_dec_blocks_128
___
$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_192:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_192_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_192:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_192_end
@{[vghsh_vv $V20, $V16, $V24]}
@{[prepare_input_and_ctr $T0]}
@{[aes_192_cipher_body $T1]}
# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Ldec_blocks_192
.Ldec_blocks_192_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_192,.-aes_gcm_dec_blocks_192
___
$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_256:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_256_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_256:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_256_end
@{[vghsh_vv $V20, $V16, $V24]}
@{[prepare_input_and_ctr $T0]}
@{[aes_256_cipher_body $T1]}
# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Ldec_blocks_256
.Ldec_blocks_256_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_256,.-aes_gcm_dec_blocks_256
___
}
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large


@@ -0,0 +1,467 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated by
# the vendor compiler.
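#
# To make the "4-bit" idea concrete, here is a small standalone sketch of the
# same nibble-table technique over GF(2^8) with the AES polynomial 0x11B
# (illustration only; the module below does the equivalent over GF(2^128)
# using the per-key Htable plus the shared rem_4bit table). The helper names
# and the example key 0x57 are arbitrary.
sub gf8_mul {                       # reference bit-by-bit multiply in GF(2^8)
    my ($a, $b) = @_;
    my $r = 0;
    for (0 .. 7) {
        $r ^= $a if $b & 1;
        $b >>= 1;
        $a <<= 1;
        $a ^= 0x11B if $a & 0x100;  # reduce by x^8+x^4+x^3+x+1
    }
    return $r;
}
sub gf8_mul_4bit {                  # table-driven multiply by a fixed key
    my ($x, $Htable) = @_;          # $Htable->[$i] holds gf8_mul($i, $key)
    my $z = $Htable->[$x >> 4];     # start with (high nibble)*key
    for (1 .. 4) {                  # multiply the accumulator by x, 4 times
        my $carry = $z & 0x80;
        $z = ($z << 1) & 0xFF;
        $z ^= 0x1B if $carry;       # conditional reduction
    }
    return $z ^ $Htable->[$x & 0xF];# fold in (low nibble)*key
}
# Usage: my @Htable = map { gf8_mul($_, 0x57) } 0 .. 15;
#        gf8_mul_4bit($x, \@Htable) == gf8_mul($x, 0x57) for every $x in 0..255.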
$cnt="v0"; # $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3"; # $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7"; # $8
#################
$Xi="a0"; # $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10"; # $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT"; # $28
{ my $N;
sub loop() {
$N++;
$code.=<<___;
.align 4
extbl $Xlo,7,$nlo
and $nlo,0xf0,$nhi
sll $nlo,4,$nlo
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Zlo,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Zhi,0($nlo)
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,6(zero)
extbl $Xlo,6,$nlo
ldq $Tlo1,8($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $Thi1,0($nhi)
srl $Zlo,4,$Zlo
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
and $nlo,0xf0,$nhi
xor $Tlo1,$Zlo,$Zlo
sll $nlo,4,$nlo
xor $Thi1,$Zhi,$Zhi
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Tlo0,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Thi0,0($nlo)
.Looplo$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xlo,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Looplo$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,7(zero)
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
unop
.Loophi$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Loophi$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
xor $Tlo0,$Zlo,$Zlo
xor $Thi0,$Zhi,$Zhi
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
xor $t0,$Zlo,$Zlo
xor $rem,$Zhi,$Zhi
___
}}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl gcm_gmult_4bit
.align 4
.ent gcm_gmult_4bit
gcm_gmult_4bit:
.frame sp,0,ra
.prologue 0
ldq $Xlo,8($Xi)
ldq $Xhi,0($Xi)
bsr $t0,picmeup
nop
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
ret (ra)
.end gcm_gmult_4bit
___
$inhi="s0";
$inlo="s1";
$code.=<<___;
.globl gcm_ghash_4bit
.align 4
.ent gcm_ghash_4bit
gcm_ghash_4bit:
lda sp,-32(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
.mask 0x04000600,-32
.frame sp,32,ra
.prologue 0
ldq_u $inhi,0($inp)
ldq_u $Thi0,7($inp)
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
ldq $Xhi,0($Xi)
ldq $Xlo,8($Xi)
bsr $t0,picmeup
nop
.Louter:
extql $inhi,$inp,$inhi
extqh $Thi0,$inp,$Thi0
or $inhi,$Thi0,$inhi
lda $inp,16($inp)
extql $inlo,$inp,$inlo
extqh $Tlo0,$inp,$Tlo0
or $inlo,$Tlo0,$inlo
subq $len,16,$len
xor $Xlo,$inlo,$Xlo
xor $Xhi,$inhi,$Xhi
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
beq $len,.Ldone
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
ldq_u $inhi,0($inp)
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
ldq_u $Thi0,7($inp)
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
br zero,.Louter
.Ldone:
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
.set noreorder
/*ldq ra,0(sp)*/
ldq s0,8(sp)
ldq s1,16(sp)
lda sp,32(sp)
ret (ra)
.end gcm_ghash_4bit
.align 4
.ent picmeup
picmeup:
.frame sp,0,$t0
.prologue 0
br $rem_4bit,.Lpic
.Lpic: lda $rem_4bit,12($rem_4bit)
ret ($t0)
.end picmeup
nop
rem_4bit:
.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,554 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "538B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
} else {
$output and open STDOUT,">$output";
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$Zll="r4"; # variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter
$rem_4bit=$inp; # used in gcm_gmult_4bit
$cnt=$len;
sub Zsmash() {
my $i=12;
my @args=@_;
for ($Zll,$Zlh,$Zhl,$Zhh) {
$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $_,$_
str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
str $_,[$Xi,#$i]
#else
mov $Tlh,$_,lsr#8
strb $_,[$Xi,#$i+3]
mov $Thl,$_,lsr#16
strb $Tlh,[$Xi,#$i+2]
mov $Thh,$_,lsr#24
strb $Thl,[$Xi,#$i+1]
strb $Thh,[$Xi,#$i]
#endif
___
$code.="\t".shift(@args)."\n";
$i-=4;
}
}
$code=<<___;
#include "arm_arch.h"
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
.text
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
adr $rem_4bit,rem_4bit
#else
sub $rem_4bit,pc,#8+32 @ &rem_4bit
#endif
b .Lrem_4bit_got
nop
nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
.align 4
gcm_ghash_4bit:
#if defined(__thumb2__)
adr r12,rem_4bit
#else
sub r12,pc,#8+48 @ &rem_4bit
#endif
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb $nlo,[$inp,#15]
ldrb $nhi,[$Xi,#15]
.Louter:
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
ldrb $nhi,[$Xi,#14]
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Linner:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
#ifdef __thumb2__
it pl
#endif
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16
mov $nhi,$Zll
___
&Zsmash("cmp\t$inp,$len","\n".
"#ifdef __thumb2__\n".
" it ne\n".
"#endif\n".
" ldrneb $nlo,[$inp,#15]");
$code.=<<___;
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb $nlo,[$Xi,#15]
b rem_4bit_get
.Lrem_4bit_got:
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
ldrb $nlo,[$Xi,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
and $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
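# clmul64x64() emits NEON code computing the 128-bit carry-less product
# $r = $a * $b of two 64-bit polynomials with eight vmull.p8 (8x8-bit
# polynomial) multiplications of byte-rotated operands, following the
# Câmara-Gouvêa-López-Dahab approach referenced in the header: the
# partial products are masked with $k48/$k32/$k16, realigned with
# vext.8 and folded together with veor.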
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi]! @ load Xi
vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi]! @ load Xi
vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,246 @@
#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though the subroutines
# have a _4bit suffix, they do not use any tables, but rely on hardware
# Galois Field Multiply support. Streamed GHASH processes one byte in
# ~7 cycles, which is >6x faster than "4-bit" table-driven code
# compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are comparing
# apples to oranges, but the compiler surely could have done better,
# because the theoretical [though not necessarily achievable] estimate
# for a "4-bit" table-driven implementation is ~12 cycles.
$output = pop and open STDOUT,">$output";
($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments
($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
($FF000000,$E10000)=("B30","B31");
($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
$xia="A9";
($rem,$res)=("B4","B5"); # $rem zaps $Htable
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.asg gcm_gmult_1bit,_gcm_gmult_1bit
.asg gcm_gmult_4bit,_gcm_gmult_4bit
.asg gcm_ghash_4bit,_gcm_ghash_4bit
.endif
.asg B3,RA
.if 0
.global _gcm_gmult_1bit
_gcm_gmult_1bit:
ADDAD $Htable,2,$Htable
.endif
.global _gcm_gmult_4bit
_gcm_gmult_4bit:
.asmfunc
LDDW *${Htable}[-1],$H1:$H0 ; H.lo
LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|| MV $Xip,${xip} ; reassign Xi
|| MVK 15,B1 ; SPLOOPD constant
MVK 0xE1,$E10000
|| LDBU *++${xip}[15],$x1 ; Xi[15]
MVK 0xFF,$FF000000
|| LDBU *--${xip},$x0 ; Xi[14]
SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
SHL $FF000000,24,$FF000000 ; upper byte mask
|| BNOP ghash_loop?
|| MVK 1,B0 ; take a single spin
PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
AND $H2,$FF000000,$H2u ; H2's upper byte
AND $H3,$FF000000,$H3u ; H3's upper byte
|| SHRU $H2u,8,$H2u
SHRU $H3u,8,$H3u
|| ZERO $Z1:$Z0
SHRU2 $xia,8,$H01u
|| ZERO $Z3:$Z2
.endasmfunc
.global _gcm_ghash_4bit
_gcm_ghash_4bit:
.asmfunc
LDDW *${Htable}[-1],$H1:$H0 ; H.lo
|| SHRU $len,4,B0 ; reassign len
LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|| MV $Xip,${xip} ; reassign Xi
|| MVK 15,B1 ; SPLOOPD constant
MVK 0xE1,$E10000
|| [B0] LDNDW *${inp}[1],$H1x:$H0x
MVK 0xFF,$FF000000
|| [B0] LDNDW *${inp}++[2],$H3x:$H2x
SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
|| LDDW *${xip}[1],$Z1:$Z0
SHL $FF000000,24,$FF000000 ; upper byte mask
|| LDDW *${xip}[0],$Z3:$Z2
PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
AND $H2,$FF000000,$H2u ; H2's upper byte
AND $H3,$FF000000,$H3u ; H3's upper byte
|| SHRU $H2u,8,$H2u
SHRU $H3u,8,$H3u
SHRU2 $xia,8,$H01u
|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|| [B0] XOR $H1x,$Z1,$Z1
.if .LITTLE_ENDIAN
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.else
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.endif
STDW $Z3:$Z2,*${xip}[0]
|| [B0] ZERO $Z3:$Z2
|| [B0] MV $xia,$x1
[B0] ADDK 14,${xip}
ghash_loop?:
SPLOOPD 6 ; 6*16+7
|| MVC B1,ILC
|| [B0] SUB B0,1,B0
|| ZERO A0
|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
|| SHL $x1,1,$xia
___
########____________________________
# 0 D2. M1 M2 |
# 1 M1 |
# 2 M1 M2 |
# 3 D1. M1 M2 |
# 4 S1. L1 |
# 5 S2 S1x L1 D2 L2 |____________________________
# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
# 7/1 L1 S1 D1x S2 M2 | M1 |
# 8/2 S1 L1x S2 | M1 M2 |
# 9/3 S1 L1x | D1. M1 M2 |
# 10/4 D1x | S1. L1 |
# 11/5 |S2 S1x L1 D2 L2 |____________
# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
# 7/1 L1 S1 D1x S2 M2 | ....
# 8/2 S1 L1x S2 | ....
#####... ................|............
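# (Reading the schedule above: each row is one cycle of the SPLOOPD
# body, the letters name the C64x+ functional units an instruction is
# issued on, and the staggered columns show successive iterations of
# the software pipeline overlapping.)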
$code.=<<___;
XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
|| XORMPY $H01u,$xib,$H01y
|| [A0] LDBU *--${xip},$x0
XORMPY $H1,$xia,$H1x ; 1
XORMPY $H2,$xia,$H2x ; 2
|| XORMPY $H2u,$xib,$H2y
XORMPY $H3,$xia,$H3x ; 3
|| XORMPY $H3u,$xib,$H3y
||[!A0] MVK.D 15,A0 ; *--${xip} counter
XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
|| [A0] SUB.S A0,1,A0
XOR.L $H1x,$Z1,$Z1 ; 5
|| AND.D $H01y,$FF000000,$H0z
|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
|| SHL $x0,1,$xib
|| SHL $x0,1,$xia
XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
|| SHL $Z0,1,$rem ; ; rem=Z<<1
|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
|| AND.L $H1y,$FF000000,$H1z
XOR.L $H3x,$Z3,$Z3 ; 7/1
|| SHRMB.S $Z2,$Z1,$Z1
|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
|| AND.S $H2y,$FF000000,$H2z
|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
XOR.L $H1z,$Z1,$Z1 ; 8/2
|| SHRMB.S $Z3,$Z2,$Z2
|| AND.S $H3y,$FF000000,$H3z
XOR.L $H2z,$Z2,$Z2 ; 9/3
|| SHRU $Z3,8,$Z3
XOR.D $H3z,$Z3,$Z3 ; 10/4
NOP ; 11/5
SPKERNEL 0,2
|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
; input pre-fetch is possible where D1 slot is available...
[B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
[B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
NOP ; 10/-
.if .LITTLE_ENDIAN
SWAP2 $Z0,$Z1 ; 11/-
|| SWAP4 $Z1,$Z0
SWAP4 $Z1,$Z1 ; 12/-
|| SWAP2 $Z0,$Z0
SWAP2 $Z2,$Z3
|| SWAP4 $Z3,$Z2
||[!B0] BNOP RA
SWAP4 $Z3,$Z3
|| SWAP2 $Z2,$Z2
|| [B0] BNOP ghash_loop?
[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|| [B0] XOR $H1x,$Z1,$Z1
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.else
[!B0] BNOP RA ; 11/-
[B0] BNOP ghash_loop? ; 12/-
[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|| [B0] XOR $H1x,$Z1,$Z1
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.endif
STDW $Z3:$Z2,*${xip}[0]
|| [B0] ZERO $Z3:$Z2
|| [B0] MV $xia,$x1
[B0] ADDK 14,${xip}
.endasmfunc
.sect .const
.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,470 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. For reference, the sha1-ia64.pl module processes one byte in
# 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per byte.
#
# September 2010
#
# It was originally thought that it made less sense to implement the
# "528B" variant on Itanium 2, for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5, which
# would cap the theoretical performance improvement at 20%. But
# occasionally you prove yourself wrong:-) I figured out a way to fold
# a couple of instructions and, having freed yet another instruction
# slot by unrolling the loop, got the resulting performance to 4.45
# cycles per processed byte, 50% better than the "256B" version. On the
# original Itanium performance should remain the same as the "256B"
# version, i.e. ~8.5 cycles.
$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# A special note about the 'dep' instruction, which is used to
# construct &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned
# on a 128-byte boundary, so the lower 7 bits of its address are
# guaranteed to be zero.
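# For example, under that alignment assumption
#
#     dep rem=Zlo,rem_4bitp,3,4
#
# deposits the low 4 bits of Zlo into rem_4bitp at bit position 3, i.e.
#
#     rem = rem_4bitp | ((Zlo & 0xf) << 3)
#         = rem_4bitp + (Zlo & 0xf) * 8
#         = &rem_4bit[Zlo & 0xf]          (8-byte table entries)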
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
(p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
(p18) xor Zlo=Zlo,Hlo
(p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
(p18) add Hi[1]=Htbl,Hi[1] };;
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
(p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
(p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
(p18) xor Zlo=Zlo,Hlo
(p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
(p17) add Hi[0]=Htbl,Hi[0]
br.ctop.sptk $label };;
___
}
$code=<<___;
.explicit
.text
prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;
.align 128
.skip 16 // aligns loop body
.global gcm_gmult_4bit#
.proc gcm_gmult_4bit#
gcm_gmult_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,2,6,0,8
$ADDP Xi=15,in0 // &Xi[15]
mov rem_4bitp=ip }
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
.save ar.lc,prevlc
mov prevlc=ar.lc
.save pr,prevpr
mov prevpr=pr };;
.body
.rotr in[3],xi[3],Hi[2]
{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
mov mask0xf0=0xf0
brp.loop.imp .Loop1,.Lend1-16};;
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
};;
{ .mii; shladd Hi[1]=xi[2],4,r0
mov pr.rot=0x7<<16
mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
mov ar.ec=3
xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
xor Zhi=Zhi,Zhi };;
___
&loop (".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
{ .mib; mux1 Zlo=Zlo,\@rev };;
{ .mib; mux1 Zhi=Zhi,\@rev };;
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
add Hhi=1,Xi };; // pipeline flush on Itanium
{ .mib; st8 [Hlo]=Zlo
mov pr=prevpr,0x1ffff };;
{ .mib; st8 [Hhi]=Zhi
mov ar.lc=prevlc
br.ret.sptk.many b0 };;
.endp gcm_gmult_4bit#
___
######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
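# (Informal summary, derived from the code below: the routine copies
# Htable onto the stack together with a second table, Hshr4, holding
# the same entries shifted right by 4 bits, and the software-pipelined
# .LOOP then processes one input byte per modulo-scheduled stage using
# lookups into Htable, Hshr4 and the shared rem_8bit table.)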
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
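# sum/rum set and reset user-mask bits: "$sum 1<<1" below switches the
# data byte order to big-endian for the duration of the computation and
# "$rum 1<<1" in the epilogue switches it back; on a natively
# big-endian build both degenerate to nop.m.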
sub load_htable() {
for (my $i=0;$i<8;$i++) {
$code.=<<___;
{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
___
$code.=shift if (($i+$#_)==7);
$code.="\t};;\n"
}
}
$code.=<<___;
prevsp=r3;
.align 32
.skip 16 // aligns loop body
.global gcm_ghash_4bit#
.proc gcm_ghash_4bit#
gcm_ghash_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,2,0,0
.vframe prevsp
mov prevsp=sp
mov $rem_8bit=ip };;
.body
{ .mfi; $ADDP r8=0+0,$Htbl
$ADDP r9=0+8,$Htbl }
{ .mfi; $ADDP r10=128+0,$Htbl
$ADDP r11=128+8,$Htbl };;
___
&load_htable(
" $ADDP $Xip=15,$Xip", # &Xi[15]
" $ADDP $len=$len,$inp", # &inp[len]
" $ADDP $inp=15,$inp", # &inp[15]
" mov $mask0xff=0xff",
" add sp=-512,sp",
" andcm sp=sp,$mask0xff", # align stack frame
" add r14=0,sp",
" add r15=8,sp");
$code.=<<___;
{ .mmi; $sum 1<<1 // go big-endian
add r8=256+0,sp
add r9=256+8,sp }
{ .mmi; add r10=256+128+0,sp
add r11=256+128+8,sp
add $len=-17,$len };;
___
for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
st8 [r9]=$rhi,16 // Htable[$i].hi
shrp $rlo=$rhi,$rlo,4 }//;;
{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
shr.u $rhi=$rhi,4 };;
{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
ld8 r17=[r9],16 };; // Htable[8].hi
{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
ld8 r19=[r9],16 } // Htable[9].hi
{ .mmi; rum 1<<5 // clear um.mfh
shrp r16=r17,r16,4 };;
___
for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
___
}
$code.=<<___;
{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
{ .mmi; add $Htbl=256,sp // &Htable[0]
add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
___
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");
$code.=<<___; # (p16)
{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
cmp.eq p0,p6=r0,r0 };; // clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p16),(p17)
{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
.align 32
.LOOP:
{ .mmi;
(p6) st8 [$Xip]=$Zhi,13
xor $Zlo=$Zlo,$Zlo
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p16),(p17),(p18)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 } //(p16) *inp--
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___; # (p16),(p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 //(p16) *inp--
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}
$code.=<<___; # (p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p18),(p19)
{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
add $inp=32,$inp
shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
add $Xip=9,$Xip };; // &Xi.lo
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
{ .mmi; st8 [$Xip]=$Zlo,-8
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
{ .mib;
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
(p6) br.cond.dptk.many .LOOP };;
{ .mib; st8 [$Xip]=$Zhi };;
{ .mib; $rum 1<<1 // return to little-endian
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#
___
$code.=<<___;
.align 128
.type rem_4bit#,\@object
rem_4bit:
data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
.type rem_8bit#,\@object
rem_8bit:
data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,751 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. The PA-RISC 2.0 loop is scheduled
# for 8 cycles, but measured performance on a PA-8600 system is ~9
# cycles per processed byte. This is ~2.2x faster than 64-bit code
# generated by the vendor compiler (which used to be very hard to
# beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$NREGS =6;
} else {
$LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$NREGS =11;
}
$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
# [+ argument transfer]
################# volatile registers
$Xi="%r26"; # argument block
$Htbl="%r25";
$inp="%r24";
$len="%r23";
$Hhh=$Htbl; # variables
$Hll="%r22";
$Zhh="%r21";
$Zll="%r20";
$cnt="%r19";
$rem_4bit="%r28";
$rem="%r29";
$mask0xf0="%r31";
################# preserved registers
$Thh="%r1";
$Tll="%r2";
$nlo="%r3";
$nhi="%r4";
$byte="%r5";
if ($SIZE_T==4) {
$Zhl="%r6";
$Zlh="%r7";
$Hhl="%r8";
$Hlh="%r9";
$Thl="%r10";
$Tlh="%r11";
}
$rem2="%r6"; # used in PA-RISC 2.0 code
$code.=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
.ALIGN 64
gcm_gmult_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_gmult
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_gmult
nop
___
$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_gmult_pa2
ldi 13,$cnt
.ALIGN 8
L\$oop_gmult_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
ldbx $cnt($Xi),$nlo
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $rem($rem_4bit),$rem
xor $Tll,$Zll,$Zll
addib,uv -1,$cnt,L\$oop_gmult_pa2
xor $Thh,$Zhh,$Zhh
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
std $Zhh,0($Xi)
___
$code.=<<___ if ($SIZE_T==4);
b L\$done_gmult
nop
L\$parisc1_gmult
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_gmult_pa1
ldi 13,$cnt
.ALIGN 8
L\$oop_gmult_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_gmult_pa1
xor $Thl,$Zhl,$Zhl
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
stw $Zhh,0($Xi)
___
$code.=<<___;
L\$done_gmult
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
gcm_ghash_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_ghash
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_ghash
nop
___
$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll
L\$outer_ghash_pa2
ldb 15($inp),$nhi
xor $nhi,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_ghash_pa2
ldi 13,$cnt
.ALIGN 8
L\$oop_ghash_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem2
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldbx $cnt($Xi),$nlo
ldbx $cnt($inp),$byte
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2
xor $rem2,$Zhh,$Zhh
xor $byte,$nlo,$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
ldd $rem($rem_4bit),$rem
addib,uv -1,$cnt,L\$oop_ghash_pa2
xor $Thh,$Zhh,$Zhh
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem2
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2
xor $rem2,$Zhh,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
ldo 16($inp),$inp
std $Zhh,0($Xi)
cmpb,*<> $inp,$len,L\$outer_ghash_pa2
copy $Zll,$nlo
___
$code.=<<___ if ($SIZE_T==4);
b L\$done_ghash
nop
L\$parisc1_ghash
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl
L\$outer_ghash_pa1
ldb 15($inp),$byte
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_ghash_pa1
ldi 13,$cnt
.ALIGN 8
L\$oop_ghash_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
ldbx $cnt($inp),$byte
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
xor $byte,$nlo,$nlo
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_ghash_pa1
xor $Thl,$Zhl,$Zhl
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
ldo 16($inp),$inp
stw $Zhh,0($Xi)
comb,<> $inp,$len,L\$outer_ghash_pa1
copy $Zll,$nlo
___
$code.=<<___;
L\$done_ghash
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.ALIGN 64
L\$rem_4bit
.WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
.ALIGN 64
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
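# Each helper below pattern-matches one such mnemonic and, when the
# operands fit a recognized format, emits the raw .WORD encoding with
# the original instruction kept as a trailing comment; anything it does
# not recognize is passed through unchanged.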
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};
my $depd = sub {
my ($mod,$args) = @_;
my $orig = "depd$mod\t$args";
# I only have ",z" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
{ my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
my $cpos=63-$2;
my $len=32-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler/) {
$gnuas = 1;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
if ($SIZE_T==4) {
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
s/cmpb,\*/comb,/;
s/,\*/,/;
}
s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,378 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V
# extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
# gcm_ghash_rv64i_zvkb_zvbc
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $H, $H, 8
li $TMP0, -8
li $TMP1, 63
la $TMP2, Lpolymod
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
@{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)
# Shift one left and get the carry bits.
@{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
@{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1
# Use the fact that the polynomial degree is no more than 128,
# i.e. only the LSB of the upper half could be set.
# Thanks to this we don't need to do the full reduction here.
# Instead simply subtract the reduction polynomial.
# This idea was taken from x86 ghash implementation in OpenSSL.
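# Net effect (informal sketch): Htable[0] = (H << 1) xor (polymod if
# the bit shifted out of H was set), i.e. H doubled and conditionally
# reduced by the GHASH polynomial; gcm_gmult/gcm_ghash below read this
# preprocessed value straight back from Htable.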
@{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
@{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t
# Need to set the mask to 3, if the carry bit is set.
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
@{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t
@{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}
################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $Xi, $Xi, 8
li $TMP4, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of the Gueron's Montgomery reduction.
# The difference being the order of some operations has been changed,
# to make a better use of vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
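# In other words, the low 128 bits c1:c0 are folded up into the high half
# by two carry-less multiplications with P; the final vxor below then
# leaves the 128-bit remainder d1:d0 in v2.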
#v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
#v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
@{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}
################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: preprocessed H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $Xi, $Xi, 8
add $inp, $inp, 8
li $M8, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4
Lstep:
# Read input data
@{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
add $inp, $inp, 16
add $len, $len, -16
# XOR them into Xi
@{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of the Gueron's Montgomery reduction.
# The difference being the order of some operations has been changed,
# to make a better use of vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
#v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
#v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V5, $V2]} # vrev8.v v5, v2
bnez $len, Lstep
@{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}
$code .= <<___;
.p2align 4
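# The two dwords below form the 128-bit constant 0xc2000000000000000000000000000001,
# the same reduction constant used by the x86 clmul GHASH code from which the
# reduction idea above was borrowed.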
Lpolymod:
.dword 0x0000000000000001
.dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,169 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
#
# Optional:
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zvkg(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zvkg_zvkb(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Copy of secret parameter (in normalized byte order)
#
# All callers of this function revert the byte-order unconditionally
# on little-endian machines. So we need to revert the byte-order back.
{
my ($Htable,$H,$VAL0,$VAL1,$TMP0) = ("a0","a1","a2","a3","t0");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkg
.type gcm_init_rv64i_zvkg,\@function
gcm_init_rv64i_zvkg:
ld $VAL0, 0($H)
ld $VAL1, 8($H)
@{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
@{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
ret
.size gcm_init_rv64i_zvkg,.-gcm_init_rv64i_zvkg
___
}
{
my ($Htable,$H,$V0) = ("a0","a1","v0");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkg_zvkb
.type gcm_init_rv64i_zvkg_zvkb,\@function
gcm_init_rv64i_zvkg_zvkb:
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vle64_v $V0, $H]} # vle64.v v0, (a1)
@{[vrev8_v $V0, $V0]} # vrev8.v v0, v0
@{[vse64_v $V0, $Htable]} # vse64.v v0, (a0)
ret
.size gcm_init_rv64i_zvkg_zvkb,.-gcm_init_rv64i_zvkg_zvkb
___
}
################################################################################
# void gcm_gmult_rv64i_zvkg(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: copy of H
# output: Xi: next hash value Xi
{
my ($Xi,$Htable) = ("a0","a1");
my ($VD,$VS2) = ("v1","v2");
$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zvkg
.type gcm_gmult_rv64i_zvkg,\@function
gcm_gmult_rv64i_zvkg:
@{[vsetivli__x0_4_e32_m1_tu_mu]}
@{[vle32_v $VS2, $Htable]}
@{[vle32_v $VD, $Xi]}
@{[vgmul_vv $VD, $VS2]}
@{[vse32_v $VD, $Xi]}
ret
.size gcm_gmult_rv64i_zvkg,.-gcm_gmult_rv64i_zvkg
___
}
################################################################################
# void gcm_ghash_rv64i_zvkg(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: copy of H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len) = ("a0","a1","a2","a3");
my ($vXi,$vH,$vinp,$Vzero) = ("v1","v2","v3","v4");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkg
.type gcm_ghash_rv64i_zvkg,\@function
gcm_ghash_rv64i_zvkg:
@{[vsetivli__x0_4_e32_m1_tu_mu]}
@{[vle32_v $vH, $Htable]}
@{[vle32_v $vXi, $Xi]}
Lstep:
@{[vle32_v $vinp, $inp]}
add $inp, $inp, 16
add $len, $len, -16
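# vghsh.vv performs a full GHASH block update: Xi = (Xi ^ inp) * H in GF(2^128)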
@{[vghsh_vv $vXi, $vH, $vinp]}
bnez $len, Lstep
@{[vse32_v $vXi, $Xi]}
ret
.size gcm_ghash_rv64i_zvkg,.-gcm_ghash_rv64i_zvkg
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,428 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
# gcm_ghash_rv64i_zbc*
#
# All callers of this function revert the byte-order unconditionally
# on little-endian machines. So we need to revert the byte-order back.
# Additionally we reverse the bits of each byte.
{
my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc
.type gcm_init_rv64i_zbc,\@function
gcm_init_rv64i_zbc:
ld $VAL0,0($H)
ld $VAL1,8($H)
@{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
@{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
@{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
@{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
ret
.size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
___
}
{
my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc__zbb
.type gcm_init_rv64i_zbc__zbb,\@function
gcm_init_rv64i_zbc__zbb:
ld $VAL0,0($H)
ld $VAL1,8($H)
@{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
@{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
@{[rev8 $VAL0, $VAL0]}
@{[rev8 $VAL1, $VAL1]}
sd $VAL0,0($Htable)
sd $VAL1,8($Htable)
ret
.size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
___
}
{
my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc__zbkb
.type gcm_init_rv64i_zbc__zbkb,\@function
gcm_init_rv64i_zbc__zbkb:
ld $TMP0,0($H)
ld $TMP1,8($H)
@{[brev8 $TMP0, $TMP0]}
@{[brev8 $TMP1, $TMP1]}
@{[rev8 $TMP0, $TMP0]}
@{[rev8 $TMP1, $TMP1]}
sd $TMP0,0($Htable)
sd $TMP1,8($Htable)
ret
.size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
___
}
################################################################################
# void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
# void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: copy of H
# output: Xi: next hash value Xi
#
# Compute GMULT (Xi*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
# extensions, with the no-Karatsuba approach and clmul for the final reduction.
# This results in an implementation with a minimized number of instructions.
# HW with clmul latencies higher than 2 cycles might observe a performance
# improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
# might observe a performance improvement with additionally converting the
# reduction to shift&xor. For a full discussion of these estimates see
# https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
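# (With 64-bit clmul/clmulh, the schoolbook 128x128 multiplication below takes
# 8 multiply instructions; Karatsuba would save one 64x64 product at the cost
# of extra xors and register pressure.)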
{
my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zbc
.type gcm_gmult_rv64i_zbc,\@function
gcm_gmult_rv64i_zbc:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
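# The upper 128 bits are folded into the lower ones in two steps: each step
# multiplies the highest remaining 64-bit word by the reduction constant and
# XORs the 128-bit product one word lower (z3 into z2:z1, then z2 into z1:z0).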
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Bit-reverse Xi back and store it
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
___
}
{
my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zbc__zbkb
.type gcm_gmult_rv64i_zbc__zbkb,\@function
gcm_gmult_rv64i_zbc__zbkb:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Bit-reverse Xi back and store it
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
___
}
################################################################################
# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: copy of H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc
.type gcm_ghash_rv64i_zbc,\@function
gcm_ghash_rv64i_zbc:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
Lstep:
# Load the input data, bit-reverse them, and XOR them with Xi
ld $t0, 0($inp)
ld $t1, 8($inp)
add $inp, $inp, 16
add $len, $len, -16
@{[brev8_rv64i $t0, $z0, $z1, $z2]}
@{[brev8_rv64i $t1, $z0, $z1, $z2]}
xor $x0, $x0, $t0
xor $x1, $x1, $t1
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Iterate over all blocks
bnez $len, Lstep
# Bit-reverse final Xi back and store it
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
___
}
{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc__zbkb
.type gcm_ghash_rv64i_zbc__zbkb,\@function
gcm_ghash_rv64i_zbc__zbkb:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
Lstep_zkbk:
# Load the input data, bit-reverse them, and XOR them with Xi
ld $t0, 0($inp)
ld $t1, 8($inp)
add $inp, $inp, 16
add $len, $len, -16
@{[brev8 $t0, $t0]}
@{[brev8 $t1, $t1]}
xor $x0, $x0, $t0
xor $x1, $x1, $t1
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Iterate over all blocks
bnez $len, Lstep_zkbk
# Bit-reverse final Xi back and store it
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
___
}
$code .= <<___;
.p2align 3
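# Masks used by the pure-RV64I brev8 fallback to reverse the bits within each
# byte (swap adjacent bits, then bit pairs, then nibbles).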
Lbrev8_const:
.dword 0xAAAAAAAAAAAAAAAA
.dword 0xCCCCCCCCCCCCCCCC
.dword 0xF0F0F0F0F0F0F0F0
.size Lbrev8_const,.-Lbrev8_const
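# 0x87 encodes x^7 + x^2 + x + 1, the low coefficients of the GHASH polynomial
# x^128 + x^7 + x^2 + x + 1; it is the constant the clmul-based reduction above
# multiplies the overflow words by.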
Lpolymod:
.byte 0x87
.size Lpolymod,.-Lpolymod
___
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,256 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is a worse result than expected: the loop is scheduled for 12
# and the result should be close to 12. In the absence of instruction-
# level profiling data it's impossible to tell why...
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.
# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce the correct
# result and is therefore engaged. On z196 it was measured to process
# an 8KB buffer ~7x faster than the software implementation. It's not as
# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
# it's actually almost 2 times slower, which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
$output and open STDOUT,">$output";
$softonly=0;
$Zhi="%r0";
$Zlo="%r1";
$Xi="%r2"; # argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";
$rem0="%r6"; # variables
$rem1="%r7";
$nlo="%r8";
$nhi="%r9";
$xi="%r10";
$cnt="%r11";
$tmp="%r12";
$x78="%r13";
$rem_4bit="%r14";
$sp="%r15";
$code.=<<___;
#include "s390x_arch.h"
.text
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)
aghi $Xi,-1
lghi $len,1
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit
lg $Zlo,8+1($Xi) # Xi
j .Lgmult_shortcut
.type gcm_gmult_4bit,\@function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
larl %r1,OPENSSL_s390xcap_P
lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities vector
tmhh %r0,0x4000 # check for function 65
jz .Lsoft_ghash
# Do not assume this function is called from a gcm128_context.
# This is not true, e.g., for AES-GCM-SIV.
# Parameter Block:
# Chaining Value (XI) 128 bit
# Key (Htable[8]) 128 bit
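# (The block is assembled on the stack: Xi goes to 8($sp), the H value kept
# at Htable[8] goes to 24($sp), and %r1 is pointed at it for KIMD.)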
lmg %r0,%r1,0($Xi)
stmg %r0,%r1,8($sp)
lmg %r0,%r1,8*16($Htbl)
stmg %r0,%r1,24($sp)
la %r1,8($sp)
lghi %r0,S390X_GHASH # function 65
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
lmg %r0,%r1,8($sp)
stmg %r0,%r1,0($Xi)
br %r14
.align 32
.Lsoft_ghash:
___
$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)
aghi $Xi,-1
srlg $len,$len,4
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit
lg $Zlo,8+1($Xi) # Xi
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
.Lgmult_shortcut:
lghi $tmp,0xf0
sllg $nlo,$Zlo,4
srlg $xi,$Zlo,8 # extract second byte
ngr $nlo,$tmp
lgr $nhi,$Zlo
lghi $cnt,14
ngr $nhi,$tmp
lg $Zlo,8($nlo,$Htbl)
lg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
sllg $rem0,$Zlo,3
ngr $nlo,$tmp
ngr $rem0,$x78
ngr $xi,$tmp
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
xg $Zhi,0($rem0,$rem_4bit)
nill $nlo,0xf0
sllg $rem0,$Zlo,3
xgr $Zlo,$tmp
ngr $rem0,$x78
nill $xi,0xf0
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
xg $Zhi,0($rem1,$rem_4bit)
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
xg $Zhi,0($nlo,$Htbl)
sllg $xi,$Zlo,3
xg $Zhi,0($rem0,$rem_4bit)
xgr $Zlo,$tmp
ngr $xi,$x78
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
xgr $Zlo,$tmp
xg $Zhi,0($rem1,$rem_4bit)
lg $tmp,0($xi,$rem_4bit)
la $inp,16($inp)
sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
brctg $len,.Louter
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
lm${g} %r6,%r14,6*$SIZE_T($sp)
br %r14
.type gcm_ghash_4bit,\@function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
.align 64
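# rem_4bit[i] is the reduction value XOR-ed into Z.hi when a nibble of value i
# is shifted out of Z at each step of the 4-bit method.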
rem_4bit:
.long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
.long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
.long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type rem_4bit,\@object
.size rem_4bit,(.-rem_4bit)
.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,583 @@
#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
# gcc 3.3.x cc 5.2 this assembler
#
# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
# gcc 4.4.1 this assembler
#
# 32-bit build 566 50 (+1000%)
# 64-bit build 56 50 (+12%)
#
# I don't quite understand why the difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
$output=pop and open STDOUT,">$output";
$frame="STACK_FRAME";
$bias="STACK_BIAS";
$Zhi="%o0"; # 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";
$nhi="%l0"; # small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";
$Xi="%i0"; # input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 64
rem_4bit:
.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type rem_4bit,#object
.size rem_4bit,(.-rem_4bit)
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
save %sp,-$frame,%sp
ldub [$inp+15],$nlo
ldub [$Xi+15],$xi0
ldub [$Xi+14],$xi1
add $len,$inp,$len
add $Htbl,8,$Htblo
1: call .+8
add %o7,rem_4bit-1b,$rem_4bit
.Louter:
xor $xi0,$nlo,$nlo
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi
ldub [$inp+14],$nlo
ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $xi1,$nlo,$nlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lghash_inner
sll $nlo,4,$nlo
.align 32
.Lghash_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
ldub [$Xi+$cnt],$xi1
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $xi1,$nlo,$nlo
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lghash_inner
and $Zlo,0xf,$remi
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
add $inp,16,$inp
cmp $inp,$len
be,pn SIZE_T_CC,.Ldone
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+15],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
srl $Zlo,8,$xi1
and $Zlo,0xff,$xi0
ba .Louter
and $xi1,0xff,$xi1
.align 32
.Ldone:
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
ret
restore
.type gcm_ghash_4bit,#function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
undef $inp;
undef $len;
$code.=<<___;
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
save %sp,-$frame,%sp
ldub [$Xi+15],$nlo
add $Htbl,8,$Htblo
1: call .+8
add %o7,rem_4bit-1b,$rem_4bit
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi
ldub [$Xi+14],$nlo
ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lgmult_inner
sll $nlo,4,$nlo
.align 32
.Lgmult_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$Xi+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lgmult_inner
and $Zlo,0xf,$remi
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
ret
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by a pair of 64-bit reductions [with a shortcut in the first one,
# which makes it possible to break the dependency between reductions and
# remove one multiplication from the critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
($shl,$shr)=map("%l$_",(0..7));
# For details regarding "twisted H" see ghash-x86.pl.
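# (In short: gcm_init_vis3 below shifts H left by one bit and, when the shift
# carries out of bit 127, folds the 128-bit constant (0xE1<<57, 1) back in via
# the srax/addcc/addxc/xor sequence.)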
$code.=<<___;
.globl gcm_init_vis3
.align 32
gcm_init_vis3:
save %sp,-$frame,%sp
ldx [%i1+0],$Hhi
ldx [%i1+8],$Hlo
mov 0xE1,$Xhi
mov 1,$Xlo
sllx $Xhi,57,$Xhi
srax $Hhi,63,$C0 ! broadcast carry
addcc $Hlo,$Hlo,$Hlo ! H<<=1
addxc $Hhi,$Hhi,$Hhi
and $C0,$Xlo,$Xlo
and $C0,$Xhi,$Xhi
xor $Xlo,$Hlo,$Hlo
xor $Xhi,$Hhi,$Hhi
stx $Hlo,[%i0+8] ! save twisted H
stx $Hhi,[%i0+0]
sethi %hi(0xA0406080),$V
sethi %hi(0x20C0E000),%l0
or $V,%lo(0xA0406080),$V
or %l0,%lo(0x20C0E000),%l0
sllx $V,32,$V
or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
stx $V,[%i0+16]
ret
restore
.type gcm_init_vis3,#function
.size gcm_init_vis3,.-gcm_init_vis3
.globl gcm_gmult_vis3
.align 32
gcm_gmult_vis3:
save %sp,-$frame,%sp
ldx [$Xip+8],$Xlo ! load Xi
ldx [$Xip+0],$Xhi
ldx [$Htable+8],$Hlo ! load twisted H
ldx [$Htable+0],$Hhi
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
xmulx $Xlo,$Hlo,$C0
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
xmulx $C2,$Hhl,$C1
xmulxhi $Xlo,$Hlo,$Xlo
xmulxhi $C2,$Hhl,$C2
xmulxhi $Xhi,$Hhi,$C3
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
xor $sqr,$Xlo,$Xlo ! real destination is $C1
xor $C3,$C2,$C2
xor $Xlo,$C1,$C1
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
xmulxhi $C1,$xE1,$C1
xor $Xlo,$C2,$C2
xor $C0,$C2,$C2
xor $C1,$C3,$C3
stx $C2,[$Xip+8] ! save Xi
stx $C3,[$Xip+0]
ret
restore
.type gcm_gmult_vis3,#function
.size gcm_gmult_vis3,.-gcm_gmult_vis3
.globl gcm_ghash_vis3
.align 32
gcm_ghash_vis3:
save %sp,-$frame,%sp
nop
srln $len,0,$len ! needed on v8+, "nop" on v9
ldx [$Xip+8],$C2 ! load Xi
ldx [$Xip+0],$C3
ldx [$Htable+8],$Hlo ! load twisted H
ldx [$Htable+0],$Hhi
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
and $inp,7,$shl
andn $inp,7,$inp
sll $shl,3,$shl
prefetch [$inp+63], 20
sub %g0,$shl,$shr
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
.Loop:
ldx [$inp+8],$Xlo
brz,pt $shl,1f
ldx [$inp+0],$Xhi
ldx [$inp+16],$C1 ! align data
srlx $Xlo,$shr,$C0
sllx $Xlo,$shl,$Xlo
sllx $Xhi,$shl,$Xhi
srlx $C1,$shr,$C1
or $C0,$Xhi,$Xhi
or $C1,$Xlo,$Xlo
1:
add $inp,16,$inp
sub $len,16,$len
xor $C2,$Xlo,$Xlo
xor $C3,$Xhi,$Xhi
prefetch [$inp+63], 20
xmulx $Xlo,$Hlo,$C0
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
xmulx $C2,$Hhl,$C1
xmulxhi $Xlo,$Hlo,$Xlo
xmulxhi $C2,$Hhl,$C2
xmulxhi $Xhi,$Hhi,$C3
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
xor $sqr,$Xlo,$Xlo ! real destination is $C1
xor $C3,$C2,$C2
xor $Xlo,$C1,$C1
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
xmulxhi $C1,$xE1,$C1
xor $Xlo,$C2,$C2
xor $C0,$C2,$C2
brnz,pt $len,.Loop
xor $C1,$C3,$C3
stx $C2,[$Xip+8] ! save Xi
stx $C3,[$Xip+0]
ret
restore
.type gcm_ghash_vis3,#function
.size gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option of producing a "universal" binary and to
# let the programmer detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"xmulx" => 0x115,
"xmulxhi" => 0x116 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
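# For example, the subroutine above turns "xmulx %o1,%o2,%o3" (rs1=9, rs2=10,
# rd=11, opf=0x115) into ".word 0x97b262aa !xmulx %o1,%o2,%o3".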
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,674 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# POWER9 delivers 0.51 cpb.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
$UCMP="cmpld";
$SHRI="srdi";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
$UCMP="cmplw";
$SHRI="srwi";
} else { die "nonsense $flavour"; }
$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $IN,$H,$t1 # twisted H
vsldoi $H,$IN,$IN,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
li r8,0x40
stvx_u $H, r9,r3
li r9,0x50
stvx_u $Hh,r10,r3
li r10,0x60
vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $IN1,$Xl,$t1
vsldoi $H2,$IN1,$IN1,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $H2l,r8,r3 # save H^2
li r8,0x70
stvx_u $H2,r9,r3
li r9,0x80
stvx_u $H2h,r10,r3
li r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vsldoi $t4,$Xm1,$zero,8
vsldoi $t5,$zero,$Xm1,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vxor $Xl1,$Xl1,$t4
vxor $Xh1,$Xh1,$t5
vsldoi $Xl,$Xl,$Xl,8
vsldoi $Xl1,$Xl1,$Xl1,8
vxor $Xl,$Xl,$t2
vxor $Xl1,$Xl1,$t6
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vpmsumd $Xl1,$Xl1,$xC2
vxor $t1,$t1,$Xh
vxor $t5,$t5,$Xh1
vxor $Xl,$Xl,$t1
vxor $Xl1,$Xl1,$t5
vsldoi $H,$Xl,$Xl,8
vsldoi $H2,$Xl1,$Xl1,8
vsldoi $Hl,$zero,$H,8
vsldoi $Hh,$H,$zero,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $Hl,r8,r3 # save H^3
li r8,0xa0
stvx_u $H,r9,r3
li r9,0xb0
stvx_u $Hh,r10,r3
li r10,0xc0
stvx_u $H2l,r8,r3 # save H^4
stvx_u $H2,r9,r3
stvx_u $H2h,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
li r8,0x40
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
li r9,0x50
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
li r10,0x60
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
${UCMP}i $len,64
bge Lgcm_ghash_p8_4x
lvx_u $IN,0,$inp
addi $inp,$inp,16
subic. $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
beq Lshort
lvx_u $H2l,r8,$Htbl # load H^2
li r8,16
lvx_u $H2, r9,$Htbl
add r9,$inp,$len # end of input
lvx_u $H2h,r10,$Htbl
be?b Loop_2x
.align 5
Loop_2x:
lvx_u $IN1,0,$inp
le?vperm $IN1,$IN1,$IN1,$lemask
subic $len,$len,32
vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
subfe r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
add $inp,$inp,r0
vxor $Xl,$Xl,$Xl1
vxor $Xm,$Xm,$Xm1
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh1
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,r8,$inp
addi $inp,$inp,32
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
$UCMP r9,$inp
bgt Loop_2x # done yet?
cmplwi $len,0
bne Leven
Lshort:
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
Leven:
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
$Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
$STU $sp,-$FRAME($sp)
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
stvx v20,r10,$sp
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
li r10,0x60
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME-4`($sp) # save vrsave
mtspr 256,r0 # preserve all AltiVec registers
lvsl $t0,0,r8 # 0x0001..0e0f
#lvx_u $H2l,r8,$Htbl # load H^2
li r8,0x70
lvx_u $H2, r9,$Htbl
li r9,0x80
vspltisb $t1,8 # 0x0808..0808
#lvx_u $H2h,r10,$Htbl
li r10,0x90
lvx_u $H3l,r8,$Htbl # load H^3
li r8,0xa0
lvx_u $H3, r9,$Htbl
li r9,0xb0
lvx_u $H3h,r10,$Htbl
li r10,0xc0
lvx_u $H4l,r8,$Htbl # load H^4
li r8,0x10
lvx_u $H4, r9,$Htbl
li r9,0x20
lvx_u $H4h,r10,$Htbl
li r10,0x30
vsldoi $t2,$zero,$t1,8 # 0x0000..0808
vaddubm $hiperm,$t0,$t2 # 0x0001..1617
vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
$SHRI $len,$len,4 # this allows to use sign bit
# as carry
lvx_u $IN0,0,$inp # load input
lvx_u $IN1,r8,$inp
subic. $len,$len,8
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
vxor $Xh,$IN0,$Xl
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vperm $H21l,$H2,$H,$hiperm
vperm $t0,$IN2,$IN3,$loperm
vperm $H21h,$H2,$H,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
vxor $Xm2,$Xm2,$Xm1
vxor $Xl3,$Xl3,$Xl1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh3,$Xh3,$Xh1
blt Ltail_4x
Loop_4x:
lvx_u $IN0,0,$inp
lvx_u $IN1,r8,$inp
subic. $len,$len,4
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
le?vperm $IN0,$IN0,$IN0,$lemask
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vxor $Xh,$Xh,$Xh3
vperm $t0,$IN2,$IN3,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xl,$Xl,$xC2
vxor $Xl3,$Xl3,$Xl1
vxor $Xh3,$Xh3,$Xh1
vxor $Xh,$Xh,$IN0
vxor $Xm2,$Xm2,$Xm1
vxor $Xh,$Xh,$t1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh,$Xh,$Xl
bge Loop_4x
Ltail_4x:
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh3
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
addic. $len,$len,4
beq Ldone_4x
lvx_u $IN0,0,$inp
${UCMP}i $len,2
li $len,-4
blt Lone
lvx_u $IN1,r8,$inp
beq Ltwo
Lthree:
lvx_u $IN2,r9,$inp
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
vxor $Xh,$IN0,$Xl
vmr $H4l,$H3l
vmr $H4, $H3
vmr $H4h,$H3h
vperm $t0,$IN1,$IN2,$loperm
vperm $t1,$IN1,$IN2,$hiperm
vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
vxor $Xm3,$Xm3,$Xm2
b Ltail_4x
.align 4
Ltwo:
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
vxor $Xh,$IN0,$Xl
vperm $t0,$zero,$IN1,$loperm
vperm $t1,$zero,$IN1,$hiperm
vsldoi $H4l,$zero,$H2,8
vmr $H4, $H2
vsldoi $H4h,$H2,$zero,8
vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
b Ltail_4x
.align 4
Lone:
le?vperm $IN0,$IN0,$IN0,$lemask
vsldoi $H4l,$zero,$H,8
vmr $H4, $H
vsldoi $H4h,$H,$zero,8
vxor $Xh,$IN0,$Xl
vxor $Xl3,$Xl3,$Xl3
vxor $Xm3,$Xm3,$Xm3
vxor $Xh3,$Xh3,$Xh3
b Ltail_4x
Ldone_4x:
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mtspr 256,$vrsave
lvx v20,r10,$sp
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,0x04,0,0x80,0,4,0
.long 0
___
}
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o;
} else {
s/le\?/#le#/o or
s/be\?//o;
}
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,900 @@
#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the wider AArch64 register bank to "accommodate" 4x aggregated reduction
# and improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
# Apple A7 0.58 0.92 5.62
# Cortex-A53 0.85 1.01 8.39
# Cortex-A57 0.73 1.17 7.61
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
# ThunderX2 1.05
#
# (*) presented for reference/comparison purposes;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
___
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif
.text
___
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only a few of the 16 slots of Htable[16] are used;
# data is opaque to the outside world (which allows the
# code to be optimized independently);
#
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
___
$code.=<<___ if ($flavour =~ /64/);
AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
$code.=<<___;
@ calculate H^3 and H^4
vpmull.p64 $Xl,$H, $H2
vpmull.p64 $Yl,$H2,$H2
vpmull2.p64 $Xh,$H, $H2
vpmull2.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H3, $Xl,$t2 @ H^3
veor $H4,$Yl,$t3 @ H^4
vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing
vext.8 $t1,$H4,$H4,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H3
veor $t1,$t1,$H4
veor $t2,$t2,$H2
vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
@ calculate H^5 and H^6
vpmull.p64 $Xl,$H2, $H3
vpmull.p64 $Yl,$H3,$H3
vpmull2.p64 $Xh,$H2, $H3
vpmull2.p64 $Yh,$H3,$H3
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t0,$t0
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H5,$Xl,$t2 @ H^5
veor $H6,$Yl,$t3 @ H^6
vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing
vext.8 $t1,$H6,$H6,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H5
veor $t1,$t1,$H6
veor $t2,$t2,$H2
vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
@ calculate H^7 and H^8
vpmull.p64 $Xl,$H2,$H5
vpmull.p64 $Yl,$H2,$H6
vpmull2.p64 $Xh,$H2,$H5
vpmull2.p64 $Yh,$H2,$H6
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t1,$t2
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H7,$Xl,$t2 @ H^7
veor $H8,$Yl,$t3 @ H^8
vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing
vext.8 $t1,$H8,$H8,#8
veor $t0,$t0,$H7
veor $t1,$t1,$H8
vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H7-$H8},[x0] @ store Htable[9..11]
___
}
$code.=<<___;
ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
$code.=<<___;
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
___
$code.=<<___ if ($flavour =~ /64/);
AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input: table precomputed in gcm_init_v8;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes, which must be divisible by the block size;
# output: next hash value Xi;
#
$code.=<<___;
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour =~ /64/);
AARCH64_VALID_CALL_TARGET
cmp $len,#64
b.hs .Lgcm_ghash_v8_4x
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
if ($flavour =~ /64/) { # 4x subroutine
my ($I0,$j1,$j2,$j3,
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
$code.=<<___;
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
vld1.64 {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vext.8 $I3,$j3,$j3,#8
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
vpmull2.p64 $Yh,$H,$I3
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
subs $len,$len,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
veor $t0,$I0,$Xl
vld1.64 {$I0-$j3},[$inp],#64
vext.8 $IN,$t0,$t0,#8
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vext.8 $I3,$j3,$j3,#8
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
vext.8 $I2,$j2,$j2,#8
veor $Xm,$Xm,$Ym
vext.8 $I1,$j1,$j1,#8
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
veor $Xm,$Xm,$t1
vpmull2.p64 $Yh,$H,$I3
veor $Xm,$Xm,$t2
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
veor $Xl,$Xm,$t2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
veor $t2,$t2,$Xh
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Xl,$Xl,$t2
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
vext.8 $Xl,$Xl,$Xl,#8
veor $Ym,$Ym,$j1
subs $len,$len,#64
b.hs .Loop4x
.Ltail4x:
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
adds $len,$len,#64
b.eq .Ldone4x
cmp $len,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j2},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
veor $j2,$j2,$I2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
vpmull2.p64 $Yh,$H,$I2
vpmull.p64 $Ym,$Hhl,$j2
veor $Xl,$Xl,$t2
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
veor $j1,$j1,$I1
vext.8 $Xl,$Xl,$Xl,#8
vpmull2.p64 $I1,$H2,$I1
veor $t0,$I0,$Xl
vpmull2.p64 $j1,$Hhl,$j1
vext.8 $IN,$t0,$t0,#8
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H3,$IN
vpmull.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Ltwo:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j1},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
veor $j1,$j1,$I1
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull2.p64 $Yh,$H,$I1
vpmull.p64 $Ym,$Hhl,$j1
vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H2,$IN
vpmull2.p64 $Xm,$Hhl,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Lone:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H,$IN
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H,$IN
vpmull.p64 $Xm,$Hhl,$t0
.Ldone4x:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
$3<8?$3:$3+8,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
# Switch preprocessor checks to aarch64 versions.
s/__ARME([BL])__/__AARCH64E$1__/go;
print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
print " it $2\n";
}
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
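
The three routines above (gcm_init_v8, gcm_gmult_v8, gcm_ghash_v8) all revolve around one operation: the bit-reflected GF(2^128) multiplication that GHASH is built on, with the table written by gcm_init_v8 caching powers of the twisted hash key so the bulk routine can aggregate several blocks per reduction. The sketch below is a minimal portable reference for that multiplication (NIST SP 800-38D convention, reduction polynomial x^128 + x^7 + x^2 + x + 1); it is illustrative only, is not the code this script generates, and the type and function names are hypothetical.

#include <stdint.h>

/* One 128-bit GHASH operand: hi holds bytes 0..7 of the block (big-endian),
 * lo holds bytes 8..15. */
typedef struct { uint64_t hi, lo; } ghash_u128;

/* Z = X * H in GF(2^128), bit-reflected as in NIST SP 800-38D. */
static ghash_u128 ghash_mul_ref(ghash_u128 X, ghash_u128 H)
{
    ghash_u128 Z = { 0, 0 }, V = H;
    int i;

    for (i = 0; i < 128; i++) {
        /* bit i of X, counting from the most significant bit of byte 0 */
        uint64_t xbit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                 : (X.lo >> (127 - i)) & 1;

        if (xbit) {                     /* Z ^= V */
            Z.hi ^= V.hi;
            Z.lo ^= V.lo;
        }
        if (V.lo & 1) {                 /* V = (V >> 1) ^ (0xE1 || 0^120) */
            V.lo = (V.lo >> 1) | (V.hi << 63);
            V.hi = (V.hi >> 1) ^ 0xe100000000000000ULL;
        } else {                        /* V >>= 1 */
            V.lo = (V.lo >> 1) | (V.hi << 63);
            V.hi >>= 1;
        }
    }
    return Z;
}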

View File

@@ -0,0 +1,96 @@
LIBS=../../libcrypto
$MODESASM=
IF[{- !$disabled{asm} -}]
$MODESASM_x86=ghash-x86.S
$MODESDEF_x86=GHASH_ASM
$MODESASM_x86_64=ghash-x86_64.s aesni-gcm-x86_64.s aes-gcm-avx512.s
$MODESDEF_x86_64=GHASH_ASM
# ghash-ia64.s doesn't work on VMS
IF[{- $config{target} !~ /^vms-/ -}]
$MODESASM_ia64=ghash-ia64.s
$MODESDEF_ia64=GHASH_ASM
ENDIF
$MODESASM_sparcv9=ghash-sparcv9.S
$MODESDEF_sparcv9=GHASH_ASM
$MODESASM_alpha=ghash-alpha.S
$MODESDEF_alpha=GHASH_ASM
$MODESASM_s390x=ghash-s390x.S
$MODESDEF_s390x=GHASH_ASM
$MODESASM_armv4=ghash-armv4.S ghashv8-armx.S
$MODESDEF_armv4=GHASH_ASM
$MODESASM_aarch64=ghashv8-armx.S aes-gcm-armv8_64.S aes-gcm-armv8-unroll8_64.S
$MODESDEF_aarch64=
$MODESASM_parisc11=ghash-parisc.s
$MODESDEF_parisc11=GHASH_ASM
$MODESASM_parisc20_64=$MODESASM_parisc11
$MODESDEF_parisc20_64=$MODESDEF_parisc11
$MODESASM_ppc32=ghashp8-ppc.s
$MODESDEF_ppc32=
$MODESASM_ppc64=$MODESASM_ppc32
IF[{- $target{sys_id} ne "AIX" && $target{sys_id} ne "MACOSX" -}]
$MODESASM_ppc64=$MODESASM_ppc32 aes-gcm-ppc.s
ENDIF
$MODESDEF_ppc64=$MODESDEF_ppc32
$MODESASM_c64xplus=ghash-c64xplus.s
$MODESDEF_c64xplus=GHASH_ASM
$MODESASM_riscv64=ghash-riscv64.s ghash-riscv64-zvkb-zvbc.s ghash-riscv64-zvkg.s aes-gcm-riscv64-zvkb-zvkg-zvkned.s
$MODESDEF_riscv64=GHASH_ASM
# Now that we have defined all the arch specific variables, use the
# appropriate one, and define the appropriate macros
IF[$MODESASM_{- $target{asm_arch} -}]
$MODESASM=$MODESASM_{- $target{asm_arch} -}
$MODESDEF=$MODESDEF_{- $target{asm_arch} -}
ENDIF
ENDIF
$COMMON=cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c \
wrap128.c xts128gb.c $MODESASM
SOURCE[../../libcrypto]=$COMMON \
cts128.c ocb128.c siv128.c
SOURCE[../../providers/libfips.a]=$COMMON
# Implementations are now spread across several libraries, so the defines
# need to be applied to all affected libraries and modules.
DEFINE[../../libcrypto]=$MODESDEF
DEFINE[../../providers/libfips.a]=$MODESDEF
INCLUDE[gcm128.o]=..
GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl
GENERATE[ghash-x86.S]=asm/ghash-x86.pl
GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl
GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl
GENERATE[aes-gcm-avx512.s]=asm/aes-gcm-avx512.pl
GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl
INCLUDE[ghash-sparcv9.o]=..
GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl
GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl
GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl
GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl
GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
INCLUDE[ghash-armv4.o]=..
GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
INCLUDE[ghashv8-armx.o]=..
GENERATE[aes-gcm-armv8_64.S]=asm/aes-gcm-armv8_64.pl
INCLUDE[aes-gcm-armv8_64.o]=..
GENERATE[aes-gcm-armv8-unroll8_64.S]=asm/aes-gcm-armv8-unroll8_64.pl
INCLUDE[aes-gcm-armv8-unroll8_64.o]=..
GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl
INCLUDE[ghash-s390x.o]=..
GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl
GENERATE[ghash-riscv64.s]=asm/ghash-riscv64.pl
GENERATE[ghash-riscv64-zvkb-zvbc.s]=asm/ghash-riscv64-zvkb-zvbc.pl
GENERATE[ghash-riscv64-zvkg.s]=asm/ghash-riscv64-zvkg.pl
GENERATE[aes-gcm-riscv64-zvkb-zvkg-zvkned.s]=asm/aes-gcm-riscv64-zvkb-zvkg-zvkned.pl

View File

@@ -0,0 +1,172 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
# define STRICT_ALIGNMENT 0
#endif
#if defined(__GNUC__) && !STRICT_ALIGNMENT
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], block128_f block)
{
size_t n;
const unsigned char *iv = ivec;
if (len == 0)
return;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
while (len >= 16) {
for (n = 0; n < 16; ++n)
out[n] = in[n] ^ iv[n];
(*block) (out, out, key);
iv = out;
len -= 16;
in += 16;
out += 16;
}
} else {
while (len >= 16) {
for (n = 0; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n) ^ *(size_t_aX *)(iv + n);
(*block) (out, out, key);
iv = out;
len -= 16;
in += 16;
out += 16;
}
}
#endif
while (len) {
for (n = 0; n < 16 && n < len; ++n)
out[n] = in[n] ^ iv[n];
for (; n < 16; ++n)
out[n] = iv[n];
(*block) (out, out, key);
iv = out;
if (len <= 16)
break;
len -= 16;
in += 16;
out += 16;
}
if (ivec != iv)
memcpy(ivec, iv, 16);
}
void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], block128_f block)
{
size_t n;
union {
size_t t[16 / sizeof(size_t)];
unsigned char c[16];
} tmp;
if (len == 0)
return;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (in != out) {
const unsigned char *iv = ivec;
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
while (len >= 16) {
(*block) (in, out, key);
for (n = 0; n < 16; ++n)
out[n] ^= iv[n];
iv = in;
len -= 16;
in += 16;
out += 16;
}
} else if (16 % sizeof(size_t) == 0) { /* always true */
while (len >= 16) {
size_t_aX *out_t = (size_t_aX *)out;
size_t_aX *iv_t = (size_t_aX *)iv;
(*block) (in, out, key);
for (n = 0; n < 16 / sizeof(size_t); n++)
out_t[n] ^= iv_t[n];
iv = in;
len -= 16;
in += 16;
out += 16;
}
}
if (ivec != iv)
memcpy(ivec, iv, 16);
} else {
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
unsigned char c;
while (len >= 16) {
(*block) (in, tmp.c, key);
for (n = 0; n < 16; ++n) {
c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = c;
}
len -= 16;
in += 16;
out += 16;
}
} else if (16 % sizeof(size_t) == 0) { /* always true */
while (len >= 16) {
size_t c;
size_t_aX *out_t = (size_t_aX *)out;
size_t_aX *ivec_t = (size_t_aX *)ivec;
const size_t_aX *in_t = (const size_t_aX *)in;
(*block) (in, tmp.c, key);
for (n = 0; n < 16 / sizeof(size_t); n++) {
c = in_t[n];
out_t[n] = tmp.t[n] ^ ivec_t[n];
ivec_t[n] = c;
}
len -= 16;
in += 16;
out += 16;
}
}
}
#endif
while (len) {
unsigned char c;
(*block) (in, tmp.c, key);
for (n = 0; n < 16 && n < len; ++n) {
c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = c;
}
if (len <= 16) {
for (; n < 16; ++n)
ivec[n] = in[n];
break;
}
len -= 16;
in += 16;
out += 16;
}
}

View File

@@ -0,0 +1,442 @@
/*
* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#ifndef STRICT_ALIGNMENT
# ifdef __GNUC__
typedef u64 u64_a1 __attribute((__aligned__(1)));
# else
typedef u64 u64_a1;
# endif
#endif
/*
* First you set up the M and L parameters and pass the key schedule. This is
* called once per session setup...
*/
void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
unsigned int M, unsigned int L, void *key,
block128_f block)
{
memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3;
ctx->blocks = 0;
ctx->block = block;
ctx->key = key;
}
/* !!! Following interfaces are to be called *once* per packet !!! */
/* Then you set up the per-message nonce and pass the length of the message */
int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
const unsigned char *nonce, size_t nlen, size_t mlen)
{
unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
if (nlen < (14 - L))
return -1; /* nonce is too short */
if (sizeof(mlen) == 8 && L >= 3) {
ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8)));
ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8)));
ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8)));
ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8)));
} else
ctx->nonce.u[1] = 0;
ctx->nonce.c[12] = (u8)(mlen >> 24);
ctx->nonce.c[13] = (u8)(mlen >> 16);
ctx->nonce.c[14] = (u8)(mlen >> 8);
ctx->nonce.c[15] = (u8)mlen;
ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
memcpy(&ctx->nonce.c[1], nonce, 14 - L);
return 0;
}
/* Then you pass additional authentication data, this is optional */
void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
const unsigned char *aad, size_t alen)
{
unsigned int i;
block128_f block = ctx->block;
if (alen == 0)
return;
ctx->nonce.c[0] |= 0x40; /* set Adata flag */
(*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++;
if (alen < (0x10000 - 0x100)) {
ctx->cmac.c[0] ^= (u8)(alen >> 8);
ctx->cmac.c[1] ^= (u8)alen;
i = 2;
} else if (sizeof(alen) == 8
&& alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) {
ctx->cmac.c[0] ^= 0xFF;
ctx->cmac.c[1] ^= 0xFF;
ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8)));
ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8)));
ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8)));
ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8)));
ctx->cmac.c[6] ^= (u8)(alen >> 24);
ctx->cmac.c[7] ^= (u8)(alen >> 16);
ctx->cmac.c[8] ^= (u8)(alen >> 8);
ctx->cmac.c[9] ^= (u8)alen;
i = 10;
} else {
ctx->cmac.c[0] ^= 0xFF;
ctx->cmac.c[1] ^= 0xFE;
ctx->cmac.c[2] ^= (u8)(alen >> 24);
ctx->cmac.c[3] ^= (u8)(alen >> 16);
ctx->cmac.c[4] ^= (u8)(alen >> 8);
ctx->cmac.c[5] ^= (u8)alen;
i = 6;
}
do {
for (; i < 16 && alen; ++i, ++aad, --alen)
ctx->cmac.c[i] ^= *aad;
(*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++;
i = 0;
} while (alen);
}
/* Finally you encrypt or decrypt the message */
/*
* The counter part of the nonce may not be larger than L*8 bits, and L is not
* larger than 8, therefore a 64-bit counter...
*/
static void ctr64_inc(unsigned char *counter)
{
unsigned int n = 8;
u8 c;
counter += 8;
do {
--n;
c = counter[n];
++c;
counter[n] = c;
if (c)
return;
} while (n);
}
int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1; /* length mismatch */
ctx->blocks += ((len + 15) >> 3) | 1;
if (ctx->blocks > (U64(1) << 61))
return -2; /* too much data */
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
union {
u64 u[2];
u8 c[16];
} temp;
memcpy(temp.c, inp, 16);
ctx->cmac.u[0] ^= temp.u[0];
ctx->cmac.u[1] ^= temp.u[1];
#else
ctx->cmac.u[0] ^= ((u64_a1 *)inp)[0];
ctx->cmac.u[1] ^= ((u64_a1 *)inp)[1];
#endif
(*block) (ctx->cmac.c, ctx->cmac.c, key);
(*block) (ctx->nonce.c, scratch.c, key);
ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
temp.u[0] ^= scratch.u[0];
temp.u[1] ^= scratch.u[1];
memcpy(out, temp.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^ ((u64_a1 *)inp)[0];
((u64_a1 *)out)[1] = scratch.u[1] ^ ((u64_a1 *)inp)[1];
#endif
inp += 16;
out += 16;
len -= 16;
}
if (len) {
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= inp[i];
(*block) (ctx->cmac.c, ctx->cmac.c, key);
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
out[i] = scratch.c[i] ^ inp[i];
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key);
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1;
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
union {
u64 u[2];
u8 c[16];
} temp;
#endif
(*block) (ctx->nonce.c, scratch.c, key);
ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
memcpy(temp.c, inp, 16);
ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
memcpy(out, scratch.c, 16);
#else
ctx->cmac.u[0] ^= (((u64_a1 *)out)[0]
= scratch.u[0] ^ ((u64_a1 *)inp)[0]);
ctx->cmac.u[1] ^= (((u64_a1 *)out)[1]
= scratch.u[1] ^ ((u64_a1 *)inp)[1]);
#endif
(*block) (ctx->cmac.c, ctx->cmac.c, key);
inp += 16;
out += 16;
len -= 16;
}
if (len) {
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
(*block) (ctx->cmac.c, ctx->cmac.c, key);
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
static void ctr64_add(unsigned char *counter, size_t inc)
{
size_t n = 8, val = 0;
counter += 8;
do {
--n;
val += counter[n] + (inc & 0xff);
counter[n] = (unsigned char)val;
val >>= 8; /* carry bit */
inc >>= 8;
} while (n && (inc || val));
}
int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len, ccm128_f stream)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1; /* length mismatch */
ctx->blocks += ((len + 15) >> 3) | 1;
if (ctx->blocks > (U64(1) << 61))
return -2; /* too much data */
if ((n = len / 16)) {
(*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
n *= 16;
inp += n;
out += n;
len -= n;
if (len)
ctr64_add(ctx->nonce.c, n / 16);
}
if (len) {
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= inp[i];
(*block) (ctx->cmac.c, ctx->cmac.c, key);
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
out[i] = scratch.c[i] ^ inp[i];
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len, ccm128_f stream)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key);
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1;
if ((n = len / 16)) {
(*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
n *= 16;
inp += n;
out += n;
len -= n;
if (len)
ctr64_add(ctx->nonce.c, n / 16);
}
if (len) {
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
(*block) (ctx->cmac.c, ctx->cmac.c, key);
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
M *= 2;
M += 2;
if (len != M)
return 0;
memcpy(tag, ctx->cmac.c, M);
return M;
}
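
The comments above spell out the required call order: init once per key, then setiv, optional aad, encrypt/decrypt and tag once per packet. The sketch below walks through that sequence; it assumes the legacy AES_encrypt()/AES_set_encrypt_key() block cipher from <openssl/aes.h> cast to block128_f, the internal "crypto/modes.h" header (as this file itself uses) for the CCM128_CONTEXT layout, and illustrative parameters M=16 (tag bytes) and L=4 (length-field bytes). The function name is hypothetical and error handling is reduced to return codes.

#include <openssl/aes.h>
#include "crypto/modes.h"

/* Sketch: CCM-seal `len` bytes of `msg` into `out` and produce a 16-byte tag.
 * Returns 1 on success, 0 on failure. */
static int ccm128_seal_sketch(const unsigned char key16[16],
                              const unsigned char *nonce, size_t noncelen,
                              const unsigned char *aad, size_t aadlen,
                              const unsigned char *msg, size_t len,
                              unsigned char *out, unsigned char tag[16])
{
    AES_KEY aeskey;
    CCM128_CONTEXT ccm;

    if (AES_set_encrypt_key(key16, 128, &aeskey) != 0)
        return 0;

    /* once per key: M = 16-byte tag, L = 4-byte length field */
    CRYPTO_ccm128_init(&ccm, 16, 4, &aeskey, (block128_f)AES_encrypt);

    /* once per packet: nonce must be at least 14 - L bytes (see setiv above) */
    if (CRYPTO_ccm128_setiv(&ccm, nonce, noncelen, len) != 0)
        return 0;
    if (aadlen > 0)
        CRYPTO_ccm128_aad(&ccm, aad, aadlen);
    if (CRYPTO_ccm128_encrypt(&ccm, msg, out, len) != 0)
        return 0;
    return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16;
}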

View File

@@ -0,0 +1,211 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
/*
* The input and output are encrypted as though 128-bit CFB mode is being used.
* The extra state information to record how much of the 128bit block we have
* used is contained in *num;
*/
void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], int *num,
int enc, block128_f block)
{
unsigned int n;
size_t l = 0;
if (*num < 0) {
/* There is no good way to signal an error return from here */
*num = -1;
return;
}
n = *num;
if (enc) {
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
*(out++) = ivec[n] ^= *(in++);
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ivec) %
sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ivec, key);
for (; n < 16; n += sizeof(size_t)) {
*(size_t_aX *)(out + n) =
*(size_t_aX *)(ivec + n)
^= *(size_t_aX *)(in + n);
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ivec, key);
while (len--) {
out[n] = ivec[n] ^= in[n];
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
if (n == 0) {
(*block) (ivec, ivec, key);
}
out[l] = ivec[n] ^= in[l];
++l;
n = (n + 1) % 16;
}
*num = n;
} else {
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
unsigned char c;
*(out++) = ivec[n] ^ (c = *(in++));
ivec[n] = c;
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ivec) %
sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ivec, key);
for (; n < 16; n += sizeof(size_t)) {
size_t t = *(size_t_aX *)(in + n);
*(size_t_aX *)(out + n)
= *(size_t_aX *)(ivec + n) ^ t;
*(size_t_aX *)(ivec + n) = t;
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ivec, key);
while (len--) {
unsigned char c;
out[n] = ivec[n] ^ (c = in[n]);
ivec[n] = c;
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
unsigned char c;
if (n == 0) {
(*block) (ivec, ivec, key);
}
out[l] = ivec[n] ^ (c = in[l]);
ivec[n] = c;
++l;
n = (n + 1) % 16;
}
*num = n;
}
}
/*
* This expects a single block of size nbits for both in and out. Note that
* it corrupts any extra bits in the last byte of out
*/
static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
int nbits, const void *key,
unsigned char ivec[16], int enc,
block128_f block)
{
int n, rem, num;
unsigned char ovec[16 * 2 + 1]; /* +1 because we dereference (but don't
* use) one byte off the end */
if (nbits <= 0 || nbits > 128)
return;
/* fill in the first half of the new IV with the current IV */
memcpy(ovec, ivec, 16);
/* construct the new IV */
(*block) (ivec, ivec, key);
num = (nbits + 7) / 8;
if (enc) /* encrypt the input */
for (n = 0; n < num; ++n)
out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
else /* decrypt the input */
for (n = 0; n < num; ++n)
out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
/* shift ovec left... */
rem = nbits % 8;
num = nbits / 8;
if (rem == 0)
memcpy(ivec, ovec + num, 16);
else
for (n = 0; n < 16; ++n)
ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
/* it is not necessary to cleanse ovec, since the IV is not secret */
}
/* N.B. This expects the input to be packed, MS bit first */
void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
size_t bits, const void *key,
unsigned char ivec[16], int *num,
int enc, block128_f block)
{
size_t n;
unsigned char c[1], d[1];
for (n = 0; n < bits; ++n) {
c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
((d[0] & 0x80) >> (unsigned int)(n % 8));
}
}
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const void *key,
unsigned char ivec[16], int *num,
int enc, block128_f block)
{
size_t n;
for (n = 0; n < length; ++n)
cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
}
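
The comment at the top of this file notes that the only state carried between calls is the IV and *num. The sketch below shows that property: encrypting a buffer in two CRYPTO_cfb128_encrypt() calls is equivalent to one call over the whole buffer, provided ivec and num are carried across. It assumes the legacy AES_encrypt() block function from <openssl/aes.h> and the public <openssl/modes.h> declarations; the function name is illustrative.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: CFB-128 encryption of `len` bytes, split across two calls. */
static void cfb128_sketch(const unsigned char key16[16],
                          const unsigned char iv[16],
                          const unsigned char *in, size_t len,
                          unsigned char *out)
{
    AES_KEY aeskey;
    unsigned char ivec[16];
    int num = 0;                /* must start at 0 for a fresh IV */
    size_t half = len / 2;

    AES_set_encrypt_key(key16, 128, &aeskey);
    memcpy(ivec, iv, 16);       /* ivec is updated in place */

    /* enc = 1 encrypts; pass 0 to decrypt with the same block function */
    CRYPTO_cfb128_encrypt(in, out, half, &aeskey, ivec, &num, 1,
                          (block128_f)AES_encrypt);
    CRYPTO_cfb128_encrypt(in + half, out + half, len - half, &aeskey, ivec,
                          &num, 1, (block128_f)AES_encrypt);
}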

View File

@@ -0,0 +1,212 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
/*
* NOTE: the IV/counter CTR mode is big-endian. The code itself is
* endian-neutral.
*/
/* increment counter (128-bit int) by 1 */
static void ctr128_inc(unsigned char *counter)
{
u32 n = 16, c = 1;
do {
--n;
c += counter[n];
counter[n] = (u8)c;
c >>= 8;
} while (n);
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
static void ctr128_inc_aligned(unsigned char *counter)
{
size_t *data, c, d, n;
DECLARE_IS_ENDIAN;
if (IS_LITTLE_ENDIAN || ((size_t)counter % sizeof(size_t)) != 0) {
ctr128_inc(counter);
return;
}
data = (size_t *)counter;
c = 1;
n = 16 / sizeof(size_t);
do {
--n;
d = data[n] += c;
/* did addition carry? */
c = ((d - c) & ~d) >> (sizeof(size_t) * 8 - 1);
} while (n);
}
#endif
/*
* The input is encrypted as though 128-bit counter mode is being used. The
* extra state information to record how much of the 128bit block we have
* used is contained in *num, and the encrypted counter is kept in
* ecount_buf. Both *num and ecount_buf must be initialised with zeros
* before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
* that the counter is in the x lower bits of the IV (ivec), and that the
* application has full control over overflow and the rest of the IV. This
* implementation takes NO responsibility for checking that the counter
* doesn't overflow into the rest of the IV when incremented.
*/
void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16],
unsigned char ecount_buf[16], unsigned int *num,
block128_f block)
{
unsigned int n;
size_t l = 0;
n = *num;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
*(out++) = *(in++) ^ ecount_buf[n];
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ecount_buf)
% sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
for (n = 0; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n)
^ *(size_t_aX *)(ecount_buf + n);
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
while (len--) {
out[n] = in[n] ^ ecount_buf[n];
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
if (n == 0) {
(*block) (ivec, ecount_buf, key);
ctr128_inc(ivec);
}
out[l] = in[l] ^ ecount_buf[n];
++l;
n = (n + 1) % 16;
}
*num = n;
}
/* increment upper 96 bits of 128-bit counter by 1 */
static void ctr96_inc(unsigned char *counter)
{
u32 n = 12, c = 1;
do {
--n;
c += counter[n];
counter[n] = (u8)c;
c >>= 8;
} while (n);
}
void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16],
unsigned char ecount_buf[16],
unsigned int *num, ctr128_f func)
{
unsigned int n, ctr32;
n = *num;
while (n && len) {
*(out++) = *(in++) ^ ecount_buf[n];
--len;
n = (n + 1) % 16;
}
ctr32 = GETU32(ivec + 12);
while (len >= 16) {
size_t blocks = len / 16;
/*
* 1<<28 is just a not-so-small yet not-so-large number...
* Below condition is practically never met, but it has to
* be checked for code correctness.
*/
if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
blocks = (1U << 28);
/*
* As (*func) operates on 32-bit counter, caller
* has to handle overflow. 'if' below detects the
* overflow, which is then handled by limiting the
* amount of blocks to the exact overflow point...
*/
ctr32 += (u32)blocks;
if (ctr32 < blocks) {
blocks -= ctr32;
ctr32 = 0;
}
(*func) (in, out, blocks, key, ivec);
/* (*ctr) does not update ivec, caller does: */
PUTU32(ivec + 12, ctr32);
/* ... overflow was detected, propagate carry. */
if (ctr32 == 0)
ctr96_inc(ivec);
blocks *= 16;
len -= blocks;
out += blocks;
in += blocks;
}
if (len) {
memset(ecount_buf, 0, 16);
(*func) (ecount_buf, ecount_buf, 1, key, ivec);
++ctr32;
PUTU32(ivec + 12, ctr32);
if (ctr32 == 0)
ctr96_inc(ivec);
while (len--) {
out[n] = in[n] ^ ecount_buf[n];
++n;
}
}
*num = n;
}
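
The long comment before CRYPTO_ctr128_encrypt() states the caller contract: zero *num and ecount_buf before the first call, and manage counter overflow yourself. Below is a minimal sketch of that contract, assuming the legacy AES_encrypt() block function and the public <openssl/modes.h> declarations; the function name is illustrative, and in CTR mode the same call both encrypts and decrypts.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: apply the CTR-128 keystream to `len` bytes, split across two calls. */
static void ctr128_sketch(const unsigned char key16[16],
                          const unsigned char iv[16],   /* initial counter block */
                          const unsigned char *in, size_t len,
                          unsigned char *out)
{
    AES_KEY aeskey;
    unsigned char ivec[16];
    unsigned char ecount_buf[16] = { 0 };   /* must be zeroed before 1st call */
    unsigned int num = 0;                   /* ditto */
    size_t half = len / 2;

    AES_set_encrypt_key(key16, 128, &aeskey);
    memcpy(ivec, iv, 16);                   /* ivec is incremented in place */

    CRYPTO_ctr128_encrypt(in, out, half, &aeskey, ivec, ecount_buf, &num,
                          (block128_f)AES_encrypt);
    CRYPTO_ctr128_encrypt(in + half, out + half, len - half, &aeskey, ivec,
                          ecount_buf, &num, (block128_f)AES_encrypt);
}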

View File

@@ -0,0 +1,330 @@
/*
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
/*
* The trouble with Ciphertext Stealing (CTS) mode is that there is no
* common official specification, only a couple of cipher/application-
* specific ones: RFC 2040 and RFC 3962. Then there is the 'Proposal to
* Extend CBC Mode By "Ciphertext Stealing"' at the NIST site, which
* deviates from the mentioned RFCs. Most notably it allows the input to
* be exactly the block length and it doesn't flip the order of the last
* two blocks. CTS has been discussed even in an ECB context, but it's
* not adopted for any known application. This implementation provides
* two interfaces: one compliant with the above-mentioned RFCs and one
* compliant with the NIST proposal, both extending CBC mode.
*/
size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key, unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= residue;
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
in += len;
out += len;
for (n = 0; n < residue; ++n)
ivec[n] ^= in[n];
(*block) (ivec, ivec, key);
memcpy(out, out - 16, residue);
memcpy(out - 16, ivec, 16);
return len + residue;
}
size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key,
unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
if (len < 16)
return 0;
residue = len % 16;
len -= residue;
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
if (residue == 0)
return len;
in += len;
out += len;
for (n = 0; n < residue; ++n)
ivec[n] ^= in[n];
(*block) (ivec, ivec, key);
memcpy(out - 16 + residue, ivec, 16);
return len + residue;
}
size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[16];
} tmp;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= residue;
(*cbc) (in, out, len, key, ivec, 1);
in += len;
out += len;
#if defined(CBC_HANDLES_TRUNCATED_IO)
memcpy(tmp.c, out - 16, 16);
(*cbc) (in, out - 16, residue, key, ivec, 1);
memcpy(out, tmp.c, residue);
#else
memset(tmp.c, 0, sizeof(tmp));
memcpy(tmp.c, in, residue);
memcpy(out, out - 16, residue);
(*cbc) (tmp.c, out - 16, 16, key, ivec, 1);
#endif
return len + residue;
}
size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[16];
} tmp;
if (len < 16)
return 0;
residue = len % 16;
len -= residue;
(*cbc) (in, out, len, key, ivec, 1);
if (residue == 0)
return len;
in += len;
out += len;
#if defined(CBC_HANDLES_TRUNCATED_IO)
(*cbc) (in, out - 16 + residue, residue, key, ivec, 1);
#else
memset(tmp.c, 0, sizeof(tmp));
memcpy(tmp.c, in, residue);
(*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);
#endif
return len + residue;
}
size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key, unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= 16 + residue;
if (len) {
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
in += len;
out += len;
}
(*block) (in, tmp.c + 16, key);
memcpy(tmp.c, tmp.c + 16, 16);
memcpy(tmp.c, in + 16, residue);
(*block) (tmp.c, tmp.c, key);
for (n = 0; n < 16; ++n) {
unsigned char c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = c;
}
for (residue += 16; n < residue; ++n)
out[n] = tmp.c[n] ^ in[n];
return 16 + len + residue;
}
size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key,
unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len < 16)
return 0;
residue = len % 16;
if (residue == 0) {
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
return len;
}
len -= 16 + residue;
if (len) {
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
in += len;
out += len;
}
(*block) (in + residue, tmp.c + 16, key);
memcpy(tmp.c, tmp.c + 16, 16);
memcpy(tmp.c, in, residue);
(*block) (tmp.c, tmp.c, key);
for (n = 0; n < 16; ++n) {
unsigned char c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = in[n + residue];
tmp.c[n] = c;
}
for (residue += 16; n < residue; ++n)
out[n] = tmp.c[n] ^ tmp.c[n - 16];
return 16 + len + residue;
}
size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= 16 + residue;
if (len) {
(*cbc) (in, out, len, key, ivec, 0);
in += len;
out += len;
}
memset(tmp.c, 0, sizeof(tmp));
/*
* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
*/
(*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0);
memcpy(tmp.c, in + 16, residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
(*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
#else
(*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
memcpy(out, tmp.c, 16 + residue);
#endif
return 16 + len + residue;
}
size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len < 16)
return 0;
residue = len % 16;
if (residue == 0) {
(*cbc) (in, out, len, key, ivec, 0);
return len;
}
len -= 16 + residue;
if (len) {
(*cbc) (in, out, len, key, ivec, 0);
in += len;
out += len;
}
memset(tmp.c, 0, sizeof(tmp));
/*
* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
*/
(*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0);
memcpy(tmp.c, in, residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
(*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
#else
(*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
memcpy(out, tmp.c, 16 + residue);
#endif
return 16 + len + residue;
}
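
As the comment at the top of the file explains, two families of interfaces are provided; the sketch below uses the RFC-style block-function interface for encryption. It assumes the legacy AES_encrypt() block function and the public <openssl/modes.h> declarations, and the function name is illustrative. The input must be longer than one block (the function returns 0 otherwise); the output has the same length as the input, with the last two blocks swapped as described above.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: RFC-style CTS encryption of `len` > 16 bytes; returns bytes written
 * (== len) or 0 on error. */
static size_t cts128_encrypt_sketch(const unsigned char key16[16],
                                    const unsigned char iv[16],
                                    const unsigned char *in, size_t len,
                                    unsigned char *out)
{
    AES_KEY aeskey;
    unsigned char ivec[16];

    AES_set_encrypt_key(key16, 128, &aeskey);
    memcpy(ivec, iv, 16);       /* ivec is updated in place */

    return CRYPTO_cts128_encrypt_block(in, out, len, &aeskey, ivec,
                                       (block128_f)AES_encrypt);
}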

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-cbc128.o: crypto/modes/cbc128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-ccm128.o: crypto/modes/ccm128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-cfb128.o: crypto/modes/cfb128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,9 @@
crypto/modes/libcrypto-lib-ctr128.o: crypto/modes/ctr128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/endian.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-cts128.o: crypto/modes/cts128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,15 @@
crypto/modes/libcrypto-lib-gcm128.o: crypto/modes/gcm128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/cryptlib.h \
include/internal/common.h include/internal/e_os.h \
include/internal/numbers.h include/internal/nelem.h \
include/openssl/buffer.h include/openssl/buffererr.h \
include/openssl/bio.h include/openssl/bioerr.h include/openssl/asn1.h \
include/openssl/asn1err.h include/openssl/bn.h include/openssl/bnerr.h \
include/openssl/err.h include/openssl/lhash.h include/internal/endian.h \
include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,10 @@
crypto/modes/libcrypto-lib-ocb128.o: crypto/modes/ocb128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/openssl/err.h include/openssl/bio.h \
include/openssl/bioerr.h include/openssl/lhash.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-ofb128.o: crypto/modes/ofb128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,16 @@
crypto/modes/libcrypto-lib-siv128.o: crypto/modes/siv128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/openssl/evp.h \
include/openssl/core_dispatch.h include/openssl/indicator.h \
include/openssl/params.h include/openssl/bn.h include/openssl/bnerr.h \
include/openssl/bio.h include/openssl/bioerr.h include/openssl/evperr.h \
include/openssl/objects.h include/openssl/obj_mac.h \
include/openssl/asn1.h include/openssl/asn1err.h \
include/openssl/objectserr.h include/openssl/core_names.h \
include/internal/endian.h include/crypto/modes.h include/openssl/modes.h \
include/crypto/siv.h

View File

@@ -0,0 +1,14 @@
crypto/modes/libcrypto-lib-wrap128.o: crypto/modes/wrap128.c \
include/internal/cryptlib.h include/internal/common.h \
include/openssl/configuration.h include/internal/e_os.h \
include/openssl/opensslconf.h include/openssl/macros.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/crypto.h include/openssl/safestack.h \
include/openssl/stack.h include/openssl/types.h \
include/openssl/cryptoerr.h include/openssl/symhacks.h \
include/openssl/cryptoerr_legacy.h include/openssl/core.h \
include/internal/numbers.h include/internal/nelem.h \
include/openssl/buffer.h include/openssl/buffererr.h \
include/openssl/bio.h include/openssl/bioerr.h include/openssl/asn1.h \
include/openssl/asn1err.h include/openssl/bn.h include/openssl/bnerr.h \
include/openssl/err.h include/openssl/lhash.h include/openssl/modes.h

View File

@@ -0,0 +1,9 @@
crypto/modes/libcrypto-lib-xts128.o: crypto/modes/xts128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/endian.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,9 @@
crypto/modes/libcrypto-lib-xts128gb.o: crypto/modes/xts128gb.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/endian.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,558 @@
/*
* Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/err.h>
#include "crypto/modes.h"
#ifndef OPENSSL_NO_OCB
/*
* Calculate the number of binary trailing zeros in any given number
*/
static u32 ocb_ntz(u64 n)
{
u32 cnt = 0;
/*
* We do a right-to-left simple sequential search. This is surprisingly
* efficient as the distribution of trailing zeros is not uniform,
* e.g. the number of possible inputs with no trailing zeros is equal to
* the number with 1 or more; the number with exactly 1 is equal to the
* number with 2 or more, etc. Checking the last two bits covers 75% of
* all numbers. Checking the last three covers 87.5%
*/
while (!(n & 1)) {
n >>= 1;
cnt++;
}
return cnt;
}
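/*
 * Illustrative sketch (not part of the original file, guarded out): on
 * GCC/Clang the same trailing-zero count can be obtained with a compiler
 * builtin; ocb_ntz_builtin is a hypothetical name used only for comparison
 * with the loop above.
 */
#if 0
static u32 ocb_ntz_builtin(u64 n)
{
    /* __builtin_ctzll() is undefined for n == 0; the OCB callers never pass 0 */
    return (u32)__builtin_ctzll(n);
}
#endif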
/*
* Shift a block of 16 bytes left by shift bits
*/
static void ocb_block_lshift(const unsigned char *in, size_t shift,
unsigned char *out)
{
int i;
unsigned char carry = 0, carry_next;
for (i = 15; i >= 0; i--) {
carry_next = in[i] >> (8 - shift);
out[i] = (in[i] << shift) | carry;
carry = carry_next;
}
}
/*
* Perform a "double" operation as per OCB spec
*/
static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out)
{
unsigned char mask;
/*
* Calculate the mask based on the most significant bit. There are more
* efficient ways to do this - but this way is constant time
*/
mask = in->c[0] & 0x80;
mask >>= 7;
mask = (0 - mask) & 0x87;
ocb_block_lshift(in->c, 1, out->c);
out->c[15] ^= mask;
}
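/*
 * Hedged sketch (not part of the original file, guarded out): the masking
 * above selects the GF(2^128) reduction constant 0x87 (for the polynomial
 * x^128 + x^7 + x^2 + x + 1) whenever the bit shifted out was set. A direct,
 * branching (and therefore not constant-time) equivalent for illustration:
 */
#if 0
static void ocb_double_branchy(OCB_BLOCK *in, OCB_BLOCK *out)
{
    ocb_block_lshift(in->c, 1, out->c);
    if (in->c[0] & 0x80)
        out->c[15] ^= 0x87;
}
#endif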
/*
* Perform an xor on in1 and in2 - each of len bytes. Store result in out
*/
static void ocb_block_xor(const unsigned char *in1,
const unsigned char *in2, size_t len,
unsigned char *out)
{
size_t i;
for (i = 0; i < len; i++) {
out[i] = in1[i] ^ in2[i];
}
}
/*
* Look up L_index in our lookup table. If we haven't already got it we need
* to calculate it
*/
static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
{
size_t l_index = ctx->l_index;
if (idx <= l_index) {
return ctx->l + idx;
}
/* We don't have it - so calculate it */
if (idx >= ctx->max_l_index) {
void *tmp_ptr;
/*
* Each additional entry allows us to process almost twice as
* much data, so in a linear world the table would need to be
* expanded with smaller and smaller increments. Originally it
* doubled in size, which was wasteful. Growing it linearly is
* not formally optimal, but is simpler to implement. We grow
* the table by the minimal multiple of 4 entries that
* accommodates the index.
*/
ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3;
tmp_ptr = OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK));
if (tmp_ptr == NULL) /* prevent ctx->l from being clobbered */
return NULL;
ctx->l = tmp_ptr;
}
while (l_index < idx) {
ocb_double(ctx->l + l_index, ctx->l + l_index + 1);
l_index++;
}
ctx->l_index = l_index;
return ctx->l + idx;
}
/*
* Create a new OCB128_CONTEXT
*/
OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
block128_f encrypt, block128_f decrypt,
ocb128_f stream)
{
OCB128_CONTEXT *octx;
int ret;
if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) {
ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt,
stream);
if (ret)
return octx;
OPENSSL_free(octx);
}
return NULL;
}
/*
* Initialise an existing OCB128_CONTEXT
*/
int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
block128_f encrypt, block128_f decrypt,
ocb128_f stream)
{
memset(ctx, 0, sizeof(*ctx));
ctx->l_index = 0;
ctx->max_l_index = 5;
if ((ctx->l = OPENSSL_malloc(ctx->max_l_index * 16)) == NULL)
return 0;
/*
* We set both the encryption and decryption key schedules - decryption
* needs both. Don't really need decryption schedule if only doing
* encryption - but it simplifies things to take it anyway
*/
ctx->encrypt = encrypt;
ctx->decrypt = decrypt;
ctx->stream = stream;
ctx->keyenc = keyenc;
ctx->keydec = keydec;
/* L_* = ENCIPHER(K, zeros(128)) */
ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
/* L_$ = double(L_*) */
ocb_double(&ctx->l_star, &ctx->l_dollar);
/* L_0 = double(L_$) */
ocb_double(&ctx->l_dollar, ctx->l);
/* L_{i} = double(L_{i-1}) */
ocb_double(ctx->l, ctx->l+1);
ocb_double(ctx->l+1, ctx->l+2);
ocb_double(ctx->l+2, ctx->l+3);
ocb_double(ctx->l+3, ctx->l+4);
ctx->l_index = 4; /* enough to process up to 496 bytes */
return 1;
}
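/*
 * Illustrative note (not in the original): with L_0 .. L_4 precomputed
 * (l_index == 4), every block index i < 32 satisfies ntz(i) <= 4, so up to
 * 31 blocks, i.e. 31 * 16 = 496 bytes, can be processed before
 * ocb_lookup_l() has to extend the table - which is where the "496 bytes"
 * figure above comes from.
 */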
/*
* Copy an OCB128_CONTEXT object
*/
int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src,
void *keyenc, void *keydec)
{
memcpy(dest, src, sizeof(OCB128_CONTEXT));
if (keyenc)
dest->keyenc = keyenc;
if (keydec)
dest->keydec = keydec;
if (src->l) {
if ((dest->l = OPENSSL_malloc(src->max_l_index * 16)) == NULL)
return 0;
memcpy(dest->l, src->l, (src->l_index + 1) * 16);
}
return 1;
}
/*
* Set the IV to be used for this operation. Must be 1 - 15 bytes.
*/
int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
size_t len, size_t taglen)
{
unsigned char ktop[16], tmp[16], mask;
unsigned char stretch[24], nonce[16];
size_t bottom, shift;
/*
* The spec says the IV is 120 bits or fewer - it allows non-byte-aligned
* lengths. We don't support this at this stage.
*/
if ((len > 15) || (len < 1) || (taglen > 16) || (taglen < 1)) {
return -1;
}
/* Reset nonce-dependent variables */
memset(&ctx->sess, 0, sizeof(ctx->sess));
/* Nonce = num2str(TAGLEN mod 128,7) || zeros(120-bitlen(N)) || 1 || N */
nonce[0] = ((taglen * 8) % 128) << 1;
memset(nonce + 1, 0, 15);
memcpy(nonce + 16 - len, iv, len);
nonce[15 - len] |= 1;
/* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
memcpy(tmp, nonce, 16);
tmp[15] &= 0xc0;
ctx->encrypt(tmp, ktop, ctx->keyenc);
/* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
memcpy(stretch, ktop, 16);
ocb_block_xor(ktop, ktop + 1, 8, stretch + 16);
/* bottom = str2num(Nonce[123..128]) */
bottom = nonce[15] & 0x3f;
/* Offset_0 = Stretch[1+bottom..128+bottom] */
shift = bottom % 8;
ocb_block_lshift(stretch + (bottom / 8), shift, ctx->sess.offset.c);
mask = 0xff;
mask <<= 8 - shift;
ctx->sess.offset.c[15] |=
(*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift);
return 1;
}
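/*
 * Illustrative worked example (not in the original): for the common case of
 * a 12-byte IV and a 16-byte tag, the nonce assembled above is
 *   nonce[0]     = ((16 * 8) % 128) << 1 = 0
 *   nonce[1..2]  = 0,  nonce[3] |= 1      (the "1" separator bit)
 *   nonce[4..15] = IV
 * Ktop is the encryption of that nonce with the low 6 bits of the last byte
 * cleared, and "bottom" is exactly those low 6 bits of nonce[15].
 */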
/*
* Provide any AAD. This can be called multiple times. Only the final time can
* have a partial block
*/
int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
size_t len)
{
u64 i, all_num_blocks;
size_t num_blocks, last_len;
OCB_BLOCK tmp;
/* Calculate the number of blocks of AAD provided now, and so far */
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->sess.blocks_hashed;
/* Loop through all full blocks of AAD */
for (i = ctx->sess.blocks_hashed + 1; i <= all_num_blocks; i++) {
OCB_BLOCK *lookup;
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->sess.offset_aad, lookup, &ctx->sess.offset_aad);
memcpy(tmp.c, aad, 16);
aad += 16;
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
ocb_block16_xor(&ctx->sess.offset_aad, &tmp, &tmp);
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&tmp, &ctx->sess.sum, &ctx->sess.sum);
}
/*
* Check if we have any partial blocks left over. This is only valid in the
* last call to this function
*/
last_len = len % 16;
if (last_len > 0) {
/* Offset_* = Offset_m xor L_* */
ocb_block16_xor(&ctx->sess.offset_aad, &ctx->l_star,
&ctx->sess.offset_aad);
/* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
memset(tmp.c, 0, 16);
memcpy(tmp.c, aad, last_len);
tmp.c[last_len] = 0x80;
ocb_block16_xor(&ctx->sess.offset_aad, &tmp, &tmp);
/* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&tmp, &ctx->sess.sum, &ctx->sess.sum);
}
ctx->sess.blocks_hashed = all_num_blocks;
return 1;
}
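/*
 * Illustrative note (not in the original): the L_{ntz(i)} schedule walks the
 * precomputed table as i = 1, 2, 3, 4, 5, 6, 7, 8, ... selects
 * L_0, L_1, L_0, L_2, L_0, L_1, L_0, L_3, ..., which is why ocb_ntz() is
 * cheap on average and why the table only needs roughly log2(blocks) entries.
 */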
/*
* Provide any data to be encrypted. This can be called multiple times. Only
* the final time can have a partial block
*/
int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
u64 i, all_num_blocks;
size_t num_blocks, last_len;
/*
* Calculate the number of blocks of data to be encrypted provided now, and
* so far
*/
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->sess.blocks_processed;
if (num_blocks && all_num_blocks == (size_t)all_num_blocks
&& ctx->stream != NULL) {
size_t max_idx = 0, top = (size_t)all_num_blocks;
/*
* See how many L_{i} entries we need to process data at hand
* and pre-compute missing entries in the table [if any]...
*/
while (top >>= 1)
max_idx++;
if (ocb_lookup_l(ctx, max_idx) == NULL)
return 0;
ctx->stream(in, out, num_blocks, ctx->keyenc,
(size_t)ctx->sess.blocks_processed + 1, ctx->sess.offset.c,
(const unsigned char (*)[16])ctx->l, ctx->sess.checksum.c);
} else {
/* Loop through all full blocks to be encrypted */
for (i = ctx->sess.blocks_processed + 1; i <= all_num_blocks; i++) {
OCB_BLOCK *lookup;
OCB_BLOCK tmp;
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->sess.offset, lookup, &ctx->sess.offset);
memcpy(tmp.c, in, 16);
in += 16;
/* Checksum_i = Checksum_{i-1} xor P_i */
ocb_block16_xor(&tmp, &ctx->sess.checksum, &ctx->sess.checksum);
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
memcpy(out, tmp.c, 16);
out += 16;
}
}
/*
* Check if we have any partial blocks left over. This is only valid in the
* last call to this function
*/
last_len = len % 16;
if (last_len > 0) {
OCB_BLOCK pad;
/* Offset_* = Offset_m xor L_* */
ocb_block16_xor(&ctx->sess.offset, &ctx->l_star, &ctx->sess.offset);
/* Pad = ENCIPHER(K, Offset_*) */
ctx->encrypt(ctx->sess.offset.c, pad.c, ctx->keyenc);
/* C_* = P_* xor Pad[1..bitlen(P_*)] */
ocb_block_xor(in, pad.c, last_len, out);
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
memset(pad.c, 0, 16); /* borrow pad */
memcpy(pad.c, in, last_len);
pad.c[last_len] = 0x80;
ocb_block16_xor(&pad, &ctx->sess.checksum, &ctx->sess.checksum);
}
ctx->sess.blocks_processed = all_num_blocks;
return 1;
}
/*
* Provide any data to be decrypted. This can be called multiple times. Only
* the final time can have a partial block
*/
int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
u64 i, all_num_blocks;
size_t num_blocks, last_len;
/*
* Calculate the number of blocks of data to be decrypted provided now, and
* so far
*/
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->sess.blocks_processed;
if (num_blocks && all_num_blocks == (size_t)all_num_blocks
&& ctx->stream != NULL) {
size_t max_idx = 0, top = (size_t)all_num_blocks;
/*
* See how many L_{i} entries we need to process data at hand
* and pre-compute missing entries in the table [if any]...
*/
while (top >>= 1)
max_idx++;
if (ocb_lookup_l(ctx, max_idx) == NULL)
return 0;
ctx->stream(in, out, num_blocks, ctx->keydec,
(size_t)ctx->sess.blocks_processed + 1, ctx->sess.offset.c,
(const unsigned char (*)[16])ctx->l, ctx->sess.checksum.c);
} else {
OCB_BLOCK tmp;
/* Loop through all full blocks to be decrypted */
for (i = ctx->sess.blocks_processed + 1; i <= all_num_blocks; i++) {
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->sess.offset, lookup, &ctx->sess.offset);
memcpy(tmp.c, in, 16);
in += 16;
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
ctx->decrypt(tmp.c, tmp.c, ctx->keydec);
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
/* Checksum_i = Checksum_{i-1} xor P_i */
ocb_block16_xor(&tmp, &ctx->sess.checksum, &ctx->sess.checksum);
memcpy(out, tmp.c, 16);
out += 16;
}
}
/*
* Check if we have any partial blocks left over. This is only valid in the
* last call to this function
*/
last_len = len % 16;
if (last_len > 0) {
OCB_BLOCK pad;
/* Offset_* = Offset_m xor L_* */
ocb_block16_xor(&ctx->sess.offset, &ctx->l_star, &ctx->sess.offset);
/* Pad = ENCIPHER(K, Offset_*) */
ctx->encrypt(ctx->sess.offset.c, pad.c, ctx->keyenc);
/* P_* = C_* xor Pad[1..bitlen(C_*)] */
ocb_block_xor(in, pad.c, last_len, out);
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
memset(pad.c, 0, 16); /* borrow pad */
memcpy(pad.c, out, last_len);
pad.c[last_len] = 0x80;
ocb_block16_xor(&pad, &ctx->sess.checksum, &ctx->sess.checksum);
}
ctx->sess.blocks_processed = all_num_blocks;
return 1;
}
static int ocb_finish(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len,
int write)
{
OCB_BLOCK tmp;
if (len > 16 || len < 1) {
return -1;
}
/*
* Tag = ENCIPHER(K, Checksum_* xor Offset_* xor L_$) xor HASH(K,A)
*/
ocb_block16_xor(&ctx->sess.checksum, &ctx->sess.offset, &tmp);
ocb_block16_xor(&ctx->l_dollar, &tmp, &tmp);
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&tmp, &ctx->sess.sum, &tmp);
if (write) {
memcpy(tag, &tmp, len);
return 1;
} else {
return CRYPTO_memcmp(&tmp, tag, len);
}
}
/*
* Calculate the tag and verify it against the supplied tag
*/
int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
size_t len)
{
return ocb_finish(ctx, (unsigned char*)tag, len, 0);
}
/*
* Retrieve the calculated tag
*/
int CRYPTO_ocb128_tag(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
return ocb_finish(ctx, tag, len, 1);
}
/*
* Release all resources
*/
void CRYPTO_ocb128_cleanup(OCB128_CONTEXT *ctx)
{
if (ctx) {
OPENSSL_clear_free(ctx->l, ctx->max_l_index * 16);
OPENSSL_cleanse(ctx, sizeof(*ctx));
}
}
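/*
 * Hedged usage sketch (illustrative only, guarded out): a minimal
 * encrypt-side flow through the API above. The AES names (AES_KEY,
 * AES_set_encrypt_key, AES_encrypt) are assumptions drawn from the public
 * AES API, the function-pointer casts follow the usual block128_f idiom, and
 * error handling is compressed; a real decryption path would also need a
 * decryption key schedule and block function.
 */
#if 0
# include <openssl/aes.h>
static int ocb_usage_sketch(const unsigned char key[16],
                            const unsigned char iv[12],
                            const unsigned char *pt, size_t ptlen,
                            unsigned char *ct, unsigned char tag[16])
{
    AES_KEY aeskey;
    OCB128_CONTEXT octx;
    int ok = 0;

    AES_set_encrypt_key(key, 128, &aeskey);
    if (!CRYPTO_ocb128_init(&octx, &aeskey, &aeskey,
                            (block128_f)AES_encrypt, (block128_f)AES_encrypt,
                            NULL))
        return 0;
    if (CRYPTO_ocb128_setiv(&octx, iv, 12, 16) == 1
            && CRYPTO_ocb128_encrypt(&octx, pt, ct, ptlen)
            && CRYPTO_ocb128_tag(&octx, tag, 16) == 1)
        ok = 1;
    CRYPTO_ocb128_cleanup(&octx);
    return ok;
}
#endif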
#endif /* OPENSSL_NO_OCB */

View File

@@ -0,0 +1,86 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
/*
* The input and output are encrypted as though 128 bit OFB mode is being
* used. The extra state information to record how much of the 128 bit block
* we have used is contained in *num.
*/
void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], int *num, block128_f block)
{
unsigned int n;
size_t l = 0;
if (*num < 0) {
/* There is no good way to signal an error return from here */
*num = -1;
return;
}
n = *num;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
*(out++) = *(in++) ^ ivec[n];
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ivec, key);
for (; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n)
^ *(size_t_aX *)(ivec + n);
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ivec, key);
while (len--) {
out[n] = in[n] ^ ivec[n];
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
if (n == 0) {
(*block) (ivec, ivec, key);
}
out[l] = in[l] ^ ivec[n];
++l;
n = (n + 1) % 16;
}
*num = n;
}
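/*
 * Hedged usage sketch (illustrative only, guarded out): OFB is a stream
 * mode, so encryption and decryption are the same call, and *num lets the
 * keystream position carry over between chunks. The AES names are
 * assumptions drawn from the public AES API.
 */
#if 0
# include <openssl/aes.h>
static void ofb_usage_sketch(const unsigned char key[16],
                             unsigned char ivec[16],
                             const unsigned char *in, unsigned char *out,
                             size_t len)
{
    AES_KEY aeskey;
    int num = 0;

    AES_set_encrypt_key(key, 128, &aeskey);
    /* Process in two chunks: *num carries the keystream offset across calls */
    CRYPTO_ofb128_encrypt(in, out, len / 2, &aeskey, ivec, &num,
                          (block128_f)AES_encrypt);
    CRYPTO_ofb128_encrypt(in + len / 2, out + len / 2, len - len / 2,
                          &aeskey, ivec, &num, (block128_f)AES_encrypt);
}
#endif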

View File

@@ -0,0 +1,393 @@
/*
* Copyright 2018-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <stdlib.h>
#include <openssl/crypto.h>
#include <openssl/evp.h>
#include <openssl/core_names.h>
#include <openssl/params.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#include "crypto/siv.h"
#ifndef OPENSSL_NO_SIV
__owur static ossl_inline uint32_t rotl8(uint32_t x)
{
return (x << 8) | (x >> 24);
}
__owur static ossl_inline uint32_t rotr8(uint32_t x)
{
return (x >> 8) | (x << 24);
}
__owur static ossl_inline uint64_t byteswap8(uint64_t x)
{
uint32_t high = (uint32_t)(x >> 32);
uint32_t low = (uint32_t)x;
high = (rotl8(high) & 0x00ff00ff) | (rotr8(high) & 0xff00ff00);
low = (rotl8(low) & 0x00ff00ff) | (rotr8(low) & 0xff00ff00);
return ((uint64_t)low) << 32 | (uint64_t)high;
}
__owur static ossl_inline uint64_t siv128_getword(SIV_BLOCK const *b, size_t i)
{
DECLARE_IS_ENDIAN;
if (IS_LITTLE_ENDIAN)
return byteswap8(b->word[i]);
return b->word[i];
}
static ossl_inline void siv128_putword(SIV_BLOCK *b, size_t i, uint64_t x)
{
DECLARE_IS_ENDIAN;
if (IS_LITTLE_ENDIAN)
b->word[i] = byteswap8(x);
else
b->word[i] = x;
}
static ossl_inline void siv128_xorblock(SIV_BLOCK *x,
SIV_BLOCK const *y)
{
x->word[0] ^= y->word[0];
x->word[1] ^= y->word[1];
}
/*
* Doubles |b|, which is 16 bytes representing an element
* of GF(2**128) modulo the irreducible polynomial
* x**128 + x**7 + x**2 + x + 1.
* Assumes two's-complement arithmetic
*/
static ossl_inline void siv128_dbl(SIV_BLOCK *b)
{
uint64_t high = siv128_getword(b, 0);
uint64_t low = siv128_getword(b, 1);
uint64_t high_carry = high & (((uint64_t)1) << 63);
uint64_t low_carry = low & (((uint64_t)1) << 63);
int64_t low_mask = -((int64_t)(high_carry >> 63)) & 0x87;
uint64_t high_mask = low_carry >> 63;
high = (high << 1) | high_mask;
low = (low << 1) ^ (uint64_t)low_mask;
siv128_putword(b, 0, high);
siv128_putword(b, 1, low);
}
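/*
 * Hedged sketch (not part of the original file, guarded out): the same
 * doubling expressed on the 16-byte big-endian view, with an explicit branch
 * instead of the constant-time masks above; siv128_dbl_bytes is a
 * hypothetical name used only for illustration.
 */
#if 0
static void siv128_dbl_bytes(SIV_BLOCK *b)
{
    unsigned char carry = b->byte[0] & 0x80;
    int i;

    for (i = 0; i < 15; i++)
        b->byte[i] = (unsigned char)((b->byte[i] << 1) | (b->byte[i + 1] >> 7));
    b->byte[15] = (unsigned char)(b->byte[15] << 1);
    if (carry)
        b->byte[15] ^= 0x87;   /* reduce by x^128 + x^7 + x^2 + x + 1 */
}
#endif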
__owur static ossl_inline int siv128_do_s2v_p(SIV128_CONTEXT *ctx, SIV_BLOCK *out,
unsigned char const* in, size_t len)
{
SIV_BLOCK t;
size_t out_len = sizeof(out->byte);
EVP_MAC_CTX *mac_ctx;
int ret = 0;
mac_ctx = EVP_MAC_CTX_dup(ctx->mac_ctx_init);
if (mac_ctx == NULL)
return 0;
if (len >= SIV_LEN) {
if (!EVP_MAC_update(mac_ctx, in, len - SIV_LEN))
goto err;
memcpy(&t, in + (len-SIV_LEN), SIV_LEN);
siv128_xorblock(&t, &ctx->d);
if (!EVP_MAC_update(mac_ctx, t.byte, SIV_LEN))
goto err;
} else {
memset(&t, 0, sizeof(t));
memcpy(&t, in, len);
t.byte[len] = 0x80;
siv128_dbl(&ctx->d);
siv128_xorblock(&t, &ctx->d);
if (!EVP_MAC_update(mac_ctx, t.byte, SIV_LEN))
goto err;
}
if (!EVP_MAC_final(mac_ctx, out->byte, &out_len, sizeof(out->byte))
|| out_len != SIV_LEN)
goto err;
ret = 1;
err:
EVP_MAC_CTX_free(mac_ctx);
return ret;
}
__owur static ossl_inline int siv128_do_encrypt(EVP_CIPHER_CTX *ctx, unsigned char *out,
unsigned char const *in, size_t len,
SIV_BLOCK *icv)
{
int out_len = (int)len;
if (!EVP_CipherInit_ex(ctx, NULL, NULL, NULL, icv->byte, 1))
return 0;
return EVP_EncryptUpdate(ctx, out, &out_len, in, out_len);
}
/*
* Create a new SIV128_CONTEXT
*/
SIV128_CONTEXT *ossl_siv128_new(const unsigned char *key, int klen,
EVP_CIPHER *cbc, EVP_CIPHER *ctr,
OSSL_LIB_CTX *libctx, const char *propq)
{
SIV128_CONTEXT *ctx;
int ret;
if ((ctx = OPENSSL_malloc(sizeof(*ctx))) != NULL) {
ret = ossl_siv128_init(ctx, key, klen, cbc, ctr, libctx, propq);
if (ret)
return ctx;
OPENSSL_free(ctx);
}
return NULL;
}
/*
* Initialise an existing SIV128_CONTEXT
*/
int ossl_siv128_init(SIV128_CONTEXT *ctx, const unsigned char *key, int klen,
const EVP_CIPHER *cbc, const EVP_CIPHER *ctr,
OSSL_LIB_CTX *libctx, const char *propq)
{
static const unsigned char zero[SIV_LEN] = { 0 };
size_t out_len = SIV_LEN;
EVP_MAC_CTX *mac_ctx = NULL;
OSSL_PARAM params[3];
const char *cbc_name;
if (ctx == NULL)
return 0;
memset(&ctx->d, 0, sizeof(ctx->d));
EVP_CIPHER_CTX_free(ctx->cipher_ctx);
EVP_MAC_CTX_free(ctx->mac_ctx_init);
EVP_MAC_free(ctx->mac);
ctx->mac = NULL;
ctx->cipher_ctx = NULL;
ctx->mac_ctx_init = NULL;
if (key == NULL || cbc == NULL || ctr == NULL)
return 0;
cbc_name = EVP_CIPHER_get0_name(cbc);
params[0] = OSSL_PARAM_construct_utf8_string(OSSL_MAC_PARAM_CIPHER,
(char *)cbc_name, 0);
params[1] = OSSL_PARAM_construct_octet_string(OSSL_MAC_PARAM_KEY,
(void *)key, klen);
params[2] = OSSL_PARAM_construct_end();
if ((ctx->cipher_ctx = EVP_CIPHER_CTX_new()) == NULL
|| (ctx->mac =
EVP_MAC_fetch(libctx, OSSL_MAC_NAME_CMAC, propq)) == NULL
|| (ctx->mac_ctx_init = EVP_MAC_CTX_new(ctx->mac)) == NULL
|| !EVP_MAC_CTX_set_params(ctx->mac_ctx_init, params)
|| !EVP_EncryptInit_ex(ctx->cipher_ctx, ctr, NULL, key + klen, NULL)
|| (mac_ctx = EVP_MAC_CTX_dup(ctx->mac_ctx_init)) == NULL
|| !EVP_MAC_update(mac_ctx, zero, sizeof(zero))
|| !EVP_MAC_final(mac_ctx, ctx->d.byte, &out_len,
sizeof(ctx->d.byte))) {
EVP_CIPHER_CTX_free(ctx->cipher_ctx);
EVP_MAC_CTX_free(ctx->mac_ctx_init);
EVP_MAC_CTX_free(mac_ctx);
EVP_MAC_free(ctx->mac);
return 0;
}
EVP_MAC_CTX_free(mac_ctx);
ctx->final_ret = -1;
ctx->crypto_ok = 1;
return 1;
}
/*
* Copy an SIV128_CONTEXT object
*/
int ossl_siv128_copy_ctx(SIV128_CONTEXT *dest, SIV128_CONTEXT *src)
{
memcpy(&dest->d, &src->d, sizeof(src->d));
if (dest->cipher_ctx == NULL) {
dest->cipher_ctx = EVP_CIPHER_CTX_new();
if (dest->cipher_ctx == NULL)
return 0;
}
if (!EVP_CIPHER_CTX_copy(dest->cipher_ctx, src->cipher_ctx))
return 0;
EVP_MAC_CTX_free(dest->mac_ctx_init);
dest->mac_ctx_init = EVP_MAC_CTX_dup(src->mac_ctx_init);
if (dest->mac_ctx_init == NULL)
return 0;
dest->mac = src->mac;
if (dest->mac != NULL)
EVP_MAC_up_ref(dest->mac);
return 1;
}
/*
* Provide any AAD. This can be called multiple times.
* Per RFC 5297, the last piece of associated data
* is the nonce, but it is not treated specially here
*/
int ossl_siv128_aad(SIV128_CONTEXT *ctx, const unsigned char *aad,
size_t len)
{
SIV_BLOCK mac_out;
size_t out_len = SIV_LEN;
EVP_MAC_CTX *mac_ctx;
siv128_dbl(&ctx->d);
if ((mac_ctx = EVP_MAC_CTX_dup(ctx->mac_ctx_init)) == NULL
|| !EVP_MAC_update(mac_ctx, aad, len)
|| !EVP_MAC_final(mac_ctx, mac_out.byte, &out_len,
sizeof(mac_out.byte))
|| out_len != SIV_LEN) {
EVP_MAC_CTX_free(mac_ctx);
return 0;
}
EVP_MAC_CTX_free(mac_ctx);
siv128_xorblock(&ctx->d, &mac_out);
return 1;
}
/*
* Provide any data to be encrypted. This can be called once.
*/
int ossl_siv128_encrypt(SIV128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
SIV_BLOCK q;
/* can only do one crypto operation */
if (ctx->crypto_ok == 0)
return 0;
ctx->crypto_ok--;
if (!siv128_do_s2v_p(ctx, &q, in, len))
return 0;
memcpy(ctx->tag.byte, &q, SIV_LEN);
q.byte[8] &= 0x7f;
q.byte[12] &= 0x7f;
if (!siv128_do_encrypt(ctx->cipher_ctx, out, in, len, &q))
return 0;
ctx->final_ret = 0;
return len;
}
/*
* Provide any data to be decrypted. This can be called once.
*/
int ossl_siv128_decrypt(SIV128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
unsigned char* p;
SIV_BLOCK t, q;
int i;
/* can only do one crypto operation */
if (ctx->crypto_ok == 0)
return 0;
ctx->crypto_ok--;
memcpy(&q, ctx->tag.byte, SIV_LEN);
q.byte[8] &= 0x7f;
q.byte[12] &= 0x7f;
if (!siv128_do_encrypt(ctx->cipher_ctx, out, in, len, &q)
|| !siv128_do_s2v_p(ctx, &t, out, len))
return 0;
p = ctx->tag.byte;
for (i = 0; i < SIV_LEN; i++)
t.byte[i] ^= p[i];
if ((t.word[0] | t.word[1]) != 0) {
OPENSSL_cleanse(out, len);
return 0;
}
ctx->final_ret = 0;
return len;
}
/*
* Return the already calculated final result.
*/
int ossl_siv128_finish(SIV128_CONTEXT *ctx)
{
return ctx->final_ret;
}
/*
* Set the tag
*/
int ossl_siv128_set_tag(SIV128_CONTEXT *ctx, const unsigned char *tag, size_t len)
{
if (len != SIV_LEN)
return 0;
/* Copy the tag from the supplied buffer */
memcpy(ctx->tag.byte, tag, len);
return 1;
}
/*
* Retrieve the calculated tag
*/
int ossl_siv128_get_tag(SIV128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
if (len != SIV_LEN)
return 0;
/* Copy the tag into the supplied buffer */
memcpy(tag, ctx->tag.byte, len);
return 1;
}
/*
* Release all resources
*/
int ossl_siv128_cleanup(SIV128_CONTEXT *ctx)
{
if (ctx != NULL) {
EVP_CIPHER_CTX_free(ctx->cipher_ctx);
ctx->cipher_ctx = NULL;
EVP_MAC_CTX_free(ctx->mac_ctx_init);
ctx->mac_ctx_init = NULL;
EVP_MAC_free(ctx->mac);
ctx->mac = NULL;
OPENSSL_cleanse(&ctx->d, sizeof(ctx->d));
OPENSSL_cleanse(&ctx->tag, sizeof(ctx->tag));
ctx->final_ret = -1;
ctx->crypto_ok = 1;
}
return 1;
}
int ossl_siv128_speed(SIV128_CONTEXT *ctx, int arg)
{
ctx->crypto_ok = (arg == 1) ? -1 : 1;
return 1;
}
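/*
 * Hedged usage sketch (illustrative only, guarded out): AES-128-SIV with a
 * 32-byte key split as S2V/CMAC key || CTR key, mirroring how
 * ossl_siv128_init() consumes "key" and "key + klen". The fetchable cipher
 * names, the OPENSSL_free() of the context and the assumption that ptlen > 0
 * are all part of the sketch, not of this file; error handling is compressed.
 */
#if 0
static int siv_usage_sketch(OSSL_LIB_CTX *libctx, const unsigned char key[32],
                            const unsigned char *aad, size_t aadlen,
                            const unsigned char *pt, size_t ptlen,
                            unsigned char *ct, unsigned char tag[SIV_LEN])
{
    EVP_CIPHER *cbc = EVP_CIPHER_fetch(libctx, "AES-128-CBC", NULL);
    EVP_CIPHER *ctr = EVP_CIPHER_fetch(libctx, "AES-128-CTR", NULL);
    SIV128_CONTEXT *sctx = NULL;
    int ok = 0;

    if (cbc != NULL && ctr != NULL
            && (sctx = ossl_siv128_new(key, 16, cbc, ctr, libctx, NULL)) != NULL
            && ossl_siv128_aad(sctx, aad, aadlen)
            && ossl_siv128_encrypt(sctx, pt, ct, ptlen) > 0
            && ossl_siv128_get_tag(sctx, tag, SIV_LEN))
        ok = 1;
    if (sctx != NULL) {
        ossl_siv128_cleanup(sctx);
        OPENSSL_free(sctx);
    }
    EVP_CIPHER_free(cbc);
    EVP_CIPHER_free(ctr);
    return ok;
}
#endif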
#endif /* OPENSSL_NO_SIV */

View File

@@ -0,0 +1,331 @@
/*
* Copyright 2013-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/** Beware!
*
* The following wrapping modes were designed for AES, but this implementation
* allows you to use them for any 128 bit block cipher.
*/
#include "internal/cryptlib.h"
#include <openssl/modes.h>
/** RFC 3394 section 2.2.3.1 Default Initial Value */
static const unsigned char default_iv[] = {
0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
};
/** RFC 5649 section 3 Alternative Initial Value 32-bit constant */
static const unsigned char default_aiv[] = {
0xA6, 0x59, 0x59, 0xA6
};
/** Input size limit: lower than maximum of standards but far larger than
* anything that will be used in practice.
*/
#define CRYPTO128_WRAP_MAX (1UL << 31)
/** Wrapping according to RFC 3394 section 2.2.1.
*
* @param[in] key Key value.
* @param[in] iv IV value. Length = 8 bytes. NULL = use default_iv.
* @param[in] in Plaintext as n 64-bit blocks, n >= 2.
* @param[in] inlen Length of in.
* @param[out] out Ciphertext. Minimal buffer length = (inlen + 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] block Block processing function.
* @return 0 if inlen does not consist of n 64-bit blocks, n >= 2,
* or if inlen > CRYPTO128_WRAP_MAX.
* Output length if wrapping succeeded.
*/
size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
unsigned char *A, B[16], *R;
size_t i, j, t;
if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
return 0;
A = B;
t = 1;
memmove(out + 8, in, inlen);
if (!iv)
iv = default_iv;
memcpy(A, iv, 8);
for (j = 0; j < 6; j++) {
R = out + 8;
for (i = 0; i < inlen; i += 8, t++, R += 8) {
memcpy(B + 8, R, 8);
block(B, B, key);
A[7] ^= (unsigned char)(t & 0xff);
if (t > 0xff) {
A[6] ^= (unsigned char)((t >> 8) & 0xff);
A[5] ^= (unsigned char)((t >> 16) & 0xff);
A[4] ^= (unsigned char)((t >> 24) & 0xff);
}
memcpy(R, B + 8, 8);
}
}
memcpy(out, A, 8);
return inlen + 8;
}
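/*
 * Hedged usage sketch (illustrative only, guarded out): wrapping a 32-byte
 * content-encryption key under an AES-256 key-encryption key, per RFC 3394.
 * The AES names are assumptions drawn from the public AES API; the output is
 * inlen + 8 = 40 bytes and the default IV A6A6A6A6A6A6A6A6 is used.
 */
#if 0
# include <openssl/aes.h>
static size_t wrap_usage_sketch(const unsigned char kek[32],
                                const unsigned char cek[32],
                                unsigned char out[40])
{
    AES_KEY aeskey;

    AES_set_encrypt_key(kek, 256, &aeskey);
    return CRYPTO_128_wrap(&aeskey, NULL, out, cek, 32,
                           (block128_f)AES_encrypt);
}
#endif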
/** Unwrapping according to RFC 3394 section 2.2.2 steps 1-2.
* The IV check (step 3) is responsibility of the caller.
*
* @param[in] key Key value.
* @param[out] iv Unchecked IV value. Minimal buffer length = 8 bytes.
* @param[out] out Plaintext without IV.
* Minimal buffer length = (inlen - 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Ciphertext as n 64-bit blocks.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
* or if inlen is not a multiple of 8.
* Output length otherwise.
*/
static size_t crypto_128_unwrap_raw(void *key, unsigned char *iv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
unsigned char *A, B[16], *R;
size_t i, j, t;
inlen -= 8;
if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
return 0;
A = B;
t = 6 * (inlen >> 3);
memcpy(A, in, 8);
memmove(out, in + 8, inlen);
for (j = 0; j < 6; j++) {
R = out + inlen - 8;
for (i = 0; i < inlen; i += 8, t--, R -= 8) {
A[7] ^= (unsigned char)(t & 0xff);
if (t > 0xff) {
A[6] ^= (unsigned char)((t >> 8) & 0xff);
A[5] ^= (unsigned char)((t >> 16) & 0xff);
A[4] ^= (unsigned char)((t >> 24) & 0xff);
}
memcpy(B + 8, R, 8);
block(B, B, key);
memcpy(R, B + 8, 8);
}
}
memcpy(iv, A, 8);
return inlen;
}
/** Unwrapping according to RFC 3394 section 2.2.2, including the IV check.
* The first block of plaintext has to match the supplied IV, otherwise an
* error is returned.
*
* @param[in] key Key value.
* @param[out] iv IV value to match against. Length = 8 bytes.
* NULL = use default_iv.
* @param[out] out Plaintext without IV.
* Minimal buffer length = (inlen - 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Ciphertext as n 64-bit blocks.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
* or if inlen is not a multiple of 8
* or if IV doesn't match expected value.
* Output length otherwise.
*/
size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
unsigned char *out, const unsigned char *in,
size_t inlen, block128_f block)
{
size_t ret;
unsigned char got_iv[8];
ret = crypto_128_unwrap_raw(key, got_iv, out, in, inlen, block);
if (ret == 0)
return 0;
if (!iv)
iv = default_iv;
if (CRYPTO_memcmp(got_iv, iv, 8)) {
OPENSSL_cleanse(out, ret);
return 0;
}
return ret;
}
/** Wrapping according to RFC 5649 section 4.1.
*
* @param[in] key Key value.
* @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
* @param[out] out Ciphertext. Minimal buffer length = (inlen + 15) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Plaintext as n 64-bit blocks, n >= 2.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [1, CRYPTO128_WRAP_MAX].
* Output length if wrapping succeeded.
*/
size_t CRYPTO_128_wrap_pad(void *key, const unsigned char *icv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
/* n: number of 64-bit blocks in the padded key data
*
* If length of plain text is not a multiple of 8, pad the plain text octet
* string on the right with octets of zeros, where final length is the
* smallest multiple of 8 that is greater than length of plain text.
* If length of plain text is a multiple of 8, then there is no padding. */
const size_t blocks_padded = (inlen + 7) / 8; /* CEILING(m/8) */
const size_t padded_len = blocks_padded * 8;
const size_t padding_len = padded_len - inlen;
/* RFC 5649 section 3: Alternative Initial Value */
unsigned char aiv[8];
int ret;
/* Section 1: use 32-bit fixed field for plaintext octet length */
if (inlen == 0 || inlen >= CRYPTO128_WRAP_MAX)
return 0;
/* Section 3: Alternative Initial Value */
if (!icv)
memcpy(aiv, default_aiv, 4);
else
memcpy(aiv, icv, 4); /* Standard doesn't mention this. */
aiv[4] = (inlen >> 24) & 0xFF;
aiv[5] = (inlen >> 16) & 0xFF;
aiv[6] = (inlen >> 8) & 0xFF;
aiv[7] = inlen & 0xFF;
if (padded_len == 8) {
/*
* Section 4.1 - special case in step 2: If the padded plaintext
* contains exactly eight octets, then prepend the AIV and encrypt
* the resulting 128-bit block using AES in ECB mode.
*/
memmove(out + 8, in, inlen);
memcpy(out, aiv, 8);
memset(out + 8 + inlen, 0, padding_len);
block(out, out, key);
ret = 16; /* AIV + padded input */
} else {
memmove(out, in, inlen);
memset(out + inlen, 0, padding_len); /* Section 4.1 step 1 */
ret = CRYPTO_128_wrap(key, aiv, out, out, padded_len, block);
}
return ret;
}
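/*
 * Illustrative worked example (not in the original): for inlen = 20,
 * blocks_padded = CEILING(20/8) = 3, padded_len = 24 and padding_len = 4, so
 * the plaintext is padded with four zero octets and the AIV becomes
 * A6 59 59 A6 00 00 00 14 (0x14 = 20, the message length indicator). The
 * inner CRYPTO_128_wrap() call then returns 24 + 8 = 32 bytes.
 */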
/** Unwrapping according to RFC 5649 section 4.2.
*
* @param[in] key Key value.
* @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
* @param[out] out Plaintext. Minimal buffer length = (inlen - 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Ciphertext as n 64-bit blocks.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [16, CRYPTO128_WRAP_MAX],
* or if inlen is not a multiple of 8
* or if the IV and message length indicator don't match.
* Output length if unwrapping succeeded and IV matches.
*/
size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
/* n: number of 64-bit blocks in the padded key data */
size_t n = inlen / 8 - 1;
size_t padded_len;
size_t padding_len;
size_t ptext_len;
/* RFC 5649 section 3: Alternative Initial Value */
unsigned char aiv[8];
static unsigned char zeros[8] = { 0x0 };
size_t ret;
/* Section 4.2: Ciphertext length has to be (n+1) 64-bit blocks. */
if ((inlen & 0x7) != 0 || inlen < 16 || inlen >= CRYPTO128_WRAP_MAX)
return 0;
if (inlen == 16) {
/*
* Section 4.2 - special case in step 1: When n=1, the ciphertext
* contains exactly two 64-bit blocks and they are decrypted as a
* single AES block using AES in ECB mode: AIV | P[1] = DEC(K, C[0] |
* C[1])
*/
unsigned char buff[16];
block(in, buff, key);
memcpy(aiv, buff, 8);
/* Remove AIV */
memcpy(out, buff + 8, 8);
padded_len = 8;
OPENSSL_cleanse(buff, inlen);
} else {
padded_len = inlen - 8;
ret = crypto_128_unwrap_raw(key, aiv, out, in, inlen, block);
if (padded_len != ret) {
OPENSSL_cleanse(out, inlen);
return 0;
}
}
/*
* Section 3: AIV checks: Check that MSB(32,A) = A65959A6. Optionally a
* user-supplied value can be used (even if standard doesn't mention
* this).
*/
if ((!icv && CRYPTO_memcmp(aiv, default_aiv, 4))
|| (icv && CRYPTO_memcmp(aiv, icv, 4))) {
OPENSSL_cleanse(out, inlen);
return 0;
}
/*
* Check that 8*(n-1) < LSB(32,AIV) <= 8*n. If so, let ptext_len =
* LSB(32,AIV).
*/
ptext_len = ((unsigned int)aiv[4] << 24)
| ((unsigned int)aiv[5] << 16)
| ((unsigned int)aiv[6] << 8)
| (unsigned int)aiv[7];
if (8 * (n - 1) >= ptext_len || ptext_len > 8 * n) {
OPENSSL_cleanse(out, inlen);
return 0;
}
/*
* Check that the rightmost padding_len octets of the output data are
* zero.
*/
padding_len = padded_len - ptext_len;
if (CRYPTO_memcmp(out + ptext_len, zeros, padding_len) != 0) {
OPENSSL_cleanse(out, inlen);
return 0;
}
/* Section 4.2 step 3: Remove padding */
return ptext_len;
}

View File

@@ -0,0 +1,161 @@
/*
* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#ifndef STRICT_ALIGNMENT
# ifdef __GNUC__
typedef u64 u64_a1 __attribute((__aligned__(1)));
# else
typedef u64 u64_a1;
# endif
#endif
int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
const unsigned char iv[16],
const unsigned char *inp, unsigned char *out,
size_t len, int enc)
{
DECLARE_IS_ENDIAN;
union {
u64 u[2];
u32 d[4];
u8 c[16];
} tweak, scratch;
unsigned int i;
if (len < 16)
return -1;
memcpy(tweak.c, iv, 16);
(*ctx->block2) (tweak.c, tweak.c, ctx->key2);
if (!enc && (len % 16))
len -= 16;
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
#endif
inp += 16;
out += 16;
len -= 16;
if (len == 0)
return 0;
if (IS_LITTLE_ENDIAN) {
unsigned int carry, res;
res = 0x87 & (((int)tweak.d[3]) >> 31);
carry = (unsigned int)(tweak.u[0] >> 63);
tweak.u[0] = (tweak.u[0] << 1) ^ res;
tweak.u[1] = (tweak.u[1] << 1) | carry;
} else {
size_t c;
for (c = 0, i = 0; i < 16; ++i) {
/*
* + substitutes for |, because c is 1 bit
*/
c += ((size_t)tweak.c[i]) << 1;
tweak.c[i] = (u8)c;
c = c >> 8;
}
tweak.c[0] ^= (u8)(0x87 & (0 - c));
}
}
if (enc) {
for (i = 0; i < len; ++i) {
u8 c = inp[i];
out[i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out - 16, scratch.c, 16);
} else {
union {
u64 u[2];
u8 c[16];
} tweak1;
if (IS_LITTLE_ENDIAN) {
unsigned int carry, res;
res = 0x87 & (((int)tweak.d[3]) >> 31);
carry = (unsigned int)(tweak.u[0] >> 63);
tweak1.u[0] = (tweak.u[0] << 1) ^ res;
tweak1.u[1] = (tweak.u[1] << 1) | carry;
} else {
size_t c;
for (c = 0, i = 0; i < 16; ++i) {
/*
* + substitutes for |, because c is 1 bit
*/
c += ((size_t)tweak.c[i]) << 1;
tweak1.c[i] = (u8)c;
c = c >> 8;
}
tweak1.c[0] ^= (u8)(0x87 & (0 - c));
}
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
for (i = 0; i < len; ++i) {
u8 c = inp[16 + i];
out[16 + i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1];
#endif
}
return 0;
}
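/*
 * Hedged sketch (not part of the original file, guarded out): the tweak
 * update inside the loop above is multiplication by x in GF(2^128) in the
 * XTS convention (byte 0 least significant), i.e. a 128-bit left shift with
 * conditional reduction by 0x87. A byte-wise equivalent for illustration:
 */
#if 0
static void xts_tweak_times_x(unsigned char t[16])
{
    unsigned int carry = 0, bit, i;

    for (i = 0; i < 16; i++) {
        bit = (t[i] >> 7) & 1;              /* bit shifted out of this byte */
        t[i] = (unsigned char)((t[i] << 1) | carry);
        carry = bit;
    }
    if (carry)
        t[0] ^= 0x87;       /* reduce by x^128 + x^7 + x^2 + x + 1 */
}
#endif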

View File

@@ -0,0 +1,199 @@
/*
* Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#ifndef STRICT_ALIGNMENT
# ifdef __GNUC__
typedef u64 u64_a1 __attribute((__aligned__(1)));
# else
typedef u64 u64_a1;
# endif
#endif
int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
const unsigned char iv[16],
const unsigned char *inp, unsigned char *out,
size_t len, int enc)
{
DECLARE_IS_ENDIAN;
union {
u64 u[2];
u32 d[4];
u8 c[16];
} tweak, scratch;
unsigned int i;
if (len < 16)
return -1;
memcpy(tweak.c, iv, 16);
(*ctx->block2) (tweak.c, tweak.c, ctx->key2);
if (!enc && (len % 16))
len -= 16;
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
#endif
inp += 16;
out += 16;
len -= 16;
if (len == 0)
return 0;
if (IS_LITTLE_ENDIAN) {
u8 res;
u64 hi, lo;
#ifdef BSWAP8
hi = BSWAP8(tweak.u[0]);
lo = BSWAP8(tweak.u[1]);
#else
u8 *p = tweak.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
res = (u8)lo & 1;
tweak.u[0] = (lo >> 1) | (hi << 63);
tweak.u[1] = hi >> 1;
if (res)
tweak.c[15] ^= 0xe1;
#ifdef BSWAP8
hi = BSWAP8(tweak.u[0]);
lo = BSWAP8(tweak.u[1]);
#else
p = tweak.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
tweak.u[0] = lo;
tweak.u[1] = hi;
} else {
u8 carry, res;
carry = 0;
for (i = 0; i < 16; ++i) {
res = (tweak.c[i] << 7) & 0x80;
tweak.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
carry = res;
}
if (res)
tweak.c[0] ^= 0xe1;
}
}
if (enc) {
for (i = 0; i < len; ++i) {
u8 c = inp[i];
out[i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out - 16, scratch.c, 16);
} else {
union {
u64 u[2];
u8 c[16];
} tweak1;
if (IS_LITTLE_ENDIAN) {
u8 res;
u64 hi, lo;
#ifdef BSWAP8
hi = BSWAP8(tweak.u[0]);
lo = BSWAP8(tweak.u[1]);
#else
u8 *p = tweak.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
res = (u8)lo & 1;
tweak1.u[0] = (lo >> 1) | (hi << 63);
tweak1.u[1] = hi >> 1;
if (res)
tweak1.c[15] ^= 0xe1;
#ifdef BSWAP8
hi = BSWAP8(tweak1.u[0]);
lo = BSWAP8(tweak1.u[1]);
#else
p = tweak1.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
tweak1.u[0] = lo;
tweak1.u[1] = hi;
} else {
u8 carry, res;
carry = 0;
for (i = 0; i < 16; ++i) {
res = (tweak.c[i] << 7) & 0x80;
tweak1.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
carry = res;
}
if (res)
tweak1.c[0] ^= 0xe1;
}
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
for (i = 0; i < len; ++i) {
u8 c = inp[16 + i];
out[16 + i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1];
#endif
}
return 0;
}
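/*
 * Illustrative note (not in the original): unlike CRYPTO_xts128_encrypt(),
 * this GB variant updates the tweak in the bit-reflected convention also
 * used by GHASH - the 128-bit tweak is shifted right by one bit and, when a
 * bit drops out, reduced with 0xe1 in the leading byte rather than 0x87 in
 * the trailing one. It is still multiplication by x in GF(2^128), just with
 * the opposite bit ordering.
 */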