Adding in curl and openssl repos

2025-08-14 12:09:30 -04:00
parent af2117b574
commit 0ace93e303
21174 changed files with 3607720 additions and 2 deletions

File diff suppressed because it is too large


@@ -0,0 +1,812 @@
.text
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
.cfi_startproc
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x
.align 32
.Loop6x:
addl $100663296,%ebx
jc .Lhandle_ctr32
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11
.Lresume_ctr32:
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
xorq %r12,%r12
cmpq %r14,%r15
vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
movbeq 88(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 80(%r14),%r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14
vmovups 48-128(%rcx),%xmm15
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm3,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
vaesenc %xmm15,%xmm11,%xmm11
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
vmovdqu 80+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vpxor %xmm1,%xmm4,%xmm4
vmovdqu 64-32(%r9),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vmovups 64-128(%rcx),%xmm15
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
vaesenc %xmm15,%xmm10,%xmm10
movbeq 72(%r14),%r13
vpxor %xmm5,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
movbeq 64(%r14),%r12
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
vmovdqu 96+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,48+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,56+8(%rsp)
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 96-32(%r9),%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vmovups 80-128(%rcx),%xmm15
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
vaesenc %xmm15,%xmm10,%xmm10
movbeq 56(%r14),%r13
vpxor %xmm1,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
vpxor 112+8(%rsp),%xmm8,%xmm8
vaesenc %xmm15,%xmm11,%xmm11
movbeq 48(%r14),%r12
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,64+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,72+8(%rsp)
vpxor %xmm3,%xmm4,%xmm4
vmovdqu 112-32(%r9),%xmm3
vaesenc %xmm15,%xmm14,%xmm14
vmovups 96-128(%rcx),%xmm15
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
vaesenc %xmm15,%xmm10,%xmm10
movbeq 40(%r14),%r13
vpxor %xmm2,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
movbeq 32(%r14),%r12
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,80+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,88+8(%rsp)
vpxor %xmm5,%xmm6,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor %xmm1,%xmm6,%xmm6
vmovups 112-128(%rcx),%xmm15
vpslldq $8,%xmm6,%xmm5
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 16(%r11),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm8,%xmm7,%xmm7
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm5,%xmm4,%xmm4
movbeq 24(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 16(%r14),%r12
vpalignr $8,%xmm4,%xmm4,%xmm0
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
movq %r13,96+8(%rsp)
vaesenc %xmm15,%xmm12,%xmm12
movq %r12,104+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
vmovups 128-128(%rcx),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vmovups 144-128(%rcx),%xmm15
vaesenc %xmm1,%xmm10,%xmm10
vpsrldq $8,%xmm6,%xmm6
vaesenc %xmm1,%xmm11,%xmm11
vpxor %xmm6,%xmm7,%xmm7
vaesenc %xmm1,%xmm12,%xmm12
vpxor %xmm0,%xmm4,%xmm4
movbeq 8(%r14),%r13
vaesenc %xmm1,%xmm13,%xmm13
movbeq 0(%r14),%r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
cmpl $11,%ebp
jb .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
je .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 208-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 224-128(%rcx),%xmm1
jmp .Lenc_tail
.align 32
.Lhandle_ctr32:
vmovdqu (%r11),%xmm0
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vmovdqu 0-32(%r9),%xmm3
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm15,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm15,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpshufb %xmm0,%xmm14,%xmm14
vpshufb %xmm0,%xmm1,%xmm1
jmp .Lresume_ctr32
.align 32
.Lenc_tail:
vaesenc %xmm15,%xmm9,%xmm9
vmovdqu %xmm7,16+8(%rsp)
vpalignr $8,%xmm4,%xmm4,%xmm8
vaesenc %xmm15,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
vpxor 0(%rdi),%xmm1,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
vpxor 16(%rdi),%xmm1,%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vpxor 32(%rdi),%xmm1,%xmm5
vaesenc %xmm15,%xmm13,%xmm13
vpxor 48(%rdi),%xmm1,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor 64(%rdi),%xmm1,%xmm7
vpxor 80(%rdi),%xmm1,%xmm3
vmovdqu (%r8),%xmm1
vaesenclast %xmm2,%xmm9,%xmm9
vmovdqu 32(%r11),%xmm2
vaesenclast %xmm0,%xmm10,%xmm10
vpaddb %xmm2,%xmm1,%xmm0
movq %r13,112+8(%rsp)
leaq 96(%rdi),%rdi
vaesenclast %xmm5,%xmm11,%xmm11
vpaddb %xmm2,%xmm0,%xmm5
movq %r12,120+8(%rsp)
leaq 96(%rsi),%rsi
vmovdqu 0-128(%rcx),%xmm15
vaesenclast %xmm6,%xmm12,%xmm12
vpaddb %xmm2,%xmm5,%xmm6
vaesenclast %xmm7,%xmm13,%xmm13
vpaddb %xmm2,%xmm6,%xmm7
vaesenclast %xmm3,%xmm14,%xmm14
vpaddb %xmm2,%xmm7,%xmm3
addq $0x60,%r10
subq $0x6,%rdx
jc .L6x_done
vmovups %xmm9,-96(%rsi)
vpxor %xmm15,%xmm1,%xmm9
vmovups %xmm10,-80(%rsi)
vmovdqa %xmm0,%xmm10
vmovups %xmm11,-64(%rsi)
vmovdqa %xmm5,%xmm11
vmovups %xmm12,-48(%rsi)
vmovdqa %xmm6,%xmm12
vmovups %xmm13,-32(%rsi)
vmovdqa %xmm7,%xmm13
vmovups %xmm14,-16(%rsi)
vmovdqa %xmm3,%xmm14
vmovdqu 32+8(%rsp),%xmm7
jmp .Loop6x
.L6x_done:
vpxor 16+8(%rsp),%xmm8,%xmm8
vpxor %xmm4,%xmm8,%xmm8
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
.align 32
aesni_gcm_decrypt:
.cfi_startproc
xorq %r10,%r10
cmpq $0x60,%rdx
jb .Lgcm_dec_abort
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
vmovdqu (%r9),%xmm8
andq $-128,%rsp
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
leaq 32+32(%r9),%r9
movl 240-128(%rcx),%ebp
vpshufb %xmm0,%xmm8,%xmm8
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Ldec_no_key_aliasing
cmpq $768,%r15
jnc .Ldec_no_key_aliasing
subq %r15,%rsp
.Ldec_no_key_aliasing:
vmovdqu 80(%rdi),%xmm7
leaq (%rdi),%r14
vmovdqu 64(%rdi),%xmm4
leaq -192(%rdi,%rdx,1),%r15
vmovdqu 48(%rdi),%xmm5
shrq $4,%rdx
xorq %r10,%r10
vmovdqu 32(%rdi),%xmm6
vpshufb %xmm0,%xmm7,%xmm7
vmovdqu 16(%rdi),%xmm2
vpshufb %xmm0,%xmm4,%xmm4
vmovdqu (%rdi),%xmm3
vpshufb %xmm0,%xmm5,%xmm5
vmovdqu %xmm4,48(%rsp)
vpshufb %xmm0,%xmm6,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm2,%xmm2
vmovdqu %xmm6,80(%rsp)
vpshufb %xmm0,%xmm3,%xmm3
vmovdqu %xmm2,96(%rsp)
vmovdqu %xmm3,112(%rsp)
call _aesni_ctr32_ghash_6x
vmovups %xmm9,-96(%rsi)
vmovups %xmm10,-80(%rsi)
vmovups %xmm11,-64(%rsi)
vmovups %xmm12,-48(%rsi)
vmovups %xmm13,-32(%rsi)
vmovups %xmm14,-16(%rsi)
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
movq %r10,%rax
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
.cfi_startproc
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
leaq -1(%rbp),%r13
vmovups 16-128(%rcx),%xmm15
leaq 32-128(%rcx),%r12
vpxor %xmm4,%xmm1,%xmm9
addl $100663296,%ebx
jc .Lhandle_ctr32_2
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddb %xmm2,%xmm11,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddb %xmm2,%xmm12,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.align 16
.Loop_ctr32:
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vmovups (%r12),%xmm15
leaq 16(%r12),%r12
decl %r13d
jnz .Loop_ctr32
vmovdqu (%r12),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor 0(%rdi),%xmm3,%xmm4
vaesenc %xmm15,%xmm10,%xmm10
vpxor 16(%rdi),%xmm3,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
vpxor 32(%rdi),%xmm3,%xmm6
vaesenc %xmm15,%xmm12,%xmm12
vpxor 48(%rdi),%xmm3,%xmm8
vaesenc %xmm15,%xmm13,%xmm13
vpxor 64(%rdi),%xmm3,%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vpxor 80(%rdi),%xmm3,%xmm3
leaq 96(%rdi),%rdi
vaesenclast %xmm4,%xmm9,%xmm9
vaesenclast %xmm5,%xmm10,%xmm10
vaesenclast %xmm6,%xmm11,%xmm11
vaesenclast %xmm8,%xmm12,%xmm12
vaesenclast %xmm2,%xmm13,%xmm13
vaesenclast %xmm3,%xmm14,%xmm14
vmovups %xmm9,0(%rsi)
vmovups %xmm10,16(%rsi)
vmovups %xmm11,32(%rsi)
vmovups %xmm12,48(%rsi)
vmovups %xmm13,64(%rsi)
vmovups %xmm14,80(%rsi)
leaq 96(%rsi),%rsi
.byte 0xf3,0xc3 # rep ret
.align 32
.Lhandle_ctr32_2:
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpshufb %xmm0,%xmm14,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpshufb %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
.align 32
aesni_gcm_encrypt:
.cfi_startproc
xorq %r10,%r10
cmpq $288,%rdx
jb .Lgcm_enc_abort
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
leaq 128(%rcx),%rcx
vmovdqu (%r11),%xmm0
andq $-128,%rsp
movl 240-128(%rcx),%ebp
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Lenc_no_key_aliasing
cmpq $768,%r15
jnc .Lenc_no_key_aliasing
subq %r15,%rsp
.Lenc_no_key_aliasing:
leaq (%rsi),%r14
leaq -192(%rsi,%rdx,1),%r15
shrq $4,%rdx
call _aesni_ctr32_6x
vpshufb %xmm0,%xmm9,%xmm8
vpshufb %xmm0,%xmm10,%xmm2
vmovdqu %xmm8,112(%rsp)
vpshufb %xmm0,%xmm11,%xmm4
vmovdqu %xmm2,96(%rsp)
vpshufb %xmm0,%xmm12,%xmm5
vmovdqu %xmm4,80(%rsp)
vpshufb %xmm0,%xmm13,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm14,%xmm7
vmovdqu %xmm6,48(%rsp)
call _aesni_ctr32_6x
vmovdqu (%r9),%xmm8
leaq 32+32(%r9),%r9
subq $12,%rdx
movq $192,%r10
vpshufb %xmm0,%xmm8,%xmm8
call _aesni_ctr32_ghash_6x
vmovdqu 32(%rsp),%xmm7
vmovdqu (%r11),%xmm0
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm7,%xmm7,%xmm1
vmovdqu 32-32(%r9),%xmm15
vmovups %xmm9,-96(%rsi)
vpshufb %xmm0,%xmm9,%xmm9
vpxor %xmm7,%xmm1,%xmm1
vmovups %xmm10,-80(%rsi)
vpshufb %xmm0,%xmm10,%xmm10
vmovups %xmm11,-64(%rsi)
vpshufb %xmm0,%xmm11,%xmm11
vmovups %xmm12,-48(%rsi)
vpshufb %xmm0,%xmm12,%xmm12
vmovups %xmm13,-32(%rsi)
vpshufb %xmm0,%xmm13,%xmm13
vmovups %xmm14,-16(%rsi)
vpshufb %xmm0,%xmm14,%xmm14
vmovdqu %xmm9,16(%rsp)
vmovdqu 48(%rsp),%xmm6
vmovdqu 16-32(%r9),%xmm0
vpunpckhqdq %xmm6,%xmm6,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
vpxor %xmm6,%xmm2,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vmovdqu 64(%rsp),%xmm9
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm9,%xmm9,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
vpxor %xmm9,%xmm5,%xmm5
vpxor %xmm7,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vmovdqu 80(%rsp),%xmm1
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm4,%xmm7,%xmm7
vpunpckhqdq %xmm1,%xmm1,%xmm4
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpxor %xmm6,%xmm9,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 96(%rsp),%xmm2
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm7,%xmm6,%xmm6
vpunpckhqdq %xmm2,%xmm2,%xmm7
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
vpxor %xmm2,%xmm7,%xmm7
vpxor %xmm9,%xmm1,%xmm1
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm5,%xmm4,%xmm4
vpxor 112(%rsp),%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
vmovdqu 112-32(%r9),%xmm0
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpxor %xmm6,%xmm5,%xmm5
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
vpxor %xmm4,%xmm7,%xmm4
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm1
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
vpxor %xmm14,%xmm1,%xmm1
vpxor %xmm5,%xmm6,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
vmovdqu 32-32(%r9),%xmm15
vpxor %xmm2,%xmm8,%xmm7
vpxor %xmm4,%xmm9,%xmm6
vmovdqu 16-32(%r9),%xmm0
vpxor %xmm5,%xmm7,%xmm9
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
vpxor %xmm9,%xmm6,%xmm6
vpunpckhqdq %xmm13,%xmm13,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
vpxor %xmm13,%xmm2,%xmm2
vpslldq $8,%xmm6,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vpxor %xmm9,%xmm5,%xmm8
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm6,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm12,%xmm12,%xmm9
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
vpxor %xmm12,%xmm9,%xmm9
vpxor %xmm14,%xmm13,%xmm13
vpalignr $8,%xmm8,%xmm8,%xmm14
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm11,%xmm11,%xmm1
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm13,%xmm12,%xmm12
vxorps 16(%rsp),%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm9,%xmm9
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm10,%xmm10,%xmm2
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
vpxor %xmm10,%xmm2,%xmm2
vpalignr $8,%xmm8,%xmm8,%xmm14
vpxor %xmm12,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm9,%xmm1,%xmm1
vxorps %xmm7,%xmm14,%xmm14
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
vmovdqu 112-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm11,%xmm10,%xmm10
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
vpxor %xmm4,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
vpxor %xmm10,%xmm7,%xmm7
vpxor %xmm2,%xmm6,%xmm6
vpxor %xmm5,%xmm7,%xmm4
vpxor %xmm4,%xmm6,%xmm6
vpslldq $8,%xmm6,%xmm1
vmovdqu 16(%r11),%xmm3
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm1,%xmm5,%xmm8
vpxor %xmm6,%xmm7,%xmm7
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm2,%xmm8,%xmm8
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm7,%xmm2,%xmm2
vpxor %xmm2,%xmm8,%xmm8
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
movq %r10,%rax
.byte 0xf3,0xc3 # rep ret
.cfi_endproc
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
.section .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 # "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.previous
.align 64
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4:

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,975 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES Block Cipher extension ('Zvkned')
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)
# Reference: https://github.com/riscv/riscv-crypto/issues/192#issuecomment-1270447575
#
# Assume we have 12 GCM blocks and we parallelize the GCM computation four
# blocks at a time.
# Tag = M0*H^12 + M1*H^11 + M2*H^10 + M3*H^9 +
# M4*H^8 + M5*H^7 + M6*H^6 + M7*H^5 +
# M8*H^4 + M9*H^3 + M10*H^2 + M11*H^1
# We could rewrite the formula into:
# T0 = 0
# T1 = (T0+M0)*H^4   T2 = (T0+M1)*H^4   T3 = (T0+M2)*H^4   T4 = (T0+M3)*H^4
# T5 = (T1+M4)*H^4   T6 = (T2+M5)*H^4   T7 = (T3+M6)*H^4   T8 = (T4+M7)*H^4
# T9 = (T5+M8)*H^4   T10 = (T6+M9)*H^3  T11 = (T7+M10)*H^2 T12 = (T8+M11)*H^1
# Tag = T9 + T10 + T11 + T12
#
# We multiply by [H^4, H^4, H^4, H^4] in each step except the last iteration.
# The last iteration multiplies by [H^4, H^3, H^2, H^1].
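#
# A minimal standalone sanity check of the rescheduling above (illustration
# only; it does not affect the generated code). Plain integer arithmetic is
# used as a stand-in for GF(2^128), since the identity only relies on
# distributivity and associativity; in the real code "+" is XOR and "*" is a
# carry-less multiply reduced by the GHASH polynomial.
{
    my @M = map { $_ + 1 } 0 .. 11;                  # 12 dummy blocks M0..M11
    my $H = 3;                                       # dummy hash key
    my $direct = 0;
    $direct += $M[$_] * $H ** (12 - $_) for 0 .. 11; # M0*H^12 + ... + M11*H^1
    my @T = (0, 0, 0, 0);                            # T0 for each of the 4 lanes
    for my $step (0 .. 2) {                          # three passes of 4 blocks
        for my $lane (0 .. 3) {
            # Every pass multiplies by H^4, except the last one which uses
            # H^4, H^3, H^2, H^1 across the lanes.
            my $exp = $step < 2 ? 4 : 4 - $lane;
            $T[$lane] = ($T[$lane] + $M[4 * $step + $lane]) * $H ** $exp;
        }
    }
    my $resched = $T[0] + $T[1] + $T[2] + $T[3];     # Tag = T9 + T10 + T11 + T12
    die "rescheduled GHASH identity broken" if $direct != $resched;
}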
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
{
my ($INP, $OUTP, $LEN, $KEYP, $IVP, $XIP) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($PADDING_LEN32) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($FULL_BLOCK_LEN32) = ("a6");
my ($ORIGINAL_LEN32) = ("a7");
my ($PROCESSED_LEN) = ("a0");
my ($CTR_MASK) = ("v0");
my ($INPUT_PADDING_MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
$V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
$V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
$V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
# Do aes-128 enc.
sub aes_128_cipher_body {
my $code=<<___;
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
# Do aes-192 enc.
sub aes_192_cipher_body {
my $TMP_REG = shift;
my $code=<<___;
# Load key 4
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 48
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V11]}
# Load key 8
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 112
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V11]}
# Load key 13
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 192
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
# Do aes-256 enc.
sub aes_256_cipher_body {
my $TMP_REG = shift;
my $code=<<___;
# Load key 3
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 32
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V11]}
# Load key 6
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 80
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V11]}
# Load key 9
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 128
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V11]}
# Load key 12
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 176
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V11]}
# Load key 15
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 224
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
sub handle_padding_in_first_round {
my $TMP_REG = shift;
my $code=<<___;
bnez $PADDING_LEN32, 1f
## without padding
# Store ciphertext/plaintext
@{[vse32_v $V28, $OUTP]}
j 2f
## with padding
1:
# Store ciphertext/plaintext using mask
@{[vse32_v $V28, $OUTP, $INPUT_PADDING_MASK]}
# Fill zero for the padding blocks
@{[vsetvli "zero", $PADDING_LEN32, "e32", "m4", "tu", "ma"]}
@{[vmv_v_i $V28, 0]}
# We used the mask register for `INPUT_PADDING_MASK` above, so we need to
# set up the ctr mask again.
# ctr mask : [000100010001....]
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
li $TMP_REG, 0b10001000
@{[vmv_v_x $CTR_MASK, $TMP_REG]}
2:
___
return $code;
}
# Do aes-128 enc for first round.
sub aes_128_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# Load all 11 aes round keys to v1-v11 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_128_cipher_body]}
# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}
@{[handle_padding_in_first_round $TMP_REG]}
add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}
# Do aes-192 enc for first round.
sub aes_192_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# We run out of 32 vector registers, so we just preserve some round keys
# and load the remaining round keys inside the aes body.
# We keep the round keys for:
# 1, 2, 3, 5, 6, 7, 9, 10, 11 and 12th keys.
# The following keys will be loaded in the aes body:
# 4, 8 and 13th keys.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# key 1
@{[vle32_v $V1, $KEYP]}
# key 2
addi $TMP_REG, $KEYP, 16
@{[vle32_v $V2, $TMP_REG]}
# key 3
addi $TMP_REG, $KEYP, 32
@{[vle32_v $V3, $TMP_REG]}
# key 5
addi $TMP_REG, $KEYP, 64
@{[vle32_v $V4, $TMP_REG]}
# key 6
addi $TMP_REG, $KEYP, 80
@{[vle32_v $V5, $TMP_REG]}
# key 7
addi $TMP_REG, $KEYP, 96
@{[vle32_v $V6, $TMP_REG]}
# key 9
addi $TMP_REG, $KEYP, 128
@{[vle32_v $V7, $TMP_REG]}
# key 10
addi $TMP_REG, $KEYP, 144
@{[vle32_v $V8, $TMP_REG]}
# key 11
addi $TMP_REG, $KEYP, 160
@{[vle32_v $V9, $TMP_REG]}
# key 12
addi $TMP_REG, $KEYP, 176
@{[vle32_v $V10, $TMP_REG]}
# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_192_cipher_body $TMP_REG]}
# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}
@{[handle_padding_in_first_round $TMP_REG]}
add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}
# Do aes-256 enc for first round.
sub aes_256_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# We run out of 32 vector registers, so we just preserve some round keys
# and load the remaining round keys inside the aes body.
# We keep the round keys for:
# 1, 2, 4, 5, 7, 8, 10, 11, 13 and 14th keys.
# The following keys will be loaded in the aes body:
# 3, 6, 9, 12 and 15th keys.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# key 1
@{[vle32_v $V1, $KEYP]}
# key 2
addi $TMP_REG, $KEYP, 16
@{[vle32_v $V2, $TMP_REG]}
# key 4
addi $TMP_REG, $KEYP, 48
@{[vle32_v $V3, $TMP_REG]}
# key 5
addi $TMP_REG, $KEYP, 64
@{[vle32_v $V4, $TMP_REG]}
# key 7
addi $TMP_REG, $KEYP, 96
@{[vle32_v $V5, $TMP_REG]}
# key 8
addi $TMP_REG, $KEYP, 112
@{[vle32_v $V6, $TMP_REG]}
# key 10
addi $TMP_REG, $KEYP, 144
@{[vle32_v $V7, $TMP_REG]}
# key 11
addi $TMP_REG, $KEYP, 160
@{[vle32_v $V8, $TMP_REG]}
# key 13
addi $TMP_REG, $KEYP, 192
@{[vle32_v $V9, $TMP_REG]}
# key 14
addi $TMP_REG, $KEYP, 208
@{[vle32_v $V10, $TMP_REG]}
# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_256_cipher_body $TMP_REG]}
# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}
@{[handle_padding_in_first_round $TMP_REG]}
add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}
sub aes_gcm_init {
my $code=<<___;
# Compute the AES-GCM full-block e32 length for `LMUL=4`. We handle
# multiple AES-GCM blocks at the same time within an `LMUL=4` register group.
# The AES-GCM's SEW is e32 and EGW is 128 bits.
# FULL_BLOCK_LEN32 = (VLEN*LMUL)/(EGW) * (EGW/SEW) = (VLEN*4)/(32*4) * 4
# = (VLEN*4)/32
# We could get the block_num using the VL value of `vsetvli with e32, m4`.
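# For example, with VLEN=128 this gives FULL_BLOCK_LEN32 = (128*4)/32 = 16
# e32 elements, i.e. 4 GCM blocks per LMUL=4 register group.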
@{[vsetvli $FULL_BLOCK_LEN32, "zero", "e32", "m4", "ta", "ma"]}
# If `LEN32 % FULL_BLOCK_LEN32` is not zero, we fill in zero padding so that
# we can always handle FULL_BLOCK_LEN32 elements in every iteration.
## Prepare the H^n multiplier in v16 for GCM multiplier. The `n` is the gcm
## block number in a LMUL=4 register group.
## n = ((VLEN*LMUL)/(32*4)) = ((VLEN*4)/(32*4))
## = (VLEN/32)
## We could use vsetvli with `e32, m1` to compute the `n` number.
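## For the same VLEN=128 example, n = 128/32 = 4, so v16 below ends up
## holding [H^4, H^4, H^4, H^4].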
@{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}
# The H is at `gcm128_context.Htable[0]`(addr(Xi)+16*2).
addi $T1, $XIP, 32
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $T1]}
# Compute the H^n
li $T1, 1
1:
@{[vgmul_vv $V31, $V31]}
slli $T1, $T1, 1
bltu $T1, $T0, 1b
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
@{[vaesz_vs $V16, $V31]}
#### Load plaintext into v24 and handle padding. We also load the init tag
#### data into v20 and prepare the AES ctr input data into v12 and v28.
@{[vmv_v_i $V20, 0]}
## Prepare the AES ctr input data into v12.
# Setup ctr input mask.
# ctr mask : [000100010001....]
# Note: The actual vl should be `FULL_BLOCK_LEN32/4 * 2`, but we just use
# `FULL_BLOCK_LEN32` here.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
li $T0, 0b10001000
@{[vmv_v_x $CTR_MASK, $T0]}
# Load IV.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $IVP]}
# Convert the big-endian counter into little-endian.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
@{[vrev8_v $V31, $V31, $CTR_MASK]}
# Splat the `single block of IV` to v12
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V12, 0]}
@{[vaesz_vs $V12, $V31]}
# Prepare the ctr counter into v8
# v8: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
@{[viota_m $V8, $CTR_MASK, $CTR_MASK]}
# Merge IV and ctr counter into v12.
# v12:[x, x, x, count+0, x, x, x, count+1, ...]
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vadd_vv $V12, $V12, $V8, $CTR_MASK]}
li $PADDING_LEN32, 0
# Get the SEW32 size in the first round.
# If `LEN32 & (FULL_BLOCK_LEN32-1)` is non-zero, the first round will have
# leading zero padding.
addi $T0, $FULL_BLOCK_LEN32, -1
and $T0, $T0, $LEN32
beqz $T0, 1f
## with padding
sub $LEN32, $LEN32, $T0
sub $PADDING_LEN32, $FULL_BLOCK_LEN32, $T0
# padding block size
srli $T1, $PADDING_LEN32, 2
# padding byte size
slli $T2, $PADDING_LEN32, 2
# Adjust the ctr counter to make the counter start from `counter+0` for the
# first non-padding block.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vsub_vx $V12, $V12, $T1, $CTR_MASK]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vmv_v_v $V28, $V12]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
# Prepare the mask for input loading in the first round. We use
# `VL=FULL_BLOCK_LEN32` with the mask in the first round.
# Adjust input ptr.
sub $INP, $INP, $T2
# Adjust output ptr.
sub $OUTP, $OUTP, $T2
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e16", "m2", "ta", "ma"]}
@{[vid_v $V2]}
# We don't use the pseudo instruction `vmsgeu` here. Use `vmsgtu` instead.
# The original code is:
# vmsgeu.vx $INPUT_PADDING_MASK, $V2, $PADDING_LEN32
addi $T0, $PADDING_LEN32, -1
@{[vmsgtu_vx $INPUT_PADDING_MASK, $V2, $T0]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V24, 0]}
# Load the input for length FULL_BLOCK_LEN32 with mask.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vle32_v $V24, $INP, $INPUT_PADDING_MASK]}
# Load the init `Xi` data to v20 with preceding zero padding.
# Adjust Xi ptr.
sub $T0, $XIP, $T2
# Load for length `zero-padding-e32-length + 4`.
addi $T1, $PADDING_LEN32, 4
@{[vsetvli "zero", $T1, "e32", "m4", "tu", "mu"]}
@{[vle32_v $V20, $T0, $INPUT_PADDING_MASK]}
j 2f
1:
## without padding
sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INP]}
# Load the init Xi data to v20.
@{[vsetivli "zero", 4, "e32", "m1", "tu", "ma"]}
@{[vle32_v $V20, $XIP]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vmv_v_v $V28, $V12]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
2:
___
return $code;
}
sub prepare_input_and_ctr {
my $PTR_OFFSET_REG = shift;
my $code=<<___;
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v12.
@{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
# Load plaintext into v24
@{[vsetvli "zero", "zero", "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INP]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vmv_v_v $V28, $V12]}
add $INP, $INP, $PTR_OFFSET_REG
@{[vsetvli "zero", "zero", "e32", "m4", "ta", "mu"]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
___
return $code;
}
# Store the current CTR back to IV buffer.
sub store_current_ctr {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# Update current ctr value to v12
@{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
# Convert ctr to big-endian counter.
@{[vrev8_v $V12, $V12, $CTR_MASK]}
@{[vse32_v $V12, $IVP, $CTR_MASK]}
___
return $code;
}
# Compute the final tag into v0 from the partial tag v20.
sub compute_final_tag {
my $TMP_REG = shift;
my $code=<<___;
# The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
# Load H to v1
addi $TMP_REG, $XIP, 32
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $TMP_REG]}
# Multiply H for each partial tag and XOR them together.
# Handle 1st partial tag
@{[vmv_v_v $V0, $V20]}
@{[vgmul_vv $V0, $V1]}
# Handle 2nd to N-th partial tags
li $TMP_REG, 4
1:
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
@{[vslidedown_vx $V4, $V20, $TMP_REG]}
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vghsh_vv $V0, $V1, $V4]}
addi $TMP_REG, $TMP_REG, 4
blt $TMP_REG, $FULL_BLOCK_LEN32, 1b
___
return $code;
}
################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt(const unsigned char *in,
# unsigned char *out, size_t len,
# const void *key,
# unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt:
srli $T0, $LEN, 4
beqz $T0, .Lenc_end
slli $LEN32, $T0, 2
mv $ORIGINAL_LEN32, $LEN32
@{[aes_gcm_init]}
# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10
beq $T0, $T1, aes_gcm_enc_blocks_256
beq $T0, $T2, aes_gcm_enc_blocks_192
beq $T0, $T3, aes_gcm_enc_blocks_128
.Lenc_end:
li $PROCESSED_LEN, 0
ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
___
$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_128:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_128_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_128:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_128_end
@{[vghsh_vv $V20, $V16, $V28]}
@{[prepare_input_and_ctr $T0]}
@{[aes_128_cipher_body]}
# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Lenc_blocks_128
.Lenc_blocks_128_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_128,.-aes_gcm_enc_blocks_128
___
$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_192:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_192_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_192:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_192_end
@{[vghsh_vv $V20, $V16, $V28]}
@{[prepare_input_and_ctr $T0]}
@{[aes_192_cipher_body $T1]}
# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Lenc_blocks_192
.Lenc_blocks_192_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_192,.-aes_gcm_enc_blocks_192
___
$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_256:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_256_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_256:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_256_end
@{[vghsh_vv $V20, $V16, $V28]}
@{[prepare_input_and_ctr $T0]}
@{[aes_256_cipher_body $T1]}
# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Lenc_blocks_256
.Lenc_blocks_256_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_256,.-aes_gcm_enc_blocks_256
___
}
################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt(const unsigned char *in,
# unsigned char *out, size_t len,
# const void *key,
# unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt:
srli $T0, $LEN, 4
beqz $T0, .Ldec_end
slli $LEN32, $T0, 2
mv $ORIGINAL_LEN32, $LEN32
@{[aes_gcm_init]}
# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10
beq $T0, $T1, aes_gcm_dec_blocks_256
beq $T0, $T2, aes_gcm_dec_blocks_192
beq $T0, $T3, aes_gcm_dec_blocks_128
.Ldec_end:
li $PROCESSED_LEN, 0
ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
___
$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_128:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_128_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_128:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_128_end
@{[vghsh_vv $V20, $V16, $V24]}
@{[prepare_input_and_ctr $T0]}
@{[aes_128_cipher_body]}
# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Ldec_blocks_128
.Ldec_blocks_128_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_128,.-aes_gcm_dec_blocks_128
___
$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_192:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_192_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_192:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_192_end
@{[vghsh_vv $V20, $V16, $V24]}
@{[prepare_input_and_ctr $T0]}
@{[aes_192_cipher_body $T1]}
# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Ldec_blocks_192
.Ldec_blocks_192_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_192,.-aes_gcm_dec_blocks_192
___
$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_256:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2
@{[aes_256_first_round $T0, $T1]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_256:
# Compute the partial tags.
# The partial tags are multiplied by [H^n, H^n, ..., H^n]:
# [tag0, tag1, ...] =
#   ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_256_end
@{[vghsh_vv $V20, $V16, $V24]}
@{[prepare_input_and_ctr $T0]}
@{[aes_256_cipher_body $T1]}
# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}
# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0
j .Ldec_blocks_256
.Ldec_blocks_256_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}
@{[store_current_ctr]}
@{[compute_final_tag $T1]}
# Save the final tag
@{[vse32_v $V0, $XIP]}
# return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_256,.-aes_gcm_dec_blocks_256
___
}
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large


@@ -0,0 +1,467 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated by
# the vendor compiler.
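#
# To make the "4-bit" idea concrete, here is a small standalone sketch of the
# same nibble-table technique over GF(2^8) with the AES polynomial 0x11B
# (illustration only; the module below does the equivalent over GF(2^128)
# using the per-key Htable plus the shared rem_4bit table). The helper names
# and the example key 0x57 are arbitrary.
sub gf8_mul {                       # reference bit-by-bit multiply in GF(2^8)
    my ($a, $b) = @_;
    my $r = 0;
    for (0 .. 7) {
        $r ^= $a if $b & 1;
        $b >>= 1;
        $a <<= 1;
        $a ^= 0x11B if $a & 0x100;  # reduce by x^8+x^4+x^3+x+1
    }
    return $r;
}
sub gf8_mul_4bit {                  # table-driven multiply by a fixed key
    my ($x, $Htable) = @_;          # $Htable->[$i] holds gf8_mul($i, $key)
    my $z = $Htable->[$x >> 4];     # start with (high nibble)*key
    for (1 .. 4) {                  # multiply the accumulator by x, 4 times
        my $carry = $z & 0x80;
        $z = ($z << 1) & 0xFF;
        $z ^= 0x1B if $carry;       # conditional reduction
    }
    return $z ^ $Htable->[$x & 0xF];# fold in (low nibble)*key
}
# Usage: my @Htable = map { gf8_mul($_, 0x57) } 0 .. 15;
#        gf8_mul_4bit($x, \@Htable) == gf8_mul($x, 0x57) for every $x in 0..255.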
$cnt="v0"; # $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3"; # $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7"; # $8
#################
$Xi="a0"; # $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10"; # $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT"; # $28
{ my $N;
sub loop() {
$N++;
$code.=<<___;
.align 4
extbl $Xlo,7,$nlo
and $nlo,0xf0,$nhi
sll $nlo,4,$nlo
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Zlo,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Zhi,0($nlo)
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,6(zero)
extbl $Xlo,6,$nlo
ldq $Tlo1,8($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $Thi1,0($nhi)
srl $Zlo,4,$Zlo
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
and $nlo,0xf0,$nhi
xor $Tlo1,$Zlo,$Zlo
sll $nlo,4,$nlo
xor $Thi1,$Zhi,$Zhi
and $nlo,0xf0,$nlo
addq $nlo,$Htbl,$nlo
ldq $Tlo0,8($nlo)
addq $nhi,$Htbl,$nhi
ldq $Thi0,0($nlo)
.Looplo$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xlo,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Looplo$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
lda $cnt,7(zero)
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
unop
.Loophi$N:
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
subq $cnt,1,$cnt
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
extbl $Xhi,$cnt,$nlo
and $nlo,0xf0,$nhi
xor $Thi0,$Zhi,$Zhi
xor $Tlo0,$Zlo,$Zlo
sll $nlo,4,$nlo
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
and $nlo,0xf0,$nlo
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
addq $nlo,$Htbl,$nlo
addq $nhi,$Htbl,$nhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
ldq $Tlo0,8($nlo)
xor $t0,$Zlo,$Zlo
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
ldq $Thi0,0($nlo)
bne $cnt,.Loophi$N
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
ldq $Tlo1,8($nhi)
xor $rem,$Zhi,$Zhi
ldq $Thi1,0($nhi)
s8addq $remp,$rem_4bit,$remp
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $t0,$Zlo,$Zlo
xor $Tlo0,$Zlo,$Zlo
xor $Thi0,$Zhi,$Zhi
and $Zlo,0x0f,$remp
sll $Zhi,60,$t0
srl $Zlo,4,$Zlo
s8addq $remp,$rem_4bit,$remp
xor $rem,$Zhi,$Zhi
ldq $rem,0($remp)
srl $Zhi,4,$Zhi
xor $Tlo1,$Zlo,$Zlo
xor $Thi1,$Zhi,$Zhi
xor $t0,$Zlo,$Zlo
xor $rem,$Zhi,$Zhi
___
}}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl gcm_gmult_4bit
.align 4
.ent gcm_gmult_4bit
gcm_gmult_4bit:
.frame sp,0,ra
.prologue 0
ldq $Xlo,8($Xi)
ldq $Xhi,0($Xi)
bsr $t0,picmeup
nop
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
ret (ra)
.end gcm_gmult_4bit
___
$inhi="s0";
$inlo="s1";
$code.=<<___;
.globl gcm_ghash_4bit
.align 4
.ent gcm_ghash_4bit
gcm_ghash_4bit:
lda sp,-32(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
.mask 0x04000600,-32
.frame sp,32,ra
.prologue 0
ldq_u $inhi,0($inp)
ldq_u $Thi0,7($inp)
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
ldq $Xhi,0($Xi)
ldq $Xlo,8($Xi)
bsr $t0,picmeup
nop
.Louter:
extql $inhi,$inp,$inhi
extqh $Thi0,$inp,$Thi0
or $inhi,$Thi0,$inhi
lda $inp,16($inp)
extql $inlo,$inp,$inlo
extqh $Tlo0,$inp,$Tlo0
or $inlo,$Tlo0,$inlo
subq $len,16,$len
xor $Xlo,$inlo,$Xlo
xor $Xhi,$inhi,$Xhi
___
&loop();
$code.=<<___;
srl $Zlo,24,$t0 # byte swap
srl $Zlo,8,$t1
sll $Zlo,8,$t2
sll $Zlo,24,$Zlo
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
zapnot $Zlo,0x88,$Zlo
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zlo,$t0,$Zlo
srl $Zhi,24,$t0
srl $Zhi,8,$t1
or $Zlo,$t2,$Zlo
sll $Zhi,8,$t2
sll $Zhi,24,$Zhi
srl $Zlo,32,$Xlo
sll $Zlo,32,$Zlo
beq $len,.Ldone
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
ldq_u $inhi,0($inp)
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
ldq_u $Thi0,7($inp)
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
ldq_u $inlo,8($inp)
ldq_u $Tlo0,15($inp)
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
br zero,.Louter
.Ldone:
zapnot $t0,0x11,$t0
zapnot $t1,0x22,$t1
or $Zlo,$Xlo,$Xlo
zapnot $Zhi,0x88,$Zhi
or $t0,$t1,$t0
zapnot $t2,0x44,$t2
or $Zhi,$t0,$Zhi
or $Zhi,$t2,$Zhi
srl $Zhi,32,$Xhi
sll $Zhi,32,$Zhi
or $Zhi,$Xhi,$Xhi
stq $Xlo,8($Xi)
stq $Xhi,0($Xi)
.set noreorder
/*ldq ra,0(sp)*/
ldq s0,8(sp)
ldq s1,16(sp)
lda sp,32(sp)
ret (ra)
.end gcm_ghash_4bit
.align 4
.ent picmeup
picmeup:
.frame sp,0,$t0
.prologue 0
br $rem_4bit,.Lpic
.Lpic: lda $rem_4bit,12($rem_4bit)
ret ($t0)
.end picmeup
nop
rem_4bit:
.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,554 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "538B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
} else {
$output and open STDOUT,">$output";
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$Zll="r4"; # variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter
$rem_4bit=$inp; # used in gcm_gmult_4bit
$cnt=$len;
sub Zsmash() {
my $i=12;
my @args=@_;
for ($Zll,$Zlh,$Zhl,$Zhh) {
$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $_,$_
str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
str $_,[$Xi,#$i]
#else
mov $Tlh,$_,lsr#8
strb $_,[$Xi,#$i+3]
mov $Thl,$_,lsr#16
strb $Tlh,[$Xi,#$i+2]
mov $Thh,$_,lsr#24
strb $Thl,[$Xi,#$i+1]
strb $Thh,[$Xi,#$i]
#endif
___
$code.="\t".shift(@args)."\n";
$i-=4;
}
}
$code=<<___;
#include "arm_arch.h"
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
.text
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
adr $rem_4bit,rem_4bit
#else
sub $rem_4bit,pc,#8+32 @ &rem_4bit
#endif
b .Lrem_4bit_got
nop
nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
.align 4
gcm_ghash_4bit:
#if defined(__thumb2__)
adr r12,rem_4bit
#else
sub r12,pc,#8+48 @ &rem_4bit
#endif
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb $nlo,[$inp,#15]
ldrb $nhi,[$Xi,#15]
.Louter:
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
ldrb $nhi,[$Xi,#14]
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Linner:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
#ifdef __thumb2__
it pl
#endif
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16
mov $nhi,$Zll
___
&Zsmash("cmp\t$inp,$len","\n".
"#ifdef __thumb2__\n".
" it ne\n".
"#endif\n".
" ldrneb $nlo,[$inp,#15]");
$code.=<<___;
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb $nlo,[$Xi,#15]
b rem_4bit_get
.Lrem_4bit_got:
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
ldrb $nlo,[$Xi,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
and $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
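# clmul64x64() emits NEON code computing the 128-bit carry-less product
# $r = $a * $b of two 64-bit polynomials with eight vmull.p8 (8x8-bit
# polynomial) multiplications of byte-rotated operands, following the
# Câmara-Gouvêa-López-Dahab approach referenced in the header: the
# partial products are masked with $k48/$k32/$k16, realigned with
# vext.8 and folded together with veor.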
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi]! @ load Xi
vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi]! @ load Xi
vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,246 @@
#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though the subroutines
# have a _4bit suffix, they do not use any tables, but rely on hardware
# Galois Field Multiply support. Streamed GHASH processes one byte in
# ~7 cycles, which is >6x faster than "4-bit" table-driven code
# compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are comparing
# apples to oranges, but the compiler surely could have done better,
# because the theoretical [though not necessarily achievable] estimate
# for a "4-bit" table-driven implementation is ~12 cycles.
$output = pop and open STDOUT,">$output";
($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments
($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
($FF000000,$E10000)=("B30","B31");
($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
$xia="A9";
($rem,$res)=("B4","B5"); # $rem zaps $Htable
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.asg gcm_gmult_1bit,_gcm_gmult_1bit
.asg gcm_gmult_4bit,_gcm_gmult_4bit
.asg gcm_ghash_4bit,_gcm_ghash_4bit
.endif
.asg B3,RA
.if 0
.global _gcm_gmult_1bit
_gcm_gmult_1bit:
ADDAD $Htable,2,$Htable
.endif
.global _gcm_gmult_4bit
_gcm_gmult_4bit:
.asmfunc
LDDW *${Htable}[-1],$H1:$H0 ; H.lo
LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|| MV $Xip,${xip} ; reassign Xi
|| MVK 15,B1 ; SPLOOPD constant
MVK 0xE1,$E10000
|| LDBU *++${xip}[15],$x1 ; Xi[15]
MVK 0xFF,$FF000000
|| LDBU *--${xip},$x0 ; Xi[14]
SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
SHL $FF000000,24,$FF000000 ; upper byte mask
|| BNOP ghash_loop?
|| MVK 1,B0 ; take a single spin
PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
AND $H2,$FF000000,$H2u ; H2's upper byte
AND $H3,$FF000000,$H3u ; H3's upper byte
|| SHRU $H2u,8,$H2u
SHRU $H3u,8,$H3u
|| ZERO $Z1:$Z0
SHRU2 $xia,8,$H01u
|| ZERO $Z3:$Z2
.endasmfunc
.global _gcm_ghash_4bit
_gcm_ghash_4bit:
.asmfunc
LDDW *${Htable}[-1],$H1:$H0 ; H.lo
|| SHRU $len,4,B0 ; reassign len
LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|| MV $Xip,${xip} ; reassign Xi
|| MVK 15,B1 ; SPLOOPD constant
MVK 0xE1,$E10000
|| [B0] LDNDW *${inp}[1],$H1x:$H0x
MVK 0xFF,$FF000000
|| [B0] LDNDW *${inp}++[2],$H3x:$H2x
SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
|| LDDW *${xip}[1],$Z1:$Z0
SHL $FF000000,24,$FF000000 ; upper byte mask
|| LDDW *${xip}[0],$Z3:$Z2
PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
AND $H2,$FF000000,$H2u ; H2's upper byte
AND $H3,$FF000000,$H3u ; H3's upper byte
|| SHRU $H2u,8,$H2u
SHRU $H3u,8,$H3u
SHRU2 $xia,8,$H01u
|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|| [B0] XOR $H1x,$Z1,$Z1
.if .LITTLE_ENDIAN
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.else
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.endif
STDW $Z3:$Z2,*${xip}[0]
|| [B0] ZERO $Z3:$Z2
|| [B0] MV $xia,$x1
[B0] ADDK 14,${xip}
ghash_loop?:
SPLOOPD 6 ; 6*16+7
|| MVC B1,ILC
|| [B0] SUB B0,1,B0
|| ZERO A0
|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
|| SHL $x1,1,$xia
___
########____________________________
# 0 D2. M1 M2 |
# 1 M1 |
# 2 M1 M2 |
# 3 D1. M1 M2 |
# 4 S1. L1 |
# 5 S2 S1x L1 D2 L2 |____________________________
# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
# 7/1 L1 S1 D1x S2 M2 | M1 |
# 8/2 S1 L1x S2 | M1 M2 |
# 9/3 S1 L1x | D1. M1 M2 |
# 10/4 D1x | S1. L1 |
# 11/5 |S2 S1x L1 D2 L2 |____________
# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
# 7/1 L1 S1 D1x S2 M2 | ....
# 8/2 S1 L1x S2 | ....
#####... ................|............
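# (Reading the schedule above: each row is one cycle of the SPLOOPD
# body, the letters name the C64x+ functional units an instruction is
# issued on, and the staggered columns show successive iterations of
# the software pipeline overlapping.)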
$code.=<<___;
XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
|| XORMPY $H01u,$xib,$H01y
|| [A0] LDBU *--${xip},$x0
XORMPY $H1,$xia,$H1x ; 1
XORMPY $H2,$xia,$H2x ; 2
|| XORMPY $H2u,$xib,$H2y
XORMPY $H3,$xia,$H3x ; 3
|| XORMPY $H3u,$xib,$H3y
||[!A0] MVK.D 15,A0 ; *--${xip} counter
XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
|| [A0] SUB.S A0,1,A0
XOR.L $H1x,$Z1,$Z1 ; 5
|| AND.D $H01y,$FF000000,$H0z
|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
|| SHL $x0,1,$xib
|| SHL $x0,1,$xia
XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
|| SHL $Z0,1,$rem ; ; rem=Z<<1
|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
|| AND.L $H1y,$FF000000,$H1z
XOR.L $H3x,$Z3,$Z3 ; 7/1
|| SHRMB.S $Z2,$Z1,$Z1
|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
|| AND.S $H2y,$FF000000,$H2z
|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
XOR.L $H1z,$Z1,$Z1 ; 8/2
|| SHRMB.S $Z3,$Z2,$Z2
|| AND.S $H3y,$FF000000,$H3z
XOR.L $H2z,$Z2,$Z2 ; 9/3
|| SHRU $Z3,8,$Z3
XOR.D $H3z,$Z3,$Z3 ; 10/4
NOP ; 11/5
SPKERNEL 0,2
|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
; input pre-fetch is possible where D1 slot is available...
[B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
[B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
NOP ; 10/-
.if .LITTLE_ENDIAN
SWAP2 $Z0,$Z1 ; 11/-
|| SWAP4 $Z1,$Z0
SWAP4 $Z1,$Z1 ; 12/-
|| SWAP2 $Z0,$Z0
SWAP2 $Z2,$Z3
|| SWAP4 $Z3,$Z2
||[!B0] BNOP RA
SWAP4 $Z3,$Z3
|| SWAP2 $Z2,$Z2
|| [B0] BNOP ghash_loop?
[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|| [B0] XOR $H1x,$Z1,$Z1
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.else
[!B0] BNOP RA ; 11/-
[B0] BNOP ghash_loop? ; 12/-
[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|| [B0] XOR $H1x,$Z1,$Z1
[B0] XOR $H2x,$Z2,$Z2
|| [B0] XOR $H3x,$Z3,$Z3
|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
STDW $Z1:$Z0,*${xip}[1]
|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
|| [B0] ZERO $Z1:$Z0
.endif
STDW $Z3:$Z2,*${xip}[0]
|| [B0] ZERO $Z3:$Z2
|| [B0] MV $xia,$x1
[B0] ADDK 14,${xip}
.endasmfunc
.sect .const
.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,470 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. For reference, the sha1-ia64.pl module processes one byte in
# 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per byte.
#
# September 2010
#
# It was originally thought that it made less sense to implement the
# "528B" variant on Itanium 2, for the following reason. Because the
# number of functional units is naturally limited, it appeared
# impossible to implement the "528B" loop in 4 cycles, only in 5, which
# would cap the theoretical performance improvement at 20%. But
# occasionally you prove yourself wrong:-) I figured out a way to fold
# a couple of instructions and, having freed yet another instruction
# slot by unrolling the loop, got the resulting performance to 4.45
# cycles per processed byte, 50% better than the "256B" version. On the
# original Itanium performance should remain the same as the "256B"
# version, i.e. ~8.5 cycles.
$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# A special note about the 'dep' instruction, which is used to
# construct &rem_4bit[Zlo&0xf]. It works because rem_4bit is aligned
# on a 128-byte boundary, so the lower 7 bits of its address are
# guaranteed to be zero.
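# For example, under that alignment assumption
#
#     dep rem=Zlo,rem_4bitp,3,4
#
# deposits the low 4 bits of Zlo into rem_4bitp at bit position 3, i.e.
#
#     rem = rem_4bitp | ((Zlo & 0xf) << 3)
#         = rem_4bitp + (Zlo & 0xf) * 8
#         = &rem_4bit[Zlo & 0xf]          (8-byte table entries)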
$code.=<<___;
$label:
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p19) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p19) xor Zhi=Zhi,Hhi
($p17) xor xi[1]=xi[1],in[1] };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p19) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p19) ld8 rem=[rem]
(p18) and Hi[1]=mask0xf0,xi[2] };;
{ .mmi; ($p16) ld1 in[0]=[inp],-1
(p18) xor Zlo=Zlo,Hlo
(p19) shr.u Zhi=Zhi,4 }
{ .mib; (p19) xor Hhi=Hhi,rem
(p18) add Hi[1]=Htbl,Hi[1] };;
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
(p18) dep rem=Zlo,rem_4bitp,3,4 }
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
(p18) xor Zhi=Zhi,Hhi };;
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
(p18) shrp Zlo=Zhi,Zlo,4 }
{ .mfi; (p18) ld8 rem=[rem]
(p17) and Hi[0]=mask0xf0,Hi[0] };;
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
(p18) xor Zlo=Zlo,Hlo
(p18) shr.u Zhi=Zhi,4 }
{ .mib; (p18) xor Hhi=Hhi,rem
(p17) add Hi[0]=Htbl,Hi[0]
br.ctop.sptk $label };;
___
}
$code=<<___;
.explicit
.text
prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;
.align 128
.skip 16 // aligns loop body
.global gcm_gmult_4bit#
.proc gcm_gmult_4bit#
gcm_gmult_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,2,6,0,8
$ADDP Xi=15,in0 // &Xi[15]
mov rem_4bitp=ip }
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
.save ar.lc,prevlc
mov prevlc=ar.lc
.save pr,prevpr
mov prevpr=pr };;
.body
.rotr in[3],xi[3],Hi[2]
{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
mov mask0xf0=0xf0
brp.loop.imp .Loop1,.Lend1-16};;
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
};;
{ .mii; shladd Hi[1]=xi[2],4,r0
mov pr.rot=0x7<<16
mov ar.lc=13 };;
{ .mii; and Hi[1]=mask0xf0,Hi[1]
mov ar.ec=3
xor Zlo=Zlo,Zlo };;
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
xor Zhi=Zhi,Zhi };;
___
&loop (".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
{ .mib; mux1 Zlo=Zlo,\@rev };;
{ .mib; mux1 Zhi=Zhi,\@rev };;
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
add Hhi=1,Xi };; // pipeline flush on Itanium
{ .mib; st8 [Hlo]=Zlo
mov pr=prevpr,0x1ffff };;
{ .mib; st8 [Hhi]=Zhi
mov ar.lc=prevlc
br.ret.sptk.many b0 };;
.endp gcm_gmult_4bit#
___
######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
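# (Informal summary, derived from the code below: the routine copies
# Htable onto the stack together with a second table, Hshr4, holding
# the same entries shifted right by 4 bits, and the software-pipelined
# .LOOP then processes one input byte per modulo-scheduled stage using
# lookups into Htable, Hshr4 and the shared rem_8bit table.)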
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
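# sum/rum set and reset user-mask bits: "$sum 1<<1" below switches the
# data byte order to big-endian for the duration of the computation and
# "$rum 1<<1" in the epilogue switches it back; on a natively
# big-endian build both degenerate to nop.m.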
sub load_htable() {
for (my $i=0;$i<8;$i++) {
$code.=<<___;
{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
___
$code.=shift if (($i+$#_)==7);
$code.="\t};;\n"
}
}
$code.=<<___;
prevsp=r3;
.align 32
.skip 16 // aligns loop body
.global gcm_ghash_4bit#
.proc gcm_ghash_4bit#
gcm_ghash_4bit:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,2,0,0
.vframe prevsp
mov prevsp=sp
mov $rem_8bit=ip };;
.body
{ .mfi; $ADDP r8=0+0,$Htbl
$ADDP r9=0+8,$Htbl }
{ .mfi; $ADDP r10=128+0,$Htbl
$ADDP r11=128+8,$Htbl };;
___
&load_htable(
" $ADDP $Xip=15,$Xip", # &Xi[15]
" $ADDP $len=$len,$inp", # &inp[len]
" $ADDP $inp=15,$inp", # &inp[15]
" mov $mask0xff=0xff",
" add sp=-512,sp",
" andcm sp=sp,$mask0xff", # align stack frame
" add r14=0,sp",
" add r15=8,sp");
$code.=<<___;
{ .mmi; $sum 1<<1 // go big-endian
add r8=256+0,sp
add r9=256+8,sp }
{ .mmi; add r10=256+128+0,sp
add r11=256+128+8,sp
add $len=-17,$len };;
___
for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
st8 [r9]=$rhi,16 // Htable[$i].hi
shrp $rlo=$rhi,$rlo,4 }//;;
{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
shr.u $rhi=$rhi,4 };;
{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
ld8 r17=[r9],16 };; // Htable[8].hi
{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
ld8 r19=[r9],16 } // Htable[9].hi
{ .mmi; rum 1<<5 // clear um.mfh
shrp r16=r17,r16,4 };;
___
for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
___
}
$code.=<<___;
{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
{ .mmi; add $Htbl=256,sp // &Htable[0]
add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
___
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");
$code.=<<___; # (p16)
{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
cmp.eq p0,p6=r0,r0 };; // clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p16),(p17)
{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
.align 32
.LOOP:
{ .mmi;
(p6) st8 [$Xip]=$Zhi,13
xor $Zlo=$Zlo,$Zlo
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p16),(p17),(p18)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 } //(p16) *inp--
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___; # (p16),(p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
ld1 $in=[$inp],-1 //(p16) *inp--
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}
$code.=<<___; # (p17),(p18),(p19)
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p18),(p19)
{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
$code.=<<___; # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
add $inp=32,$inp
shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
add $Xip=9,$Xip };; // &Xi.lo
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
{ .mmi; st8 [$Xip]=$Zlo,-8
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
{ .mib;
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
(p6) br.cond.dptk.many .LOOP };;
{ .mib; st8 [$Xip]=$Zhi };;
{ .mib; $rum 1<<1 // return to little-endian
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp gcm_ghash_4bit#
___
$code.=<<___;
.align 128
.type rem_4bit#,\@object
rem_4bit:
data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size rem_4bit#,128
.type rem_8bit#,\@object
rem_8bit:
data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,751 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. The PA-RISC 2.0 loop is scheduled
# for 8 cycles, but measured performance on a PA-8600 system is ~9
# cycles per processed byte. This is ~2.2x faster than 64-bit code
# generated by the vendor compiler (which used to be very hard to
# beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$NREGS =6;
} else {
$LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$NREGS =11;
}
$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
# [+ argument transfer]
################# volatile registers
$Xi="%r26"; # argument block
$Htbl="%r25";
$inp="%r24";
$len="%r23";
$Hhh=$Htbl; # variables
$Hll="%r22";
$Zhh="%r21";
$Zll="%r20";
$cnt="%r19";
$rem_4bit="%r28";
$rem="%r29";
$mask0xf0="%r31";
################# preserved registers
$Thh="%r1";
$Tll="%r2";
$nlo="%r3";
$nhi="%r4";
$byte="%r5";
if ($SIZE_T==4) {
$Zhl="%r6";
$Zlh="%r7";
$Hhl="%r8";
$Hlh="%r9";
$Thl="%r10";
$Tlh="%r11";
}
$rem2="%r6"; # used in PA-RISC 2.0 code
$code.=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
.ALIGN 64
gcm_gmult_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_gmult
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_gmult
nop
___
$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_gmult_pa2
ldi 13,$cnt
.ALIGN 8
L\$oop_gmult_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
ldbx $cnt($Xi),$nlo
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $rem($rem_4bit),$rem
xor $Tll,$Zll,$Zll
addib,uv -1,$cnt,L\$oop_gmult_pa2
xor $Thh,$Zhh,$Zhh
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
std $Zhh,0($Xi)
___
$code.=<<___ if ($SIZE_T==4);
b L\$done_gmult
nop
L\$parisc1_gmult
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_gmult_pa1
ldi 13,$cnt
.ALIGN 8
L\$oop_gmult_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_gmult_pa1
xor $Thl,$Zhl,$Zhl
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
stw $Zhh,0($Xi)
___
$code.=<<___;
L\$done_gmult
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
gcm_ghash_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_ghash
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_ghash
nop
___
$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll
L\$outer_ghash_pa2
ldb 15($inp),$nhi
xor $nhi,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_ghash_pa2
ldi 13,$cnt
.ALIGN 8
L\$oop_ghash_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem2
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldbx $cnt($Xi),$nlo
ldbx $cnt($inp),$byte
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2
xor $rem2,$Zhh,$Zhh
xor $byte,$nlo,$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo
extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
ldd $rem($rem_4bit),$rem
addib,uv -1,$cnt,L\$oop_ghash_pa2
xor $Thh,$Zhh,$Zhh
xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem2
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2
xor $rem2,$Zhh,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
ldo 16($inp),$inp
std $Zhh,0($Xi)
cmpb,*<> $inp,$len,L\$outer_ghash_pa2
copy $Zll,$nlo
___
$code.=<<___ if ($SIZE_T==4);
b L\$done_ghash
nop
L\$parisc1_ghash
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl
L\$outer_ghash_pa1
ldb 15($inp),$byte
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_ghash_pa1
ldi 13,$cnt
.ALIGN 8
L\$oop_ghash_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
ldbx $cnt($inp),$byte
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
xor $byte,$nlo,$nlo
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_ghash_pa1
xor $Thl,$Zhl,$Zhl
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
ldo 16($inp),$inp
stw $Zhh,0($Xi)
comb,<> $inp,$len,L\$outer_ghash_pa1
copy $Zll,$nlo
___
$code.=<<___;
L\$done_ghash
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.ALIGN 64
L\$rem_4bit
.WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
.ALIGN 64
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
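# Each helper below pattern-matches one such mnemonic and, when the
# operands fit a recognized format, emits the raw .WORD encoding with
# the original instruction kept as a trailing comment; anything it does
# not recognize is passed through unchanged.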
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};
my $depd = sub {
my ($mod,$args) = @_;
my $orig = "depd$mod\t$args";
# I only have ",z" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
{ my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
my $cpos=63-$2;
my $len=32-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler/) {
$gnuas = 1;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
if ($SIZE_T==4) {
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
s/cmpb,\*/comb,/;
s/,\*/,/;
}
s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,378 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V
# extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
# gcm_ghash_rv64i_zvkb_zvbc
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $H, $H, 8
li $TMP0, -8
li $TMP1, 63
la $TMP2, Lpolymod
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
@{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)
# Shift one left and get the carry bits.
@{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
@{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1
# Use the fact that the polynomial degree is no more than 128,
# i.e. only the LSB of the upper half could be set.
# Thanks to this we don't need to do the full reduction here.
# Instead simply subtract the reduction polynomial.
# This idea was taken from x86 ghash implementation in OpenSSL.
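# Net effect (informal sketch): Htable[0] = (H << 1) xor (polymod if
# the bit shifted out of H was set), i.e. H doubled and conditionally
# reduced by the GHASH polynomial; gcm_gmult/gcm_ghash below read this
# preprocessed value straight back from Htable.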
@{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
@{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t
# Need to set the mask to 3, if the carry bit is set.
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
@{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t
@{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}
################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $Xi, $Xi, 8
li $TMP4, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of the Gueron's Montgomery reduction.
# The difference being the order of some operations has been changed,
# to make a better use of vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
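# In other words, the low 128 bits c1:c0 are folded up into the high half
# by two carry-less multiplications with P; the final vxor below then
# leaves the 128-bit remainder d1:d0 in v2.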
#v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
#v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
@{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}
################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: preprocessed H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as a part of endianness swap.
add $Xi, $Xi, 8
add $inp, $inp, 8
li $M8, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4
Lstep:
# Read input data
@{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
add $inp, $inp, 16
add $len, $len, -16
# XOR them into Xi
@{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of the Gueron's Montgomery reduction.
# The difference being the order of some operations has been changed,
# to make a better use of vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
#v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
#v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V5, $V2]} # vrev8.v v5, v2
bnez $len, Lstep
@{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}
$code .= <<___;
.p2align 4
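# The two dwords below form the 128-bit constant 0xc2000000000000000000000000000001,
# the same reduction constant used by the x86 clmul GHASH code from which the
# reduction idea above was borrowed.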
Lpolymod:
.dword 0x0000000000000001
.dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,169 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
#
# Optional:
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zvkg(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zvkg_zvkb(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Copy of secret parameter (in normalized byte order)
#
# All callers of this function revert the byte-order unconditionally
# on little-endian machines. So we need to revert the byte-order back.
{
my ($Htable,$H,$VAL0,$VAL1,$TMP0) = ("a0","a1","a2","a3","t0");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkg
.type gcm_init_rv64i_zvkg,\@function
gcm_init_rv64i_zvkg:
ld $VAL0, 0($H)
ld $VAL1, 8($H)
@{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
@{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
ret
.size gcm_init_rv64i_zvkg,.-gcm_init_rv64i_zvkg
___
}
{
my ($Htable,$H,$V0) = ("a0","a1","v0");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkg_zvkb
.type gcm_init_rv64i_zvkg_zvkb,\@function
gcm_init_rv64i_zvkg_zvkb:
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vle64_v $V0, $H]} # vle64.v v0, (a1)
@{[vrev8_v $V0, $V0]} # vrev8.v v0, v0
@{[vse64_v $V0, $Htable]} # vse64.v v0, (a0)
ret
.size gcm_init_rv64i_zvkg_zvkb,.-gcm_init_rv64i_zvkg_zvkb
___
}
################################################################################
# void gcm_gmult_rv64i_zvkg(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: copy of H
# output: Xi: next hash value Xi
{
my ($Xi,$Htable) = ("a0","a1");
my ($VD,$VS2) = ("v1","v2");
$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zvkg
.type gcm_gmult_rv64i_zvkg,\@function
gcm_gmult_rv64i_zvkg:
@{[vsetivli__x0_4_e32_m1_tu_mu]}
@{[vle32_v $VS2, $Htable]}
@{[vle32_v $VD, $Xi]}
@{[vgmul_vv $VD, $VS2]}
@{[vse32_v $VD, $Xi]}
ret
.size gcm_gmult_rv64i_zvkg,.-gcm_gmult_rv64i_zvkg
___
}
################################################################################
# void gcm_ghash_rv64i_zvkg(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: copy of H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len) = ("a0","a1","a2","a3");
my ($vXi,$vH,$vinp,$Vzero) = ("v1","v2","v3","v4");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkg
.type gcm_ghash_rv64i_zvkg,\@function
gcm_ghash_rv64i_zvkg:
@{[vsetivli__x0_4_e32_m1_tu_mu]}
@{[vle32_v $vH, $Htable]}
@{[vle32_v $vXi, $Xi]}
Lstep:
@{[vle32_v $vinp, $inp]}
add $inp, $inp, 16
add $len, $len, -16
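# vghsh.vv performs a full GHASH block update: Xi = (Xi ^ inp) * H in GF(2^128)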
@{[vghsh_vv $vXi, $vH, $vinp]}
bnez $len, Lstep
@{[vse32_v $vXi, $Xi]}
ret
.size gcm_ghash_rv64i_zvkg,.-gcm_ghash_rv64i_zvkg
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,428 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
# gcm_ghash_rv64i_zbc*
#
# All callers of this function revert the byte-order unconditionally
# on little-endian machines. So we need to revert the byte-order back.
# Additionally we reverse the bits of each byte.
{
my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc
.type gcm_init_rv64i_zbc,\@function
gcm_init_rv64i_zbc:
ld $VAL0,0($H)
ld $VAL1,8($H)
@{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
@{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
@{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
@{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
ret
.size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
___
}
{
my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc__zbb
.type gcm_init_rv64i_zbc__zbb,\@function
gcm_init_rv64i_zbc__zbb:
ld $VAL0,0($H)
ld $VAL1,8($H)
@{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
@{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
@{[rev8 $VAL0, $VAL0]}
@{[rev8 $VAL1, $VAL1]}
sd $VAL0,0($Htable)
sd $VAL1,8($Htable)
ret
.size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
___
}
{
my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc__zbkb
.type gcm_init_rv64i_zbc__zbkb,\@function
gcm_init_rv64i_zbc__zbkb:
ld $TMP0,0($H)
ld $TMP1,8($H)
@{[brev8 $TMP0, $TMP0]}
@{[brev8 $TMP1, $TMP1]}
@{[rev8 $TMP0, $TMP0]}
@{[rev8 $TMP1, $TMP1]}
sd $TMP0,0($Htable)
sd $TMP1,8($Htable)
ret
.size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
___
}
################################################################################
# void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
# void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: copy of H
# output: Xi: next hash value Xi
#
# Compute GMULT (Xi*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
# extensions, with the no-Karatsuba approach and clmul for the final reduction.
# This results in an implementation with a minimized number of instructions.
# HW with clmul latencies higher than 2 cycles might observe a performance
# improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
# might observe a performance improvement with additionally converting the
# reduction to shift&xor. For a full discussion of these estimates see
# https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
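# (With 64-bit clmul/clmulh, the schoolbook 128x128 multiplication below takes
# 8 multiply instructions; Karatsuba would save one 64x64 product at the cost
# of extra xors and register pressure.)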
{
my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zbc
.type gcm_gmult_rv64i_zbc,\@function
gcm_gmult_rv64i_zbc:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
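# The upper 128 bits are folded into the lower ones in two steps: each step
# multiplies the highest remaining 64-bit word by the reduction constant and
# XORs the 128-bit product one word lower (z3 into z2:z1, then z2 into z1:z0).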
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Bit-reverse Xi back and store it
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
___
}
{
my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zbc__zbkb
.type gcm_gmult_rv64i_zbc__zbkb,\@function
gcm_gmult_rv64i_zbc__zbkb:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Bit-reverse Xi back and store it
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
___
}
################################################################################
# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: copy of H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc
.type gcm_ghash_rv64i_zbc,\@function
gcm_ghash_rv64i_zbc:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
Lstep:
# Load the input data, bit-reverse them, and XOR them with Xi
ld $t0, 0($inp)
ld $t1, 8($inp)
add $inp, $inp, 16
add $len, $len, -16
@{[brev8_rv64i $t0, $z0, $z1, $z2]}
@{[brev8_rv64i $t1, $z0, $z1, $z2]}
xor $x0, $x0, $t0
xor $x1, $x1, $t1
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Iterate over all blocks
bnez $len, Lstep
# Bit-reverse final Xi back and store it
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
___
}
{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc__zbkb
.type gcm_ghash_rv64i_zbc__zbkb,\@function
gcm_ghash_rv64i_zbc__zbkb:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
Lstep_zkbk:
# Load the input data, bit-reverse them, and XOR them with Xi
ld $t0, 0($inp)
ld $t1, 8($inp)
add $inp, $inp, 16
add $len, $len, -16
@{[brev8 $t0, $t0]}
@{[brev8 $t1, $t1]}
xor $x0, $x0, $t0
xor $x1, $x1, $t1
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Iterate over all blocks
bnez $len, Lstep_zkbk
# Bit-reverse final Xi back and store it
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
___
}
$code .= <<___;
.p2align 3
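# Masks used by the pure-RV64I brev8 fallback to reverse the bits within each
# byte (swap adjacent bits, then bit pairs, then nibbles).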
Lbrev8_const:
.dword 0xAAAAAAAAAAAAAAAA
.dword 0xCCCCCCCCCCCCCCCC
.dword 0xF0F0F0F0F0F0F0F0
.size Lbrev8_const,.-Lbrev8_const
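# 0x87 encodes x^7 + x^2 + x + 1, the low coefficients of the GHASH polynomial
# x^128 + x^7 + x^2 + x + 1; it is the constant the clmul-based reduction above
# multiplies the overflow words by.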
Lpolymod:
.byte 0x87
.size Lpolymod,.-Lpolymod
___
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,256 @@
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# September 2010.
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is a worse result than expected: the loop is scheduled for 12
# and the result should be close to 12. In the absence of instruction-
# level profiling data it's impossible to tell why...
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.
# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce the correct
# result and is therefore engaged. On z196 it was measured to process
# an 8KB buffer ~7x faster than the software implementation. It's not as
# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
# it's actually almost 2 times slower, which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
$output and open STDOUT,">$output";
$softonly=0;
$Zhi="%r0";
$Zlo="%r1";
$Xi="%r2"; # argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";
$rem0="%r6"; # variables
$rem1="%r7";
$nlo="%r8";
$nhi="%r9";
$xi="%r10";
$cnt="%r11";
$tmp="%r12";
$x78="%r13";
$rem_4bit="%r14";
$sp="%r15";
$code.=<<___;
#include "s390x_arch.h"
.text
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)
aghi $Xi,-1
lghi $len,1
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit
lg $Zlo,8+1($Xi) # Xi
j .Lgmult_shortcut
.type gcm_gmult_4bit,\@function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
larl %r1,OPENSSL_s390xcap_P
lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities vector
tmhh %r0,0x4000 # check for function 65
jz .Lsoft_ghash
# Do not assume this function is called from a gcm128_context.
# This is not true, e.g., for AES-GCM-SIV.
# Parameter Block:
# Chaining Value (XI) 128 bit
# Key (Htable[8]) 128 bit
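# (The block is assembled on the stack: Xi goes to 8($sp), the H value kept
# at Htable[8] goes to 24($sp), and %r1 is pointed at it for KIMD.)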
lmg %r0,%r1,0($Xi)
stmg %r0,%r1,8($sp)
lmg %r0,%r1,8*16($Htbl)
stmg %r0,%r1,24($sp)
la %r1,8($sp)
lghi %r0,S390X_GHASH # function 65
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
lmg %r0,%r1,8($sp)
stmg %r0,%r1,0($Xi)
br %r14
.align 32
.Lsoft_ghash:
___
$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)
aghi $Xi,-1
srlg $len,$len,4
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit
lg $Zlo,8+1($Xi) # Xi
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
.Lgmult_shortcut:
lghi $tmp,0xf0
sllg $nlo,$Zlo,4
srlg $xi,$Zlo,8 # extract second byte
ngr $nlo,$tmp
lgr $nhi,$Zlo
lghi $cnt,14
ngr $nhi,$tmp
lg $Zlo,8($nlo,$Htbl)
lg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
sllg $rem0,$Zlo,3
ngr $nlo,$tmp
ngr $rem0,$x78
ngr $xi,$tmp
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
xg $Zhi,0($rem0,$rem_4bit)
nill $nlo,0xf0
sllg $rem0,$Zlo,3
xgr $Zlo,$tmp
ngr $rem0,$x78
nill $xi,0xf0
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
xg $Zhi,0($rem1,$rem_4bit)
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
xg $Zhi,0($nlo,$Htbl)
sllg $xi,$Zlo,3
xg $Zhi,0($rem0,$rem_4bit)
xgr $Zlo,$tmp
ngr $xi,$x78
sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
xgr $Zlo,$tmp
xg $Zhi,0($rem1,$rem_4bit)
lg $tmp,0($xi,$rem_4bit)
la $inp,16($inp)
sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
brctg $len,.Louter
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
lm${g} %r6,%r14,6*$SIZE_T($sp)
br %r14
.type gcm_ghash_4bit,\@function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
.align 64
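# rem_4bit[i] is the reduction value XOR-ed into Z.hi when a nibble of value i
# is shifted out of Z at each step of the 4-bit method.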
rem_4bit:
.long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
.long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
.long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type rem_4bit,\@object
.size rem_4bit,(.-rem_4bit)
.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,583 @@
#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
# gcc 3.3.x cc 5.2 this assembler
#
# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
# gcc 4.4.1 this assembler
#
# 32-bit build 566 50 (+1000%)
# 64-bit build 56 50 (+12%)
#
# I don't quite understand why the difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
$output=pop and open STDOUT,">$output";
$frame="STACK_FRAME";
$bias="STACK_BIAS";
$Zhi="%o0"; # 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";
$nhi="%l0"; # small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";
$Xi="%i0"; # input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 64
rem_4bit:
.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type rem_4bit,#object
.size rem_4bit,(.-rem_4bit)
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
save %sp,-$frame,%sp
ldub [$inp+15],$nlo
ldub [$Xi+15],$xi0
ldub [$Xi+14],$xi1
add $len,$inp,$len
add $Htbl,8,$Htblo
1: call .+8
add %o7,rem_4bit-1b,$rem_4bit
.Louter:
xor $xi0,$nlo,$nlo
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi
ldub [$inp+14],$nlo
ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $xi1,$nlo,$nlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lghash_inner
sll $nlo,4,$nlo
.align 32
.Lghash_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
ldub [$Xi+$cnt],$xi1
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $xi1,$nlo,$nlo
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lghash_inner
and $Zlo,0xf,$remi
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
add $inp,16,$inp
cmp $inp,$len
be,pn SIZE_T_CC,.Ldone
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+15],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
srl $Zlo,8,$xi1
and $Zlo,0xff,$xi0
ba .Louter
and $xi1,0xff,$xi1
.align 32
.Ldone:
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
ret
restore
.type gcm_ghash_4bit,#function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
undef $inp;
undef $len;
$code.=<<___;
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
save %sp,-$frame,%sp
ldub [$Xi+15],$nlo
add $Htbl,8,$Htblo
1: call .+8
add %o7,rem_4bit-1b,$rem_4bit
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi
ldub [$Xi+14],$nlo
ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lgmult_inner
sll $nlo,4,$nlo
.align 32
.Lgmult_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$Xi+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lgmult_inner
and $Zlo,0xf,$remi
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
ret
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by a pair of 64-bit reductions [with a shortcut in the first one,
# which makes it possible to break the dependency between reductions and
# remove one multiplication from the critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
($shl,$shr)=map("%l$_",(0..7));
# For details regarding "twisted H" see ghash-x86.pl.
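# (In short: gcm_init_vis3 below shifts H left by one bit and, when the shift
# carries out of bit 127, folds the 128-bit constant (0xE1<<57, 1) back in via
# the srax/addcc/addxc/xor sequence.)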
$code.=<<___;
.globl gcm_init_vis3
.align 32
gcm_init_vis3:
save %sp,-$frame,%sp
ldx [%i1+0],$Hhi
ldx [%i1+8],$Hlo
mov 0xE1,$Xhi
mov 1,$Xlo
sllx $Xhi,57,$Xhi
srax $Hhi,63,$C0 ! broadcast carry
addcc $Hlo,$Hlo,$Hlo ! H<<=1
addxc $Hhi,$Hhi,$Hhi
and $C0,$Xlo,$Xlo
and $C0,$Xhi,$Xhi
xor $Xlo,$Hlo,$Hlo
xor $Xhi,$Hhi,$Hhi
stx $Hlo,[%i0+8] ! save twisted H
stx $Hhi,[%i0+0]
sethi %hi(0xA0406080),$V
sethi %hi(0x20C0E000),%l0
or $V,%lo(0xA0406080),$V
or %l0,%lo(0x20C0E000),%l0
sllx $V,32,$V
or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
stx $V,[%i0+16]
ret
restore
.type gcm_init_vis3,#function
.size gcm_init_vis3,.-gcm_init_vis3
.globl gcm_gmult_vis3
.align 32
gcm_gmult_vis3:
save %sp,-$frame,%sp
ldx [$Xip+8],$Xlo ! load Xi
ldx [$Xip+0],$Xhi
ldx [$Htable+8],$Hlo ! load twisted H
ldx [$Htable+0],$Hhi
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
xmulx $Xlo,$Hlo,$C0
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
xmulx $C2,$Hhl,$C1
xmulxhi $Xlo,$Hlo,$Xlo
xmulxhi $C2,$Hhl,$C2
xmulxhi $Xhi,$Hhi,$C3
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
xor $sqr,$Xlo,$Xlo ! real destination is $C1
xor $C3,$C2,$C2
xor $Xlo,$C1,$C1
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
xmulxhi $C1,$xE1,$C1
xor $Xlo,$C2,$C2
xor $C0,$C2,$C2
xor $C1,$C3,$C3
stx $C2,[$Xip+8] ! save Xi
stx $C3,[$Xip+0]
ret
restore
.type gcm_gmult_vis3,#function
.size gcm_gmult_vis3,.-gcm_gmult_vis3
.globl gcm_ghash_vis3
.align 32
gcm_ghash_vis3:
save %sp,-$frame,%sp
nop
srln $len,0,$len ! needed on v8+, "nop" on v9
ldx [$Xip+8],$C2 ! load Xi
ldx [$Xip+0],$C3
ldx [$Htable+8],$Hlo ! load twisted H
ldx [$Htable+0],$Hhi
mov 0xE1,%l7
sllx %l7,57,$xE1 ! 57 is not a typo
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
and $inp,7,$shl
andn $inp,7,$inp
sll $shl,3,$shl
prefetch [$inp+63], 20
sub %g0,$shl,$shr
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
.Loop:
ldx [$inp+8],$Xlo
brz,pt $shl,1f
ldx [$inp+0],$Xhi
ldx [$inp+16],$C1 ! align data
srlx $Xlo,$shr,$C0
sllx $Xlo,$shl,$Xlo
sllx $Xhi,$shl,$Xhi
srlx $C1,$shr,$C1
or $C0,$Xhi,$Xhi
or $C1,$Xlo,$Xlo
1:
add $inp,16,$inp
sub $len,16,$len
xor $C2,$Xlo,$Xlo
xor $C3,$Xhi,$Xhi
prefetch [$inp+63], 20
xmulx $Xlo,$Hlo,$C0
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
xmulx $C2,$Hhl,$C1
xmulxhi $Xlo,$Hlo,$Xlo
xmulxhi $C2,$Hhl,$C2
xmulxhi $Xhi,$Hhi,$C3
xmulx $Xhi,$Hhi,$Xhi
sll $C0,3,$sqr
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
xor $C0,$sqr,$sqr
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
xor $C0,$C1,$C1 ! Karatsuba post-processing
xor $Xlo,$C2,$C2
xor $sqr,$Xlo,$Xlo ! real destination is $C1
xor $C3,$C2,$C2
xor $Xlo,$C1,$C1
xor $Xhi,$C2,$C2
xor $Xhi,$C1,$C1
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
xor $C0,$C2,$C2
xmulx $C1,$xE1,$C0
xor $C1,$C3,$C3
xmulxhi $C1,$xE1,$C1
xor $Xlo,$C2,$C2
xor $C0,$C2,$C2
brnz,pt $len,.Loop
xor $C1,$C3,$C3
stx $C2,[$Xip+8] ! save Xi
stx $C3,[$Xip+0]
ret
restore
.type gcm_ghash_vis3,#function
.size gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option of producing a "universal" binary and to
# let the programmer detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"xmulx" => 0x115,
"xmulxhi" => 0x116 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
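# For example, the subroutine above turns "xmulx %o1,%o2,%o3" (rs1=9, rs2=10,
# rd=11, opf=0x115) into ".word 0x97b262aa !xmulx %o1,%o2,%o3".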
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,674 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# POWER9 delivers 0.51 cpb.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
$UCMP="cmpld";
$SHRI="srdi";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
$UCMP="cmplw";
$SHRI="srwi";
} else { die "nonsense $flavour"; }
$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $IN,$H,$t1 # twisted H
vsldoi $H,$IN,$IN,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
li r8,0x40
stvx_u $H, r9,r3
li r9,0x50
stvx_u $Hh,r10,r3
li r10,0x60
vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $IN1,$Xl,$t1
vsldoi $H2,$IN1,$IN1,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $H2l,r8,r3 # save H^2
li r8,0x70
stvx_u $H2,r9,r3
li r9,0x80
stvx_u $H2h,r10,r3
li r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vsldoi $t4,$Xm1,$zero,8
vsldoi $t5,$zero,$Xm1,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vxor $Xl1,$Xl1,$t4
vxor $Xh1,$Xh1,$t5
vsldoi $Xl,$Xl,$Xl,8
vsldoi $Xl1,$Xl1,$Xl1,8
vxor $Xl,$Xl,$t2
vxor $Xl1,$Xl1,$t6
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vpmsumd $Xl1,$Xl1,$xC2
vxor $t1,$t1,$Xh
vxor $t5,$t5,$Xh1
vxor $Xl,$Xl,$t1
vxor $Xl1,$Xl1,$t5
vsldoi $H,$Xl,$Xl,8
vsldoi $H2,$Xl1,$Xl1,8
vsldoi $Hl,$zero,$H,8
vsldoi $Hh,$H,$zero,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $Hl,r8,r3 # save H^3
li r8,0xa0
stvx_u $H,r9,r3
li r9,0xb0
stvx_u $Hh,r10,r3
li r10,0xc0
stvx_u $H2l,r8,r3 # save H^4
stvx_u $H2,r9,r3
stvx_u $H2h,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
li r8,0x40
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
li r9,0x50
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
li r10,0x60
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
${UCMP}i $len,64
bge Lgcm_ghash_p8_4x
lvx_u $IN,0,$inp
addi $inp,$inp,16
subic. $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
beq Lshort
lvx_u $H2l,r8,$Htbl # load H^2
li r8,16
lvx_u $H2, r9,$Htbl
add r9,$inp,$len # end of input
lvx_u $H2h,r10,$Htbl
be?b Loop_2x
.align 5
Loop_2x:
lvx_u $IN1,0,$inp
le?vperm $IN1,$IN1,$IN1,$lemask
subic $len,$len,32
vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
subfe r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
add $inp,$inp,r0
vxor $Xl,$Xl,$Xl1
vxor $Xm,$Xm,$Xm1
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh1
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,r8,$inp
addi $inp,$inp,32
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
$UCMP r9,$inp
bgt Loop_2x # done yet?
cmplwi $len,0
bne Leven
Lshort:
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
Leven:
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
$Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
$STU $sp,-$FRAME($sp)
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
stvx v20,r10,$sp
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
li r10,0x60
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME-4`($sp) # save vrsave
mtspr 256,r0 # preserve all AltiVec registers
lvsl $t0,0,r8 # 0x0001..0e0f
#lvx_u $H2l,r8,$Htbl # load H^2
li r8,0x70
lvx_u $H2, r9,$Htbl
li r9,0x80
vspltisb $t1,8 # 0x0808..0808
#lvx_u $H2h,r10,$Htbl
li r10,0x90
lvx_u $H3l,r8,$Htbl # load H^3
li r8,0xa0
lvx_u $H3, r9,$Htbl
li r9,0xb0
lvx_u $H3h,r10,$Htbl
li r10,0xc0
lvx_u $H4l,r8,$Htbl # load H^4
li r8,0x10
lvx_u $H4, r9,$Htbl
li r9,0x20
lvx_u $H4h,r10,$Htbl
li r10,0x30
vsldoi $t2,$zero,$t1,8 # 0x0000..0808
vaddubm $hiperm,$t0,$t2 # 0x0001..1617
vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
$SHRI $len,$len,4 # this allows to use sign bit
# as carry
lvx_u $IN0,0,$inp # load input
lvx_u $IN1,r8,$inp
subic. $len,$len,8
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
vxor $Xh,$IN0,$Xl
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vperm $H21l,$H2,$H,$hiperm
vperm $t0,$IN2,$IN3,$loperm
vperm $H21h,$H2,$H,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
vxor $Xm2,$Xm2,$Xm1
vxor $Xl3,$Xl3,$Xl1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh3,$Xh3,$Xh1
blt Ltail_4x
Loop_4x:
lvx_u $IN0,0,$inp
lvx_u $IN1,r8,$inp
subic. $len,$len,4
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
le?vperm $IN0,$IN0,$IN0,$lemask
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vxor $Xh,$Xh,$Xh3
vperm $t0,$IN2,$IN3,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xl,$Xl,$xC2
vxor $Xl3,$Xl3,$Xl1
vxor $Xh3,$Xh3,$Xh1
vxor $Xh,$Xh,$IN0
vxor $Xm2,$Xm2,$Xm1
vxor $Xh,$Xh,$t1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh,$Xh,$Xl
bge Loop_4x
Ltail_4x:
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh3
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
addic. $len,$len,4
beq Ldone_4x
lvx_u $IN0,0,$inp
${UCMP}i $len,2
li $len,-4
blt Lone
lvx_u $IN1,r8,$inp
beq Ltwo
Lthree:
lvx_u $IN2,r9,$inp
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
vxor $Xh,$IN0,$Xl
vmr $H4l,$H3l
vmr $H4, $H3
vmr $H4h,$H3h
vperm $t0,$IN1,$IN2,$loperm
vperm $t1,$IN1,$IN2,$hiperm
vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
vxor $Xm3,$Xm3,$Xm2
b Ltail_4x
.align 4
Ltwo:
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
vxor $Xh,$IN0,$Xl
vperm $t0,$zero,$IN1,$loperm
vperm $t1,$zero,$IN1,$hiperm
vsldoi $H4l,$zero,$H2,8
vmr $H4, $H2
vsldoi $H4h,$H2,$zero,8
vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
b Ltail_4x
.align 4
Lone:
le?vperm $IN0,$IN0,$IN0,$lemask
vsldoi $H4l,$zero,$H,8
vmr $H4, $H
vsldoi $H4h,$H,$zero,8
vxor $Xh,$IN0,$Xl
vxor $Xl3,$Xl3,$Xl3
vxor $Xm3,$Xm3,$Xm3
vxor $Xh3,$Xh3,$Xh3
b Ltail_4x
Ldone_4x:
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mtspr 256,$vrsave
lvx v20,r10,$sp
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,0x04,0,0x80,0,4,0
.long 0
___
}
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o;
} else {
s/le\?/#le#/o or
s/be\?//o;
}
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,900 @@
#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# Use the wider AArch64 register bank to "accommodate" 4x aggregated reduction
# and improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
# Apple A7 0.58 0.92 5.62
# Cortex-A53 0.85 1.01 8.39
# Cortex-A57 0.73 1.17 7.61
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
# ThunderX2 1.05
#
# (*) presented for reference/comparison purposes;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
___
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif
.text
___
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only a few of the 16 slots of Htable[16] are used;
# data is opaque to the outside world (which allows the
# code to be optimized independently);
#
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
___
$code.=<<___ if ($flavour =~ /64/);
AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
$code.=<<___;
@ calculate H^3 and H^4
vpmull.p64 $Xl,$H, $H2
vpmull.p64 $Yl,$H2,$H2
vpmull2.p64 $Xh,$H, $H2
vpmull2.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H3, $Xl,$t2 @ H^3
veor $H4,$Yl,$t3 @ H^4
vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing
vext.8 $t1,$H4,$H4,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H3
veor $t1,$t1,$H4
veor $t2,$t2,$H2
vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
@ calculate H^5 and H^6
vpmull.p64 $Xl,$H2, $H3
vpmull.p64 $Yl,$H3,$H3
vpmull2.p64 $Xh,$H2, $H3
vpmull2.p64 $Yh,$H3,$H3
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t0,$t0
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H5,$Xl,$t2 @ H^5
veor $H6,$Yl,$t3 @ H^6
vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing
vext.8 $t1,$H6,$H6,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H5
veor $t1,$t1,$H6
veor $t2,$t2,$H2
vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
@ calculate H^7 and H^8
vpmull.p64 $Xl,$H2,$H5
vpmull.p64 $Yl,$H2,$H6
vpmull2.p64 $Xh,$H2,$H5
vpmull2.p64 $Yh,$H2,$H6
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t1,$t2
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H7,$Xl,$t2 @ H^7
veor $H8,$Yl,$t3 @ H^8
vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing
vext.8 $t1,$H8,$H8,#8
veor $t0,$t0,$H7
veor $t1,$t1,$H8
vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H7-$H8},[x0] @ store Htable[9..11]
___
}
$code.=<<___;
ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
$code.=<<___;
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
___
$code.=<<___ if ($flavour =~ /64/);
AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input: table precomputed in gcm_init_v8;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes, which must be divisible by the block size;
# output: next hash value Xi;
#
$code.=<<___;
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour =~ /64/);
AARCH64_VALID_CALL_TARGET
cmp $len,#64
b.hs .Lgcm_ghash_v8_4x
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
if ($flavour =~ /64/) { # 4x subroutine
my ($I0,$j1,$j2,$j3,
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
$code.=<<___;
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
vld1.64 {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vext.8 $I3,$j3,$j3,#8
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
vpmull2.p64 $Yh,$H,$I3
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
subs $len,$len,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
veor $t0,$I0,$Xl
vld1.64 {$I0-$j3},[$inp],#64
vext.8 $IN,$t0,$t0,#8
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vext.8 $I3,$j3,$j3,#8
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
vext.8 $I2,$j2,$j2,#8
veor $Xm,$Xm,$Ym
vext.8 $I1,$j1,$j1,#8
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
veor $Xm,$Xm,$t1
vpmull2.p64 $Yh,$H,$I3
veor $Xm,$Xm,$t2
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
veor $Xl,$Xm,$t2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
veor $t2,$t2,$Xh
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Xl,$Xl,$t2
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
vext.8 $Xl,$Xl,$Xl,#8
veor $Ym,$Ym,$j1
subs $len,$len,#64
b.hs .Loop4x
.Ltail4x:
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
adds $len,$len,#64
b.eq .Ldone4x
cmp $len,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j2},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
veor $j2,$j2,$I2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
vpmull2.p64 $Yh,$H,$I2
vpmull.p64 $Ym,$Hhl,$j2
veor $Xl,$Xl,$t2
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
veor $j1,$j1,$I1
vext.8 $Xl,$Xl,$Xl,#8
vpmull2.p64 $I1,$H2,$I1
veor $t0,$I0,$Xl
vpmull2.p64 $j1,$Hhl,$j1
vext.8 $IN,$t0,$t0,#8
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H3,$IN
vpmull.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Ltwo:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j1},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
veor $j1,$j1,$I1
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull2.p64 $Yh,$H,$I1
vpmull.p64 $Ym,$Hhl,$j1
vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H2,$IN
vpmull2.p64 $Xm,$Hhl,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Lone:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H,$IN
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H,$IN
vpmull.p64 $Xm,$Hhl,$t0
.Ldone4x:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
$3<8?$3:$3+8,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
# Switch preprocessor checks to aarch64 versions.
s/__ARME([BL])__/__AARCH64E$1__/go;
print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
print " it $2\n";
}
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
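
The three routines above (gcm_init_v8, gcm_gmult_v8, gcm_ghash_v8) all revolve around one operation: the bit-reflected GF(2^128) multiplication that GHASH is built on, with the table written by gcm_init_v8 caching powers of the twisted hash key so the bulk routine can aggregate several blocks per reduction. The sketch below is a minimal portable reference for that multiplication (NIST SP 800-38D convention, reduction polynomial x^128 + x^7 + x^2 + x + 1); it is illustrative only, is not the code this script generates, and the type and function names are hypothetical.

#include <stdint.h>

/* One 128-bit GHASH operand: hi holds bytes 0..7 of the block (big-endian),
 * lo holds bytes 8..15. */
typedef struct { uint64_t hi, lo; } ghash_u128;

/* Z = X * H in GF(2^128), bit-reflected as in NIST SP 800-38D. */
static ghash_u128 ghash_mul_ref(ghash_u128 X, ghash_u128 H)
{
    ghash_u128 Z = { 0, 0 }, V = H;
    int i;

    for (i = 0; i < 128; i++) {
        /* bit i of X, counting from the most significant bit of byte 0 */
        uint64_t xbit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                 : (X.lo >> (127 - i)) & 1;

        if (xbit) {                     /* Z ^= V */
            Z.hi ^= V.hi;
            Z.lo ^= V.lo;
        }
        if (V.lo & 1) {                 /* V = (V >> 1) ^ (0xE1 || 0^120) */
            V.lo = (V.lo >> 1) | (V.hi << 63);
            V.hi = (V.hi >> 1) ^ 0xe100000000000000ULL;
        } else {                        /* V >>= 1 */
            V.lo = (V.lo >> 1) | (V.hi << 63);
            V.hi >>= 1;
        }
    }
    return Z;
}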

View File

@@ -0,0 +1,96 @@
LIBS=../../libcrypto
$MODESASM=
IF[{- !$disabled{asm} -}]
$MODESASM_x86=ghash-x86.S
$MODESDEF_x86=GHASH_ASM
$MODESASM_x86_64=ghash-x86_64.s aesni-gcm-x86_64.s aes-gcm-avx512.s
$MODESDEF_x86_64=GHASH_ASM
# ghash-ia64.s doesn't work on VMS
IF[{- $config{target} !~ /^vms-/ -}]
$MODESASM_ia64=ghash-ia64.s
$MODESDEF_ia64=GHASH_ASM
ENDIF
$MODESASM_sparcv9=ghash-sparcv9.S
$MODESDEF_sparcv9=GHASH_ASM
$MODESASM_alpha=ghash-alpha.S
$MODESDEF_alpha=GHASH_ASM
$MODESASM_s390x=ghash-s390x.S
$MODESDEF_s390x=GHASH_ASM
$MODESASM_armv4=ghash-armv4.S ghashv8-armx.S
$MODESDEF_armv4=GHASH_ASM
$MODESASM_aarch64=ghashv8-armx.S aes-gcm-armv8_64.S aes-gcm-armv8-unroll8_64.S
$MODESDEF_aarch64=
$MODESASM_parisc11=ghash-parisc.s
$MODESDEF_parisc11=GHASH_ASM
$MODESASM_parisc20_64=$MODESASM_parisc11
$MODESDEF_parisc20_64=$MODESDEF_parisc11
$MODESASM_ppc32=ghashp8-ppc.s
$MODESDEF_ppc32=
$MODESASM_ppc64=$MODESASM_ppc32
IF[{- $target{sys_id} ne "AIX" && $target{sys_id} ne "MACOSX" -}]
$MODESASM_ppc64=$MODESASM_ppc32 aes-gcm-ppc.s
ENDIF
$MODESDEF_ppc64=$MODESDEF_ppc32
$MODESASM_c64xplus=ghash-c64xplus.s
$MODESDEF_c64xplus=GHASH_ASM
$MODESASM_riscv64=ghash-riscv64.s ghash-riscv64-zvkb-zvbc.s ghash-riscv64-zvkg.s aes-gcm-riscv64-zvkb-zvkg-zvkned.s
$MODESDEF_riscv64=GHASH_ASM
# Now that we have defined all the arch specific variables, use the
# appropriate one, and define the appropriate macros
IF[$MODESASM_{- $target{asm_arch} -}]
$MODESASM=$MODESASM_{- $target{asm_arch} -}
$MODESDEF=$MODESDEF_{- $target{asm_arch} -}
ENDIF
ENDIF
$COMMON=cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c \
wrap128.c xts128gb.c $MODESASM
SOURCE[../../libcrypto]=$COMMON \
cts128.c ocb128.c siv128.c
SOURCE[../../providers/libfips.a]=$COMMON
# Implementations are now spread across several libraries, so the defines
# need to be applied to all affected libraries and modules.
DEFINE[../../libcrypto]=$MODESDEF
DEFINE[../../providers/libfips.a]=$MODESDEF
INCLUDE[gcm128.o]=..
GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl
GENERATE[ghash-x86.S]=asm/ghash-x86.pl
GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl
GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl
GENERATE[aes-gcm-avx512.s]=asm/aes-gcm-avx512.pl
GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl
INCLUDE[ghash-sparcv9.o]=..
GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl
GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl
GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl
GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl
GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
INCLUDE[ghash-armv4.o]=..
GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
INCLUDE[ghashv8-armx.o]=..
GENERATE[aes-gcm-armv8_64.S]=asm/aes-gcm-armv8_64.pl
INCLUDE[aes-gcm-armv8_64.o]=..
GENERATE[aes-gcm-armv8-unroll8_64.S]=asm/aes-gcm-armv8-unroll8_64.pl
INCLUDE[aes-gcm-armv8-unroll8_64.o]=..
GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl
INCLUDE[ghash-s390x.o]=..
GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl
GENERATE[ghash-riscv64.s]=asm/ghash-riscv64.pl
GENERATE[ghash-riscv64-zvkb-zvbc.s]=asm/ghash-riscv64-zvkb-zvbc.pl
GENERATE[ghash-riscv64-zvkg.s]=asm/ghash-riscv64-zvkg.pl
GENERATE[aes-gcm-riscv64-zvkb-zvkg-zvkned.s]=asm/aes-gcm-riscv64-zvkb-zvkg-zvkned.pl

View File

@@ -0,0 +1,172 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
# define STRICT_ALIGNMENT 0
#endif
#if defined(__GNUC__) && !STRICT_ALIGNMENT
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], block128_f block)
{
size_t n;
const unsigned char *iv = ivec;
if (len == 0)
return;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
while (len >= 16) {
for (n = 0; n < 16; ++n)
out[n] = in[n] ^ iv[n];
(*block) (out, out, key);
iv = out;
len -= 16;
in += 16;
out += 16;
}
} else {
while (len >= 16) {
for (n = 0; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n) ^ *(size_t_aX *)(iv + n);
(*block) (out, out, key);
iv = out;
len -= 16;
in += 16;
out += 16;
}
}
#endif
while (len) {
for (n = 0; n < 16 && n < len; ++n)
out[n] = in[n] ^ iv[n];
for (; n < 16; ++n)
out[n] = iv[n];
(*block) (out, out, key);
iv = out;
if (len <= 16)
break;
len -= 16;
in += 16;
out += 16;
}
if (ivec != iv)
memcpy(ivec, iv, 16);
}
void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], block128_f block)
{
size_t n;
union {
size_t t[16 / sizeof(size_t)];
unsigned char c[16];
} tmp;
if (len == 0)
return;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (in != out) {
const unsigned char *iv = ivec;
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
while (len >= 16) {
(*block) (in, out, key);
for (n = 0; n < 16; ++n)
out[n] ^= iv[n];
iv = in;
len -= 16;
in += 16;
out += 16;
}
} else if (16 % sizeof(size_t) == 0) { /* always true */
while (len >= 16) {
size_t_aX *out_t = (size_t_aX *)out;
size_t_aX *iv_t = (size_t_aX *)iv;
(*block) (in, out, key);
for (n = 0; n < 16 / sizeof(size_t); n++)
out_t[n] ^= iv_t[n];
iv = in;
len -= 16;
in += 16;
out += 16;
}
}
if (ivec != iv)
memcpy(ivec, iv, 16);
} else {
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
unsigned char c;
while (len >= 16) {
(*block) (in, tmp.c, key);
for (n = 0; n < 16; ++n) {
c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = c;
}
len -= 16;
in += 16;
out += 16;
}
} else if (16 % sizeof(size_t) == 0) { /* always true */
while (len >= 16) {
size_t c;
size_t_aX *out_t = (size_t_aX *)out;
size_t_aX *ivec_t = (size_t_aX *)ivec;
const size_t_aX *in_t = (const size_t_aX *)in;
(*block) (in, tmp.c, key);
for (n = 0; n < 16 / sizeof(size_t); n++) {
c = in_t[n];
out_t[n] = tmp.t[n] ^ ivec_t[n];
ivec_t[n] = c;
}
len -= 16;
in += 16;
out += 16;
}
}
}
#endif
while (len) {
unsigned char c;
(*block) (in, tmp.c, key);
for (n = 0; n < 16 && n < len; ++n) {
c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = c;
}
if (len <= 16) {
for (; n < 16; ++n)
ivec[n] = in[n];
break;
}
len -= 16;
in += 16;
out += 16;
}
}

View File

@@ -0,0 +1,442 @@
/*
* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#ifndef STRICT_ALIGNMENT
# ifdef __GNUC__
typedef u64 u64_a1 __attribute((__aligned__(1)));
# else
typedef u64 u64_a1;
# endif
#endif
/*
* First you set up the M and L parameters and pass the key schedule. This is
* called once per session setup...
*/
void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
unsigned int M, unsigned int L, void *key,
block128_f block)
{
memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3;
ctx->blocks = 0;
ctx->block = block;
ctx->key = key;
}
/* !!! Following interfaces are to be called *once* per packet !!! */
/* Then you set up the per-message nonce and pass the length of the message */
int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
const unsigned char *nonce, size_t nlen, size_t mlen)
{
unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
if (nlen < (14 - L))
return -1; /* nonce is too short */
if (sizeof(mlen) == 8 && L >= 3) {
ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8)));
ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8)));
ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8)));
ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8)));
} else
ctx->nonce.u[1] = 0;
ctx->nonce.c[12] = (u8)(mlen >> 24);
ctx->nonce.c[13] = (u8)(mlen >> 16);
ctx->nonce.c[14] = (u8)(mlen >> 8);
ctx->nonce.c[15] = (u8)mlen;
ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
memcpy(&ctx->nonce.c[1], nonce, 14 - L);
return 0;
}
/* Then you pass additional authentication data, this is optional */
void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
const unsigned char *aad, size_t alen)
{
unsigned int i;
block128_f block = ctx->block;
if (alen == 0)
return;
ctx->nonce.c[0] |= 0x40; /* set Adata flag */
(*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++;
if (alen < (0x10000 - 0x100)) {
ctx->cmac.c[0] ^= (u8)(alen >> 8);
ctx->cmac.c[1] ^= (u8)alen;
i = 2;
} else if (sizeof(alen) == 8
&& alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) {
ctx->cmac.c[0] ^= 0xFF;
ctx->cmac.c[1] ^= 0xFF;
ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8)));
ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8)));
ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8)));
ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8)));
ctx->cmac.c[6] ^= (u8)(alen >> 24);
ctx->cmac.c[7] ^= (u8)(alen >> 16);
ctx->cmac.c[8] ^= (u8)(alen >> 8);
ctx->cmac.c[9] ^= (u8)alen;
i = 10;
} else {
ctx->cmac.c[0] ^= 0xFF;
ctx->cmac.c[1] ^= 0xFE;
ctx->cmac.c[2] ^= (u8)(alen >> 24);
ctx->cmac.c[3] ^= (u8)(alen >> 16);
ctx->cmac.c[4] ^= (u8)(alen >> 8);
ctx->cmac.c[5] ^= (u8)alen;
i = 6;
}
do {
for (; i < 16 && alen; ++i, ++aad, --alen)
ctx->cmac.c[i] ^= *aad;
(*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++;
i = 0;
} while (alen);
}
/* Finally you encrypt or decrypt the message */
/*
* The counter part of the nonce may not be larger than L*8 bits, and L is not
* larger than 8, therefore a 64-bit counter...
*/
static void ctr64_inc(unsigned char *counter)
{
unsigned int n = 8;
u8 c;
counter += 8;
do {
--n;
c = counter[n];
++c;
counter[n] = c;
if (c)
return;
} while (n);
}
int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1; /* length mismatch */
ctx->blocks += ((len + 15) >> 3) | 1;
if (ctx->blocks > (U64(1) << 61))
return -2; /* too much data */
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
union {
u64 u[2];
u8 c[16];
} temp;
memcpy(temp.c, inp, 16);
ctx->cmac.u[0] ^= temp.u[0];
ctx->cmac.u[1] ^= temp.u[1];
#else
ctx->cmac.u[0] ^= ((u64_a1 *)inp)[0];
ctx->cmac.u[1] ^= ((u64_a1 *)inp)[1];
#endif
(*block) (ctx->cmac.c, ctx->cmac.c, key);
(*block) (ctx->nonce.c, scratch.c, key);
ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
temp.u[0] ^= scratch.u[0];
temp.u[1] ^= scratch.u[1];
memcpy(out, temp.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^ ((u64_a1 *)inp)[0];
((u64_a1 *)out)[1] = scratch.u[1] ^ ((u64_a1 *)inp)[1];
#endif
inp += 16;
out += 16;
len -= 16;
}
if (len) {
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= inp[i];
(*block) (ctx->cmac.c, ctx->cmac.c, key);
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
out[i] = scratch.c[i] ^ inp[i];
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key);
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1;
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
union {
u64 u[2];
u8 c[16];
} temp;
#endif
(*block) (ctx->nonce.c, scratch.c, key);
ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
memcpy(temp.c, inp, 16);
ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
memcpy(out, scratch.c, 16);
#else
ctx->cmac.u[0] ^= (((u64_a1 *)out)[0]
= scratch.u[0] ^ ((u64_a1 *)inp)[0]);
ctx->cmac.u[1] ^= (((u64_a1 *)out)[1]
= scratch.u[1] ^ ((u64_a1 *)inp)[1]);
#endif
(*block) (ctx->cmac.c, ctx->cmac.c, key);
inp += 16;
out += 16;
len -= 16;
}
if (len) {
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
(*block) (ctx->cmac.c, ctx->cmac.c, key);
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
static void ctr64_add(unsigned char *counter, size_t inc)
{
size_t n = 8, val = 0;
counter += 8;
do {
--n;
val += counter[n] + (inc & 0xff);
counter[n] = (unsigned char)val;
val >>= 8; /* carry bit */
inc >>= 8;
} while (n && (inc || val));
}
int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len, ccm128_f stream)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1; /* length mismatch */
ctx->blocks += ((len + 15) >> 3) | 1;
if (ctx->blocks > (U64(1) << 61))
return -2; /* too much data */
if ((n = len / 16)) {
(*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
n *= 16;
inp += n;
out += n;
len -= n;
if (len)
ctr64_add(ctx->nonce.c, n / 16);
}
if (len) {
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= inp[i];
(*block) (ctx->cmac.c, ctx->cmac.c, key);
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
out[i] = scratch.c[i] ^ inp[i];
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
const unsigned char *inp, unsigned char *out,
size_t len, ccm128_f stream)
{
size_t n;
unsigned int i, L;
unsigned char flags0 = ctx->nonce.c[0];
block128_f block = ctx->block;
void *key = ctx->key;
union {
u64 u[2];
u8 c[16];
} scratch;
if (!(flags0 & 0x40))
(*block) (ctx->nonce.c, ctx->cmac.c, key);
ctx->nonce.c[0] = L = flags0 & 7;
for (n = 0, i = 15 - L; i < 15; ++i) {
n |= ctx->nonce.c[i];
ctx->nonce.c[i] = 0;
n <<= 8;
}
n |= ctx->nonce.c[15]; /* reconstructed length */
ctx->nonce.c[15] = 1;
if (n != len)
return -1;
if ((n = len / 16)) {
(*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
n *= 16;
inp += n;
out += n;
len -= n;
if (len)
ctr64_add(ctx->nonce.c, n / 16);
}
if (len) {
(*block) (ctx->nonce.c, scratch.c, key);
for (i = 0; i < len; ++i)
ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
(*block) (ctx->cmac.c, ctx->cmac.c, key);
}
for (i = 15 - L; i < 16; ++i)
ctx->nonce.c[i] = 0;
(*block) (ctx->nonce.c, scratch.c, key);
ctx->cmac.u[0] ^= scratch.u[0];
ctx->cmac.u[1] ^= scratch.u[1];
ctx->nonce.c[0] = flags0;
return 0;
}
size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
M *= 2;
M += 2;
if (len != M)
return 0;
memcpy(tag, ctx->cmac.c, M);
return M;
}
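
The comments above spell out the required call order: init once per key, then setiv, optional aad, encrypt/decrypt and tag once per packet. The sketch below walks through that sequence; it assumes the legacy AES_encrypt()/AES_set_encrypt_key() block cipher from <openssl/aes.h> cast to block128_f, the internal "crypto/modes.h" header (as this file itself uses) for the CCM128_CONTEXT layout, and illustrative parameters M=16 (tag bytes) and L=4 (length-field bytes). The function name is hypothetical and error handling is reduced to return codes.

#include <openssl/aes.h>
#include "crypto/modes.h"

/* Sketch: CCM-seal `len` bytes of `msg` into `out` and produce a 16-byte tag.
 * Returns 1 on success, 0 on failure. */
static int ccm128_seal_sketch(const unsigned char key16[16],
                              const unsigned char *nonce, size_t noncelen,
                              const unsigned char *aad, size_t aadlen,
                              const unsigned char *msg, size_t len,
                              unsigned char *out, unsigned char tag[16])
{
    AES_KEY aeskey;
    CCM128_CONTEXT ccm;

    if (AES_set_encrypt_key(key16, 128, &aeskey) != 0)
        return 0;

    /* once per key: M = 16-byte tag, L = 4-byte length field */
    CRYPTO_ccm128_init(&ccm, 16, 4, &aeskey, (block128_f)AES_encrypt);

    /* once per packet: nonce must be at least 14 - L bytes (see setiv above) */
    if (CRYPTO_ccm128_setiv(&ccm, nonce, noncelen, len) != 0)
        return 0;
    if (aadlen > 0)
        CRYPTO_ccm128_aad(&ccm, aad, aadlen);
    if (CRYPTO_ccm128_encrypt(&ccm, msg, out, len) != 0)
        return 0;
    return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16;
}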

View File

@@ -0,0 +1,211 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
/*
* The input and output are encrypted as though 128-bit CFB mode is being used.
* The extra state information to record how much of the 128bit block we have
* used is contained in *num;
*/
void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], int *num,
int enc, block128_f block)
{
unsigned int n;
size_t l = 0;
if (*num < 0) {
/* There is no good way to signal an error return from here */
*num = -1;
return;
}
n = *num;
if (enc) {
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
*(out++) = ivec[n] ^= *(in++);
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ivec) %
sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ivec, key);
for (; n < 16; n += sizeof(size_t)) {
*(size_t_aX *)(out + n) =
*(size_t_aX *)(ivec + n)
^= *(size_t_aX *)(in + n);
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ivec, key);
while (len--) {
out[n] = ivec[n] ^= in[n];
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
if (n == 0) {
(*block) (ivec, ivec, key);
}
out[l] = ivec[n] ^= in[l];
++l;
n = (n + 1) % 16;
}
*num = n;
} else {
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
unsigned char c;
*(out++) = ivec[n] ^ (c = *(in++));
ivec[n] = c;
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ivec) %
sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ivec, key);
for (; n < 16; n += sizeof(size_t)) {
size_t t = *(size_t_aX *)(in + n);
*(size_t_aX *)(out + n)
= *(size_t_aX *)(ivec + n) ^ t;
*(size_t_aX *)(ivec + n) = t;
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ivec, key);
while (len--) {
unsigned char c;
out[n] = ivec[n] ^ (c = in[n]);
ivec[n] = c;
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
unsigned char c;
if (n == 0) {
(*block) (ivec, ivec, key);
}
out[l] = ivec[n] ^ (c = in[l]);
ivec[n] = c;
++l;
n = (n + 1) % 16;
}
*num = n;
}
}
/*
* This expects a single block of size nbits for both in and out. Note that
* it corrupts any extra bits in the last byte of out
*/
static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
int nbits, const void *key,
unsigned char ivec[16], int enc,
block128_f block)
{
int n, rem, num;
unsigned char ovec[16 * 2 + 1]; /* +1 because we dereference (but don't
* use) one byte off the end */
if (nbits <= 0 || nbits > 128)
return;
/* fill in the first half of the new IV with the current IV */
memcpy(ovec, ivec, 16);
/* construct the new IV */
(*block) (ivec, ivec, key);
num = (nbits + 7) / 8;
if (enc) /* encrypt the input */
for (n = 0; n < num; ++n)
out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
else /* decrypt the input */
for (n = 0; n < num; ++n)
out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
/* shift ovec left... */
rem = nbits % 8;
num = nbits / 8;
if (rem == 0)
memcpy(ivec, ovec + num, 16);
else
for (n = 0; n < 16; ++n)
ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
/* it is not necessary to cleanse ovec, since the IV is not secret */
}
/* N.B. This expects the input to be packed, MS bit first */
void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
size_t bits, const void *key,
unsigned char ivec[16], int *num,
int enc, block128_f block)
{
size_t n;
unsigned char c[1], d[1];
for (n = 0; n < bits; ++n) {
c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
((d[0] & 0x80) >> (unsigned int)(n % 8));
}
}
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const void *key,
unsigned char ivec[16], int *num,
int enc, block128_f block)
{
size_t n;
for (n = 0; n < length; ++n)
cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
}
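
The comment at the top of this file notes that the only state carried between calls is the IV and *num. The sketch below shows that property: encrypting a buffer in two CRYPTO_cfb128_encrypt() calls is equivalent to one call over the whole buffer, provided ivec and num are carried across. It assumes the legacy AES_encrypt() block function from <openssl/aes.h> and the public <openssl/modes.h> declarations; the function name is illustrative.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: CFB-128 encryption of `len` bytes, split across two calls. */
static void cfb128_sketch(const unsigned char key16[16],
                          const unsigned char iv[16],
                          const unsigned char *in, size_t len,
                          unsigned char *out)
{
    AES_KEY aeskey;
    unsigned char ivec[16];
    int num = 0;                /* must start at 0 for a fresh IV */
    size_t half = len / 2;

    AES_set_encrypt_key(key16, 128, &aeskey);
    memcpy(ivec, iv, 16);       /* ivec is updated in place */

    /* enc = 1 encrypts; pass 0 to decrypt with the same block function */
    CRYPTO_cfb128_encrypt(in, out, half, &aeskey, ivec, &num, 1,
                          (block128_f)AES_encrypt);
    CRYPTO_cfb128_encrypt(in + half, out + half, len - half, &aeskey, ivec,
                          &num, 1, (block128_f)AES_encrypt);
}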

View File

@@ -0,0 +1,212 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
/*
* NOTE: the IV/counter CTR mode is big-endian. The code itself is
* endian-neutral.
*/
/* increment counter (128-bit int) by 1 */
static void ctr128_inc(unsigned char *counter)
{
u32 n = 16, c = 1;
do {
--n;
c += counter[n];
counter[n] = (u8)c;
c >>= 8;
} while (n);
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
static void ctr128_inc_aligned(unsigned char *counter)
{
size_t *data, c, d, n;
DECLARE_IS_ENDIAN;
if (IS_LITTLE_ENDIAN || ((size_t)counter % sizeof(size_t)) != 0) {
ctr128_inc(counter);
return;
}
data = (size_t *)counter;
c = 1;
n = 16 / sizeof(size_t);
do {
--n;
d = data[n] += c;
/* did addition carry? */
c = ((d - c) & ~d) >> (sizeof(size_t) * 8 - 1);
} while (n);
}
#endif
/*
* The input is encrypted as though 128-bit counter mode is being used. The
* extra state information to record how much of the 128bit block we have
* used is contained in *num, and the encrypted counter is kept in
* ecount_buf. Both *num and ecount_buf must be initialised with zeros
* before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
* that the counter is in the x lower bits of the IV (ivec), and that the
* application has full control over overflow and the rest of the IV. This
* implementation takes NO responsibility for checking that the counter
* doesn't overflow into the rest of the IV when incremented.
*/
void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16],
unsigned char ecount_buf[16], unsigned int *num,
block128_f block)
{
unsigned int n;
size_t l = 0;
n = *num;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
*(out++) = *(in++) ^ ecount_buf[n];
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ecount_buf)
% sizeof(size_t) != 0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
for (n = 0; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n)
^ *(size_t_aX *)(ecount_buf + n);
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ecount_buf, key);
ctr128_inc_aligned(ivec);
while (len--) {
out[n] = in[n] ^ ecount_buf[n];
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
if (n == 0) {
(*block) (ivec, ecount_buf, key);
ctr128_inc(ivec);
}
out[l] = in[l] ^ ecount_buf[n];
++l;
n = (n + 1) % 16;
}
*num = n;
}
/* increment upper 96 bits of 128-bit counter by 1 */
static void ctr96_inc(unsigned char *counter)
{
u32 n = 12, c = 1;
do {
--n;
c += counter[n];
counter[n] = (u8)c;
c >>= 8;
} while (n);
}
void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16],
unsigned char ecount_buf[16],
unsigned int *num, ctr128_f func)
{
unsigned int n, ctr32;
n = *num;
while (n && len) {
*(out++) = *(in++) ^ ecount_buf[n];
--len;
n = (n + 1) % 16;
}
ctr32 = GETU32(ivec + 12);
while (len >= 16) {
size_t blocks = len / 16;
/*
* 1<<28 is just a not-so-small yet not-so-large number...
* Below condition is practically never met, but it has to
* be checked for code correctness.
*/
if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
blocks = (1U << 28);
/*
* As (*func) operates on 32-bit counter, caller
* has to handle overflow. 'if' below detects the
* overflow, which is then handled by limiting the
* amount of blocks to the exact overflow point...
*/
ctr32 += (u32)blocks;
if (ctr32 < blocks) {
blocks -= ctr32;
ctr32 = 0;
}
(*func) (in, out, blocks, key, ivec);
/* (*ctr) does not update ivec, caller does: */
PUTU32(ivec + 12, ctr32);
/* ... overflow was detected, propagate carry. */
if (ctr32 == 0)
ctr96_inc(ivec);
blocks *= 16;
len -= blocks;
out += blocks;
in += blocks;
}
if (len) {
memset(ecount_buf, 0, 16);
(*func) (ecount_buf, ecount_buf, 1, key, ivec);
++ctr32;
PUTU32(ivec + 12, ctr32);
if (ctr32 == 0)
ctr96_inc(ivec);
while (len--) {
out[n] = in[n] ^ ecount_buf[n];
++n;
}
}
*num = n;
}
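
The long comment before CRYPTO_ctr128_encrypt() states the caller contract: zero *num and ecount_buf before the first call, and manage counter overflow yourself. Below is a minimal sketch of that contract, assuming the legacy AES_encrypt() block function and the public <openssl/modes.h> declarations; the function name is illustrative, and in CTR mode the same call both encrypts and decrypts.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: apply the CTR-128 keystream to `len` bytes, split across two calls. */
static void ctr128_sketch(const unsigned char key16[16],
                          const unsigned char iv[16],   /* initial counter block */
                          const unsigned char *in, size_t len,
                          unsigned char *out)
{
    AES_KEY aeskey;
    unsigned char ivec[16];
    unsigned char ecount_buf[16] = { 0 };   /* must be zeroed before 1st call */
    unsigned int num = 0;                   /* ditto */
    size_t half = len / 2;

    AES_set_encrypt_key(key16, 128, &aeskey);
    memcpy(ivec, iv, 16);                   /* ivec is incremented in place */

    CRYPTO_ctr128_encrypt(in, out, half, &aeskey, ivec, ecount_buf, &num,
                          (block128_f)AES_encrypt);
    CRYPTO_ctr128_encrypt(in + half, out + half, len - half, &aeskey, ivec,
                          ecount_buf, &num, (block128_f)AES_encrypt);
}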

View File

@@ -0,0 +1,330 @@
/*
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
/*
* The trouble with Ciphertext Stealing (CTS) mode is that there is no
* common official specification, only a couple of cipher/application-
* specific ones: RFC 2040 and RFC 3962. Then there is the 'Proposal to
* Extend CBC Mode By "Ciphertext Stealing"' at the NIST site, which
* deviates from the mentioned RFCs. Most notably it allows the input to
* be exactly the block length and it doesn't flip the order of the last
* two blocks. CTS has been discussed even in an ECB context, but it's
* not adopted for any known application. This implementation provides
* two interfaces: one compliant with the above-mentioned RFCs and one
* compliant with the NIST proposal, both extending CBC mode.
*/
size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key, unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= residue;
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
in += len;
out += len;
for (n = 0; n < residue; ++n)
ivec[n] ^= in[n];
(*block) (ivec, ivec, key);
memcpy(out, out - 16, residue);
memcpy(out - 16, ivec, 16);
return len + residue;
}
size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key,
unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
if (len < 16)
return 0;
residue = len % 16;
len -= residue;
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
if (residue == 0)
return len;
in += len;
out += len;
for (n = 0; n < residue; ++n)
ivec[n] ^= in[n];
(*block) (ivec, ivec, key);
memcpy(out - 16 + residue, ivec, 16);
return len + residue;
}
size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[16];
} tmp;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= residue;
(*cbc) (in, out, len, key, ivec, 1);
in += len;
out += len;
#if defined(CBC_HANDLES_TRUNCATED_IO)
memcpy(tmp.c, out - 16, 16);
(*cbc) (in, out - 16, residue, key, ivec, 1);
memcpy(out, tmp.c, residue);
#else
memset(tmp.c, 0, sizeof(tmp));
memcpy(tmp.c, in, residue);
memcpy(out, out - 16, residue);
(*cbc) (tmp.c, out - 16, 16, key, ivec, 1);
#endif
return len + residue;
}
size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[16];
} tmp;
if (len < 16)
return 0;
residue = len % 16;
len -= residue;
(*cbc) (in, out, len, key, ivec, 1);
if (residue == 0)
return len;
in += len;
out += len;
#if defined(CBC_HANDLES_TRUNCATED_IO)
(*cbc) (in, out - 16 + residue, residue, key, ivec, 1);
#else
memset(tmp.c, 0, sizeof(tmp));
memcpy(tmp.c, in, residue);
(*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);
#endif
return len + residue;
}
size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key, unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= 16 + residue;
if (len) {
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
in += len;
out += len;
}
(*block) (in, tmp.c + 16, key);
memcpy(tmp.c, tmp.c + 16, 16);
memcpy(tmp.c, in + 16, residue);
(*block) (tmp.c, tmp.c, key);
for (n = 0; n < 16; ++n) {
unsigned char c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = c;
}
for (residue += 16; n < residue; ++n)
out[n] = tmp.c[n] ^ in[n];
return 16 + len + residue;
}
size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
unsigned char *out, size_t len,
const void *key,
unsigned char ivec[16],
block128_f block)
{
size_t residue, n;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len < 16)
return 0;
residue = len % 16;
if (residue == 0) {
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
return len;
}
len -= 16 + residue;
if (len) {
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
in += len;
out += len;
}
(*block) (in + residue, tmp.c + 16, key);
memcpy(tmp.c, tmp.c + 16, 16);
memcpy(tmp.c, in, residue);
(*block) (tmp.c, tmp.c, key);
for (n = 0; n < 16; ++n) {
unsigned char c = in[n];
out[n] = tmp.c[n] ^ ivec[n];
ivec[n] = in[n + residue];
tmp.c[n] = c;
}
for (residue += 16; n < residue; ++n)
out[n] = tmp.c[n] ^ tmp.c[n - 16];
return 16 + len + residue;
}
size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len <= 16)
return 0;
if ((residue = len % 16) == 0)
residue = 16;
len -= 16 + residue;
if (len) {
(*cbc) (in, out, len, key, ivec, 0);
in += len;
out += len;
}
memset(tmp.c, 0, sizeof(tmp));
/*
* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
*/
(*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0);
memcpy(tmp.c, in + 16, residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
(*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
#else
(*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
memcpy(out, tmp.c, 16 + residue);
#endif
return 16 + len + residue;
}
size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], cbc128_f cbc)
{
size_t residue;
union {
size_t align;
unsigned char c[32];
} tmp;
if (len < 16)
return 0;
residue = len % 16;
if (residue == 0) {
(*cbc) (in, out, len, key, ivec, 0);
return len;
}
len -= 16 + residue;
if (len) {
(*cbc) (in, out, len, key, ivec, 0);
in += len;
out += len;
}
memset(tmp.c, 0, sizeof(tmp));
/*
* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
*/
(*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0);
memcpy(tmp.c, in, residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
(*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
#else
(*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
memcpy(out, tmp.c, 16 + residue);
#endif
return 16 + len + residue;
}
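
As the comment at the top of the file explains, two families of interfaces are provided; the sketch below uses the RFC-style block-function interface for encryption. It assumes the legacy AES_encrypt() block function and the public <openssl/modes.h> declarations, and the function name is illustrative. The input must be longer than one block (the function returns 0 otherwise); the output has the same length as the input, with the last two blocks swapped as described above.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Sketch: RFC-style CTS encryption of `len` > 16 bytes; returns bytes written
 * (== len) or 0 on error. */
static size_t cts128_encrypt_sketch(const unsigned char key16[16],
                                    const unsigned char iv[16],
                                    const unsigned char *in, size_t len,
                                    unsigned char *out)
{
    AES_KEY aeskey;
    unsigned char ivec[16];

    AES_set_encrypt_key(key16, 128, &aeskey);
    memcpy(ivec, iv, 16);       /* ivec is updated in place */

    return CRYPTO_cts128_encrypt_block(in, out, len, &aeskey, ivec,
                                       (block128_f)AES_encrypt);
}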

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-cbc128.o: crypto/modes/cbc128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-ccm128.o: crypto/modes/ccm128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-cfb128.o: crypto/modes/cfb128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,9 @@
crypto/modes/libcrypto-lib-ctr128.o: crypto/modes/ctr128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/endian.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-cts128.o: crypto/modes/cts128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,15 @@
crypto/modes/libcrypto-lib-gcm128.o: crypto/modes/gcm128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/cryptlib.h \
include/internal/common.h include/internal/e_os.h \
include/internal/numbers.h include/internal/nelem.h \
include/openssl/buffer.h include/openssl/buffererr.h \
include/openssl/bio.h include/openssl/bioerr.h include/openssl/asn1.h \
include/openssl/asn1err.h include/openssl/bn.h include/openssl/bnerr.h \
include/openssl/err.h include/openssl/lhash.h include/internal/endian.h \
include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,10 @@
crypto/modes/libcrypto-lib-ocb128.o: crypto/modes/ocb128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/openssl/err.h include/openssl/bio.h \
include/openssl/bioerr.h include/openssl/lhash.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,8 @@
crypto/modes/libcrypto-lib-ofb128.o: crypto/modes/ofb128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/crypto/modes.h include/openssl/modes.h

View File

@@ -0,0 +1,16 @@
crypto/modes/libcrypto-lib-siv128.o: crypto/modes/siv128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/openssl/evp.h \
include/openssl/core_dispatch.h include/openssl/indicator.h \
include/openssl/params.h include/openssl/bn.h include/openssl/bnerr.h \
include/openssl/bio.h include/openssl/bioerr.h include/openssl/evperr.h \
include/openssl/objects.h include/openssl/obj_mac.h \
include/openssl/asn1.h include/openssl/asn1err.h \
include/openssl/objectserr.h include/openssl/core_names.h \
include/internal/endian.h include/crypto/modes.h include/openssl/modes.h \
include/crypto/siv.h

View File

@@ -0,0 +1,14 @@
crypto/modes/libcrypto-lib-wrap128.o: crypto/modes/wrap128.c \
include/internal/cryptlib.h include/internal/common.h \
include/openssl/configuration.h include/internal/e_os.h \
include/openssl/opensslconf.h include/openssl/macros.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/crypto.h include/openssl/safestack.h \
include/openssl/stack.h include/openssl/types.h \
include/openssl/cryptoerr.h include/openssl/symhacks.h \
include/openssl/cryptoerr_legacy.h include/openssl/core.h \
include/internal/numbers.h include/internal/nelem.h \
include/openssl/buffer.h include/openssl/buffererr.h \
include/openssl/bio.h include/openssl/bioerr.h include/openssl/asn1.h \
include/openssl/asn1err.h include/openssl/bn.h include/openssl/bnerr.h \
include/openssl/err.h include/openssl/lhash.h include/openssl/modes.h

View File

@@ -0,0 +1,9 @@
crypto/modes/libcrypto-lib-xts128.o: crypto/modes/xts128.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/endian.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,9 @@
crypto/modes/libcrypto-lib-xts128gb.o: crypto/modes/xts128gb.c \
include/openssl/crypto.h include/openssl/macros.h \
include/openssl/opensslconf.h include/openssl/configuration.h \
include/openssl/opensslv.h include/openssl/e_os2.h \
include/openssl/safestack.h include/openssl/stack.h \
include/openssl/types.h include/openssl/cryptoerr.h \
include/openssl/symhacks.h include/openssl/cryptoerr_legacy.h \
include/openssl/core.h include/internal/endian.h include/crypto/modes.h \
include/openssl/modes.h

View File

@@ -0,0 +1,558 @@
/*
* Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/err.h>
#include "crypto/modes.h"
#ifndef OPENSSL_NO_OCB
/*
* Calculate the number of binary trailing zeros in any given number
*/
static u32 ocb_ntz(u64 n)
{
u32 cnt = 0;
/*
* We do a right-to-left simple sequential search. This is surprisingly
* efficient as the distribution of trailing zeros is not uniform,
* e.g. the number of possible inputs with no trailing zeros is equal to
* the number with 1 or more; the number with exactly 1 is equal to the
* number with 2 or more, etc. Checking the last two bits covers 75% of
* all numbers. Checking the last three covers 87.5%
*/
while (!(n & 1)) {
n >>= 1;
cnt++;
}
return cnt;
}
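/*
 * Illustrative sketch (not part of the original file, guarded out): on
 * GCC/Clang the same trailing-zero count can be obtained with a compiler
 * builtin; ocb_ntz_builtin is a hypothetical name used only for comparison
 * with the loop above.
 */
#if 0
static u32 ocb_ntz_builtin(u64 n)
{
    /* __builtin_ctzll() is undefined for n == 0; the OCB callers never pass 0 */
    return (u32)__builtin_ctzll(n);
}
#endif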
/*
* Shift a block of 16 bytes left by shift bits
*/
static void ocb_block_lshift(const unsigned char *in, size_t shift,
unsigned char *out)
{
int i;
unsigned char carry = 0, carry_next;
for (i = 15; i >= 0; i--) {
carry_next = in[i] >> (8 - shift);
out[i] = (in[i] << shift) | carry;
carry = carry_next;
}
}
/*
* Perform a "double" operation as per OCB spec
*/
static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out)
{
unsigned char mask;
/*
* Calculate the mask based on the most significant bit. There are more
* efficient ways to do this - but this way is constant time
*/
mask = in->c[0] & 0x80;
mask >>= 7;
mask = (0 - mask) & 0x87;
ocb_block_lshift(in->c, 1, out->c);
out->c[15] ^= mask;
}
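/*
 * Hedged sketch (not part of the original file, guarded out): the masking
 * above selects the GF(2^128) reduction constant 0x87 (for the polynomial
 * x^128 + x^7 + x^2 + x + 1) whenever the bit shifted out was set. A direct,
 * branching (and therefore not constant-time) equivalent for illustration:
 */
#if 0
static void ocb_double_branchy(OCB_BLOCK *in, OCB_BLOCK *out)
{
    ocb_block_lshift(in->c, 1, out->c);
    if (in->c[0] & 0x80)
        out->c[15] ^= 0x87;
}
#endif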
/*
* Perform an xor on in1 and in2 - each of len bytes. Store result in out
*/
static void ocb_block_xor(const unsigned char *in1,
const unsigned char *in2, size_t len,
unsigned char *out)
{
size_t i;
for (i = 0; i < len; i++) {
out[i] = in1[i] ^ in2[i];
}
}
/*
* Look up L_index in our lookup table. If we haven't already got it we need
* to calculate it
*/
static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
{
size_t l_index = ctx->l_index;
if (idx <= l_index) {
return ctx->l + idx;
}
/* We don't have it - so calculate it */
if (idx >= ctx->max_l_index) {
void *tmp_ptr;
/*
* Each additional entry allows us to process almost twice as
* much data, so in a linear world the table would need to be
* expanded with smaller and smaller increments. Originally it
* doubled in size, which was wasteful. Growing it linearly is
* not formally optimal, but is simpler to implement. We grow
* the table by the minimal multiple of 4 entries that
* accommodates the index.
*/
ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3;
tmp_ptr = OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK));
if (tmp_ptr == NULL) /* prevent ctx->l from being clobbered */
return NULL;
ctx->l = tmp_ptr;
}
while (l_index < idx) {
ocb_double(ctx->l + l_index, ctx->l + l_index + 1);
l_index++;
}
ctx->l_index = l_index;
return ctx->l + idx;
}
/*
* Create a new OCB128_CONTEXT
*/
OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
block128_f encrypt, block128_f decrypt,
ocb128_f stream)
{
OCB128_CONTEXT *octx;
int ret;
if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) {
ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt,
stream);
if (ret)
return octx;
OPENSSL_free(octx);
}
return NULL;
}
/*
* Initialise an existing OCB128_CONTEXT
*/
int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
block128_f encrypt, block128_f decrypt,
ocb128_f stream)
{
memset(ctx, 0, sizeof(*ctx));
ctx->l_index = 0;
ctx->max_l_index = 5;
if ((ctx->l = OPENSSL_malloc(ctx->max_l_index * 16)) == NULL)
return 0;
/*
* We set both the encryption and decryption key schedules - decryption
* needs both. Don't really need decryption schedule if only doing
* encryption - but it simplifies things to take it anyway
*/
ctx->encrypt = encrypt;
ctx->decrypt = decrypt;
ctx->stream = stream;
ctx->keyenc = keyenc;
ctx->keydec = keydec;
/* L_* = ENCIPHER(K, zeros(128)) */
ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
/* L_$ = double(L_*) */
ocb_double(&ctx->l_star, &ctx->l_dollar);
/* L_0 = double(L_$) */
ocb_double(&ctx->l_dollar, ctx->l);
/* L_{i} = double(L_{i-1}) */
ocb_double(ctx->l, ctx->l+1);
ocb_double(ctx->l+1, ctx->l+2);
ocb_double(ctx->l+2, ctx->l+3);
ocb_double(ctx->l+3, ctx->l+4);
ctx->l_index = 4; /* enough to process up to 496 bytes */
return 1;
}
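/*
 * Illustrative note (not in the original): with L_0 .. L_4 precomputed
 * (l_index == 4), every block index i < 32 satisfies ntz(i) <= 4, so up to
 * 31 blocks, i.e. 31 * 16 = 496 bytes, can be processed before
 * ocb_lookup_l() has to extend the table - which is where the "496 bytes"
 * figure above comes from.
 */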
/*
* Copy an OCB128_CONTEXT object
*/
int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src,
void *keyenc, void *keydec)
{
memcpy(dest, src, sizeof(OCB128_CONTEXT));
if (keyenc)
dest->keyenc = keyenc;
if (keydec)
dest->keydec = keydec;
if (src->l) {
if ((dest->l = OPENSSL_malloc(src->max_l_index * 16)) == NULL)
return 0;
memcpy(dest->l, src->l, (src->l_index + 1) * 16);
}
return 1;
}
/*
* Set the IV to be used for this operation. Must be 1 - 15 bytes.
*/
int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
size_t len, size_t taglen)
{
unsigned char ktop[16], tmp[16], mask;
unsigned char stretch[24], nonce[16];
size_t bottom, shift;
/*
* The spec says the IV is 120 bits or fewer - it allows non-byte-aligned
* lengths. We don't support this at this stage.
*/
if ((len > 15) || (len < 1) || (taglen > 16) || (taglen < 1)) {
return -1;
}
/* Reset nonce-dependent variables */
memset(&ctx->sess, 0, sizeof(ctx->sess));
/* Nonce = num2str(TAGLEN mod 128,7) || zeros(120-bitlen(N)) || 1 || N */
nonce[0] = ((taglen * 8) % 128) << 1;
memset(nonce + 1, 0, 15);
memcpy(nonce + 16 - len, iv, len);
nonce[15 - len] |= 1;
/* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
memcpy(tmp, nonce, 16);
tmp[15] &= 0xc0;
ctx->encrypt(tmp, ktop, ctx->keyenc);
/* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
memcpy(stretch, ktop, 16);
ocb_block_xor(ktop, ktop + 1, 8, stretch + 16);
/* bottom = str2num(Nonce[123..128]) */
bottom = nonce[15] & 0x3f;
/* Offset_0 = Stretch[1+bottom..128+bottom] */
shift = bottom % 8;
ocb_block_lshift(stretch + (bottom / 8), shift, ctx->sess.offset.c);
mask = 0xff;
mask <<= 8 - shift;
ctx->sess.offset.c[15] |=
(*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift);
return 1;
}
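/*
 * Illustrative worked example (not in the original): for the common case of
 * a 12-byte IV and a 16-byte tag, the nonce assembled above is
 *   nonce[0]     = ((16 * 8) % 128) << 1 = 0
 *   nonce[1..2]  = 0,  nonce[3] |= 1      (the "1" separator bit)
 *   nonce[4..15] = IV
 * Ktop is the encryption of that nonce with the low 6 bits of the last byte
 * cleared, and "bottom" is exactly those low 6 bits of nonce[15].
 */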
/*
* Provide any AAD. This can be called multiple times. Only the final time can
* have a partial block
*/
int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
size_t len)
{
u64 i, all_num_blocks;
size_t num_blocks, last_len;
OCB_BLOCK tmp;
/* Calculate the number of blocks of AAD provided now, and so far */
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->sess.blocks_hashed;
/* Loop through all full blocks of AAD */
for (i = ctx->sess.blocks_hashed + 1; i <= all_num_blocks; i++) {
OCB_BLOCK *lookup;
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->sess.offset_aad, lookup, &ctx->sess.offset_aad);
memcpy(tmp.c, aad, 16);
aad += 16;
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
ocb_block16_xor(&ctx->sess.offset_aad, &tmp, &tmp);
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&tmp, &ctx->sess.sum, &ctx->sess.sum);
}
/*
* Check if we have any partial blocks left over. This is only valid in the
* last call to this function
*/
last_len = len % 16;
if (last_len > 0) {
/* Offset_* = Offset_m xor L_* */
ocb_block16_xor(&ctx->sess.offset_aad, &ctx->l_star,
&ctx->sess.offset_aad);
/* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
memset(tmp.c, 0, 16);
memcpy(tmp.c, aad, last_len);
tmp.c[last_len] = 0x80;
ocb_block16_xor(&ctx->sess.offset_aad, &tmp, &tmp);
/* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&tmp, &ctx->sess.sum, &ctx->sess.sum);
}
ctx->sess.blocks_hashed = all_num_blocks;
return 1;
}
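/*
 * Illustrative note (not in the original): the L_{ntz(i)} schedule walks the
 * precomputed table as i = 1, 2, 3, 4, 5, 6, 7, 8, ... selects
 * L_0, L_1, L_0, L_2, L_0, L_1, L_0, L_3, ..., which is why ocb_ntz() is
 * cheap on average and why the table only needs roughly log2(blocks) entries.
 */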
/*
* Provide any data to be encrypted. This can be called multiple times. Only
* the final time can have a partial block
*/
int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
u64 i, all_num_blocks;
size_t num_blocks, last_len;
/*
* Calculate the number of blocks of data to be encrypted provided now, and
* so far
*/
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->sess.blocks_processed;
if (num_blocks && all_num_blocks == (size_t)all_num_blocks
&& ctx->stream != NULL) {
size_t max_idx = 0, top = (size_t)all_num_blocks;
/*
* See how many L_{i} entries we need to process data at hand
* and pre-compute missing entries in the table [if any]...
*/
while (top >>= 1)
max_idx++;
if (ocb_lookup_l(ctx, max_idx) == NULL)
return 0;
ctx->stream(in, out, num_blocks, ctx->keyenc,
(size_t)ctx->sess.blocks_processed + 1, ctx->sess.offset.c,
(const unsigned char (*)[16])ctx->l, ctx->sess.checksum.c);
} else {
/* Loop through all full blocks to be encrypted */
for (i = ctx->sess.blocks_processed + 1; i <= all_num_blocks; i++) {
OCB_BLOCK *lookup;
OCB_BLOCK tmp;
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->sess.offset, lookup, &ctx->sess.offset);
memcpy(tmp.c, in, 16);
in += 16;
/* Checksum_i = Checksum_{i-1} xor P_i */
ocb_block16_xor(&tmp, &ctx->sess.checksum, &ctx->sess.checksum);
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
memcpy(out, tmp.c, 16);
out += 16;
}
}
/*
* Check if we have any partial blocks left over. This is only valid in the
* last call to this function
*/
last_len = len % 16;
if (last_len > 0) {
OCB_BLOCK pad;
/* Offset_* = Offset_m xor L_* */
ocb_block16_xor(&ctx->sess.offset, &ctx->l_star, &ctx->sess.offset);
/* Pad = ENCIPHER(K, Offset_*) */
ctx->encrypt(ctx->sess.offset.c, pad.c, ctx->keyenc);
/* C_* = P_* xor Pad[1..bitlen(P_*)] */
ocb_block_xor(in, pad.c, last_len, out);
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
memset(pad.c, 0, 16); /* borrow pad */
memcpy(pad.c, in, last_len);
pad.c[last_len] = 0x80;
ocb_block16_xor(&pad, &ctx->sess.checksum, &ctx->sess.checksum);
}
ctx->sess.blocks_processed = all_num_blocks;
return 1;
}
/*
* Provide any data to be decrypted. This can be called multiple times. Only
* the final time can have a partial block
*/
int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
u64 i, all_num_blocks;
size_t num_blocks, last_len;
/*
* Calculate the number of blocks of data to be decrypted provided now, and
* so far
*/
num_blocks = len / 16;
all_num_blocks = num_blocks + ctx->sess.blocks_processed;
if (num_blocks && all_num_blocks == (size_t)all_num_blocks
&& ctx->stream != NULL) {
size_t max_idx = 0, top = (size_t)all_num_blocks;
/*
* See how many L_{i} entries we need to process data at hand
* and pre-compute missing entries in the table [if any]...
*/
while (top >>= 1)
max_idx++;
if (ocb_lookup_l(ctx, max_idx) == NULL)
return 0;
ctx->stream(in, out, num_blocks, ctx->keydec,
(size_t)ctx->sess.blocks_processed + 1, ctx->sess.offset.c,
(const unsigned char (*)[16])ctx->l, ctx->sess.checksum.c);
} else {
OCB_BLOCK tmp;
/* Loop through all full blocks to be decrypted */
for (i = ctx->sess.blocks_processed + 1; i <= all_num_blocks; i++) {
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
if (lookup == NULL)
return 0;
ocb_block16_xor(&ctx->sess.offset, lookup, &ctx->sess.offset);
memcpy(tmp.c, in, 16);
in += 16;
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
ctx->decrypt(tmp.c, tmp.c, ctx->keydec);
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
/* Checksum_i = Checksum_{i-1} xor P_i */
ocb_block16_xor(&tmp, &ctx->sess.checksum, &ctx->sess.checksum);
memcpy(out, tmp.c, 16);
out += 16;
}
}
/*
* Check if we have any partial blocks left over. This is only valid in the
* last call to this function
*/
last_len = len % 16;
if (last_len > 0) {
OCB_BLOCK pad;
/* Offset_* = Offset_m xor L_* */
ocb_block16_xor(&ctx->sess.offset, &ctx->l_star, &ctx->sess.offset);
/* Pad = ENCIPHER(K, Offset_*) */
ctx->encrypt(ctx->sess.offset.c, pad.c, ctx->keyenc);
/* P_* = C_* xor Pad[1..bitlen(C_*)] */
ocb_block_xor(in, pad.c, last_len, out);
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
memset(pad.c, 0, 16); /* borrow pad */
memcpy(pad.c, out, last_len);
pad.c[last_len] = 0x80;
ocb_block16_xor(&pad, &ctx->sess.checksum, &ctx->sess.checksum);
}
ctx->sess.blocks_processed = all_num_blocks;
return 1;
}
static int ocb_finish(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len,
int write)
{
OCB_BLOCK tmp;
if (len > 16 || len < 1) {
return -1;
}
/*
* Tag = ENCIPHER(K, Checksum_* xor Offset_* xor L_$) xor HASH(K,A)
*/
ocb_block16_xor(&ctx->sess.checksum, &ctx->sess.offset, &tmp);
ocb_block16_xor(&ctx->l_dollar, &tmp, &tmp);
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
ocb_block16_xor(&tmp, &ctx->sess.sum, &tmp);
if (write) {
memcpy(tag, &tmp, len);
return 1;
} else {
return CRYPTO_memcmp(&tmp, tag, len);
}
}
/*
* Calculate the tag and verify it against the supplied tag
*/
int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
size_t len)
{
return ocb_finish(ctx, (unsigned char*)tag, len, 0);
}
/*
* Retrieve the calculated tag
*/
int CRYPTO_ocb128_tag(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
return ocb_finish(ctx, tag, len, 1);
}
/*
* Release all resources
*/
void CRYPTO_ocb128_cleanup(OCB128_CONTEXT *ctx)
{
if (ctx) {
OPENSSL_clear_free(ctx->l, ctx->max_l_index * 16);
OPENSSL_cleanse(ctx, sizeof(*ctx));
}
}
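/*
 * Hedged usage sketch (illustrative only, guarded out): a minimal
 * encrypt-side flow through the API above. The AES names (AES_KEY,
 * AES_set_encrypt_key, AES_encrypt) are assumptions drawn from the public
 * AES API, the function-pointer casts follow the usual block128_f idiom, and
 * error handling is compressed; a real decryption path would also need a
 * decryption key schedule and block function.
 */
#if 0
# include <openssl/aes.h>
static int ocb_usage_sketch(const unsigned char key[16],
                            const unsigned char iv[12],
                            const unsigned char *pt, size_t ptlen,
                            unsigned char *ct, unsigned char tag[16])
{
    AES_KEY aeskey;
    OCB128_CONTEXT octx;
    int ok = 0;

    AES_set_encrypt_key(key, 128, &aeskey);
    if (!CRYPTO_ocb128_init(&octx, &aeskey, &aeskey,
                            (block128_f)AES_encrypt, (block128_f)AES_encrypt,
                            NULL))
        return 0;
    if (CRYPTO_ocb128_setiv(&octx, iv, 12, 16) == 1
            && CRYPTO_ocb128_encrypt(&octx, pt, ct, ptlen)
            && CRYPTO_ocb128_tag(&octx, tag, 16) == 1)
        ok = 1;
    CRYPTO_ocb128_cleanup(&octx);
    return ok;
}
#endif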
#endif /* OPENSSL_NO_OCB */

View File

@@ -0,0 +1,86 @@
/*
* Copyright 2008-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
/*
* The input and output are encrypted as though 128 bit OFB mode is being
* used. The extra state information to record how much of the 128 bit block
* we have used is contained in *num.
*/
void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], int *num, block128_f block)
{
unsigned int n;
size_t l = 0;
if (*num < 0) {
/* There is no good way to signal an error return from here */
*num = -1;
return;
}
n = *num;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
if (16 % sizeof(size_t) == 0) { /* always true actually */
do {
while (n && len) {
*(out++) = *(in++) ^ ivec[n];
--len;
n = (n + 1) % 16;
}
# if defined(STRICT_ALIGNMENT)
if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
0)
break;
# endif
while (len >= 16) {
(*block) (ivec, ivec, key);
for (; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n)
^ *(size_t_aX *)(ivec + n);
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block) (ivec, ivec, key);
while (len--) {
out[n] = in[n] ^ ivec[n];
++n;
}
}
*num = n;
return;
} while (0);
}
/* the rest would be commonly eliminated by x86* compiler */
#endif
while (l < len) {
if (n == 0) {
(*block) (ivec, ivec, key);
}
out[l] = in[l] ^ ivec[n];
++l;
n = (n + 1) % 16;
}
*num = n;
}
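/*
 * Hedged usage sketch (illustrative only, guarded out): OFB is a stream
 * mode, so encryption and decryption are the same call, and *num lets the
 * keystream position carry over between chunks. The AES names are
 * assumptions drawn from the public AES API.
 */
#if 0
# include <openssl/aes.h>
static void ofb_usage_sketch(const unsigned char key[16],
                             unsigned char ivec[16],
                             const unsigned char *in, unsigned char *out,
                             size_t len)
{
    AES_KEY aeskey;
    int num = 0;

    AES_set_encrypt_key(key, 128, &aeskey);
    /* Process in two chunks: *num carries the keystream offset across calls */
    CRYPTO_ofb128_encrypt(in, out, len / 2, &aeskey, ivec, &num,
                          (block128_f)AES_encrypt);
    CRYPTO_ofb128_encrypt(in + len / 2, out + len / 2, len - len / 2,
                          &aeskey, ivec, &num, (block128_f)AES_encrypt);
}
#endif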

View File

@@ -0,0 +1,393 @@
/*
* Copyright 2018-2021 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <stdlib.h>
#include <openssl/crypto.h>
#include <openssl/evp.h>
#include <openssl/core_names.h>
#include <openssl/params.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#include "crypto/siv.h"
#ifndef OPENSSL_NO_SIV
__owur static ossl_inline uint32_t rotl8(uint32_t x)
{
return (x << 8) | (x >> 24);
}
__owur static ossl_inline uint32_t rotr8(uint32_t x)
{
return (x >> 8) | (x << 24);
}
__owur static ossl_inline uint64_t byteswap8(uint64_t x)
{
uint32_t high = (uint32_t)(x >> 32);
uint32_t low = (uint32_t)x;
high = (rotl8(high) & 0x00ff00ff) | (rotr8(high) & 0xff00ff00);
low = (rotl8(low) & 0x00ff00ff) | (rotr8(low) & 0xff00ff00);
return ((uint64_t)low) << 32 | (uint64_t)high;
}
__owur static ossl_inline uint64_t siv128_getword(SIV_BLOCK const *b, size_t i)
{
DECLARE_IS_ENDIAN;
if (IS_LITTLE_ENDIAN)
return byteswap8(b->word[i]);
return b->word[i];
}
static ossl_inline void siv128_putword(SIV_BLOCK *b, size_t i, uint64_t x)
{
DECLARE_IS_ENDIAN;
if (IS_LITTLE_ENDIAN)
b->word[i] = byteswap8(x);
else
b->word[i] = x;
}
static ossl_inline void siv128_xorblock(SIV_BLOCK *x,
SIV_BLOCK const *y)
{
x->word[0] ^= y->word[0];
x->word[1] ^= y->word[1];
}
/*
* Doubles |b|, which is 16 bytes representing an element
* of GF(2**128) modulo the irreducible polynomial
* x**128 + x**7 + x**2 + x + 1.
* Assumes two's-complement arithmetic
*/
static ossl_inline void siv128_dbl(SIV_BLOCK *b)
{
uint64_t high = siv128_getword(b, 0);
uint64_t low = siv128_getword(b, 1);
uint64_t high_carry = high & (((uint64_t)1) << 63);
uint64_t low_carry = low & (((uint64_t)1) << 63);
int64_t low_mask = -((int64_t)(high_carry >> 63)) & 0x87;
uint64_t high_mask = low_carry >> 63;
high = (high << 1) | high_mask;
low = (low << 1) ^ (uint64_t)low_mask;
siv128_putword(b, 0, high);
siv128_putword(b, 1, low);
}
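/*
 * Hedged sketch (not part of the original file, guarded out): the same
 * doubling expressed on the 16-byte big-endian view, with an explicit branch
 * instead of the constant-time masks above; siv128_dbl_bytes is a
 * hypothetical name used only for illustration.
 */
#if 0
static void siv128_dbl_bytes(SIV_BLOCK *b)
{
    unsigned char carry = b->byte[0] & 0x80;
    int i;

    for (i = 0; i < 15; i++)
        b->byte[i] = (unsigned char)((b->byte[i] << 1) | (b->byte[i + 1] >> 7));
    b->byte[15] = (unsigned char)(b->byte[15] << 1);
    if (carry)
        b->byte[15] ^= 0x87;   /* reduce by x^128 + x^7 + x^2 + x + 1 */
}
#endif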
__owur static ossl_inline int siv128_do_s2v_p(SIV128_CONTEXT *ctx, SIV_BLOCK *out,
unsigned char const* in, size_t len)
{
SIV_BLOCK t;
size_t out_len = sizeof(out->byte);
EVP_MAC_CTX *mac_ctx;
int ret = 0;
mac_ctx = EVP_MAC_CTX_dup(ctx->mac_ctx_init);
if (mac_ctx == NULL)
return 0;
if (len >= SIV_LEN) {
if (!EVP_MAC_update(mac_ctx, in, len - SIV_LEN))
goto err;
memcpy(&t, in + (len-SIV_LEN), SIV_LEN);
siv128_xorblock(&t, &ctx->d);
if (!EVP_MAC_update(mac_ctx, t.byte, SIV_LEN))
goto err;
} else {
memset(&t, 0, sizeof(t));
memcpy(&t, in, len);
t.byte[len] = 0x80;
siv128_dbl(&ctx->d);
siv128_xorblock(&t, &ctx->d);
if (!EVP_MAC_update(mac_ctx, t.byte, SIV_LEN))
goto err;
}
if (!EVP_MAC_final(mac_ctx, out->byte, &out_len, sizeof(out->byte))
|| out_len != SIV_LEN)
goto err;
ret = 1;
err:
EVP_MAC_CTX_free(mac_ctx);
return ret;
}
__owur static ossl_inline int siv128_do_encrypt(EVP_CIPHER_CTX *ctx, unsigned char *out,
unsigned char const *in, size_t len,
SIV_BLOCK *icv)
{
int out_len = (int)len;
if (!EVP_CipherInit_ex(ctx, NULL, NULL, NULL, icv->byte, 1))
return 0;
return EVP_EncryptUpdate(ctx, out, &out_len, in, out_len);
}
/*
* Create a new SIV128_CONTEXT
*/
SIV128_CONTEXT *ossl_siv128_new(const unsigned char *key, int klen,
EVP_CIPHER *cbc, EVP_CIPHER *ctr,
OSSL_LIB_CTX *libctx, const char *propq)
{
SIV128_CONTEXT *ctx;
int ret;
if ((ctx = OPENSSL_malloc(sizeof(*ctx))) != NULL) {
ret = ossl_siv128_init(ctx, key, klen, cbc, ctr, libctx, propq);
if (ret)
return ctx;
OPENSSL_free(ctx);
}
return NULL;
}
/*
* Initialise an existing SIV128_CONTEXT
*/
int ossl_siv128_init(SIV128_CONTEXT *ctx, const unsigned char *key, int klen,
const EVP_CIPHER *cbc, const EVP_CIPHER *ctr,
OSSL_LIB_CTX *libctx, const char *propq)
{
static const unsigned char zero[SIV_LEN] = { 0 };
size_t out_len = SIV_LEN;
EVP_MAC_CTX *mac_ctx = NULL;
OSSL_PARAM params[3];
const char *cbc_name;
if (ctx == NULL)
return 0;
memset(&ctx->d, 0, sizeof(ctx->d));
EVP_CIPHER_CTX_free(ctx->cipher_ctx);
EVP_MAC_CTX_free(ctx->mac_ctx_init);
EVP_MAC_free(ctx->mac);
ctx->mac = NULL;
ctx->cipher_ctx = NULL;
ctx->mac_ctx_init = NULL;
if (key == NULL || cbc == NULL || ctr == NULL)
return 0;
cbc_name = EVP_CIPHER_get0_name(cbc);
params[0] = OSSL_PARAM_construct_utf8_string(OSSL_MAC_PARAM_CIPHER,
(char *)cbc_name, 0);
params[1] = OSSL_PARAM_construct_octet_string(OSSL_MAC_PARAM_KEY,
(void *)key, klen);
params[2] = OSSL_PARAM_construct_end();
if ((ctx->cipher_ctx = EVP_CIPHER_CTX_new()) == NULL
|| (ctx->mac =
EVP_MAC_fetch(libctx, OSSL_MAC_NAME_CMAC, propq)) == NULL
|| (ctx->mac_ctx_init = EVP_MAC_CTX_new(ctx->mac)) == NULL
|| !EVP_MAC_CTX_set_params(ctx->mac_ctx_init, params)
|| !EVP_EncryptInit_ex(ctx->cipher_ctx, ctr, NULL, key + klen, NULL)
|| (mac_ctx = EVP_MAC_CTX_dup(ctx->mac_ctx_init)) == NULL
|| !EVP_MAC_update(mac_ctx, zero, sizeof(zero))
|| !EVP_MAC_final(mac_ctx, ctx->d.byte, &out_len,
sizeof(ctx->d.byte))) {
EVP_CIPHER_CTX_free(ctx->cipher_ctx);
EVP_MAC_CTX_free(ctx->mac_ctx_init);
EVP_MAC_CTX_free(mac_ctx);
EVP_MAC_free(ctx->mac);
return 0;
}
EVP_MAC_CTX_free(mac_ctx);
ctx->final_ret = -1;
ctx->crypto_ok = 1;
return 1;
}
/*
* Copy an SIV128_CONTEXT object
*/
int ossl_siv128_copy_ctx(SIV128_CONTEXT *dest, SIV128_CONTEXT *src)
{
memcpy(&dest->d, &src->d, sizeof(src->d));
if (dest->cipher_ctx == NULL) {
dest->cipher_ctx = EVP_CIPHER_CTX_new();
if (dest->cipher_ctx == NULL)
return 0;
}
if (!EVP_CIPHER_CTX_copy(dest->cipher_ctx, src->cipher_ctx))
return 0;
EVP_MAC_CTX_free(dest->mac_ctx_init);
dest->mac_ctx_init = EVP_MAC_CTX_dup(src->mac_ctx_init);
if (dest->mac_ctx_init == NULL)
return 0;
dest->mac = src->mac;
if (dest->mac != NULL)
EVP_MAC_up_ref(dest->mac);
return 1;
}
/*
* Provide any AAD. This can be called multiple times.
* Per RFC 5297, the last piece of associated data
* is the nonce, but it is not treated specially here
*/
int ossl_siv128_aad(SIV128_CONTEXT *ctx, const unsigned char *aad,
size_t len)
{
SIV_BLOCK mac_out;
size_t out_len = SIV_LEN;
EVP_MAC_CTX *mac_ctx;
siv128_dbl(&ctx->d);
if ((mac_ctx = EVP_MAC_CTX_dup(ctx->mac_ctx_init)) == NULL
|| !EVP_MAC_update(mac_ctx, aad, len)
|| !EVP_MAC_final(mac_ctx, mac_out.byte, &out_len,
sizeof(mac_out.byte))
|| out_len != SIV_LEN) {
EVP_MAC_CTX_free(mac_ctx);
return 0;
}
EVP_MAC_CTX_free(mac_ctx);
siv128_xorblock(&ctx->d, &mac_out);
return 1;
}
/*
* Provide any data to be encrypted. This can be called once.
*/
int ossl_siv128_encrypt(SIV128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
SIV_BLOCK q;
/* can only do one crypto operation */
if (ctx->crypto_ok == 0)
return 0;
ctx->crypto_ok--;
if (!siv128_do_s2v_p(ctx, &q, in, len))
return 0;
memcpy(ctx->tag.byte, &q, SIV_LEN);
q.byte[8] &= 0x7f;
q.byte[12] &= 0x7f;
if (!siv128_do_encrypt(ctx->cipher_ctx, out, in, len, &q))
return 0;
ctx->final_ret = 0;
return len;
}
/*
* Provide any data to be decrypted. This can be called once.
*/
int ossl_siv128_decrypt(SIV128_CONTEXT *ctx,
const unsigned char *in, unsigned char *out,
size_t len)
{
unsigned char* p;
SIV_BLOCK t, q;
int i;
/* can only do one crypto operation */
if (ctx->crypto_ok == 0)
return 0;
ctx->crypto_ok--;
memcpy(&q, ctx->tag.byte, SIV_LEN);
q.byte[8] &= 0x7f;
q.byte[12] &= 0x7f;
if (!siv128_do_encrypt(ctx->cipher_ctx, out, in, len, &q)
|| !siv128_do_s2v_p(ctx, &t, out, len))
return 0;
p = ctx->tag.byte;
for (i = 0; i < SIV_LEN; i++)
t.byte[i] ^= p[i];
if ((t.word[0] | t.word[1]) != 0) {
OPENSSL_cleanse(out, len);
return 0;
}
ctx->final_ret = 0;
return len;
}
/*
* Return the already calculated final result.
*/
int ossl_siv128_finish(SIV128_CONTEXT *ctx)
{
return ctx->final_ret;
}
/*
* Set the tag
*/
int ossl_siv128_set_tag(SIV128_CONTEXT *ctx, const unsigned char *tag, size_t len)
{
if (len != SIV_LEN)
return 0;
/* Copy the tag from the supplied buffer */
memcpy(ctx->tag.byte, tag, len);
return 1;
}
/*
* Retrieve the calculated tag
*/
int ossl_siv128_get_tag(SIV128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
if (len != SIV_LEN)
return 0;
/* Copy the tag into the supplied buffer */
memcpy(tag, ctx->tag.byte, len);
return 1;
}
/*
* Release all resources
*/
int ossl_siv128_cleanup(SIV128_CONTEXT *ctx)
{
if (ctx != NULL) {
EVP_CIPHER_CTX_free(ctx->cipher_ctx);
ctx->cipher_ctx = NULL;
EVP_MAC_CTX_free(ctx->mac_ctx_init);
ctx->mac_ctx_init = NULL;
EVP_MAC_free(ctx->mac);
ctx->mac = NULL;
OPENSSL_cleanse(&ctx->d, sizeof(ctx->d));
OPENSSL_cleanse(&ctx->tag, sizeof(ctx->tag));
ctx->final_ret = -1;
ctx->crypto_ok = 1;
}
return 1;
}
int ossl_siv128_speed(SIV128_CONTEXT *ctx, int arg)
{
ctx->crypto_ok = (arg == 1) ? -1 : 1;
return 1;
}
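/*
 * Hedged usage sketch (illustrative only, guarded out): AES-128-SIV with a
 * 32-byte key split as S2V/CMAC key || CTR key, mirroring how
 * ossl_siv128_init() consumes "key" and "key + klen". The fetchable cipher
 * names, the OPENSSL_free() of the context and the assumption that ptlen > 0
 * are all part of the sketch, not of this file; error handling is compressed.
 */
#if 0
static int siv_usage_sketch(OSSL_LIB_CTX *libctx, const unsigned char key[32],
                            const unsigned char *aad, size_t aadlen,
                            const unsigned char *pt, size_t ptlen,
                            unsigned char *ct, unsigned char tag[SIV_LEN])
{
    EVP_CIPHER *cbc = EVP_CIPHER_fetch(libctx, "AES-128-CBC", NULL);
    EVP_CIPHER *ctr = EVP_CIPHER_fetch(libctx, "AES-128-CTR", NULL);
    SIV128_CONTEXT *sctx = NULL;
    int ok = 0;

    if (cbc != NULL && ctr != NULL
            && (sctx = ossl_siv128_new(key, 16, cbc, ctr, libctx, NULL)) != NULL
            && ossl_siv128_aad(sctx, aad, aadlen)
            && ossl_siv128_encrypt(sctx, pt, ct, ptlen) > 0
            && ossl_siv128_get_tag(sctx, tag, SIV_LEN))
        ok = 1;
    if (sctx != NULL) {
        ossl_siv128_cleanup(sctx);
        OPENSSL_free(sctx);
    }
    EVP_CIPHER_free(cbc);
    EVP_CIPHER_free(ctr);
    return ok;
}
#endif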
#endif /* OPENSSL_NO_SIV */

View File

@@ -0,0 +1,331 @@
/*
* Copyright 2013-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/** Beware!
*
* The following wrapping modes were designed for AES, but this implementation
* allows you to use them for any 128 bit block cipher.
*/
#include "internal/cryptlib.h"
#include <openssl/modes.h>
/** RFC 3394 section 2.2.3.1 Default Initial Value */
static const unsigned char default_iv[] = {
0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
};
/** RFC 5649 section 3 Alternative Initial Value 32-bit constant */
static const unsigned char default_aiv[] = {
0xA6, 0x59, 0x59, 0xA6
};
/** Input size limit: lower than maximum of standards but far larger than
* anything that will be used in practice.
*/
#define CRYPTO128_WRAP_MAX (1UL << 31)
/** Wrapping according to RFC 3394 section 2.2.1.
*
* @param[in] key Key value.
* @param[in] iv IV value. Length = 8 bytes. NULL = use default_iv.
* @param[in] in Plaintext as n 64-bit blocks, n >= 2.
* @param[in] inlen Length of in.
* @param[out] out Ciphertext. Minimal buffer length = (inlen + 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] block Block processing function.
* @return 0 if inlen does not consist of n 64-bit blocks, n >= 2,
* or if inlen > CRYPTO128_WRAP_MAX.
* Output length if wrapping succeeded.
*/
size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
unsigned char *A, B[16], *R;
size_t i, j, t;
if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
return 0;
A = B;
t = 1;
memmove(out + 8, in, inlen);
if (!iv)
iv = default_iv;
memcpy(A, iv, 8);
for (j = 0; j < 6; j++) {
R = out + 8;
for (i = 0; i < inlen; i += 8, t++, R += 8) {
memcpy(B + 8, R, 8);
block(B, B, key);
A[7] ^= (unsigned char)(t & 0xff);
if (t > 0xff) {
A[6] ^= (unsigned char)((t >> 8) & 0xff);
A[5] ^= (unsigned char)((t >> 16) & 0xff);
A[4] ^= (unsigned char)((t >> 24) & 0xff);
}
memcpy(R, B + 8, 8);
}
}
memcpy(out, A, 8);
return inlen + 8;
}
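/*
 * Hedged usage sketch (illustrative only, guarded out): wrapping a 32-byte
 * content-encryption key under an AES-256 key-encryption key, per RFC 3394.
 * The AES names are assumptions drawn from the public AES API; the output is
 * inlen + 8 = 40 bytes and the default IV A6A6A6A6A6A6A6A6 is used.
 */
#if 0
# include <openssl/aes.h>
static size_t wrap_usage_sketch(const unsigned char kek[32],
                                const unsigned char cek[32],
                                unsigned char out[40])
{
    AES_KEY aeskey;

    AES_set_encrypt_key(kek, 256, &aeskey);
    return CRYPTO_128_wrap(&aeskey, NULL, out, cek, 32,
                           (block128_f)AES_encrypt);
}
#endif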
/** Unwrapping according to RFC 3394 section 2.2.2 steps 1-2.
* The IV check (step 3) is responsibility of the caller.
*
* @param[in] key Key value.
* @param[out] iv Unchecked IV value. Minimal buffer length = 8 bytes.
* @param[out] out Plaintext without IV.
* Minimal buffer length = (inlen - 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Ciphertext as n 64-bit blocks.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
* or if inlen is not a multiple of 8.
* Output length otherwise.
*/
static size_t crypto_128_unwrap_raw(void *key, unsigned char *iv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
unsigned char *A, B[16], *R;
size_t i, j, t;
inlen -= 8;
if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
return 0;
A = B;
t = 6 * (inlen >> 3);
memcpy(A, in, 8);
memmove(out, in + 8, inlen);
for (j = 0; j < 6; j++) {
R = out + inlen - 8;
for (i = 0; i < inlen; i += 8, t--, R -= 8) {
A[7] ^= (unsigned char)(t & 0xff);
if (t > 0xff) {
A[6] ^= (unsigned char)((t >> 8) & 0xff);
A[5] ^= (unsigned char)((t >> 16) & 0xff);
A[4] ^= (unsigned char)((t >> 24) & 0xff);
}
memcpy(B + 8, R, 8);
block(B, B, key);
memcpy(R, B + 8, 8);
}
}
memcpy(iv, A, 8);
return inlen;
}
/** Unwrapping according to RFC 3394 section 2.2.2, including the IV check.
* The first block of plaintext has to match the supplied IV, otherwise an
* error is returned.
*
* @param[in] key Key value.
* @param[out] iv IV value to match against. Length = 8 bytes.
* NULL = use default_iv.
* @param[out] out Plaintext without IV.
* Minimal buffer length = (inlen - 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Ciphertext as n 64-bit blocks.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
* or if inlen is not a multiple of 8
* or if IV doesn't match expected value.
* Output length otherwise.
*/
size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
unsigned char *out, const unsigned char *in,
size_t inlen, block128_f block)
{
size_t ret;
unsigned char got_iv[8];
ret = crypto_128_unwrap_raw(key, got_iv, out, in, inlen, block);
if (ret == 0)
return 0;
if (!iv)
iv = default_iv;
if (CRYPTO_memcmp(got_iv, iv, 8)) {
OPENSSL_cleanse(out, ret);
return 0;
}
return ret;
}
/** Wrapping according to RFC 5649 section 4.1.
*
* @param[in] key Key value.
* @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
* @param[out] out Ciphertext. Minimal buffer length = (inlen + 15) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Plaintext as n 64-bit blocks, n >= 2.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [1, CRYPTO128_WRAP_MAX].
* Output length if wrapping succeeded.
*/
size_t CRYPTO_128_wrap_pad(void *key, const unsigned char *icv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
/* n: number of 64-bit blocks in the padded key data
*
* If length of plain text is not a multiple of 8, pad the plain text octet
* string on the right with octets of zeros, where final length is the
* smallest multiple of 8 that is greater than length of plain text.
* If length of plain text is a multiple of 8, then there is no padding. */
const size_t blocks_padded = (inlen + 7) / 8; /* CEILING(m/8) */
const size_t padded_len = blocks_padded * 8;
const size_t padding_len = padded_len - inlen;
/* RFC 5649 section 3: Alternative Initial Value */
unsigned char aiv[8];
int ret;
/* Section 1: use 32-bit fixed field for plaintext octet length */
if (inlen == 0 || inlen >= CRYPTO128_WRAP_MAX)
return 0;
/* Section 3: Alternative Initial Value */
if (!icv)
memcpy(aiv, default_aiv, 4);
else
memcpy(aiv, icv, 4); /* Standard doesn't mention this. */
aiv[4] = (inlen >> 24) & 0xFF;
aiv[5] = (inlen >> 16) & 0xFF;
aiv[6] = (inlen >> 8) & 0xFF;
aiv[7] = inlen & 0xFF;
if (padded_len == 8) {
/*
* Section 4.1 - special case in step 2: If the padded plaintext
* contains exactly eight octets, then prepend the AIV and encrypt
* the resulting 128-bit block using AES in ECB mode.
*/
memmove(out + 8, in, inlen);
memcpy(out, aiv, 8);
memset(out + 8 + inlen, 0, padding_len);
block(out, out, key);
ret = 16; /* AIV + padded input */
} else {
memmove(out, in, inlen);
memset(out + inlen, 0, padding_len); /* Section 4.1 step 1 */
ret = CRYPTO_128_wrap(key, aiv, out, out, padded_len, block);
}
return ret;
}
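/*
 * Illustrative worked example (not in the original): for inlen = 20,
 * blocks_padded = CEILING(20/8) = 3, padded_len = 24 and padding_len = 4, so
 * the plaintext is padded with four zero octets and the AIV becomes
 * A6 59 59 A6 00 00 00 14 (0x14 = 20, the message length indicator). The
 * inner CRYPTO_128_wrap() call then returns 24 + 8 = 32 bytes.
 */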
/** Unwrapping according to RFC 5649 section 4.2.
*
* @param[in] key Key value.
* @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
* @param[out] out Plaintext. Minimal buffer length = (inlen - 8) bytes.
* Input and output buffers can overlap if block function
* supports that.
* @param[in] in Ciphertext as n 64-bit blocks.
* @param[in] inlen Length of in.
* @param[in] block Block processing function.
* @return 0 if inlen is out of range [16, CRYPTO128_WRAP_MAX],
* or if inlen is not a multiple of 8
* or if the IV and message length indicator don't match.
* Output length if unwrapping succeeded and IV matches.
*/
size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv,
unsigned char *out,
const unsigned char *in, size_t inlen,
block128_f block)
{
/* n: number of 64-bit blocks in the padded key data */
size_t n = inlen / 8 - 1;
size_t padded_len;
size_t padding_len;
size_t ptext_len;
/* RFC 5649 section 3: Alternative Initial Value */
unsigned char aiv[8];
static unsigned char zeros[8] = { 0x0 };
size_t ret;
/* Section 4.2: Ciphertext length has to be (n+1) 64-bit blocks. */
if ((inlen & 0x7) != 0 || inlen < 16 || inlen >= CRYPTO128_WRAP_MAX)
return 0;
if (inlen == 16) {
/*
* Section 4.2 - special case in step 1: When n=1, the ciphertext
* contains exactly two 64-bit blocks and they are decrypted as a
* single AES block using AES in ECB mode: AIV | P[1] = DEC(K, C[0] |
* C[1])
*/
unsigned char buff[16];
block(in, buff, key);
memcpy(aiv, buff, 8);
/* Remove AIV */
memcpy(out, buff + 8, 8);
padded_len = 8;
OPENSSL_cleanse(buff, inlen);
} else {
padded_len = inlen - 8;
ret = crypto_128_unwrap_raw(key, aiv, out, in, inlen, block);
if (padded_len != ret) {
OPENSSL_cleanse(out, inlen);
return 0;
}
}
/*
* Section 3: AIV checks: Check that MSB(32,A) = A65959A6. Optionally a
* user-supplied value can be used (even if standard doesn't mention
* this).
*/
if ((!icv && CRYPTO_memcmp(aiv, default_aiv, 4))
|| (icv && CRYPTO_memcmp(aiv, icv, 4))) {
OPENSSL_cleanse(out, inlen);
return 0;
}
/*
* Check that 8*(n-1) < LSB(32,AIV) <= 8*n. If so, let ptext_len =
* LSB(32,AIV).
*/
ptext_len = ((unsigned int)aiv[4] << 24)
| ((unsigned int)aiv[5] << 16)
| ((unsigned int)aiv[6] << 8)
| (unsigned int)aiv[7];
if (8 * (n - 1) >= ptext_len || ptext_len > 8 * n) {
OPENSSL_cleanse(out, inlen);
return 0;
}
/*
* Check that the rightmost padding_len octets of the output data are
* zero.
*/
padding_len = padded_len - ptext_len;
if (CRYPTO_memcmp(out + ptext_len, zeros, padding_len) != 0) {
OPENSSL_cleanse(out, inlen);
return 0;
}
/* Section 4.2 step 3: Remove padding */
return ptext_len;
}

View File

@@ -0,0 +1,161 @@
/*
* Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#ifndef STRICT_ALIGNMENT
# ifdef __GNUC__
typedef u64 u64_a1 __attribute((__aligned__(1)));
# else
typedef u64 u64_a1;
# endif
#endif
int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
const unsigned char iv[16],
const unsigned char *inp, unsigned char *out,
size_t len, int enc)
{
DECLARE_IS_ENDIAN;
union {
u64 u[2];
u32 d[4];
u8 c[16];
} tweak, scratch;
unsigned int i;
if (len < 16)
return -1;
memcpy(tweak.c, iv, 16);
(*ctx->block2) (tweak.c, tweak.c, ctx->key2);
if (!enc && (len % 16))
len -= 16;
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
#endif
inp += 16;
out += 16;
len -= 16;
if (len == 0)
return 0;
if (IS_LITTLE_ENDIAN) {
unsigned int carry, res;
res = 0x87 & (((int)tweak.d[3]) >> 31);
carry = (unsigned int)(tweak.u[0] >> 63);
tweak.u[0] = (tweak.u[0] << 1) ^ res;
tweak.u[1] = (tweak.u[1] << 1) | carry;
} else {
size_t c;
for (c = 0, i = 0; i < 16; ++i) {
/*
* + substitutes for |, because c is 1 bit
*/
c += ((size_t)tweak.c[i]) << 1;
tweak.c[i] = (u8)c;
c = c >> 8;
}
tweak.c[0] ^= (u8)(0x87 & (0 - c));
}
}
if (enc) {
for (i = 0; i < len; ++i) {
u8 c = inp[i];
out[i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out - 16, scratch.c, 16);
} else {
union {
u64 u[2];
u8 c[16];
} tweak1;
if (IS_LITTLE_ENDIAN) {
unsigned int carry, res;
res = 0x87 & (((int)tweak.d[3]) >> 31);
carry = (unsigned int)(tweak.u[0] >> 63);
tweak1.u[0] = (tweak.u[0] << 1) ^ res;
tweak1.u[1] = (tweak.u[1] << 1) | carry;
} else {
size_t c;
for (c = 0, i = 0; i < 16; ++i) {
/*
* + substitutes for |, because c is 1 bit
*/
c += ((size_t)tweak.c[i]) << 1;
tweak1.c[i] = (u8)c;
c = c >> 8;
}
tweak1.c[0] ^= (u8)(0x87 & (0 - c));
}
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
for (i = 0; i < len; ++i) {
u8 c = inp[16 + i];
out[16 + i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1];
#endif
}
return 0;
}
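/*
 * Hedged sketch (not part of the original file, guarded out): the tweak
 * update inside the loop above is multiplication by x in GF(2^128) in the
 * XTS convention (byte 0 least significant), i.e. a 128-bit left shift with
 * conditional reduction by 0x87. A byte-wise equivalent for illustration:
 */
#if 0
static void xts_tweak_times_x(unsigned char t[16])
{
    unsigned int carry = 0, bit, i;

    for (i = 0; i < 16; i++) {
        bit = (t[i] >> 7) & 1;              /* bit shifted out of this byte */
        t[i] = (unsigned char)((t[i] << 1) | carry);
        carry = bit;
    }
    if (carry)
        t[0] ^= 0x87;       /* reduce by x^128 + x^7 + x^2 + x + 1 */
}
#endif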

View File

@@ -0,0 +1,199 @@
/*
* Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the Apache License 2.0 (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <string.h>
#include <openssl/crypto.h>
#include "internal/endian.h"
#include "crypto/modes.h"
#ifndef STRICT_ALIGNMENT
# ifdef __GNUC__
typedef u64 u64_a1 __attribute((__aligned__(1)));
# else
typedef u64 u64_a1;
# endif
#endif
int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
const unsigned char iv[16],
const unsigned char *inp, unsigned char *out,
size_t len, int enc)
{
DECLARE_IS_ENDIAN;
union {
u64 u[2];
u32 d[4];
u8 c[16];
} tweak, scratch;
unsigned int i;
if (len < 16)
return -1;
memcpy(tweak.c, iv, 16);
(*ctx->block2) (tweak.c, tweak.c, ctx->key2);
if (!enc && (len % 16))
len -= 16;
while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
#endif
inp += 16;
out += 16;
len -= 16;
if (len == 0)
return 0;
if (IS_LITTLE_ENDIAN) {
u8 res;
u64 hi, lo;
#ifdef BSWAP8
hi = BSWAP8(tweak.u[0]);
lo = BSWAP8(tweak.u[1]);
#else
u8 *p = tweak.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
res = (u8)lo & 1;
tweak.u[0] = (lo >> 1) | (hi << 63);
tweak.u[1] = hi >> 1;
if (res)
tweak.c[15] ^= 0xe1;
#ifdef BSWAP8
hi = BSWAP8(tweak.u[0]);
lo = BSWAP8(tweak.u[1]);
#else
p = tweak.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
tweak.u[0] = lo;
tweak.u[1] = hi;
} else {
u8 carry, res;
carry = 0;
for (i = 0; i < 16; ++i) {
res = (tweak.c[i] << 7) & 0x80;
tweak.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
carry = res;
}
if (res)
tweak.c[0] ^= 0xe1;
}
}
if (enc) {
for (i = 0; i < len; ++i) {
u8 c = inp[i];
out[i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out - 16, scratch.c, 16);
} else {
union {
u64 u[2];
u8 c[16];
} tweak1;
if (IS_LITTLE_ENDIAN) {
u8 res;
u64 hi, lo;
#ifdef BSWAP8
hi = BSWAP8(tweak.u[0]);
lo = BSWAP8(tweak.u[1]);
#else
u8 *p = tweak.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
res = (u8)lo & 1;
tweak1.u[0] = (lo >> 1) | (hi << 63);
tweak1.u[1] = hi >> 1;
if (res)
tweak1.c[15] ^= 0xe1;
#ifdef BSWAP8
hi = BSWAP8(tweak1.u[0]);
lo = BSWAP8(tweak1.u[1]);
#else
p = tweak1.c;
hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
tweak1.u[0] = lo;
tweak1.u[1] = hi;
} else {
u8 carry, res;
carry = 0;
for (i = 0; i < 16; ++i) {
res = (tweak.c[i] << 7) & 0x80;
tweak1.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
carry = res;
}
if (res)
tweak1.c[0] ^= 0xe1;
}
#if defined(STRICT_ALIGNMENT)
memcpy(scratch.c, inp, 16);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
#else
scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0];
scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1];
#endif
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
scratch.u[0] ^= tweak1.u[0];
scratch.u[1] ^= tweak1.u[1];
for (i = 0; i < len; ++i) {
u8 c = inp[16 + i];
out[16 + i] = scratch.c[i];
scratch.c[i] = c;
}
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
(*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
scratch.u[0] ^= tweak.u[0];
scratch.u[1] ^= tweak.u[1];
memcpy(out, scratch.c, 16);
#else
((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0];
((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1];
#endif
}
return 0;
}
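/*
 * Illustrative note (not in the original): unlike CRYPTO_xts128_encrypt(),
 * this GB variant updates the tweak in the bit-reflected convention also
 * used by GHASH - the 128-bit tweak is shifted right by one bit and, when a
 * bit drops out, reduced with 0xe1 in the leading byte rather than 0x87 in
 * the trailing one. It is still multiplication by x in GF(2^128), just with
 * the opposite bit ordering.
 */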