/*
 * Written by Solar Designer <solar at openwall.com> in 1998-2010.
 * No copyright is claimed, and the software is hereby placed in the public
 * domain.  In case this attempt to disclaim copyright and place the software
 * in the public domain is deemed null and void, then the software is
 * Copyright (c) 1998-2010 Solar Designer and it is hereby released to the
 * general public under the following terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * There's ABSOLUTELY NO WARRANTY, express or implied.
 *
 * See crypt_blowfish.c for more information.
 */

/*
 * Everything from here down to the matching end-of-guard directive near the
 * bottom of the file is assembled only for 32-bit x86 targets.
 *
 * NOTE(review): the double-underscore spellings __i386__, __CYGWIN32__ and
 * __MINGW32__ below are the usual compiler-predefined names; confirm them
 * against the upstream copy of this file.
 */
#ifdef __i386__

/*
 * Per-platform conventions:
 *   UNDERSCORES - the exported symbol gets one additional leading underscore.
 *   ALIGN_LOG   - the .align argument is passed through as a log2 value
 *                 instead of being converted to a byte count.
 */
#if defined(__OpenBSD__) && !defined(__ELF__)
#define UNDERSCORES
#define ALIGN_LOG
#endif

#if defined(__CYGWIN32__) || defined(__MINGW32__)
#define UNDERSCORES
#endif

#ifdef __DJGPP__
#define UNDERSCORES
#define ALIGN_LOG
#endif

#ifdef UNDERSCORES
#define _BF_body_r			__BF_body_r
#endif

/*
 * DO_ALIGN(log): emit an .align directive for a 2^log byte boundary.
 * With ALIGN_LOG the log value is handed to .align unchanged; otherwise it
 * is turned into a byte count with a shift.  The DUMBAS variant produces the
 * same expression without parentheses, presumably for assemblers with
 * limited expression parsing.
 */
#ifdef ALIGN_LOG
#define DO_ALIGN(log)			.align (log)
#elif defined(DUMBAS)
#define DO_ALIGN(log)			.align 1 << log
#else
#define DO_ALIGN(log)			.align (1 << (log))
#endif

/*
 * Addressing of the working data.  All operands are expressed relative to
 * ctx, which is the stack pointer itself.
 *
 *   BF_FRAME - byte distance from ctx up to the start of the table area.
 *   BF_ptr   - the dword stored at (ctx); used as a running store pointer.
 *   S(N, r)  - the dword at table area + N + 4*r, i.e. entry r of the table
 *              that starts N bytes into the table area.
 *   P(N)     - dword number N of the array located 0x1000 bytes into the
 *              table area.  The DUMBAS form spells 4*N as N+N+N+N,
 *              presumably for assemblers without a multiply operator.
 *
 * Neither S() nor P() parenthesizes its argument, so pass simple constants
 * or registers only.
 */
#define BF_FRAME			0x200
#define ctx				%esp

#define BF_ptr				(ctx)

#define S(N, r)				N+BF_FRAME(ctx,r,4)
#ifdef DUMBAS
#define P(N)				0x1000+N+N+N+N+BF_FRAME(ctx)
#else
#define P(N)				0x1000+4*N+BF_FRAME(ctx)
#endif

/*
 * This version of the assembly code is optimized primarily for the original
 * Intel Pentium but is also careful to avoid partial register stalls on the
 * Pentium Pro family of processors (tested up to Pentium III Coppermine).
 *
 * It is possible to do 15% faster on the Pentium Pro family and probably on
 * many non-Intel x86 processors, but, unfortunately, that would make things
 * twice as slow for the original Pentium.
 *
 * An additional 2% speedup may be achieved with non-reentrant code.
 */

/*
 * Register aliases used by the round macros and the main routine.
 *
 *   L, R      - the two 32-bit halves of the block being processed.
 *   tmp1      - scratch accumulator; tmp1_lo is its low byte.
 *   tmp2      - holds the next P-array word between rounds; tmp2_hi is
 *               bits 15..8 of it.
 *   tmp3      - byte index register; only tmp3_lo is ever written by the
 *               round code, so the upper 24 bits must be kept at zero.
 *   tmp4      - scratch copy of a block half; tmp4_hi is bits 15..8 of it.
 *   tmp5      - scratch for table loads, and the store pointer in the loops.
 */
#define L				%esi
#define R				%edi
#define tmp1				%eax
#define tmp1_lo				%al
#define tmp2				%ecx
#define tmp2_hi				%ch
#define tmp3				%edx
#define tmp3_lo				%dl
#define tmp4				%ebx
#define tmp4_hi				%bh
#define tmp5				%ebp

.text

/*
 * BF_ROUND(L, R, N): one round on the register pair (L, R) using P-array
 * word N.  The parameter names L and R shadow the register aliases of the
 * same name inside this macro body.
 *
 * On entry:  tmp2 = P[N]; upper 24 bits of tmp3 are zero.
 * Effect:    L ^= P[N]
 *            R ^= ((S0[L>>24] + S1[(L>>16)&0xFF]) ^ S2[(L>>8)&0xFF])
 *                 + S3[L&0xFF]
 *            where S0..S3 are the tables at offsets 0, 0x400, 0x800 and
 *            0xC00 of the table area.
 * On exit:   tmp2 = P[N+1], ready for the next round.
 * Clobbers:  tmp1, tmp4, tmp5, the low byte of tmp3, and the flags.
 *
 * The instruction order is kept exactly as is; it is presumably arranged
 * for the Pentium scheduling goals described in the note earlier in this
 * file, so do not reorder without measuring.
 */
#define BF_ROUND(L, R, N) \
	xorl L,tmp2; \
	xorl tmp1,tmp1; \
	movl tmp2,L; \
	shrl $16,tmp2; \
	movl L,tmp4; \
	movb tmp2_hi,tmp1_lo; \
	andl $0xFF,tmp2; \
	movb tmp4_hi,tmp3_lo; \
	andl $0xFF,tmp4; \
	movl S(0,tmp1),tmp1; \
	movl S(0x400,tmp2),tmp5; \
	addl tmp5,tmp1; \
	movl S(0x800,tmp3),tmp5; \
	xorl tmp5,tmp1; \
	movl S(0xC00,tmp4),tmp5; \
	addl tmp1,tmp5; \
	movl 4+P(N),tmp2; \
	xorl tmp5,R

/*
 * BF_ENCRYPT_START: run the sixteen rounds on the block held in L/R,
 * alternating which half plays which role, then begin the final step.
 *
 * On entry:  tmp2 = P[0]; upper 24 bits of tmp3 are zero.
 * On exit:   tmp5 = BF_ptr (the saved store pointer)
 *            tmp2 = L ^ P[16]
 *            L    = P[17]
 *            R    = the other half after round 15, still to be combined.
 *
 * The macro is split from BF_ENCRYPT_END so that callers can place an
 * unrelated instruction (such as a pointer increment) between the two.
 */
#define BF_ENCRYPT_START \
	BF_ROUND(L, R, 0); \
	BF_ROUND(R, L, 1); \
	BF_ROUND(L, R, 2); \
	BF_ROUND(R, L, 3); \
	BF_ROUND(L, R, 4); \
	BF_ROUND(R, L, 5); \
	BF_ROUND(L, R, 6); \
	BF_ROUND(R, L, 7); \
	BF_ROUND(L, R, 8); \
	BF_ROUND(R, L, 9); \
	BF_ROUND(L, R, 10); \
	BF_ROUND(R, L, 11); \
	BF_ROUND(L, R, 12); \
	BF_ROUND(R, L, 13); \
	BF_ROUND(L, R, 14); \
	BF_ROUND(R, L, 15); \
	movl BF_ptr,tmp5; \
	xorl L,tmp2; \
	movl P(17),L

/*
 * BF_ENCRYPT_END: complete the step started by BF_ENCRYPT_START.
 *
 * On entry:  L = P[17], tmp2 = (previous L) ^ P[16], R = remaining half.
 * On exit:   L = R ^ P[17], R = tmp2; i.e. the halves are swapped and the
 *            last two P-array words have been applied.
 * tmp2 is left unchanged and must be reloaded with P[0] before the next
 * BF_ENCRYPT_START.
 */
#define BF_ENCRYPT_END \
	xorl R,L; \
	movl tmp2,R

/*
 * _BF_body_r
 *
 * In:     4(%esp) on entry = address of the table area: four tables of
 *         0x400 bytes each, followed at offset 0x1000 by an array of
 *         eighteen dwords (presumably the Blowfish S-boxes and P-array;
 *         see crypt_blowfish.c for the caller's view).
 * Out:    every dword of the array and of the tables is overwritten.
 *         Starting from an all-zero block, the block is encrypted
 *         repeatedly and each result is stored into the next two dwords,
 *         first through the array and then through the tables.
 * Saves:  %ebp, %ebx, %esi, %edi (pushed on entry, popped before ret).
 * Leaves modified: %eax, %ecx, %edx and the flags.  No return value is set.
 *
 * Stack:  %esp is temporarily moved to (argument - BF_FRAME) so that all
 *         data can be addressed off ctx.  The original %esp is kept at
 *         4(ctx) and BF_ptr at 0(ctx).  If the relocated stack pointer
 *         would end up above the current one, control goes to BF_die.
 */
DO_ALIGN(5)
.globl _BF_body_r
_BF_body_r:
	movl 4(%esp),%eax		/* eax = table area address (argument) */
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	subl $BF_FRAME-8,%eax		/* new %esp, before the two pushes below */
	xorl L,L			/* block starts as all zeroes */
	cmpl %esp,%eax
	ja BF_die			/* new %esp above current one (unsigned) */
	xchgl %eax,%esp			/* switch stacks; eax = original %esp */
	xorl R,R
	pushl %eax			/* original %esp, ends up at 4(ctx) */
	/* The -4 compensates for the one push still to come. */
	leal 0x1000+BF_FRAME-4(ctx),%eax	/* eax = address of P[0] */
	movl 0x1000+BF_FRAME-4(ctx),tmp2	/* tmp2 = P[0] */
	pushl %eax			/* BF_ptr = address of P[0] */
	xorl tmp3,tmp3			/* round code needs upper bits zero */

/*
 * Replace the eighteen array dwords, two per encryption (nine passes).
 * movl does not alter the flags, so the cmpl result survives to the ja.
 * P[0] is reloaded only after the stores because the first pass rewrites it.
 */
BF_loop_P:
	BF_ENCRYPT_START
	addl $8,tmp5			/* advance store pointer */
	BF_ENCRYPT_END
	leal 0x1000+18*4+BF_FRAME(ctx),tmp1	/* tmp1 = end of the array */
	movl tmp5,BF_ptr
	cmpl tmp5,tmp1
	movl L,-8(tmp5)
	movl R,-4(tmp5)
	movl P(0),tmp2
	ja BF_loop_P			/* loop while end > pointer */
	leal BF_FRAME(ctx),tmp5		/* tmp5 = start of the table area */
	xorl tmp3,tmp3
	movl tmp5,BF_ptr

/*
 * Replace the 0x1000 bytes of tables, four encryptions (32 bytes) per pass,
 * i.e. 128 passes.  BF_ENCRYPT_START reloads tmp5 from BF_ptr each time, so
 * the first three stores use fixed offsets from the unchanged pointer.
 */
BF_loop_S:
	BF_ENCRYPT_START
	BF_ENCRYPT_END
	movl P(0),tmp2
	movl L,(tmp5)
	movl R,4(tmp5)
	BF_ENCRYPT_START
	BF_ENCRYPT_END
	movl P(0),tmp2
	movl L,8(tmp5)
	movl R,12(tmp5)
	BF_ENCRYPT_START
	BF_ENCRYPT_END
	movl P(0),tmp2
	movl L,16(tmp5)
	movl R,20(tmp5)
	BF_ENCRYPT_START
	addl $32,tmp5			/* advance store pointer past this pass */
	BF_ENCRYPT_END
	leal 0x1000+BF_FRAME(ctx),tmp1	/* tmp1 = end of the tables */
	movl tmp5,BF_ptr
	cmpl tmp5,tmp1
	movl P(0),tmp2
	movl L,-8(tmp5)
	movl R,-4(tmp5)
	ja BF_loop_S			/* loop while end > pointer */
	movl 4(%esp),%esp		/* switch back to the original stack */
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret

/*
 * BF_die: reached from the entry check of the routine above when the
 * relocated stack pointer would lie above the current one.
 * hlt is a privileged instruction, so in unprivileged code it is expected
 * to fault; the jmp keeps execution here should it ever continue.
 */
BF_die:
/* Oops, need to re-compile with a larger BF_FRAME. */
	hlt
	jmp BF_die

/* Closes the 32-bit x86 guard opened near the top of the file. */
#endif

/*
 * On Linux ELF targets, emit an empty .note.GNU-stack section so that the
 * GNU toolchain does not treat this object as requiring an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif