#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.

# Directory part of this script's path, including the trailing separator.
# It is prepended to the relative opensslconf.h path further down.  Empty
# when $0 carries no directory, so a stale $1 is never picked up.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> [<output file>]
$flavour = shift;
$output  = shift;

# Send everything printed later to the requested file.  Without an output
# argument STDOUT is left untouched; a file that cannot be created is a
# hard error instead of a silently misplaced result.
if (defined($output) && $output ne "") {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}

# Per-ABI parameters for the assembly templates.  A flavour string that
# contains "64" selects the 2.0W settings with 8-byte slots; anything else
# (including no flavour at all) selects the 1.0 settings with 4-byte slots.
# $PUSH/$PUSHMA/$POP/$POPMB are the store/load mnemonics used by the RC4
# prologue and epilogue templates.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}

$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                [+ argument transfer]
# $SZ is the size in bytes of one key-schedule element used by the
# templates: 1 selects the byte (RC4_CHAR) layout, 4 the word layout.
$SZ=1;				# defaults to RC4_CHAR

# Take the element type from "#define RC4_INT <type>" in opensslconf.h,
# looked up relative to this script's directory.  A missing or unreadable
# header is not an error: the default above is kept.
if (open(my $conf, "<", "${dir}../../opensslconf.h")) {
    while (my $line = <$conf>) {
	if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
	    # Tolerate trailing blanks or a CR (CRLF checkouts) after the
	    # type name; otherwise "char\r" would be mistaken for a word type.
	    $SZ = ($1 =~ /char\s*$/) ? 1 : 4;
	    last;
	}
    }
    close $conf;
}

# Mnemonics used by the templates to access key-schedule elements:
#   $LD  - load at a fixed displacement
#   $LDX - indexed load
#   $MKX - compute an element address from index and base
#   $ST  - store
# Byte forms are used for 1-byte elements, word forms otherwise.
if ($SZ==1) {	# RC4_CHAR
    $LD="ldb";
    $LDX="ldbx";
    $MKX="addl";
    $ST="stb";
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
    $LD="ldw";
    $LDX="ldwx,s";
    $MKX="sh2addl";
    $ST="stw";
}

# Symbolic names for the machine registers used by the templates.

# Registers the RC4 template treats as its key, length, input and output
# operands (presumably the four incoming argument registers - confirm
# against the PA-RISC calling convention).
$key="%r26";
$len="%r25";
$inp="%r24";
$out="%r23";

# Two-element register pairs; unrolledloopbody() rotates them after every
# round so consecutive rounds alternate between the two registers.
@XX=("%r19","%r20");
@TX=("%r21","%r22");
$YY="%r28";
$TY="%r29";

# Scratch registers.  %r2..%r6 are the ones the RC4 prologue saves and the
# epilogue restores.
$acc="%r1";
$ix="%r2";
$iy="%r3";
$dat0="%r4";
$dat1="%r5";
$rem="%r6";
$mask="%r31";

# Append four copies of the round template to $code (the 4x unrolled loop
# body).  Takes no arguments and returns nothing useful; it works entirely
# through the file-level template variables.
#
# The round number is interpolated into the backtick expressions, which are
# evaluated later by the final substitution pass: the extra indexed load and
# the dep/zdep into $acc are emitted for rounds 1..3 only, and round 1 uses
# the "z" form.  After each round @TX and @XX are rotated, so the next copy
# uses the other register of each pair; after four rounds both arrays are
# back in their starting order.
sub unrolledloopbody {
    for my $i (0..3) {		# lexical: do not clobber a global $i
	$code.=<<___;
	ldo	1($XX[0]),$XX[1]
	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`
	and	$mask,$XX[1],$XX[1]
	$LDX	$YY($key),$TY
	$MKX	$YY,$key,$ix
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[0],$key,$iy
	$ST	$TX[0],0($ix)
	comclr,<> $XX[1],$YY,%r0	; conditional
	copy	$TX[0],$TX[1]		; move
	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
	$ST	$TY,0($iy)
	addl	$TX[0],$TY,$TY
	addl	$TX[1],$YY,$YY
	and	$mask,$TY,$TY
	and	$mask,$YY,$YY
___
	# "rotate" registers
	push(@TX,shift(@TX));
	push(@XX,shift(@XX));
    }
}

# Append one folded (not unrolled) loop to $code.
#
#   $label - assembler label placed at the head of the loop; it is also the
#            target of the closing addib branch
#   $count - register that the closing addib decrements by one per
#            iteration, looping while the result is non-zero
#
# Each iteration loads one byte via ldbx, XORs it with a value fetched from
# the key schedule and stores one byte via stb, advancing $out by one.  The
# final "and" follows the branch, i.e. it sits in its delay slot.
sub foldedloop {
    my ($label,$count)=@_;
    $code.=<<___;
$label
	$MKX	$YY,$key,$iy
	$LDX	$YY($key),$TY
	$MKX	$XX[0],$key,$ix
	$ST	$TX[0],0($iy)
	ldo	1($XX[0]),$XX[0]
	$ST	$TY,0($ix)
	addl	$TX[0],$TY,$TY
	ldbx	$inp($out),$dat1
	and	$mask,$TY,$TY
	and	$mask,$XX[0],$XX[0]
	$LDX	$TY($key),$acc
	$LDX	$XX[0]($key),$TX[0]
	ldo	1($out),$out
	xor	$dat1,$acc,$acc
	addl	$TX[0],$YY,$YY
	stb	$acc,-1($out)
	addib,<> -1,$count,$label	; $count is always small
	and	$mask,$YY,$YY
___
}

# Start of the generated assembly: section directives, the RC4 entry point
# and its prologue (saves %r2..%r6), an early exit to L$abort for a zero
# length, loading of the two state values stored in front of the table, and
# the first ("warm up") table lookup.  Lengths of 6 or less go straight to
# the byte loop at L$oop1; otherwise, unless $out is already 4-byte aligned,
# the folded loop emitted right after this template handles the bytes up to
# the next 4-byte boundary of $out.
#
# Labels start in column one and everything else is indented, which is how
# the assembler tells a label from an opcode.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*=	0,$len,L\$abort
	sub	$inp,$out,$inp		; distance between $inp and $out

	$LD	`0*$SZ`($key),$XX[0]
	$LD	`1*$SZ`($key),$YY
	ldo	`2*$SZ`($key),$key

	ldi	0xff,$mask
	ldi	3,$dat0

	ldo	1($XX[0]),$XX[0]	; warm up loop
	and	$mask,$XX[0],$XX[0]
	$LDX	$XX[0]($key),$TX[0]
	addl	$TX[0],$YY,$YY
	cmpib,*>>=	6,$len,L\$oop1	; is $len large enough to bother?
	and	$mask,$YY,$YY

	and,<>	$out,$dat0,$rem		; is $out aligned?
	b	L\$alignedout
	subi	4,$rem,$rem
	sub	$len,$rem,$len
___
foldedloop("L\$alignout",$rem);	# process till $out is aligned

# $out is 4-byte aligned from here on.  If $inp (held as a distance from
# $out) is aligned as well, branch to the plain word loop at L$oop4.
# Otherwise set up the shift amount for vshd, preload the first input word
# and fall into the misaligned-input loop, whose four unrolled rounds are
# appended right after this template.
$code.=<<___;
L\$alignedout				; $len is at least 4 here
	and,<>	$inp,$dat0,$acc		; is $inp aligned?
	b	L\$oop4
	sub	$inp,$acc,$rem		; align $inp

	sh3addl	$acc,%r0,$acc
	subi	32,$acc,$acc
	mtctl	$acc,%cr11		; load %sar with vshd align factor
	ldwx	$rem($out),$dat0
	ldo	4($rem),$rem
L\$oop4misalignedinp
___
unrolledloopbody();
# Tail of the misaligned-input loop: fetch the last of the four table
# values, merge it into $acc, build the input word from two neighbouring
# words with vshd, XOR and store one output word, then loop while more than
# 3 bytes remain.  Leftover bytes go to L$oop1, none to L$done.  The aligned
# word loop L$oop4 starts afterwards; its four unrolled rounds are appended
# right after this template.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$rem($out),$dat1
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	vshd	$dat0,$dat1,$iy		; align data
	copy	$dat1,$dat0
	xor	$iy,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<< 3,$len,L\$oop4misalignedinp
	ldo	4($out),$out
	cmpib,*= 0,$len,L\$done
	nop
	b	L\$oop1
	nop

	.ALIGN	8
L\$oop4
___
unrolledloopbody();
# Tail of the aligned word loop: fetch the last of the four table values,
# merge it into $acc, XOR with the input word and store one output word,
# then loop while more than 3 bytes remain.  With nothing left, jump to
# L$done; otherwise fall into the byte-at-a-time loop L$oop1 emitted right
# after this template, which runs $len more times.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$inp($out),$dat0
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	xor	$dat0,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<< 3,$len,L\$oop4
	ldo	4($out),$out
	cmpib,*= 0,$len,L\$done
	nop
___
foldedloop("L\$oop1",$len);
# RC4 epilogue.  L$done reloads the return pointer, undoes the look-ahead
# done by the loops ("chill out"), writes the two state values back in front
# of the table and restores %r4..%r6.  L$abort is the shared exit: it
# returns through %r2 and restores %r3 with the frame-popping load placed
# after the branch.
$code.=<<___;
L\$done
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
	ldo	-1($XX[0]),$XX[0]	; chill out loop
	sub	$YY,$TX[0],$YY
	and	$mask,$XX[0],$XX[0]
	and	$mask,$YY,$YY
	$ST	$XX[0],`-2*$SZ`($key)
	$ST	$YY,`-1*$SZ`($key)
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
___

# Templates for the two remaining entry points and the trailing strings.
#
# private_RC4_set_key zeroes the two state values in front of the table,
# fills the table with 0..255 (L$1st), then runs 256 swap steps driven by
# the key bytes (L$2nd).  %r23 is a negative index counting up towards the
# end of the key; when it wraps, the conditionally executed "sub" resets it
# so the key is reused from its start.
#
# RC4_options returns in %r28 the address of the L$opts string, computed
# relative to the address that blr leaves in %r28.
#
# Array elements are written as $XX[0]/$TX[1] rather than one-element
# slices; the interpolated text is the same.
$code.=<<___;

	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	8
private_RC4_set_key
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	$ST	%r0,`0*$SZ`($key)
	$ST	%r0,`1*$SZ`($key)
	ldo	`2*$SZ`($key),$key
	copy	%r0,$XX[0]
L\$1st
	$ST	$XX[0],0($key)
	ldo	1($XX[0]),$XX[0]
	bb,>=	$XX[0],`31-8`,L\$1st	; $XX[0]<256
	ldo	$SZ($key),$key

	ldo	`-256*$SZ`($key),$key	; rewind $key
	addl	$len,$inp,$inp		; $inp to point at the end
	sub	%r0,$len,%r23		; inverse index
	copy	%r0,$XX[0]
	copy	%r0,$XX[1]
	ldi	0xff,$mask

L\$2nd
	$LDX	$XX[0]($key),$TX[0]
	ldbx	%r23($inp),$TX[1]
	addi,nuv	1,%r23,%r23	; increment and conditional
	sub	%r0,$len,%r23		; inverse index
	addl	$TX[0],$XX[1],$XX[1]
	addl	$TX[1],$XX[1],$XX[1]
	and	$mask,$XX[1],$XX[1]
	$MKX	$XX[0],$key,$TY
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[1],$key,$YY
	ldo	1($XX[0]),$XX[0]
	$ST	$TX[0],0($YY)
	bb,>=	$XX[0],`31-8`,L\$2nd	; $XX[0]<256
	$ST	$TX[1],0($TY)

	bv,n	(%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT	RC4_options,ENTRY
	.ALIGN	8
RC4_options
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	blr	%r0,%r28
	ldi	3,%r1
L\$pic
	andcm	%r28,%r1,%r28
	bv	(%r2)
	.EXIT
	ldo	L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN	8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Expand every `...` span of the assembled template by evaluating it as a
# Perl expression (frame offsets, conditional sprintf lines, ...).  An
# expression that fails to evaluate aborts the run instead of silently
# turning into empty text in the generated assembly.
$code =~ s{\`([^\`]*)\`}{
	my $expr  = $1;
	my $value = eval $expr;
	die "cannot evaluate `$expr`: $@" if $@;
	$value;
    }gem;

# 4-byte flavour: rewrite "cmpib,*" as "comib,".
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
# 8-byte flavour: rewrite the bv mnemonic as bve.
$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);

# Buffered write errors (full disk, ...) only show up here, so both calls
# are checked; otherwise a truncated file would look like a success.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";