#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# RC4 for PA-RISC.
# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing an HP-UX account.
  17. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  18. $flavour = shift;
  19. $output = shift;
  20. open STDOUT,">$output";
  21. if ($flavour =~ /64/) {
  22. $LEVEL ="2.0W";
  23. $SIZE_T =8;
  24. $FRAME_MARKER =80;
  25. $SAVED_RP =16;
  26. $PUSH ="std";
  27. $PUSHMA ="std,ma";
  28. $POP ="ldd";
  29. $POPMB ="ldd,mb";
  30. } else {
  31. $LEVEL ="1.0";
  32. $SIZE_T =4;
  33. $FRAME_MARKER =48;
  34. $SAVED_RP =20;
  35. $PUSH ="stw";
  36. $PUSHMA ="stwm";
  37. $POP ="ldw";
  38. $POPMB ="ldwm";
  39. }
  40. $FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
  41. # [+ argument transfer]
  42. $SZ=1; # defaults to RC4_CHAR
  43. if (open CONF,"<${dir}../../opensslconf.h") {
  44. while(<CONF>) {
  45. if (m/#\s*define\s+RC4_INT\s+(.*)/) {
  46. $SZ = ($1=~/char$/) ? 1 : 4;
  47. last;
  48. }
  49. }
  50. close CONF;
  51. }
  52. if ($SZ==1) { # RC4_CHAR
  53. $LD="ldb";
  54. $LDX="ldbx";
  55. $MKX="addl";
  56. $ST="stb";
  57. } else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
  58. $LD="ldw";
  59. $LDX="ldwx,s";
  60. $MKX="sh2addl";
  61. $ST="stw";
  62. }
  63. $key="%r26";
  64. $len="%r25";
  65. $inp="%r24";
  66. $out="%r23";
  67. @XX=("%r19","%r20");
  68. @TX=("%r21","%r22");
  69. $YY="%r28";
  70. $TY="%r29";
  71. $acc="%r1";
  72. $ix="%r2";
  73. $iy="%r3";
  74. $dat0="%r4";
  75. $dat1="%r5";
  76. $rem="%r6";
  77. $mask="%r31";
  78. sub unrolledloopbody {
  79. for ($i=0;$i<4;$i++) {
  80. $code.=<<___;
  81. ldo 1($XX[0]),$XX[1]
  82. `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
  83. and $mask,$XX[1],$XX[1]
  84. $LDX $YY($key),$TY
  85. $MKX $YY,$key,$ix
  86. $LDX $XX[1]($key),$TX[1]
  87. $MKX $XX[0],$key,$iy
  88. $ST $TX[0],0($ix)
  89. comclr,<> $XX[1],$YY,%r0 ; conditional
  90. copy $TX[0],$TX[1] ; move
  91. `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
  92. $ST $TY,0($iy)
  93. addl $TX[0],$TY,$TY
  94. addl $TX[1],$YY,$YY
  95. and $mask,$TY,$TY
  96. and $mask,$YY,$YY
  97. ___
  98. push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
  99. } }
  100. sub foldedloop {
  101. my ($label,$count)=@_;
  102. $code.=<<___;
  103. $label
  104. $MKX $YY,$key,$iy
  105. $LDX $YY($key),$TY
  106. $MKX $XX[0],$key,$ix
  107. $ST $TX[0],0($iy)
  108. ldo 1($XX[0]),$XX[0]
  109. $ST $TY,0($ix)
  110. addl $TX[0],$TY,$TY
  111. ldbx $inp($out),$dat1
  112. and $mask,$TY,$TY
  113. and $mask,$XX[0],$XX[0]
  114. $LDX $TY($key),$acc
  115. $LDX $XX[0]($key),$TX[0]
  116. ldo 1($out),$out
  117. xor $dat1,$acc,$acc
  118. addl $TX[0],$YY,$YY
  119. stb $acc,-1($out)
  120. addib,<> -1,$count,$label ; $count is always small
  121. and $mask,$YY,$YY
  122. ___
  123. }
  124. $code=<<___;
  125. .LEVEL $LEVEL
  126. .SPACE \$TEXT\$
  127. .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
  128. .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
  129. RC4
  130. .PROC
  131. .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
  132. .ENTRY
  133. $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
  134. $PUSHMA %r3,$FRAME(%sp)
  135. $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
  136. $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
  137. $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
  138. cmpib,*= 0,$len,L\$abort
  139. sub $inp,$out,$inp ; distance between $inp and $out
  140. $LD `0*$SZ`($key),$XX[0]
  141. $LD `1*$SZ`($key),$YY
  142. ldo `2*$SZ`($key),$key
  143. ldi 0xff,$mask
  144. ldi 3,$dat0
  145. ldo 1($XX[0]),$XX[0] ; warm up loop
  146. and $mask,$XX[0],$XX[0]
  147. $LDX $XX[0]($key),$TX[0]
  148. addl $TX[0],$YY,$YY
  149. cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
  150. and $mask,$YY,$YY
  151. and,<> $out,$dat0,$rem ; is $out aligned?
  152. b L\$alignedout
  153. subi 4,$rem,$rem
  154. sub $len,$rem,$len
  155. ___
  156. &foldedloop("L\$alignout",$rem); # process till $out is aligned
  157. $code.=<<___;
  158. L\$alignedout ; $len is at least 4 here
  159. and,<> $inp,$dat0,$acc ; is $inp aligned?
  160. b L\$oop4
  161. sub $inp,$acc,$rem ; align $inp
  162. sh3addl $acc,%r0,$acc
  163. subi 32,$acc,$acc
  164. mtctl $acc,%cr11 ; load %sar with vshd align factor
  165. ldwx $rem($out),$dat0
  166. ldo 4($rem),$rem
  167. L\$oop4misalignedinp
  168. ___
  169. &unrolledloopbody();
  170. $code.=<<___;
  171. $LDX $TY($key),$ix
  172. ldwx $rem($out),$dat1
  173. ldo -4($len),$len
  174. or $ix,$acc,$acc ; last piece, no need to dep
  175. vshd $dat0,$dat1,$iy ; align data
  176. copy $dat1,$dat0
  177. xor $iy,$acc,$acc
  178. stw $acc,0($out)
  179. cmpib,*<< 3,$len,L\$oop4misalignedinp
  180. ldo 4($out),$out
  181. cmpib,*= 0,$len,L\$done
  182. nop
  183. b L\$oop1
  184. nop
  185. .ALIGN 8
  186. L\$oop4
  187. ___
  188. &unrolledloopbody();
  189. $code.=<<___;
  190. $LDX $TY($key),$ix
  191. ldwx $inp($out),$dat0
  192. ldo -4($len),$len
  193. or $ix,$acc,$acc ; last piece, no need to dep
  194. xor $dat0,$acc,$acc
  195. stw $acc,0($out)
  196. cmpib,*<< 3,$len,L\$oop4
  197. ldo 4($out),$out
  198. cmpib,*= 0,$len,L\$done
  199. nop
  200. ___
  201. &foldedloop("L\$oop1",$len);
  202. $code.=<<___;
  203. L\$done
  204. $POP `-$FRAME-$SAVED_RP`(%sp),%r2
  205. ldo -1($XX[0]),$XX[0] ; chill out loop
  206. sub $YY,$TX[0],$YY
  207. and $mask,$XX[0],$XX[0]
  208. and $mask,$YY,$YY
  209. $ST $XX[0],`-2*$SZ`($key)
  210. $ST $YY,`-1*$SZ`($key)
  211. $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
  212. $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
  213. $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
  214. L\$abort
  215. bv (%r2)
  216. .EXIT
  217. $POPMB -$FRAME(%sp),%r3
  218. .PROCEND
  219. ___
  220. $code.=<<___;
  221. .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
  222. .ALIGN 8
  223. private_RC4_set_key
  224. .PROC
  225. .CALLINFO NO_CALLS
  226. .ENTRY
  227. $ST %r0,`0*$SZ`($key)
  228. $ST %r0,`1*$SZ`($key)
  229. ldo `2*$SZ`($key),$key
  230. copy %r0,@XX[0]
  231. L\$1st
  232. $ST @XX[0],0($key)
  233. ldo 1(@XX[0]),@XX[0]
  234. bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
  235. ldo $SZ($key),$key
  236. ldo `-256*$SZ`($key),$key ; rewind $key
  237. addl $len,$inp,$inp ; $inp to point at the end
  238. sub %r0,$len,%r23 ; inverse index
  239. copy %r0,@XX[0]
  240. copy %r0,@XX[1]
  241. ldi 0xff,$mask
  242. L\$2nd
  243. $LDX @XX[0]($key),@TX[0]
  244. ldbx %r23($inp),@TX[1]
  245. addi,nuv 1,%r23,%r23 ; increment and conditional
  246. sub %r0,$len,%r23 ; inverse index
  247. addl @TX[0],@XX[1],@XX[1]
  248. addl @TX[1],@XX[1],@XX[1]
  249. and $mask,@XX[1],@XX[1]
  250. $MKX @XX[0],$key,$TY
  251. $LDX @XX[1]($key),@TX[1]
  252. $MKX @XX[1],$key,$YY
  253. ldo 1(@XX[0]),@XX[0]
  254. $ST @TX[0],0($YY)
  255. bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
  256. $ST @TX[1],0($TY)
  257. bv,n (%r2)
  258. .EXIT
  259. nop
  260. .PROCEND
  261. .EXPORT RC4_options,ENTRY
  262. .ALIGN 8
  263. RC4_options
  264. .PROC
  265. .CALLINFO NO_CALLS
  266. .ENTRY
  267. blr %r0,%r28
  268. ldi 3,%r1
  269. L\$pic
  270. andcm %r28,%r1,%r28
  271. bv (%r2)
  272. .EXIT
  273. ldo L\$opts-L\$pic(%r28),%r28
  274. .PROCEND
  275. .ALIGN 8
  276. L\$opts
  277. .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
  278. .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
  279. ___
  280. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  281. $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
  282. $code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
  283. print $code;
  284. close STDOUT;