#!/usr/bin/env perl
#
# rc4-s390x.pl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2009
#
# Performance is 2x that of code generated by gcc 3.4.6 on z10. The
# coding "secret" is to "cluster" Address Generation Interlocks, so
# that one pipeline stall resolves several dependencies.

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 50% better than code generated by gcc 4.3.
  24. $flavour = shift;
  25. if ($flavour =~ /3[12]/) {
  26. $SIZE_T=4;
  27. $g="";
  28. } else {
  29. $SIZE_T=8;
  30. $g="g";
  31. }
  32. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  33. open STDOUT,">$output";
  34. $rp="%r14";
  35. $sp="%r15";
  36. $code=<<___;
  37. .text
  38. ___
  39. # void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
  40. {
  41. $acc="%r0";
  42. $cnt="%r1";
  43. $key="%r2";
  44. $len="%r3";
  45. $inp="%r4";
  46. $out="%r5";
  47. @XX=("%r6","%r7");
  48. @TX=("%r8","%r9");
  49. $YY="%r10";
  50. $TY="%r11";
  51. $code.=<<___;
  52. .globl RC4
  53. .type RC4,\@function
  54. .align 64
  55. RC4:
  56. stm${g} %r6,%r11,6*$SIZE_T($sp)
  57. ___
  58. $code.=<<___ if ($flavour =~ /3[12]/);
  59. llgfr $len,$len
  60. ___
  61. $code.=<<___;
  62. llgc $XX[0],0($key)
  63. llgc $YY,1($key)
  64. la $XX[0],1($XX[0])
  65. nill $XX[0],0xff
  66. srlg $cnt,$len,3
  67. ltgr $cnt,$cnt
  68. llgc $TX[0],2($XX[0],$key)
  69. jz .Lshort
  70. j .Loop8
  71. .align 64
  72. .Loop8:
  73. ___
  74. for ($i=0;$i<8;$i++) {
  75. $code.=<<___;
  76. la $YY,0($YY,$TX[0]) # $i
  77. nill $YY,255
  78. la $XX[1],1($XX[0])
  79. nill $XX[1],255
  80. ___
  81. $code.=<<___ if ($i==1);
  82. llgc $acc,2($TY,$key)
  83. ___
  84. $code.=<<___ if ($i>1);
  85. sllg $acc,$acc,8
  86. ic $acc,2($TY,$key)
  87. ___
  88. $code.=<<___;
  89. llgc $TY,2($YY,$key)
  90. stc $TX[0],2($YY,$key)
  91. llgc $TX[1],2($XX[1],$key)
  92. stc $TY,2($XX[0],$key)
  93. cr $XX[1],$YY
  94. jne .Lcmov$i
  95. la $TX[1],0($TX[0])
  96. .Lcmov$i:
  97. la $TY,0($TY,$TX[0])
  98. nill $TY,255
  99. ___
  100. push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
  101. }
  102. $code.=<<___;
  103. lg $TX[1],0($inp)
  104. sllg $acc,$acc,8
  105. la $inp,8($inp)
  106. ic $acc,2($TY,$key)
  107. xgr $acc,$TX[1]
  108. stg $acc,0($out)
  109. la $out,8($out)
  110. brctg $cnt,.Loop8
  111. .Lshort:
  112. lghi $acc,7
  113. ngr $len,$acc
  114. jz .Lexit
  115. j .Loop1
  116. .align 16
  117. .Loop1:
  118. la $YY,0($YY,$TX[0])
  119. nill $YY,255
  120. llgc $TY,2($YY,$key)
  121. stc $TX[0],2($YY,$key)
  122. stc $TY,2($XX[0],$key)
  123. ar $TY,$TX[0]
  124. ahi $XX[0],1
  125. nill $TY,255
  126. nill $XX[0],255
  127. llgc $acc,0($inp)
  128. la $inp,1($inp)
  129. llgc $TY,2($TY,$key)
  130. llgc $TX[0],2($XX[0],$key)
  131. xr $acc,$TY
  132. stc $acc,0($out)
  133. la $out,1($out)
  134. brct $len,.Loop1
  135. .Lexit:
  136. ahi $XX[0],-1
  137. stc $XX[0],0($key)
  138. stc $YY,1($key)
  139. lm${g} %r6,%r11,6*$SIZE_T($sp)
  140. br $rp
  141. .size RC4,.-RC4
  142. .string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
  143. ___
  144. }
  145. # void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
  146. {
  147. $cnt="%r0";
  148. $idx="%r1";
  149. $key="%r2";
  150. $len="%r3";
  151. $inp="%r4";
  152. $acc="%r5";
  153. $dat="%r6";
  154. $ikey="%r7";
  155. $iinp="%r8";
  156. $code.=<<___;
  157. .globl private_RC4_set_key
  158. .type private_RC4_set_key,\@function
  159. .align 64
  160. private_RC4_set_key:
  161. stm${g} %r6,%r8,6*$SIZE_T($sp)
  162. lhi $cnt,256
  163. la $idx,0(%r0)
  164. sth $idx,0($key)
  165. .align 4
  166. .L1stloop:
  167. stc $idx,2($idx,$key)
  168. la $idx,1($idx)
  169. brct $cnt,.L1stloop
  170. lghi $ikey,-256
  171. lr $cnt,$len
  172. la $iinp,0(%r0)
  173. la $idx,0(%r0)
  174. .align 16
  175. .L2ndloop:
  176. llgc $acc,2+256($ikey,$key)
  177. llgc $dat,0($iinp,$inp)
  178. la $idx,0($idx,$acc)
  179. la $ikey,1($ikey)
  180. la $idx,0($idx,$dat)
  181. nill $idx,255
  182. la $iinp,1($iinp)
  183. tml $ikey,255
  184. llgc $dat,2($idx,$key)
  185. stc $dat,2+256-1($ikey,$key)
  186. stc $acc,2($idx,$key)
  187. jz .Ldone
  188. brct $cnt,.L2ndloop
  189. lr $cnt,$len
  190. la $iinp,0(%r0)
  191. j .L2ndloop
  192. .Ldone:
  193. lm${g} %r6,%r8,6*$SIZE_T($sp)
  194. br $rp
  195. .size private_RC4_set_key,.-private_RC4_set_key
  196. ___
  197. }
  198. # const char *RC4_options()
  199. $code.=<<___;
  200. .globl RC4_options
  201. .type RC4_options,\@function
  202. .align 16
  203. RC4_options:
  204. larl %r2,.Loptions
  205. br %r14
  206. .size RC4_options,.-RC4_options
  207. .section .rodata
  208. .Loptions:
  209. .align 8
  210. .string "rc4(8x,char)"
  211. ___
  212. print $code;
  213. close STDOUT; # force flush