ghashp8-ppc.pl 5.1 KB


  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # GHASH for PowerISA v2.07.
  11. #
  12. # July 2014
  13. #
  14. # Accurate performance measurements are problematic, because it's
  15. # always a virtualized setup with a possibly throttled processor.
  16. # Relative comparison is therefore more informative. This initial
  17. # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
  18. # faster than "4-bit" integer-only compiler-generated 64-bit code.
  19. # "Initial version" means that there is room for further improvement.
  20. $flavour=shift;
  21. $output =shift;
  22. if ($flavour =~ /64/) {
  23. $SIZE_T=8;
  24. $LRSAVE=2*$SIZE_T;
  25. $STU="stdu";
  26. $POP="ld";
  27. $PUSH="std";
  28. } elsif ($flavour =~ /32/) {
  29. $SIZE_T=4;
  30. $LRSAVE=$SIZE_T;
  31. $STU="stwu";
  32. $POP="lwz";
  33. $PUSH="stw";
  34. } else { die "nonsense $flavour"; }
  35. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  36. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  37. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  38. die "can't locate ppc-xlate.pl";
  39. open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
  40. my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
  41. my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
  42. my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
  43. my $vrsave="r12";
  44. $code=<<___;
  45. .machine "any"
  46. .text
  47. .globl .gcm_init_p8
  48. .align 5
  49. .gcm_init_p8:
  50. lis r0,0xfff0
  51. li r8,0x10
  52. mfspr $vrsave,256
  53. li r9,0x20
  54. mtspr 256,r0
  55. li r10,0x30
  56. lvx_u $H,0,r4 # load H
  57. vspltisb $xC2,-16 # 0xf0
  58. vspltisb $t0,1 # one
  59. vaddubm $xC2,$xC2,$xC2 # 0xe0
  60. vxor $zero,$zero,$zero
  61. vor $xC2,$xC2,$t0 # 0xe1
  62. vsldoi $xC2,$xC2,$zero,15 # 0xe1...
  63. vsldoi $t1,$zero,$t0,1 # ...1
  64. vaddubm $xC2,$xC2,$xC2 # 0xc2...
  65. vspltisb $t2,7
  66. vor $xC2,$xC2,$t1 # 0xc2....01
  67. vspltb $t1,$H,0 # most significant byte
  68. vsl $H,$H,$t0 # H<<=1
  69. vsrab $t1,$t1,$t2 # broadcast carry bit
  70. vand $t1,$t1,$xC2
  71. vxor $H,$H,$t1 # twisted H
  72. vsldoi $H,$H,$H,8 # twist even more ...
  73. vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
  74. vsldoi $Hl,$zero,$H,8 # ... and split
  75. vsldoi $Hh,$H,$zero,8
  76. stvx_u $xC2,0,r3 # save pre-computed table
  77. stvx_u $Hl,r8,r3
  78. stvx_u $H, r9,r3
  79. stvx_u $Hh,r10,r3
  80. mtspr 256,$vrsave
  81. blr
  82. .long 0
  83. .byte 0,12,0x14,0,0,0,2,0
  84. .long 0
  85. .size .gcm_init_p8,.-.gcm_init_p8
  86. .globl .gcm_gmult_p8
  87. .align 5
  88. .gcm_gmult_p8:
  89. lis r0,0xfff8
  90. li r8,0x10
  91. mfspr $vrsave,256
  92. li r9,0x20
  93. mtspr 256,r0
  94. li r10,0x30
  95. lvx_u $IN,0,$Xip # load Xi
  96. lvx_u $Hl,r8,$Htbl # load pre-computed table
  97. le?lvsl $lemask,r0,r0
  98. lvx_u $H, r9,$Htbl
  99. le?vspltisb $t0,0x07
  100. lvx_u $Hh,r10,$Htbl
  101. le?vxor $lemask,$lemask,$t0
  102. lvx_u $xC2,0,$Htbl
  103. le?vperm $IN,$IN,$IN,$lemask
  104. vxor $zero,$zero,$zero
  105. vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
  106. vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
  107. vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
  108. vpmsumd $t2,$Xl,$xC2 # 1st phase
  109. vsldoi $t0,$Xm,$zero,8
  110. vsldoi $t1,$zero,$Xm,8
  111. vxor $Xl,$Xl,$t0
  112. vxor $Xh,$Xh,$t1
  113. vsldoi $Xl,$Xl,$Xl,8
  114. vxor $Xl,$Xl,$t2
  115. vsldoi $t1,$Xl,$Xl,8 # 2nd phase
  116. vpmsumd $Xl,$Xl,$xC2
  117. vxor $t1,$t1,$Xh
  118. vxor $Xl,$Xl,$t1
  119. le?vperm $Xl,$Xl,$Xl,$lemask
  120. stvx_u $Xl,0,$Xip # write out Xi
  121. mtspr 256,$vrsave
  122. blr
  123. .long 0
  124. .byte 0,12,0x14,0,0,0,2,0
  125. .long 0
  126. .size .gcm_gmult_p8,.-.gcm_gmult_p8
  127. .globl .gcm_ghash_p8
  128. .align 5
  129. .gcm_ghash_p8:
  130. lis r0,0xfff8
  131. li r8,0x10
  132. mfspr $vrsave,256
  133. li r9,0x20
  134. mtspr 256,r0
  135. li r10,0x30
  136. lvx_u $Xl,0,$Xip # load Xi
  137. lvx_u $Hl,r8,$Htbl # load pre-computed table
  138. le?lvsl $lemask,r0,r0
  139. lvx_u $H, r9,$Htbl
  140. le?vspltisb $t0,0x07
  141. lvx_u $Hh,r10,$Htbl
  142. le?vxor $lemask,$lemask,$t0
  143. lvx_u $xC2,0,$Htbl
  144. le?vperm $Xl,$Xl,$Xl,$lemask
  145. vxor $zero,$zero,$zero
  146. lvx_u $IN,0,$inp
  147. addi $inp,$inp,16
  148. subi $len,$len,16
  149. le?vperm $IN,$IN,$IN,$lemask
  150. vxor $IN,$IN,$Xl
  151. b Loop
  152. .align 5
  153. Loop:
  154. subic $len,$len,16
  155. vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
  156. subfe. r0,r0,r0 # borrow?-1:0
  157. vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
  158. and r0,r0,$len
  159. vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
  160. add $inp,$inp,r0
  161. vpmsumd $t2,$Xl,$xC2 # 1st phase
  162. vsldoi $t0,$Xm,$zero,8
  163. vsldoi $t1,$zero,$Xm,8
  164. vxor $Xl,$Xl,$t0
  165. vxor $Xh,$Xh,$t1
  166. vsldoi $Xl,$Xl,$Xl,8
  167. vxor $Xl,$Xl,$t2
  168. lvx_u $IN,0,$inp
  169. addi $inp,$inp,16
  170. vsldoi $t1,$Xl,$Xl,8 # 2nd phase
  171. vpmsumd $Xl,$Xl,$xC2
  172. le?vperm $IN,$IN,$IN,$lemask
  173. vxor $t1,$t1,$Xh
  174. vxor $IN,$IN,$t1
  175. vxor $IN,$IN,$Xl
  176. beq Loop # did $len-=16 borrow?
  177. vxor $Xl,$Xl,$t1
  178. le?vperm $Xl,$Xl,$Xl,$lemask
  179. stvx_u $Xl,0,$Xip # write out Xi
  180. mtspr 256,$vrsave
  181. blr
  182. .long 0
  183. .byte 0,12,0x14,0,0,0,4,0
  184. .long 0
  185. .size .gcm_ghash_p8,.-.gcm_ghash_p8
  186. .asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
  187. .align 2
  188. ___
  189. foreach (split("\n",$code)) {
  190. if ($flavour =~ /le$/o) { # little-endian
  191. s/le\?//o or
  192. s/be\?/#be#/o;
  193. } else {
  194. s/le\?/#le#/o or
  195. s/be\?//o;
  196. }
  197. print $_,"\n";
  198. }
  199. close STDOUT; # enforce flush