  1. #! /usr/bin/env perl
  2. # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # January 2013
  17. #
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better overall performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# the performance of the stitch does not depend on the AES key length.
# Below are performance numbers in cycles per processed byte (lower is
# better) for standalone AESNI-CBC encryption, standalone SHA256, and
# the stitched subroutine:
  29. #
#                  AES-128/-192/-256+SHA256   this(**)      gain
# Sandy Bridge     5.05/6.05/7.05+11.6        13.0          +28%/36%/43%
# Ivy Bridge       5.05/6.05/7.05+10.3        11.6          +32%/41%/50%
# Haswell          4.43/5.29/6.19+7.80        8.79          +39%/49%/59%
# Skylake          2.62/3.14/3.62+7.70        8.10          +27%/34%/40%
# Bulldozer        5.77/6.89/8.00+13.7        13.7          +42%/50%/58%
# Ryzen(***)       2.71/-/3.71+2.05           2.74/-/3.73   +74%/-/54%
# Goldmont(***)    3.82/-/5.35+4.16           4.73/-/5.94   +69%/-/60%
  38. #
# (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#       from the loop above because the estimated gain was not deemed
#       high enough to justify the effort;
# (**)  these are EVP-free results; results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***) these are SHAEXT results;
  45. $flavour = shift;
  46. $output = shift;
  47. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  48. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  49. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  50. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  51. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  52. die "can't locate x86_64-xlate.pl";
  53. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  54. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  55. $avx = ($1>=2.19) + ($1>=2.22);
  56. }
  57. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  58. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  59. $avx = ($1>=2.09) + ($1>=2.10);
  60. }
  61. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  62. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  63. $avx = ($1>=10) + ($1>=12);
  64. }
  65. if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  66. $avx = ($2>=3.0) + ($2>3.0);
  67. }
  68. $shaext=$avx; ### set to zero if compiling for 1.0.1
  69. $avx=1 if (!$shaext && $avx);
  70. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  71. *STDOUT=*OUT;
  72. $func="aesni_cbc_sha256_enc";
  73. $TABLE="K256";
  74. $SZ=4;
  75. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  76. "%r8d","%r9d","%r10d","%r11d");
  77. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  78. @Sigma0=( 2,13,22);
  79. @Sigma1=( 6,11,25);
  80. @sigma0=( 7,18, 3);
  81. @sigma1=(17,19,10);
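#
# The four arrays above are the SHA-256 rotation/shift amounts from
# FIPS 180-4, i.e.
#
#	Sigma0(x) = ROTR^2(x)  ^ ROTR^13(x) ^ ROTR^22(x)
#	Sigma1(x) = ROTR^6(x)  ^ ROTR^11(x) ^ ROTR^25(x)
#	sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#	sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
#
# so that e.g. $Sigma1[2]-$Sigma1[1] used in the round bodies below is
# the 25-11=14-bit right rotation the Sigma1(e) computation starts with.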
  82. $rounds=64;
  83. ########################################################################
  84. # void aesni_cbc_sha256_enc(const void *inp,
  85. # void *out,
  86. # size_t length,
  87. # const AES_KEY *key,
  88. # unsigned char *iv,
  89. # SHA256_CTX *ctx,
  90. # const void *in0);
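#
# A rough usage sketch (illustrative only, with hypothetical variable
# names; the argument semantics are deduced from the code below, e.g.
# "shl \$6,$len" means $len counts 64-byte SHA256 blocks, and a NULL
# inp turns the call into a probe that merely reports whether the SIMD
# code paths were compiled in, see .Lprobe):
#
##	if (aesni_cbc_sha256_enc(NULL,NULL,0,NULL,NULL,NULL,NULL))
##		aesni_cbc_sha256_enc(plaintext,ciphertext,blocks,
##				&aes_key,iv,&sha256_ctx,hashed_input);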
  91. ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
  92. ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  93. $Tbl="%rbp";
  94. $_inp="16*$SZ+0*8(%rsp)";
  95. $_out="16*$SZ+1*8(%rsp)";
  96. $_end="16*$SZ+2*8(%rsp)";
  97. $_key="16*$SZ+3*8(%rsp)";
  98. $_ivp="16*$SZ+4*8(%rsp)";
  99. $_ctx="16*$SZ+5*8(%rsp)";
  100. $_in0="16*$SZ+6*8(%rsp)";
  101. $_rsp="`16*$SZ+7*8`(%rsp)";
  102. $framesz=16*$SZ+8*8;
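#
# With $SZ==4 this gives $framesz = 16*4 + 8*8 = 128 bytes: 64 bytes at
# 0(%rsp) for the four 16-byte message-schedule staging slots the SIMD
# paths spill to, plus the eight 8-byte save slots laid out above
# ($_inp .. $_rsp). The WIN64 prologues below reserve a further 16*10
# bytes on top of this for the non-volatile %xmm6-%xmm15.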
  103. $code=<<___;
  104. .text
  105. .extern OPENSSL_ia32cap_P
  106. .globl $func
  107. .type $func,\@abi-omnipotent
  108. .align 16
  109. $func:
  110. .cfi_startproc
  111. ___
  112. if ($avx) {
  113. $code.=<<___;
  114. lea OPENSSL_ia32cap_P(%rip),%r11
  115. mov \$1,%eax
  116. cmp \$0,`$win64?"%rcx":"%rdi"`
  117. je .Lprobe
  118. mov 0(%r11),%eax
  119. mov 4(%r11),%r10
  120. ___
  121. $code.=<<___ if ($shaext);
  122. bt \$61,%r10 # check for SHA
  123. jc ${func}_shaext
  124. ___
  125. $code.=<<___;
  126. mov %r10,%r11
  127. shr \$32,%r11
  128. test \$`1<<11`,%r10d # check for XOP
  129. jnz ${func}_xop
  130. ___
  131. $code.=<<___ if ($avx>1);
  132. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  133. cmp \$`1<<8|1<<5|1<<3`,%r11d
  134. je ${func}_avx2
  135. ___
  136. $code.=<<___;
  137. and \$`1<<28`,%r10d # check for AVX
  138. jnz ${func}_avx
  139. ud2
  140. ___
  141. }
  142. $code.=<<___;
  143. xor %eax,%eax
  144. cmp \$0,`$win64?"%rcx":"%rdi"`
  145. je .Lprobe
  146. ud2
  147. .Lprobe:
  148. ret
  149. .cfi_endproc
  150. .size $func,.-$func
  151. .align 64
  152. .type $TABLE,\@object
  153. $TABLE:
  154. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  155. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  156. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  157. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  158. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  159. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  160. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  161. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  162. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  163. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  164. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  165. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  166. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  167. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  168. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  169. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  170. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  171. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  172. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  173. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  174. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  175. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  176. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  177. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  178. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  179. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  180. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  181. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  182. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  183. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  184. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  185. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  186. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  187. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  188. .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
  189. .long 0,0,0,0, 0,0,0,0
  190. .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  191. .align 64
  192. ___
  193. ######################################################################
  194. # SIMD code paths
  195. #
  196. {{{
  197. ($iv,$inout,$roundkey,$temp,
  198. $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
  199. $aesni_cbc_idx=0;
  200. @aesni_cbc_block = (
## &vmovdqu ($roundkey,"0x00-0x80($inp)");
  202. ## &vmovdqu ($inout,($inp));
  203. ## &mov ($_inp,$inp);
  204. '&vpxor ($inout,$inout,$roundkey);'.
  205. ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
  206. '&vpxor ($inout,$inout,$iv);',
  207. '&vaesenc ($inout,$inout,$roundkey);'.
  208. ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
  209. '&vaesenc ($inout,$inout,$roundkey);'.
  210. ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
  211. '&vaesenc ($inout,$inout,$roundkey);'.
  212. ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
  213. '&vaesenc ($inout,$inout,$roundkey);'.
  214. ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
  215. '&vaesenc ($inout,$inout,$roundkey);'.
  216. ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
  217. '&vaesenc ($inout,$inout,$roundkey);'.
  218. ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
  219. '&vaesenc ($inout,$inout,$roundkey);'.
  220. ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
  221. '&vaesenc ($inout,$inout,$roundkey);'.
  222. ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
  223. '&vaesenc ($inout,$inout,$roundkey);'.
  224. ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
  225. '&vaesenclast ($temp,$inout,$roundkey);'.
  226. ' &vaesenc ($inout,$inout,$roundkey);'.
  227. ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
  228. '&vpand ($iv,$temp,$mask10);'.
  229. ' &vaesenc ($inout,$inout,$roundkey);'.
  230. ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
  231. '&vaesenclast ($temp,$inout,$roundkey);'.
  232. ' &vaesenc ($inout,$inout,$roundkey);'.
  233. ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
  234. '&vpand ($temp,$temp,$mask12);'.
  235. ' &vaesenc ($inout,$inout,$roundkey);'.
  236. '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
  237. '&vpor ($iv,$iv,$temp);'.
  238. ' &vaesenclast ($temp,$inout,$roundkey);'.
  239. ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
  240. ## &mov ($inp,$_inp);
  241. ## &mov ($out,$_out);
  242. ## &vpand ($temp,$temp,$mask14);
  243. ## &vpor ($iv,$iv,$temp);
## &vmovdqu ("($out,$inp)",$iv);
## &lea ($inp,"16($inp)");
  246. );
  247. my $a4=$T1;
  248. my ($a,$b,$c,$d,$e,$f,$g,$h);
  249. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  250. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  251. my $arg = pop;
  252. $arg = "\$$arg" if ($arg*1 eq $arg);
  253. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  254. }
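#
# For example, '&ror ($a0,$Sigma1[2]-$Sigma1[1])' in body_00_15 (as
# used by the XOP path) resolves here and, with $a0 being %r13d, emits
# "ror $14,%r13d"; '&mov ($a,$a1)' in the first round emits
# "mov %r14d,%eax". The perlasm (dst,src) argument order is reversed
# to AT&T (src,dst) on output, and bare numbers get a '$' prefix.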
  255. sub body_00_15 () {
  256. (
  257. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  258. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  259. '&mov ($a,$a1)',
  260. '&mov ($a4,$f)',
  261. '&xor ($a0,$e)',
  262. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  263. '&xor ($a4,$g)', # f^g
  264. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  265. '&xor ($a1,$a)',
  266. '&and ($a4,$e)', # (f^g)&e
  267. @aesni_cbc_block[$aesni_cbc_idx++].
  268. '&xor ($a0,$e)',
  269. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  270. '&mov ($a2,$a)',
  271. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  272. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  273. '&xor ($a2,$b)', # a^b, b^c in next round
  274. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  275. '&add ($h,$a4)', # h+=Ch(e,f,g)
  276. '&and ($a3,$a2)', # (b^c)&(a^b)
  277. '&xor ($a1,$a)',
  278. '&add ($h,$a0)', # h+=Sigma1(e)
  279. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  280. '&add ($d,$h)', # d+=h
  281. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  282. '&add ($h,$a3)', # h+=Maj(a,b,c)
  283. '&mov ($a0,$d)',
  284. '&add ($a1,$h);'. # h+=Sigma0(a)
  285. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  286. );
  287. }
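#
# The above is a transcription of one SHA-256 round,
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1; h = T1 + T2
#
# followed by renaming the eight working variables one position, which
# is what the @ROT rotation at the end does. K[i]+W[i] comes pre-added
# from the stack, Ch(e,f,g) is computed as ((f^g)&e)^g, Maj(a,b,c) via
# the b^c "magic" value carried in $a3, and the new a (=T1+T2) is
# accumulated in $a1 and moved into place at the start of the next
# round. One entry of @aesni_cbc_block is spliced in per round to
# interleave the AES code.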
  288. if ($avx) {{
  289. ######################################################################
  290. # XOP code path
  291. #
  292. $code.=<<___;
  293. .type ${func}_xop,\@function,6
  294. .align 64
  295. ${func}_xop:
  296. .cfi_startproc
  297. .Lxop_shortcut:
  298. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  299. mov %rsp,%rax # copy %rsp
  300. .cfi_def_cfa_register %rax
  301. push %rbx
  302. .cfi_push %rbx
  303. push %rbp
  304. .cfi_push %rbp
  305. push %r12
  306. .cfi_push %r12
  307. push %r13
  308. .cfi_push %r13
  309. push %r14
  310. .cfi_push %r14
  311. push %r15
  312. .cfi_push %r15
  313. sub \$`$framesz+$win64*16*10`,%rsp
  314. and \$-64,%rsp # align stack frame
  315. shl \$6,$len
  316. sub $inp,$out # re-bias
  317. sub $inp,$in0
  318. add $inp,$len # end of input
  319. #mov $inp,$_inp # saved later
  320. mov $out,$_out
  321. mov $len,$_end
  322. #mov $key,$_key # remains resident in $inp register
  323. mov $ivp,$_ivp
  324. mov $ctx,$_ctx
  325. mov $in0,$_in0
  326. mov %rax,$_rsp
  327. .cfi_cfa_expression $_rsp,deref,+8
  328. ___
  329. $code.=<<___ if ($win64);
  330. movaps %xmm6,`$framesz+16*0`(%rsp)
  331. movaps %xmm7,`$framesz+16*1`(%rsp)
  332. movaps %xmm8,`$framesz+16*2`(%rsp)
  333. movaps %xmm9,`$framesz+16*3`(%rsp)
  334. movaps %xmm10,`$framesz+16*4`(%rsp)
  335. movaps %xmm11,`$framesz+16*5`(%rsp)
  336. movaps %xmm12,`$framesz+16*6`(%rsp)
  337. movaps %xmm13,`$framesz+16*7`(%rsp)
  338. movaps %xmm14,`$framesz+16*8`(%rsp)
  339. movaps %xmm15,`$framesz+16*9`(%rsp)
  340. ___
  341. $code.=<<___;
  342. .Lprologue_xop:
  343. vzeroall
  344. mov $inp,%r12 # borrow $a4
  345. lea 0x80($key),$inp # size optimization, reassign
  346. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
  347. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  348. mov $ctx,%r15 # borrow $a2
  349. mov $in0,%rsi # borrow $a3
  350. vmovdqu ($ivp),$iv # load IV
  351. sub \$9,%r14
  352. mov $SZ*0(%r15),$A
  353. mov $SZ*1(%r15),$B
  354. mov $SZ*2(%r15),$C
  355. mov $SZ*3(%r15),$D
  356. mov $SZ*4(%r15),$E
  357. mov $SZ*5(%r15),$F
  358. mov $SZ*6(%r15),$G
  359. mov $SZ*7(%r15),$H
  360. vmovdqa 0x00(%r13,%r14,8),$mask14
  361. vmovdqa 0x10(%r13,%r14,8),$mask12
  362. vmovdqa 0x20(%r13,%r14,8),$mask10
  363. vmovdqu 0x00-0x80($inp),$roundkey
  364. jmp .Lloop_xop
  365. ___
  366. if ($SZ==4) { # SHA256
  367. my @X = map("%xmm$_",(0..3));
  368. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  369. $code.=<<___;
  370. .align 16
  371. .Lloop_xop:
  372. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  373. vmovdqu 0x00(%rsi,%r12),@X[0]
  374. vmovdqu 0x10(%rsi,%r12),@X[1]
  375. vmovdqu 0x20(%rsi,%r12),@X[2]
  376. vmovdqu 0x30(%rsi,%r12),@X[3]
  377. vpshufb $t3,@X[0],@X[0]
  378. lea $TABLE(%rip),$Tbl
  379. vpshufb $t3,@X[1],@X[1]
  380. vpshufb $t3,@X[2],@X[2]
  381. vpaddd 0x00($Tbl),@X[0],$t0
  382. vpshufb $t3,@X[3],@X[3]
  383. vpaddd 0x20($Tbl),@X[1],$t1
  384. vpaddd 0x40($Tbl),@X[2],$t2
  385. vpaddd 0x60($Tbl),@X[3],$t3
  386. vmovdqa $t0,0x00(%rsp)
  387. mov $A,$a1
  388. vmovdqa $t1,0x10(%rsp)
  389. mov $B,$a3
  390. vmovdqa $t2,0x20(%rsp)
  391. xor $C,$a3 # magic
  392. vmovdqa $t3,0x30(%rsp)
  393. mov $E,$a0
  394. jmp .Lxop_00_47
  395. .align 16
  396. .Lxop_00_47:
  397. sub \$-16*2*$SZ,$Tbl # size optimization
  398. vmovdqu (%r12),$inout # $a4
  399. mov %r12,$_inp # $a4
  400. ___
  401. sub XOP_256_00_47 () {
  402. my $j = shift;
  403. my $body = shift;
  404. my @X = @_;
  405. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  406. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  407. eval(shift(@insns));
  408. eval(shift(@insns));
  409. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  410. eval(shift(@insns));
  411. eval(shift(@insns));
  412. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  413. eval(shift(@insns));
  414. eval(shift(@insns));
  415. &vpsrld ($t0,$t0,$sigma0[2]);
  416. eval(shift(@insns));
  417. eval(shift(@insns));
  418. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  419. eval(shift(@insns));
  420. eval(shift(@insns));
  421. eval(shift(@insns));
  422. eval(shift(@insns));
  423. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  424. eval(shift(@insns));
  425. eval(shift(@insns));
  426. &vpxor ($t0,$t0,$t1);
  427. eval(shift(@insns));
  428. eval(shift(@insns));
  429. eval(shift(@insns));
  430. eval(shift(@insns));
  431. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  432. eval(shift(@insns));
  433. eval(shift(@insns));
  434. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  435. eval(shift(@insns));
  436. eval(shift(@insns));
  437. &vpsrld ($t2,@X[3],$sigma1[2]);
  438. eval(shift(@insns));
  439. eval(shift(@insns));
  440. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  441. eval(shift(@insns));
  442. eval(shift(@insns));
  443. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  444. eval(shift(@insns));
  445. eval(shift(@insns));
  446. &vpxor ($t3,$t3,$t2);
  447. eval(shift(@insns));
  448. eval(shift(@insns));
  449. eval(shift(@insns));
  450. eval(shift(@insns));
  451. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  452. eval(shift(@insns));
  453. eval(shift(@insns));
  454. eval(shift(@insns));
  455. eval(shift(@insns));
  456. &vpsrldq ($t3,$t3,8);
  457. eval(shift(@insns));
  458. eval(shift(@insns));
  459. eval(shift(@insns));
  460. eval(shift(@insns));
  461. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  462. eval(shift(@insns));
  463. eval(shift(@insns));
  464. eval(shift(@insns));
  465. eval(shift(@insns));
  466. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  467. eval(shift(@insns));
  468. eval(shift(@insns));
  469. &vpsrld ($t2,@X[0],$sigma1[2]);
  470. eval(shift(@insns));
  471. eval(shift(@insns));
  472. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  473. eval(shift(@insns));
  474. eval(shift(@insns));
  475. &vpxor ($t3,$t3,$t2);
  476. eval(shift(@insns));
  477. eval(shift(@insns));
  478. eval(shift(@insns));
  479. eval(shift(@insns));
  480. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  481. eval(shift(@insns));
  482. eval(shift(@insns));
  483. eval(shift(@insns));
  484. eval(shift(@insns));
  485. &vpslldq ($t3,$t3,8); # 22 instructions
  486. eval(shift(@insns));
  487. eval(shift(@insns));
  488. eval(shift(@insns));
  489. eval(shift(@insns));
  490. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  491. eval(shift(@insns));
  492. eval(shift(@insns));
  493. eval(shift(@insns));
  494. eval(shift(@insns));
  495. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  496. foreach (@insns) { eval; } # remaining instructions
  497. &vmovdqa (16*$j."(%rsp)",$t2);
  498. }
  499. $aesni_cbc_idx=0;
  500. for ($i=0,$j=0; $j<4; $j++) {
  501. &XOP_256_00_47($j,\&body_00_15,@X);
  502. push(@X,shift(@X)); # rotate(@X)
  503. }
  504. &mov ("%r12",$_inp); # borrow $a4
  505. &vpand ($temp,$temp,$mask14);
  506. &mov ("%r15",$_out); # borrow $a2
  507. &vpor ($iv,$iv,$temp);
  508. &vmovdqu ("(%r15,%r12)",$iv); # write output
  509. &lea ("%r12","16(%r12)"); # inp++
  510. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  511. &jne (".Lxop_00_47");
  512. &vmovdqu ($inout,"(%r12)");
  513. &mov ($_inp,"%r12");
  514. $aesni_cbc_idx=0;
  515. for ($i=0; $i<16; ) {
  516. foreach(body_00_15()) { eval; }
  517. }
  518. }
  519. $code.=<<___;
  520. mov $_inp,%r12 # borrow $a4
  521. mov $_out,%r13 # borrow $a0
  522. mov $_ctx,%r15 # borrow $a2
  523. mov $_in0,%rsi # borrow $a3
  524. vpand $mask14,$temp,$temp
  525. mov $a1,$A
  526. vpor $temp,$iv,$iv
  527. vmovdqu $iv,(%r13,%r12) # write output
  528. lea 16(%r12),%r12 # inp++
  529. add $SZ*0(%r15),$A
  530. add $SZ*1(%r15),$B
  531. add $SZ*2(%r15),$C
  532. add $SZ*3(%r15),$D
  533. add $SZ*4(%r15),$E
  534. add $SZ*5(%r15),$F
  535. add $SZ*6(%r15),$G
  536. add $SZ*7(%r15),$H
  537. cmp $_end,%r12
  538. mov $A,$SZ*0(%r15)
  539. mov $B,$SZ*1(%r15)
  540. mov $C,$SZ*2(%r15)
  541. mov $D,$SZ*3(%r15)
  542. mov $E,$SZ*4(%r15)
  543. mov $F,$SZ*5(%r15)
  544. mov $G,$SZ*6(%r15)
  545. mov $H,$SZ*7(%r15)
  546. jb .Lloop_xop
  547. mov $_ivp,$ivp
  548. mov $_rsp,%rsi
  549. .cfi_def_cfa %rsi,8
  550. vmovdqu $iv,($ivp) # output IV
  551. vzeroall
  552. ___
  553. $code.=<<___ if ($win64);
  554. movaps `$framesz+16*0`(%rsp),%xmm6
  555. movaps `$framesz+16*1`(%rsp),%xmm7
  556. movaps `$framesz+16*2`(%rsp),%xmm8
  557. movaps `$framesz+16*3`(%rsp),%xmm9
  558. movaps `$framesz+16*4`(%rsp),%xmm10
  559. movaps `$framesz+16*5`(%rsp),%xmm11
  560. movaps `$framesz+16*6`(%rsp),%xmm12
  561. movaps `$framesz+16*7`(%rsp),%xmm13
  562. movaps `$framesz+16*8`(%rsp),%xmm14
  563. movaps `$framesz+16*9`(%rsp),%xmm15
  564. ___
  565. $code.=<<___;
  566. mov -48(%rsi),%r15
  567. .cfi_restore %r15
  568. mov -40(%rsi),%r14
  569. .cfi_restore %r14
  570. mov -32(%rsi),%r13
  571. .cfi_restore %r13
  572. mov -24(%rsi),%r12
  573. .cfi_restore %r12
  574. mov -16(%rsi),%rbp
  575. .cfi_restore %rbp
  576. mov -8(%rsi),%rbx
  577. .cfi_restore %rbx
  578. lea (%rsi),%rsp
  579. .cfi_def_cfa_register %rsp
  580. .Lepilogue_xop:
  581. ret
  582. .cfi_endproc
  583. .size ${func}_xop,.-${func}_xop
  584. ___
  585. ######################################################################
  586. # AVX+shrd code path
  587. #
  588. local *ror = sub { &shrd(@_[0],@_) };
  589. $code.=<<___;
  590. .type ${func}_avx,\@function,6
  591. .align 64
  592. ${func}_avx:
  593. .cfi_startproc
  594. .Lavx_shortcut:
  595. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  596. mov %rsp,%rax # copy %rsp
  597. .cfi_def_cfa_register %rax
  598. push %rbx
  599. .cfi_push %rbx
  600. push %rbp
  601. .cfi_push %rbp
  602. push %r12
  603. .cfi_push %r12
  604. push %r13
  605. .cfi_push %r13
  606. push %r14
  607. .cfi_push %r14
  608. push %r15
  609. .cfi_push %r15
  610. sub \$`$framesz+$win64*16*10`,%rsp
  611. and \$-64,%rsp # align stack frame
  612. shl \$6,$len
  613. sub $inp,$out # re-bias
  614. sub $inp,$in0
  615. add $inp,$len # end of input
  616. #mov $inp,$_inp # saved later
  617. mov $out,$_out
  618. mov $len,$_end
  619. #mov $key,$_key # remains resident in $inp register
  620. mov $ivp,$_ivp
  621. mov $ctx,$_ctx
  622. mov $in0,$_in0
  623. mov %rax,$_rsp
  624. .cfi_cfa_expression $_rsp,deref,+8
  625. ___
  626. $code.=<<___ if ($win64);
  627. movaps %xmm6,`$framesz+16*0`(%rsp)
  628. movaps %xmm7,`$framesz+16*1`(%rsp)
  629. movaps %xmm8,`$framesz+16*2`(%rsp)
  630. movaps %xmm9,`$framesz+16*3`(%rsp)
  631. movaps %xmm10,`$framesz+16*4`(%rsp)
  632. movaps %xmm11,`$framesz+16*5`(%rsp)
  633. movaps %xmm12,`$framesz+16*6`(%rsp)
  634. movaps %xmm13,`$framesz+16*7`(%rsp)
  635. movaps %xmm14,`$framesz+16*8`(%rsp)
  636. movaps %xmm15,`$framesz+16*9`(%rsp)
  637. ___
  638. $code.=<<___;
  639. .Lprologue_avx:
  640. vzeroall
  641. mov $inp,%r12 # borrow $a4
  642. lea 0x80($key),$inp # size optimization, reassign
  643. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
  644. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  645. mov $ctx,%r15 # borrow $a2
  646. mov $in0,%rsi # borrow $a3
  647. vmovdqu ($ivp),$iv # load IV
  648. sub \$9,%r14
  649. mov $SZ*0(%r15),$A
  650. mov $SZ*1(%r15),$B
  651. mov $SZ*2(%r15),$C
  652. mov $SZ*3(%r15),$D
  653. mov $SZ*4(%r15),$E
  654. mov $SZ*5(%r15),$F
  655. mov $SZ*6(%r15),$G
  656. mov $SZ*7(%r15),$H
  657. vmovdqa 0x00(%r13,%r14,8),$mask14
  658. vmovdqa 0x10(%r13,%r14,8),$mask12
  659. vmovdqa 0x20(%r13,%r14,8),$mask10
  660. vmovdqu 0x00-0x80($inp),$roundkey
  661. ___
  662. if ($SZ==4) { # SHA256
  663. my @X = map("%xmm$_",(0..3));
  664. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  665. $code.=<<___;
  666. jmp .Lloop_avx
  667. .align 16
  668. .Lloop_avx:
  669. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  670. vmovdqu 0x00(%rsi,%r12),@X[0]
  671. vmovdqu 0x10(%rsi,%r12),@X[1]
  672. vmovdqu 0x20(%rsi,%r12),@X[2]
  673. vmovdqu 0x30(%rsi,%r12),@X[3]
  674. vpshufb $t3,@X[0],@X[0]
  675. lea $TABLE(%rip),$Tbl
  676. vpshufb $t3,@X[1],@X[1]
  677. vpshufb $t3,@X[2],@X[2]
  678. vpaddd 0x00($Tbl),@X[0],$t0
  679. vpshufb $t3,@X[3],@X[3]
  680. vpaddd 0x20($Tbl),@X[1],$t1
  681. vpaddd 0x40($Tbl),@X[2],$t2
  682. vpaddd 0x60($Tbl),@X[3],$t3
  683. vmovdqa $t0,0x00(%rsp)
  684. mov $A,$a1
  685. vmovdqa $t1,0x10(%rsp)
  686. mov $B,$a3
  687. vmovdqa $t2,0x20(%rsp)
  688. xor $C,$a3 # magic
  689. vmovdqa $t3,0x30(%rsp)
  690. mov $E,$a0
  691. jmp .Lavx_00_47
  692. .align 16
  693. .Lavx_00_47:
  694. sub \$-16*2*$SZ,$Tbl # size optimization
  695. vmovdqu (%r12),$inout # $a4
  696. mov %r12,$_inp # $a4
  697. ___
  698. sub Xupdate_256_AVX () {
  699. (
  700. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  701. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  702. '&vpsrld ($t2,$t0,$sigma0[0]);',
  703. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  704. '&vpsrld ($t3,$t0,$sigma0[2])',
  705. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  706. '&vpxor ($t0,$t3,$t2)',
  707. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  708. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  709. '&vpxor ($t0,$t0,$t1)',
  710. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  711. '&vpxor ($t0,$t0,$t2)',
  712. '&vpsrld ($t2,$t3,$sigma1[2]);',
  713. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  714. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  715. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  716. '&vpxor ($t2,$t2,$t3);',
  717. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  718. '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
  719. '&vpshufd ($t2,$t2,0b10000100)',
  720. '&vpsrldq ($t2,$t2,8)',
  721. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  722. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  723. '&vpsrld ($t2,$t3,$sigma1[2])',
  724. '&vpsrlq ($t3,$t3,$sigma1[0])',
  725. '&vpxor ($t2,$t2,$t3);',
  726. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  727. '&vpxor ($t2,$t2,$t3)',
  728. '&vpshufd ($t2,$t2,0b11101000)',
  729. '&vpslldq ($t2,$t2,8)',
  730. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  731. );
  732. }
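#
# The sequence above evaluates the SHA-256 message schedule
#
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# four words at a time: @X[0] holds W[i-16..i-13], $t3/$t0 supply
# X[9..12] and sigma0(X[1..4]), and sigma1 of the two newest words is
# folded in in two halves (X[14..15], then the freshly computed
# X[16..17]), matching the comments on the individual steps.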
  733. sub AVX_256_00_47 () {
  734. my $j = shift;
  735. my $body = shift;
  736. my @X = @_;
  737. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  738. foreach (Xupdate_256_AVX()) { # 29 instructions
  739. eval;
  740. eval(shift(@insns));
  741. eval(shift(@insns));
  742. eval(shift(@insns));
  743. }
  744. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  745. foreach (@insns) { eval; } # remaining instructions
  746. &vmovdqa (16*$j."(%rsp)",$t2);
  747. }
  748. $aesni_cbc_idx=0;
  749. for ($i=0,$j=0; $j<4; $j++) {
  750. &AVX_256_00_47($j,\&body_00_15,@X);
  751. push(@X,shift(@X)); # rotate(@X)
  752. }
  753. &mov ("%r12",$_inp); # borrow $a4
  754. &vpand ($temp,$temp,$mask14);
  755. &mov ("%r15",$_out); # borrow $a2
  756. &vpor ($iv,$iv,$temp);
  757. &vmovdqu ("(%r15,%r12)",$iv); # write output
  758. &lea ("%r12","16(%r12)"); # inp++
  759. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  760. &jne (".Lavx_00_47");
  761. &vmovdqu ($inout,"(%r12)");
  762. &mov ($_inp,"%r12");
  763. $aesni_cbc_idx=0;
  764. for ($i=0; $i<16; ) {
  765. foreach(body_00_15()) { eval; }
  766. }
  767. }
  768. $code.=<<___;
  769. mov $_inp,%r12 # borrow $a4
  770. mov $_out,%r13 # borrow $a0
  771. mov $_ctx,%r15 # borrow $a2
  772. mov $_in0,%rsi # borrow $a3
  773. vpand $mask14,$temp,$temp
  774. mov $a1,$A
  775. vpor $temp,$iv,$iv
  776. vmovdqu $iv,(%r13,%r12) # write output
  777. lea 16(%r12),%r12 # inp++
  778. add $SZ*0(%r15),$A
  779. add $SZ*1(%r15),$B
  780. add $SZ*2(%r15),$C
  781. add $SZ*3(%r15),$D
  782. add $SZ*4(%r15),$E
  783. add $SZ*5(%r15),$F
  784. add $SZ*6(%r15),$G
  785. add $SZ*7(%r15),$H
  786. cmp $_end,%r12
  787. mov $A,$SZ*0(%r15)
  788. mov $B,$SZ*1(%r15)
  789. mov $C,$SZ*2(%r15)
  790. mov $D,$SZ*3(%r15)
  791. mov $E,$SZ*4(%r15)
  792. mov $F,$SZ*5(%r15)
  793. mov $G,$SZ*6(%r15)
  794. mov $H,$SZ*7(%r15)
  795. jb .Lloop_avx
  796. mov $_ivp,$ivp
  797. mov $_rsp,%rsi
  798. .cfi_def_cfa %rsi,8
  799. vmovdqu $iv,($ivp) # output IV
  800. vzeroall
  801. ___
  802. $code.=<<___ if ($win64);
  803. movaps `$framesz+16*0`(%rsp),%xmm6
  804. movaps `$framesz+16*1`(%rsp),%xmm7
  805. movaps `$framesz+16*2`(%rsp),%xmm8
  806. movaps `$framesz+16*3`(%rsp),%xmm9
  807. movaps `$framesz+16*4`(%rsp),%xmm10
  808. movaps `$framesz+16*5`(%rsp),%xmm11
  809. movaps `$framesz+16*6`(%rsp),%xmm12
  810. movaps `$framesz+16*7`(%rsp),%xmm13
  811. movaps `$framesz+16*8`(%rsp),%xmm14
  812. movaps `$framesz+16*9`(%rsp),%xmm15
  813. ___
  814. $code.=<<___;
  815. mov -48(%rsi),%r15
  816. .cfi_restore %r15
  817. mov -40(%rsi),%r14
  818. .cfi_restore %r14
  819. mov -32(%rsi),%r13
  820. .cfi_restore %r13
  821. mov -24(%rsi),%r12
  822. .cfi_restore %r12
  823. mov -16(%rsi),%rbp
  824. .cfi_restore %rbp
  825. mov -8(%rsi),%rbx
  826. .cfi_restore %rbx
  827. lea (%rsi),%rsp
  828. .cfi_def_cfa_register %rsp
  829. .Lepilogue_avx:
  830. ret
  831. .cfi_endproc
  832. .size ${func}_avx,.-${func}_avx
  833. ___
  834. if ($avx>1) {{
  835. ######################################################################
  836. # AVX2+BMI code path
  837. #
  838. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  839. my $PUSH8=8*2*$SZ;
  840. use integer;
  841. sub bodyx_00_15 () {
  842. # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
  843. (
  844. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  845. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  846. '&and ($a4,$e)', # f&e
  847. '&rorx ($a0,$e,$Sigma1[2])',
  848. '&rorx ($a2,$e,$Sigma1[1])',
  849. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  850. '&lea ($h,"($h,$a4)")',
  851. '&andn ($a4,$e,$g)', # ~e&g
  852. '&xor ($a0,$a2)',
  853. '&rorx ($a1,$e,$Sigma1[0])',
  854. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  855. '&xor ($a0,$a1)', # Sigma1(e)
  856. '&mov ($a2,$a)',
  857. '&rorx ($a4,$a,$Sigma0[2])',
  858. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  859. '&xor ($a2,$b)', # a^b, b^c in next round
  860. '&rorx ($a1,$a,$Sigma0[1])',
  861. '&rorx ($a0,$a,$Sigma0[0])',
  862. '&lea ($d,"($d,$h)")', # d+=h
  863. '&and ($a3,$a2)', # (b^c)&(a^b)
  864. @aesni_cbc_block[$aesni_cbc_idx++].
  865. '&xor ($a1,$a4)',
  866. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  867. '&xor ($a1,$a0)', # Sigma0(a)
  868. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  869. '&mov ($a4,$e)', # copy of f in future
  870. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  871. );
  872. # and at the finish one has to $a+=$a1
  873. }
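#
# The "$a+=$a1" deferred above (Sigma0(a) of the last round) is
# performed by the AVX2 loops below as "add $a1,$A" just before the
# hash state is folded back into the context.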
  874. $code.=<<___;
  875. .type ${func}_avx2,\@function,6
  876. .align 64
  877. ${func}_avx2:
  878. .cfi_startproc
  879. .Lavx2_shortcut:
  880. mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
  881. mov %rsp,%rax # copy %rsp
  882. .cfi_def_cfa_register %rax
  883. push %rbx
  884. .cfi_push %rbx
  885. push %rbp
  886. .cfi_push %rbp
  887. push %r12
  888. .cfi_push %r12
  889. push %r13
  890. .cfi_push %r13
  891. push %r14
  892. .cfi_push %r14
  893. push %r15
  894. .cfi_push %r15
  895. sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
  896. and \$-256*$SZ,%rsp # align stack frame
  897. add \$`2*$SZ*($rounds-8)`,%rsp
  898. shl \$6,$len
  899. sub $inp,$out # re-bias
  900. sub $inp,$in0
  901. add $inp,$len # end of input
  902. #mov $inp,$_inp # saved later
  903. #mov $out,$_out # kept in $offload
  904. mov $len,$_end
  905. #mov $key,$_key # remains resident in $inp register
  906. mov $ivp,$_ivp
  907. mov $ctx,$_ctx
  908. mov $in0,$_in0
  909. mov %rax,$_rsp
  910. .cfi_cfa_expression $_rsp,deref,+8
  911. ___
  912. $code.=<<___ if ($win64);
  913. movaps %xmm6,`$framesz+16*0`(%rsp)
  914. movaps %xmm7,`$framesz+16*1`(%rsp)
  915. movaps %xmm8,`$framesz+16*2`(%rsp)
  916. movaps %xmm9,`$framesz+16*3`(%rsp)
  917. movaps %xmm10,`$framesz+16*4`(%rsp)
  918. movaps %xmm11,`$framesz+16*5`(%rsp)
  919. movaps %xmm12,`$framesz+16*6`(%rsp)
  920. movaps %xmm13,`$framesz+16*7`(%rsp)
  921. movaps %xmm14,`$framesz+16*8`(%rsp)
  922. movaps %xmm15,`$framesz+16*9`(%rsp)
  923. ___
  924. $code.=<<___;
  925. .Lprologue_avx2:
  926. vzeroall
  927. mov $inp,%r13 # borrow $a0
  928. vpinsrq \$1,$out,$offload,$offload
  929. lea 0x80($key),$inp # size optimization, reassign
  930. lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
  931. mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
  932. mov $ctx,%r15 # borrow $a2
  933. mov $in0,%rsi # borrow $a3
  934. vmovdqu ($ivp),$iv # load IV
  935. lea -9(%r14),%r14
  936. vmovdqa 0x00(%r12,%r14,8),$mask14
  937. vmovdqa 0x10(%r12,%r14,8),$mask12
  938. vmovdqa 0x20(%r12,%r14,8),$mask10
  939. sub \$-16*$SZ,%r13 # inp++, size optimization
  940. mov $SZ*0(%r15),$A
  941. lea (%rsi,%r13),%r12 # borrow $a0
  942. mov $SZ*1(%r15),$B
  943. cmp $len,%r13 # $_end
  944. mov $SZ*2(%r15),$C
  945. cmove %rsp,%r12 # next block or random data
  946. mov $SZ*3(%r15),$D
  947. mov $SZ*4(%r15),$E
  948. mov $SZ*5(%r15),$F
  949. mov $SZ*6(%r15),$G
  950. mov $SZ*7(%r15),$H
  951. vmovdqu 0x00-0x80($inp),$roundkey
  952. ___
  953. if ($SZ==4) { # SHA256
  954. my @X = map("%ymm$_",(0..3));
  955. my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
  956. $code.=<<___;
  957. jmp .Loop_avx2
  958. .align 16
  959. .Loop_avx2:
  960. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  961. vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
  962. vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
  963. vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
  964. vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
  965. vinserti128 \$1,(%r12),@X[0],@X[0]
  966. vinserti128 \$1,16(%r12),@X[1],@X[1]
  967. vpshufb $t3,@X[0],@X[0]
  968. vinserti128 \$1,32(%r12),@X[2],@X[2]
  969. vpshufb $t3,@X[1],@X[1]
  970. vinserti128 \$1,48(%r12),@X[3],@X[3]
  971. lea $TABLE(%rip),$Tbl
  972. vpshufb $t3,@X[2],@X[2]
  973. lea -16*$SZ(%r13),%r13
  974. vpaddd 0x00($Tbl),@X[0],$t0
  975. vpshufb $t3,@X[3],@X[3]
  976. vpaddd 0x20($Tbl),@X[1],$t1
  977. vpaddd 0x40($Tbl),@X[2],$t2
  978. vpaddd 0x60($Tbl),@X[3],$t3
  979. vmovdqa $t0,0x00(%rsp)
  980. xor $a1,$a1
  981. vmovdqa $t1,0x20(%rsp)
  982. ___
  983. $code.=<<___ if (!$win64);
  984. # temporarily use %rsi as frame pointer
  985. mov $_rsp,%rsi
  986. .cfi_def_cfa %rsi,8
  987. ___
  988. $code.=<<___;
  989. lea -$PUSH8(%rsp),%rsp
  990. ___
  991. $code.=<<___ if (!$win64);
  992. # the frame info is at $_rsp, but the stack is moving...
  993. # so a second frame pointer is saved at -8(%rsp)
  994. # that is in the red zone
  995. mov %rsi,-8(%rsp)
  996. .cfi_cfa_expression %rsp-8,deref,+8
  997. ___
  998. $code.=<<___;
  999. mov $B,$a3
  1000. vmovdqa $t2,0x00(%rsp)
  1001. xor $C,$a3 # magic
  1002. vmovdqa $t3,0x20(%rsp)
  1003. mov $F,$a4
  1004. sub \$-16*2*$SZ,$Tbl # size optimization
  1005. jmp .Lavx2_00_47
  1006. .align 16
  1007. .Lavx2_00_47:
  1008. vmovdqu (%r13),$inout
  1009. vpinsrq \$0,%r13,$offload,$offload
  1010. ___
  1011. sub AVX2_256_00_47 () {
  1012. my $j = shift;
  1013. my $body = shift;
  1014. my @X = @_;
  1015. my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
  1016. my $base = "+2*$PUSH8(%rsp)";
  1017. if (($j%2)==0) {
  1018. &lea ("%rsp","-$PUSH8(%rsp)");
  1019. $code.=<<___ if (!$win64);
  1020. .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
  1021. # copy secondary frame pointer to new location again at -8(%rsp)
  1022. pushq $PUSH8-8(%rsp)
  1023. .cfi_cfa_expression %rsp,deref,+8
  1024. lea 8(%rsp),%rsp
  1025. .cfi_cfa_expression %rsp-8,deref,+8
  1026. ___
  1027. }
  1028. foreach (Xupdate_256_AVX()) { # 29 instructions
  1029. eval;
  1030. eval(shift(@insns));
  1031. eval(shift(@insns));
  1032. eval(shift(@insns));
  1033. }
  1034. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1035. foreach (@insns) { eval; } # remaining instructions
  1036. &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
  1037. }
  1038. $aesni_cbc_idx=0;
  1039. for ($i=0,$j=0; $j<4; $j++) {
  1040. &AVX2_256_00_47($j,\&bodyx_00_15,@X);
  1041. push(@X,shift(@X)); # rotate(@X)
  1042. }
  1043. &vmovq ("%r13",$offload); # borrow $a0
  1044. &vpextrq ("%r15",$offload,1); # borrow $a2
  1045. &vpand ($temp,$temp,$mask14);
  1046. &vpor ($iv,$iv,$temp);
  1047. &vmovdqu ("(%r15,%r13)",$iv); # write output
  1048. &lea ("%r13","16(%r13)"); # inp++
  1049. &lea ($Tbl,16*2*$SZ."($Tbl)");
  1050. &cmpb (($SZ-1)."($Tbl)",0);
  1051. &jne (".Lavx2_00_47");
  1052. &vmovdqu ($inout,"(%r13)");
  1053. &vpinsrq ($offload,$offload,"%r13",0);
  1054. $aesni_cbc_idx=0;
  1055. for ($i=0; $i<16; ) {
  1056. my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
  1057. foreach(bodyx_00_15()) { eval; }
  1058. }
  1059. }
  1060. $code.=<<___;
  1061. vpextrq \$1,$offload,%r12 # $_out, borrow $a4
  1062. vmovq $offload,%r13 # $_inp, borrow $a0
  1063. mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
  1064. add $a1,$A
  1065. lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
  1066. vpand $mask14,$temp,$temp
  1067. vpor $temp,$iv,$iv
  1068. vmovdqu $iv,(%r12,%r13) # write output
  1069. lea 16(%r13),%r13
  1070. add $SZ*0(%r15),$A
  1071. add $SZ*1(%r15),$B
  1072. add $SZ*2(%r15),$C
  1073. add $SZ*3(%r15),$D
  1074. add $SZ*4(%r15),$E
  1075. add $SZ*5(%r15),$F
  1076. add $SZ*6(%r15),$G
  1077. add $SZ*7(%r15),$H
  1078. mov $A,$SZ*0(%r15)
  1079. mov $B,$SZ*1(%r15)
  1080. mov $C,$SZ*2(%r15)
  1081. mov $D,$SZ*3(%r15)
  1082. mov $E,$SZ*4(%r15)
  1083. mov $F,$SZ*5(%r15)
  1084. mov $G,$SZ*6(%r15)
  1085. mov $H,$SZ*7(%r15)
  1086. cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
  1087. je .Ldone_avx2
  1088. xor $a1,$a1
  1089. mov $B,$a3
  1090. mov $F,$a4
  1091. xor $C,$a3 # magic
  1092. jmp .Lower_avx2
  1093. .align 16
  1094. .Lower_avx2:
  1095. vmovdqu (%r13),$inout
  1096. vpinsrq \$0,%r13,$offload,$offload
  1097. ___
  1098. $aesni_cbc_idx=0;
  1099. for ($i=0; $i<16; ) {
  1100. my $base="+16($Tbl)";
  1101. foreach(bodyx_00_15()) { eval; }
  1102. &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
  1103. }
  1104. $code.=<<___;
  1105. vmovq $offload,%r13 # borrow $a0
  1106. vpextrq \$1,$offload,%r15 # borrow $a2
  1107. vpand $mask14,$temp,$temp
  1108. vpor $temp,$iv,$iv
  1109. lea -$PUSH8($Tbl),$Tbl
  1110. vmovdqu $iv,(%r15,%r13) # write output
  1111. lea 16(%r13),%r13 # inp++
  1112. cmp %rsp,$Tbl
  1113. jae .Lower_avx2
  1114. mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
  1115. lea 16*$SZ(%r13),%r13
  1116. mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
  1117. add $a1,$A
  1118. lea `2*$SZ*($rounds-8)`(%rsp),%rsp
  1119. add $SZ*0(%r15),$A
  1120. add $SZ*1(%r15),$B
  1121. add $SZ*2(%r15),$C
  1122. add $SZ*3(%r15),$D
  1123. add $SZ*4(%r15),$E
  1124. add $SZ*5(%r15),$F
  1125. add $SZ*6(%r15),$G
  1126. lea (%rsi,%r13),%r12
  1127. add $SZ*7(%r15),$H
  1128. cmp $_end,%r13
  1129. mov $A,$SZ*0(%r15)
  1130. cmove %rsp,%r12 # next block or stale data
  1131. mov $B,$SZ*1(%r15)
  1132. mov $C,$SZ*2(%r15)
  1133. mov $D,$SZ*3(%r15)
  1134. mov $E,$SZ*4(%r15)
  1135. mov $F,$SZ*5(%r15)
  1136. mov $G,$SZ*6(%r15)
  1137. mov $H,$SZ*7(%r15)
  1138. jbe .Loop_avx2
  1139. lea (%rsp),$Tbl
  1140. # temporarily use $Tbl as index to $_rsp
  1141. # this avoids the need to save a secondary frame pointer at -8(%rsp)
  1142. .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8
  1143. .Ldone_avx2:
  1144. mov 16*$SZ+4*8($Tbl),$ivp
  1145. mov 16*$SZ+7*8($Tbl),%rsi
  1146. .cfi_def_cfa %rsi,8
  1147. vmovdqu $iv,($ivp) # output IV
  1148. vzeroall
  1149. ___
  1150. $code.=<<___ if ($win64);
  1151. movaps `$framesz+16*0`($Tbl),%xmm6
  1152. movaps `$framesz+16*1`($Tbl),%xmm7
  1153. movaps `$framesz+16*2`($Tbl),%xmm8
  1154. movaps `$framesz+16*3`($Tbl),%xmm9
  1155. movaps `$framesz+16*4`($Tbl),%xmm10
  1156. movaps `$framesz+16*5`($Tbl),%xmm11
  1157. movaps `$framesz+16*6`($Tbl),%xmm12
  1158. movaps `$framesz+16*7`($Tbl),%xmm13
  1159. movaps `$framesz+16*8`($Tbl),%xmm14
  1160. movaps `$framesz+16*9`($Tbl),%xmm15
  1161. ___
  1162. $code.=<<___;
  1163. mov -48(%rsi),%r15
  1164. .cfi_restore %r15
  1165. mov -40(%rsi),%r14
  1166. .cfi_restore %r14
  1167. mov -32(%rsi),%r13
  1168. .cfi_restore %r13
  1169. mov -24(%rsi),%r12
  1170. .cfi_restore %r12
  1171. mov -16(%rsi),%rbp
  1172. .cfi_restore %rbp
  1173. mov -8(%rsi),%rbx
  1174. .cfi_restore %rbx
  1175. lea (%rsi),%rsp
  1176. .cfi_def_cfa_register %rsp
  1177. .Lepilogue_avx2:
  1178. ret
  1179. .cfi_endproc
  1180. .size ${func}_avx2,.-${func}_avx2
  1181. ___
  1182. }}
  1183. }}
  1184. {{
  1185. my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  1186. my ($rounds,$Tbl)=("%r11d","%rbx");
  1187. my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
  1188. my @rndkey=("%xmm4","%xmm5");
  1189. my $r=0;
  1190. my $sn=0;
  1191. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
  1192. my @MSG=map("%xmm$_",(10..13));
  1193. my $aesenc=sub {
  1194. use integer;
  1195. my ($n,$k)=($r/10,$r%10);
  1196. if ($k==0) {
  1197. $code.=<<___;
  1198. movups `16*$n`($in0),$in # load input
  1199. xorps $rndkey0,$in
  1200. ___
  1201. $code.=<<___ if ($n);
  1202. movups $iv,`16*($n-1)`($out,$in0) # write output
  1203. ___
  1204. $code.=<<___;
  1205. xorps $in,$iv
  1206. movups `32+16*$k-112`($key),$rndkey[1]
  1207. aesenc $rndkey[0],$iv
  1208. ___
  1209. } elsif ($k==9) {
  1210. $sn++;
  1211. $code.=<<___;
  1212. cmp \$11,$rounds
  1213. jb .Laesenclast$sn
  1214. movups `32+16*($k+0)-112`($key),$rndkey[1]
  1215. aesenc $rndkey[0],$iv
  1216. movups `32+16*($k+1)-112`($key),$rndkey[0]
  1217. aesenc $rndkey[1],$iv
  1218. je .Laesenclast$sn
  1219. movups `32+16*($k+2)-112`($key),$rndkey[1]
  1220. aesenc $rndkey[0],$iv
  1221. movups `32+16*($k+3)-112`($key),$rndkey[0]
  1222. aesenc $rndkey[1],$iv
  1223. .Laesenclast$sn:
  1224. aesenclast $rndkey[0],$iv
  1225. movups 16-112($key),$rndkey[1] # forward reference
  1226. nop
  1227. ___
  1228. } else {
  1229. $code.=<<___;
  1230. movups `32+16*$k-112`($key),$rndkey[1]
  1231. aesenc $rndkey[0],$iv
  1232. ___
  1233. }
  1234. $r++; unshift(@rndkey,pop(@rndkey));
  1235. };
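#
# $aesenc() doles the AES-CBC rounds out one at a time: $r/10 selects
# which of the four 16-byte blocks within the current 64-byte SHA256
# block is being encrypted, $r%10 the round within it, with the extra
# rounds of AES-192/-256 handled by the $k==9 tail. Four blocks times
# ten slots is the 40 in "while ($r<40)" at the end of the SHA loop.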
  1236. if ($shaext) {
  1237. my $Tbl="%rax";
  1238. $code.=<<___;
  1239. .type ${func}_shaext,\@function,6
  1240. .align 32
  1241. ${func}_shaext:
  1242. .cfi_startproc
  1243. mov `($win64?56:8)`(%rsp),$inp # load 7th argument
  1244. ___
  1245. $code.=<<___ if ($win64);
  1246. lea `-8-10*16`(%rsp),%rsp
  1247. movaps %xmm6,-8-10*16(%rax)
  1248. movaps %xmm7,-8-9*16(%rax)
  1249. movaps %xmm8,-8-8*16(%rax)
  1250. movaps %xmm9,-8-7*16(%rax)
  1251. movaps %xmm10,-8-6*16(%rax)
  1252. movaps %xmm11,-8-5*16(%rax)
  1253. movaps %xmm12,-8-4*16(%rax)
  1254. movaps %xmm13,-8-3*16(%rax)
  1255. movaps %xmm14,-8-2*16(%rax)
  1256. movaps %xmm15,-8-1*16(%rax)
  1257. .Lprologue_shaext:
  1258. ___
  1259. $code.=<<___;
  1260. lea K256+0x80(%rip),$Tbl
  1261. movdqu ($ctx),$ABEF # DCBA
  1262. movdqu 16($ctx),$CDGH # HGFE
  1263. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  1264. mov 240($key),$rounds
  1265. sub $in0,$out
  1266. movups ($key),$rndkey0 # $key[0]
  1267. movups ($ivp),$iv # load IV
  1268. movups 16($key),$rndkey[0] # forward reference
  1269. lea 112($key),$key # size optimization
  1270. pshufd \$0x1b,$ABEF,$Wi # ABCD
  1271. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  1272. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  1273. movdqa $TMP,$BSWAP # offload
  1274. palignr \$8,$CDGH,$ABEF # ABEF
  1275. punpcklqdq $Wi,$CDGH # CDGH
  1276. jmp .Loop_shaext
  1277. .align 16
  1278. .Loop_shaext:
  1279. movdqu ($inp),@MSG[0]
  1280. movdqu 0x10($inp),@MSG[1]
  1281. movdqu 0x20($inp),@MSG[2]
  1282. pshufb $TMP,@MSG[0]
  1283. movdqu 0x30($inp),@MSG[3]
  1284. movdqa 0*32-0x80($Tbl),$Wi
  1285. paddd @MSG[0],$Wi
  1286. pshufb $TMP,@MSG[1]
  1287. movdqa $CDGH,$CDGH_SAVE # offload
  1288. movdqa $ABEF,$ABEF_SAVE # offload
  1289. ___
  1290. &$aesenc();
  1291. $code.=<<___;
  1292. sha256rnds2 $ABEF,$CDGH # 0-3
  1293. pshufd \$0x0e,$Wi,$Wi
  1294. ___
  1295. &$aesenc();
  1296. $code.=<<___;
  1297. sha256rnds2 $CDGH,$ABEF
  1298. movdqa 1*32-0x80($Tbl),$Wi
  1299. paddd @MSG[1],$Wi
  1300. pshufb $TMP,@MSG[2]
  1301. lea 0x40($inp),$inp
  1302. ___
  1303. &$aesenc();
  1304. $code.=<<___;
  1305. sha256rnds2 $ABEF,$CDGH # 4-7
  1306. pshufd \$0x0e,$Wi,$Wi
  1307. ___
  1308. &$aesenc();
  1309. $code.=<<___;
  1310. sha256rnds2 $CDGH,$ABEF
  1311. movdqa 2*32-0x80($Tbl),$Wi
  1312. paddd @MSG[2],$Wi
  1313. pshufb $TMP,@MSG[3]
  1314. sha256msg1 @MSG[1],@MSG[0]
  1315. ___
  1316. &$aesenc();
  1317. $code.=<<___;
  1318. sha256rnds2 $ABEF,$CDGH # 8-11
  1319. pshufd \$0x0e,$Wi,$Wi
  1320. movdqa @MSG[3],$TMP
  1321. palignr \$4,@MSG[2],$TMP
  1322. paddd $TMP,@MSG[0]
  1323. ___
  1324. &$aesenc();
  1325. $code.=<<___;
  1326. sha256rnds2 $CDGH,$ABEF
  1327. movdqa 3*32-0x80($Tbl),$Wi
  1328. paddd @MSG[3],$Wi
  1329. sha256msg2 @MSG[3],@MSG[0]
  1330. sha256msg1 @MSG[2],@MSG[1]
  1331. ___
  1332. &$aesenc();
  1333. $code.=<<___;
  1334. sha256rnds2 $ABEF,$CDGH # 12-15
  1335. pshufd \$0x0e,$Wi,$Wi
  1336. ___
  1337. &$aesenc();
  1338. $code.=<<___;
  1339. movdqa @MSG[0],$TMP
  1340. palignr \$4,@MSG[3],$TMP
  1341. paddd $TMP,@MSG[1]
  1342. sha256rnds2 $CDGH,$ABEF
  1343. ___
  1344. for($i=4;$i<16-3;$i++) {
  1345. &$aesenc() if (($r%10)==0);
  1346. $code.=<<___;
  1347. movdqa $i*32-0x80($Tbl),$Wi
  1348. paddd @MSG[0],$Wi
  1349. sha256msg2 @MSG[0],@MSG[1]
  1350. sha256msg1 @MSG[3],@MSG[2]
  1351. ___
  1352. &$aesenc();
  1353. $code.=<<___;
  1354. sha256rnds2 $ABEF,$CDGH # 16-19...
  1355. pshufd \$0x0e,$Wi,$Wi
  1356. movdqa @MSG[1],$TMP
  1357. palignr \$4,@MSG[0],$TMP
  1358. paddd $TMP,@MSG[2]
  1359. ___
  1360. &$aesenc();
  1361. &$aesenc() if ($r==19);
  1362. $code.=<<___;
  1363. sha256rnds2 $CDGH,$ABEF
  1364. ___
  1365. push(@MSG,shift(@MSG));
  1366. }
  1367. $code.=<<___;
  1368. movdqa 13*32-0x80($Tbl),$Wi
  1369. paddd @MSG[0],$Wi
  1370. sha256msg2 @MSG[0],@MSG[1]
  1371. sha256msg1 @MSG[3],@MSG[2]
  1372. ___
  1373. &$aesenc();
  1374. $code.=<<___;
  1375. sha256rnds2 $ABEF,$CDGH # 52-55
  1376. pshufd \$0x0e,$Wi,$Wi
  1377. movdqa @MSG[1],$TMP
  1378. palignr \$4,@MSG[0],$TMP
  1379. paddd $TMP,@MSG[2]
  1380. ___
  1381. &$aesenc();
  1382. &$aesenc();
  1383. $code.=<<___;
  1384. sha256rnds2 $CDGH,$ABEF
  1385. movdqa 14*32-0x80($Tbl),$Wi
  1386. paddd @MSG[1],$Wi
  1387. sha256msg2 @MSG[1],@MSG[2]
  1388. movdqa $BSWAP,$TMP
  1389. ___
  1390. &$aesenc();
  1391. $code.=<<___;
  1392. sha256rnds2 $ABEF,$CDGH # 56-59
  1393. pshufd \$0x0e,$Wi,$Wi
  1394. ___
  1395. &$aesenc();
  1396. $code.=<<___;
  1397. sha256rnds2 $CDGH,$ABEF
  1398. movdqa 15*32-0x80($Tbl),$Wi
  1399. paddd @MSG[2],$Wi
  1400. ___
  1401. &$aesenc();
  1402. &$aesenc();
  1403. $code.=<<___;
  1404. sha256rnds2 $ABEF,$CDGH # 60-63
  1405. pshufd \$0x0e,$Wi,$Wi
  1406. ___
  1407. &$aesenc();
  1408. $code.=<<___;
  1409. sha256rnds2 $CDGH,$ABEF
  1410. #pxor $CDGH,$rndkey0 # black magic
  1411. ___
  1412. while ($r<40) { &$aesenc(); } # remaining aesenc's
  1413. $code.=<<___;
  1414. #xorps $CDGH,$rndkey0 # black magic
  1415. paddd $CDGH_SAVE,$CDGH
  1416. paddd $ABEF_SAVE,$ABEF
  1417. dec $len
  1418. movups $iv,48($out,$in0) # write output
  1419. lea 64($in0),$in0
  1420. jnz .Loop_shaext
  1421. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  1422. pshufd \$0x1b,$ABEF,$TMP # FEBA
  1423. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  1424. punpckhqdq $CDGH,$ABEF # DCBA
  1425. palignr \$8,$TMP,$CDGH # HGFE
  1426. movups $iv,($ivp) # write IV
  1427. movdqu $ABEF,($ctx)
  1428. movdqu $CDGH,16($ctx)
  1429. ___
  1430. $code.=<<___ if ($win64);
  1431. movaps 0*16(%rsp),%xmm6
  1432. movaps 1*16(%rsp),%xmm7
  1433. movaps 2*16(%rsp),%xmm8
  1434. movaps 3*16(%rsp),%xmm9
  1435. movaps 4*16(%rsp),%xmm10
  1436. movaps 5*16(%rsp),%xmm11
  1437. movaps 6*16(%rsp),%xmm12
  1438. movaps 7*16(%rsp),%xmm13
  1439. movaps 8*16(%rsp),%xmm14
  1440. movaps 9*16(%rsp),%xmm15
  1441. lea 8+10*16(%rsp),%rsp
  1442. .Lepilogue_shaext:
  1443. ___
  1444. $code.=<<___;
  1445. ret
  1446. .cfi_endproc
  1447. .size ${func}_shaext,.-${func}_shaext
  1448. ___
  1449. }
  1450. }}}}}
  1451. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1452. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1453. if ($win64 && $avx) {
  1454. $rec="%rcx";
  1455. $frame="%rdx";
  1456. $context="%r8";
  1457. $disp="%r9";
  1458. $code.=<<___;
  1459. .extern __imp_RtlVirtualUnwind
  1460. .type se_handler,\@abi-omnipotent
  1461. .align 16
  1462. se_handler:
  1463. push %rsi
  1464. push %rdi
  1465. push %rbx
  1466. push %rbp
  1467. push %r12
  1468. push %r13
  1469. push %r14
  1470. push %r15
  1471. pushfq
  1472. sub \$64,%rsp
  1473. mov 120($context),%rax # pull context->Rax
  1474. mov 248($context),%rbx # pull context->Rip
  1475. mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
  1477. mov 0(%r11),%r10d # HandlerData[0]
  1478. lea (%rsi,%r10),%r10 # prologue label
  1479. cmp %r10,%rbx # context->Rip<prologue label
  1480. jb .Lin_prologue
  1481. mov 152($context),%rax # pull context->Rsp
  1482. mov 4(%r11),%r10d # HandlerData[1]
  1483. lea (%rsi,%r10),%r10 # epilogue label
  1484. cmp %r10,%rbx # context->Rip>=epilogue label
  1485. jae .Lin_prologue
  1486. ___
  1487. $code.=<<___ if ($shaext);
  1488. lea aesni_cbc_sha256_enc_shaext(%rip),%r10
  1489. cmp %r10,%rbx
  1490. jb .Lnot_in_shaext
  1491. lea (%rax),%rsi
  1492. lea 512($context),%rdi # &context.Xmm6
  1493. mov \$20,%ecx
  1494. .long 0xa548f3fc # cld; rep movsq
  1495. lea 168(%rax),%rax # adjust stack pointer
  1496. jmp .Lin_prologue
  1497. .Lnot_in_shaext:
  1498. ___
  1499. $code.=<<___ if ($avx>1);
  1500. lea .Lavx2_shortcut(%rip),%r10
  1501. cmp %r10,%rbx # context->Rip<avx2_shortcut
  1502. jb .Lnot_in_avx2
  1503. and \$-256*$SZ,%rax
  1504. add \$`2*$SZ*($rounds-8)`,%rax
  1505. .Lnot_in_avx2:
  1506. ___
  1507. $code.=<<___;
  1508. mov %rax,%rsi # put aside Rsp
  1509. mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
  1510. mov -8(%rax),%rbx
  1511. mov -16(%rax),%rbp
  1512. mov -24(%rax),%r12
  1513. mov -32(%rax),%r13
  1514. mov -40(%rax),%r14
  1515. mov -48(%rax),%r15
  1516. mov %rbx,144($context) # restore context->Rbx
  1517. mov %rbp,160($context) # restore context->Rbp
  1518. mov %r12,216($context) # restore context->R12
  1519. mov %r13,224($context) # restore context->R13
  1520. mov %r14,232($context) # restore context->R14
  1521. mov %r15,240($context) # restore context->R15
  1522. lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
  1523. lea 512($context),%rdi # &context.Xmm6
  1524. mov \$20,%ecx
  1525. .long 0xa548f3fc # cld; rep movsq
  1526. .Lin_prologue:
  1527. mov 8(%rax),%rdi
  1528. mov 16(%rax),%rsi
  1529. mov %rax,152($context) # restore context->Rsp
  1530. mov %rsi,168($context) # restore context->Rsi
  1531. mov %rdi,176($context) # restore context->Rdi
  1532. mov 40($disp),%rdi # disp->ContextRecord
  1533. mov $context,%rsi # context
  1534. mov \$154,%ecx # sizeof(CONTEXT)
  1535. .long 0xa548f3fc # cld; rep movsq
  1536. mov $disp,%rsi
  1537. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1538. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1539. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1540. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1541. mov 40(%rsi),%r10 # disp->ContextRecord
  1542. lea 56(%rsi),%r11 # &disp->HandlerData
  1543. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1544. mov %r10,32(%rsp) # arg5
  1545. mov %r11,40(%rsp) # arg6
  1546. mov %r12,48(%rsp) # arg7
  1547. mov %rcx,56(%rsp) # arg8, (NULL)
  1548. call *__imp_RtlVirtualUnwind(%rip)
  1549. mov \$1,%eax # ExceptionContinueSearch
  1550. add \$64,%rsp
  1551. popfq
  1552. pop %r15
  1553. pop %r14
  1554. pop %r13
  1555. pop %r12
  1556. pop %rbp
  1557. pop %rbx
  1558. pop %rdi
  1559. pop %rsi
  1560. ret
  1561. .size se_handler,.-se_handler
  1562. .section .pdata
  1563. .rva .LSEH_begin_${func}_xop
  1564. .rva .LSEH_end_${func}_xop
  1565. .rva .LSEH_info_${func}_xop
  1566. .rva .LSEH_begin_${func}_avx
  1567. .rva .LSEH_end_${func}_avx
  1568. .rva .LSEH_info_${func}_avx
  1569. ___
  1570. $code.=<<___ if ($avx>1);
  1571. .rva .LSEH_begin_${func}_avx2
  1572. .rva .LSEH_end_${func}_avx2
  1573. .rva .LSEH_info_${func}_avx2
  1574. ___
  1575. $code.=<<___ if ($shaext);
  1576. .rva .LSEH_begin_${func}_shaext
  1577. .rva .LSEH_end_${func}_shaext
  1578. .rva .LSEH_info_${func}_shaext
  1579. ___
  1580. $code.=<<___;
  1581. .section .xdata
  1582. .align 8
  1583. .LSEH_info_${func}_xop:
  1584. .byte 9,0,0,0
  1585. .rva se_handler
  1586. .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
  1587. .LSEH_info_${func}_avx:
  1588. .byte 9,0,0,0
  1589. .rva se_handler
  1590. .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
  1591. ___
  1592. $code.=<<___ if ($avx>1);
  1593. .LSEH_info_${func}_avx2:
  1594. .byte 9,0,0,0
  1595. .rva se_handler
  1596. .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
  1597. ___
  1598. $code.=<<___ if ($shaext);
  1599. .LSEH_info_${func}_shaext:
  1600. .byte 9,0,0,0
  1601. .rva se_handler
  1602. .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
  1603. ___
  1604. }
  1605. ####################################################################
  1606. sub rex {
  1607. local *opcode=shift;
  1608. my ($dst,$src)=@_;
  1609. my $rex=0;
  1610. $rex|=0x04 if($dst>=8);
  1611. $rex|=0x01 if($src>=8);
  1612. unshift @opcode,$rex|0x40 if($rex);
  1613. }
  1614. {
  1615. my %opcodelet = (
  1616. "sha256rnds2" => 0xcb,
  1617. "sha256msg1" => 0xcc,
  1618. "sha256msg2" => 0xcd );
  1619. sub sha256op38 {
  1620. my $instr = shift;
  1621. if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
  1622. my @opcode=(0x0f,0x38);
  1623. rex(\@opcode,$2,$1);
  1624. push @opcode,$opcodelet{$instr};
  1625. push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
  1626. return ".byte\t".join(',',@opcode);
  1627. } else {
  1628. return $instr."\t".@_[0];
  1629. }
  1630. }
  1631. }
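#
# For example "sha256rnds2 %xmm1,%xmm2" ($ABEF,$CDGH above) is turned
# into ".byte 0x0f,0x38,0xcb,0xd1", i.e. 0F 38 CB with ModR/M
# reg=dst=2, r/m=src=1, so the module assembles even with toolchains
# that do not know the SHA extension mnemonics.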
  1632. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1633. $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
  1634. print $code;
  1635. close STDOUT or die "error closing STDOUT: $!";