#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.
#
# Size/performance trade-off
# ====================================================================
# impl		size in bytes	comp cycles[*]	measured performance
# ====================================================================
# thumb		304		3212		4420
# armv4-small	392/+29%	1958/+64%	2250/+96%
# armv4-compact	740/+89%	1552/+26%	1840/+22%
# armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
# full unroll	~5100/+260%	~1260/+4%	~1300/+5%
# ====================================================================
# thumb		= same as 'small' but in Thumb instructions[**] and
#		  with recurring code in two private functions;
# small		= detached Xload/update, loops are folded;
# compact	= detached Xload/update, 5x unroll;
# large		= interleaved Xload/update, 5x unroll;
# full unroll	= interleaved Xload/update, full unroll, estimated[!];
#
# [*]	Manually counted instructions in "grand" loop body. Measured
#	performance is affected by prologue and epilogue overhead,
#	i-cache availability, branch penalties, etc.
# [**]	While each Thumb instruction is twice smaller, they are not as
#	diverse as ARM ones: e.g., there are only two arithmetic
#	instructions with 3 arguments, no [fixed] rotate, addressing
#	modes are limited. As result it takes more instructions to do
#	the same job in Thumb, therefore the code is never twice as
#	small and always slower.
# [***]	which is also ~35% better than compiler generated code. Dual-
#	issue Cortex A8 core was measured to process input block in
#	~990 cycles.

# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.

# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code.

# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
  61. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  62. open STDOUT,">$output";
  63. $ctx="r0";
  64. $inp="r1";
  65. $len="r2";
  66. $a="r3";
  67. $b="r4";
  68. $c="r5";
  69. $d="r6";
  70. $e="r7";
  71. $K="r8";
  72. $t0="r9";
  73. $t1="r10";
  74. $t2="r11";
  75. $t3="r12";
  76. $Xi="r14";
  77. @V=($a,$b,$c,$d,$e);
# Xupdate: emit one round's message-schedule update,
#   X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16], 1),
# interleaved with the round-common work E += K + ROR(A,27) + X[i].
# $opt1/$opt2 are round-specific F() instruction strings supplied by
# the BODY_* wrappers; they are spliced in to fill pipeline stalls.
# B..E are kept pre-rotated by 2 (the ror#2/ror#30 trick), hence the
# "$e,ror#2" operand when K is added.
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1					@ F_xx_xx
	$opt2					@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}
# BODY_00_15: rounds 0..15 — X[i] comes straight from the input block.
# Pre-ARMv7 assembles the big-endian word byte-by-byte (no unaligned
# ldr there); ARMv7+ uses an unaligned ldr plus rev on little-endian.
# F_00_19(B,C,D) = (B&C)|(~B&D), computed here as ((C^D)&B)^D.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0				@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}
# BODY_16_19: rounds 16..19 — schedule update via Xupdate, with the
# choose function F_00_19(B,C,D) = ((C^D)&B)^D finished afterwards.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}
# BODY_20_39: rounds 20..39 and 60..79 — parity function F = B^C^D.
# Xupdate already computed C^D into $t1; xor with B completes it.
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}
# BODY_40_59: rounds 40..59 — majority function
# F = (B&C)|(B&D)|(C&D), computed as ((C^D)&B) + (C&D); the two
# terms are added into E separately.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}
# ----- integer-only implementation -----
# The 80-word schedule lives in a sliding window on the stack; $Xi
# walks it downwards and "teq $Xi,sp" detects the end of each round
# group.  The carry flag distinguishes rounds 20-39 from 60-79 so one
# loop body serves both (the "spare 300 bytes" notes).  The prologue
# dispatches to the NEON or ARMv8 paths when OPENSSL_armcap_P says so.
$code=<<___;
#include "arm_arch.h"
.text
.code	32
.global	sha1_block_data_order
.type	sha1_block_data_order,%function
.align	5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
	sub	r3,pc,#8		@ sha1_block_data_order
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA1
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
# rounds 0..14, emitted five at a time with the rotating register set
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
___
# round 15 plus rounds 16..19, where the schedule update kicks in
&BODY_00_15(@V);	unshift(@V,pop(@V));
&BODY_16_19(@V);	unshift(@V,pop(@V));
&BODY_16_19(@V);	unshift(@V,pop(@V));
&BODY_16_19(@V);	unshift(@V,pop(@V));
&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;
	ldr	$K,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# rounds 20..39 (carry clear) and 60..79 (carry set) share this loop
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
# rounds 40..59
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
# epilogue: fold working vars back into the context, loop over blocks,
# and return in a way that stays ARMv4/Thumb interoperable
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]
	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha1_block_data_order,.-sha1_block_data_order
.align	5
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha1_block_data_order
#endif
.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;			# index into the q-register X[] window
my @X=map("q$_",(8..11,0..3));	# 16 schedule words held in q registers
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;			# scalar round counter, 0..79

# AUTOLOAD: thunk [simplified] x86-style perlasm.  Any call to an
# undefined sub such as &vadd_i32(...) lands here and is emitted as an
# assembly line: '_' in the name becomes '.', and a bare numeric last
# argument gets a '#' immediate prefix.
sub AUTOLOAD()
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# body_00_19: one scalar round of rounds 0..19, returned as a list of
# eval-able strings so the Xupdate_* subs can interleave them with
# NEON schedule instructions.  F_00_19(B,C,D) = (B&C)|(~B&D) via
# bic/and/eor; $Ki holds X[i]+K preloaded from the stack xfer area.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&bic	($t0,$d,$b)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t1,$c,$b)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$t1,$t0)',		# F_00_19
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_00_19
	'$j++;	unshift(@V,pop(@V));'
	)
}
# body_20_39: scalar round for rounds 20..39 and 60..79 — parity
# function F = B^C^D.  The X[i]+K preload is skipped for the very
# last round ($j==79), when there is nothing left to fetch.
sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&eor	($t0,$b,$d)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
	'&eor	($t1,$t0,$c)',		# F_20_39
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_20_39
	'$j++;	unshift(@V,pop(@V));'
	)
}
# body_40_59: scalar round for rounds 40..59 — majority function
# F = (B&C)|(B&D)|(C&D), computed as (C&D) + (B&(C^D)) and added to
# E in two steps.
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t0,$c,$d)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$c,$d)',
	'&add	($e,$e,$t0)',
	'&and	($t1,$t1,$b)',
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_40_59
	'$j++;	unshift(@V,pop(@V));'
	)
}
# Xupdate_16_31: NEON schedule update for W[16..31], four words per
# call (same pattern as sha1-586.pl).  The scalar round code produced
# by $body (4 rounds' worth of strings) is interleaved one eval at a
# time between the vector instructions to hide their latency.
sub Xupdate_16_31 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# four scalar rounds
  my ($a,$b,$c,$d,$e);

  &vext_8	(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  &vadd_i32	(@Tx[1],@X[-1&7],$K);
  eval(shift(@insns));
  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);	# next K every 5th call
  eval(shift(@insns));
  &vext_8	(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]
  eval(shift(@insns));
  eval(shift(@insns));
  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
  &sub		($Xfer,$Xfer,64) if ($Xi%4==0);	# wrap the 64-byte xfer ring
  eval(shift(@insns));
  eval(shift(@insns));
  &vext_8	(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
  eval(shift(@insns));
  eval(shift(@insns));
  &vadd_i32	(@X[0],@Tx[0],@Tx[0]);
  eval(shift(@insns));
  eval(shift(@insns));
  &vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  &vshr_u32	(@Tx[0],@Tx[1],30);
  eval(shift(@insns));
  eval(shift(@insns));
  &vshl_u32	(@Tx[1],@Tx[1],2);
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@X[0],@X[0],@Tx[0]);
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

  foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
# Xupdate_32_79: NEON schedule update for W[32..79].  From round 32
# the recurrence can be rewritten with X[-6]/X[-16]/X[-28]/X[-32] and
# a rotate by 2, which needs fewer steps than the 16_31 form.  Scalar
# round code from $body is interleaved as in Xupdate_16_31.
sub Xupdate_32_79 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# four scalar rounds
  my ($a,$b,$c,$d,$e);

  &vext_8	(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
  eval(shift(@insns));
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
  eval(shift(@insns));
  eval(shift(@insns));
  &veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
  eval(shift(@insns));
  eval(shift(@insns));
  &vadd_i32	(@Tx[1],@X[-1&7],$K);
  eval(shift(@insns));
  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);	# next K every 5th call
  eval(shift(@insns));
  &veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
  eval(shift(@insns));
  eval(shift(@insns));
  &vshr_u32	(@X[0],@Tx[0],30);
  eval(shift(@insns));
  eval(shift(@insns));
  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
  &sub		($Xfer,$Xfer,64) if ($Xi%4==0);	# wrap the 64-byte xfer ring
  eval(shift(@insns));
  eval(shift(@insns));
  &vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2

  foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}
# Xuplast_80: flush the final W[76..79]+K to the xfer area, then — in
# the same breath — start loading the next input block and rewind
# $K_XX_XX to K_00_19.  If this was the last block ($inp==$len) the
# input pointer is stepped back so the redundant load can't fault;
# the flags from &teq steer the epilogue after .Loop_neon.  Note that
# &sub/&teq/&subeq here are scalar ARM instructions via AUTOLOAD.
sub Xuplast_80 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# four scalar rounds
  my ($a,$b,$c,$d,$e);

  &vadd_i32	(@Tx[1],@X[-1&7],$K);
  eval(shift(@insns));
  eval(shift(@insns));
  &vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
  &sub		($Xfer,$Xfer,64);

  &teq		($inp,$len);
  &sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
  &subeq	($inp,$inp,64);		# reload last block to avoid SEGV
  &vld1_8	("{@X[-4&7]-@X[-3&7]}","[$inp]!");
  eval(shift(@insns));
  eval(shift(@insns));
  &vld1_8	("{@X[-2&7]-@X[-1&7]}","[$inp]!");
  eval(shift(@insns));
  eval(shift(@insns));
  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
  eval(shift(@insns));
  eval(shift(@insns));
  &vrev32_8	(@X[-4&7],@X[-4&7]);

  foreach (@insns) { eval; }		# remaining instructions

  $Xi=0;	# restart the schedule window for the next block
}
# Xloop: one group of four early rounds for the *next* block —
# byte-swap the freshly loaded X words, add K_00_19, and push the
# result to the xfer area for the scalar rounds, all interleaved
# with $body's scalar round code for the current block's tail.
sub Xloop()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# four scalar rounds
  my ($a,$b,$c,$d,$e);

  &vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
  eval(shift(@insns));
  eval(shift(@insns));
  &vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
  eval(shift(@insns));
  eval(shift(@insns));
  &vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");	# X[]+K xfer to IALU

  foreach (@insns) { eval; }

  $Xi++;
}
# NEON entry point: carve a 16-byte-aligned 64-byte xfer area off the
# stack, prefill it with W[0..11]+K, then run the fully unrolled
# 80-round loop assembled from the Xupdate_*/Xloop calls below.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon
.type	sha1_block_data_order_neon,%function
.align	4
sha1_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	@ dmb				@ errata #451034 on early Cortex A8
	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
	mov	$saved_sp,sp
	sub	sp,sp,#64		@ alloca
	adr	$K_XX_XX,.LK_00_19
	bic	sp,sp,#15		@ align for 128-bit stores
	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
	mov	$Xfer,sp
	vld1.8	{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
	veor	$zero,$zero,$zero
	vld1.8	{@X[-2&7]-@X[-1&7]},[$inp]!
	vld1.32	{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19
	vrev32.8	@X[-4&7],@X[-4&7]	@ yes, even on
	vrev32.8	@X[-3&7],@X[-3&7]	@ big-endian...
	vrev32.8	@X[-2&7],@X[-2&7]
	vadd.i32	@X[0],@X[-4&7],$K
	vrev32.8	@X[-1&7],@X[-1&7]
	vadd.i32	@X[1],@X[-3&7],$K
	vst1.32	{@X[0]},[$Xfer,:128]!
	vadd.i32	@X[2],@X[-2&7],$K
	vst1.32	{@X[1]},[$Xfer,:128]!
	vst1.32	{@X[2]},[$Xfer,:128]!
	ldr	$Ki,[sp]		@ big RAW stall
.Loop_neon:
___
# schedule updates for W[16..79] interleaved with scalar rounds 0..63;
# the trailing Xuplast_80/Xloop calls cover rounds 64..79 while the
# next block's W[0..15] is loaded and pre-added with K
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_32_79(\&body_00_19);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_20_39);
	&Xuplast_80(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
# epilogue: accumulate into the context; eq (set by Xuplast_80's teq)
# means last block, so restore sp, otherwise re-prime $Ki/$Xfer and loop
$code.=<<___;
	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context
	add	$a,$a,$Ki
	ldr	$Ki,[$ctx,#16]
	add	$b,$b,$t0
	add	$c,$c,$t1
	add	$d,$d,$Xfer
	moveq	sp,$saved_sp
	add	$e,$e,$Ki
	ldrne	$Ki,[sp]
	stmia	$ctx,{$a,$b,$c,$d,$e}
	addne	$Xfer,sp,#3*16
	bne	.Loop_neon
	@ vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r12,pc}
.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));		# four message-schedule quads
my @Kxx=map("q$_",(8..11));		# K_00_19..K_60_79, splatted
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));

# ARMv8 Crypto Extensions path: sha1c/sha1p/sha1m do four rounds per
# instruction, sha1h produces the rotated E, sha1su0/sha1su1 update
# the schedule.  The prologue below covers "round" 0; the Perl loop
# emits rounds 1..16 with the f-function picked per group of five.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.type	sha1_block_data_order_armv8,%function
.align	5
sha1_block_data_order_armv8:
.LARMv8:
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	veor	$E,$E,$E
	adr	r3,.LK_00_19
	vld1.32	{$ABCD},[$ctx]!
	vld1.32	{$E\[0]},[$ctx]
	sub	$ctx,$ctx,#16
	vld1.32	{@Kxx[0]\[]},[r3,:32]!
	vld1.32	{@Kxx[1]\[]},[r3,:32]!
	vld1.32	{@Kxx[2]\[]},[r3,:32]!
	vld1.32	{@Kxx[3]\[]},[r3,:32]
.Loop_v8:
	vld1.8	{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8	{@MSG[2]-@MSG[3]},[$inp]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vadd.i32	$W0,@Kxx[0],@MSG[0]
	vrev32.8	@MSG[2],@MSG[2]
	vmov	$ABCD_SAVE,$ABCD	@ offload
	subs	$len,$len,#1
	vadd.i32	$W1,@Kxx[0],@MSG[1]
	vrev32.8	@MSG[3],@MSG[3]
	sha1h	$E1,$ABCD		@ 0
	sha1c	$ABCD,$E,$W0
	vadd.i32	$W0,@Kxx[$j],@MSG[2]
	sha1su0	@MSG[0],@MSG[1],@MSG[2]
___
# rounds 1..16: f = choose/parity/majority/parity per group of five;
# E0/E1 and W0/W1 ping-pong, @MSG rotates, $j advances K every 5th
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
	sha1h	$E0,$ABCD		@ $i
	sha1$f	$ABCD,$E1,$W1
	vadd.i32	$W1,@Kxx[$j],@MSG[3]
	sha1su1	@MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
	sha1su0	@MSG[1],@MSG[2],@MSG[3]
___
	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);
	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);
}
# rounds 17..19, accumulate, and loop until $len blocks are done
$code.=<<___;
	sha1h	$E0,$ABCD		@ $i
	sha1p	$ABCD,$E1,$W1
	vadd.i32	$W1,@Kxx[$j],@MSG[3]
	sha1h	$E1,$ABCD		@ 18
	sha1p	$ABCD,$E0,$W0
	sha1h	$E0,$ABCD		@ 19
	sha1p	$ABCD,$E1,$W1
	vadd.i32	$E,$E,$E0
	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	bne	.Loop_v8
	vst1.32	{$ABCD},[$ctx]!
	vst1.32	{$E\[0]},[$ctx]
	vldmia	sp!,{d8-d15}
	ret					@ bx lr
.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
# OPENSSL_armcap_P is referenced PC-relative by the dispatch code in
# sha1_block_data_order above.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___
  589. { my %opcode = (
  590. "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
  591. "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
  592. "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
  593. sub unsha1 {
  594. my ($mnemonic,$arg)=@_;
  595. if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
  596. my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
  597. |(($2&7)<<17)|(($2&8)<<4)
  598. |(($3&7)<<1) |(($3&8)<<2);
  599. # since ARMv7 instructions are always encoded little-endian.
  600. # correct solution is to use .inst directive, but older
  601. # assemblers don't implement it:-(
  602. sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
  603. $word&0xff,($word>>8)&0xff,
  604. ($word>>16)&0xff,($word>>24)&0xff,
  605. $mnemonic,$arg;
  606. }
  607. }
  608. }
  609. foreach (split($/,$code)) {
  610. s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
  611. s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
  612. s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
  613. s/\bret\b/bx lr/o or
  614. s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
  615. print $_,$/;
  616. }
  617. close STDOUT; # enforce flush