#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
#
# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles, or 23% faster than the integer-only code. A
# Snapdragon S4 does it in 12.5 cycles too, but there the gain over
# integer-only code is 50% (meaning the integer-only code performs
# sub-optimally on that core; nothing was done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
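
# For reference, the counts above are the FIPS 180-4 rotate/shift
# amounts: Sigma0/Sigma1 are three rotates, sigma0/sigma1 two rotates
# plus a logical right shift. A minimal sketch (never called by the
# generator; _ror32/_Sigma1/_sigma0 are hypothetical helper names):
sub _ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub _Sigma1 { my $x=shift; _ror32($x,$Sigma1[0])^_ror32($x,$Sigma1[1])^_ror32($x,$Sigma1[2]); }
sub _sigma0 { my $x=shift; _ror32($x,$sigma0[0])^_ror32($x,$sigma0[1])^($x>>$sigma0[2]); }
# The assembly below needs only two eor plus one final rotate per
# Sigma, because rotation distributes over xor:
#   Sigma1(e) = ror(e ^ ror(e,11-6) ^ ror(e,25-6), 6)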
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
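
# What one BODY_00_15 expansion computes, in plain terms (sketch only,
# never called; _Ch/_Maj are hypothetical helpers):
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i];  d += T1;
#   h  = T1 + Sigma0(a) + Maj(a,b,c)
# except that the Maj(a,b,c) addition is deferred to the *next* round
# ("h+=Maj(a,b,c) from the past"), and Maj is computed incrementally
# as ((b^c)&(a^b))^b, reusing b^c from the previous round via $t2/$t3.
sub _Ch  { my ($x,$y,$z)=@_; (($x&$y)^((~$x)&$z))&0xffffffff; }
sub _Maj { my ($x,$y,$z)=@_; ((($y^$z)&($x^$y))^$y)&0xffffffff; }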
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
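
# BODY_16_XX implements the schedule recurrence (sketch only, never
# called; _sched is a hypothetical helper, _ror32/_sigma0 are the
# sketches above):
#   X[i&15] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# with all indices taken mod 16, since only a 16-word window of the
# schedule is kept on the stack.
sub _sigma1 { my $x=shift; _ror32($x,$sigma1[0])^_ror32($x,$sigma1[1])^($x>>$sigma1[2]); }
sub _sched  {
	my ($X,$i)=@_;		# $X is a reference to the 16-word window
	($X->[$i%16] + _sigma0($X->[($i+1)%16]) + $X->[($i+9)%16]
		     + _sigma1($X->[($i+14)%16]))&0xffffffff;
}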
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax	unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
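
# unshift(@V,pop(@V)) rotates the register-name list, e.g. (A..H)
# becomes (H,A,...,G): registers are renamed between rounds instead of
# data being moved. Only 32 round bodies are emitted; .Lrounds_16_xx
# runs three times for rounds 16..63, and the $i==31 check above exits
# when the low byte of the K256 word just fetched is 0xf2, which only
# matches the last entry, K256[63]=0xc67178f2.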
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
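
# A q register aliases a pair of d registers: Dlo("q1") is "d2" and
# Dhi("q1") is "d3". The sigma1 computation below works on d registers
# because it touches only two message words at a time.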
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
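
# AUTOLOAD catches every call to an undefined sub, turns underscores
# into dots and prefixes a numeric last argument with '#'. So, for
# example, &vext_8($T0,@X[0],@X[1],4) appends "\tvext.8\tq8,q0,q1,#4\n"
# to $code (given the register names assigned above).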
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));			# "rotate" X[]
}
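
# Each Xupdate call advances the schedule by four words. NEON has no
# rotate instruction, so ror is synthesized as vshr (logical right
# shift) followed by vsli (shift left and insert). Scalar round
# instructions from body_00_15 are interleaved two at a time between
# vector instructions to keep both pipelines busy.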
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }		# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));			# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
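
# body_00_15 returns a list of instruction-emitting strings; Xupdate
# and Xpreload eval a couple of them between consecutive NEON ops, so
# four scalar rounds are woven through each vector schedule update.
# The trailing element rotates @V and swaps $t2/$t3 exactly like the
# integer-only code does.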
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str	$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$inp,[sp,#68]
	mov	$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str	$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0			@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64		@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!		@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0		@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]		@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
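
# The vmov before each sha256h pair is required: sha256h overwrites
# its ABCD operand, while sha256h2 still needs the pre-round ABCD
# value, so it reads the copy stashed in $abcd instead.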
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}

$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # Emit raw bytes, since ARMv7 instructions are always encoded
	    # little-endian. The correct solution would be the .inst
	    # directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
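
# Worked example of the encoding above (an illustration, assuming the
# opcode table is right): "sha256h q0,q1,q8" gives
#   0xf3000c40|((1&7)<<17)|((8&8)<<2) = 0xf3020c60
# which is emitted little-endian as INST(0x60,0x0c,0x02,0xf3).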
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
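
# Note the `or` between the last two substitutions: a line that
# contained `ret` keeps its freshly produced `bx lr` (that is the
# ARMv8-only path), while a pre-existing `bx lr` is degraded to the
# raw opcode .word 0xe12fff1e so the module still assembles with
# -march=armv4.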
close STDOUT;			# enforce flush