keccak1600-armv8.pl 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for ARMv8.
  17. #
  18. # June 2017.
  19. #
  20. # This is a straightforward KECCAK_1X_ALT implementation. It makes no
  21. # sense to attempt a SIMD/NEON implementation for the following reason.
  22. # 64-bit lanes of vector registers can't be addressed as easily as in
  23. # 32-bit mode. This means that 64-bit NEON is bound to be slower than
  24. # 32-bit NEON, and this implementation is faster than 32-bit NEON on
  25. # same processor. Even though it takes more scalar xor's and andn's,
  26. # it gets compensated by availability of rotate. Not to forget that
  27. # most processors achieve higher issue rate with scalar instructions.
  28. #
  29. # February 2018.
  30. #
  31. # Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
  32. # variant with register permutation/rotation twist that allows to
  33. # eliminate copies to temporary registers. If you look closely you'll
  34. # notice that it uses only one lane of vector registers. The new
  35. # instructions effectively facilitate parallel hashing, which we don't
  36. # support [yet?]. But lowest-level core procedure is prepared for it.
  37. # The inner round is 67 [vector] instructions, so it's not actually
  38. # obvious that it will provide performance improvement [in serial
  39. # hash] as long as vector instructions issue rate is limited to 1 per
  40. # cycle...
  41. #
  42. ######################################################################
  43. # Numbers are cycles per processed byte.
  44. #
  45. # r=1088(*)
  46. #
  47. # Cortex-A53 13
  48. # Cortex-A57 12
  49. # X-Gene 14
  50. # Mongoose 10
  51. # Kryo 12
  52. # Denver 7.8
  53. # Apple A7 7.2
  54. #
  55. # (*) Corresponds to SHA3-256. No improvement coefficients are listed
  56. # because they vary too much from compiler to compiler. A newer
  57. # compiler does much better, and the improvement varies from 5% on
  58. # Cortex-A57 to 25% on Cortex-A53, while in comparison to an older
  59. # compiler this code is at least 2x faster...
  # Command line (perlasm convention): the first argument is the assembler
  # "flavour" and the second is the output file; both are passed through
  # unchanged to arm-xlate.pl.
  $flavour = shift;
  $output  = shift;

  # Locate arm-xlate.pl: first next to this script, then in ../../perlasm/
  # relative to the script's directory.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  die "can't locate arm-xlate.pl";

  # Everything printed to STDOUT from here on is piped through the
  # translator, run with the same perl binary ($^X).  The open is checked:
  # an unchecked failure would leave STDOUT aliased to a dead handle and
  # the generated code would be lost without any diagnostic.
  open OUT,"| \"$^X\" $xlate $flavour $output"
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;

  # Rho rotation amounts, indexed the same way as the state lanes:
  # $rhotates[y][x] is the left-rotate count applied to lane A[y][x]
  # (the generated code uses "ror ...,#64-$rhotates[y][x]").
  my @rhotates = ([  0,  1, 62, 28, 27 ],
                  [ 36, 44,  6, 55, 20 ],
                  [  3, 10, 43, 25, 39 ],
                  [ 41, 45, 15, 21,  8 ],
                  [ 18,  2, 61, 56, 14 ]);
  # Start of the generated assembly: the .text prologue followed by `iotas`,
  # the table of 24 64-bit round constants (one is consumed per round by the
  # "Iota" step of each permutation routine).
  #
  # Layout note: eight zero quads (64 bytes) of padding precede the 24
  # constants (192 bytes), 256 bytes in total after the `.align 8`.
  # Assuming `.align 8` requests 2^8-byte alignment (the ARM GNU as
  # convention -- TODO confirm for other assemblers), the address just past
  # the last constant has its low 8 bits clear, which is what the scalar
  # round loop tests with `tst ...,#255` to detect that all constants have
  # been consumed.
  73. $code.=<<___;
  74. .text
  75. .align 8 // strategic alignment and padding that allows to use
  76. // address value as loop termination condition...
  77. .quad 0,0,0,0,0,0,0,0
  78. .type iotas,%object
  79. iotas:
  80. .quad 0x0000000000000001
  81. .quad 0x0000000000008082
  82. .quad 0x800000000000808a
  83. .quad 0x8000000080008000
  84. .quad 0x000000000000808b
  85. .quad 0x0000000080000001
  86. .quad 0x8000000080008081
  87. .quad 0x8000000000008009
  88. .quad 0x000000000000008a
  89. .quad 0x0000000000000088
  90. .quad 0x0000000080008009
  91. .quad 0x000000008000000a
  92. .quad 0x000000008000808b
  93. .quad 0x800000000000008b
  94. .quad 0x8000000000008089
  95. .quad 0x8000000000008003
  96. .quad 0x8000000000008002
  97. .quad 0x8000000000000080
  98. .quad 0x000000000000800a
  99. .quad 0x800000008000000a
  100. .quad 0x8000000080008081
  101. .quad 0x8000000000008080
  102. .quad 0x0000000080000001
  103. .quad 0x8000000080008008
  104. .size iotas,.-iotas
  105. ___
  # ---------------------------------------------------------------------
  # Scalar (general-purpose register) implementation.  This scope emits
  # four routines: KeccakF1600_int (the round loop, state held in
  # registers), KeccakF1600 (loads the state from memory, runs the
  # permutation, stores it back), SHA3_absorb and SHA3_squeeze.
  # ---------------------------------------------------------------------
  106. {{{
  # Register map for the 5x5 state: lane A[y][x] lives in x(5*y+x), i.e.
  # x0..x24, except A[3][3], which is moved to x25 (see the comment on
  # that line).
  107. my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
  108. (0, 5, 10, 15, 20));
  109. $A[3][3] = "x25"; # x18 is reserved
  # Scratch registers C[0..3] = x26, x27, x28, x30.
  110. my @C = map("x$_", (26,27,28,30));
  # KeccakF1600_int, part 1: entry and the first Theta column XORs.  The
  # iotas pointer and x30 are stored at [sp,#16], and A[0][4]/A[1][4] are
  # stored at [sp,#0], into stack space owned by the caller's frame (the
  # callers below reserve it with "sub sp,sp,#48" / "sub sp,sp,#64").
  111. $code.=<<___;
  112. .type KeccakF1600_int,%function
  113. .align 5
  114. KeccakF1600_int:
  115. adr $C[2],iotas
  116. .inst 0xd503233f // paciasp
  117. stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
  118. b .Loop
  119. .align 4
  120. .Loop:
  121. ////////////////////////////////////////// Theta
  122. eor $C[0],$A[0][0],$A[1][0]
  123. stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
  124. eor $C[1],$A[0][1],$A[1][1]
  125. eor $C[2],$A[0][2],$A[1][2]
  126. eor $C[3],$A[0][3],$A[1][3]
  127. ___
  # Perl-side aliasing only (no instructions are emitted here): the
  # registers of A[0][4] and A[1][4] were just saved to the stack, so they
  # are reused as two extra scratch registers C[4] and C[5] for the rest of
  # Theta.
  128. $C[4]=$A[0][4];
  129. $C[5]=$A[1][4];
  # KeccakF1600_int, part 2: remainder of Theta for columns 0, 1 and 2,
  # written in terms of the temporary C[4]/C[5] aliases.
  130. $code.=<<___;
  131. eor $C[4],$A[0][4],$A[1][4]
  132. eor $C[0],$C[0],$A[2][0]
  133. eor $C[1],$C[1],$A[2][1]
  134. eor $C[2],$C[2],$A[2][2]
  135. eor $C[3],$C[3],$A[2][3]
  136. eor $C[4],$C[4],$A[2][4]
  137. eor $C[0],$C[0],$A[3][0]
  138. eor $C[1],$C[1],$A[3][1]
  139. eor $C[2],$C[2],$A[3][2]
  140. eor $C[3],$C[3],$A[3][3]
  141. eor $C[4],$C[4],$A[3][4]
  142. eor $C[0],$C[0],$A[4][0]
  143. eor $C[2],$C[2],$A[4][2]
  144. eor $C[1],$C[1],$A[4][1]
  145. eor $C[3],$C[3],$A[4][3]
  146. eor $C[4],$C[4],$A[4][4]
  147. eor $C[5],$C[0],$C[2],ror#63
  148. eor $A[0][1],$A[0][1],$C[5]
  149. eor $A[1][1],$A[1][1],$C[5]
  150. eor $A[2][1],$A[2][1],$C[5]
  151. eor $A[3][1],$A[3][1],$C[5]
  152. eor $A[4][1],$A[4][1],$C[5]
  153. eor $C[5],$C[1],$C[3],ror#63
  154. eor $C[2],$C[2],$C[4],ror#63
  155. eor $C[3],$C[3],$C[0],ror#63
  156. eor $C[4],$C[4],$C[1],ror#63
  157. eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
  158. eor $A[1][2],$A[1][2],$C[5]
  159. eor $A[2][2],$A[2][2],$C[5]
  160. eor $A[3][2],$A[3][2],$C[5]
  161. eor $A[4][2],$A[4][2],$C[5]
  162. eor $A[0][0],$A[0][0],$C[4]
  163. eor $A[1][0],$A[1][0],$C[4]
  164. eor $A[2][0],$A[2][0],$C[4]
  165. eor $A[3][0],$A[3][0],$C[4]
  166. eor $A[4][0],$A[4][0],$C[4]
  167. ___
  # End of the aliasing: the first instruction of the next chunk reloads
  # A[0][4]/A[1][4] from [sp,#0], so C[4]/C[5] must not be referenced again.
  168. $C[4]=undef;
  169. $C[5]=undef;
  # One long chunk containing: the rest of KeccakF1600_int (Theta for
  # columns 3 and 4, Rho+Pi, Chi+Iota, loop test and return), the whole of
  # KeccakF1600, and the prologue of SHA3_absorb up to the "len - bsz"
  # check at .Loop_absorb.
  170. $code.=<<___;
  171. ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
  172. eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
  173. eor $A[1][3],$A[1][3],$C[2]
  174. eor $A[2][3],$A[2][3],$C[2]
  175. eor $A[3][3],$A[3][3],$C[2]
  176. eor $A[4][3],$A[4][3],$C[2]
  177. eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
  178. eor $A[1][4],$A[1][4],$C[3]
  179. eor $A[2][4],$A[2][4],$C[3]
  180. eor $A[3][4],$A[3][4],$C[3]
  181. eor $A[4][4],$A[4][4],$C[3]
  182. ////////////////////////////////////////// Rho+Pi
  183. mov $C[3],$A[0][1]
  184. ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
  185. //mov $C[1],$A[0][2]
  186. ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
  187. //mov $C[0],$A[0][3]
  188. ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
  189. //mov $C[2],$A[0][4]
  190. ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
  191. ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
  192. ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
  193. ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
  194. ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
  195. ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
  196. ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
  197. ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
  198. ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
  199. ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
  200. ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
  201. ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
  202. ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
  203. ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
  204. ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
  205. ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
  206. ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
  207. ror $A[1][0],$C[0],#64-$rhotates[0][3]
  208. ror $A[2][0],$C[3],#64-$rhotates[0][1]
  209. ror $A[3][0],$C[2],#64-$rhotates[0][4]
  210. ror $A[4][0],$C[1],#64-$rhotates[0][2]
  211. ////////////////////////////////////////// Chi+Iota
  212. bic $C[0],$A[0][2],$A[0][1]
  213. bic $C[1],$A[0][3],$A[0][2]
  214. bic $C[2],$A[0][0],$A[0][4]
  215. bic $C[3],$A[0][1],$A[0][0]
  216. eor $A[0][0],$A[0][0],$C[0]
  217. bic $C[0],$A[0][4],$A[0][3]
  218. eor $A[0][1],$A[0][1],$C[1]
  219. ldr $C[1],[sp,#16]
  220. eor $A[0][3],$A[0][3],$C[2]
  221. eor $A[0][4],$A[0][4],$C[3]
  222. eor $A[0][2],$A[0][2],$C[0]
  223. ldr $C[3],[$C[1]],#8 // Iota[i++]
  224. bic $C[0],$A[1][2],$A[1][1]
  225. tst $C[1],#255 // are we done?
  226. str $C[1],[sp,#16]
  227. bic $C[1],$A[1][3],$A[1][2]
  228. bic $C[2],$A[1][0],$A[1][4]
  229. eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
  230. bic $C[3],$A[1][1],$A[1][0]
  231. eor $A[1][0],$A[1][0],$C[0]
  232. bic $C[0],$A[1][4],$A[1][3]
  233. eor $A[1][1],$A[1][1],$C[1]
  234. eor $A[1][3],$A[1][3],$C[2]
  235. eor $A[1][4],$A[1][4],$C[3]
  236. eor $A[1][2],$A[1][2],$C[0]
  237. bic $C[0],$A[2][2],$A[2][1]
  238. bic $C[1],$A[2][3],$A[2][2]
  239. bic $C[2],$A[2][0],$A[2][4]
  240. bic $C[3],$A[2][1],$A[2][0]
  241. eor $A[2][0],$A[2][0],$C[0]
  242. bic $C[0],$A[2][4],$A[2][3]
  243. eor $A[2][1],$A[2][1],$C[1]
  244. eor $A[2][3],$A[2][3],$C[2]
  245. eor $A[2][4],$A[2][4],$C[3]
  246. eor $A[2][2],$A[2][2],$C[0]
  247. bic $C[0],$A[3][2],$A[3][1]
  248. bic $C[1],$A[3][3],$A[3][2]
  249. bic $C[2],$A[3][0],$A[3][4]
  250. bic $C[3],$A[3][1],$A[3][0]
  251. eor $A[3][0],$A[3][0],$C[0]
  252. bic $C[0],$A[3][4],$A[3][3]
  253. eor $A[3][1],$A[3][1],$C[1]
  254. eor $A[3][3],$A[3][3],$C[2]
  255. eor $A[3][4],$A[3][4],$C[3]
  256. eor $A[3][2],$A[3][2],$C[0]
  257. bic $C[0],$A[4][2],$A[4][1]
  258. bic $C[1],$A[4][3],$A[4][2]
  259. bic $C[2],$A[4][0],$A[4][4]
  260. bic $C[3],$A[4][1],$A[4][0]
  261. eor $A[4][0],$A[4][0],$C[0]
  262. bic $C[0],$A[4][4],$A[4][3]
  263. eor $A[4][1],$A[4][1],$C[1]
  264. eor $A[4][3],$A[4][3],$C[2]
  265. eor $A[4][4],$A[4][4],$C[3]
  266. eor $A[4][2],$A[4][2],$C[0]
  267. bne .Loop
  268. ldr x30,[sp,#24]
  269. .inst 0xd50323bf // autiasp
  270. ret
  271. .size KeccakF1600_int,.-KeccakF1600_int
  272. .type KeccakF1600,%function
  273. .align 5
  274. KeccakF1600:
  275. .inst 0xd503233f // paciasp
  276. stp x29,x30,[sp,#-128]!
  277. add x29,sp,#0
  278. stp x19,x20,[sp,#16]
  279. stp x21,x22,[sp,#32]
  280. stp x23,x24,[sp,#48]
  281. stp x25,x26,[sp,#64]
  282. stp x27,x28,[sp,#80]
  283. sub sp,sp,#48
  284. str x0,[sp,#32] // offload argument
  285. mov $C[0],x0
  286. ldp $A[0][0],$A[0][1],[x0,#16*0]
  287. ldp $A[0][2],$A[0][3],[$C[0],#16*1]
  288. ldp $A[0][4],$A[1][0],[$C[0],#16*2]
  289. ldp $A[1][1],$A[1][2],[$C[0],#16*3]
  290. ldp $A[1][3],$A[1][4],[$C[0],#16*4]
  291. ldp $A[2][0],$A[2][1],[$C[0],#16*5]
  292. ldp $A[2][2],$A[2][3],[$C[0],#16*6]
  293. ldp $A[2][4],$A[3][0],[$C[0],#16*7]
  294. ldp $A[3][1],$A[3][2],[$C[0],#16*8]
  295. ldp $A[3][3],$A[3][4],[$C[0],#16*9]
  296. ldp $A[4][0],$A[4][1],[$C[0],#16*10]
  297. ldp $A[4][2],$A[4][3],[$C[0],#16*11]
  298. ldr $A[4][4],[$C[0],#16*12]
  299. bl KeccakF1600_int
  300. ldr $C[0],[sp,#32]
  301. stp $A[0][0],$A[0][1],[$C[0],#16*0]
  302. stp $A[0][2],$A[0][3],[$C[0],#16*1]
  303. stp $A[0][4],$A[1][0],[$C[0],#16*2]
  304. stp $A[1][1],$A[1][2],[$C[0],#16*3]
  305. stp $A[1][3],$A[1][4],[$C[0],#16*4]
  306. stp $A[2][0],$A[2][1],[$C[0],#16*5]
  307. stp $A[2][2],$A[2][3],[$C[0],#16*6]
  308. stp $A[2][4],$A[3][0],[$C[0],#16*7]
  309. stp $A[3][1],$A[3][2],[$C[0],#16*8]
  310. stp $A[3][3],$A[3][4],[$C[0],#16*9]
  311. stp $A[4][0],$A[4][1],[$C[0],#16*10]
  312. stp $A[4][2],$A[4][3],[$C[0],#16*11]
  313. str $A[4][4],[$C[0],#16*12]
  314. ldp x19,x20,[x29,#16]
  315. add sp,sp,#48
  316. ldp x21,x22,[x29,#32]
  317. ldp x23,x24,[x29,#48]
  318. ldp x25,x26,[x29,#64]
  319. ldp x27,x28,[x29,#80]
  320. ldp x29,x30,[sp],#128
  321. .inst 0xd50323bf // autiasp
  322. ret
  323. .size KeccakF1600,.-KeccakF1600
  324. .globl SHA3_absorb
  325. .type SHA3_absorb,%function
  326. .align 5
  327. SHA3_absorb:
  328. .inst 0xd503233f // paciasp
  329. stp x29,x30,[sp,#-128]!
  330. add x29,sp,#0
  331. stp x19,x20,[sp,#16]
  332. stp x21,x22,[sp,#32]
  333. stp x23,x24,[sp,#48]
  334. stp x25,x26,[sp,#64]
  335. stp x27,x28,[sp,#80]
  336. sub sp,sp,#64
  337. stp x0,x1,[sp,#32] // offload arguments
  338. stp x2,x3,[sp,#48]
  339. mov $C[0],x0 // uint64_t A[5][5]
  340. mov $C[1],x1 // const void *inp
  341. mov $C[2],x2 // size_t len
  342. mov $C[3],x3 // size_t bsz
  343. ldp $A[0][0],$A[0][1],[$C[0],#16*0]
  344. ldp $A[0][2],$A[0][3],[$C[0],#16*1]
  345. ldp $A[0][4],$A[1][0],[$C[0],#16*2]
  346. ldp $A[1][1],$A[1][2],[$C[0],#16*3]
  347. ldp $A[1][3],$A[1][4],[$C[0],#16*4]
  348. ldp $A[2][0],$A[2][1],[$C[0],#16*5]
  349. ldp $A[2][2],$A[2][3],[$C[0],#16*6]
  350. ldp $A[2][4],$A[3][0],[$C[0],#16*7]
  351. ldp $A[3][1],$A[3][2],[$C[0],#16*8]
  352. ldp $A[3][3],$A[3][4],[$C[0],#16*9]
  353. ldp $A[4][0],$A[4][1],[$C[0],#16*10]
  354. ldp $A[4][2],$A[4][3],[$C[0],#16*11]
  355. ldr $A[4][4],[$C[0],#16*12]
  356. b .Loop_absorb
  357. .align 4
  358. .Loop_absorb:
  359. subs $C[0],$C[2],$C[3] // len - bsz
  360. blo .Labsorbed
  361. str $C[0],[sp,#48] // save len - bsz
  362. ___
  # Absorb step, generated for input lanes 0..23 two at a time.  Each lane
  # is loaded from *inp++ (byte-swapped on big-endian builds) and XORed
  # into state lane A[i/5][i%5].  A single `cmp bsz,#8*(i+2)` serves both
  # exits of a pair: `blo` leaves after lane i when the block ends there,
  # and `beq` leaves after lane i+1; the intervening ldr/rev/eor do not
  # modify the flags.
  363. for (my $i=0; $i<24; $i+=2) {
  364. my $j = $i+1;
  365. $code.=<<___;
  366. ldr $C[0],[$C[1]],#8 // *inp++
  367. #ifdef __AARCH64EB__
  368. rev $C[0],$C[0]
  369. #endif
  370. eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
  371. cmp $C[3],#8*($i+2)
  372. blo .Lprocess_block
  373. ldr $C[0],[$C[1]],#8 // *inp++
  374. #ifdef __AARCH64EB__
  375. rev $C[0],$C[0]
  376. #endif
  377. eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
  378. beq .Lprocess_block
  379. ___
  380. }
  # Lane 24 (A[4][4]) is absorbed without a length check and falls through
  # into .Lprocess_block, which saves inp, runs the permutation, reloads
  # inp plus the saved (len - bsz, bsz) pair and loops.  .Labsorbed stores
  # the state back and returns the remaining length held in C[2].
  381. $code.=<<___;
  382. ldr $C[0],[$C[1]],#8 // *inp++
  383. #ifdef __AARCH64EB__
  384. rev $C[0],$C[0]
  385. #endif
  386. eor $A[4][4],$A[4][4],$C[0]
  387. .Lprocess_block:
  388. str $C[1],[sp,#40] // save inp
  389. bl KeccakF1600_int
  390. ldr $C[1],[sp,#40] // restore arguments
  391. ldp $C[2],$C[3],[sp,#48]
  392. b .Loop_absorb
  393. .align 4
  394. .Labsorbed:
  395. ldr $C[1],[sp,#32]
  396. stp $A[0][0],$A[0][1],[$C[1],#16*0]
  397. stp $A[0][2],$A[0][3],[$C[1],#16*1]
  398. stp $A[0][4],$A[1][0],[$C[1],#16*2]
  399. stp $A[1][1],$A[1][2],[$C[1],#16*3]
  400. stp $A[1][3],$A[1][4],[$C[1],#16*4]
  401. stp $A[2][0],$A[2][1],[$C[1],#16*5]
  402. stp $A[2][2],$A[2][3],[$C[1],#16*6]
  403. stp $A[2][4],$A[3][0],[$C[1],#16*7]
  404. stp $A[3][1],$A[3][2],[$C[1],#16*8]
  405. stp $A[3][3],$A[3][4],[$C[1],#16*9]
  406. stp $A[4][0],$A[4][1],[$C[1],#16*10]
  407. stp $A[4][2],$A[4][3],[$C[1],#16*11]
  408. str $A[4][4],[$C[1],#16*12]
  409. mov x0,$C[2] // return value
  410. ldp x19,x20,[x29,#16]
  411. add sp,sp,#64
  412. ldp x21,x22,[x29,#32]
  413. ldp x23,x24,[x29,#48]
  414. ldp x25,x26,[x29,#64]
  415. ldp x27,x28,[x29,#80]
  416. ldp x29,x30,[sp],#128
  417. .inst 0xd50323bf // autiasp
  418. ret
  419. .size SHA3_absorb,.-SHA3_absorb
  420. ___
  # SHA3_squeeze.  The four arguments are copied into x19..x22 (saved in
  # the prologue, restored at .Lsqueeze_done) so they survive the call to
  # KeccakF1600; x0 and x3 serve as the running state pointer and the
  # remaining-bytes-in-block counter and are reloaded from $A_flat/$bsz
  # after each permutation.  A final partial lane is written out one byte
  # at a time at .Lsqueeze_tail.
  421. {
  422. my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
  423. $code.=<<___;
  424. .globl SHA3_squeeze
  425. .type SHA3_squeeze,%function
  426. .align 5
  427. SHA3_squeeze:
  428. .inst 0xd503233f // paciasp
  429. stp x29,x30,[sp,#-48]!
  430. add x29,sp,#0
  431. stp x19,x20,[sp,#16]
  432. stp x21,x22,[sp,#32]
  433. mov $A_flat,x0 // put aside arguments
  434. mov $out,x1
  435. mov $len,x2
  436. mov $bsz,x3
  437. .Loop_squeeze:
  438. ldr x4,[x0],#8
  439. cmp $len,#8
  440. blo .Lsqueeze_tail
  441. #ifdef __AARCH64EB__
  442. rev x4,x4
  443. #endif
  444. str x4,[$out],#8
  445. subs $len,$len,#8
  446. beq .Lsqueeze_done
  447. subs x3,x3,#8
  448. bhi .Loop_squeeze
  449. mov x0,$A_flat
  450. bl KeccakF1600
  451. mov x0,$A_flat
  452. mov x3,$bsz
  453. b .Loop_squeeze
  454. .align 4
  455. .Lsqueeze_tail:
  456. strb w4,[$out],#1
  457. lsr x4,x4,#8
  458. subs $len,$len,#1
  459. beq .Lsqueeze_done
  460. strb w4,[$out],#1
  461. lsr x4,x4,#8
  462. subs $len,$len,#1
  463. beq .Lsqueeze_done
  464. strb w4,[$out],#1
  465. lsr x4,x4,#8
  466. subs $len,$len,#1
  467. beq .Lsqueeze_done
  468. strb w4,[$out],#1
  469. lsr x4,x4,#8
  470. subs $len,$len,#1
  471. beq .Lsqueeze_done
  472. strb w4,[$out],#1
  473. lsr x4,x4,#8
  474. subs $len,$len,#1
  475. beq .Lsqueeze_done
  476. strb w4,[$out],#1
  477. lsr x4,x4,#8
  478. subs $len,$len,#1
  479. beq .Lsqueeze_done
  480. strb w4,[$out],#1
  481. .Lsqueeze_done:
  482. ldp x19,x20,[sp,#16]
  483. ldp x21,x22,[sp,#32]
  484. ldp x29,x30,[sp],#48
  485. .inst 0xd50323bf // autiasp
  486. ret
  487. .size SHA3_squeeze,.-SHA3_squeeze
  488. ___
  489. } }}}
  # ---------------------------------------------------------------------
  # Hardware-assisted variant built on the eor3/rax1/xar/bcax instructions
  # (emitted as raw .inst words by unsha3() during post-processing).  This
  # scope emits KeccakF1600_ce (round loop, state in vector registers),
  # KeccakF1600_cext (load/permute/store wrapper), SHA3_absorb_cext and
  # SHA3_squeeze_cext.
  # ---------------------------------------------------------------------
  490. {{{
  # Register map: state lane A[y][x] lives in v(5*y+x), i.e. v0..v24,
  # written with the .16b arrangement; C[0..6] = v25..v31 are scratch.
  491. my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
  492. "v".($_+3).".16b", "v".($_+4).".16b" ],
  493. (0, 5, 10, 15, 20));
  494. my @C = map("v$_.16b", (25..31));
  # KeccakF1600_ce entry: x9 counts 12 loop iterations, x10 walks iotas.
  495. $code.=<<___;
  496. .type KeccakF1600_ce,%function
  497. .align 5
  498. KeccakF1600_ce:
  499. mov x9,#12
  500. adr x10,iotas
  501. b .Loop_ce
  502. .align 4
  503. .Loop_ce:
  504. ___
  # The loop body is emitted twice, so each .Loop_ce iteration performs two
  # rounds (12 iterations x 2 = 24 rounds).  Unrolling by two is tied to the
  # register-name swaps at the bottom of this Perl loop: each swap is its
  # own inverse, so after the second pass @A and @C are back to their
  # initial mapping and the generated code can branch to .Loop_ce again.
  # Note that $i is deliberately not lexical here.
  505. for($i=0; $i<2; $i++) {
  506. $code.=<<___;
  507. ////////////////////////////////////////////////// Theta
  508. eor3 $C[0],$A[0][0],$A[1][0],$A[2][0]
  509. eor3 $C[1],$A[0][1],$A[1][1],$A[2][1]
  510. eor3 $C[2],$A[0][2],$A[1][2],$A[2][2]
  511. eor3 $C[3],$A[0][3],$A[1][3],$A[2][3]
  512. eor3 $C[4],$A[0][4],$A[1][4],$A[2][4]
  513. eor3 $C[0],$C[0], $A[3][0],$A[4][0]
  514. eor3 $C[1],$C[1], $A[3][1],$A[4][1]
  515. eor3 $C[2],$C[2], $A[3][2],$A[4][2]
  516. eor3 $C[3],$C[3], $A[3][3],$A[4][3]
  517. eor3 $C[4],$C[4], $A[3][4],$A[4][4]
  518. rax1 $C[5],$C[0],$C[2] // D[1]
  519. rax1 $C[6],$C[1],$C[3] // D[2]
  520. rax1 $C[2],$C[2],$C[4] // D[3]
  521. rax1 $C[3],$C[3],$C[0] // D[4]
  522. rax1 $C[4],$C[4],$C[1] // D[0]
  523. ////////////////////////////////////////////////// Theta+Rho+Pi
  524. xar $C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1]
  525. xar $A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
  526. xar $A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
  527. xar $A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
  528. xar $A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
  529. xar $A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
  530. xar $A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
  531. xar $A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
  532. xar $A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
  533. xar $A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
  534. xar $A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
  535. xar $A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
  536. eor $A[0][0],$A[0][0],$C[4]
  537. ldr x11,[x10],#8
  538. xar $C[1], $A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3]
  539. xar $A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
  540. xar $A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
  541. xar $A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
  542. xar $A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
  543. xar $A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // *
  544. xar $A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
  545. xar $A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
  546. xar $A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
  547. xar $A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
  548. xar $A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
  549. xar $C[2], $A[0][3],$C[2],#64-$rhotates[0][3] // C[2]=A[1][0]
  550. ////////////////////////////////////////////////// Chi+Iota
  551. dup $C[6],x11 // borrow C[6]
  552. bcax $C[3], $A[0][0],$A[0][2],$C[0] // *
  553. bcax $A[0][1],$C[0], $C[1], $A[0][2] // *
  554. bcax $A[0][2],$A[0][2],$A[0][4],$C[1]
  555. bcax $A[0][3],$C[1], $A[0][0],$A[0][4]
  556. bcax $A[0][4],$A[0][4],$C[0], $A[0][0]
  557. bcax $A[1][0],$C[2], $A[1][2],$A[1][1] // *
  558. bcax $C[0], $A[1][1],$A[1][3],$A[1][2] // *
  559. bcax $A[1][2],$A[1][2],$A[1][4],$A[1][3]
  560. bcax $A[1][3],$A[1][3],$C[2], $A[1][4]
  561. bcax $A[1][4],$A[1][4],$A[1][1],$C[2]
  562. eor $A[0][0],$C[3],$C[6] // Iota
  563. bcax $C[1], $A[2][0],$A[2][2],$A[2][1] // *
  564. bcax $C[2], $A[2][1],$A[2][3],$A[2][2] // *
  565. bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
  566. bcax $A[2][3],$A[2][3],$A[2][0],$A[2][4]
  567. bcax $A[2][4],$A[2][4],$A[2][1],$A[2][0]
  568. bcax $C[3], $A[3][0],$A[3][2],$A[3][1] // *
  569. bcax $C[4], $A[3][1],$A[3][3],$A[3][2] // *
  570. bcax $A[3][2],$A[3][2],$A[3][4],$A[3][3]
  571. bcax $A[3][3],$A[3][3],$A[3][0],$A[3][4]
  572. bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
  573. bcax $C[5], $A[4][0],$A[4][2],$A[4][1] // *
  574. bcax $C[6], $A[4][1],$A[4][3],$A[4][2] // *
  575. bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
  576. bcax $A[4][3],$A[4][3],$A[4][0],$A[4][4]
  577. bcax $A[4][4],$A[4][4],$A[4][1],$A[4][0]
  578. ___
  # Several Chi results above were written into scratch registers instead
  # of their home registers.  Rather than emitting copies, exchange the
  # Perl-side names so that the next emitted round refers to the registers
  # that now actually hold lanes A[1][1], A[2][0..1], A[3][0..1] and
  # A[4][0..1] (and uses the vacated ones as scratch).
  579. ( $A[1][1], $C[0]) = ( $C[0], $A[1][1]);
  580. ($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
  581. ($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
  582. ($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
  583. }
  # Loop control and return of KeccakF1600_ce, then the prologue of
  # KeccakF1600_cext (saves d8..d15 before the state is loaded into v0..v24).
  584. $code.=<<___;
  585. subs x9,x9,#1
  586. bne .Loop_ce
  587. ret
  588. .size KeccakF1600_ce,.-KeccakF1600_ce
  589. .type KeccakF1600_cext,%function
  590. .align 5
  591. KeccakF1600_cext:
  592. .inst 0xd503233f // paciasp
  593. stp x29,x30,[sp,#-80]!
  594. add x29,sp,#0
  595. stp d8,d9,[sp,#16] // per ABI requirement
  596. stp d10,d11,[sp,#32]
  597. stp d12,d13,[sp,#48]
  598. stp d14,d15,[sp,#64]
  599. ___
  # Load lanes 0..23 pairwise into d0..d23.  $i is the file-level global on
  # purpose: it is 24 when the loop exits and the heredoc that follows uses
  # it to address the last lane (d24 at offset 8*24).  The same pattern is
  # repeated for every load/store loop below.
  600. for($i=0; $i<24; $i+=2) { # load A[5][5]
  601. my $j=$i+1;
  602. $code.=<<___;
  603. ldp d$i,d$j,[x0,#8*$i]
  604. ___
  605. }
  606. $code.=<<___;
  607. ldr d24,[x0,#8*$i]
  608. bl KeccakF1600_ce
  609. ldr x30,[sp,#8]
  610. ___
  611. for($i=0; $i<24; $i+=2) { # store A[5][5]
  612. my $j=$i+1;
  613. $code.=<<___;
  614. stp d$i,d$j,[x0,#8*$i]
  615. ___
  616. }
  # x30 was already reloaded right after the call above, so the epilogue
  # only pops x29 while releasing the 80-byte frame.
  617. $code.=<<___;
  618. str d24,[x0,#8*$i]
  619. ldp d8,d9,[sp,#16]
  620. ldp d10,d11,[sp,#32]
  621. ldp d12,d13,[sp,#48]
  622. ldp d14,d15,[sp,#64]
  623. ldr x29,[sp],#80
  624. .inst 0xd50323bf // autiasp
  625. ret
  626. .size KeccakF1600_cext,.-KeccakF1600_cext
  627. ___
  # SHA3_absorb_cext: the arguments stay in x0..x3 throughout; the round
  # routine KeccakF1600_ce only uses x9..x11 as general-purpose scratch.
  628. {
  629. my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
  630. $code.=<<___;
  631. .globl SHA3_absorb_cext
  632. .type SHA3_absorb_cext,%function
  633. .align 5
  634. SHA3_absorb_cext:
  635. .inst 0xd503233f // paciasp
  636. stp x29,x30,[sp,#-80]!
  637. add x29,sp,#0
  638. stp d8,d9,[sp,#16] // per ABI requirement
  639. stp d10,d11,[sp,#32]
  640. stp d12,d13,[sp,#48]
  641. stp d14,d15,[sp,#64]
  642. ___
  643. for($i=0; $i<24; $i+=2) { # load A[5][5]
  644. my $j=$i+1;
  645. $code.=<<___;
  646. ldp d$i,d$j,[x0,#8*$i]
  647. ___
  648. }
  649. $code.=<<___;
  650. ldr d24,[x0,#8*$i]
  651. b .Loop_absorb_ce
  652. .align 4
  653. .Loop_absorb_ce:
  654. subs $len,$len,$bsz // len - bsz
  655. blo .Labsorbed_ce
  656. ___
  # Absorb step for lanes 0..23, two per generated chunk, through v31 as
  # the input temporary.  As in the scalar version, one `cmp` feeds both
  # the `blo` exit after lane i and the `beq` exit after lane i+1.
  657. for (my $i=0; $i<24; $i+=2) {
  658. my $j = $i+1;
  659. $code.=<<___;
  660. ldr d31,[$inp],#8 // *inp++
  661. #ifdef __AARCH64EB__
  662. rev64 v31.16b,v31.16b
  663. #endif
  664. eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
  665. cmp $bsz,#8*($i+2)
  666. blo .Lprocess_block_ce
  667. ldr d31,[$inp],#8 // *inp++
  668. #ifdef __AARCH64EB__
  669. rev64 v31.16b,v31.16b
  670. #endif
  671. eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
  672. beq .Lprocess_block_ce
  673. ___
  674. }
  # Lane 24 is absorbed unconditionally, then the block is permuted and
  # the loop restarts.
  675. $code.=<<___;
  676. ldr d31,[$inp],#8 // *inp++
  677. #ifdef __AARCH64EB__
  678. rev64 v31.16b,v31.16b
  679. #endif
  680. eor $A[4][4],$A[4][4],v31.16b
  681. .Lprocess_block_ce:
  682. bl KeccakF1600_ce
  683. b .Loop_absorb_ce
  684. .align 4
  685. .Labsorbed_ce:
  686. ___
  687. for($i=0; $i<24; $i+=2) { # store A[5][5]
  688. my $j=$i+1;
  689. $code.=<<___;
  690. stp d$i,d$j,[x0,#8*$i]
  691. ___
  692. }
  # $len went negative in the final `subs`; adding $bsz back yields the
  # number of unprocessed bytes, which is the return value.
  693. $code.=<<___;
  694. str d24,[x0,#8*$i]
  695. add x0,$len,$bsz // return value
  696. ldp d8,d9,[sp,#16]
  697. ldp d10,d11,[sp,#32]
  698. ldp d12,d13,[sp,#48]
  699. ldp d14,d15,[sp,#64]
  700. ldp x29,x30,[sp],#80
  701. .inst 0xd50323bf // autiasp
  702. ret
  703. .size SHA3_absorb_cext,.-SHA3_absorb_cext
  704. ___
  705. }
  # SHA3_squeeze_cext: x9 is the running state pointer and x10 the
  # remaining-bytes-in-block counter; both are re-initialised from
  # $ctx/$bsz after each call to KeccakF1600_cext.  x30 is reloaded from
  # the frame right after that call, so the epilogue only pops x29.
  706. {
  707. my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
  708. $code.=<<___;
  709. .globl SHA3_squeeze_cext
  710. .type SHA3_squeeze_cext,%function
  711. .align 5
  712. SHA3_squeeze_cext:
  713. .inst 0xd503233f // paciasp
  714. stp x29,x30,[sp,#-16]!
  715. add x29,sp,#0
  716. mov x9,$ctx
  717. mov x10,$bsz
  718. .Loop_squeeze_ce:
  719. ldr x4,[x9],#8
  720. cmp $len,#8
  721. blo .Lsqueeze_tail_ce
  722. #ifdef __AARCH64EB__
  723. rev x4,x4
  724. #endif
  725. str x4,[$out],#8
  726. beq .Lsqueeze_done_ce
  727. sub $len,$len,#8
  728. subs x10,x10,#8
  729. bhi .Loop_squeeze_ce
  730. bl KeccakF1600_cext
  731. ldr x30,[sp,#8]
  732. mov x9,$ctx
  733. mov x10,$bsz
  734. b .Loop_squeeze_ce
  735. .align 4
  736. .Lsqueeze_tail_ce:
  737. strb w4,[$out],#1
  738. lsr x4,x4,#8
  739. subs $len,$len,#1
  740. beq .Lsqueeze_done_ce
  741. strb w4,[$out],#1
  742. lsr x4,x4,#8
  743. subs $len,$len,#1
  744. beq .Lsqueeze_done_ce
  745. strb w4,[$out],#1
  746. lsr x4,x4,#8
  747. subs $len,$len,#1
  748. beq .Lsqueeze_done_ce
  749. strb w4,[$out],#1
  750. lsr x4,x4,#8
  751. subs $len,$len,#1
  752. beq .Lsqueeze_done_ce
  753. strb w4,[$out],#1
  754. lsr x4,x4,#8
  755. subs $len,$len,#1
  756. beq .Lsqueeze_done_ce
  757. strb w4,[$out],#1
  758. lsr x4,x4,#8
  759. subs $len,$len,#1
  760. beq .Lsqueeze_done_ce
  761. strb w4,[$out],#1
  762. .Lsqueeze_done_ce:
  763. ldr x29,[sp],#16
  764. .inst 0xd50323bf // autiasp
  765. ret
  766. .size SHA3_squeeze_cext,.-SHA3_squeeze_cext
  767. ___
  768. } }}}
  # Append a NUL-terminated identification string to the generated module.
  # The "@" is backslash-escaped only so that Perl does not interpolate an
  # array inside the heredoc.
  769. $code.=<<___;
  770. .asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  771. ___
  {
      # Base opcodes of the four SHA3-extension instructions used above.
      # unsha3() ORs the register numbers (and, for xar, the rotate
      # amount) into these words.
      my %opcode = (
          "rax1" => 0xce608c00, "eor3" => 0xce000000,
          "bcax" => 0xce200000, "xar"  => 0xce800000 );

      # unsha3($mnemonic, $arg)
      #
      # Hand-assemble one eor3/rax1/xar/bcax instruction so the output can
      # be built by assemblers that do not know these mnemonics.
      #
      #   $mnemonic  one of the keys of %opcode
      #   $arg       operand text following the mnemonic, e.g.
      #              "v25.16b,v0.16b,v5.16b,v10.16b" or
      #              "v25.16b,v6.16b,v30.16b,#64-44"; trailing text such
      #              as an assembler comment is tolerated
      #
      # Returns ".inst\t0x<word>\t//<mnemonic> <arg>".  Field placement:
      # first operand -> bits 0..4, second -> bits 5..9, third -> bits
      # 16..20, fourth (register number or immediate expression) -> bit 10
      # upwards.
      #
      # If the mnemonic is unknown or the operands cannot be parsed, the
      # instruction text is returned unchanged.  Previously the false match
      # result was returned, which made the caller's s///e replace the
      # instruction with an empty string, silently dropping it from the
      # generated code.
      sub unsha3 {
          my ($mnemonic,$arg)=@_;

          if (exists $opcode{$mnemonic} and
              $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/)
          {
              # Copy the captures before anything else can clobber them.
              my ($rd,$rn,$rm,$last) = ($1,$2,$3,$4);

              # The third and fourth operands are optional in the pattern.
              $rm = 0 unless defined $rm;

              # $last is either a register number or an expression such as
              # "64-44"; the pattern restricts it to digits and '-', so the
              # string eval can only perform integer arithmetic.
              my $ra = defined $last ? eval($last) : 0;
              $ra = 0 unless defined $ra;

              return sprintf ".inst\t0x%08x\t//%s %s",
                             $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
                             $mnemonic,$arg;
          }

          # Leave the instruction for the assembler rather than losing it.
          return "$mnemonic\t$arg";
      }
  }
  # Final pass: post-process the accumulated assembly line by line and
  # print it to STDOUT (which the preamble redirected into arm-xlate.pl).
  foreach (split("\n",$code)) {
      # Evaluate any `...` expression embedded in the line.
      s/\`([^\`]*)\`/eval($1)/ge;

      # A "dup" line has its .16b arrangement rewritten to .2d.  Only when
      # that rewrite changed nothing (or the line is not a dup at all) is
      # the line checked for a SHA3-extension mnemonic to hand-encode.
      my $dup_rewritten = 0;
      if (m/\bdup\b/) {
          $dup_rewritten = s/\.16b/.2d/g;
      }
      unless ($dup_rewritten) {
          s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
      }

      print $_,"\n";
  }

  # Closing the pipe flushes it and reports a failure of the translator.
  close STDOUT or die "error closing STDOUT: $!";