#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57 the performance of the
# parallelizable modes seems to be limited by the sheer amount of
# NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#               CBC enc    CBC dec    CTR
# Apple A7      2.39       1.20       1.20
# Cortex-A53    1.32       1.29       1.46
# Cortex-A57(*) 1.95       0.85       0.93
# Denver        1.96       0.86       0.80
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
#     and are still the same even for the updated module;

$flavour = shift;
open STDOUT,">".shift;
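
# Usage (a sketch, assuming the standalone CRYPTOGAMS build, where the
# output goes straight to the file named by the second argument and the
# first argument merely has to match /64/ or not):
#
#     perl aesv8-armx.pl linux64 aesv8-armx.S    # 64-bit flavour
#     perl aesv8-armx.pl linux32 aesv8-armx.S    # 32-bit flavour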

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
        #^^^^^^ this is done to simplify adoption by not depending
        #       on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer ones mostly 64-bit. The goal
# is to maintain both 32- and 64-bit code paths within a single module
# and transliterate common code to either flavour with regex voodoo.
#
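# For instance (an illustrative sketch, not part of the generated
# output), a common line like
#     vld1.32 {q8},[x3],#16
# is rewritten by the substitutions at the bottom of this file into
#     ld1 {v16.4s},[x3],#16     // 64-bit flavour
#     vld1.32 {q8},[r3]!        @ 32-bit flavour
# Note that q8-q15 are remapped to v16-v23 in 64-bit mode, which keeps
# the callee-saved v8-v15 untouched.
#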
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
    $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));

$code.=<<___;
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b

.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
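# A sketch of the assumed C prototype, mirroring OpenSSL's
# AES_set_encrypt_key (an assumption based on the checks right below):
#
#     int aes_v8_set_encrypt_key(const unsigned char *userKey,
#                                const int bits, AES_KEY *key);
#
# It returns 0 on success, -1 on NULL arguments and -2 on an invalid
# bit length.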
$code.=<<___ if ($flavour =~ /64/);
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
___
$code.=<<___;
    mov $ptr,#-1
    cmp $inp,#0
    b.eq .Lenc_key_abort
    cmp $out,#0
    b.eq .Lenc_key_abort
    mov $ptr,#-2
    cmp $bits,#128
    b.lt .Lenc_key_abort
    cmp $bits,#256
    b.gt .Lenc_key_abort
    tst $bits,#0x3f
    b.ne .Lenc_key_abort

    adr $ptr,rcon
    cmp $bits,#192

    veor $zero,$zero,$zero
    vld1.8 {$in0},[$inp],#16
    mov $bits,#8 // reuse $bits
    vld1.32 {$rcon,$mask},[$ptr],#32

    b.lt .Loop128
    b.eq .L192
    b .L256
.align 4
.Loop128:
    vtbl.8 $key,{$in0},$mask
    vext.8 $tmp,$zero,$in0,#12
    vst1.32 {$in0},[$out],#16
    aese $key,$zero
    subs $bits,$bits,#1

    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $key,$key,$rcon
    veor $in0,$in0,$tmp
    vshl.u8 $rcon,$rcon,#1
    veor $in0,$in0,$key
    b.ne .Loop128

    vld1.32 {$rcon},[$ptr]

    vtbl.8 $key,{$in0},$mask
    vext.8 $tmp,$zero,$in0,#12
    vst1.32 {$in0},[$out],#16
    aese $key,$zero

    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $key,$key,$rcon
    veor $in0,$in0,$tmp
    vshl.u8 $rcon,$rcon,#1
    veor $in0,$in0,$key

    vtbl.8 $key,{$in0},$mask
    vext.8 $tmp,$zero,$in0,#12
    vst1.32 {$in0},[$out],#16
    aese $key,$zero

    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $key,$key,$rcon
    veor $in0,$in0,$tmp
    veor $in0,$in0,$key
    vst1.32 {$in0},[$out]
    add $out,$out,#0x50

    mov $rounds,#10
    b .Ldone
.align 4
.L192:
    vld1.8 {$in1},[$inp],#8
    vmov.i8 $key,#8 // borrow $key
    vst1.32 {$in0},[$out],#16
    vsub.i8 $mask,$mask,$key // adjust the mask

.Loop192:
    vtbl.8 $key,{$in1},$mask
    vext.8 $tmp,$zero,$in0,#12
    vst1.32 {$in1},[$out],#8
    aese $key,$zero
    subs $bits,$bits,#1

    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in0,$in0,$tmp

    vdup.32 $tmp,${in0}[3]
    veor $tmp,$tmp,$in1
    veor $key,$key,$rcon
    vext.8 $in1,$zero,$in1,#12
    vshl.u8 $rcon,$rcon,#1
    veor $in1,$in1,$tmp
    veor $in0,$in0,$key
    veor $in1,$in1,$key
    vst1.32 {$in0},[$out],#16
    b.ne .Loop192

    mov $rounds,#12
    add $out,$out,#0x20
    b .Ldone
.align 4
.L256:
    vld1.8 {$in1},[$inp]
    mov $bits,#7
    mov $rounds,#14
    vst1.32 {$in0},[$out],#16

.Loop256:
    vtbl.8 $key,{$in1},$mask
    vext.8 $tmp,$zero,$in0,#12
    vst1.32 {$in1},[$out],#16
    aese $key,$zero
    subs $bits,$bits,#1

    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in0,$in0,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $key,$key,$rcon
    veor $in0,$in0,$tmp
    vshl.u8 $rcon,$rcon,#1
    veor $in0,$in0,$key
    vst1.32 {$in0},[$out],#16
    b.eq .Ldone

    vdup.32 $key,${in0}[3] // just splat
    vext.8 $tmp,$zero,$in1,#12
    aese $key,$zero

    veor $in1,$in1,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in1,$in1,$tmp
    vext.8 $tmp,$zero,$tmp,#12
    veor $in1,$in1,$tmp

    veor $in1,$in1,$key
    b .Loop256
.Ldone:
    str $rounds,[$out]
    mov $ptr,#0

.Lenc_key_abort:
    mov x0,$ptr // return value
    `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
    ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
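# The decryption schedule is the encryption schedule traversed back to
# front, with aesimc (inverse MixColumns) applied to every round key
# except the first and last; the .Loop_imc below computes exactly that,
# in place, swapping keys from both ends of the schedule.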
$code.=<<___ if ($flavour =~ /64/);
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
    stmdb sp!,{r4,lr}
___
$code.=<<___;
    bl .Lenc_key

    cmp x0,#0
    b.ne .Ldec_key_abort

    sub $out,$out,#240 // restore original $out
    mov x4,#-16
    add $inp,$out,x12,lsl#4 // end of key schedule

    vld1.32 {v0.16b},[$out]
    vld1.32 {v1.16b},[$inp]
    vst1.32 {v0.16b},[$inp],x4
    vst1.32 {v1.16b},[$out],#16

.Loop_imc:
    vld1.32 {v0.16b},[$out]
    vld1.32 {v1.16b},[$inp]
    aesimc v0.16b,v0.16b
    aesimc v1.16b,v1.16b
    vst1.32 {v0.16b},[$inp],x4
    vst1.32 {v1.16b},[$out],#16
    cmp $inp,$out
    b.hi .Loop_imc

    vld1.32 {v0.16b},[$out]
    aesimc v0.16b,v0.16b
    vst1.32 {v0.16b},[$inp]

    eor x0,x0,x0 // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
    ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
    ldp x29,x30,[sp],#16
    ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
    ldr $rounds,[$key,#240]
    vld1.32 {$rndkey0},[$key],#16
    vld1.8 {$inout},[$inp]
    sub $rounds,$rounds,#2
    vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
    aes$e $inout,$rndkey0
    aes$mc $inout,$inout
    vld1.32 {$rndkey0},[$key],#16
    subs $rounds,$rounds,#2
    aes$e $inout,$rndkey1
    aes$mc $inout,$inout
    vld1.32 {$rndkey1},[$key],#16
    b.gt .Loop_${dir}c

    aes$e $inout,$rndkey0
    aes$mc $inout,$inout
    vld1.32 {$rndkey0},[$key]
    aes$e $inout,$rndkey1
    veor $inout,$inout,$rndkey0

    vst1.8 {$inout},[$out]
    ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
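# The two calls above emit the single-block ${prefix}_encrypt and
# ${prefix}_decrypt routines (a sketch of the assumed C prototype,
# mirroring OpenSSL's AES_encrypt/AES_decrypt):
#
#     void aes_v8_encrypt(const unsigned char in[16],
#                         unsigned char out[16], const AES_KEY *key);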
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15 preloaded key schedule
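
# The assumed C prototype mirrors OpenSSL's AES_cbc_encrypt (an
# assumption based on the argument order used below):
#
#     void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                             size_t length, const AES_KEY *key,
#                             unsigned char ivec[16], const int enc);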
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
    mov ip,sp
    stmdb sp!,{r4-r8,lr}
    vstmdb sp!,{d8-d15} @ ABI specification says so
    ldmia ip,{r4-r5} @ load remaining args
___
$code.=<<___;
    subs $len,$len,#16
    mov $step,#16
    b.lo .Lcbc_abort
    cclr $step,eq

    cmp $enc,#0 // en- or decrypting?
    ldr $rounds,[$key,#240]
    and $len,$len,#-16
    vld1.8 {$ivec},[$ivp]
    vld1.8 {$dat},[$inp],$step

    vld1.32 {q8-q9},[$key] // load key schedule...
    sub $rounds,$rounds,#6
    add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
    sub $rounds,$rounds,#2
    vld1.32 {q10-q11},[$key_],#32
    vld1.32 {q12-q13},[$key_],#32
    vld1.32 {q14-q15},[$key_],#32
    vld1.32 {$rndlast},[$key_]

    add $key_,$key,#32
    mov $cnt,$rounds
    b.eq .Lcbc_dec

    cmp $rounds,#2
    veor $dat,$dat,$ivec
    veor $rndzero_n_last,q8,$rndlast
    b.eq .Lcbc_enc128

    vld1.32 {$in0-$in1},[$key_]
    add $key_,$key,#16
    add $key4,$key,#16*4
    add $key5,$key,#16*5
    aese $dat,q8
    aesmc $dat,$dat
    add $key6,$key,#16*6
    add $key7,$key,#16*7
    b .Lenter_cbc_enc

.align 4
.Loop_cbc_enc:
    aese $dat,q8
    aesmc $dat,$dat
    vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
    aese $dat,q9
    aesmc $dat,$dat
    aese $dat,$in0
    aesmc $dat,$dat
    vld1.32 {q8},[$key4]
    cmp $rounds,#4
    aese $dat,$in1
    aesmc $dat,$dat
    vld1.32 {q9},[$key5]
    b.eq .Lcbc_enc192

    aese $dat,q8
    aesmc $dat,$dat
    vld1.32 {q8},[$key6]
    aese $dat,q9
    aesmc $dat,$dat
    vld1.32 {q9},[$key7]
    nop

.Lcbc_enc192:
    aese $dat,q8
    aesmc $dat,$dat
    subs $len,$len,#16
    aese $dat,q9
    aesmc $dat,$dat
    cclr $step,eq
    aese $dat,q10
    aesmc $dat,$dat
    aese $dat,q11
    aesmc $dat,$dat
    vld1.8 {q8},[$inp],$step
    aese $dat,q12
    aesmc $dat,$dat
    veor q8,q8,$rndzero_n_last
    aese $dat,q13
    aesmc $dat,$dat
    vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
    aese $dat,q14
    aesmc $dat,$dat
    aese $dat,q15
    veor $ivec,$dat,$rndlast
    b.hs .Loop_cbc_enc

    vst1.8 {$ivec},[$out],#16
    b .Lcbc_done
.align 5
.Lcbc_enc128:
    vld1.32 {$in0-$in1},[$key_]
    aese $dat,q8
    aesmc $dat,$dat
    b .Lenter_cbc_enc128
.Loop_cbc_enc128:
    aese $dat,q8
    aesmc $dat,$dat
    vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
    aese $dat,q9
    aesmc $dat,$dat
    subs $len,$len,#16
    aese $dat,$in0
    aesmc $dat,$dat
    cclr $step,eq
    aese $dat,$in1
    aesmc $dat,$dat
    aese $dat,q10
    aesmc $dat,$dat
    aese $dat,q11
    aesmc $dat,$dat
    vld1.8 {q8},[$inp],$step
    aese $dat,q12
    aesmc $dat,$dat
    aese $dat,q13
    aesmc $dat,$dat
    aese $dat,q14
    aesmc $dat,$dat
    veor q8,q8,$rndzero_n_last
    aese $dat,q15
    veor $ivec,$dat,$rndlast
    b.hs .Loop_cbc_enc128

    vst1.8 {$ivec},[$out],#16
    b .Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
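# NOTE: the decrypt path below keeps three blocks in flight, so the
# back-to-back aesd/aesimc dependency chains of one block overlap with
# the others'; $dat2/$in2/$tmp2 provide the third lane on top of the
# two defined at the top of this section.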
$code.=<<___;
.align 5
.Lcbc_dec:
    vld1.8 {$dat2},[$inp],#16
    subs $len,$len,#32 // bias
    add $cnt,$rounds,#2
    vorr $in1,$dat,$dat
    vorr $dat1,$dat,$dat
    vorr $in2,$dat2,$dat2
    b.lo .Lcbc_dec_tail

    vorr $dat1,$dat2,$dat2
    vld1.8 {$dat2},[$inp],#16
    vorr $in0,$dat,$dat
    vorr $in1,$dat1,$dat1
    vorr $in2,$dat2,$dat2

.Loop3x_cbc_dec:
    aesd $dat0,q8
    aesimc $dat0,$dat0
    aesd $dat1,q8
    aesimc $dat1,$dat1
    aesd $dat2,q8
    aesimc $dat2,$dat2
    vld1.32 {q8},[$key_],#16
    subs $cnt,$cnt,#2
    aesd $dat0,q9
    aesimc $dat0,$dat0
    aesd $dat1,q9
    aesimc $dat1,$dat1
    aesd $dat2,q9
    aesimc $dat2,$dat2
    vld1.32 {q9},[$key_],#16
    b.gt .Loop3x_cbc_dec

    aesd $dat0,q8
    aesimc $dat0,$dat0
    aesd $dat1,q8
    aesimc $dat1,$dat1
    aesd $dat2,q8
    aesimc $dat2,$dat2
    veor $tmp0,$ivec,$rndlast
    subs $len,$len,#0x30
    veor $tmp1,$in0,$rndlast
    mov.lo x6,$len // x6, $cnt, is zero at this point
    aesd $dat0,q9
    aesimc $dat0,$dat0
    aesd $dat1,q9
    aesimc $dat1,$dat1
    aesd $dat2,q9
    aesimc $dat2,$dat2
    veor $tmp2,$in1,$rndlast
    add $inp,$inp,x6 // $inp is adjusted in such a way that
                     // at exit from the loop $dat1-$dat2
                     // are loaded with last "words"
    vorr $ivec,$in2,$in2
    mov $key_,$key
    aesd $dat0,q12
    aesimc $dat0,$dat0
    aesd $dat1,q12
    aesimc $dat1,$dat1
    aesd $dat2,q12
    aesimc $dat2,$dat2
    vld1.8 {$in0},[$inp],#16
    aesd $dat0,q13
    aesimc $dat0,$dat0
    aesd $dat1,q13
    aesimc $dat1,$dat1
    aesd $dat2,q13
    aesimc $dat2,$dat2
    vld1.8 {$in1},[$inp],#16
    aesd $dat0,q14
    aesimc $dat0,$dat0
    aesd $dat1,q14
    aesimc $dat1,$dat1
    aesd $dat2,q14
    aesimc $dat2,$dat2
    vld1.8 {$in2},[$inp],#16
    aesd $dat0,q15
    aesd $dat1,q15
    aesd $dat2,q15
    vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
    add $cnt,$rounds,#2
    veor $tmp0,$tmp0,$dat0
    veor $tmp1,$tmp1,$dat1
    veor $dat2,$dat2,$tmp2
    vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
    vst1.8 {$tmp0},[$out],#16
    vorr $dat0,$in0,$in0
    vst1.8 {$tmp1},[$out],#16
    vorr $dat1,$in1,$in1
    vst1.8 {$dat2},[$out],#16
    vorr $dat2,$in2,$in2
    b.hs .Loop3x_cbc_dec

    cmn $len,#0x30
    b.eq .Lcbc_done
    nop
.Lcbc_dec_tail:
    aesd $dat1,q8
    aesimc $dat1,$dat1
    aesd $dat2,q8
    aesimc $dat2,$dat2
    vld1.32 {q8},[$key_],#16
    subs $cnt,$cnt,#2
    aesd $dat1,q9
    aesimc $dat1,$dat1
    aesd $dat2,q9
    aesimc $dat2,$dat2
    vld1.32 {q9},[$key_],#16
    b.gt .Lcbc_dec_tail

    aesd $dat1,q8
    aesimc $dat1,$dat1
    aesd $dat2,q8
    aesimc $dat2,$dat2
    aesd $dat1,q9
    aesimc $dat1,$dat1
    aesd $dat2,q9
    aesimc $dat2,$dat2
    aesd $dat1,q12
    aesimc $dat1,$dat1
    aesd $dat2,q12
    aesimc $dat2,$dat2
    cmn $len,#0x20
    aesd $dat1,q13
    aesimc $dat1,$dat1
    aesd $dat2,q13
    aesimc $dat2,$dat2
    veor $tmp1,$ivec,$rndlast
    aesd $dat1,q14
    aesimc $dat1,$dat1
    aesd $dat2,q14
    aesimc $dat2,$dat2
    veor $tmp2,$in1,$rndlast
    aesd $dat1,q15
    aesd $dat2,q15
    b.eq .Lcbc_dec_one
    veor $tmp1,$tmp1,$dat1
    veor $tmp2,$tmp2,$dat2
    vorr $ivec,$in2,$in2
    vst1.8 {$tmp1},[$out],#16
    vst1.8 {$tmp2},[$out],#16
    b .Lcbc_done

.Lcbc_dec_one:
    veor $tmp1,$tmp1,$dat2
    vorr $ivec,$in2,$in2
    vst1.8 {$tmp1},[$out],#16

.Lcbc_done:
    vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
    vldmia sp!,{d8-d15}
    ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
    ldr x29,[sp],#16
    ret
___
$code.=<<___;
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule
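
# The assumed C prototype follows OpenSSL's ctr128_f convention, with
# $len counted in 16-byte blocks and only the low 32 bits of the
# counter incremented (an assumption based on the code below):
#
#     void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#                                      unsigned char *out, size_t len,
#                                      const AES_KEY *key,
#                                      const unsigned char ivec[16]);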
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
    mov ip,sp
    stmdb sp!,{r4-r10,lr}
    vstmdb sp!,{d8-d15} @ ABI specification says so
    ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
    ldr $rounds,[$key,#240]

    ldr $ctr, [$ivp, #12]
    vld1.32 {$dat0},[$ivp]

    vld1.32 {q8-q9},[$key] // load key schedule...
    sub $rounds,$rounds,#4
    mov $step,#16
    cmp $len,#2
    add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
    sub $rounds,$rounds,#2
    vld1.32 {q12-q13},[$key_],#32
    vld1.32 {q14-q15},[$key_],#32
    vld1.32 {$rndlast},[$key_]
    add $key_,$key,#32
    mov $cnt,$rounds
    cclr $step,lo
#ifndef __ARMEB__
    rev $ctr, $ctr
#endif
    vorr $dat1,$dat0,$dat0
    add $tctr1, $ctr, #1
    vorr $dat2,$dat0,$dat0
    add $ctr, $ctr, #2
    vorr $ivec,$dat0,$dat0
    rev $tctr1, $tctr1
    vmov.32 ${dat1}[3],$tctr1
    b.ls .Lctr32_tail
    rev $tctr2, $ctr
    sub $len,$len,#3 // bias
    vmov.32 ${dat2}[3],$tctr2
    b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
    aese $dat0,q8
    aesmc $dat0,$dat0
    aese $dat1,q8
    aesmc $dat1,$dat1
    aese $dat2,q8
    aesmc $dat2,$dat2
    vld1.32 {q8},[$key_],#16
    subs $cnt,$cnt,#2
    aese $dat0,q9
    aesmc $dat0,$dat0
    aese $dat1,q9
    aesmc $dat1,$dat1
    aese $dat2,q9
    aesmc $dat2,$dat2
    vld1.32 {q9},[$key_],#16
    b.gt .Loop3x_ctr32

    aese $dat0,q8
    aesmc $tmp0,$dat0
    aese $dat1,q8
    aesmc $tmp1,$dat1
    vld1.8 {$in0},[$inp],#16
    vorr $dat0,$ivec,$ivec
    aese $dat2,q8
    aesmc $dat2,$dat2
    vld1.8 {$in1},[$inp],#16
    vorr $dat1,$ivec,$ivec
    aese $tmp0,q9
    aesmc $tmp0,$tmp0
    aese $tmp1,q9
    aesmc $tmp1,$tmp1
    vld1.8 {$in2},[$inp],#16
    mov $key_,$key
    aese $dat2,q9
    aesmc $tmp2,$dat2
    vorr $dat2,$ivec,$ivec
    add $tctr0,$ctr,#1
    aese $tmp0,q12
    aesmc $tmp0,$tmp0
    aese $tmp1,q12
    aesmc $tmp1,$tmp1
    veor $in0,$in0,$rndlast
    add $tctr1,$ctr,#2
    aese $tmp2,q12
    aesmc $tmp2,$tmp2
    veor $in1,$in1,$rndlast
    add $ctr,$ctr,#3
    aese $tmp0,q13
    aesmc $tmp0,$tmp0
    aese $tmp1,q13
    aesmc $tmp1,$tmp1
    veor $in2,$in2,$rndlast
    rev $tctr0,$tctr0
    aese $tmp2,q13
    aesmc $tmp2,$tmp2
    vmov.32 ${dat0}[3], $tctr0
    rev $tctr1,$tctr1
    aese $tmp0,q14
    aesmc $tmp0,$tmp0
    aese $tmp1,q14
    aesmc $tmp1,$tmp1
    vmov.32 ${dat1}[3], $tctr1
    rev $tctr2,$ctr
    aese $tmp2,q14
    aesmc $tmp2,$tmp2
    vmov.32 ${dat2}[3], $tctr2
    subs $len,$len,#3
    aese $tmp0,q15
    aese $tmp1,q15
    aese $tmp2,q15

    veor $in0,$in0,$tmp0
    vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
    vst1.8 {$in0},[$out],#16
    veor $in1,$in1,$tmp1
    mov $cnt,$rounds
    vst1.8 {$in1},[$out],#16
    veor $in2,$in2,$tmp2
    vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
    vst1.8 {$in2},[$out],#16
    b.hs .Loop3x_ctr32

    adds $len,$len,#3
    b.eq .Lctr32_done
    cmp $len,#1
    mov $step,#16
    cclr $step,eq
.Lctr32_tail:
    aese $dat0,q8
    aesmc $dat0,$dat0
    aese $dat1,q8
    aesmc $dat1,$dat1
    vld1.32 {q8},[$key_],#16
    subs $cnt,$cnt,#2
    aese $dat0,q9
    aesmc $dat0,$dat0
    aese $dat1,q9
    aesmc $dat1,$dat1
    vld1.32 {q9},[$key_],#16
    b.gt .Lctr32_tail

    aese $dat0,q8
    aesmc $dat0,$dat0
    aese $dat1,q8
    aesmc $dat1,$dat1
    aese $dat0,q9
    aesmc $dat0,$dat0
    aese $dat1,q9
    aesmc $dat1,$dat1
    vld1.8 {$in0},[$inp],$step
    aese $dat0,q12
    aesmc $dat0,$dat0
    aese $dat1,q12
    aesmc $dat1,$dat1
    vld1.8 {$in1},[$inp]
    aese $dat0,q13
    aesmc $dat0,$dat0
    aese $dat1,q13
    aesmc $dat1,$dat1
    veor $in0,$in0,$rndlast
    aese $dat0,q14
    aesmc $dat0,$dat0
    aese $dat1,q14
    aesmc $dat1,$dat1
    veor $in1,$in1,$rndlast
    aese $dat0,q15
    aese $dat1,q15

    cmp $len,#1
    veor $in0,$in0,$dat0
    veor $in1,$in1,$dat1
    vst1.8 {$in0},[$out],#16
    b.eq .Lctr32_done
    vst1.8 {$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
    vldmia sp!,{d8-d15}
    ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
    ldr x29,[sp],#16
    ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
    my %opcode = (
        "aesd"   => 0x4e285800, "aese"   => 0x4e284800,
        "aesimc" => 0x4e287800, "aesmc"  => 0x4e286800 );

    local *unaes = sub {
        my ($mnemonic,$arg)=@_;

        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
        sprintf ".inst\t0x%08x\t//%s %s",
                $opcode{$mnemonic}|$1|($2<<5),
                $mnemonic,$arg;
    };
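
    # unaes() is kept for reference; the substitution that would use it
    # is commented out below in favour of ".arch armv8-a+crypto". For
    # example (an illustrative input), unaes("aese","v0.16b,v1.16b")
    # returns:
    #
    #     .inst 0x4e284820 //aese v0.16b,v1.16b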
    foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
        s/@\s/\/\//o;                                  # old->new style commentary

        #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo    or
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
        s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
        s/vmov\.i8/movi/o    or # fix up legacy mnemonics
        s/vext\.8/ext/o      or
        s/vrev32\.8/rev32/o  or
        s/vtst\.8/cmtst/o    or
        s/vshr/ushr/o        or
        s/^(\s+)v/$1/o       or # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        # fix up remaining legacy suffixes
        s/\.[ui]?8//o;
        m/\],#8/o and s/\.16b/\.8b/go;
        s/\.[ui]?32//o and s/\.16b/\.4s/go;
        s/\.[ui]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else { ######## 32-bit code
    my %opcode = (
        "aesd"   => 0xf3b00340, "aese"   => 0xf3b00300,
        "aesimc" => 0xf3b003c0, "aesmc"  => 0xf3b00380 );

    local *unaes = sub {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<1) |(($2&8)<<2);
            # ARMv7 instructions are always encoded little-endian, so
            # emit the word byte by byte; the correct solution is the
            # .inst directive, but older assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    };
    sub unvtbl {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
        sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
                "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
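
    # e.g. unvtbl("q3,{q0},q9") (an illustrative input) yields the
    # d-register pair form that 32-bit NEON requires:
    #     vtbl.8 d6,{q0},d18
    #     vtbl.8 d7,{q0},d19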
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
        sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
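
    # likewise, 32-bit lane addressing is folded onto d registers, e.g.
    # unvdup32("q1,q0[3]") yields "vdup.32 q1,d1[1]" and
    # unvmov32("q0[3],r9") yields "vmov.32 d1[1],r9" (illustrative
    # inputs).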
    foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

        s/\b[wx]([0-9]+)\b/r$1/go;            # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
        s/\/\/\s?/@ /o;                       # new->old style commentary

        # fix up remaining new-style suffixes
        s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
        s/\],#[0-9]+/]!/o;

        s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
        s/vtbl\.8\s+(.*)/unvtbl($1)/geo            or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo         or
        s/vmov\.32\s+(.*)/unvmov32($1)/geo         or
        s/^(\s+)b\./$1b/o                          or
        s/^(\s+)mov\./$1mov/o                      or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

close STDOUT;