aesv8-armx.pl

  1. #! /usr/bin/env perl
  2. # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements support for ARMv8 AES instructions. The
  17. # module is endian-agnostic in the sense that it supports both big-
  18. # and little-endian cases, and it supports both 32- and 64-bit modes
  19. # of operation. The latter is achieved by limiting the number of
  20. # utilized registers to 16, which implies additional NEON load and
  21. # integer instructions. This has no effect on the mighty Apple A7,
  22. # where results are literally equal to the theoretical estimates
  23. # based on AES instruction latencies and issue rates. On Cortex-A53,
  24. # an in-order execution core, this costs up to 10-15%, which is
  25. # partially compensated for by a dedicated code path for the 128-bit
  26. # CBC encrypt case. On Cortex-A57, parallelizable-mode performance
  27. # seems to be limited by the sheer number of NEON instructions...
  28. #
  29. # Performance in cycles per byte processed with 128-bit key:
  30. #
  31. #                  CBC enc    CBC dec    CTR
  32. # Apple A7         2.39       1.20       1.20
  33. # Cortex-A53       1.32       1.29       1.46
  34. # Cortex-A57(*)    1.95       0.85       0.93
  35. # Denver           1.96       0.86       0.80
  36. # Mongoose         1.33       1.20       1.20
  37. # Kryo             1.26       0.94       1.00
  38. #
  39. # (*) original 3.64/1.34/1.32 results were for the r0p0 revision
  40. #     and remain the same even for the updated module.
  41. $flavour = shift;
  42. $output = shift;
  43. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  44. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  45. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  46. die "can't locate arm-xlate.pl";
  47. open OUT,"| \"$^X\" $xlate $flavour $output";
  48. *STDOUT=*OUT;
  49. $prefix="aes_v8";
  50. $code=<<___;
  51. #include "arm_arch.h"
  52. #if __ARM_MAX_ARCH__>=7
  53. .text
  54. ___
  55. $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
  56. $code.=<<___ if ($flavour !~ /64/);
  57. .arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
  58. .fpu neon
  59. .code 32
  60. #undef __thumb2__
  61. ___
  62. # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
  63. # NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
  64. # to maintain both 32- and 64-bit code within a single module and
  65. # transliterate the common code to either flavour with regex voodoo.
  66. #
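# For example (illustrative only), a shared line such as
#	vld1.32	{q8-q9},[$key]
# stays a vld1.32 of q8-q9 in the 32-bit flavour (with x-registers
# renamed to r-registers), while the 64-bit post-processing pass at the
# end of this file rewrites it to roughly
#	ld1	{v16.4s-v17.4s},[x3]
# Note that q8-q15 are remapped to v16-v23, so the 64-bit code never
# touches the callee-saved v8-v15 registers.
#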
  67. {{{
  68. my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
  69. my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
  70. $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
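# The C-level contract is the same as AES_set_encrypt_key's (a sketch of
# the prototype as used by OpenSSL callers, not defined in this file):
#
#	int aes_v8_set_encrypt_key(const unsigned char *userKey,
#	                           const int bits, AES_KEY *key);
#
# It returns 0 on success, -1 for NULL pointer arguments and -2 for an
# unsupported key length (see the -1/-2 values staged in $ptr below).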
  71. $code.=<<___;
  72. .align 5
  73. .Lrcon:
  74. .long 0x01,0x01,0x01,0x01
  75. .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
  76. .long 0x1b,0x1b,0x1b,0x1b
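// .Lrcon holds, in order: the round constant 0x01 splatted across all
// lanes, the rotate-n-splat byte permutation used with vtbl to rotate
// the last key word and broadcast it to every lane, and 0x1b, reloaded
// once the byte-wise left shift of the round constant overflows.
// Because the word is splatted, ShiftRows is a no-op and aese with an
// all-zero round key reduces to SubBytes, i.e. the SubWord step of the
// key schedule.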
  77. .globl ${prefix}_set_encrypt_key
  78. .type ${prefix}_set_encrypt_key,%function
  79. .align 5
  80. ${prefix}_set_encrypt_key:
  81. .Lenc_key:
  82. ___
  83. $code.=<<___ if ($flavour =~ /64/);
  84. stp x29,x30,[sp,#-16]!
  85. add x29,sp,#0
  86. ___
  87. $code.=<<___;
  88. mov $ptr,#-1
  89. cmp $inp,#0
  90. b.eq .Lenc_key_abort
  91. cmp $out,#0
  92. b.eq .Lenc_key_abort
  93. mov $ptr,#-2
  94. cmp $bits,#128
  95. b.lt .Lenc_key_abort
  96. cmp $bits,#256
  97. b.gt .Lenc_key_abort
  98. tst $bits,#0x3f
  99. b.ne .Lenc_key_abort
  100. adr $ptr,.Lrcon
  101. cmp $bits,#192
  102. veor $zero,$zero,$zero
  103. vld1.8 {$in0},[$inp],#16
  104. mov $bits,#8 // reuse $bits
  105. vld1.32 {$rcon,$mask},[$ptr],#32
  106. b.lt .Loop128
  107. b.eq .L192
  108. b .L256
  109. .align 4
  110. .Loop128:
  111. vtbl.8 $key,{$in0},$mask
  112. vext.8 $tmp,$zero,$in0,#12
  113. vst1.32 {$in0},[$out],#16
  114. aese $key,$zero
  115. subs $bits,$bits,#1
  116. veor $in0,$in0,$tmp
  117. vext.8 $tmp,$zero,$tmp,#12
  118. veor $in0,$in0,$tmp
  119. vext.8 $tmp,$zero,$tmp,#12
  120. veor $key,$key,$rcon
  121. veor $in0,$in0,$tmp
  122. vshl.u8 $rcon,$rcon,#1
  123. veor $in0,$in0,$key
  124. b.ne .Loop128
  125. vld1.32 {$rcon},[$ptr]
  126. vtbl.8 $key,{$in0},$mask
  127. vext.8 $tmp,$zero,$in0,#12
  128. vst1.32 {$in0},[$out],#16
  129. aese $key,$zero
  130. veor $in0,$in0,$tmp
  131. vext.8 $tmp,$zero,$tmp,#12
  132. veor $in0,$in0,$tmp
  133. vext.8 $tmp,$zero,$tmp,#12
  134. veor $key,$key,$rcon
  135. veor $in0,$in0,$tmp
  136. vshl.u8 $rcon,$rcon,#1
  137. veor $in0,$in0,$key
  138. vtbl.8 $key,{$in0},$mask
  139. vext.8 $tmp,$zero,$in0,#12
  140. vst1.32 {$in0},[$out],#16
  141. aese $key,$zero
  142. veor $in0,$in0,$tmp
  143. vext.8 $tmp,$zero,$tmp,#12
  144. veor $in0,$in0,$tmp
  145. vext.8 $tmp,$zero,$tmp,#12
  146. veor $key,$key,$rcon
  147. veor $in0,$in0,$tmp
  148. veor $in0,$in0,$key
  149. vst1.32 {$in0},[$out]
  150. add $out,$out,#0x50
  151. mov $rounds,#10
  152. b .Ldone
  153. .align 4
  154. .L192:
  155. vld1.8 {$in1},[$inp],#8
  156. vmov.i8 $key,#8 // borrow $key
  157. vst1.32 {$in0},[$out],#16
  158. vsub.i8 $mask,$mask,$key // adjust the mask
  159. .Loop192:
  160. vtbl.8 $key,{$in1},$mask
  161. vext.8 $tmp,$zero,$in0,#12
  162. #ifdef __ARMEB__
  163. vst1.32 {$in1},[$out],#16
  164. sub $out,$out,#8
  165. #else
  166. vst1.32 {$in1},[$out],#8
  167. #endif
  168. aese $key,$zero
  169. subs $bits,$bits,#1
  170. veor $in0,$in0,$tmp
  171. vext.8 $tmp,$zero,$tmp,#12
  172. veor $in0,$in0,$tmp
  173. vext.8 $tmp,$zero,$tmp,#12
  174. veor $in0,$in0,$tmp
  175. vdup.32 $tmp,${in0}[3]
  176. veor $tmp,$tmp,$in1
  177. veor $key,$key,$rcon
  178. vext.8 $in1,$zero,$in1,#12
  179. vshl.u8 $rcon,$rcon,#1
  180. veor $in1,$in1,$tmp
  181. veor $in0,$in0,$key
  182. veor $in1,$in1,$key
  183. vst1.32 {$in0},[$out],#16
  184. b.ne .Loop192
  185. mov $rounds,#12
  186. add $out,$out,#0x20
  187. b .Ldone
  188. .align 4
  189. .L256:
  190. vld1.8 {$in1},[$inp]
  191. mov $bits,#7
  192. mov $rounds,#14
  193. vst1.32 {$in0},[$out],#16
  194. .Loop256:
  195. vtbl.8 $key,{$in1},$mask
  196. vext.8 $tmp,$zero,$in0,#12
  197. vst1.32 {$in1},[$out],#16
  198. aese $key,$zero
  199. subs $bits,$bits,#1
  200. veor $in0,$in0,$tmp
  201. vext.8 $tmp,$zero,$tmp,#12
  202. veor $in0,$in0,$tmp
  203. vext.8 $tmp,$zero,$tmp,#12
  204. veor $key,$key,$rcon
  205. veor $in0,$in0,$tmp
  206. vshl.u8 $rcon,$rcon,#1
  207. veor $in0,$in0,$key
  208. vst1.32 {$in0},[$out],#16
  209. b.eq .Ldone
  210. vdup.32 $key,${in0}[3] // just splat
  211. vext.8 $tmp,$zero,$in1,#12
  212. aese $key,$zero
  213. veor $in1,$in1,$tmp
  214. vext.8 $tmp,$zero,$tmp,#12
  215. veor $in1,$in1,$tmp
  216. vext.8 $tmp,$zero,$tmp,#12
  217. veor $in1,$in1,$tmp
  218. veor $in1,$in1,$key
  219. b .Loop256
  220. .Ldone:
  221. str $rounds,[$out]
  222. mov $ptr,#0
  223. .Lenc_key_abort:
  224. mov x0,$ptr // return value
  225. `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
  226. ret
  227. .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
  228. .globl ${prefix}_set_decrypt_key
  229. .type ${prefix}_set_decrypt_key,%function
  230. .align 5
  231. ${prefix}_set_decrypt_key:
  232. ___
  233. $code.=<<___ if ($flavour =~ /64/);
  234. .inst 0xd503233f // paciasp
  235. stp x29,x30,[sp,#-16]!
  236. add x29,sp,#0
  237. ___
  238. $code.=<<___ if ($flavour !~ /64/);
  239. stmdb sp!,{r4,lr}
  240. ___
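# The decrypt key schedule is derived from the encrypt schedule (the
# "equivalent inverse cipher"): call .Lenc_key, then walk the schedule
# from both ends, swapping round keys and applying aesimc
# (InvMixColumns) to every round key except the outermost two.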
  241. $code.=<<___;
  242. bl .Lenc_key
  243. cmp x0,#0
  244. b.ne .Ldec_key_abort
  245. sub $out,$out,#240 // restore original $out
  246. mov x4,#-16
  247. add $inp,$out,x12,lsl#4 // end of key schedule
  248. vld1.32 {v0.16b},[$out]
  249. vld1.32 {v1.16b},[$inp]
  250. vst1.32 {v0.16b},[$inp],x4
  251. vst1.32 {v1.16b},[$out],#16
  252. .Loop_imc:
  253. vld1.32 {v0.16b},[$out]
  254. vld1.32 {v1.16b},[$inp]
  255. aesimc v0.16b,v0.16b
  256. aesimc v1.16b,v1.16b
  257. vst1.32 {v0.16b},[$inp],x4
  258. vst1.32 {v1.16b},[$out],#16
  259. cmp $inp,$out
  260. b.hi .Loop_imc
  261. vld1.32 {v0.16b},[$out]
  262. aesimc v0.16b,v0.16b
  263. vst1.32 {v0.16b},[$inp]
  264. eor x0,x0,x0 // return value
  265. .Ldec_key_abort:
  266. ___
  267. $code.=<<___ if ($flavour !~ /64/);
  268. ldmia sp!,{r4,pc}
  269. ___
  270. $code.=<<___ if ($flavour =~ /64/);
  271. ldp x29,x30,[sp],#16
  272. .inst 0xd50323bf // autiasp
  273. ret
  274. ___
  275. $code.=<<___;
  276. .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
  277. ___
  278. }}}
  279. {{{
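# gen_block() emits the single-block ${prefix}_encrypt/${prefix}_decrypt
# routines. A sketch of the C-level prototype as used by OpenSSL callers
# (for reference only, not part of this file):
#
#	void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);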
  280. sub gen_block () {
  281. my $dir = shift;
  282. my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
  283. my ($inp,$out,$key)=map("x$_",(0..2));
  284. my $rounds="w3";
  285. my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
  286. $code.=<<___;
  287. .globl ${prefix}_${dir}crypt
  288. .type ${prefix}_${dir}crypt,%function
  289. .align 5
  290. ${prefix}_${dir}crypt:
  291. ldr $rounds,[$key,#240]
  292. vld1.32 {$rndkey0},[$key],#16
  293. vld1.8 {$inout},[$inp]
  294. sub $rounds,$rounds,#2
  295. vld1.32 {$rndkey1},[$key],#16
  296. .Loop_${dir}c:
  297. aes$e $inout,$rndkey0
  298. aes$mc $inout,$inout
  299. vld1.32 {$rndkey0},[$key],#16
  300. subs $rounds,$rounds,#2
  301. aes$e $inout,$rndkey1
  302. aes$mc $inout,$inout
  303. vld1.32 {$rndkey1},[$key],#16
  304. b.gt .Loop_${dir}c
  305. aes$e $inout,$rndkey0
  306. aes$mc $inout,$inout
  307. vld1.32 {$rndkey0},[$key]
  308. aes$e $inout,$rndkey1
  309. veor $inout,$inout,$rndkey0
  310. vst1.8 {$inout},[$out]
  311. ret
  312. .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
  313. ___
  314. }
  315. &gen_block("en");
  316. &gen_block("de");
  317. }}}
  318. {{{
  319. my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
  320. my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
  321. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
  322. my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
  323. my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
  324. ### q8-q15 preloaded key schedule
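# A sketch of the C-level prototype as used by OpenSSL callers (for
# reference only, not part of this file):
#
#	void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                        size_t length, const AES_KEY *key,
#	                        unsigned char *ivec, const int enc);
#
# length is in bytes and is rounded down to a whole number of blocks.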
  325. $code.=<<___;
  326. .globl ${prefix}_cbc_encrypt
  327. .type ${prefix}_cbc_encrypt,%function
  328. .align 5
  329. ${prefix}_cbc_encrypt:
  330. ___
  331. $code.=<<___ if ($flavour =~ /64/);
  332. stp x29,x30,[sp,#-16]!
  333. add x29,sp,#0
  334. ___
  335. $code.=<<___ if ($flavour !~ /64/);
  336. mov ip,sp
  337. stmdb sp!,{r4-r8,lr}
  338. vstmdb sp!,{d8-d15} @ ABI specification says so
  339. ldmia ip,{r4-r5} @ load remaining args
  340. ___
  341. $code.=<<___;
  342. subs $len,$len,#16
  343. mov $step,#16
  344. b.lo .Lcbc_abort
  345. cclr $step,eq
  346. cmp $enc,#0 // en- or decrypting?
  347. ldr $rounds,[$key,#240]
  348. and $len,$len,#-16
  349. vld1.8 {$ivec},[$ivp]
  350. vld1.8 {$dat},[$inp],$step
  351. vld1.32 {q8-q9},[$key] // load key schedule...
  352. sub $rounds,$rounds,#6
  353. add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
  354. sub $rounds,$rounds,#2
  355. vld1.32 {q10-q11},[$key_],#32
  356. vld1.32 {q12-q13},[$key_],#32
  357. vld1.32 {q14-q15},[$key_],#32
  358. vld1.32 {$rndlast},[$key_]
  359. add $key_,$key,#32
  360. mov $cnt,$rounds
  361. b.eq .Lcbc_dec
  362. cmp $rounds,#2
  363. veor $dat,$dat,$ivec
  364. veor $rndzero_n_last,q8,$rndlast
  365. b.eq .Lcbc_enc128
  366. vld1.32 {$in0-$in1},[$key_]
  367. add $key_,$key,#16
  368. add $key4,$key,#16*4
  369. add $key5,$key,#16*5
  370. aese $dat,q8
  371. aesmc $dat,$dat
  372. add $key6,$key,#16*6
  373. add $key7,$key,#16*7
  374. b .Lenter_cbc_enc
  375. .align 4
  376. .Loop_cbc_enc:
  377. aese $dat,q8
  378. aesmc $dat,$dat
  379. vst1.8 {$ivec},[$out],#16
  380. .Lenter_cbc_enc:
  381. aese $dat,q9
  382. aesmc $dat,$dat
  383. aese $dat,$in0
  384. aesmc $dat,$dat
  385. vld1.32 {q8},[$key4]
  386. cmp $rounds,#4
  387. aese $dat,$in1
  388. aesmc $dat,$dat
  389. vld1.32 {q9},[$key5]
  390. b.eq .Lcbc_enc192
  391. aese $dat,q8
  392. aesmc $dat,$dat
  393. vld1.32 {q8},[$key6]
  394. aese $dat,q9
  395. aesmc $dat,$dat
  396. vld1.32 {q9},[$key7]
  397. nop
  398. .Lcbc_enc192:
  399. aese $dat,q8
  400. aesmc $dat,$dat
  401. subs $len,$len,#16
  402. aese $dat,q9
  403. aesmc $dat,$dat
  404. cclr $step,eq
  405. aese $dat,q10
  406. aesmc $dat,$dat
  407. aese $dat,q11
  408. aesmc $dat,$dat
  409. vld1.8 {q8},[$inp],$step
  410. aese $dat,q12
  411. aesmc $dat,$dat
  412. veor q8,q8,$rndzero_n_last
  413. aese $dat,q13
  414. aesmc $dat,$dat
  415. vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
  416. aese $dat,q14
  417. aesmc $dat,$dat
  418. aese $dat,q15
  419. veor $ivec,$dat,$rndlast
  420. b.hs .Loop_cbc_enc
  421. vst1.8 {$ivec},[$out],#16
  422. b .Lcbc_done
  423. .align 5
  424. .Lcbc_enc128:
  425. vld1.32 {$in0-$in1},[$key_]
  426. aese $dat,q8
  427. aesmc $dat,$dat
  428. b .Lenter_cbc_enc128
  429. .Loop_cbc_enc128:
  430. aese $dat,q8
  431. aesmc $dat,$dat
  432. vst1.8 {$ivec},[$out],#16
  433. .Lenter_cbc_enc128:
  434. aese $dat,q9
  435. aesmc $dat,$dat
  436. subs $len,$len,#16
  437. aese $dat,$in0
  438. aesmc $dat,$dat
  439. cclr $step,eq
  440. aese $dat,$in1
  441. aesmc $dat,$dat
  442. aese $dat,q10
  443. aesmc $dat,$dat
  444. aese $dat,q11
  445. aesmc $dat,$dat
  446. vld1.8 {q8},[$inp],$step
  447. aese $dat,q12
  448. aesmc $dat,$dat
  449. aese $dat,q13
  450. aesmc $dat,$dat
  451. aese $dat,q14
  452. aesmc $dat,$dat
  453. veor q8,q8,$rndzero_n_last
  454. aese $dat,q15
  455. veor $ivec,$dat,$rndlast
  456. b.hs .Loop_cbc_enc128
  457. vst1.8 {$ivec},[$out],#16
  458. b .Lcbc_done
  459. ___
  460. {
  461. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
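# CBC decryption parallelizes, so the main loop below keeps three blocks
# in flight ($dat0-$dat2) to hide aesd/aesimc latency; a leftover of one
# or two blocks is handled by .Lcbc_dec_tail.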
  462. $code.=<<___;
  463. .align 5
  464. .Lcbc_dec:
  465. vld1.8 {$dat2},[$inp],#16
  466. subs $len,$len,#32 // bias
  467. add $cnt,$rounds,#2
  468. vorr $in1,$dat,$dat
  469. vorr $dat1,$dat,$dat
  470. vorr $in2,$dat2,$dat2
  471. b.lo .Lcbc_dec_tail
  472. vorr $dat1,$dat2,$dat2
  473. vld1.8 {$dat2},[$inp],#16
  474. vorr $in0,$dat,$dat
  475. vorr $in1,$dat1,$dat1
  476. vorr $in2,$dat2,$dat2
  477. .Loop3x_cbc_dec:
  478. aesd $dat0,q8
  479. aesimc $dat0,$dat0
  480. aesd $dat1,q8
  481. aesimc $dat1,$dat1
  482. aesd $dat2,q8
  483. aesimc $dat2,$dat2
  484. vld1.32 {q8},[$key_],#16
  485. subs $cnt,$cnt,#2
  486. aesd $dat0,q9
  487. aesimc $dat0,$dat0
  488. aesd $dat1,q9
  489. aesimc $dat1,$dat1
  490. aesd $dat2,q9
  491. aesimc $dat2,$dat2
  492. vld1.32 {q9},[$key_],#16
  493. b.gt .Loop3x_cbc_dec
  494. aesd $dat0,q8
  495. aesimc $dat0,$dat0
  496. aesd $dat1,q8
  497. aesimc $dat1,$dat1
  498. aesd $dat2,q8
  499. aesimc $dat2,$dat2
  500. veor $tmp0,$ivec,$rndlast
  501. subs $len,$len,#0x30
  502. veor $tmp1,$in0,$rndlast
  503. mov.lo x6,$len // x6, $cnt, is zero at this point
  504. aesd $dat0,q9
  505. aesimc $dat0,$dat0
  506. aesd $dat1,q9
  507. aesimc $dat1,$dat1
  508. aesd $dat2,q9
  509. aesimc $dat2,$dat2
  510. veor $tmp2,$in1,$rndlast
  511. add $inp,$inp,x6 // $inp is adjusted in such a way that
  512. // at exit from the loop $dat1-$dat2
  513. // are loaded with last "words"
  514. vorr $ivec,$in2,$in2
  515. mov $key_,$key
  516. aesd $dat0,q12
  517. aesimc $dat0,$dat0
  518. aesd $dat1,q12
  519. aesimc $dat1,$dat1
  520. aesd $dat2,q12
  521. aesimc $dat2,$dat2
  522. vld1.8 {$in0},[$inp],#16
  523. aesd $dat0,q13
  524. aesimc $dat0,$dat0
  525. aesd $dat1,q13
  526. aesimc $dat1,$dat1
  527. aesd $dat2,q13
  528. aesimc $dat2,$dat2
  529. vld1.8 {$in1},[$inp],#16
  530. aesd $dat0,q14
  531. aesimc $dat0,$dat0
  532. aesd $dat1,q14
  533. aesimc $dat1,$dat1
  534. aesd $dat2,q14
  535. aesimc $dat2,$dat2
  536. vld1.8 {$in2},[$inp],#16
  537. aesd $dat0,q15
  538. aesd $dat1,q15
  539. aesd $dat2,q15
  540. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  541. add $cnt,$rounds,#2
  542. veor $tmp0,$tmp0,$dat0
  543. veor $tmp1,$tmp1,$dat1
  544. veor $dat2,$dat2,$tmp2
  545. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  546. vst1.8 {$tmp0},[$out],#16
  547. vorr $dat0,$in0,$in0
  548. vst1.8 {$tmp1},[$out],#16
  549. vorr $dat1,$in1,$in1
  550. vst1.8 {$dat2},[$out],#16
  551. vorr $dat2,$in2,$in2
  552. b.hs .Loop3x_cbc_dec
  553. cmn $len,#0x30
  554. b.eq .Lcbc_done
  555. nop
  556. .Lcbc_dec_tail:
  557. aesd $dat1,q8
  558. aesimc $dat1,$dat1
  559. aesd $dat2,q8
  560. aesimc $dat2,$dat2
  561. vld1.32 {q8},[$key_],#16
  562. subs $cnt,$cnt,#2
  563. aesd $dat1,q9
  564. aesimc $dat1,$dat1
  565. aesd $dat2,q9
  566. aesimc $dat2,$dat2
  567. vld1.32 {q9},[$key_],#16
  568. b.gt .Lcbc_dec_tail
  569. aesd $dat1,q8
  570. aesimc $dat1,$dat1
  571. aesd $dat2,q8
  572. aesimc $dat2,$dat2
  573. aesd $dat1,q9
  574. aesimc $dat1,$dat1
  575. aesd $dat2,q9
  576. aesimc $dat2,$dat2
  577. aesd $dat1,q12
  578. aesimc $dat1,$dat1
  579. aesd $dat2,q12
  580. aesimc $dat2,$dat2
  581. cmn $len,#0x20
  582. aesd $dat1,q13
  583. aesimc $dat1,$dat1
  584. aesd $dat2,q13
  585. aesimc $dat2,$dat2
  586. veor $tmp1,$ivec,$rndlast
  587. aesd $dat1,q14
  588. aesimc $dat1,$dat1
  589. aesd $dat2,q14
  590. aesimc $dat2,$dat2
  591. veor $tmp2,$in1,$rndlast
  592. aesd $dat1,q15
  593. aesd $dat2,q15
  594. b.eq .Lcbc_dec_one
  595. veor $tmp1,$tmp1,$dat1
  596. veor $tmp2,$tmp2,$dat2
  597. vorr $ivec,$in2,$in2
  598. vst1.8 {$tmp1},[$out],#16
  599. vst1.8 {$tmp2},[$out],#16
  600. b .Lcbc_done
  601. .Lcbc_dec_one:
  602. veor $tmp1,$tmp1,$dat2
  603. vorr $ivec,$in2,$in2
  604. vst1.8 {$tmp1},[$out],#16
  605. .Lcbc_done:
  606. vst1.8 {$ivec},[$ivp]
  607. .Lcbc_abort:
  608. ___
  609. }
  610. $code.=<<___ if ($flavour !~ /64/);
  611. vldmia sp!,{d8-d15}
  612. ldmia sp!,{r4-r8,pc}
  613. ___
  614. $code.=<<___ if ($flavour =~ /64/);
  615. ldr x29,[sp],#16
  616. ret
  617. ___
  618. $code.=<<___;
  619. .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
  620. ___
  621. }}}
  622. {{{
  623. my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
  624. my ($rounds,$cnt,$key_)=("w5","w6","x7");
  625. my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
  626. my $step="x12"; # aliases with $tctr2
  627. my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
  628. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  629. my ($dat,$tmp)=($dat0,$tmp0);
  630. ### q8-q15 preloaded key schedule
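# A sketch of the C-level prototype as used by OpenSSL callers (for
# reference only, not part of this file):
#
#	void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#	                                 unsigned char *out, size_t len,
#	                                 const AES_KEY *key,
#	                                 const unsigned char ivec[16]);
#
# len counts 16-byte blocks, not bytes, and only the low 32 bits of the
# counter (the last word of ivec) are incremented.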
  631. $code.=<<___;
  632. .globl ${prefix}_ctr32_encrypt_blocks
  633. .type ${prefix}_ctr32_encrypt_blocks,%function
  634. .align 5
  635. ${prefix}_ctr32_encrypt_blocks:
  636. ___
  637. $code.=<<___ if ($flavour =~ /64/);
  638. stp x29,x30,[sp,#-16]!
  639. add x29,sp,#0
  640. ___
  641. $code.=<<___ if ($flavour !~ /64/);
  642. mov ip,sp
  643. stmdb sp!,{r4-r10,lr}
  644. vstmdb sp!,{d8-d15} @ ABI specification says so
  645. ldr r4, [ip] @ load remaining arg
  646. ___
  647. $code.=<<___;
  648. ldr $rounds,[$key,#240]
  649. ldr $ctr, [$ivp, #12]
  650. #ifdef __ARMEB__
  651. vld1.8 {$dat0},[$ivp]
  652. #else
  653. vld1.32 {$dat0},[$ivp]
  654. #endif
  655. vld1.32 {q8-q9},[$key] // load key schedule...
  656. sub $rounds,$rounds,#4
  657. mov $step,#16
  658. cmp $len,#2
  659. add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
  660. sub $rounds,$rounds,#2
  661. vld1.32 {q12-q13},[$key_],#32
  662. vld1.32 {q14-q15},[$key_],#32
  663. vld1.32 {$rndlast},[$key_]
  664. add $key_,$key,#32
  665. mov $cnt,$rounds
  666. cclr $step,lo
  667. #ifndef __ARMEB__
  668. rev $ctr, $ctr
  669. #endif
  670. add $tctr1, $ctr, #1
  671. vorr $ivec,$dat0,$dat0
  672. rev $tctr1, $tctr1
  673. vmov.32 ${ivec}[3],$tctr1
  674. add $ctr, $ctr, #2
  675. vorr $dat1,$ivec,$ivec
  676. b.ls .Lctr32_tail
  677. rev $tctr2, $ctr
  678. vmov.32 ${ivec}[3],$tctr2
  679. sub $len,$len,#3 // bias
  680. vorr $dat2,$ivec,$ivec
  681. b .Loop3x_ctr32
  682. .align 4
  683. .Loop3x_ctr32:
  684. aese $dat0,q8
  685. aesmc $dat0,$dat0
  686. aese $dat1,q8
  687. aesmc $dat1,$dat1
  688. aese $dat2,q8
  689. aesmc $dat2,$dat2
  690. vld1.32 {q8},[$key_],#16
  691. subs $cnt,$cnt,#2
  692. aese $dat0,q9
  693. aesmc $dat0,$dat0
  694. aese $dat1,q9
  695. aesmc $dat1,$dat1
  696. aese $dat2,q9
  697. aesmc $dat2,$dat2
  698. vld1.32 {q9},[$key_],#16
  699. b.gt .Loop3x_ctr32
  700. aese $dat0,q8
  701. aesmc $tmp0,$dat0
  702. aese $dat1,q8
  703. aesmc $tmp1,$dat1
  704. vld1.8 {$in0},[$inp],#16
  705. add $tctr0,$ctr,#1
  706. aese $dat2,q8
  707. aesmc $dat2,$dat2
  708. vld1.8 {$in1},[$inp],#16
  709. rev $tctr0,$tctr0
  710. aese $tmp0,q9
  711. aesmc $tmp0,$tmp0
  712. aese $tmp1,q9
  713. aesmc $tmp1,$tmp1
  714. vld1.8 {$in2},[$inp],#16
  715. mov $key_,$key
  716. aese $dat2,q9
  717. aesmc $tmp2,$dat2
  718. aese $tmp0,q12
  719. aesmc $tmp0,$tmp0
  720. aese $tmp1,q12
  721. aesmc $tmp1,$tmp1
  722. veor $in0,$in0,$rndlast
  723. add $tctr1,$ctr,#2
  724. aese $tmp2,q12
  725. aesmc $tmp2,$tmp2
  726. veor $in1,$in1,$rndlast
  727. add $ctr,$ctr,#3
  728. aese $tmp0,q13
  729. aesmc $tmp0,$tmp0
  730. aese $tmp1,q13
  731. aesmc $tmp1,$tmp1
  732. veor $in2,$in2,$rndlast
  733. vmov.32 ${ivec}[3], $tctr0
  734. aese $tmp2,q13
  735. aesmc $tmp2,$tmp2
  736. vorr $dat0,$ivec,$ivec
  737. rev $tctr1,$tctr1
  738. aese $tmp0,q14
  739. aesmc $tmp0,$tmp0
  740. vmov.32 ${ivec}[3], $tctr1
  741. rev $tctr2,$ctr
  742. aese $tmp1,q14
  743. aesmc $tmp1,$tmp1
  744. vorr $dat1,$ivec,$ivec
  745. vmov.32 ${ivec}[3], $tctr2
  746. aese $tmp2,q14
  747. aesmc $tmp2,$tmp2
  748. vorr $dat2,$ivec,$ivec
  749. subs $len,$len,#3
  750. aese $tmp0,q15
  751. aese $tmp1,q15
  752. aese $tmp2,q15
  753. veor $in0,$in0,$tmp0
  754. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  755. vst1.8 {$in0},[$out],#16
  756. veor $in1,$in1,$tmp1
  757. mov $cnt,$rounds
  758. vst1.8 {$in1},[$out],#16
  759. veor $in2,$in2,$tmp2
  760. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  761. vst1.8 {$in2},[$out],#16
  762. b.hs .Loop3x_ctr32
  763. adds $len,$len,#3
  764. b.eq .Lctr32_done
  765. cmp $len,#1
  766. mov $step,#16
  767. cclr $step,eq
  768. .Lctr32_tail:
  769. aese $dat0,q8
  770. aesmc $dat0,$dat0
  771. aese $dat1,q8
  772. aesmc $dat1,$dat1
  773. vld1.32 {q8},[$key_],#16
  774. subs $cnt,$cnt,#2
  775. aese $dat0,q9
  776. aesmc $dat0,$dat0
  777. aese $dat1,q9
  778. aesmc $dat1,$dat1
  779. vld1.32 {q9},[$key_],#16
  780. b.gt .Lctr32_tail
  781. aese $dat0,q8
  782. aesmc $dat0,$dat0
  783. aese $dat1,q8
  784. aesmc $dat1,$dat1
  785. aese $dat0,q9
  786. aesmc $dat0,$dat0
  787. aese $dat1,q9
  788. aesmc $dat1,$dat1
  789. vld1.8 {$in0},[$inp],$step
  790. aese $dat0,q12
  791. aesmc $dat0,$dat0
  792. aese $dat1,q12
  793. aesmc $dat1,$dat1
  794. vld1.8 {$in1},[$inp]
  795. aese $dat0,q13
  796. aesmc $dat0,$dat0
  797. aese $dat1,q13
  798. aesmc $dat1,$dat1
  799. veor $in0,$in0,$rndlast
  800. aese $dat0,q14
  801. aesmc $dat0,$dat0
  802. aese $dat1,q14
  803. aesmc $dat1,$dat1
  804. veor $in1,$in1,$rndlast
  805. aese $dat0,q15
  806. aese $dat1,q15
  807. cmp $len,#1
  808. veor $in0,$in0,$dat0
  809. veor $in1,$in1,$dat1
  810. vst1.8 {$in0},[$out],#16
  811. b.eq .Lctr32_done
  812. vst1.8 {$in1},[$out]
  813. .Lctr32_done:
  814. ___
  815. $code.=<<___ if ($flavour !~ /64/);
  816. vldmia sp!,{d8-d15}
  817. ldmia sp!,{r4-r10,pc}
  818. ___
  819. $code.=<<___ if ($flavour =~ /64/);
  820. ldr x29,[sp],#16
  821. ret
  822. ___
  823. $code.=<<___;
  824. .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
  825. ___
  826. }}}
  827. $code.=<<___;
  828. #endif
  829. ___
  830. ########################################
  831. if ($flavour =~ /64/) { ######## 64-bit code
  832. my %opcode = (
  833. "aesd" => 0x4e285800, "aese" => 0x4e284800,
  834. "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
  835. local *unaes = sub {
  836. my ($mnemonic,$arg)=@_;
  837. $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
  838. sprintf ".inst\t0x%08x\t//%s %s",
  839. $opcode{$mnemonic}|$1|($2<<5),
  840. $mnemonic,$arg;
  841. };
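# Note: unaes() above is retained for reference, but the substitution
# that would invoke it is commented out below; with ".arch armv8-a+crypto"
# in effect the assembler accepts aese/aesd/aesmc/aesimc directly.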
  842. foreach(split("\n",$code)) {
  843. s/\`([^\`]*)\`/eval($1)/geo;
  844. s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
  845. s/@\s/\/\//o; # old->new style commentary
  846. #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
  847. s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
  848. s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
  849. s/vmov\.i8/movi/o or # fix up legacy mnemonics
  850. s/vext\.8/ext/o or
  851. s/vrev32\.8/rev32/o or
  852. s/vtst\.8/cmtst/o or
  853. s/vshr/ushr/o or
  854. s/^(\s+)v/$1/o or # strip off v prefix
  855. s/\bbx\s+lr\b/ret/o;
  856. # fix up remaining legacy suffixes
  857. s/\.[ui]?8//o;
  858. m/\],#8/o and s/\.16b/\.8b/go;
  859. s/\.[ui]?32//o and s/\.16b/\.4s/go;
  860. s/\.[ui]?64//o and s/\.16b/\.2d/go;
  861. s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
  862. print $_,"\n";
  863. }
  864. } else { ######## 32-bit code
  865. my %opcode = (
  866. "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
  867. "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
  868. local *unaes = sub {
  869. my ($mnemonic,$arg)=@_;
  870. if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
  871. my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
  872. |(($2&7)<<1) |(($2&8)<<2);
  873. # ARMv7 instructions are always encoded little-endian, so emit the
  874. # raw bytes; the correct solution is the .inst directive, but older
  875. # assemblers don't implement it :-(
  876. sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
  877. $word&0xff,($word>>8)&0xff,
  878. ($word>>16)&0xff,($word>>24)&0xff,
  879. $mnemonic,$arg;
  880. }
  881. };
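# For example (illustrative only), "aese q0,q1" encodes to the word
# 0xf3b00302 and is emitted as ".byte 0x02,0x03,0xb0,0xf3", i.e. the
# little-endian byte image of the instruction.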
  882. sub unvtbl {
  883. my $arg=shift;
  884. $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
  885. sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
  886. "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
  887. }
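# e.g. (illustrative) "vtbl.8 q3,{q0},q2" expands to the d-register pair
#	vtbl.8 d6,{q0},d4
#	vtbl.8 d7,{q0},d5
# since vtbl writes one d register at a time.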
  888. sub unvdup32 {
  889. my $arg=shift;
  890. $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
  891. sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
  892. }
  893. sub unvmov32 {
  894. my $arg=shift;
  895. $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
  896. sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
  897. }
  898. foreach(split("\n",$code)) {
  899. s/\`([^\`]*)\`/eval($1)/geo;
  900. s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
  901. s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
  902. s/\/\/\s?/@ /o; # new->old style commentary
  903. # fix up remaining new-style suffixes
  904. s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
  905. s/\],#[0-9]+/]!/o;
  906. s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
  907. s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
  908. s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
  909. s/vdup\.32\s+(.*)/unvdup32($1)/geo or
  910. s/vmov\.32\s+(.*)/unvmov32($1)/geo or
  911. s/^(\s+)b\./$1b/o or
  912. s/^(\s+)mov\./$1mov/o or
  913. s/^(\s+)ret/$1bx\tlr/o;
  914. print $_,"\n";
  915. }
  916. }
  917. close STDOUT or die "error closing STDOUT: $!";