sha1_avx2_x86_64_asm.S

/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number of
 * 'num_blocks' consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *	int *hash, const char* input, size_t num_blocks );
 */
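
/*
 * Illustrative caller sketch (not part of this file's build; everything
 * here except sha1_transform_avx2() itself is an assumption for
 * illustration). The routine consumes whole 64-byte blocks and, per the
 * note above, an even 'num_blocks'; a caller would normally route any
 * remainder to another SHA-1 implementation:
 *
 *	uint32_t digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *			       0x10325476, 0xc3d2e1f0 };  // standard SHA-1 IV
 *	// data points to num_blocks * 64 bytes of message
 *	sha1_transform_avx2((int *)digest, data, num_blocks);
 */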
#include <linux/linkage.h>

#define	CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

#define	REG_A	%ecx
#define	REG_B	%esi
#define	REG_C	%edi
#define	REG_D	%eax
#define	REG_E	%edx
#define	REG_TB	%ebx
#define	REG_TA	%r12d
#define	REG_RA	%rcx
#define	REG_RB	%rsi
#define	REG_RC	%rdi
#define	REG_RD	%rax
#define	REG_RE	%rdx
#define	REG_RTA	%r12
#define	REG_RTB	%rbx
#define	REG_T1	%ebp
#define	xmm_mov		vmovups
#define	avx2_zeroupper	vzeroupper
#define	RND_F1	1
#define	RND_F2	2
#define	RND_F3	3

.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_PTR2	%r13
#define BUFFER_END	%r11

#define PRECALC_BUF	%r14
#define WK_BUF		%r15

#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

# AVX2 variables
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15

#define YMM_SHUFB_BSWAP	%ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE		(80*2*2 +16)

#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)

#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
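
/*
 * Layout note with a small C model (illustrative only, not built): WK_BUF
 * holds the precomputed w[t]+K values for the two software-pipelined
 * blocks, interleaved so that one 32-byte ymm store covers 4 rounds of both
 * blocks: the low 128-bit lane belongs to the first block (t = 0..79), the
 * high lane to the second (t = 80..159). The byte offset WK(t) resolves to:
 *
 *	// hypothetical helper mirroring the WK() address arithmetic
 *	static inline unsigned wk_offset(unsigned t)
 *	{
 *		return ((t % 80) / 4) * 32	// 32-byte row per 4 rounds
 *		     + (t % 4) * 4		// dword within the row
 *		     + (t / 80) * 16;		// which 128-bit lane/block
 *	}
 *
 * PRECALC_WK(t) is the matching store-side offset ((t)*4 bytes), and the
 * two scratch areas swap roles (xchg WK_BUF, PRECALC_BUF) once per block
 * pair.
 */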

.macro UPDATE_HASH  hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
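
/*
 * In C terms (illustrative), UPDATE_HASH folds the working value into the
 * digest word in memory and keeps the new chaining value in the register
 * for the next pipelined block:
 *
 *	val   += *hash;
 *	*hash  = val;
 */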

.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate macros */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define register aliases */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm

.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	/* message scheduling pre-compute for rounds 0-15 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		vpaddd  K_XMM(K_BASE), WY, WY_TMP
	.elseif ((i & 7) == 7)
		vmovdqu  WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
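
/*
 * Scalar reference for what PRECALC_00_15 produces per block (illustrative;
 * get_unaligned_be32() here stands for the usual kernel helper and is not
 * used by this file):
 *
 *	for (t = 0; t < 16; t++) {
 *		w[t]  = get_unaligned_be32(block + 4*t); // byte-swapped load
 *		wk[t] = w[t] + K;                        // K selected by round
 *	}
 *
 * The vector code does this for two blocks at once: a 128-bit load per
 * block merged into one ymm register, vpshufb for the byte swap, vpaddd for
 * the +K bias, then a single 32-byte store into the WK scratch area.
 */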

.macro PRECALC_16_31
	/*
	 * message scheduling pre-compute for rounds 16-31
	 * calculating last 32 w[i] values in 8 XMM registers
	 * pre-calculate K+w[i] values and store to mem
	 * for later load by ALU add instruction
	 *
	 * "brute force" vectorization for rounds 16-31 only
	 * due to w[i]->w[i-3] dependency
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		/* w[i-14] */
		vpalignr	$8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP		/* w[i-3] */
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
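
/*
 * Scalar reference for the rounds 16-31 schedule (illustrative; rol32() as
 * in <linux/bitops.h>):
 *
 *	for (t = 16; t < 32; t++)
 *		w[t] = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
 *
 * Four schedule words are produced per 128-bit lane at a time, so the
 * w[t-3] input of the newest word is not yet available when the vector is
 * formed; the vpslldq/vpslld $2 steps above patch that lane afterwards
 * (rotated twice, because the fix-up is applied after the rotate-by-one).
 */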

.macro PRECALC_32_79
	/*
	 * in SHA-1 specification:
	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
	 * instead we use the equivalent:
	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
	 * which allows more efficient vectorization,
	 * since the w[i]=>w[i-3] dependency is broken
	 */
	.if   ((i & 7) == 0)
		/*
		 * blended AVX2 and ALU instruction scheduling
		 * 1 vector iteration per 8 rounds
		 */
		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* W is W_minus_32 before xor */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		vpaddd	K_XMM(K_BASE), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
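
/*
 * The equivalence used above, in scalar C (illustrative): for i >= 32,
 *
 *	w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 * can be rewritten as
 *
 *	w[i] = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *
 * by expanding each of the four terms one recurrence step further; the
 * duplicated terms cancel under xor. The nearest dependency is then w[i-6],
 * which always lives in an earlier 4-word group, so a whole vector of
 * schedule words can be computed without an intra-vector serial chain.
 */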

.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i<32)
		PRECALC_00_15	\s
	.elseif (i<64)
		PRECALC_16_31	\s
	.elseif (i < 160)
		PRECALC_32_79	\s
	.endif
.endm

.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm
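
/*
 * ROTATE_STATE performs the usual end-of-round renaming of the SHA-1
 * working variables, conceptually (illustrative):
 *
 *	// e = d; d = c; c = b >>> 2; b = a; a = temp;
 *
 * but it does so by rotating the assembler-time aliases (A, B, C, D, E, TB
 * and their 64-bit twins) rather than moving data, so no instructions are
 * spent on the shuffle; the b >>> 2 value itself is produced by rorx inside
 * the round macros and picked up through the TB alias.
 */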

/* Macro relies on saved ROUND_Fx */
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)	/* Precalculate F for first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b>>>2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if   (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm

.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1			/* ~b&d */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30),A, TB		/* b>>>2 for next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round
	 * (b & c) ^ andn[b, d]
	 */
	and	B, A			/* b&c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A >>> 5 */
.endm
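
/*
 * One F1 round in scalar C (illustrative; rounds 0-19):
 *
 *	f  = (b & c) ^ (~b & d);	// equals (b & c) | (~b & d); andn+and+xor
 *	e += rol32(a, 5) + f + wk[t];	// wk[t] already contains w[t] + K
 *	b  = rol32(b, 30);		// rorx, kept in TB for the next round
 *
 * Note the software pipelining: the macro adds the F value computed during
 * the previous round (via RTB) and spends this round computing F for the
 * next one, interleaved with the vector PRECALC work.
 */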

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	/* Calculate F for the next round */
	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* b>>>2 for next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A >>> 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
	rorx	$(32-30), A, TB		/* b>>>2 for next round */

	/*
	 * Calculate F for the next round
	 * (b and c) or (d and (b or c))
	 */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A >>> 5 */
.endm
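
/*
 * The remaining round functions in scalar C (illustrative):
 *
 *	f2 = b ^ c ^ d;			// ROUND_F2, rounds 20-39 and 60-79
 *	f3 = (b & c) | (d & (b | c));	// ROUND_F3, rounds 40-59 (majority)
 *	e += rol32(a, 5) + f + wk[t];
 *
 * Both follow the same one-round-ahead pattern as ROUND_F1: add the F value
 * carried over in RTB, then build the next round's F while rorx produces
 * a >>> 5 and b >>> 2.
 */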

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	# Precalc WK for first 2 blocks
	PRECALC_OFFSET = 0
	.set i, 0
	.rept    160
		PRECALC i
		.set i, i + 1
	.endr
	PRECALC_OFFSET = 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
_loop:
	/*
	 * code loops through more than one block
	 * we use K_BASE value as a signal of a last block,
	 * it is set below by: cmovae BUFFER_PTR, K_BASE
	 */
	cmp	K_BASE, BUFFER_PTR
	jne	_begin
	.align 32
	jmp	_end
	.align 32
_begin:

	/*
	 * Do first block
	 * rounds: 0,2,4,6,8
	 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp _loop0
_loop0:

	/*
	 * rounds:
	 * 10,12,14,16,18
	 * 20,22,24,26,28
	 * 30,32,34,36,38
	 * 40,42,44,46,48
	 * 50,52,54,56,58
	 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	add	$(2*64), BUFFER_PTR	/* move to next odd-64-byte block */
	cmp	BUFFER_END, BUFFER_PTR	/* is current block the last one? */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */

	/*
	 * rounds
	 * 60,62,64,66,68
	 * 70,72,74,76,78
	 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
	je	_loop

	mov	TB, B

	/* Process second block */
	/*
	 * rounds
	 *  0+80, 2+80, 4+80, 6+80, 8+80
	 * 10+80,12+80,14+80,16+80,18+80
	 */
	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop1
_loop1:
	/*
	 * rounds
	 * 20+80,22+80,24+80,26+80,28+80
	 * 30+80,32+80,34+80,36+80,38+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop2
_loop2:

	/*
	 * rounds
	 * 40+80,42+80,44+80,46+80,48+80
	 * 50+80,52+80,54+80,56+80,58+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	add	$(2*64), BUFFER_PTR2	/* move to next even-64-byte block */

	cmp	BUFFER_END, BUFFER_PTR2	/* is current block the last one */
	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */

	jmp	_loop3
_loop3:

	/*
	 * rounds
	 * 60+80,62+80,64+80,66+80,68+80
	 * 70+80,72+80,74+80,76+80,78+80
	 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Reset state for AVX2 reg permutation */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	xchg	WK_BUF, PRECALC_BUF

	jmp	_loop
	.align 32
_end:

.endm
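
/*
 * Rough C-like sketch of the control flow above (all names are illustrative,
 * not real functions): two blocks are processed per trip around _loop, and
 * while their rounds execute, the message schedule for the next pair is
 * precomputed into the second scratch buffer; the buffers then swap roles.
 *
 *	precalc_wk(buf_a, block[0], block[1]);		// before _loop
 *	for (;;) {
 *		if (reached_sentinel)			// BUFFER_PTR == K_BASE
 *			break;
 *		run_80_rounds(state, buf_a);		// odd block, fills buf_b
 *		update_hash(state);
 *		if (reached_sentinel)
 *			continue;			// re-checked at _loop
 *		run_80_rounds(state, buf_a + 80);	// even block, fills buf_b
 *		update_hash(state);
 *		swap(buf_a, buf_b);
 *	}
 */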

/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
	ENTRY(\name)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK  = (W_SIZE*4 + 8+24)

	/* Align stack */
	mov	%rsp, %rbx
	and	$~(0x20-1), %rsp
	push	%rbx
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	lea	K_XMM_AR(%rip), K_BASE

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR
	lea	64(BUF), BUFFER_PTR2

	shl	$6, CNT			/* mul by 64 */
	add	BUF, CNT
	add	$64, CNT
	mov	CNT, BUFFER_END

	cmp	BUFFER_END, BUFFER_PTR2
	cmovae	K_BASE, BUFFER_PTR2

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	add	$RESERVE_STACK, %rsp
	pop	%rsp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx

	ret

	ENDPROC(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4
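
/*
 * Each round constant is replicated eight times so that a single 32-byte
 * row can be added with vpaddd to 4 schedule words of both pipelined blocks
 * at once; K_XMM in the PRECALC macros selects the row (32*0 .. 32*3).
 * K_BASE, which points here, doubles as the "last block" sentinel value
 * compared against BUFFER_PTR in the main loop.
 */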

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
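
/*
 * The vpshufb control above reverses the byte order inside every dword of
 * both 128-bit lanes; in scalar C this is simply (illustrative):
 *
 *	w[t] = __builtin_bswap32(((const uint32_t *)block)[t]);
 *
 * so that the registers hold the message words in the big-endian order
 * SHA-1 specifies, even though the loads themselves are little-endian.
 */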

.text

SHA1_VECTOR_ASM     sha1_transform_avx2