/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute oct SHA1 using AVX2
## outer calling routine takes care of save and restore of YMM registers
## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves:   rdi rbp r8
##
## clobbers ymm0-15

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
        # process top half (r0..r3) {a...d}
        vshufps $0x44, \r1, \r0, \t0    # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
        vshufps $0xEE, \r1, \r0, \r0    # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
        vshufps $0x44, \r3, \r2, \t1    # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
        vshufps $0xEE, \r3, \r2, \r2    # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
        vshufps $0xDD, \t1, \t0, \r3    # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
        vshufps $0x88, \r2, \r0, \r1    # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
        vshufps $0xDD, \r2, \r0, \r0    # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
        vshufps $0x88, \t1, \t0, \t0    # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

        # use r2 in place of t0
        # process bottom half (r4..r7) {e...h}
        vshufps $0x44, \r5, \r4, \r2    # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
        vshufps $0xEE, \r5, \r4, \r4    # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
        vshufps $0x44, \r7, \r6, \t1    # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
        vshufps $0xEE, \r7, \r6, \r6    # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
        vshufps $0xDD, \t1, \r2, \r7    # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
        vshufps $0x88, \r6, \r4, \r5    # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
        vshufps $0xDD, \r6, \r4, \r4    # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
        vshufps $0x88, \t1, \r2, \t1    # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}

        vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
        vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
        vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
        vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
        vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
        vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
        vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
        vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
.endm
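
# For reference, a minimal C sketch of the 8x8 dword transpose that
# TRANSPOSE8 performs (hypothetical helper, illustration only; the macro
# achieves the same result with vshufps/vperm2f128 rather than scalar
# moves):
#
#       /* assumes <stdint.h> and <string.h> */
#       void transpose8x8(uint32_t r[8][8])
#       {
#               uint32_t out[8][8];
#               int i, j;
#
#               for (i = 0; i < 8; i++)
#                       for (j = 0; j < 8; j++)
#                               out[i][j] = r[j][i]; /* row i gathers word i of every lane */
#               memcpy(r, out, sizeof(out));
#       }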

##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T  ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
        vpxor \regD, \regC, \regF
        vpand \regB, \regF, \regF
        vpxor \regD, \regF, \regF
.endm

# macro MAGIC_F1 F,B,C,D,T  ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
        vpxor \regC, \regD, \regF
        vpxor \regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T  ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
        vpor  \regC, \regB, \regF
        vpand \regC, \regB, \regT
        vpand \regD, \regF, \regF
        vpor  \regT, \regF, \regF
.endm

# macro MAGIC_F3 F,B,C,D,T  ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
        MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
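
# The corresponding scalar round functions from FIPS 180-1, as a C
# reference (illustration only). MAGIC_F0 uses the standard identity
# ch(b,c,d) = d ^ (b & (c ^ d)) to save one operation, and MAGIC_F2
# computes maj(b,c,d) as ((b | c) & d) | (b & c):
#
#       uint32_t f0(uint32_t b, uint32_t c, uint32_t d) /* rounds  0-19 */
#       { return d ^ (b & (c ^ d)); }   /* == (b & c) | (~b & d) */
#
#       uint32_t f1(uint32_t b, uint32_t c, uint32_t d) /* rounds 20-39 */
#       { return b ^ c ^ d; }
#
#       uint32_t f2(uint32_t b, uint32_t c, uint32_t d) /* rounds 40-59 */
#       { return (b & c) | (b & d) | (c & d); }
#
#       uint32_t f3(uint32_t b, uint32_t c, uint32_t d) /* rounds 60-79 */
#       { return b ^ c ^ d; }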

# PROLD reg, imm, tmp: rotate reg left by imm bits
.macro PROLD reg imm tmp
        vpsrld $(32-\imm), \reg, \tmp
        vpslld $\imm, \reg, \reg
        vpor   \tmp, \reg, \reg
.endm

# PROLD_nd reg, imm, tmp, src: non-destructive form, reg = src rotated
# left by imm bits, src left intact
.macro PROLD_nd reg imm tmp src
        vpsrld $(32-\imm), \src, \tmp
        vpslld $\imm, \src, \reg
        vpor   \tmp, \reg, \reg
.endm
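
# In C terms, both macros implement a 32-bit rotate-left (illustration):
#
#       #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))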

.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
        vpaddd  \immCNT, \regE, \regE
        vpaddd  \memW*32(%rsp), \regE, \regE
        PROLD_nd \regT, 5, \regF, \regA
        vpaddd  \regT, \regE, \regE
        \MAGIC  \regF, \regB, \regC, \regD, \regT
        PROLD   \regB, 30, \regT
        vpaddd  \regF, \regE, \regE
.endm
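
# Scalar equivalent of one round, per 32-bit lane (before the register
# renaming done by ROTATE_ARGS below):
#
#       E += K + W[i] + ROTL32(A, 5) + f(B, C, D);
#       B  = ROTL32(B, 30);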

.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
        vpaddd  \immCNT, \regE, \regE

        offset = ((\memW - 14) & 15) * 32
        vmovdqu offset(%rsp), W14
        vpxor   W14, W16, W16
        offset = ((\memW - 8) & 15) * 32
        vpxor   offset(%rsp), W16, W16
        offset = ((\memW - 3) & 15) * 32
        vpxor   offset(%rsp), W16, W16

        vpsrld  $(32-1), W16, \regF
        vpslld  $1, W16, W16
        vpor    W16, \regF, \regF

        ROTATE_W

        offset = ((\memW - 0) & 15) * 32
        vmovdqu \regF, offset(%rsp)
        vpaddd  \regF, \regE, \regE
        PROLD_nd \regT, 5, \regF, \regA
        vpaddd  \regT, \regE, \regE
        \MAGIC  \regF,\regB,\regC,\regD,\regT   ## FUN = MAGIC_Fi(B,C,D)
        PROLD   \regB,30, \regT
        vpaddd  \regF, \regE, \regE
.endm
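
# Rounds 16-79 first extend the message schedule in place (a 16-entry
# circular buffer of 32-byte slots on the stack; W16 and W15 are carried
# in registers across rounds), per FIPS 180-1:
#
#       W[i & 15] = ROTL32(W[(i-3) & 15] ^ W[(i-8) & 15] ^
#                          W[(i-14) & 15] ^ W[(i-16) & 15], 1);
#
# and then perform the same round computation as SHA1_STEP_00_15.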

########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE
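
# The frame holds the 16-entry message schedule, one 32-byte YMM slot per
# entry (16 * 32 = 512 bytes); YMM_SAVE is zero here because the outer
# calling routine preserves the YMM registers.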

#define VMOVPS vmovups

IDX  = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx

# ymm0  A
# ymm1  B
# ymm2  C
# ymm3  D
# ymm4  E
# ymm5  F       AA
# ymm6  T0      BB
# ymm7  T1      CC
# ymm8  T2      DD
# ymm9  T3      EE
# ymm10 T4      TMP
# ymm11 T5      FUN
# ymm12 T6      K
# ymm13 T7      W14
# ymm14 T8      W15
# ymm15 T9      W16

A  = %ymm0
B  = %ymm1
C  = %ymm2
D  = %ymm3
E  = %ymm4
F  = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15

AA  = %ymm5
BB  = %ymm6
CC  = %ymm7
DD  = %ymm8
EE  = %ymm9
TMP = %ymm10
FUN = %ymm11
K   = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15

.macro ROTATE_ARGS
TMP_ = E
E = D
D = C
C = B
B = A
A = TMP_
.endm

.macro ROTATE_W
TMP_ = W16
W16  = W15
W15  = W14
W14  = TMP_
.endm
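
# ROTATE_ARGS and ROTATE_W rename assembler symbols between rounds rather
# than moving data: after each round the new E is the register that held D,
# and the schedule registers W14/W15/W16 advance the same way.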

# 8 streams x 5 32bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)

.align 32

# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
# arg 1 : pointer to array[8] of pointers to input data
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
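# In practice arg 1 addresses the SHA1 multi-buffer args layout from
# sha1_mb_mgr_datastruct.S: 5 rows of 8 lane-interleaved digest dwords at
# offset 0, followed by the 8 input pointers at offset _data_ptr. A hedged
# C view (field names illustrative):
#
#       struct sha1_x8_args {
#               uint32_t digest[5][8];   /* A..E, one dword per lane   */
#               uint8_t *data_ptr[8];    /* one input pointer per lane */
#       };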
ENTRY(sha1_x8_avx2)
        push    RSP_SAVE
        # save rsp
        mov     %rsp, RSP_SAVE
        sub     $FRAMESZ, %rsp
        # align rsp to 32 Bytes
        and     $~0x1F, %rsp

        ## Initialize digests
        vmovdqu 0*32(arg1), A
        vmovdqu 1*32(arg1), B
        vmovdqu 2*32(arg1), C
        vmovdqu 3*32(arg1), D
        vmovdqu 4*32(arg1), E

        ## transpose input onto stack
        mov     _data_ptr+0*8(arg1), inp0
        mov     _data_ptr+1*8(arg1), inp1
        mov     _data_ptr+2*8(arg1), inp2
        mov     _data_ptr+3*8(arg1), inp3
        mov     _data_ptr+4*8(arg1), inp4
        mov     _data_ptr+5*8(arg1), inp5
        mov     _data_ptr+6*8(arg1), inp6
        mov     _data_ptr+7*8(arg1), inp7

        xor     IDX, IDX
lloop:
        vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
I = 0
.rep 2
        VMOVPS  (inp0, IDX), T0
        VMOVPS  (inp1, IDX), T1
        VMOVPS  (inp2, IDX), T2
        VMOVPS  (inp3, IDX), T3
        VMOVPS  (inp4, IDX), T4
        VMOVPS  (inp5, IDX), T5
        VMOVPS  (inp6, IDX), T6
        VMOVPS  (inp7, IDX), T7

        TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
        vpshufb F, T0, T0
        vmovdqu T0, (I*8)*32(%rsp)
        vpshufb F, T1, T1
        vmovdqu T1, (I*8+1)*32(%rsp)
        vpshufb F, T2, T2
        vmovdqu T2, (I*8+2)*32(%rsp)
        vpshufb F, T3, T3
        vmovdqu T3, (I*8+3)*32(%rsp)
        vpshufb F, T4, T4
        vmovdqu T4, (I*8+4)*32(%rsp)
        vpshufb F, T5, T5
        vmovdqu T5, (I*8+5)*32(%rsp)
        vpshufb F, T6, T6
        vmovdqu T6, (I*8+6)*32(%rsp)
        vpshufb F, T7, T7
        vmovdqu T7, (I*8+7)*32(%rsp)

        add     $32, IDX
I = (I+1)
.endr
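
        # After the two iterations above, the 64-byte block from each of
        # the 8 lanes has been byte-swapped and interleaved into the 16
        # message-schedule slots W[0..15] on the stack, one 32-byte row
        # (8 lanes x 1 dword) per message word.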

        # save old digests
        vmovdqu A, AA
        vmovdqu B, BB
        vmovdqu C, CC
        vmovdqu D, DD
        vmovdqu E, EE

##
## perform 0-79 steps
##
        vmovdqu K00_19(%rip), K

## do rounds 0...15
I = 0
.rep 16
        SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
I = (I+1)
.endr

## do rounds 16...19
        vmovdqu ((16 - 16) & 15) * 32(%rsp), W16
        vmovdqu ((16 - 15) & 15) * 32(%rsp), W15
.rep 4
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
I = (I+1)
.endr

## do rounds 20...39
        vmovdqu K20_39(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
        ROTATE_ARGS
I = (I+1)
.endr

## do rounds 40...59
        vmovdqu K40_59(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
        ROTATE_ARGS
I = (I+1)
.endr

## do rounds 60...79
        vmovdqu K60_79(%rip), K
.rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
        ROTATE_ARGS
I = (I+1)
.endr

        # add old digests to new ones
        vpaddd  AA, A, A
        vpaddd  BB, B, B
        vpaddd  CC, C, C
        vpaddd  DD, D, D
        vpaddd  EE, E, E

        sub     $1, arg2
        jne     lloop

        # write out digests
        vmovdqu A, 0*32(arg1)
        vmovdqu B, 1*32(arg1)
        vmovdqu C, 2*32(arg1)
        vmovdqu D, 3*32(arg1)
        vmovdqu E, 4*32(arg1)

        # update input pointers
        add     IDX, inp0
        add     IDX, inp1
        add     IDX, inp2
        add     IDX, inp3
        add     IDX, inp4
        add     IDX, inp5
        add     IDX, inp6
        add     IDX, inp7
        mov     inp0, _data_ptr + 0*8(arg1)
        mov     inp1, _data_ptr + 1*8(arg1)
        mov     inp2, _data_ptr + 2*8(arg1)
        mov     inp3, _data_ptr + 3*8(arg1)
        mov     inp4, _data_ptr + 4*8(arg1)
        mov     inp5, _data_ptr + 5*8(arg1)
        mov     inp6, _data_ptr + 6*8(arg1)
        mov     inp7, _data_ptr + 7*8(arg1)

        ################
        ## Postamble
        mov     RSP_SAVE, %rsp
        pop     RSP_SAVE
        ret
ENDPROC(sha1_x8_avx2)

.data

.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6

PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
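
# PSHUFFLE_BYTE_FLIP_MASK reverses the byte order within each 32-bit word
# (in both 128-bit lanes of the YMM register), converting the little-endian
# loads above to the big-endian word order SHA-1 operates on.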