sha1_mb_mgr_flush_avx2.S

/*
 * Flush routine for SHA1 multibuffer
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

.extern sha1_x8_avx2

# LINUX register definitions
#define arg1            %rdi
#define arg2            %rsi

# Common definitions
#define state           arg1
#define job             arg2
#define len2            arg2

# idx must be a register not clobbered by sha1_x8_avx2
#define idx             %r8
#define DWORD_idx       %r8d

#define unused_lanes    %rbx
#define lane_data       %rbx
#define tmp2            %rbx
#define tmp2_w          %ebx

#define job_rax         %rax
#define tmp1            %rax
#define size_offset     %rax
#define tmp             %rax
#define start_offset    %rax
#define tmp3            arg1

#define extra_blocks    arg2
#define p               arg2
# STACK_SPACE needs to be an odd multiple of 8
_XMM_SAVE_SIZE  = 10*16
_GPR_SAVE_SIZE  = 8*8
_ALIGN_SIZE     = 8

_XMM_SAVE       = 0
_GPR_SAVE       = _XMM_SAVE + _XMM_SAVE_SIZE
STACK_SPACE     = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE

.macro LABEL prefix n
\prefix\n\():
.endm

.macro JNE_SKIP i
        jne     skip_\i
.endm

.altmacro
.macro SET_OFFSET _offset
        offset = \_offset
.endm
.noaltmacro

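# Flush forces completion of a pending job: pick any lane that still holds
# a job, point every empty lane at that lane's data (with length 0xFFFFFFFF
# so it never wins the minimum-length search), run sha1_x8_avx2 for the
# shortest remaining length (the call is skipped if a lane already holds a
# finished job), and return the job that completed. Returns NULL when no
# lane holds a job.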
# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rdi : state
ENTRY(sha1_mb_mgr_flush_avx2)
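        # Keep the caller's %rsp in %r10, carve out the frame, and align it
        # to 32 bytes; the original %rsp is stored in the frame and restored
        # on return.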
        mov     %rsp, %r10
        sub     $STACK_SPACE, %rsp
        and     $~31, %rsp
        mov     %rbx, _GPR_SAVE(%rsp)
        mov     %r10, _GPR_SAVE+8*1(%rsp) #save rsp
        mov     %rbp, _GPR_SAVE+8*3(%rsp)
        mov     %r12, _GPR_SAVE+8*4(%rsp)
        mov     %r13, _GPR_SAVE+8*5(%rsp)
        mov     %r14, _GPR_SAVE+8*6(%rsp)
        mov     %r15, _GPR_SAVE+8*7(%rsp)
        # If bit (32+3) is set, then all lanes are empty
        mov     _unused_lanes(state), unused_lanes
        bt      $32+3, unused_lanes
        jc      return_null

        # find a lane with a non-null job
        xor     idx, idx
        offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  one(%rip), idx
        offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  two(%rip), idx
        offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  three(%rip), idx
        offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  four(%rip), idx
        offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  five(%rip), idx
        offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  six(%rip), idx
        offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
        cmovne  seven(%rip), idx

        # copy idx to empty lanes
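        # Empty lanes get the chosen lane's data pointer and a length of
        # 0xFFFFFFFF so they can never win the minimum-length search below;
        # this lets sha1_x8_avx2 safely process all eight lanes.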
copy_lane_data:
        offset = (_args + _data_ptr)
        mov     offset(state, idx, 8), tmp

        I = 0
.rep 8
        offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
        cmpq    $0, offset(state)
.altmacro
        JNE_SKIP %I
        offset = (_args + _data_ptr + 8*I)
        mov     tmp, offset(state)
        offset = (_lens + 4*I)
        movl    $0xFFFFFFFF, offset(state)
LABEL skip_ %I
        I = (I+1)
.noaltmacro
.endr
        # Find min length
        vmovdqa _lens+0*16(state), %xmm0
        vmovdqa _lens+1*16(state), %xmm1

        vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
        vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
        vpalignr $4, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,x,E}
        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword

        vmovd   %xmm2, DWORD_idx
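        # The low nibble of each lens entry holds the lane index and the
        # upper bits hold the length, so the minimum dword identifies both
        # the shortest job and the lane it lives in.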
        mov     idx, len2
        and     $0xF, idx
        shr     $4, len2
        jz      len_is_0
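        # Clear the lane-index nibble from the minimum, broadcast it to all
        # eight dwords, and subtract it from every lane's length.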
        vpand   clear_low_nibble(%rip), %xmm2, %xmm2
        vpshufd $0, %xmm2, %xmm2

        vpsubd  %xmm2, %xmm0, %xmm0
        vpsubd  %xmm2, %xmm1, %xmm1

        vmovdqa %xmm0, _lens+0*16(state)
        vmovdqa %xmm1, _lens+1*16(state)

        # "state" and "args" are the same address, arg1
        # len is arg2
        call    sha1_x8_avx2
        # state and idx are intact
len_is_0:
        # process completed job "idx"
        imul    $_LANE_DATA_size, idx, lane_data
        lea     _ldata(state, lane_data), lane_data

        mov     _job_in_lane(lane_data), job_rax
        movq    $0, _job_in_lane(lane_data)
        movl    $STS_COMPLETED, _status(job_rax)
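        # Push the freed lane back onto the unused_lanes nibble stack and
        # park its length at 0xFFFFFFFF so it stays out of future
        # minimum-length searches.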
        mov     _unused_lanes(state), unused_lanes
        shl     $4, unused_lanes
        or      idx, unused_lanes
        mov     unused_lanes, _unused_lanes(state)

        movl    $0xFFFFFFFF, _lens(state, idx, 4)
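        # The digest is kept transposed in _args_digest (one row of eight
        # lanes per SHA1 word, 32 bytes apart), so the five words of lane
        # "idx" are gathered dword by dword before being stored to the job.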
        vmovd   _args_digest(state, idx, 4), %xmm0
        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
        movl    _args_digest+4*32(state, idx, 4), tmp2_w

        vmovdqu %xmm0, _result_digest(job_rax)
        offset = (_result_digest + 1*16)
        mov     tmp2_w, offset(job_rax)
return:
        mov     _GPR_SAVE(%rsp), %rbx
        mov     _GPR_SAVE+8*1(%rsp), %r10 #saved rsp
        mov     _GPR_SAVE+8*3(%rsp), %rbp
        mov     _GPR_SAVE+8*4(%rsp), %r12
        mov     _GPR_SAVE+8*5(%rsp), %r13
        mov     _GPR_SAVE+8*6(%rsp), %r14
        mov     _GPR_SAVE+8*7(%rsp), %r15
        mov     %r10, %rsp

        ret

return_null:
        xor     job_rax, job_rax
        jmp     return
ENDPROC(sha1_mb_mgr_flush_avx2)
#################################################################

.align 16
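# JOB* sha1_mb_mgr_get_comp_job_avx2(MB_MGR *state)
# Return a job that has already completed (remaining length 0) without
# doing any hashing; returns NULL when nothing is ready.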
ENTRY(sha1_mb_mgr_get_comp_job_avx2)
        push    %rbx

        ## if bit 32+3 is set, then all lanes are empty
        mov     _unused_lanes(state), unused_lanes
        bt      $(32+3), unused_lanes
        jc      .return_null

        # Find min length
        vmovdqa _lens(state), %xmm0
        vmovdqa _lens+1*16(state), %xmm1

        vpminud %xmm1, %xmm0, %xmm2        # xmm2 has {D,C,B,A}
        vpalignr $8, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,D,C}
        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has {x,x,E,F}
        vpalignr $4, %xmm2, %xmm3, %xmm3   # xmm3 has {x,x,x,E}
        vpminud %xmm3, %xmm2, %xmm2        # xmm2 has min value in low dword

        vmovd   %xmm2, DWORD_idx
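        # A completed job has length 0, so if any bit above the lane-index
        # nibble is set the minimum length is non-zero and nothing has
        # finished yet.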
        test    $~0xF, idx
        jnz     .return_null

        # process completed job "idx"
        imul    $_LANE_DATA_size, idx, lane_data
        lea     _ldata(state, lane_data), lane_data

        mov     _job_in_lane(lane_data), job_rax
        movq    $0, _job_in_lane(lane_data)
        movl    $STS_COMPLETED, _status(job_rax)
        mov     _unused_lanes(state), unused_lanes
        shl     $4, unused_lanes
        or      idx, unused_lanes
        mov     unused_lanes, _unused_lanes(state)

        movl    $0xFFFFFFFF, _lens(state, idx, 4)

        vmovd   _args_digest(state, idx, 4), %xmm0
        vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
        vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
        movl    _args_digest+4*32(state, idx, 4), tmp2_w

        vmovdqu %xmm0, _result_digest(job_rax)
        movl    tmp2_w, _result_digest+1*16(job_rax)

        pop     %rbx

        ret

.return_null:
        xor     job_rax, job_rax
        pop     %rbx
        ret
ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
.data

.align 16
clear_low_nibble:
.octa   0x000000000000000000000000FFFFFFF0
one:
.quad   1
two:
.quad   2
three:
.quad   3
four:
.quad   4
five:
.quad   5
six:
.quad   6
seven:
.quad   7