
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */
  10. #include <linux/linkage.h>
  11. #include <asm/assembler.h>
  12. .syntax unified
  13. .code 32
  14. .fpu neon
  15. .text
  16. /* Context structure */
  17. #define state_h0 0
  18. #define state_h1 4
  19. #define state_h2 8
  20. #define state_h3 12
  21. #define state_h4 16
  22. /* Constants */
  23. #define K1 0x5A827999
  24. #define K2 0x6ED9EBA1
  25. #define K3 0x8F1BBCDC
  26. #define K4 0xCA62C1D6
  27. .align 4
  28. .LK_VEC:
  29. .LK1: .long K1, K1, K1, K1
  30. .LK2: .long K2, K2, K2, K2
  31. .LK3: .long K3, K3, K3, K3
  32. .LK4: .long K4, K4, K4, K4
  33. /* Register macros */
  34. #define RSTATE r0
  35. #define RDATA r1
  36. #define RNBLKS r2
  37. #define ROLDSTACK r3
  38. #define RWK lr
  39. #define _a r4
  40. #define _b r5
  41. #define _c r6
  42. #define _d r7
  43. #define _e r8
  44. #define RT0 r9
  45. #define RT1 r10
  46. #define RT2 r11
  47. #define RT3 r12
  48. #define W0 q0
  49. #define W1 q7
  50. #define W2 q2
  51. #define W3 q3
  52. #define W4 q4
  53. #define W5 q6
  54. #define W6 q5
  55. #define W7 q1
  56. #define tmp0 q8
  57. #define tmp1 q9
  58. #define tmp2 q10
  59. #define tmp3 q11
  60. #define qK1 q12
  61. #define qK2 q13
  62. #define qK3 q14
  63. #define qK4 q15
  64. #ifdef CONFIG_CPU_BIG_ENDIAN
  65. #define ARM_LE(code...)
  66. #else
  67. #define ARM_LE(code...) code
  68. #endif
  69. /* Round function macros. */
  70. #define WK_offs(i) (((i) & 15) * 4)
  71. #define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  72. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  73. ldr RT3, [sp, WK_offs(i)]; \
  74. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  75. bic RT0, d, b; \
  76. add e, e, a, ror #(32 - 5); \
  77. and RT1, c, b; \
  78. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  79. add RT0, RT0, RT3; \
  80. add e, e, RT1; \
  81. ror b, #(32 - 30); \
  82. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  83. add e, e, RT0;
  84. #define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  85. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  86. ldr RT3, [sp, WK_offs(i)]; \
  87. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  88. eor RT0, d, b; \
  89. add e, e, a, ror #(32 - 5); \
  90. eor RT0, RT0, c; \
  91. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  92. add e, e, RT3; \
  93. ror b, #(32 - 30); \
  94. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  95. add e, e, RT0; \
  96. #define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  97. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  98. ldr RT3, [sp, WK_offs(i)]; \
  99. pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  100. eor RT0, b, c; \
  101. and RT1, b, c; \
  102. add e, e, a, ror #(32 - 5); \
  103. pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  104. and RT0, RT0, d; \
  105. add RT1, RT1, RT3; \
  106. add e, e, RT0; \
  107. ror b, #(32 - 30); \
  108. pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
  109. add e, e, RT1;
  110. #define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  111. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  112. _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  113. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  114. #define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
  115. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  116. _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
  117. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  118. #define R(a,b,c,d,e,f,i) \
  119. _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
  120. W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
  121. #define dummy(...)
  122. /* Input expansion macros. */
  123. /********* Precalc macros for rounds 0-15 *************************************/
  124. #define W_PRECALC_00_15() \
  125. add RWK, sp, #(WK_offs(0)); \
  126. \
  127. vld1.32 {W0, W7}, [RDATA]!; \
  128. ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
  129. vld1.32 {W6, W5}, [RDATA]!; \
  130. vadd.u32 tmp0, W0, curK; \
  131. ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
  132. ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
  133. vadd.u32 tmp1, W7, curK; \
  134. ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
  135. vadd.u32 tmp2, W6, curK; \
  136. vst1.32 {tmp0, tmp1}, [RWK]!; \
  137. vadd.u32 tmp3, W5, curK; \
  138. vst1.32 {tmp2, tmp3}, [RWK]; \
  139. #define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  140. vld1.32 {W0, W7}, [RDATA]!; \
  141. #define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  142. add RWK, sp, #(WK_offs(0)); \
  143. #define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  144. ARM_LE(vrev32.8 W0, W0; ) /* big => little */ \
  145. #define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  146. vld1.32 {W6, W5}, [RDATA]!; \
  147. #define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  148. vadd.u32 tmp0, W0, curK; \
  149. #define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  150. ARM_LE(vrev32.8 W7, W7; ) /* big => little */ \
  151. #define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  152. ARM_LE(vrev32.8 W6, W6; ) /* big => little */ \
  153. #define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  154. vadd.u32 tmp1, W7, curK; \
  155. #define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  156. ARM_LE(vrev32.8 W5, W5; ) /* big => little */ \
  157. #define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  158. vadd.u32 tmp2, W6, curK; \
  159. #define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  160. vst1.32 {tmp0, tmp1}, [RWK]!; \
  161. #define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  162. vadd.u32 tmp3, W5, curK; \
  163. #define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  164. vst1.32 {tmp2, tmp3}, [RWK]; \
  165. /********* Precalc macros for rounds 16-31 ************************************/
  166. #define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  167. veor tmp0, tmp0; \
  168. vext.8 W, W_m16, W_m12, #8; \
  169. #define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  170. add RWK, sp, #(WK_offs(i)); \
  171. vext.8 tmp0, W_m04, tmp0, #4; \
  172. #define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  173. veor tmp0, tmp0, W_m16; \
  174. veor.32 W, W, W_m08; \
  175. #define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  176. veor tmp1, tmp1; \
  177. veor W, W, tmp0; \
  178. #define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  179. vshl.u32 tmp0, W, #1; \
  180. #define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  181. vext.8 tmp1, tmp1, W, #(16-12); \
  182. vshr.u32 W, W, #31; \
  183. #define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  184. vorr tmp0, tmp0, W; \
  185. vshr.u32 W, tmp1, #30; \
  186. #define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  187. vshl.u32 tmp1, tmp1, #2; \
  188. #define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  189. veor tmp0, tmp0, W; \
  190. #define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  191. veor W, tmp0, tmp1; \
  192. #define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  193. vadd.u32 tmp0, W, curK; \
  194. #define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  195. vst1.32 {tmp0}, [RWK];
  196. /********* Precalc macros for rounds 32-79 ************************************/
  197. #define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  198. veor W, W_m28; \
  199. #define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  200. vext.8 tmp0, W_m08, W_m04, #8; \
  201. #define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  202. veor W, W_m16; \
  203. #define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  204. veor W, tmp0; \
  205. #define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  206. add RWK, sp, #(WK_offs(i&~3)); \
  207. #define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  208. vshl.u32 tmp1, W, #2; \
  209. #define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  210. vshr.u32 tmp0, W, #30; \
  211. #define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  212. vorr W, tmp0, tmp1; \
  213. #define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  214. vadd.u32 tmp0, W, curK; \
  215. #define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
  216. vst1.32 {tmp0}, [RWK];
  217. /*
  218. * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
  219. *
  220. * unsigned int
  221. * sha1_transform_neon (void *ctx, const unsigned char *data,
  222. * unsigned int nblks)
  223. */
  224. .align 3
  225. ENTRY(sha1_transform_neon)
  226. /* input:
  227. * r0: ctx, CTX
  228. * r1: data (64*nblks bytes)
  229. * r2: nblks
  230. */
  231. cmp RNBLKS, #0;
  232. beq .Ldo_nothing;
  233. push {r4-r12, lr};
  234. /*vpush {q4-q7};*/
  235. adr RT3, .LK_VEC;
  236. mov ROLDSTACK, sp;
  237. /* Align stack. */
  238. sub RT0, sp, #(16*4);
  239. and RT0, #(~(16-1));
  240. mov sp, RT0;
  241. vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
  242. /* Get the values of the chaining variables. */
  243. ldm RSTATE, {_a-_e};
  244. vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
  245. #undef curK
  246. #define curK qK1
  247. /* Precalc 0-15. */
  248. W_PRECALC_00_15();
  249. .Loop:
  250. /* Transform 0-15 + Precalc 16-31. */
  251. _R( _a, _b, _c, _d, _e, F1, 0,
  252. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
  253. W4, W5, W6, W7, W0, _, _, _ );
  254. _R( _e, _a, _b, _c, _d, F1, 1,
  255. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
  256. W4, W5, W6, W7, W0, _, _, _ );
  257. _R( _d, _e, _a, _b, _c, F1, 2,
  258. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
  259. W4, W5, W6, W7, W0, _, _, _ );
  260. _R( _c, _d, _e, _a, _b, F1, 3,
  261. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
  262. W4, W5, W6, W7, W0, _, _, _ );
  263. #undef curK
  264. #define curK qK2
  265. _R( _b, _c, _d, _e, _a, F1, 4,
  266. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
  267. W3, W4, W5, W6, W7, _, _, _ );
  268. _R( _a, _b, _c, _d, _e, F1, 5,
  269. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
  270. W3, W4, W5, W6, W7, _, _, _ );
  271. _R( _e, _a, _b, _c, _d, F1, 6,
  272. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
  273. W3, W4, W5, W6, W7, _, _, _ );
  274. _R( _d, _e, _a, _b, _c, F1, 7,
  275. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
  276. W3, W4, W5, W6, W7, _, _, _ );
  277. _R( _c, _d, _e, _a, _b, F1, 8,
  278. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
  279. W2, W3, W4, W5, W6, _, _, _ );
  280. _R( _b, _c, _d, _e, _a, F1, 9,
  281. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
  282. W2, W3, W4, W5, W6, _, _, _ );
  283. _R( _a, _b, _c, _d, _e, F1, 10,
  284. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
  285. W2, W3, W4, W5, W6, _, _, _ );
  286. _R( _e, _a, _b, _c, _d, F1, 11,
  287. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
  288. W2, W3, W4, W5, W6, _, _, _ );
  289. _R( _d, _e, _a, _b, _c, F1, 12,
  290. WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
  291. W1, W2, W3, W4, W5, _, _, _ );
  292. _R( _c, _d, _e, _a, _b, F1, 13,
  293. WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
  294. W1, W2, W3, W4, W5, _, _, _ );
  295. _R( _b, _c, _d, _e, _a, F1, 14,
  296. WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
  297. W1, W2, W3, W4, W5, _, _, _ );
  298. _R( _a, _b, _c, _d, _e, F1, 15,
  299. WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
  300. W1, W2, W3, W4, W5, _, _, _ );
  301. /* Transform 16-63 + Precalc 32-79. */
  302. _R( _e, _a, _b, _c, _d, F1, 16,
  303. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
  304. W0, W1, W2, W3, W4, W5, W6, W7);
  305. _R( _d, _e, _a, _b, _c, F1, 17,
  306. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
  307. W0, W1, W2, W3, W4, W5, W6, W7);
  308. _R( _c, _d, _e, _a, _b, F1, 18,
  309. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 32,
  310. W0, W1, W2, W3, W4, W5, W6, W7);
  311. _R( _b, _c, _d, _e, _a, F1, 19,
  312. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32,
  313. W0, W1, W2, W3, W4, W5, W6, W7);
  314. _R( _a, _b, _c, _d, _e, F2, 20,
  315. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
  316. W7, W0, W1, W2, W3, W4, W5, W6);
  317. _R( _e, _a, _b, _c, _d, F2, 21,
  318. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
  319. W7, W0, W1, W2, W3, W4, W5, W6);
  320. _R( _d, _e, _a, _b, _c, F2, 22,
  321. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 36,
  322. W7, W0, W1, W2, W3, W4, W5, W6);
  323. _R( _c, _d, _e, _a, _b, F2, 23,
  324. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36,
  325. W7, W0, W1, W2, W3, W4, W5, W6);
  326. #undef curK
  327. #define curK qK3
  328. _R( _b, _c, _d, _e, _a, F2, 24,
  329. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
  330. W6, W7, W0, W1, W2, W3, W4, W5);
  331. _R( _a, _b, _c, _d, _e, F2, 25,
  332. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
  333. W6, W7, W0, W1, W2, W3, W4, W5);
  334. _R( _e, _a, _b, _c, _d, F2, 26,
  335. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 40,
  336. W6, W7, W0, W1, W2, W3, W4, W5);
  337. _R( _d, _e, _a, _b, _c, F2, 27,
  338. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40,
  339. W6, W7, W0, W1, W2, W3, W4, W5);
  340. _R( _c, _d, _e, _a, _b, F2, 28,
  341. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
  342. W5, W6, W7, W0, W1, W2, W3, W4);
  343. _R( _b, _c, _d, _e, _a, F2, 29,
  344. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
  345. W5, W6, W7, W0, W1, W2, W3, W4);
  346. _R( _a, _b, _c, _d, _e, F2, 30,
  347. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 44,
  348. W5, W6, W7, W0, W1, W2, W3, W4);
  349. _R( _e, _a, _b, _c, _d, F2, 31,
  350. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44,
  351. W5, W6, W7, W0, W1, W2, W3, W4);
  352. _R( _d, _e, _a, _b, _c, F2, 32,
  353. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
  354. W4, W5, W6, W7, W0, W1, W2, W3);
  355. _R( _c, _d, _e, _a, _b, F2, 33,
  356. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
  357. W4, W5, W6, W7, W0, W1, W2, W3);
  358. _R( _b, _c, _d, _e, _a, F2, 34,
  359. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 48,
  360. W4, W5, W6, W7, W0, W1, W2, W3);
  361. _R( _a, _b, _c, _d, _e, F2, 35,
  362. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48,
  363. W4, W5, W6, W7, W0, W1, W2, W3);
  364. _R( _e, _a, _b, _c, _d, F2, 36,
  365. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
  366. W3, W4, W5, W6, W7, W0, W1, W2);
  367. _R( _d, _e, _a, _b, _c, F2, 37,
  368. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
  369. W3, W4, W5, W6, W7, W0, W1, W2);
  370. _R( _c, _d, _e, _a, _b, F2, 38,
  371. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 52,
  372. W3, W4, W5, W6, W7, W0, W1, W2);
  373. _R( _b, _c, _d, _e, _a, F2, 39,
  374. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52,
  375. W3, W4, W5, W6, W7, W0, W1, W2);
  376. _R( _a, _b, _c, _d, _e, F3, 40,
  377. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
  378. W2, W3, W4, W5, W6, W7, W0, W1);
  379. _R( _e, _a, _b, _c, _d, F3, 41,
  380. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
  381. W2, W3, W4, W5, W6, W7, W0, W1);
  382. _R( _d, _e, _a, _b, _c, F3, 42,
  383. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 56,
  384. W2, W3, W4, W5, W6, W7, W0, W1);
  385. _R( _c, _d, _e, _a, _b, F3, 43,
  386. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56,
  387. W2, W3, W4, W5, W6, W7, W0, W1);
  388. #undef curK
  389. #define curK qK4
  390. _R( _b, _c, _d, _e, _a, F3, 44,
  391. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
  392. W1, W2, W3, W4, W5, W6, W7, W0);
  393. _R( _a, _b, _c, _d, _e, F3, 45,
  394. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
  395. W1, W2, W3, W4, W5, W6, W7, W0);
  396. _R( _e, _a, _b, _c, _d, F3, 46,
  397. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 60,
  398. W1, W2, W3, W4, W5, W6, W7, W0);
  399. _R( _d, _e, _a, _b, _c, F3, 47,
  400. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60,
  401. W1, W2, W3, W4, W5, W6, W7, W0);
  402. _R( _c, _d, _e, _a, _b, F3, 48,
  403. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
  404. W0, W1, W2, W3, W4, W5, W6, W7);
  405. _R( _b, _c, _d, _e, _a, F3, 49,
  406. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
  407. W0, W1, W2, W3, W4, W5, W6, W7);
  408. _R( _a, _b, _c, _d, _e, F3, 50,
  409. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 64,
  410. W0, W1, W2, W3, W4, W5, W6, W7);
  411. _R( _e, _a, _b, _c, _d, F3, 51,
  412. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64,
  413. W0, W1, W2, W3, W4, W5, W6, W7);
  414. _R( _d, _e, _a, _b, _c, F3, 52,
  415. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
  416. W7, W0, W1, W2, W3, W4, W5, W6);
  417. _R( _c, _d, _e, _a, _b, F3, 53,
  418. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
  419. W7, W0, W1, W2, W3, W4, W5, W6);
  420. _R( _b, _c, _d, _e, _a, F3, 54,
  421. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 68,
  422. W7, W0, W1, W2, W3, W4, W5, W6);
  423. _R( _a, _b, _c, _d, _e, F3, 55,
  424. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68,
  425. W7, W0, W1, W2, W3, W4, W5, W6);
  426. _R( _e, _a, _b, _c, _d, F3, 56,
  427. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
  428. W6, W7, W0, W1, W2, W3, W4, W5);
  429. _R( _d, _e, _a, _b, _c, F3, 57,
  430. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
  431. W6, W7, W0, W1, W2, W3, W4, W5);
  432. _R( _c, _d, _e, _a, _b, F3, 58,
  433. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 72,
  434. W6, W7, W0, W1, W2, W3, W4, W5);
  435. _R( _b, _c, _d, _e, _a, F3, 59,
  436. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72,
  437. W6, W7, W0, W1, W2, W3, W4, W5);
  438. subs RNBLKS, #1;
  439. _R( _a, _b, _c, _d, _e, F4, 60,
  440. WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
  441. W5, W6, W7, W0, W1, W2, W3, W4);
  442. _R( _e, _a, _b, _c, _d, F4, 61,
  443. WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
  444. W5, W6, W7, W0, W1, W2, W3, W4);
  445. _R( _d, _e, _a, _b, _c, F4, 62,
  446. WPRECALC_32_79_6, dummy, WPRECALC_32_79_7, 76,
  447. W5, W6, W7, W0, W1, W2, W3, W4);
  448. _R( _c, _d, _e, _a, _b, F4, 63,
  449. WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76,
  450. W5, W6, W7, W0, W1, W2, W3, W4);
  451. beq .Lend;
  452. /* Transform 64-79 + Precalc 0-15 of next block. */
  453. #undef curK
  454. #define curK qK1
  455. _R( _b, _c, _d, _e, _a, F4, 64,
  456. WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  457. _R( _a, _b, _c, _d, _e, F4, 65,
  458. WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  459. _R( _e, _a, _b, _c, _d, F4, 66,
  460. WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  461. _R( _d, _e, _a, _b, _c, F4, 67,
  462. WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  463. _R( _c, _d, _e, _a, _b, F4, 68,
  464. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  465. _R( _b, _c, _d, _e, _a, F4, 69,
  466. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  467. _R( _a, _b, _c, _d, _e, F4, 70,
  468. WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  469. _R( _e, _a, _b, _c, _d, F4, 71,
  470. WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  471. _R( _d, _e, _a, _b, _c, F4, 72,
  472. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  473. _R( _c, _d, _e, _a, _b, F4, 73,
  474. dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  475. _R( _b, _c, _d, _e, _a, F4, 74,
  476. WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  477. _R( _a, _b, _c, _d, _e, F4, 75,
  478. WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  479. _R( _e, _a, _b, _c, _d, F4, 76,
  480. WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  481. _R( _d, _e, _a, _b, _c, F4, 77,
  482. WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  483. _R( _c, _d, _e, _a, _b, F4, 78,
  484. WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
  485. _R( _b, _c, _d, _e, _a, F4, 79,
  486. WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
  487. /* Update the chaining variables. */
  488. ldm RSTATE, {RT0-RT3};
  489. add _a, RT0;
  490. ldr RT0, [RSTATE, #state_h4];
  491. add _b, RT1;
  492. add _c, RT2;
  493. add _d, RT3;
  494. add _e, RT0;
  495. stm RSTATE, {_a-_e};
  496. b .Loop;
  497. .Lend:
  498. /* Transform 64-79 */
  499. R( _b, _c, _d, _e, _a, F4, 64 );
  500. R( _a, _b, _c, _d, _e, F4, 65 );
  501. R( _e, _a, _b, _c, _d, F4, 66 );
  502. R( _d, _e, _a, _b, _c, F4, 67 );
  503. R( _c, _d, _e, _a, _b, F4, 68 );
  504. R( _b, _c, _d, _e, _a, F4, 69 );
  505. R( _a, _b, _c, _d, _e, F4, 70 );
  506. R( _e, _a, _b, _c, _d, F4, 71 );
  507. R( _d, _e, _a, _b, _c, F4, 72 );
  508. R( _c, _d, _e, _a, _b, F4, 73 );
  509. R( _b, _c, _d, _e, _a, F4, 74 );
  510. R( _a, _b, _c, _d, _e, F4, 75 );
  511. R( _e, _a, _b, _c, _d, F4, 76 );
  512. R( _d, _e, _a, _b, _c, F4, 77 );
  513. R( _c, _d, _e, _a, _b, F4, 78 );
  514. R( _b, _c, _d, _e, _a, F4, 79 );
  515. mov sp, ROLDSTACK;
  516. /* Update the chaining variables. */
  517. ldm RSTATE, {RT0-RT3};
  518. add _a, RT0;
  519. ldr RT0, [RSTATE, #state_h4];
  520. add _b, RT1;
  521. add _c, RT2;
  522. add _d, RT3;
  523. /*vpop {q4-q7};*/
  524. add _e, RT0;
  525. stm RSTATE, {_a-_e};
  526. pop {r4-r12, pc};
  527. .Ldo_nothing:
  528. bx lr
  529. ENDPROC(sha1_transform_neon)