/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,     |
   | that is bundled with this package in the file LICENSE, and is       |
   | available through the world-wide-web at the following url:          |
   | https://www.php.net/license/3_01.txt                                |
   | If you did not receive a copy of the PHP license and are unable to  |
   | obtain it through the world-wide-web, please send a note to         |
   | license@php.net so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
   | Author: Frank Du <frank.du@intel.com>                                |
   +----------------------------------------------------------------------+
   | Compute the crc32 of the buffer. Based on:                           |
   | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"       |
   | V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0           |
   +----------------------------------------------------------------------+
*/
#include "crc32_x86.h"

#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include <nmmintrin.h>
# include <wmmintrin.h>
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include "Zend/zend_cpuinfo.h"
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER

typedef struct _crc32_pclmul_bit_consts {
    uint64_t k1k2[2];
    uint64_t k3k4[2];
    uint64_t k5k6[2];
    uint64_t uPx[2];
} crc32_pclmul_consts;
static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
    { /* X86_CRC32, polynomial: 0x04C11DB7 */
        {0x00e6228b11, 0x008833794c}, /* endianness swap */
        {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
        {0x00490d678d, 0x00f200aa66}, /* endianness swap */
        {0x0104d101df, 0x0104c11db7}
    },
    { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
        {0x0154442bd4, 0x01c6e41596},
        {0x01751997d0, 0x00ccaa009e},
        {0x0163cd6124, 0x01db710640},
        {0x01f7011641, 0x01db710641},
    },
    { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
        {0x00740eef02, 0x009e4addf8},
        {0x00f20c0dfe, 0x014cd00bd6},
        {0x00dd45aab8, 0x0000000000},
        {0x00dea713f1, 0x0105ec76f0}
    }
};
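
/*
 * Per the paper cited in the header, each k pair holds folding constants
 * of the form x^n mod P(x), where n is the bit distance a 64-bit half is
 * carried forward by one fold: k1/k2 for the 4-block loop, k3/k4 for
 * single-block folds, k5/k6 for the final 128- to 64-bit reduction. uPx
 * holds the Barrett pair u = floor(x^64 / P(x)) and P(x) itself. (For the
 * reflected tables the values are additionally bit-reflected and shifted;
 * the exact exponents are given in the paper and not restated here.)
 */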
static uint8_t pclmul_shuf_mask_table[16] = {
    0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
    0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};
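
/*
 * _mm_shuffle_epi8 with this mask reverses the 16 bytes of a vector. The
 * non-reflected algorithm treats data as a most-significant-bit-first
 * polynomial, so each block loaded from little-endian memory is
 * byte-swapped first; the reflected variants below need no such swap.
 */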
/* Folding of 128-bit data chunks */
#define CRC32_FOLDING_BLOCK_SIZE (16)
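
/*
 * The folding identity used throughout (see the paper): with D the
 * 128-bit remainder accumulated so far and B the next block,
 *
 *   (D * x^128 ^ B) == H * (x^192 mod P) ^ L * (x^128 mod P) ^ B   (mod P(x))
 *
 * where D = H * x^64 + L splits D into 64-bit halves. Each fold is thus
 * two carry-less multiplies (one PCLMULQDQ per half against a precomputed
 * constant) and two XORs; for the 4-wide loop the fold distance is 512
 * bits, so the exponents become 576 and 512. No division by P(x) happens
 * until the final reduction.
 */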
/* PCLMUL version of non-reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
    size_t nr_in = nr;
    __m128i x0, x1, x2, k, shuf_mask;

    if (nr < CRC32_FOLDING_BLOCK_SIZE) {
        return 0;
    }

    shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
    x0 = _mm_cvtsi32_si128(*crc);
    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x0 = _mm_slli_si128(x0, 12);
    x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
    x0 = _mm_xor_si128(x1, x0);
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;

    if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
        __m128i x3, x4;

        x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
        x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
        x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
        x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
        p += CRC32_FOLDING_BLOCK_SIZE * 3;
        nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

        k = _mm_loadu_si128((__m128i *)consts->k1k2);
        /* parallel folding by 4 */
        while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
            __m128i x5, x6, x7, x8, x9, x10, x11;
            x4 = _mm_clmulepi64_si128(x0, k, 0x00);
            x5 = _mm_clmulepi64_si128(x1, k, 0x00);
            x6 = _mm_clmulepi64_si128(x2, k, 0x00);
            x7 = _mm_clmulepi64_si128(x3, k, 0x00);
            x0 = _mm_clmulepi64_si128(x0, k, 0x11);
            x1 = _mm_clmulepi64_si128(x1, k, 0x11);
            x2 = _mm_clmulepi64_si128(x2, k, 0x11);
            x3 = _mm_clmulepi64_si128(x3, k, 0x11);
            x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
            x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
            x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
            x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
            x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
            x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
            x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
            x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
            x0 = _mm_xor_si128(x0, x4);
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_xor_si128(x2, x6);
            x3 = _mm_xor_si128(x3, x7);
            x0 = _mm_xor_si128(x0, x8);
            x1 = _mm_xor_si128(x1, x9);
            x2 = _mm_xor_si128(x2, x10);
            x3 = _mm_xor_si128(x3, x11);
            p += CRC32_FOLDING_BLOCK_SIZE * 4;
            nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
        }

        k = _mm_loadu_si128((__m128i *)consts->k3k4);
        /* fold 4 to 1, [x1, x2, x3] -> x0 */
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x1);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x3);
        x0 = _mm_xor_si128(x0, x4);
    }

    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* folding by 1 */
    while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
        /* load next to x2, fold to x0, x1 */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
        x1 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x1);
        p += CRC32_FOLDING_BLOCK_SIZE;
        nr -= CRC32_FOLDING_BLOCK_SIZE;
    }

    /* reduce 128 bits (final fold) to 96 bits */
    k = _mm_loadu_si128((__m128i *)consts->k5k6);
    x1 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_slli_si128(x0, 8);
    x0 = _mm_srli_si128(x0, 4);
    x0 = _mm_xor_si128(x0, x1);
    /* reduce 96 bits to 64 bits */
    x1 = _mm_clmulepi64_si128(x0, k, 0x01);
    x0 = _mm_xor_si128(x0, x1);
    /* barrett reduction */
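    /*
     * Barrett reduction per the paper: for the 64-bit remainder R,
     * T1 = floor(R / x^32) * u and T2 = floor(T1 / x^32) * P(x); the
     * final CRC is (R ^ T2) mod x^32. Below, the two _mm_srli_si128(_, 4)
     * byte shifts are the floor divisions by x^32, and the two PCLMULQDQs
     * multiply by u and P(x) from the uPx pair.
     */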
    k = _mm_loadu_si128((__m128i *)consts->uPx);
    x1 = _mm_move_epi64(x0);
    x1 = _mm_srli_si128(x1, 4);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x1 = _mm_srli_si128(x1, 4);
    x1 = _mm_clmulepi64_si128(x1, k, 0x10);
    x0 = _mm_xor_si128(x1, x0);
    *crc = _mm_extract_epi32(x0, 0);
    return (nr_in - nr); /* the number of bytes processed */
}
/* PCLMUL version of reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
    size_t nr_in = nr;
    __m128i x0, x1, x2, k;

    if (nr < CRC32_FOLDING_BLOCK_SIZE) {
        return 0;
    }

    x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;

    if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
        __m128i x3, x4;

        x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
        x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
        p += CRC32_FOLDING_BLOCK_SIZE * 3;
        nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

        k = _mm_loadu_si128((__m128i *)consts->k1k2);
        /* parallel folding by 4 */
        while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
            __m128i x5, x6, x7, x8, x9, x10, x11;
            x4 = _mm_clmulepi64_si128(x0, k, 0x00);
            x5 = _mm_clmulepi64_si128(x1, k, 0x00);
            x6 = _mm_clmulepi64_si128(x2, k, 0x00);
            x7 = _mm_clmulepi64_si128(x3, k, 0x00);
            x0 = _mm_clmulepi64_si128(x0, k, 0x11);
            x1 = _mm_clmulepi64_si128(x1, k, 0x11);
            x2 = _mm_clmulepi64_si128(x2, k, 0x11);
            x3 = _mm_clmulepi64_si128(x3, k, 0x11);
            x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
            x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
            x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
            x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
            x0 = _mm_xor_si128(x0, x4);
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_xor_si128(x2, x6);
            x3 = _mm_xor_si128(x3, x7);
            x0 = _mm_xor_si128(x0, x8);
            x1 = _mm_xor_si128(x1, x9);
            x2 = _mm_xor_si128(x2, x10);
            x3 = _mm_xor_si128(x3, x11);
            p += CRC32_FOLDING_BLOCK_SIZE * 4;
            nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
        }

        k = _mm_loadu_si128((__m128i *)consts->k3k4);
        /* fold 4 to 1, [x1, x2, x3] -> x0 */
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x1);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x3);
        x0 = _mm_xor_si128(x0, x4);
    }

    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* folding by 1 */
    while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
        /* load next to x2, fold to x0, x1 */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x1 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x1);
        p += CRC32_FOLDING_BLOCK_SIZE;
        nr -= CRC32_FOLDING_BLOCK_SIZE;
    }

    /* reduce 128 bits (final fold) to 96 bits */
    x1 = _mm_clmulepi64_si128(x0, k, 0x10);
    x0 = _mm_srli_si128(x0, 8);
    x0 = _mm_xor_si128(x0, x1);
    /* reduce 96 bits to 64 bits */
    x1 = _mm_shuffle_epi32(x0, 0xfc);
    x0 = _mm_shuffle_epi32(x0, 0xf9);
    k = _mm_loadu_si128((__m128i *)consts->k5k6);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x0 = _mm_xor_si128(x0, x1);
    /* barrett reduction */
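    /*
     * Same Barrett step as in the non-reflected routine above, with the
     * shifts mirrored for the reflected bit order; the reduced CRC ends
     * up in 32-bit lane 2 of x0, hence the extract index below.
     */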
    x1 = _mm_shuffle_epi32(x0, 0xf3);
    x0 = _mm_slli_si128(x0, 4);
    k = _mm_loadu_si128((__m128i *)consts->uPx);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x1 = _mm_clmulepi64_si128(x1, k, 0x10);
    x0 = _mm_xor_si128(x1, x0);
    *crc = _mm_extract_epi32(x0, 2);
    return (nr_in - nr); /* the number of bytes processed */
}
# if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# endif
{
    if (type >= X86_CRC32_MAX) { /* X86_CRC32_MAX is the enum's count sentinel, not a valid index */
        return 0;
    }
    const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];

    switch (type) {
    case X86_CRC32:
        return crc32_pclmul_batch(crc, p, nr, consts);
    case X86_CRC32B:
    case X86_CRC32C:
        return crc32_pclmul_reflected_batch(crc, p, nr, consts);
    default:
        return 0;
    }
}
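
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * the batch routines only consume whole 16-byte blocks and report how
 * many bytes they handled, so the caller finishes the tail with its
 * scalar loop. Assuming a conventional reflected-CRC32 lookup table
 * named crc32tab:
 *
 *   size_t done = crc32_x86_simd_update(X86_CRC32B, &crc, p, nr);
 *   p += done;
 *   nr -= done;
 *   while (nr--) {
 *       crc = crc32tab[(crc ^ *p++) & 0xff] ^ (crc >> 8);
 *   }
 */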
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
/* fallback when SSE4.2/PCLMUL are unavailable: consume no bytes, so the
 * caller's scalar path handles the whole buffer */
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
    return 0;
}

# if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));

typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);

ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update() {
    if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
        return crc32_sse42_pclmul_update;
    }
    return crc32_x86_simd_update_default;
}
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;

size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
    return crc32_x86_simd_ptr(type, crc, p, nr);
}

/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(crc32_x86_intrin)
{
    if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
        crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
    }
    return SUCCESS;
}
/* }}} */
# endif
#endif