/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,     |
   | that is bundled with this package in the file LICENSE, and is       |
   | available through the world-wide-web at the following url:          |
   | https://www.php.net/license/3_01.txt                                |
   | If you did not receive a copy of the PHP license and are unable to  |
   | obtain it through the world-wide-web, please send a note to         |
   | license@php.net so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
   | Author: Frank Du <frank.du@intel.com>                               |
   +----------------------------------------------------------------------+
   | Compute the crc32 of the buffer. Based on:                          |
   | "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ"      |
   | V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0          |
   +----------------------------------------------------------------------+
*/
#include "crc32_x86.h"

#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include <nmmintrin.h>
# include <wmmintrin.h>
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include "Zend/zend_cpuinfo.h"
#endif
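
/*
 * Implementation notes (condensed from the paper cited above): both batch
 * routines below fold the input 16 bytes at a time with carry-less
 * multiplies (PCLMULQDQ), then shrink the remaining 128-bit value to a
 * 32-bit CRC with a final Barrett reduction.  crc32_pclmul_batch() handles
 * the non-reflected X86_CRC32 variant, which needs a byte swap on every
 * load; crc32_pclmul_reflected_batch() handles the reflected X86_CRC32B/C
 * variants, whose bit order already matches little-endian memory.
 */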

#if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE || ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER

typedef struct _crc32_pclmul_bit_consts {
    uint64_t k1k2[2];
    uint64_t k3k4[2];
    uint64_t k5k6[2];
    uint64_t uPx[2];
} crc32_pclmul_consts;

static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
    { /* X86_CRC32, polynomial: 0x04C11DB7 */
        {0x00e6228b11, 0x008833794c}, /* endianness swap */
        {0x00e8a45605, 0x00c5b9cd4c}, /* endianness swap */
        {0x00490d678d, 0x00f200aa66}, /* endianness swap */
        {0x0104d101df, 0x0104c11db7}
    },
    { /* X86_CRC32B, polynomial: 0x04C11DB7 with reversed ordering */
        {0x0154442bd4, 0x01c6e41596},
        {0x01751997d0, 0x00ccaa009e},
        {0x0163cd6124, 0x01db710640},
        {0x01f7011641, 0x01db710641},
    },
    { /* X86_CRC32C, polynomial: 0x1EDC6F41 with reversed ordering */
        {0x00740eef02, 0x009e4addf8},
        {0x00f20c0dfe, 0x014cd00bd6},
        {0x00dd45aab8, 0x0000000000},
        {0x00dea713f1, 0x0105ec76f0}
    }
};
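
/*
 * Each k constant above is a precomputed power of x reduced by the CRC
 * polynomial, k = x^t mod P(x), with the shift distance t matching how far
 * a chunk is displaced by a fold: k1/k2 for the 64-byte (4-block) stride,
 * k3/k4 for the 16-byte stride, and k5/k6 for the final 128->64 bit
 * reduction.  uPx packs the Barrett constant u = floor(x^64 / P(x))
 * together with P(x) itself.  The exact exponents differ between the
 * reflected and non-reflected variants; see the paper for the derivation.
 */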

static uint8_t pclmul_shuf_mask_table[16] = {
    0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
    0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};

/* Folding of 128-bit data chunks */
#define CRC32_FOLDING_BLOCK_SIZE (16)
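
/*
 * Why folding works: CRC is linear over GF(2), and appending T zero bits
 * multiplies a message polynomial by x^T.  Splitting a 128-bit chunk
 * D(x) = D_hi(x) * x^64 + D_lo(x) gives
 *
 *     D(x) * x^T == D_hi(x) * (x^(T+64) mod P(x))
 *              xor D_lo(x) * (x^T mod P(x))      (mod P(x))
 *
 * so each fold is two 64x64 carry-less multiplies (one
 * _mm_clmulepi64_si128() per half) plus an XOR into the data it lands on.
 */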

/* PCLMUL version of the non-reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
    size_t nr_in = nr;
    __m128i x0, x1, x2, k, shuf_mask;

    if (nr < CRC32_FOLDING_BLOCK_SIZE) {
        return 0;
    }

    shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));
    x0 = _mm_cvtsi32_si128(*crc);
    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x0 = _mm_slli_si128(x0, 12);
    x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
    x0 = _mm_xor_si128(x1, x0);
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;
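    /* x0 now holds the first 16 input bytes: the shuffle byte-reverses the
     * lane so the most significant byte of the stream lands in the most
     * significant byte of the register (the non-reflected bit order), and
     * the running CRC has been xor-ed into the top 32 bits. */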
    if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
        __m128i x3, x4;

        x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x1 = _mm_shuffle_epi8(x1, shuf_mask); /* endianness swap */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
        x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
        x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
        x3 = _mm_shuffle_epi8(x3, shuf_mask); /* endianness swap */
        p += CRC32_FOLDING_BLOCK_SIZE * 3;
        nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

        k = _mm_loadu_si128((__m128i *)consts->k1k2);
        /* parallel folding by 4 */
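        /* Four independent accumulators (x0..x3) each fold forward by
         * 64 bytes per iteration: the low and high 64-bit halves are
         * multiplied by k1 and k2 respectively and xor-ed with the next
         * 16-byte block of their lane.  Keeping four chains in flight
         * hides the latency of PCLMULQDQ. */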
        while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
            __m128i x5, x6, x7, x8, x9, x10, x11;
            x4 = _mm_clmulepi64_si128(x0, k, 0x00);
            x5 = _mm_clmulepi64_si128(x1, k, 0x00);
            x6 = _mm_clmulepi64_si128(x2, k, 0x00);
            x7 = _mm_clmulepi64_si128(x3, k, 0x00);
            x0 = _mm_clmulepi64_si128(x0, k, 0x11);
            x1 = _mm_clmulepi64_si128(x1, k, 0x11);
            x2 = _mm_clmulepi64_si128(x2, k, 0x11);
            x3 = _mm_clmulepi64_si128(x3, k, 0x11);
            x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
            x8 = _mm_shuffle_epi8(x8, shuf_mask); /* endianness swap */
            x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
            x9 = _mm_shuffle_epi8(x9, shuf_mask); /* endianness swap */
            x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
            x10 = _mm_shuffle_epi8(x10, shuf_mask); /* endianness swap */
            x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
            x11 = _mm_shuffle_epi8(x11, shuf_mask); /* endianness swap */
            x0 = _mm_xor_si128(x0, x4);
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_xor_si128(x2, x6);
            x3 = _mm_xor_si128(x3, x7);
            x0 = _mm_xor_si128(x0, x8);
            x1 = _mm_xor_si128(x1, x9);
            x2 = _mm_xor_si128(x2, x10);
            x3 = _mm_xor_si128(x3, x11);
            p += CRC32_FOLDING_BLOCK_SIZE * 4;
            nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
        }

        k = _mm_loadu_si128((__m128i *)consts->k3k4);
        /* fold 4 to 1, [x1, x2, x3] -> x0 */
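        /* Each of the three steps below advances x0 by one 128-bit block
         * (multiplying its halves by k3/k4) and absorbs one pending lane,
         * so the four accumulators collapse into x0. */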
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x1);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x3);
        x0 = _mm_xor_si128(x0, x4);
    }

    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* folding by 1 */
    while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
        /* load next to x2, fold to x0, x1 */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x2 = _mm_shuffle_epi8(x2, shuf_mask); /* endianness swap */
        x1 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x1);
        p += CRC32_FOLDING_BLOCK_SIZE;
        nr -= CRC32_FOLDING_BLOCK_SIZE;
    }

    /* final fold: reduce 128 bits to 96 bits */
    k = _mm_loadu_si128((__m128i*)consts->k5k6);
    x1 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_slli_si128(x0, 8);
    x0 = _mm_srli_si128(x0, 4);
    x0 = _mm_xor_si128(x0, x1);

    /* reduce 96 bits to 64 bits */
    x1 = _mm_clmulepi64_si128(x0, k, 0x01);
    x0 = _mm_xor_si128(x0, x1);

    /* Barrett reduction */
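    /* Barrett reduction computes x0 mod P(x) without a division:
     *     T1 = floor(x0 / x^32) * u    (u ~= x^64 / P(x), precomputed)
     *     T2 = floor(T1 / x^32) * P(x)
     *     crc = low 32 bits of (x0 xor T2)
     * The two srli/clmul pairs below are exactly those two multiplies. */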
    k = _mm_loadu_si128((__m128i*)consts->uPx);
    x1 = _mm_move_epi64(x0);
    x1 = _mm_srli_si128(x1, 4);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x1 = _mm_srli_si128(x1, 4);
    x1 = _mm_clmulepi64_si128(x1, k, 0x10);
    x0 = _mm_xor_si128(x1, x0);
    *crc = _mm_extract_epi32(x0, 0);
    return (nr_in - nr); /* number of bytes processed */
}

/* PCLMUL version of the reflected crc32 */
ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_DECL(size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts));
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
    size_t nr_in = nr;
    __m128i x0, x1, x2, k;

    if (nr < CRC32_FOLDING_BLOCK_SIZE) {
        return 0;
    }

    x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;
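    /* Unlike the non-reflected path, no byte shuffle is needed: a
     * reflected CRC's bit order matches x86's little-endian byte order,
     * so blocks are used as loaded and the running CRC is xor-ed into
     * the low 32 bits. */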
    if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
        __m128i x3, x4;

        x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
        x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
        p += CRC32_FOLDING_BLOCK_SIZE * 3;
        nr -= CRC32_FOLDING_BLOCK_SIZE * 3;

        k = _mm_loadu_si128((__m128i *)consts->k1k2);
        /* parallel folding by 4, as in the non-reflected version above */
        while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
            __m128i x5, x6, x7, x8, x9, x10, x11;
            x4 = _mm_clmulepi64_si128(x0, k, 0x00);
            x5 = _mm_clmulepi64_si128(x1, k, 0x00);
            x6 = _mm_clmulepi64_si128(x2, k, 0x00);
            x7 = _mm_clmulepi64_si128(x3, k, 0x00);
            x0 = _mm_clmulepi64_si128(x0, k, 0x11);
            x1 = _mm_clmulepi64_si128(x1, k, 0x11);
            x2 = _mm_clmulepi64_si128(x2, k, 0x11);
            x3 = _mm_clmulepi64_si128(x3, k, 0x11);
            x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
            x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
            x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
            x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
            x0 = _mm_xor_si128(x0, x4);
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_xor_si128(x2, x6);
            x3 = _mm_xor_si128(x3, x7);
            x0 = _mm_xor_si128(x0, x8);
            x1 = _mm_xor_si128(x1, x9);
            x2 = _mm_xor_si128(x2, x10);
            x3 = _mm_xor_si128(x3, x11);
            p += CRC32_FOLDING_BLOCK_SIZE * 4;
            nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
        }

        k = _mm_loadu_si128((__m128i *)consts->k3k4);
        /* fold 4 to 1, [x1, x2, x3] -> x0 */
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x1);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x3);
        x0 = _mm_xor_si128(x0, x4);
    }

    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* folding by 1 */
    while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
        /* load next to x2, fold to x0, x1 */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x1 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x1);
        p += CRC32_FOLDING_BLOCK_SIZE;
        nr -= CRC32_FOLDING_BLOCK_SIZE;
    }

    /* final fold: reduce 128 bits to 96 bits */
    x1 = _mm_clmulepi64_si128(x0, k, 0x10);
    x0 = _mm_srli_si128(x0, 8);
    x0 = _mm_xor_si128(x0, x1);

    /* reduce 96 bits to 64 bits */
    x1 = _mm_shuffle_epi32(x0, 0xfc);
    x0 = _mm_shuffle_epi32(x0, 0xf9);
    k = _mm_loadu_si128((__m128i*)consts->k5k6);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x0 = _mm_xor_si128(x0, x1);
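    /* The two shuffles above implement the 96->64 bit step: 0xf9 shifts
     * x0 right by one dword, while 0xfc isolates the lowest dword (the
     * upper dwords are already zero at this point), which is then folded
     * up by a carry-less multiply with k5 and xor-ed back in. */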
    /* Barrett reduction, as in crc32_pclmul_batch() above */
    x1 = _mm_shuffle_epi32(x0, 0xf3);
    x0 = _mm_slli_si128(x0, 4);
    k = _mm_loadu_si128((__m128i*)consts->uPx);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x1 = _mm_clmulepi64_si128(x1, k, 0x10);
    x0 = _mm_xor_si128(x1, x0);
    *crc = _mm_extract_epi32(x0, 2);
    return (nr_in - nr); /* number of bytes processed */
}

# if ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER */
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# endif
{
    if (type > X86_CRC32_MAX) {
        return 0;
    }
    const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];

    switch (type) {
    case X86_CRC32:
        return crc32_pclmul_batch(crc, p, nr, consts);
    case X86_CRC32B:
    case X86_CRC32C:
        return crc32_pclmul_reflected_batch(crc, p, nr, consts);
    default:
        return 0;
    }
}
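
/*
 * Illustrative use (editor's sketch, not part of this file): a caller is
 * expected to drain the bulk of the buffer here and finish the sub-16-byte
 * tail with its scalar table loop, roughly:
 *
 *     size_t done = crc32_x86_simd_update(X86_CRC32B, &crc, p, nr);
 *     p += done;
 *     nr -= done;
 *     while (nr--) {
 *         crc = ((crc >> 8) & 0x00FFFFFF) ^ crc32tab[(crc ^ *p++) & 0xFF];
 *     }
 *
 * (crc32tab stands for the usual 256-entry lookup table.  The return value
 * is always a multiple of CRC32_FOLDING_BLOCK_SIZE, so at most 15 tail
 * bytes remain.)
 */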
#endif

#if ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
    /* No usable SIMD support: report zero bytes consumed so the caller
     * falls back to its scalar loop for the whole buffer. */
    return 0;
}

# if ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) __attribute__((ifunc("resolve_crc32_x86_simd_update")));

typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);

ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) {
    if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
        return crc32_sse42_pclmul_update;
    }
    return crc32_x86_simd_update_default;
}
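
/* With GNU ifunc, the resolver above runs once at load time and the symbol
 * crc32_x86_simd_update is bound directly to the chosen implementation, so
 * the CPU-feature check costs nothing per call. */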
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;

size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
    return crc32_x86_simd_ptr(type, crc, p, nr);
}

/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(crc32_x86_intrin)
{
    if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
        crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
    }
    return SUCCESS;
}
/* }}} */
# endif
#endif