
libobs: Add aarch64 compatibility layer

Add a compatibility layer utilizing simde
(https://github.com/nemequ/simde) to allow compilation on aarch64
without modifying existing functions.
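
simde provides prefixed, portable implementations of the x86 intrinsics (NEON-backed on aarch64, plain C elsewhere), so a call site written against the simde_* names builds unchanged on either architecture. A minimal sketch against the headers vendored by this commit; the simde_mm_* calls are simde's public prefixed API, and the include path assumes this commit's layout:

/* Add four packed 32-bit integers; compiles on x86 and aarch64 alike. */
#include <stdint.h>
#include "libobs/util/aarch/sse2.h"

static void add_u32x4(const uint32_t *a, const uint32_t *b, uint32_t *out)
{
	simde__m128i va = simde_mm_loadu_si128((const simde__m128i *)a);
	simde__m128i vb = simde_mm_loadu_si128((const simde__m128i *)b);

	simde_mm_storeu_si128((simde__m128i *)out,
			      simde_mm_add_epi32(va, vb));
}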
Peter Geis 6 years ago
parent
commit
f96545acf3
8 changed files with 10717 additions and 0 deletions
  1. libobs/util/aarch/check.h (+258, -0)
  2. libobs/util/aarch/hedley.h (+1616, -0)
  3. libobs/util/aarch/mmx.h (+1356, -0)
  4. libobs/util/aarch/simde-arch.h (+355, -0)
  5. libobs/util/aarch/simde-common.h (+278, -0)
  6. libobs/util/aarch/sse.h (+2591, -0)
  7. libobs/util/aarch/sse2.h (+4197, -0)
  8. libobs/util/sse-intrin.h (+66, -0)
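
The last entry, libobs/util/sse-intrin.h, is the integration point: existing libobs code includes that one header, which can keep using the real <xmmintrin.h>/<emmintrin.h> on x86 targets and route the x86 names to the vendored simde implementations everywhere else. One plausible shape for that dispatch, sketched here under that assumption (the alias list is illustrative, not the file's verbatim contents):

#pragma once

#if defined(_MSC_VER) || defined(__i386__) || defined(__x86_64__)
/* Real SSE/SSE2 intrinsics are available; use them directly. */
#include <xmmintrin.h>
#include <emmintrin.h>
#else
/* aarch64 and friends: map the x86 names onto simde's prefixed API. */
#include "aarch/sse2.h"
#define __m128 simde__m128
#define _mm_setzero_ps simde_mm_setzero_ps
#define _mm_set1_ps simde_mm_set1_ps
#define _mm_add_ps simde_mm_add_ps
/* ...one alias per intrinsic libobs actually uses... */
#endif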

libobs/util/aarch/check.h (+258, -0)

@@ -0,0 +1,258 @@
+/* Check (assertions)
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson <[email protected]>
+ *
+ *   To the extent possible under law, the authors have waived all
+ *   copyright and related or neighboring rights to this code.  For
+ *   details, see the Creative Commons Zero 1.0 Universal license at
+ *   https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#if !defined(SIMDE_CHECK_H)
+#define SIMDE_CHECK_H
+
+#if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG)
+#define SIMDE_NDEBUG 1
+#endif
+
+#include <stdint.h>
+
+#if !defined(_WIN32)
+#define SIMDE_SIZE_MODIFIER "z"
+#define SIMDE_CHAR_MODIFIER "hh"
+#define SIMDE_SHORT_MODIFIER "h"
+#else
+#if defined(_M_X64) || defined(__amd64__)
+#define SIMDE_SIZE_MODIFIER "I64"
+#else
+#define SIMDE_SIZE_MODIFIER ""
+#endif
+#define SIMDE_CHAR_MODIFIER ""
+#define SIMDE_SHORT_MODIFIER ""
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)
+#define SIMDE__PUSH_DISABLE_MSVC_C4127 \
+	__pragma(warning(push)) __pragma(warning(disable : 4127))
+#define SIMDE__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#else
+#define SIMDE__PUSH_DISABLE_MSVC_C4127
+#define SIMDE__POP_DISABLE_MSVC_C4127
+#endif
+
+#if !defined(simde_errorf)
+#include <stdio.h>
+#include <stdlib.h>
+#define simde_errorf(format, ...) \
+	(fprintf(stderr, format, __VA_ARGS__), abort())
+#endif
+
+#define simde_error(msg) simde_errorf("%s", msg)
+
+#if defined(SIMDE_NDEBUG)
+#if defined(SIMDE_CHECK_FAIL_DEFINED)
+#define simde_assert(expr)
+#else
+#if defined(HEDLEY_ASSUME)
+#define simde_assert(expr) HEDLEY_ASSUME(expr)
+#elif HEDLEY_GCC_VERSION_CHECK(4, 5, 0)
+#define simde_assert(expr) ((void)(!!(expr) ? 1 : (__builtin_unreachable(), 1)))
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#define simde_assert(expr) __assume(expr)
+#else
+#define simde_assert(expr)
+#endif
+#endif
+#define simde_assert_true(expr) simde_assert(expr)
+#define simde_assert_false(expr) simde_assert(!(expr))
+#define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \
+	simde_assert(((a)op(b)))
+#define simde_assert_double_equal(a, b, precision)
+#define simde_assert_string_equal(a, b)
+#define simde_assert_string_not_equal(a, b)
+#define simde_assert_memory_equal(size, a, b)
+#define simde_assert_memory_not_equal(size, a, b)
+#else
+#define simde_assert(expr)                                            \
+	do {                                                          \
+		if (!HEDLEY_LIKELY(expr)) {                           \
+			simde_error("assertion failed: " #expr "\n"); \
+		}                                                     \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                        \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_true(expr)                                \
+	do {                                                   \
+		if (!HEDLEY_LIKELY(expr)) {                    \
+			simde_error("assertion failed: " #expr \
+				    " is not true\n");         \
+		}                                              \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                 \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_false(expr)                               \
+	do {                                                   \
+		if (!HEDLEY_LIKELY(!(expr))) {                 \
+			simde_error("assertion failed: " #expr \
+				    " is not false\n");        \
+		}                                              \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                 \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b)           \
+	do {                                                               \
+		T simde_tmp_a_ = (a);                                      \
+		T simde_tmp_b_ = (b);                                      \
+		if (!(simde_tmp_a_ op simde_tmp_b_)) {                     \
+			simde_errorf("assertion failed: %s %s %s (" prefix \
+				     "%" fmt suffix " %s " prefix          \
+				     "%" fmt suffix ")\n",                 \
+				     #a, #op, #b, simde_tmp_a_, #op,       \
+				     simde_tmp_b_);                        \
+		}                                                          \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                             \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_double_equal(a, b, precision)                           \
+	do {                                                                 \
+		const double simde_tmp_a_ = (a);                             \
+		const double simde_tmp_b_ = (b);                             \
+		const double simde_tmp_diff_ =                               \
+			((simde_tmp_a_ - simde_tmp_b_) < 0)                  \
+				? -(simde_tmp_a_ - simde_tmp_b_)             \
+				: (simde_tmp_a_ - simde_tmp_b_);             \
+		if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) {     \
+			simde_errorf(                                        \
+				"assertion failed: %s == %s (%0." #precision \
+				"g == %0." #precision "g)\n",                \
+				#a, #b, simde_tmp_a_, simde_tmp_b_);         \
+		}                                                            \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                               \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#include <string.h>
+#define simde_assert_string_equal(a, b)                                                   \
+	do {                                                                              \
+		const char *simde_tmp_a_ = a;                                             \
+		const char *simde_tmp_b_ = b;                                             \
+		if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) !=                 \
+				    0)) {                                                 \
+			simde_errorf(                                                     \
+				"assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \
+				#a, #b, simde_tmp_a_, simde_tmp_b_);                      \
+		}                                                                         \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                                            \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_string_not_equal(a, b)                                               \
+	do {                                                                              \
+		const char *simde_tmp_a_ = a;                                             \
+		const char *simde_tmp_b_ = b;                                             \
+		if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) ==                 \
+				    0)) {                                                 \
+			simde_errorf(                                                     \
+				"assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \
+				#a, #b, simde_tmp_a_, simde_tmp_b_);                      \
+		}                                                                         \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                                            \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_memory_equal(size, a, b)                                                                        \
+	do {                                                                                                         \
+		const unsigned char *simde_tmp_a_ =                                                                  \
+			(const unsigned char *)(a);                                                                  \
+		const unsigned char *simde_tmp_b_ =                                                                  \
+			(const unsigned char *)(b);                                                                  \
+		const size_t simde_tmp_size_ = (size);                                                               \
+		if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_,                                               \
+					   simde_tmp_size_)) != 0) {                                                 \
+			size_t simde_tmp_pos_;                                                                       \
+			for (simde_tmp_pos_ = 0;                                                                     \
+			     simde_tmp_pos_ < simde_tmp_size_;                                                       \
+			     simde_tmp_pos_++) {                                                                     \
+				if (simde_tmp_a_[simde_tmp_pos_] !=                                                  \
+				    simde_tmp_b_[simde_tmp_pos_]) {                                                  \
+					simde_errorf(                                                                \
+						"assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER \
+						"u\n",                                                               \
+						#a, #b, simde_tmp_pos_);                                             \
+					break;                                                                       \
+				}                                                                                    \
+			}                                                                                            \
+		}                                                                                                    \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                                                                       \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+
+#define simde_assert_memory_not_equal(size, a, b)                                          \
+	do {                                                                               \
+		const unsigned char *simde_tmp_a_ =                                        \
+			(const unsigned char *)(a);                                        \
+		const unsigned char *simde_tmp_b_ =                                        \
+			(const unsigned char *)(b);                                        \
+		const size_t simde_tmp_size_ = (size);                                     \
+		if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_,                     \
+					   simde_tmp_size_)) == 0) {                       \
+			simde_errorf(                                                      \
+				"assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER \
+				"u bytes)\n",                                              \
+				#a, #b, simde_tmp_size_);                                  \
+		}                                                                          \
+		SIMDE__PUSH_DISABLE_MSVC_C4127                                             \
+	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+#endif
+
+#define simde_assert_type(T, fmt, a, op, b) \
+	simde_assert_type_full("", "", T, fmt, a, op, b)
+
+#define simde_assert_char(a, op, b)               \
+	simde_assert_type_full("'\\x", "'", char, \
+			       "02" SIMDE_CHAR_MODIFIER "x", a, op, b)
+#define simde_assert_uchar(a, op, b)                       \
+	simde_assert_type_full("'\\x", "'", unsigned char, \
+			       "02" SIMDE_CHAR_MODIFIER "x", a, op, b)
+#define simde_assert_short(a, op, b) \
+	simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b)
+#define simde_assert_ushort(a, op, b) \
+	simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b)
+#define simde_assert_int(a, op, b) simde_assert_type(int, "d", a, op, b)
+#define simde_assert_uint(a, op, b) \
+	simde_assert_type(unsigned int, "u", a, op, b)
+#define simde_assert_long(a, op, b) simde_assert_type(long int, "ld", a, op, b)
+#define simde_assert_ulong(a, op, b) \
+	simde_assert_type(unsigned long int, "lu", a, op, b)
+#define simde_assert_llong(a, op, b) \
+	simde_assert_type(long long int, "lld", a, op, b)
+#define simde_assert_ullong(a, op, b) \
+	simde_assert_type(unsigned long long int, "llu", a, op, b)
+
+#define simde_assert_size(a, op, b) \
+	simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b)
+
+#define simde_assert_float(a, op, b) simde_assert_type(float, "f", a, op, b)
+#define simde_assert_double(a, op, b) simde_assert_type(double, "g", a, op, b)
+#define simde_assert_ptr(a, op, b) \
+	simde_assert_type(const void *, "p", a, op, b)
+
+#define simde_assert_int8(a, op, b) simde_assert_type(int8_t, PRIi8, a, op, b)
+#define simde_assert_uint8(a, op, b) simde_assert_type(uint8_t, PRIu8, a, op, b)
+#define simde_assert_int16(a, op, b) \
+	simde_assert_type(int16_t, PRIi16, a, op, b)
+#define simde_assert_uint16(a, op, b) \
+	simde_assert_type(uint16_t, PRIu16, a, op, b)
+#define simde_assert_int32(a, op, b) \
+	simde_assert_type(int32_t, PRIi32, a, op, b)
+#define simde_assert_uint32(a, op, b) \
+	simde_assert_type(uint32_t, PRIu32, a, op, b)
+#define simde_assert_int64(a, op, b) \
+	simde_assert_type(int64_t, PRIi64, a, op, b)
+#define simde_assert_uint64(a, op, b) \
+	simde_assert_type(uint64_t, PRIu64, a, op, b)
+
+#define simde_assert_ptr_equal(a, b) simde_assert_ptr(a, ==, b)
+#define simde_assert_ptr_not_equal(a, b) simde_assert_ptr(a, !=, b)
+#define simde_assert_null(ptr) simde_assert_ptr(ptr, ==, NULL)
+#define simde_assert_not_null(ptr) simde_assert_ptr(ptr, !=, NULL)
+#define simde_assert_ptr_null(ptr) simde_assert_ptr(ptr, ==, NULL)
+#define simde_assert_ptr_not_null(ptr) simde_assert_ptr(ptr, !=, NULL)
+
+#endif /* !defined(SIMDE_CHECK_H) */
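
Note that check.h defaults to SIMDE_NDEBUG when neither debug macro is set (see the top of the file), so the assertions above compile down to HEDLEY_ASSUME-style optimizer hints or to nothing; defining SIMDE_DEBUG before inclusion keeps the checking versions, which print a message and abort() on failure. A small usage sketch, assuming the vendored include paths; hedley.h must be included first because check.h uses the HEDLEY_* macros without pulling them in itself:

/* Build with the checking assertions enabled. */
#define SIMDE_DEBUG 1
#include "libobs/util/aarch/hedley.h"
#include "libobs/util/aarch/check.h"

int main(void)
{
	int x = 2 + 2;

	simde_assert_int(x, ==, 4);                   /* passes silently */
	simde_assert_string_equal("obs", "obs");      /* strcmp-based */
	simde_assert_double_equal(0.1 + 0.2, 0.3, 9); /* tolerance 1e-9 */
	return 0;
}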

libobs/util/aarch/hedley.h (+1616, -0)

@@ -0,0 +1,1616 @@
+/* Hedley - https://nemequ.github.io/hedley
+ * Created by Evan Nemerson <[email protected]>
+ *
+ * To the extent possible under law, the author(s) have dedicated all
+ * copyright and related and neighboring rights to this software to
+ * the public domain worldwide. This software is distributed without
+ * any warranty.
+ *
+ * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ * SPDX-License-Identifier: CC0-1.0
+ */
+
+#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 10)
+#if defined(HEDLEY_VERSION)
+#undef HEDLEY_VERSION
+#endif
+#define HEDLEY_VERSION 10
+
+#if defined(HEDLEY_STRINGIFY_EX)
+#undef HEDLEY_STRINGIFY_EX
+#endif
+#define HEDLEY_STRINGIFY_EX(x) #x
+
+#if defined(HEDLEY_STRINGIFY)
+#undef HEDLEY_STRINGIFY
+#endif
+#define HEDLEY_STRINGIFY(x) HEDLEY_STRINGIFY_EX(x)
+
+#if defined(HEDLEY_CONCAT_EX)
+#undef HEDLEY_CONCAT_EX
+#endif
+#define HEDLEY_CONCAT_EX(a, b) a##b
+
+#if defined(HEDLEY_CONCAT)
+#undef HEDLEY_CONCAT
+#endif
+#define HEDLEY_CONCAT(a, b) HEDLEY_CONCAT_EX(a, b)
+
+#if defined(HEDLEY_VERSION_ENCODE)
+#undef HEDLEY_VERSION_ENCODE
+#endif
+#define HEDLEY_VERSION_ENCODE(major, minor, revision) \
+	(((major)*1000000) + ((minor)*1000) + (revision))
+
+#if defined(HEDLEY_VERSION_DECODE_MAJOR)
+#undef HEDLEY_VERSION_DECODE_MAJOR
+#endif
+#define HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000)
+
+#if defined(HEDLEY_VERSION_DECODE_MINOR)
+#undef HEDLEY_VERSION_DECODE_MINOR
+#endif
+#define HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000)
+
+#if defined(HEDLEY_VERSION_DECODE_REVISION)
+#undef HEDLEY_VERSION_DECODE_REVISION
+#endif
+#define HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000)
+
+#if defined(HEDLEY_GNUC_VERSION)
+#undef HEDLEY_GNUC_VERSION
+#endif
+#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__)
+#define HEDLEY_GNUC_VERSION \
+	HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
+#elif defined(__GNUC__)
+#define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0)
+#endif
+
+#if defined(HEDLEY_GNUC_VERSION_CHECK)
+#undef HEDLEY_GNUC_VERSION_CHECK
+#endif
+#if defined(HEDLEY_GNUC_VERSION)
+#define HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_GNUC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_GNUC_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_MSVC_VERSION)
+#undef HEDLEY_MSVC_VERSION
+#endif
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000)
+#define HEDLEY_MSVC_VERSION                                        \
+	HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000,            \
+			      (_MSC_FULL_VER % 10000000) / 100000, \
+			      (_MSC_FULL_VER % 100000) / 100)
+#elif defined(_MSC_FULL_VER)
+#define HEDLEY_MSVC_VERSION                                      \
+	HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000,           \
+			      (_MSC_FULL_VER % 1000000) / 10000, \
+			      (_MSC_FULL_VER % 10000) / 10)
+#elif defined(_MSC_VER)
+#define HEDLEY_MSVC_VERSION \
+	HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
+#endif
+
+#if defined(HEDLEY_MSVC_VERSION_CHECK)
+#undef HEDLEY_MSVC_VERSION_CHECK
+#endif
+#if !defined(_MSC_VER)
+#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) (0)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \
+	(_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch)))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
+#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \
+	(_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch)))
+#else
+#define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \
+	(_MSC_VER >= ((major * 100) + (minor)))
+#endif
+
+#if defined(HEDLEY_INTEL_VERSION)
+#undef HEDLEY_INTEL_VERSION
+#endif
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE)
+#define HEDLEY_INTEL_VERSION                                                  \
+	HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, \
+			      __INTEL_COMPILER_UPDATE)
+#elif defined(__INTEL_COMPILER)
+#define HEDLEY_INTEL_VERSION \
+	HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
+#endif
+
+#if defined(HEDLEY_INTEL_VERSION_CHECK)
+#undef HEDLEY_INTEL_VERSION_CHECK
+#endif
+#if defined(HEDLEY_INTEL_VERSION)
+#define HEDLEY_INTEL_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_INTEL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_INTEL_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_PGI_VERSION)
+#undef HEDLEY_PGI_VERSION
+#endif
+#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && \
+	defined(__PGIC_PATCHLEVEL__)
+#define HEDLEY_PGI_VERSION \
+	HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
+#endif
+
+#if defined(HEDLEY_PGI_VERSION_CHECK)
+#undef HEDLEY_PGI_VERSION_CHECK
+#endif
+#if defined(HEDLEY_PGI_VERSION)
+#define HEDLEY_PGI_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_PGI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_PGI_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_SUNPRO_VERSION)
+#undef HEDLEY_SUNPRO_VERSION
+#endif
+#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000)
+#define HEDLEY_SUNPRO_VERSION                                     \
+	HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + \
+				      ((__SUNPRO_C >> 12) & 0xf), \
+			      (((__SUNPRO_C >> 8) & 0xf) * 10) +  \
+				      ((__SUNPRO_C >> 4) & 0xf),  \
+			      (__SUNPRO_C & 0xf) * 10)
+#elif defined(__SUNPRO_C)
+#define HEDLEY_SUNPRO_VERSION                          \
+	HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, \
+			      (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C)&0xf)
+#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000)
+#define HEDLEY_SUNPRO_VERSION                                      \
+	HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + \
+				      ((__SUNPRO_CC >> 12) & 0xf), \
+			      (((__SUNPRO_CC >> 8) & 0xf) * 10) +  \
+				      ((__SUNPRO_CC >> 4) & 0xf),  \
+			      (__SUNPRO_CC & 0xf) * 10)
+#elif defined(__SUNPRO_CC)
+#define HEDLEY_SUNPRO_VERSION                           \
+	HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, \
+			      (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC)&0xf)
+#endif
+
+#if defined(HEDLEY_SUNPRO_VERSION_CHECK)
+#undef HEDLEY_SUNPRO_VERSION_CHECK
+#endif
+#if defined(HEDLEY_SUNPRO_VERSION)
+#define HEDLEY_SUNPRO_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_SUNPRO_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_SUNPRO_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_EMSCRIPTEN_VERSION)
+#undef HEDLEY_EMSCRIPTEN_VERSION
+#endif
+#if defined(__EMSCRIPTEN__)
+#define HEDLEY_EMSCRIPTEN_VERSION                                         \
+	HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, \
+			      __EMSCRIPTEN_tiny__)
+#endif
+
+#if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK)
+#undef HEDLEY_EMSCRIPTEN_VERSION_CHECK
+#endif
+#if defined(HEDLEY_EMSCRIPTEN_VERSION)
+#define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_EMSCRIPTEN_VERSION >=                        \
+	 HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_ARM_VERSION)
+#undef HEDLEY_ARM_VERSION
+#endif
+#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION)
+#define HEDLEY_ARM_VERSION                                               \
+	HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000,           \
+			      (__ARMCOMPILER_VERSION % 1000000) / 10000, \
+			      (__ARMCOMPILER_VERSION % 10000) / 100)
+#elif defined(__CC_ARM) && defined(__ARMCC_VERSION)
+#define HEDLEY_ARM_VERSION                                         \
+	HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000,           \
+			      (__ARMCC_VERSION % 1000000) / 10000, \
+			      (__ARMCC_VERSION % 10000) / 100)
+#endif
+
+#if defined(HEDLEY_ARM_VERSION_CHECK)
+#undef HEDLEY_ARM_VERSION_CHECK
+#endif
+#if defined(HEDLEY_ARM_VERSION)
+#define HEDLEY_ARM_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_ARM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_ARM_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_IBM_VERSION)
+#undef HEDLEY_IBM_VERSION
+#endif
+#if defined(__ibmxl__)
+#define HEDLEY_IBM_VERSION                                          \
+	HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, \
+			      __ibmxl_modification__)
+#elif defined(__xlC__) && defined(__xlC_ver__)
+#define HEDLEY_IBM_VERSION                                  \
+	HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, \
+			      (__xlC_ver__ >> 8) & 0xff)
+#elif defined(__xlC__)
+#define HEDLEY_IBM_VERSION \
+	HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0)
+#endif
+
+#if defined(HEDLEY_IBM_VERSION_CHECK)
+#undef HEDLEY_IBM_VERSION_CHECK
+#endif
+#if defined(HEDLEY_IBM_VERSION)
+#define HEDLEY_IBM_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_IBM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_IBM_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_TI_VERSION)
+#undef HEDLEY_TI_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__)
+#define HEDLEY_TI_VERSION                                                 \
+	HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000,          \
+			      (__TI_COMPILER_VERSION__ % 1000000) / 1000, \
+			      (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(HEDLEY_TI_VERSION_CHECK)
+#undef HEDLEY_TI_VERSION_CHECK
+#endif
+#if defined(HEDLEY_TI_VERSION)
+#define HEDLEY_TI_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_TI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_TI_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_CRAY_VERSION)
+#undef HEDLEY_CRAY_VERSION
+#endif
+#if defined(_CRAYC)
+#if defined(_RELEASE_PATCHLEVEL)
+#define HEDLEY_CRAY_VERSION                                   \
+	HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, \
+			      _RELEASE_PATCHLEVEL)
+#else
+#define HEDLEY_CRAY_VERSION \
+	HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0)
+#endif
+#endif
+
+#if defined(HEDLEY_CRAY_VERSION_CHECK)
+#undef HEDLEY_CRAY_VERSION_CHECK
+#endif
+#if defined(HEDLEY_CRAY_VERSION)
+#define HEDLEY_CRAY_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_CRAY_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_CRAY_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_IAR_VERSION)
+#undef HEDLEY_IAR_VERSION
+#endif
+#if defined(__IAR_SYSTEMS_ICC__)
+#if __VER__ > 1000
+#define HEDLEY_IAR_VERSION                                                    \
+	HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), \
+			      (__VER__ % 1000))
+#else
+#define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0)
+#endif
+#endif
+
+#if defined(HEDLEY_IAR_VERSION_CHECK)
+#undef HEDLEY_IAR_VERSION_CHECK
+#endif
+#if defined(HEDLEY_IAR_VERSION)
+#define HEDLEY_IAR_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_IAR_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_IAR_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_TINYC_VERSION)
+#undef HEDLEY_TINYC_VERSION
+#endif
+#if defined(__TINYC__)
+#define HEDLEY_TINYC_VERSION                                            \
+	HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, \
+			      __TINYC__ % 100)
+#endif
+
+#if defined(HEDLEY_TINYC_VERSION_CHECK)
+#undef HEDLEY_TINYC_VERSION_CHECK
+#endif
+#if defined(HEDLEY_TINYC_VERSION)
+#define HEDLEY_TINYC_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_TINYC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_TINYC_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_DMC_VERSION)
+#undef HEDLEY_DMC_VERSION
+#endif
+#if defined(__DMC__)
+#define HEDLEY_DMC_VERSION \
+	HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf)
+#endif
+
+#if defined(HEDLEY_DMC_VERSION_CHECK)
+#undef HEDLEY_DMC_VERSION_CHECK
+#endif
+#if defined(HEDLEY_DMC_VERSION)
+#define HEDLEY_DMC_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_DMC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_DMC_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_COMPCERT_VERSION)
+#undef HEDLEY_COMPCERT_VERSION
+#endif
+#if defined(__COMPCERT_VERSION__)
+#define HEDLEY_COMPCERT_VERSION                                   \
+	HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000,       \
+			      (__COMPCERT_VERSION__ / 100) % 100, \
+			      __COMPCERT_VERSION__ % 100)
+#endif
+
+#if defined(HEDLEY_COMPCERT_VERSION_CHECK)
+#undef HEDLEY_COMPCERT_VERSION_CHECK
+#endif
+#if defined(HEDLEY_COMPCERT_VERSION)
+#define HEDLEY_COMPCERT_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_COMPCERT_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_COMPCERT_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_PELLES_VERSION)
+#undef HEDLEY_PELLES_VERSION
+#endif
+#if defined(__POCC__)
+#define HEDLEY_PELLES_VERSION \
+	HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0)
+#endif
+
+#if defined(HEDLEY_PELLES_VERSION_CHECK)
+#undef HEDLEY_PELLES_VERSION_CHECK
+#endif
+#if defined(HEDLEY_PELLES_VERSION)
+#define HEDLEY_PELLES_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_PELLES_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_PELLES_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_GCC_VERSION)
+#undef HEDLEY_GCC_VERSION
+#endif
+#if defined(HEDLEY_GNUC_VERSION) && !defined(__clang__) &&                \
+	!defined(HEDLEY_INTEL_VERSION) && !defined(HEDLEY_PGI_VERSION) && \
+	!defined(HEDLEY_ARM_VERSION) && !defined(HEDLEY_TI_VERSION) &&    \
+	!defined(__COMPCERT__)
+#define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION
+#endif
+
+#if defined(HEDLEY_GCC_VERSION_CHECK)
+#undef HEDLEY_GCC_VERSION_CHECK
+#endif
+#if defined(HEDLEY_GCC_VERSION)
+#define HEDLEY_GCC_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_GCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_GCC_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
+#if defined(HEDLEY_HAS_ATTRIBUTE)
+#undef HEDLEY_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+#define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute)
+#else
+#define HEDLEY_HAS_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_ATTRIBUTE)
+#undef HEDLEY_GNUC_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+#define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute, major, minor, patch) \
+	__has_attribute(attribute)
+#else
+#define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_ATTRIBUTE)
+#undef HEDLEY_GCC_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+#define HEDLEY_GCC_HAS_ATTRIBUTE(attribute, major, minor, patch) \
+	__has_attribute(attribute)
+#else
+#define HEDLEY_GCC_HAS_ATTRIBUTE(attribute, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_HAS_CPP_ATTRIBUTE)
+#undef HEDLEY_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__has_cpp_attribute) && defined(__cplusplus)
+#define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute)
+#else
+#define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE)
+#undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__has_cpp_attribute) && defined(__cplusplus)
+#define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \
+	__has_cpp_attribute(attribute)
+#else
+#define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_CPP_ATTRIBUTE)
+#undef HEDLEY_GCC_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__has_cpp_attribute) && defined(__cplusplus)
+#define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \
+	__has_cpp_attribute(attribute)
+#else
+#define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_HAS_BUILTIN)
+#undef HEDLEY_HAS_BUILTIN
+#endif
+#if defined(__has_builtin)
+#define HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin)
+#else
+#define HEDLEY_HAS_BUILTIN(builtin) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_BUILTIN)
+#undef HEDLEY_GNUC_HAS_BUILTIN
+#endif
+#if defined(__has_builtin)
+#define HEDLEY_GNUC_HAS_BUILTIN(builtin, major, minor, patch) \
+	__has_builtin(builtin)
+#else
+#define HEDLEY_GNUC_HAS_BUILTIN(builtin, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_BUILTIN)
+#undef HEDLEY_GCC_HAS_BUILTIN
+#endif
+#if defined(__has_builtin)
+#define HEDLEY_GCC_HAS_BUILTIN(builtin, major, minor, patch) \
+	__has_builtin(builtin)
+#else
+#define HEDLEY_GCC_HAS_BUILTIN(builtin, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_HAS_FEATURE)
+#undef HEDLEY_HAS_FEATURE
+#endif
+#if defined(__has_feature)
+#define HEDLEY_HAS_FEATURE(feature) __has_feature(feature)
+#else
+#define HEDLEY_HAS_FEATURE(feature) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_FEATURE)
+#undef HEDLEY_GNUC_HAS_FEATURE
+#endif
+#if defined(__has_feature)
+#define HEDLEY_GNUC_HAS_FEATURE(feature, major, minor, patch) \
+	__has_feature(feature)
+#else
+#define HEDLEY_GNUC_HAS_FEATURE(feature, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_FEATURE)
+#undef HEDLEY_GCC_HAS_FEATURE
+#endif
+#if defined(__has_feature)
+#define HEDLEY_GCC_HAS_FEATURE(feature, major, minor, patch) \
+	__has_feature(feature)
+#else
+#define HEDLEY_GCC_HAS_FEATURE(feature, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_HAS_EXTENSION)
+#undef HEDLEY_HAS_EXTENSION
+#endif
+#if defined(__has_extension)
+#define HEDLEY_HAS_EXTENSION(extension) __has_extension(extension)
+#else
+#define HEDLEY_HAS_EXTENSION(extension) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_EXTENSION)
+#undef HEDLEY_GNUC_HAS_EXTENSION
+#endif
+#if defined(__has_extension)
+#define HEDLEY_GNUC_HAS_EXTENSION(extension, major, minor, patch) \
+	__has_extension(extension)
+#else
+#define HEDLEY_GNUC_HAS_EXTENSION(extension, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_EXTENSION)
+#undef HEDLEY_GCC_HAS_EXTENSION
+#endif
+#if defined(__has_extension)
+#define HEDLEY_GCC_HAS_EXTENSION(extension, major, minor, patch) \
+	__has_extension(extension)
+#else
+#define HEDLEY_GCC_HAS_EXTENSION(extension, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_HAS_DECLSPEC_ATTRIBUTE)
+#undef HEDLEY_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#if defined(__has_declspec_attribute)
+#define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) \
+	__has_declspec_attribute(attribute)
+#else
+#define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE)
+#undef HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#if defined(__has_declspec_attribute)
+#define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \
+	__has_declspec_attribute(attribute)
+#else
+#define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE)
+#undef HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#if defined(__has_declspec_attribute)
+#define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \
+	__has_declspec_attribute(attribute)
+#else
+#define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_HAS_WARNING)
+#undef HEDLEY_HAS_WARNING
+#endif
+#if defined(__has_warning)
+#define HEDLEY_HAS_WARNING(warning) __has_warning(warning)
+#else
+#define HEDLEY_HAS_WARNING(warning) (0)
+#endif
+
+#if defined(HEDLEY_GNUC_HAS_WARNING)
+#undef HEDLEY_GNUC_HAS_WARNING
+#endif
+#if defined(__has_warning)
+#define HEDLEY_GNUC_HAS_WARNING(warning, major, minor, patch) \
+	__has_warning(warning)
+#else
+#define HEDLEY_GNUC_HAS_WARNING(warning, major, minor, patch) \
+	HEDLEY_GNUC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_GCC_HAS_WARNING)
+#undef HEDLEY_GCC_HAS_WARNING
+#endif
+#if defined(__has_warning)
+#define HEDLEY_GCC_HAS_WARNING(warning, major, minor, patch) \
+	__has_warning(warning)
+#else
+#define HEDLEY_GCC_HAS_WARNING(warning, major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+	defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||  \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                     \
+	HEDLEY_IAR_VERSION_CHECK(8, 0, 0) ||                        \
+	HEDLEY_PGI_VERSION_CHECK(18, 4, 0) ||                       \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                        \
+	HEDLEY_TI_VERSION_CHECK(6, 0, 0) ||                         \
+	HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) ||                       \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) ||                     \
+	HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) ||                     \
+	(HEDLEY_IBM_VERSION_CHECK(10, 1, 0) && defined(__C99_PRAGMA_OPERATOR))
+#define HEDLEY_PRAGMA(value) _Pragma(#value)
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#define HEDLEY_PRAGMA(value) __pragma(value)
+#else
+#define HEDLEY_PRAGMA(value)
+#endif
+
+#if defined(HEDLEY_DIAGNOSTIC_PUSH)
+#undef HEDLEY_DIAGNOSTIC_PUSH
+#endif
+#if defined(HEDLEY_DIAGNOSTIC_POP)
+#undef HEDLEY_DIAGNOSTIC_POP
+#endif
+#if defined(__clang__)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
+#define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
+#elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
+#elif HEDLEY_TI_VERSION_CHECK(8, 1, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
+#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#else
+#define HEDLEY_DIAGNOSTIC_PUSH
+#define HEDLEY_DIAGNOSTIC_POP
+#endif
+
+#if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
+#undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#endif
+#if HEDLEY_HAS_WARNING("-Wdeprecated-declarations")
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("warning(disable:1478 1786)")
+#elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable : 4996))
+#elif HEDLEY_TI_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718")
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) && !defined(__cplusplus)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)")
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) && defined(__cplusplus)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("error_messages(off,symdeprecated,symdeprecated2)")
+#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("diag_suppress=Pe1444,Pe1215")
+#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)")
+#else
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#endif
+
+#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS)
+#undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#endif
+#if HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	_Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"")
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	_Pragma("warning(disable:161)")
+#elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	__pragma(warning(disable : 4068))
+#elif HEDLEY_TI_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
+#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161")
+#else
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#endif
+
+#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL)
+#undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#endif
+#if HEDLEY_HAS_WARNING("-Wcast-qual")
+#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
+	_Pragma("clang diagnostic ignored \"-Wcast-qual\"")
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
+	_Pragma("warning(disable:2203 2331)")
+#elif HEDLEY_GCC_VERSION_CHECK(3, 0, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
+	_Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
+#else
+#define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#endif
+
+#if defined(HEDLEY_DEPRECATED)
+#undef HEDLEY_DEPRECATED
+#endif
+#if defined(HEDLEY_DEPRECATED_FOR)
+#undef HEDLEY_DEPRECATED_FOR
+#endif
+#if defined(__cplusplus) && (__cplusplus >= 201402L)
+#define HEDLEY_DEPRECATED(since) [[deprecated("Since " #since)]]
+#define HEDLEY_DEPRECATED_FOR(since, replacement) \
+	[[deprecated("Since " #since "; use " #replacement)]]
+#elif HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \
+	HEDLEY_GCC_VERSION_CHECK(4, 5, 0) ||                     \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                  \
+	HEDLEY_ARM_VERSION_CHECK(5, 6, 0) ||                     \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) ||                 \
+	HEDLEY_PGI_VERSION_CHECK(17, 10, 0) ||                   \
+	HEDLEY_TI_VERSION_CHECK(8, 3, 0)
+#define HEDLEY_DEPRECATED(since) \
+	__attribute__((__deprecated__("Since " #since)))
+#define HEDLEY_DEPRECATED_FOR(since, replacement) \
+	__attribute__((__deprecated__("Since " #since "; use " #replacement)))
+#elif HEDLEY_HAS_ATTRIBUTE(deprecated) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                   \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                                    \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                                   \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
+#define HEDLEY_DEPRECATED_FOR(since, replacement) \
+	__attribute__((__deprecated__))
+#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
+#define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " #since))
+#define HEDLEY_DEPRECATED_FOR(since, replacement) \
+	__declspec(deprecated("Since " #since "; use " #replacement))
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
+	HEDLEY_PELLES_VERSION_CHECK(6, 50, 0)
+#define HEDLEY_DEPRECATED(since) __declspec(deprecated)
+#define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
+#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_DEPRECATED(since) _Pragma("deprecated")
+#define HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated")
+#else
+#define HEDLEY_DEPRECATED(since)
+#define HEDLEY_DEPRECATED_FOR(since, replacement)
+#endif
+
+#if defined(HEDLEY_UNAVAILABLE)
+#undef HEDLEY_UNAVAILABLE
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(warning) || HEDLEY_GCC_VERSION_CHECK(4, 3, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_UNAVAILABLE(available_since) \
+	__attribute__((__warning__("Not available until " #available_since)))
+#else
+#define HEDLEY_UNAVAILABLE(available_since)
+#endif
+
+#if defined(HEDLEY_WARN_UNUSED_RESULT)
+#undef HEDLEY_WARN_UNUSED_RESULT
+#endif
+#if defined(__cplusplus) && (__cplusplus >= 201703L)
+#define HEDLEY_WARN_UNUSED_RESULT [[nodiscard]]
+#elif HEDLEY_HAS_ATTRIBUTE(warn_unused_result) ||                          \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                               \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                                \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                               \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) ||                         \
+	(HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \
+	HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
+#define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#elif defined(_Check_return_) /* SAL */
+#define HEDLEY_WARN_UNUSED_RESULT _Check_return_
+#else
+#define HEDLEY_WARN_UNUSED_RESULT
+#endif
+
+#if defined(HEDLEY_SENTINEL)
+#undef HEDLEY_SENTINEL
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(sentinel) || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_ARM_VERSION_CHECK(5, 4, 0)
+#define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position)))
+#else
+#define HEDLEY_SENTINEL(position)
+#endif
+
+#if defined(HEDLEY_NO_RETURN)
+#undef HEDLEY_NO_RETURN
+#endif
+#if HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_NO_RETURN __noreturn
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_NO_RETURN __attribute__((__noreturn__))
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#define HEDLEY_NO_RETURN _Noreturn
+#elif defined(__cplusplus) && (__cplusplus >= 201103L)
+#define HEDLEY_NO_RETURN [[noreturn]]
+#elif HEDLEY_HAS_ATTRIBUTE(noreturn) || HEDLEY_GCC_VERSION_CHECK(3, 2, 0) || \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) ||                             \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                 \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                                \
+	HEDLEY_TI_VERSION_CHECK(18, 0, 0) ||                                 \
+	(HEDLEY_TI_VERSION_CHECK(17, 3, 0) &&                                \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_NO_RETURN __attribute__((__noreturn__))
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
+#define HEDLEY_NO_RETURN _Pragma("does_not_return")
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#define HEDLEY_NO_RETURN __declspec(noreturn)
+#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
+#define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
+#elif HEDLEY_COMPCERT_VERSION_CHECK(3, 2, 0)
+#define HEDLEY_NO_RETURN __attribute((noreturn))
+#elif HEDLEY_PELLES_VERSION_CHECK(9, 0, 0)
+#define HEDLEY_NO_RETURN __declspec(noreturn)
+#else
+#define HEDLEY_NO_RETURN
+#endif
+
+#if defined(HEDLEY_UNREACHABLE)
+#undef HEDLEY_UNREACHABLE
+#endif
+#if defined(HEDLEY_UNREACHABLE_RETURN)
+#undef HEDLEY_UNREACHABLE_RETURN
+#endif
+#if (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \
+     (!defined(HEDLEY_ARM_VERSION))) ||           \
+	HEDLEY_GCC_VERSION_CHECK(4, 5, 0) ||      \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||   \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 5)
+#define HEDLEY_UNREACHABLE() __builtin_unreachable()
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#define HEDLEY_UNREACHABLE() __assume(0)
+#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0)
+#if defined(__cplusplus)
+#define HEDLEY_UNREACHABLE() std::_nassert(0)
+#else
+#define HEDLEY_UNREACHABLE() _nassert(0)
+#endif
+#define HEDLEY_UNREACHABLE_RETURN(value) return value
+#elif defined(EXIT_FAILURE)
+#define HEDLEY_UNREACHABLE() abort()
+#else
+#define HEDLEY_UNREACHABLE()
+#define HEDLEY_UNREACHABLE_RETURN(value) return value
+#endif
+#if !defined(HEDLEY_UNREACHABLE_RETURN)
+#define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE()
+#endif
+
+#if defined(HEDLEY_ASSUME)
+#undef HEDLEY_ASSUME
+#endif
+#if HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_ASSUME(expr) __assume(expr)
+#elif HEDLEY_HAS_BUILTIN(__builtin_assume)
+#define HEDLEY_ASSUME(expr) __builtin_assume(expr)
+#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0)
+#if defined(__cplusplus)
+#define HEDLEY_ASSUME(expr) std::_nassert(expr)
+#else
+#define HEDLEY_ASSUME(expr) _nassert(expr)
+#endif
+#elif (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && \
+       !defined(HEDLEY_ARM_VERSION)) ||             \
+	HEDLEY_GCC_VERSION_CHECK(4, 5, 0) ||        \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||     \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 5)
+#define HEDLEY_ASSUME(expr) ((void)((expr) ? 1 : (__builtin_unreachable(), 1)))
+#else
+#define HEDLEY_ASSUME(expr) ((void)(expr))
+#endif
+
+HEDLEY_DIAGNOSTIC_PUSH
+#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0)
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wvariadic-macros"
+#elif defined(HEDLEY_GCC_VERSION)
+#pragma GCC diagnostic ignored "-Wvariadic-macros"
+#endif
+#endif
+#if defined(HEDLEY_NON_NULL)
+#undef HEDLEY_NON_NULL
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(nonnull) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                           \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0)
+#define HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
+#else
+#define HEDLEY_NON_NULL(...)
+#endif
+HEDLEY_DIAGNOSTIC_POP
+
+#if defined(HEDLEY_PRINTF_FORMAT)
+#undef HEDLEY_PRINTF_FORMAT
+#endif
+#if defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format, 4, 4, 0) && \
+	!defined(__USE_MINGW_ANSI_STDIO)
+#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \
+	__attribute__((__format__(ms_printf, string_idx, first_to_check)))
+#elif defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format, 4, 4, 0) && \
+	defined(__USE_MINGW_ANSI_STDIO)
+#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \
+	__attribute__((__format__(gnu_printf, string_idx, first_to_check)))
+#elif HEDLEY_HAS_ATTRIBUTE(format) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_ARM_VERSION_CHECK(5, 6, 0) ||                               \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                              \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                                \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                               \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \
+	__attribute__((__format__(__printf__, string_idx, first_to_check)))
+#elif HEDLEY_PELLES_VERSION_CHECK(6, 0, 0)
+#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check) \
+	__declspec(vaformat(printf, string_idx, first_to_check))
+#else
+#define HEDLEY_PRINTF_FORMAT(string_idx, first_to_check)
+#endif
+
+#if defined(HEDLEY_CONSTEXPR)
+#undef HEDLEY_CONSTEXPR
+#endif
+#if defined(__cplusplus)
+#if __cplusplus >= 201103L
+#define HEDLEY_CONSTEXPR constexpr
+#endif
+#endif
+#if !defined(HEDLEY_CONSTEXPR)
+#define HEDLEY_CONSTEXPR
+#endif
+
+#if defined(HEDLEY_PREDICT)
+#undef HEDLEY_PREDICT
+#endif
+#if defined(HEDLEY_LIKELY)
+#undef HEDLEY_LIKELY
+#endif
+#if defined(HEDLEY_UNLIKELY)
+#undef HEDLEY_UNLIKELY
+#endif
+#if defined(HEDLEY_UNPREDICTABLE)
+#undef HEDLEY_UNPREDICTABLE
+#endif
+#if HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
+#define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable(!!(expr))
+#endif
+#if HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \
+	HEDLEY_GCC_VERSION_CHECK(9, 0, 0)
+#define HEDLEY_PREDICT(expr, value, probability) \
+	__builtin_expect_with_probability(expr, value, probability)
+#define HEDLEY_PREDICT_TRUE(expr, probability) \
+	__builtin_expect_with_probability(!!(expr), 1, probability)
+#define HEDLEY_PREDICT_FALSE(expr, probability) \
+	__builtin_expect_with_probability(!!(expr), 0, probability)
+#define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#if !defined(HEDLEY_BUILTIN_UNPREDICTABLE)
+#define HEDLEY_BUILTIN_UNPREDICTABLE(expr) \
+	__builtin_expect_with_probability(!!(expr), 1, 0.5)
+#endif
+#elif HEDLEY_HAS_BUILTIN(__builtin_expect) ||                              \
+	HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||                               \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	(HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                              \
+	HEDLEY_TI_VERSION_CHECK(6, 1, 0) ||                                \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 27)
+#define HEDLEY_PREDICT(expr, expected, probability)                      \
+	(((probability) >= 0.9) ? __builtin_expect(!!(expr), (expected)) \
+				: (((void)(expected)), !!(expr)))
+#define HEDLEY_PREDICT_TRUE(expr, probability)                               \
+	(__extension__({                                                     \
+		HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \
+		((hedley_probability_ >= 0.9)                                \
+			 ? __builtin_expect(!!(expr), 1)                     \
+			 : ((hedley_probability_ <= 0.1)                     \
+				    ? __builtin_expect(!!(expr), 0)          \
+				    : !!(expr)));                            \
+	}))
+#define HEDLEY_PREDICT_FALSE(expr, probability)                              \
+	(__extension__({                                                     \
+		HEDLEY_CONSTEXPR double hedley_probability_ = (probability); \
+		((hedley_probability_ >= 0.9)                                \
+			 ? __builtin_expect(!!(expr), 0)                     \
+			 : ((hedley_probability_ <= 0.1)                     \
+				    ? __builtin_expect(!!(expr), 1)          \
+				    : !!(expr)));                            \
+	}))
+#define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#else
+#define HEDLEY_PREDICT(expr, expected, probability) \
+	(((void)(expected)), !!(expr))
+#define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr))
+#define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr))
+#define HEDLEY_LIKELY(expr) (!!(expr))
+#define HEDLEY_UNLIKELY(expr) (!!(expr))
+#endif
+#if !defined(HEDLEY_UNPREDICTABLE)
+#define HEDLEY_UNPREDICTABLE(expr) HEDLEY_PREDICT(expr, 1, 0.5)
+#endif
+
+#if defined(HEDLEY_MALLOC)
+#undef HEDLEY_MALLOC
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(malloc) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                          \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) ||                         \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                             \
+	HEDLEY_IBM_VERSION_CHECK(12, 1, 0) ||                            \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                              \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                             \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_MALLOC __attribute__((__malloc__))
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
+#define HEDLEY_MALLOC _Pragma("returns_new_memory")
+#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
+#define HEDLEY_MALLOC __declspec(restrict)
+#else
+#define HEDLEY_MALLOC
+#endif
+
+#if defined(HEDLEY_PURE)
+#undef HEDLEY_PURE
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(pure) || HEDLEY_GCC_VERSION_CHECK(2, 96, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                         \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) ||                        \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                            \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                           \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                             \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                            \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) ||                      \
+	HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
+#define HEDLEY_PURE __attribute__((__pure__))
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
+#define HEDLEY_PURE _Pragma("does_not_write_global_data")
+#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
+#define HEDLEY_PURE _Pragma("FUNC_IS_PURE;")
+#else
+#define HEDLEY_PURE
+#endif
+
+#if defined(HEDLEY_CONST)
+#undef HEDLEY_CONST
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(const) || HEDLEY_GCC_VERSION_CHECK(2, 5, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                         \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) ||                        \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                            \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                           \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                             \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                            \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) ||                      \
+	HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
+#define HEDLEY_CONST __attribute__((__const__))
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
+#define HEDLEY_CONST _Pragma("no_side_effect")
+#else
+#define HEDLEY_CONST HEDLEY_PURE
+#endif
+
+#if defined(HEDLEY_RESTRICT)
+#undef HEDLEY_RESTRICT
+#endif
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
+	!defined(__cplusplus)
+#define HEDLEY_RESTRICT restrict
+#elif HEDLEY_GCC_VERSION_CHECK(3, 1, 0) ||                                 \
+	HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) ||                             \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                              \
+	HEDLEY_PGI_VERSION_CHECK(17, 10, 0) ||                             \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                                \
+	(HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) && defined(__cplusplus)) || \
+	HEDLEY_IAR_VERSION_CHECK(8, 0, 0) || defined(__clang__)
+#define HEDLEY_RESTRICT __restrict
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 3, 0) && !defined(__cplusplus)
+#define HEDLEY_RESTRICT _Restrict
+#else
+#define HEDLEY_RESTRICT
+#endif
+
+#if defined(HEDLEY_INLINE)
+#undef HEDLEY_INLINE
+#endif
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+	(defined(__cplusplus) && (__cplusplus >= 199711L))
+#define HEDLEY_INLINE inline
+#elif defined(HEDLEY_GCC_VERSION) || HEDLEY_ARM_VERSION_CHECK(6, 2, 0)
+#define HEDLEY_INLINE __inline__
+#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) || \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || HEDLEY_TI_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_INLINE __inline
+#else
+#define HEDLEY_INLINE
+#endif
+
+#if defined(HEDLEY_ALWAYS_INLINE)
+#undef HEDLEY_ALWAYS_INLINE
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(always_inline) ||       \
+	HEDLEY_GCC_VERSION_CHECK(4, 0, 0) ||     \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||  \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) || \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||     \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||    \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||      \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&     \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE
+#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0)
+#define HEDLEY_ALWAYS_INLINE __forceinline
+#elif HEDLEY_TI_VERSION_CHECK(7, 0, 0) && defined(__cplusplus)
+#define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;")
+#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced")
+#else
+#define HEDLEY_ALWAYS_INLINE HEDLEY_INLINE
+#endif
+
+#if defined(HEDLEY_NEVER_INLINE)
+#undef HEDLEY_NEVER_INLINE
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(noinline) || HEDLEY_GCC_VERSION_CHECK(4, 0, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) ||                           \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
+	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                              \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                                \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) &&                               \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_NEVER_INLINE __attribute__((__noinline__))
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#define HEDLEY_NEVER_INLINE __declspec(noinline)
+#elif HEDLEY_PGI_VERSION_CHECK(10, 2, 0)
+#define HEDLEY_NEVER_INLINE _Pragma("noinline")
+#elif HEDLEY_TI_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
+#define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;")
+#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_NEVER_INLINE _Pragma("inline=never")
+#elif HEDLEY_COMPCERT_VERSION_CHECK(3, 2, 0)
+#define HEDLEY_NEVER_INLINE __attribute((noinline))
+#elif HEDLEY_PELLES_VERSION_CHECK(9, 0, 0)
+#define HEDLEY_NEVER_INLINE __declspec(noinline)
+#else
+#define HEDLEY_NEVER_INLINE
+#endif
+
+#if defined(HEDLEY_PRIVATE)
+#undef HEDLEY_PRIVATE
+#endif
+#if defined(HEDLEY_PUBLIC)
+#undef HEDLEY_PUBLIC
+#endif
+#if defined(HEDLEY_IMPORT)
+#undef HEDLEY_IMPORT
+#endif
+#if defined(_WIN32) || defined(__CYGWIN__)
+#define HEDLEY_PRIVATE
+#define HEDLEY_PUBLIC __declspec(dllexport)
+#define HEDLEY_IMPORT __declspec(dllimport)
+#else
+#if HEDLEY_HAS_ATTRIBUTE(visibility) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 11, 0) ||                             \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                              \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                 \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 0) ||                                \
+	HEDLEY_TI_VERSION_CHECK(8, 0, 0) ||                                  \
+	(HEDLEY_TI_VERSION_CHECK(7, 3, 0) && defined(__TI_EABI__) &&         \
+	 defined(__TI_GNU_ATTRIBUTE_SUPPORT__))
+#define HEDLEY_PRIVATE __attribute__((__visibility__("hidden")))
+#define HEDLEY_PUBLIC __attribute__((__visibility__("default")))
+#else
+#define HEDLEY_PRIVATE
+#define HEDLEY_PUBLIC
+#endif
+#define HEDLEY_IMPORT extern
+#endif
+
+#if defined(HEDLEY_NO_THROW)
+#undef HEDLEY_NO_THROW
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(nothrow) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_NO_THROW __attribute__((__nothrow__))
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 1, 0) || HEDLEY_ARM_VERSION_CHECK(4, 1, 0)
+#define HEDLEY_NO_THROW __declspec(nothrow)
+#else
+#define HEDLEY_NO_THROW
+#endif
+
+#if defined(HEDLEY_FALL_THROUGH)
+#undef HEDLEY_FALL_THROUGH
+#endif
+#if defined(__cplusplus) &&                        \
+	(!defined(HEDLEY_SUNPRO_VERSION) ||        \
+	 HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0)) && \
+	!defined(HEDLEY_PGI_VERSION)
+#if (__cplusplus >= 201703L) || \
+	((__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough))
+#define HEDLEY_FALL_THROUGH [[fallthrough]]
+#elif (__cplusplus >= 201103L) && HEDLEY_HAS_CPP_ATTRIBUTE(clang::fallthrough)
+#define HEDLEY_FALL_THROUGH [[clang::fallthrough]]
+#elif (__cplusplus >= 201103L) && HEDLEY_GCC_VERSION_CHECK(7, 0, 0)
+#define HEDLEY_FALL_THROUGH [[gnu::fallthrough]]
+#endif
+#endif
+#if !defined(HEDLEY_FALL_THROUGH)
+#if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough, 7, 0, 0) && \
+	!defined(HEDLEY_PGI_VERSION)
+#define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
+#elif defined(__fallthrough) /* SAL */
+#define HEDLEY_FALL_THROUGH __fallthrough
+#else
+#define HEDLEY_FALL_THROUGH
+#endif
+#endif
+
+#if defined(HEDLEY_RETURNS_NON_NULL)
+#undef HEDLEY_RETURNS_NON_NULL
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
+#define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__))
+#elif defined(_Ret_notnull_) /* SAL */
+#define HEDLEY_RETURNS_NON_NULL _Ret_notnull_
+#else
+#define HEDLEY_RETURNS_NON_NULL
+#endif
+
+#if defined(HEDLEY_ARRAY_PARAM)
+#undef HEDLEY_ARRAY_PARAM
+#endif
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
+	!defined(__STDC_NO_VLA__) && !defined(__cplusplus) &&     \
+	!defined(HEDLEY_PGI_VERSION) && !defined(HEDLEY_TINYC_VERSION)
+#define HEDLEY_ARRAY_PARAM(name) (name)
+#else
+#define HEDLEY_ARRAY_PARAM(name)
+#endif
+
+#if defined(HEDLEY_IS_CONSTANT)
+#undef HEDLEY_IS_CONSTANT
+#endif
+#if defined(HEDLEY_REQUIRE_CONSTEXPR)
+#undef HEDLEY_REQUIRE_CONSTEXPR
+#endif
+/* Note the double-underscore. For internal use only; no API
+ * guarantees! */
+#if defined(HEDLEY__IS_CONSTEXPR)
+#undef HEDLEY__IS_CONSTEXPR
+#endif
+
+#if HEDLEY_HAS_BUILTIN(__builtin_constant_p) ||                             \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                                \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                             \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 19) ||                             \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 0) ||                               \
+	HEDLEY_TI_VERSION_CHECK(6, 1, 0) ||                                 \
+	(HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) && !defined(__cplusplus)) || \
+	HEDLEY_CRAY_VERSION_CHECK(8, 1, 0)
+#define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr)
+#endif
+#if !defined(__cplusplus)
+#if HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||            \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||         \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 0) ||           \
+	HEDLEY_CRAY_VERSION_CHECK(8, 1, 0) ||           \
+	HEDLEY_ARM_VERSION_CHECK(5, 4, 0) ||            \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24)
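+/* How this works: when `expr` is an integer constant expression,
+ * `(expr) * 0` (after the cast) forms a null pointer constant, so the
+ * conditional below has type `int *`; otherwise it has type `void *`.
+ * Checking the resulting type with __builtin_types_compatible_p thus
+ * detects constant expressions, e.g. HEDLEY__IS_CONSTEXPR(4) is 1 while
+ * HEDLEY__IS_CONSTEXPR(argc) is 0. */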
+#if defined(__INTPTR_TYPE__)
+#define HEDLEY__IS_CONSTEXPR(expr)                                    \
+	__builtin_types_compatible_p(                                 \
+		__typeof__((1 ? (void *)((__INTPTR_TYPE__)((expr)*0)) \
+			      : (int *)0)),                           \
+		int *)
+#else
+#include <stdint.h>
+#define HEDLEY__IS_CONSTEXPR(expr)                                           \
+	__builtin_types_compatible_p(                                        \
+		__typeof__((1 ? (void *)((intptr_t)((expr)*0)) : (int *)0)), \
+		int *)
+#endif
+#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) &&       \
+       !defined(HEDLEY_SUNPRO_VERSION) && !defined(HEDLEY_PGI_VERSION)) || \
+	HEDLEY_HAS_EXTENSION(c_generic_selections) ||                      \
+	HEDLEY_GCC_VERSION_CHECK(4, 9, 0) ||                               \
+	HEDLEY_INTEL_VERSION_CHECK(17, 0, 0) ||                            \
+	HEDLEY_IBM_VERSION_CHECK(12, 1, 0) ||                              \
+	HEDLEY_ARM_VERSION_CHECK(5, 3, 0)
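+/* Same null-pointer-constant trick as above, but using C11 _Generic to
+ * inspect the type of the conditional expression instead of a GNU
+ * builtin. */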
+#if defined(__INTPTR_TYPE__)
+#define HEDLEY__IS_CONSTEXPR(expr)                                       \
+	_Generic((1 ? (void *)((__INTPTR_TYPE__)((expr)*0)) : (int *)0), \
+		 int * : 1, void * : 0)
+#else
+#include <stdint.h>
+#define HEDLEY__IS_CONSTEXPR(expr)                                           \
+	_Generic((1 ? (void *)((intptr_t)((expr)*0)) : (int *)0), int * : 1,  \
+		 void * : 0)
+#endif
+#elif defined(HEDLEY_GCC_VERSION) || defined(HEDLEY_INTEL_VERSION) ||  \
+	defined(HEDLEY_TINYC_VERSION) || defined(HEDLEY_TI_VERSION) || \
+	defined(__clang__)
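+/* Last-resort variant for compilers that accept sizeof(void) as a GNU
+ * extension (where it equals 1): if `expr` is constant, the conditional
+ * has the double-sized struct pointer type, so the sizeof comparison
+ * differs and the macro yields 1; otherwise the type is `void *` and the
+ * comparison is false. */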
+#define HEDLEY__IS_CONSTEXPR(expr)                                       \
+	(sizeof(void) != sizeof(*(1 ? ((void *)((expr)*0L)) : ((struct { \
+		 char v[sizeof(void) * 2];                               \
+	 } *)1))))
+#endif
+#endif
+#if defined(HEDLEY__IS_CONSTEXPR)
+#if !defined(HEDLEY_IS_CONSTANT)
+#define HEDLEY_IS_CONSTANT(expr) HEDLEY__IS_CONSTEXPR(expr)
+#endif
+#define HEDLEY_REQUIRE_CONSTEXPR(expr) \
+	(HEDLEY__IS_CONSTEXPR(expr) ? (expr) : (-1))
+#else
+#if !defined(HEDLEY_IS_CONSTANT)
+#define HEDLEY_IS_CONSTANT(expr) (0)
+#endif
+#define HEDLEY_REQUIRE_CONSTEXPR(expr) (expr)
+#endif
+
+#if defined(HEDLEY_BEGIN_C_DECLS)
+#undef HEDLEY_BEGIN_C_DECLS
+#endif
+#if defined(HEDLEY_END_C_DECLS)
+#undef HEDLEY_END_C_DECLS
+#endif
+#if defined(HEDLEY_C_DECL)
+#undef HEDLEY_C_DECL
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_BEGIN_C_DECLS extern "C" {
+#define HEDLEY_END_C_DECLS }
+#define HEDLEY_C_DECL extern "C"
+#else
+#define HEDLEY_BEGIN_C_DECLS
+#define HEDLEY_END_C_DECLS
+#define HEDLEY_C_DECL
+#endif
+
+#if defined(HEDLEY_STATIC_ASSERT)
+#undef HEDLEY_STATIC_ASSERT
+#endif
+#if !defined(__cplusplus) &&                                             \
+	((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
+	 HEDLEY_HAS_FEATURE(c_static_assert) ||                          \
+	 HEDLEY_GCC_VERSION_CHECK(6, 0, 0) ||                            \
+	 HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert))
+#define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
+	HEDLEY_MSVC_VERSION_CHECK(16, 0, 0) ||              \
+	(defined(__cplusplus) && HEDLEY_TI_VERSION_CHECK(8, 3, 0))
+#define HEDLEY_STATIC_ASSERT(expr, message) static_assert(expr, message)
+#else
+#define HEDLEY_STATIC_ASSERT(expr, message)
+#endif
+
+#if defined(HEDLEY_CONST_CAST)
+#undef HEDLEY_CONST_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr))
+#elif HEDLEY_HAS_WARNING("-Wcast-qual") ||   \
+	HEDLEY_GCC_VERSION_CHECK(4, 6, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
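+/* A GNU statement expression lets the qualifier-dropping cast be wrapped
+ * in a push/disable/pop of the -Wcast-qual diagnostic. */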
+#define HEDLEY_CONST_CAST(T, expr)                              \
+	(__extension__({                                        \
+		HEDLEY_DIAGNOSTIC_PUSH                          \
+		HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL((T)(expr)); \
+		HEDLEY_DIAGNOSTIC_POP                           \
+	}))
+#else
+#define HEDLEY_CONST_CAST(T, expr) ((T)(expr))
+#endif
+
+#if defined(HEDLEY_REINTERPRET_CAST)
+#undef HEDLEY_REINTERPRET_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr))
+#else
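+/* C fallback: type-puns `expr` through a pointer cast.  Unlike C++
+ * reinterpret_cast this requires an addressable lvalue and may violate
+ * strict aliasing for incompatible types, so use with care. */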
+#define HEDLEY_REINTERPRET_CAST(T, expr) (*((T *)&(expr)))
+#endif
+
+#if defined(HEDLEY_STATIC_CAST)
+#undef HEDLEY_STATIC_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr))
+#else
+#define HEDLEY_STATIC_CAST(T, expr) ((T)(expr))
+#endif
+
+#if defined(HEDLEY_CPP_CAST)
+#undef HEDLEY_CPP_CAST
+#endif
+#if defined(__cplusplus)
+#define HEDLEY_CPP_CAST(T, expr) static_cast<T>(expr)
+#else
+#define HEDLEY_CPP_CAST(T, expr) (expr)
+#endif
+
+#if defined(HEDLEY_MESSAGE)
+#undef HEDLEY_MESSAGE
+#endif
+#if HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+#define HEDLEY_MESSAGE(msg)                       \
+	HEDLEY_DIAGNOSTIC_PUSH                    \
+	HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	HEDLEY_PRAGMA(message msg)                \
+	HEDLEY_DIAGNOSTIC_POP
+#elif HEDLEY_GCC_VERSION_CHECK(4, 4, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message msg)
+#elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
+#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(_CRI message msg)
+#elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
+#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg))
+#elif HEDLEY_PELLES_VERSION_CHECK(2, 0, 0)
+#define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg))
+#else
+#define HEDLEY_MESSAGE(msg)
+#endif
+
+#if defined(HEDLEY_WARNING)
+#undef HEDLEY_WARNING
+#endif
+#if HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+#define HEDLEY_WARNING(msg)                       \
+	HEDLEY_DIAGNOSTIC_PUSH                    \
+	HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	HEDLEY_PRAGMA(clang warning msg)          \
+	HEDLEY_DIAGNOSTIC_POP
+#elif HEDLEY_GCC_VERSION_CHECK(4, 8, 0) || HEDLEY_PGI_VERSION_CHECK(18, 4, 0)
+#define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg)
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg))
+#else
+#define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg)
+#endif
+
+#if defined(HEDLEY_REQUIRE_MSG)
+#undef HEDLEY_REQUIRE_MSG
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(diagnose_if)
+#if HEDLEY_HAS_WARNING("-Wgcc-compat")
+#define HEDLEY_REQUIRE_MSG(expr, msg)                                   \
+	HEDLEY_DIAGNOSTIC_PUSH                                          \
+	_Pragma("clang diagnostic ignored \"-Wgcc-compat\"")            \
+		__attribute__((__diagnose_if__(!(expr), msg, "error"))) \
+			HEDLEY_DIAGNOSTIC_POP
+#else
+#define HEDLEY_REQUIRE_MSG(expr, msg) \
+	__attribute__((__diagnose_if__(!(expr), msg, "error")))
+#endif
+#else
+#define HEDLEY_REQUIRE_MSG(expr, msg)
+#endif
+
+#if defined(HEDLEY_REQUIRE)
+#undef HEDLEY_REQUIRE
+#endif
+#define HEDLEY_REQUIRE(expr) HEDLEY_REQUIRE_MSG(expr, #expr)
+
+#if defined(HEDLEY_FLAGS)
+#undef HEDLEY_FLAGS
+#endif
+#if HEDLEY_HAS_ATTRIBUTE(flag_enum)
+#define HEDLEY_FLAGS __attribute__((__flag_enum__))
+#endif
+
+#if defined(HEDLEY_FLAGS_CAST)
+#undef HEDLEY_FLAGS_CAST
+#endif
+#if HEDLEY_INTEL_VERSION_CHECK(19, 0, 0)
+#define HEDLEY_FLAGS_CAST(T, expr)                          \
+	(__extension__({                                    \
+		HEDLEY_DIAGNOSTIC_PUSH                      \
+		_Pragma("warning(disable:188)")((T)(expr)); \
+		HEDLEY_DIAGNOSTIC_POP                       \
+	}))
+#else
+#define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr)
+#endif
+
+#if defined(HEDLEY_EMPTY_BASES)
+#undef HEDLEY_EMPTY_BASES
+#endif
+#if HEDLEY_MSVC_VERSION_CHECK(19, 0, 23918) && \
+	!HEDLEY_MSVC_VERSION_CHECK(20, 0, 0)
+#define HEDLEY_EMPTY_BASES __declspec(empty_bases)
+#else
+#define HEDLEY_EMPTY_BASES
+#endif
+
+/* Remaining macros are deprecated. */
+
+#if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK)
+#undef HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
+#endif
+#if defined(__clang__)
+#define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major, minor, patch) (0)
+#else
+#define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major, minor, patch) \
+	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
+#endif
+
+#if defined(HEDLEY_CLANG_HAS_ATTRIBUTE)
+#undef HEDLEY_CLANG_HAS_ATTRIBUTE
+#endif
+#define HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) HEDLEY_HAS_ATTRIBUTE(attribute)
+
+#if defined(HEDLEY_CLANG_HAS_CPP_ATTRIBUTE)
+#undef HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
+#endif
+#define HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) \
+	HEDLEY_HAS_CPP_ATTRIBUTE(attribute)
+
+#if defined(HEDLEY_CLANG_HAS_BUILTIN)
+#undef HEDLEY_CLANG_HAS_BUILTIN
+#endif
+#define HEDLEY_CLANG_HAS_BUILTIN(builtin) HEDLEY_HAS_BUILTIN(builtin)
+
+#if defined(HEDLEY_CLANG_HAS_FEATURE)
+#undef HEDLEY_CLANG_HAS_FEATURE
+#endif
+#define HEDLEY_CLANG_HAS_FEATURE(feature) HEDLEY_HAS_FEATURE(feature)
+
+#if defined(HEDLEY_CLANG_HAS_EXTENSION)
+#undef HEDLEY_CLANG_HAS_EXTENSION
+#endif
+#define HEDLEY_CLANG_HAS_EXTENSION(extension) HEDLEY_HAS_EXTENSION(extension)
+
+#if defined(HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE)
+#undef HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#define HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) \
+	HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute)
+
+#if defined(HEDLEY_CLANG_HAS_WARNING)
+#undef HEDLEY_CLANG_HAS_WARNING
+#endif
+#define HEDLEY_CLANG_HAS_WARNING(warning) HEDLEY_HAS_WARNING(warning)
+
+#endif /* !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < X) */

+ 1356 - 0
libobs/util/aarch/mmx.h

@@ -0,0 +1,1356 @@
+/* Copyright (c) 2017-2018 Evan Nemerson <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(SIMDE__MMX_H)
+#define SIMDE__MMX_H
+#include "simde-common.h"
+
+#if defined(SIMDE_MMX_FORCE_NATIVE)
+#define SIMDE_MMX_NATIVE
+#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#define SIMDE_MMX_NATIVE
+#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \
+	!defined(SIMDE_NO_NEON)
+#define SIMDE_MMX_NEON
+#endif
+
+#if defined(SIMDE_MMX_NATIVE)
+#include <mmintrin.h>
+#else
+#if defined(SIMDE_MMX_NEON)
+#include <arm_neon.h>
+#endif
+#endif
+#include <stdint.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+SIMDE__BEGIN_DECLS
+
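+/* All members of this union alias the same 64 bits: the arrays (or GCC
+ * vector extensions) back the portable fallbacks, `n` wraps the native
+ * __m64 when MMX is available, and the neon_* members expose the
+ * equivalent 64-bit NEON vector types. */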
+typedef union {
+#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
+	int8_t i8 __attribute__((__vector_size__(8), __may_alias__));
+	int16_t i16 __attribute__((__vector_size__(8), __may_alias__));
+	int32_t i32 __attribute__((__vector_size__(8), __may_alias__));
+	int64_t i64 __attribute__((__vector_size__(8), __may_alias__));
+	uint8_t u8 __attribute__((__vector_size__(8), __may_alias__));
+	uint16_t u16 __attribute__((__vector_size__(8), __may_alias__));
+	uint32_t u32 __attribute__((__vector_size__(8), __may_alias__));
+	uint64_t u64 __attribute__((__vector_size__(8), __may_alias__));
+	simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__));
+#else
+	int8_t i8[8];
+	int16_t i16[4];
+	int32_t i32[2];
+	int64_t i64[1];
+	uint8_t u8[8];
+	uint16_t u16[4];
+	uint32_t u32[2];
+	uint64_t u64[1];
+	simde_float32 f32[2];
+#endif
+
+#if defined(SIMDE_MMX_NATIVE)
+	__m64 n;
+#elif defined(SIMDE_MMX_NEON)
+	int8x8_t neon_i8;
+	int16x4_t neon_i16;
+	int32x2_t neon_i32;
+	int64x1_t neon_i64;
+	uint8x8_t neon_u8;
+	uint16x4_t neon_u16;
+	uint32x2_t neon_u32;
+	uint64x1_t neon_u64;
+	float32x2_t neon_f32;
+#endif
+} simde__m64;
+
+#if defined(SIMDE_MMX_NATIVE)
+HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64),
+		     "__m64 size doesn't match simde__m64 size");
+SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_C(__m64 v)
+{
+	simde__m64 r;
+	r.n = v;
+	return r;
+}
+#elif defined(SIMDE_MMX_NEON)
+#define SIMDE__M64_NEON_C(T, expr) \
+	(simde__m64) { .neon_##T = (expr) }
+#endif
+HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_add_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < 8; i++) {
+		r.i8[i] = a.i8[i] + b.i8[i];
+	}
+	return r;
+#endif
+}
+#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_add_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		r.i16[i] = a.i16[i] + b.i16[i];
+	}
+	return r;
+#endif
+}
+#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_add_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
+		r.i32[i] = a.i32[i] + b.i32[i];
+	}
+	return r;
+#endif
+}
+#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 8; i++) {
+		if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
+			r.i8[i] = INT8_MAX;
+		} else if ((((b.i8[i]) < 0) &&
+			    ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
+			r.i8[i] = INT8_MIN;
+		} else {
+			r.i8[i] = (a.i8[i]) + (b.i8[i]);
+		}
+	}
+	return r;
+#endif
+}
+#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < 8; i++) {
+		const int32_t x = a.u8[i] + b.u8[i];
+		if (x < 0)
+			r.u8[i] = 0;
+		else if (x > UINT8_MAX)
+			r.u8[i] = UINT8_MAX;
+		else
+			r.u8[i] = (uint8_t)x;
+	}
+	return r;
+#endif
+}
+#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 4; i++) {
+		if ((((b.i16[i]) > 0) &&
+		     ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
+			r.i16[i] = INT16_MAX;
+		} else if ((((b.i16[i]) < 0) &&
+			    ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
+			r.i16[i] = INT16_MIN;
+		} else {
+			r.i16[i] = (a.i16[i]) + (b.i16[i]);
+		}
+	}
+	return r;
+#endif
+}
+#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		const uint32_t x = a.u16[i] + b.u16[i];
+		if (x > UINT16_MAX)
+			r.u16[i] = UINT16_MAX;
+		else
+			r.u16[i] = (uint16_t)x;
+	}
+	return r;
+#endif
+}
+#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_and_si64(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i64[0] = a.i64[0] & b.i64[0];
+	return r;
+#endif
+}
+#define simde_m_pand(a, b) simde_mm_and_si64(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_andnot_si64(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i64[0] = ~(a.i64[0]) & b.i64[0];
+	return r;
+#endif
+}
+#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 8; i++) {
+		r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff;
+	}
+	return r;
+#endif
+}
+#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 4; i++) {
+		r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff;
+	}
+	return r;
+#endif
+}
+#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 2; i++) {
+		r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff;
+	}
+	return r;
+#endif
+}
+#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 8; i++) {
+		r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff;
+	}
+	return r;
+#endif
+}
+#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 4; i++) {
+		r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff;
+	}
+	return r;
+#endif
+}
+#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 2; i++) {
+		r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff;
+	}
+	return r;
+#endif
+}
+#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+int64_t simde_mm_cvtm64_si64(simde__m64 a)
+{
+#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
+	return _mm_cvtm64_si64(a.n);
+#else
+	return a.i64[0];
+#endif
+}
+#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtsi32_si64(int32_t a)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_cvtsi32_si64(a));
+#else
+	simde__m64 r;
+	r.i32[0] = a;
+	r.i32[1] = 0;
+	return r;
+#endif
+}
+#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtsi64_m64(int64_t a)
+{
+#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_cvtsi64_m64(a));
+#else
+	simde__m64 r;
+	r.i64[0] = a;
+	return r;
+#endif
+}
+#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvtsi64_si32(simde__m64 a)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return _mm_cvtsi64_si32(a.n);
+#else
+	return a.i32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_empty(void)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	_mm_empty();
+#else
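+	/* No-op: there is no MMX/x87 register state to clear when
+	 * emulating. */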
+#endif
+}
+#define simde_m_empty() simde_mm_empty()
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n));
+#else
+	simde__m64 r;
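+	/* Multiply corresponding 16-bit lanes, then horizontally add each
+	 * pair of 32-bit products into a single 32-bit lane. */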
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 4; i += 2) {
+		r.i32[i / 2] =
+			(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
+	}
+	return r;
+#endif
+}
+#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 4; i++) {
+		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16);
+	}
+	return r;
+#endif
+}
+#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (int i = 0; i < 4; i++) {
+		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff);
+	}
+	return r;
+#endif
+}
+#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_or_si64(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i64[0] = a.i64[0] | b.i64[0];
+	return r;
+#endif
+}
+#define simde_m_por(a, b) simde_mm_or_si64(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+
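+	/* Pack with signed saturation: the four lanes of `a` become the low
+	 * four bytes and the four lanes of `b` the high four, each clamped
+	 * to [INT8_MIN, INT8_MAX]. */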
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		if (a.i16[i] < INT8_MIN) {
+			r.i8[i] = INT8_MIN;
+		} else if (a.i16[i] > INT8_MAX) {
+			r.i8[i] = INT8_MAX;
+		} else {
+			r.i8[i] = (int8_t)a.i16[i];
+		}
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		if (b.i16[i] < INT8_MIN) {
+			r.i8[i + 4] = INT8_MIN;
+		} else if (b.i16[i] > INT8_MAX) {
+			r.i8[i + 4] = INT8_MAX;
+		} else {
+			r.i8[i + 4] = (int8_t)b.i16[i];
+		}
+	}
+
+	return r;
+#endif
+}
+#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) {
+		if (a.i32[i] < INT16_MIN) {
+			r.i16[i] = INT16_MIN;
+		} else if (a.i32[i] > INT16_MAX) {
+			r.i16[i] = INT16_MAX;
+		} else {
+			r.i16[i] = (int16_t)a.i32[i];
+		}
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) {
+		if (b.i32[i] < INT16_MIN) {
+			r.i16[i + 2] = INT16_MIN;
+		} else if (b.i32[i] > INT16_MAX) {
+			r.i16[i + 2] = INT16_MAX;
+		} else {
+			r.i16[i + 2] = (int16_t)b.i32[i];
+		}
+	}
+
+	return r;
+#endif
+}
+#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n));
+#else
+	simde__m64 r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		if (a.i16[i] > UINT8_MAX) {
+			r.u8[i] = UINT8_MAX;
+		} else if (a.i16[i] < 0) {
+			r.u8[i] = 0;
+		} else {
+			r.u8[i] = (uint8_t)a.i16[i];
+		}
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		if (b.i16[i] > UINT8_MAX) {
+			r.u8[i + 4] = UINT8_MAX;
+		} else if (b.i16[i] < 0) {
+			r.u8[i + 4] = 0;
+		} else {
+			r.u8[i + 4] = (uint8_t)b.i16[i];
+		}
+	}
+
+	return r;
+#endif
+}
+#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
+			    int8_t e3, int8_t e2, int8_t e1, int8_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
+#else
+	simde__m64 r;
+	r.i8[0] = e0;
+	r.i8[1] = e1;
+	r.i8[2] = e2;
+	r.i8[3] = e3;
+	r.i8[4] = e4;
+	r.i8[5] = e5;
+	r.i8[6] = e6;
+	r.i8[7] = e7;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
+			      uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5,
+					(int8_t)e4, (int8_t)e3, (int8_t)e2,
+					(int8_t)e1, (int8_t)e0));
+#else
+	simde__m64 r;
+	r.u8[0] = e0;
+	r.u8[1] = e1;
+	r.u8[2] = e2;
+	r.u8[3] = e3;
+	r.u8[4] = e4;
+	r.u8[5] = e5;
+	r.u8[6] = e6;
+	r.u8[7] = e7;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0));
+#else
+	simde__m64 r;
+	r.i16[0] = e0;
+	r.i16[1] = e1;
+	r.i16[2] = e2;
+	r.i16[3] = e3;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
+			       uint16_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1,
+					 (int16_t)e0));
+#else
+	simde__m64 r;
+	r.u16[0] = e0;
+	r.u16[1] = e1;
+	r.u16[2] = e2;
+	r.u16[3] = e3;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0));
+#else
+	simde__m64 r;
+	r.u32[0] = e0;
+	r.u32[1] = e1;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set_pi32(e1, e0));
+#else
+	simde__m64 r;
+	r.i32[0] = e0;
+	r.i32[1] = e1;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_set1_pi8(int8_t a)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set1_pi8(a));
+#else
+	return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_set1_pi16(int16_t a)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set1_pi16(a));
+#else
+	return simde_mm_set_pi16(a, a, a, a);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_set1_pi32(int32_t a)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_set1_pi32(a));
+#else
+	return simde_mm_set_pi32(a, a);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
+			     int8_t e3, int8_t e2, int8_t e1, int8_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
+#else
+	return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0));
+#else
+	return simde_mm_set_pi16(e0, e1, e2, e3);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_setr_pi32(e1, e0));
+#else
+	return simde_mm_set_pi32(e0, e1);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_setzero_si64(void)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_setzero_si64());
+#else
+	return simde_mm_set_pi32(0, 0);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n));
+#else
+	simde__m64 r;
+
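+	/* Like the hardware instruction, a shift count wider than the
+	 * element (here, over 15) zeroes every lane rather than being
+	 * masked. */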
+	if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
+		memset(&r, 0, sizeof(r));
+		return r;
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = a.u16[i] << count.u64[0];
+	}
+	return r;
+#endif
+}
+#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n));
+#else
+	simde__m64 r;
+
+	if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
+		memset(&r, 0, sizeof(r));
+		return r;
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
+		r.u32[i] = a.u32[i] << count.u64[0];
+	}
+	return r;
+#endif
+}
+#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_slli_pi16(a.n, count));
+#else
+	simde__m64 r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = a.u16[i] << count;
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_slli_pi32(a.n, count));
+#else
+	simde__m64 r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
+		r.u32[i] = a.u32[i] << count;
+	}
+
+	return r;
+#endif
+}
+#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_slli_si64(a.n, count));
+#else
+	simde__m64 r;
+	r.u64[0] = a.u64[0] << count;
+	return r;
+#endif
+}
+#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sll_si64(a.n, count.n));
+#else
+	simde__m64 r;
+
+	if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
+		memset(&r, 0, sizeof(r));
+		return r;
+	}
+
+	r.u64[0] = a.u64[0] << count.u64[0];
+
+	return r;
+#endif
+}
+#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n));
+#else
+	simde__m64 r;
+
+	if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
+		memset(&r, 0, sizeof(r));
+		return r;
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
+		r.u16[i] = a.u16[i] >> count.u64[0];
+	}
+	return r;
+#endif
+}
+#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n));
+#else
+	simde__m64 r;
+
+	if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
+		memset(&r, 0, sizeof(r));
+		return r;
+	}
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) {
+		r.u32[i] = a.u32[i] >> count.u64[0];
+	}
+	return r;
+#endif
+}
+#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_srli_pi16(a.n, count));
+#else
+	simde__m64 r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
+		r.u16[i] = a.u16[i] >> count;
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_srli_pi32(a.n, count));
+#else
+	simde__m64 r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
+		r.u32[i] = a.u32[i] >> count;
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_srli_si64(a.n, count));
+#else
+	simde__m64 r;
+	r.u64[0] = a.u64[0] >> count;
+	return r;
+#endif
+}
+#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_srl_si64(a.n, count.n));
+#else
+	simde__m64 r;
+
+	if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
+		memset(&r, 0, sizeof(r));
+		return r;
+	}
+
+	r.u64[0] = a.u64[0] >> count.u64[0];
+	return r;
+#endif
+}
+#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_srai_pi16(a.n, count));
+#else
+	simde__m64 r;
+
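+	/* Emulate an arithmetic right shift portably: do a logical shift on
+	 * the unsigned value, then OR in a mask of `count` high bits for
+	 * lanes that were negative. */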
+	const uint16_t m =
+		(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count));
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		const uint16_t is_neg = ((uint16_t)(
+			((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
+		r.u16[i] = (a.u16[i] >> count) | (m * is_neg);
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
+{
+#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
+	return SIMDE__M64_C(_mm_srai_pi32(a.n, count));
+#else
+	simde__m64 r;
+
+	const uint32_t m =
+		(uint32_t)((~0U) << ((sizeof(int32_t) * CHAR_BIT) - count));
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
+		const uint32_t is_neg = ((uint32_t)(
+			((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
+		r.u32[i] = (a.u32[i] >> count) | (m * is_neg);
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n));
+#else
+	simde__m64 r;
+	int cnt = (int)count.i64[0];
+
+	if (cnt > 15 || cnt < 0) {
+		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
+		     i++) {
+			r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
+		}
+	} else {
+		const uint16_t m = (uint16_t)(
+			(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
+		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
+		     i++) {
+			const uint16_t is_neg = a.i16[i] < 0;
+			r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
+		}
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n));
+#else
+	simde__m64 r;
+	const uint64_t cnt = count.u64[0];
+
+	if (cnt > 31) {
+		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
+		     i++) {
+			r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
+		}
+	} else if (cnt == 0) {
+		memcpy(&r, &a, sizeof(r));
+	} else {
+		const uint32_t m = (uint32_t)(
+			(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
+		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
+		     i++) {
+			const uint32_t is_neg = a.i32[i] < 0;
+			r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
+		}
+	}
+
+	return r;
+#endif
+}
+#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < 8; i++) {
+		r.i8[i] = a.i8[i] - b.i8[i];
+	}
+	return r;
+#endif
+}
+#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		r.i16[i] = a.i16[i] - b.i16[i];
+	}
+	return r;
+#endif
+}
+#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
+		r.i32[i] = a.i32[i] - b.i32[i];
+	}
+	return r;
+#endif
+}
+#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8); i++) {
+		if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
+			r.i8[i] = INT8_MIN;
+		} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
+			r.i8[i] = INT8_MAX;
+		} else {
+			r.i8[i] = (a.i8[i]) - (b.i8[i]);
+		}
+	}
+	return r;
+#endif
+}
+#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8); i++) {
+		const int32_t x = a.u8[i] - b.u8[i];
+		if (x < 0) {
+			r.u8[i] = 0;
+		} else if (x > UINT8_MAX) {
+			r.u8[i] = UINT8_MAX;
+		} else {
+			r.u8[i] = (uint8_t)x;
+		}
+	}
+	return r;
+#endif
+}
+#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
+		if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
+			r.i16[i] = INT16_MIN;
+		} else if ((b.i16[i]) < 0 &&
+			   (a.i16[i]) > INT16_MAX + (b.i16[i])) {
+			r.i16[i] = INT16_MAX;
+		} else {
+			r.i16[i] = (a.i16[i]) - (b.i16[i]);
+		}
+	}
+	return r;
+#endif
+}
+#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
+		const int x = a.u16[i] - b.u16[i];
+		if (x < 0) {
+			r.u16[i] = 0;
+		} else if (x > UINT16_MAX) {
+			r.u16[i] = UINT16_MAX;
+		} else {
+			r.u16[i] = (uint16_t)x;
+		}
+	}
+	return r;
+#endif
+}
+#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i8[0] = a.i8[4];
+	r.i8[1] = b.i8[4];
+	r.i8[2] = a.i8[5];
+	r.i8[3] = b.i8[5];
+	r.i8[4] = a.i8[6];
+	r.i8[5] = b.i8[6];
+	r.i8[6] = a.i8[7];
+	r.i8[7] = b.i8[7];
+	return r;
+#endif
+}
+#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i16[0] = a.i16[2];
+	r.i16[1] = b.i16[2];
+	r.i16[2] = a.i16[3];
+	r.i16[3] = b.i16[3];
+	return r;
+#endif
+}
+#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i32[0] = a.i32[1];
+	r.i32[1] = b.i32[1];
+	return r;
+#endif
+}
+#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i8[0] = a.i8[0];
+	r.i8[1] = b.i8[0];
+	r.i8[2] = a.i8[1];
+	r.i8[3] = b.i8[1];
+	r.i8[4] = a.i8[2];
+	r.i8[5] = b.i8[2];
+	r.i8[6] = a.i8[3];
+	r.i8[7] = b.i8[3];
+	return r;
+#endif
+}
+#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i16[0] = a.i16[0];
+	r.i16[1] = b.i16[0];
+	r.i16[2] = a.i16[1];
+	r.i16[3] = b.i16[1];
+	return r;
+#endif
+}
+#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i32[0] = a.i32[0];
+	r.i32[1] = b.i32[0];
+	return r;
+#endif
+}
+#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return SIMDE__M64_C(_mm_xor_si64(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i64[0] = a.i64[0] ^ b.i64[0];
+	return r;
+#endif
+}
+#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_m_to_int(simde__m64 a)
+{
+#if defined(SIMDE_MMX_NATIVE)
+	return _m_to_int(a.n);
+#else
+	return a.i32[0];
+#endif
+}
+
+SIMDE__END_DECLS
+
+#endif /* !defined(SIMDE__MMX_H) */

+ 355 - 0
libobs/util/aarch/simde-arch.h

@@ -0,0 +1,355 @@
+/* Architecture detection
+ * Created by Evan Nemerson <[email protected]>
+ *
+ *   To the extent possible under law, the authors have waived all
+ *   copyright and related or neighboring rights to this code.  For
+ *   details, see the Creative Commons Zero 1.0 Universal license at
+ *   <https://creativecommons.org/publicdomain/zero/1.0/>
+ *
+ * Different compilers define different preprocessor macros for the
+ * same architecture.  This is an attempt to provide a single
+ * interface which is usable on any compiler.
+ *
+ * In general, a macro named SIMDE_ARCH_* is defined for each
+ * architecture the CPU supports.  When there are multiple possible
+ * versions, we try to define the macro to the target version.  For
+ * example, if you want to check for i586+, you could do something
+ * like:
+ *
+ *   #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5)
+ *   ...
+ *   #endif
+ *
+ * You could also just check that SIMDE_ARCH_X86 >= 5 without checking
+ * if it's defined first, but some compilers may emit a warning about
+ * an undefined macro being used (e.g., GCC with -Wundef).
+ *
+ * This was originally created for SIMDe
+ * <https://github.com/nemequ/simde> (hence the prefix), but this
+ * header has no dependencies and may be used anywhere.  It is
+ * originally based on information from
+ * <https://sourceforge.net/p/predef/wiki/Architectures/>, though it
+ * has been enhanced with additional information.
+ *
+ * If you improve this file, or find a bug, please file the issue at
+ * <https://github.com/nemequ/simde/issues>.  If you copy this into
+ * your project, even if you change the prefix, please keep the links
+ * to SIMDe intact so others know where to report issues, submit
+ * enhancements, and find the latest version. */
+
+#if !defined(SIMDE_ARCH_H)
+#define SIMDE_ARCH_H
+
+/* Alpha
+   <https://en.wikipedia.org/wiki/DEC_Alpha> */
+#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
+#if defined(__alpha_ev6__)
+#define SIMDE_ARCH_ALPHA 6
+#elif defined(__alpha_ev5__)
+#define SIMDE_ARCH_ALPHA 5
+#elif defined(__alpha_ev4__)
+#define SIMDE_ARCH_ALPHA 4
+#else
+#define SIMDE_ARCH_ALPHA 1
+#endif
+#endif
+
+/* Atmel AVR
+   <https://en.wikipedia.org/wiki/Atmel_AVR> */
+#if defined(__AVR_ARCH__)
+#define SIMDE_ARCH_AVR __AVR_ARCH__
+#endif
+
+/* AMD64 / x86_64
+   <https://en.wikipedia.org/wiki/X86-64> */
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \
+	defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#define SIMDE_ARCH_AMD64 1
+#endif
+
+/* ARM
+   <https://en.wikipedia.org/wiki/ARM_architecture> */
+#if defined(__ARM_ARCH_8A__)
+#define SIMDE_ARCH_ARM 82
+#elif defined(__ARM_ARCH_8R__)
+#define SIMDE_ARCH_ARM 81
+#elif defined(__ARM_ARCH_8__)
+#define SIMDE_ARCH_ARM 80
+#elif defined(__ARM_ARCH_7S__)
+#define SIMDE_ARCH_ARM 74
+#elif defined(__ARM_ARCH_7M__)
+#define SIMDE_ARCH_ARM 73
+#elif defined(__ARM_ARCH_7R__)
+#define SIMDE_ARCH_ARM 72
+#elif defined(__ARM_ARCH_7A__)
+#define SIMDE_ARCH_ARM 71
+#elif defined(__ARM_ARCH_7__)
+#define SIMDE_ARCH_ARM 70
+#elif defined(__ARM_ARCH)
+#define SIMDE_ARCH_ARM (__ARM_ARCH * 10)
+#elif defined(_M_ARM)
+#define SIMDE_ARCH_ARM (_M_ARM * 10)
+#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || \
+	defined(_ARM) || defined(_M_ARM)
+#define SIMDE_ARCH_ARM 1
+#endif
+
+/* AArch64
+   <https://en.wikipedia.org/wiki/ARM_architecture> */
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define SIMDE_ARCH_AARCH64 10
+#endif
+
+/* Blackfin
+   <https://en.wikipedia.org/wiki/Blackfin> */
+#if defined(__bfin) || defined(__BFIN__) || defined(__bfin__)
+#define SIMDE_ARCH_BLACKFIN 1
+#endif
+
+/* CRIS
+   <https://en.wikipedia.org/wiki/ETRAX_CRIS> */
+#if defined(__CRIS_arch_version)
+#define SIMDE_ARCH_CRIS __CRIS_arch_version
+#elif defined(__cris__) || defined(__cris) || defined(__CRIS) || \
+	defined(__CRIS__)
+#define SIMDE_ARCH_CRIS 1
+#endif
+
+/* Convex
+   <https://en.wikipedia.org/wiki/Convex_Computer> */
+#if defined(__convex_c38__)
+#define SIMDE_ARCH_CONVEX 38
+#elif defined(__convex_c34__)
+#define SIMDE_ARCH_CONVEX 34
+#elif defined(__convex_c32__)
+#define SIMDE_ARCH_CONVEX 32
+#elif defined(__convex_c2__)
+#define SIMDE_ARCH_CONVEX 2
+#elif defined(__convex__)
+#define SIMDE_ARCH_CONVEX 1
+#endif
+
+/* Adapteva Epiphany
+   <https://en.wikipedia.org/wiki/Adapteva_Epiphany> */
+#if defined(__epiphany__)
+#define SIMDE_ARCH_EPIPHANY 1
+#endif
+
+/* Fujitsu FR-V
+   <https://en.wikipedia.org/wiki/FR-V_(microprocessor)> */
+#if defined(__frv__)
+#define SIMDE_ARCH_FRV 1
+#endif
+
+/* H8/300
+   <https://en.wikipedia.org/wiki/H8_Family> */
+#if defined(__H8300__)
+#define SIMDE_ARCH_H8300
+#endif
+
+/* HP/PA / PA-RISC
+   <https://en.wikipedia.org/wiki/PA-RISC> */
+#if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || \
+	defined(_PA_RISC2_0)
+#define SIMDE_ARCH_HPPA 20
+#elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1)
+#define SIMDE_ARCH_HPPA 11
+#elif defined(_PA_RISC1_0)
+#define SIMDE_ARCH_HPPA 10
+#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
+#define SIMDE_ARCH_HPPA 1
+#endif
+
+/* x86
+   <https://en.wikipedia.org/wiki/X86> */
+#if defined(_M_IX86)
+#define SIMDE_ARCH_X86 (_M_IX86 / 100)
+#elif defined(__I86__)
+#define SIMDE_ARCH_X86 __I86__
+#elif defined(i686) || defined(__i686) || defined(__i686__)
+#define SIMDE_ARCH_X86 6
+#elif defined(i586) || defined(__i586) || defined(__i586__)
+#define SIMDE_ARCH_X86 5
+#elif defined(i486) || defined(__i486) || defined(__i486__)
+#define SIMDE_ARCH_X86 4
+#elif defined(i386) || defined(__i386) || defined(__i386__)
+#define SIMDE_ARCH_X86 3
+#elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__)
+#define SIMDE_ARCH_X86 3
+#endif
+
+/* Itanium
+   <https://en.wikipedia.org/wiki/Itanium> */
+#if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || \
+	defined(__ia64) || defined(_M_IA64) || defined(__itanium__)
+#define SIMDE_ARCH_IA64 1
+#endif
+
+/* Renesas M32R
+   <https://en.wikipedia.org/wiki/M32R> */
+#if defined(__m32r__) || defined(__M32R__)
+#define SIMDE_ARCH_M32R
+#endif
+
+/* Motorola 68000
+   <https://en.wikipedia.org/wiki/Motorola_68000> */
+#if defined(__mc68060__) || defined(__MC68060__)
+#define SIMDE_ARCH_M68K 68060
+#elif defined(__mc68040__) || defined(__MC68040__)
+#define SIMDE_ARCH_M68K 68040
+#elif defined(__mc68030__) || defined(__MC68030__)
+#define SIMDE_ARCH_M68K 68030
+#elif defined(__mc68020__) || defined(__MC68020__)
+#define SIMDE_ARCH_M68K 68020
+#elif defined(__mc68010__) || defined(__MC68010__)
+#define SIMDE_ARCH_M68K 68010
+#elif defined(__mc68000__) || defined(__MC68000__)
+#define SIMDE_ARCH_M68K 68000
+#endif
+
+/* Xilinx MicroBlaze
+   <https://en.wikipedia.org/wiki/MicroBlaze> */
+#if defined(__MICROBLAZE__) || defined(__microblaze__)
+#define SIMDE_ARCH_MICROBLAZE
+#endif
+
+/* MIPS
+   <https://en.wikipedia.org/wiki/MIPS_architecture> */
+#if defined(_MIPS_ISA_MIPS64R2)
+#define SIMDE_ARCH_MIPS 642
+#elif defined(_MIPS_ISA_MIPS64)
+#define SIMDE_ARCH_MIPS 640
+#elif defined(_MIPS_ISA_MIPS32R2)
+#define SIMDE_ARCH_MIPS 322
+#elif defined(_MIPS_ISA_MIPS32)
+#define SIMDE_ARCH_MIPS 320
+#elif defined(_MIPS_ISA_MIPS4)
+#define SIMDE_ARCH_MIPS 4
+#elif defined(_MIPS_ISA_MIPS3)
+#define SIMDE_ARCH_MIPS 3
+#elif defined(_MIPS_ISA_MIPS2)
+#define SIMDE_ARCH_MIPS 2
+#elif defined(_MIPS_ISA_MIPS1)
+#define SIMDE_ARCH_MIPS 1
+#elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__)
+#define SIMDE_ARCH_MIPS 1
+#endif
+
+/* Matsushita MN10300
+   <https://en.wikipedia.org/wiki/MN103> */
+#if defined(__MN10300__) || defined(__mn10300__)
+#define SIMDE_ARCH_MN10300 1
+#endif
+
+/* POWER
+   <https://en.wikipedia.org/wiki/IBM_POWER_Instruction_Set_Architecture> */
+#if defined(_M_PPC)
+#define SIMDE_ARCH_POWER _M_PPC
+#elif defined(_ARCH_PWR8)
+#define SIMDE_ARCH_POWER 800
+#elif defined(_ARCH_PWR7)
+#define SIMDE_ARCH_POWER 700
+#elif defined(_ARCH_PWR6)
+#define SIMDE_ARCH_POWER 600
+#elif defined(_ARCH_PWR5)
+#define SIMDE_ARCH_POWER 500
+#elif defined(_ARCH_PWR4)
+#define SIMDE_ARCH_POWER 400
+#elif defined(_ARCH_440) || defined(__ppc440__)
+#define SIMDE_ARCH_POWER 440
+#elif defined(_ARCH_450) || defined(__ppc450__)
+#define SIMDE_ARCH_POWER 450
+#elif defined(_ARCH_601) || defined(__ppc601__)
+#define SIMDE_ARCH_POWER 601
+#elif defined(_ARCH_603) || defined(__ppc603__)
+#define SIMDE_ARCH_POWER 603
+#elif defined(_ARCH_604) || defined(__ppc604__)
+#define SIMDE_ARCH_POWER 604
+#elif defined(_ARCH_605) || defined(__ppc605__)
+#define SIMDE_ARCH_POWER 605
+#elif defined(_ARCH_620) || defined(__ppc620__)
+#define SIMDE_ARCH_POWER 620
+#elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || \
+	defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) ||       \
+	defined(__ppc)
+#define SIMDE_ARCH_POWER 1
+#endif
+
+/* SPARC
+   <https://en.wikipedia.org/wiki/SPARC> */
+#if defined(__sparc_v9__) || defined(__sparcv9)
+#define SIMDE_ARCH_SPARC 9
+#elif defined(__sparc_v8__) || defined(__sparcv8)
+#define SIMDE_ARCH_SPARC 8
+#elif defined(__sparc_v7__) || defined(__sparcv7)
+#define SIMDE_ARCH_SPARC 7
+#elif defined(__sparc_v6__) || defined(__sparcv6)
+#define SIMDE_ARCH_SPARC 6
+#elif defined(__sparc_v5__) || defined(__sparcv5)
+#define SIMDE_ARCH_SPARC 5
+#elif defined(__sparc_v4__) || defined(__sparcv4)
+#define SIMDE_ARCH_SPARC 4
+#elif defined(__sparc_v3__) || defined(__sparcv3)
+#define SIMDE_ARCH_SPARC 3
+#elif defined(__sparc_v2__) || defined(__sparcv2)
+#define SIMDE_ARCH_SPARC 2
+#elif defined(__sparc_v1__) || defined(__sparcv1)
+#define SIMDE_ARCH_SPARC 1
+#elif defined(__sparc__) || defined(__sparc)
+#define SIMDE_ARCH_SPARC 1
+#endif
+
+/* SuperH
+   <https://en.wikipedia.org/wiki/SuperH> */
+#if defined(__sh5__) || defined(__SH5__)
+#define SIMDE_ARCH_SUPERH 5
+#elif defined(__sh4__) || defined(__SH4__)
+#define SIMDE_ARCH_SUPERH 4
+#elif defined(__sh3__) || defined(__SH3__)
+#define SIMDE_ARCH_SUPERH 3
+#elif defined(__sh2__) || defined(__SH2__)
+#define SIMDE_ARCH_SUPERH 2
+#elif defined(__sh1__) || defined(__SH1__)
+#define SIMDE_ARCH_SUPERH 1
+#elif defined(__sh__) || defined(__SH__)
+#define SIMDE_ARCH_SUPERH 1
+#endif
+
+/* IBM System z
+   <https://en.wikipedia.org/wiki/IBM_System_z> */
+#if defined(__370__) || defined(__THW_370__) || defined(__s390__) || \
+	defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__)
+#define SIMDE_ARCH_SYSTEMZ
+#endif
+
+/* TMS320 DSP
+   <https://en.wikipedia.org/wiki/Texas_Instruments_TMS320> */
+#if defined(_TMS320C6740) || defined(__TMS320C6740__)
+#define SIMDE_ARCH_TMS320 6740
+#elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__)
+#define SIMDE_ARCH_TMS320 6701
+#elif defined(_TMS320C6700) || defined(__TMS320C6700__)
+#define SIMDE_ARCH_TMS320 6700
+#elif defined(_TMS320C6600) || defined(__TMS320C6600__)
+#define SIMDE_ARCH_TMS320 6600
+#elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__)
+#define SIMDE_ARCH_TMS320 6401
+#elif defined(_TMS320C6400) || defined(__TMS320C6400__)
+#define SIMDE_ARCH_TMS320 6400
+#elif defined(_TMS320C6200) || defined(__TMS320C6200__)
+#define SIMDE_ARCH_TMS320 6200
+#elif defined(_TMS320C55X) || defined(__TMS320C55X__)
+#define SIMDE_ARCH_TMS320 550
+#elif defined(_TMS320C54X) || defined(__TMS320C54X__)
+#define SIMDE_ARCH_TMS320 540
+#elif defined(_TMS320C28X) || defined(__TMS320C28X__)
+#define SIMDE_ARCH_TMS320 280
+#endif
+
+/* Xtensa
+   <https://en.wikipedia.org/wiki/Tensilica> */
+#if defined(__xtensa__) || defined(__XTENSA__)
+#define SIMDE_ARCH_XTENSA 1
+#endif
+
+#endif /* !defined(SIMDE_ARCH_H) */

+ 278 - 0
libobs/util/aarch/simde-common.h

@@ -0,0 +1,278 @@
+/* Copyright (c) 2017-2019 Evan Nemerson <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(SIMDE_COMMON_H)
+#define SIMDE_COMMON_H
+
+#include "hedley.h"
+#include "check.h"
+#include "simde-arch.h"
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
+#define SIMDE_ALIGN(alignment) _Alignas(alignment)
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
+#define SIMDE_ALIGN(alignment) alignas(alignment)
+#elif HEDLEY_GCC_VERSION_CHECK(2, 95, 0) ||     \
+	HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||   \
+	HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||   \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
+	HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||   \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||    \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) || \
+	HEDLEY_TI_VERSION_CHECK(8, 1, 0)
+#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
+#elif defined(_MSC_VER) && (!defined(_M_IX86) || defined(_M_AMD64))
+#define SIMDE_ALIGN(alignment) __declspec(align(alignment))
+#else
+#define SIMDE_ALIGN(alignment)
+#endif
+
+#define simde_assert_aligned(alignment, val) \
+	simde_assert_int(((uintptr_t)(val)) % (alignment), ==, 0)
+
+#if HEDLEY_GCC_HAS_ATTRIBUTE(vector_size, 4, 6, 0)
+#define SIMDE__ENABLE_GCC_VEC_EXT
+#endif
+
+#if !defined(SIMDE_ENABLE_OPENMP) &&                   \
+	((defined(_OPENMP) && (_OPENMP >= 201307L)) || \
+	 (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L)))
+#define SIMDE_ENABLE_OPENMP
+#endif
+
+#if !defined(SIMDE_ENABLE_CILKPLUS) && defined(__cilk)
+#define SIMDE_ENABLE_CILKPLUS
+#endif
+
+#if defined(SIMDE_ENABLE_OPENMP)
+#define SIMDE__VECTORIZE _Pragma("omp simd")
+#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
+#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
+#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
+#elif defined(SIMDE_ENABLE_CILKPLUS)
+#define SIMDE__VECTORIZE _Pragma("simd")
+#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
+#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
+#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
+#elif defined(__INTEL_COMPILER)
+#define SIMDE__VECTORIZE _Pragma("simd")
+#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
+#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
+#define SIMDE__VECTORIZE_ALIGNED(a)
+#elif defined(__clang__)
+#define SIMDE__VECTORIZE _Pragma("clang loop vectorize(enable)")
+#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
+#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
+#define SIMDE__VECTORIZE_ALIGNED(a)
+#elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
+#define SIMDE__VECTORIZE _Pragma("GCC ivdep")
+#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE
+#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
+#define SIMDE__VECTORIZE_ALIGNED(a)
+#elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
+#define SIMDE__VECTORIZE _Pragma("_CRI ivdep")
+#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE
+#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
+#define SIMDE__VECTORIZE_ALIGNED(a)
+#else
+#define SIMDE__VECTORIZE
+#define SIMDE__VECTORIZE_SAFELEN(l)
+#define SIMDE__VECTORIZE_REDUCTION(r)
+#define SIMDE__VECTORIZE_ALIGNED(a)
+#endif
+
+#if HEDLEY_GCC_HAS_ATTRIBUTE(unused, 3, 1, 0)
+#define SIMDE__UNUSED __attribute__((__unused__))
+#else
+#define SIMDE__UNUSED
+#endif
+
+#if HEDLEY_GCC_HAS_ATTRIBUTE(artificial, 4, 3, 0)
+#define SIMDE__ARTIFICIAL __attribute__((__artificial__))
+#else
+#define SIMDE__ARTIFICIAL
+#endif
+
+/* Intended for checking coverage; you should never use this in
+   production. */
+#if defined(SIMDE_NO_INLINE)
+#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE SIMDE__UNUSED static
+#else
+#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_INLINE SIMDE__ARTIFICIAL static
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDE__BEGIN_DECLS                                            \
+	HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable : 4996 4204)) \
+		HEDLEY_BEGIN_C_DECLS
+#define SIMDE__END_DECLS HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS
+#else
+#define SIMDE__BEGIN_DECLS HEDLEY_BEGIN_C_DECLS
+#define SIMDE__END_DECLS HEDLEY_END_C_DECLS
+#endif
+
+#if defined(__SIZEOF_INT128__)
+#define SIMDE__HAVE_INT128
+typedef __int128 simde_int128;
+typedef unsigned __int128 simde_uint128;
+#endif
+
+/* TODO: we should at least make an attempt to detect the correct
+   types for simde_float32/float64 instead of just assuming float and
+   double. */
+
+#if !defined(SIMDE_FLOAT32_TYPE)
+#define SIMDE_FLOAT32_TYPE float
+#define SIMDE_FLOAT32_C(value) value##f
+#else
+#define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE)value)
+#endif
+typedef SIMDE_FLOAT32_TYPE simde_float32;
+HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4,
+		     "Unable to find 32-bit floating-point type.");
+
+#if !defined(SIMDE_FLOAT64_TYPE)
+#define SIMDE_FLOAT64_TYPE double
+#define SIMDE_FLOAT64_C(value) value
+#else
+#define SIMDE_FLOAT64_C(value) ((SIMDE_FLOAT64_TYPE)value)
+#endif
+typedef SIMDE_FLOAT64_TYPE simde_float64;
+HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8,
+		     "Unable to find 64-bit floating-point type.");
+
+/* Whether to assume that the compiler can auto-vectorize reasonably
+   well.  This will cause SIMDe to attempt to compose vector
+   operations using simpler vector operations instead of minimizing
+   serial work.
+
+   As an example, consider the _mm_add_ss(a, b) function from SSE,
+   which returns { a0 + b0, a1, a2, a3 }.  This pattern is repeated
+   for other operations (sub, mul, etc.).
+
+   The naïve implementation would result in loading a0 and b0, adding
+   them into a temporary variable, then splicing that value into a new
+   vector with the remaining elements from a.
+
+   On platforms which support vectorization, it's generally faster to
+   simply perform the operation on the entire vector to avoid having
+   to move data between SIMD registers and non-SIMD registers.
+   Basically, instead of the temporary variable being (a0 + b0) it
+   would be a vector of (a + b), which is then combined with a to form
+   the result.
+
+   By default, SIMDe will prefer the pure-vector versions if we detect
+   a vector ISA extension, but this can be overridden by defining
+   SIMDE_NO_ASSUME_VECTORIZATION.  You can also define
+   SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the
+   vectorized version. */
+#if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \
+	!defined(SIMDE_ASSUME_VECTORIZATION)
+#if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \
+	defined(__ALTIVEC__)
+#define SIMDE_ASSUME_VECTORIZATION
+#endif
+#endif
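+
+/* Editorial sketch (not part of the upstream header): the two
+   strategies described above, written out for an add_ss-style
+   operation.  `v4f`, `add_ps` and `shuffle` are illustrative
+   placeholders, not SIMDe API.
+
+     v4f add_ss_scalar(v4f a, v4f b) {
+         a.f[0] += b.f[0];    // touch one lane; may bounce the value
+         return a;            // between SIMD and scalar registers
+     }
+
+     v4f add_ss_vector(v4f a, v4f b) {
+         v4f sum = add_ps(a, b);              // full-width vector add
+         return shuffle(a, sum, 4, 1, 2, 3);  // splice sum[0] into a
+     }
+
+   The actual dispatch is the check above plus SIMDE__SHUFFLE_VECTOR
+   below. */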
+
+/* GCC and clang have built-in functions to handle shuffling of
+   vectors, but the implementations are slightly different.  This
+   macro is just an abstraction over them.  Note that elem_size is in
+   bits but vec_size is in bytes. */
+#if HEDLEY_CLANG_HAS_BUILTIN(__builtin_shufflevector)
+#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \
+	__builtin_shufflevector(a, b, __VA_ARGS__)
+#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle, 4, 7, 0) && \
+	!defined(__INTEL_COMPILER)
+#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \
+	__builtin_shuffle(a, b,                               \
+			  (int##elem_size##_t __attribute__(  \
+				  (__vector_size__(vec_size)))){__VA_ARGS__})
+#endif
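+
+/* Editorial usage sketch: with the GCC/clang vector extensions
+   available, reversing the four 32-bit lanes of a 16-byte vector
+   looks like
+
+     typedef int32_t v4si __attribute__((__vector_size__(16)));
+     v4si v = {1, 2, 3, 4};
+     v4si rev = SIMDE__SHUFFLE_VECTOR(32, 16, v, v, 3, 2, 1, 0);
+     // rev is {4, 3, 2, 1}; indices 0-3 select lanes of the first
+     // argument, 4-7 lanes of the second.
+
+   Remember that elem_size (32) is in bits while vec_size (16) is in
+   bytes. */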
+
+/* Some algorithms are iterative, and fewer iterations means less
+   accuracy.  Lower values here will result in faster, but less
+   accurate, calculations for some functions. */
+#if !defined(SIMDE_ACCURACY_ITERS)
+#define SIMDE_ACCURACY_ITERS 2
+#endif
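+
+/* Editorial sketch: consumers can trade accuracy for speed by
+   defining this before including any SIMDe header:
+
+     #define SIMDE_ACCURACY_ITERS 1   // single refinement pass
+     #include "sse.h"
+
+   simde_mm_rcp_ps() and simde_mm_rsqrt_ps() in sse.h honor it. */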
+
+/* This will probably move into Hedley at some point, but I'd like to
+   more thoroughly check for other compilers which define __GNUC__
+   first. */
+#if defined(SIMDE__REALLY_GCC)
+#undef SIMDE__REALLY_GCC
+#endif
+#if !defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#define SIMDE__REALLY_GCC 0
+#else
+#define SIMDE__REALLY_GCC 1
+#endif
+
+#if defined(SIMDE__ASSUME_ALIGNED)
+#undef SIMDE__ASSUME_ALIGNED
+#endif
+#if HEDLEY_INTEL_VERSION_CHECK(9, 0, 0)
+#define SIMDE__ASSUME_ALIGNED(ptr, align) __assume_aligned(ptr, align)
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#define SIMDE__ASSUME_ALIGNED(ptr, align) \
+	__assume((((char *)ptr) - ((char *)0)) % (align) == 0)
+#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_assume_aligned, 4, 7, 0)
+#define SIMDE__ASSUME_ALIGNED(ptr, align) \
+	(ptr = (__typeof__(ptr))__builtin_assume_aligned((ptr), align))
+#elif HEDLEY_CLANG_HAS_BUILTIN(__builtin_assume)
+#define SIMDE__ASSUME_ALIGNED(ptr, align) \
+	__builtin_assume((((char *)ptr) - ((char *)0)) % (align) == 0)
+#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_unreachable, 4, 5, 0)
+#define SIMDE__ASSUME_ALIGNED(ptr, align)              \
+	((((char *)ptr) - ((char *)0)) % (align) == 0) \
+		? (1)                                  \
+		: (__builtin_unreachable(), 0)
+#else
+#define SIMDE__ASSUME_ALIGNED(ptr, align)
+#endif
+
+/* Sometimes we run into problems with specific versions of compilers
+   which make the native versions unusable for us.  Often this is due
+   to missing functions, sometimes buggy implementations, etc.  These
+   macros are how we check for specific bugs.  As they are fixed we'll
+   start only defining them for problematic compiler versions. */
+
+#if !defined(SIMDE_IGNORE_COMPILER_BUGS)
+#if SIMDE__REALLY_GCC
+#if !HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
+#define SIMDE_BUG_GCC_REV_208793
+#endif
+#if !HEDLEY_GCC_VERSION_CHECK(5, 0, 0)
+#define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */
+#endif
+#if !HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
+#define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */
+#endif
+#endif
+#if defined(__EMSCRIPTEN__)
+#define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
+#define SIMDE_BUG_EMSCRIPTEN_5242
+#endif
+#endif
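+
+/* Editorial sketch: the bug macros above are consumed by preferring
+   the portable path over a known-broken native intrinsic, roughly
+
+     #if defined(SIMDE_SSE2_NATIVE) && \
+         !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
+       r.n = _mm_sra_epi32(a.n, count.n);
+     #else
+       // portable per-lane arithmetic shift
+     #endif
+
+   (a rough pattern, not a verbatim excerpt from sse2.h). */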
+
+#endif /* !defined(SIMDE_COMMON_H) */

+ 2591 - 0
libobs/util/aarch/sse.h

@@ -0,0 +1,2591 @@
+/* Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ *   2017      Evan Nemerson <[email protected]>
+ *   2015-2017 John W. Ratcliff <[email protected]>
+ *   2015      Brandon Rowlett <[email protected]>
+ *   2015      Ken Fast <[email protected]>
+ */
+
+#if !defined(SIMDE__SSE_H)
+#if !defined(SIMDE__SSE_H)
+#define SIMDE__SSE_H
+#endif
+#include "mmx.h"
+
+#if defined(SIMDE_SSE_NATIVE)
+#undef SIMDE_SSE_NATIVE
+#endif
+#if defined(SIMDE_SSE_FORCE_NATIVE)
+#define SIMDE_SSE_NATIVE
+#elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#define SIMDE_SSE_NATIVE
+#elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && \
+	!defined(SIMDE_NO_NEON)
+#define SIMDE_SSE_NEON
+#endif
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE)
+#if defined(SIMDE_SSE_FORCE_NATIVE)
+#error Native SSE support requires native MMX support
+#else
+#warning Native SSE support requires native MMX support, disabling
+#undef SIMDE_SSE_NATIVE
+#endif
+#elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON)
+#warning SSE NEON support requires MMX NEON support, disabling
+#undef SIMDE_SSE_NEON
+#endif
+
+#if defined(SIMDE_SSE_NATIVE)
+#include <xmmintrin.h>
+#else
+#if defined(SIMDE_SSE_NEON)
+#include <arm_neon.h>
+#endif
+
+#if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
+	(__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
+#include <stdatomic.h>
+#elif defined(_WIN32)
+#include <Windows.h>
+#endif
+#endif
+
+#include <math.h>
+#include <fenv.h>
+#include <limits.h> /* CHAR_BIT */
+#include <string.h> /* memcpy() */
+
+#if !defined(SIMDE_ALIGN)
+#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
+#endif
+SIMDE__BEGIN_DECLS
+
+typedef SIMDE_ALIGN(16) union {
+#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
+	int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
+	int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
+	int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
+	int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
+	uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
+	uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
+	uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
+	uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
+#if defined(SIMDE__HAVE_INT128)
+	simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
+	simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
+#endif
+	simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
+#else
+	int8_t i8[16];
+	int16_t i16[8];
+	int32_t i32[4];
+	int64_t i64[2];
+	uint8_t u8[16];
+	uint16_t u16[8];
+	uint32_t u32[4];
+	uint64_t u64[2];
+#if defined(SIMDE__HAVE_INT128)
+	simde_int128 i128[1];
+	simde_uint128 u128[1];
+#endif
+	simde_float32 f32[4];
+#endif
+
+#if defined(SIMDE_SSE_NATIVE)
+	__m128 n;
+#elif defined(SIMDE_SSE_NEON)
+	int8x16_t neon_i8;
+	int16x8_t neon_i16;
+	int32x4_t neon_i32;
+	int64x2_t neon_i64;
+	uint8x16_t neon_u8;
+	uint16x8_t neon_u16;
+	uint32x4_t neon_u32;
+	uint64x2_t neon_u64;
+	float32x4_t neon_f32;
+#endif
+} simde__m128;
+
+#if defined(SIMDE_SSE_NATIVE)
+HEDLEY_STATIC_ASSERT(sizeof(__m128) == sizeof(simde__m128),
+		     "__m128 size doesn't match simde__m128 size");
+SIMDE__FUNCTION_ATTRIBUTES simde__m128 SIMDE__M128_C(__m128 v)
+{
+	simde__m128 r;
+	r.n = v;
+	return r;
+}
+#elif defined(SIMDE_SSE_NEON)
+#define SIMDE__M128_NEON_C(T, expr) \
+	(simde__m128) { .neon_##T = expr }
+#endif
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_add_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i] + b.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_add_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32_t b0 = vgetq_lane_f32(b.neon_f32, 0);
+	float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+	/* the upper values in the result must be the remnants of <a>. */
+	r.neon_f32 = vaddq_f32(a.neon_f32, value);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_add_ps(a, b).f32,
+				      4, 1, 2, 3);
+#else
+	r.f32[0] = a.f32[0] + b.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_and_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i] & b.i32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_andnot_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = ~(a.i32[i]) & b.i32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_avg_pu16(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < 4; i++) {
+		r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_avg_pu8(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < 8; i++) {
+		r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
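+
+/* Editorial note: both averages use the rounding form
+   (a + b + 1) >> 1; e.g. (3 + 4 + 1) >> 1 == 4, matching the
+   PAVGW/PAVGB rounding rule. */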
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpeq_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (a.f32[i] == b.f32[i]) ? 0xffffffff : 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpeq_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmpeq_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = a.u32[i];
+	}
+#endif
+
+	return r;
+}
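+
+/* Editorial note on the NEON idiom above, which recurs in the other
+   *_ss compares: vextq_f32(a, s, 1) drops lane 0 of a and appends
+   lane 0 of s, giving {a1, a2, a3, s0}; rotating that by three with
+   vextq_f32(t, t, 3) yields {s0, a1, a2, a3}, i.e. the comparison
+   result spliced into lane 0 with the upper lanes of a preserved. */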
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpge_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+	r.n = _mm_cmpge_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmpge_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = a.u32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpgt_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (a.f32[i] > b.f32[i]) ? 0xffffffff : 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+	r.n = _mm_cmpgt_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmpgt_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = a.u32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmple_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (a.f32[i] <= b.f32[i]) ? 0xffffffff : 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmple_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmple_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = a.u32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmplt_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (a.f32[i] < b.f32[i]) ? 0xffffffff : 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmplt_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmplt_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = a.u32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpneq_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpneq_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t e =
+		vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t s =
+		vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e)));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmpneq_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (a.f32[0] != b.f32[0]) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = a.u32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpnge_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
+#else
+	r = simde_mm_cmplt_ps(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+	r.n = _mm_cmpnge_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#else
+	r = simde_mm_cmplt_ss(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpngt_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
+#else
+	r = simde_mm_cmple_ps(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+	r.n = _mm_cmpngt_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#else
+	r = simde_mm_cmple_ss(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpnle_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
+#else
+	r = simde_mm_cmpgt_ps(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpnle_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t s =
+		vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#else
+	r = simde_mm_cmpgt_ss(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpnlt_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
+#else
+	r = simde_mm_cmpge_ps(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpnlt_ss(a.n, b.n);
+#else
+	r = simde_mm_cmpge_ss(a, b);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpord_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	/* Note: NEON does not have an ordered-compare builtin.
+	   Compare a == a and b == b to detect NaN (a NaN never
+	   equals itself), then AND the results. */
+	uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
+	r.neon_u32 = vandq_u32(ceqaa, ceqbb);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0
+								: 0xffffffff;
+	}
+#endif
+
+	return r;
+}
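+
+/* Editorial note: the self-comparison trick above relies on IEEE 754
+   semantics, where a NaN compares unequal to everything, itself
+   included.  The same test in scalar form:
+
+     int is_ordered(float x, float y) {
+         return (x == x) && (y == y);  // false iff x or y is NaN
+     }
+*/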
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpord_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
+	float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb));
+	float32x4_t t = vextq_f32(a.neon_f32, s, 1);
+	r.neon_f32 = vextq_f32(t, t, 3);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
+				      simde_mm_cmpord_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cmpunord_ps(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0xffffffff
+								: 0;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+	r.n = _mm_cmpunord_ss(a.n, b.n);
+#elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
+	r.f32 = SIMDE__SHUFFLE_VECTOR(
+		32, 16, a.f32, simde_mm_cmpunord_ps(a, b).f32, 4, 1, 2, 3);
+#else
+	r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0xffffffff : 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_comieq_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
+	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+	uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32);
+	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0;
+#else
+	return a.f32[0] == b.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_comige_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
+	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+	uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32);
+	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1
+									    : 0;
+#else
+	return a.f32[0] >= b.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_comigt_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
+	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+	uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32);
+	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1
+									    : 0;
+#else
+	return a.f32[0] > b.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_comile_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
+	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+	uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32);
+	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0;
+#else
+	return a.f32[0] <= b.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_comilt_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
+	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+	uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32);
+	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0;
+#else
+	return a.f32[0] < b.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_comineq_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
+	uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
+	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+	uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
+	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0)
+		       ? 1
+		       : 0;
+#else
+	return a.f32[0] != b.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvt_pi2ps(a.n, b.n);
+#else
+	r.f32[0] = (simde_float32)b.i32[0];
+	r.f32[1] = (simde_float32)b.i32[1];
+	r.i32[2] = a.i32[2];
+	r.i32[3] = a.i32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvt_ps2pi(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (int32_t)a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvt_si2ss(a.n, b);
+#else
+	r.f32[0] = (simde_float32)b;
+	r.i32[1] = a.i32[1];
+	r.i32[2] = a.i32[2];
+	r.i32[3] = a.i32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvt_ss2si(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_cvt_ss2si(a.n);
+#else
+	return (int32_t)a.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtpi16_ps(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = (simde_float32)a.i16[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtpi32_ps(a.n, b.n);
+#else
+	r.f32[0] = (simde_float32)b.i32[0];
+	r.f32[1] = (simde_float32)b.i32[1];
+	r.i32[2] = a.i32[2];
+	r.i32[3] = a.i32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtpi32x2_ps(a.n, b.n);
+#else
+	r.f32[0] = (simde_float32)a.i32[0];
+	r.f32[1] = (simde_float32)a.i32[1];
+	r.f32[2] = (simde_float32)b.i32[0];
+	r.f32[3] = (simde_float32)b.i32[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtpi8_ps(a.n);
+#else
+	r.f32[0] = (simde_float32)a.i8[0];
+	r.f32[1] = (simde_float32)a.i8[1];
+	r.f32[2] = (simde_float32)a.i8[2];
+	r.f32[3] = (simde_float32)a.i8[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtps_pi16(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (int16_t)a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtps_pi32(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (int32_t)a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtps_pi8(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(a.f32) / sizeof(a.f32[0])); i++) {
+		r.i8[i] = (int8_t)a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtpu16_ps(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = (simde_float32)a.u16[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtpu8_ps(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < 4; i++) {
+		r.f32[i] = (simde_float32)a.u8[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtsi32_ss(a.n, b);
+#else
+	r.f32[0] = (simde_float32)b;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < 4; i++) {
+		r.i32[i] = a.i32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if !defined(__PGI)
+	r.n = _mm_cvtsi64_ss(a.n, b);
+#else
+	r.n = _mm_cvtsi64x_ss(a.n, b);
+#endif
+#else
+	r.f32[0] = (simde_float32)b;
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < 4; i++) {
+		r.i32[i] = a.i32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde_float32 simde_mm_cvtss_f32(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_cvtss_f32(a.n);
+#elif defined(SIMDE_SSE_NEON)
+	return vgetq_lane_f32(a.neon_f32, 0);
+#else
+	return a.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvtss_si32(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_cvtss_si32(a.n);
+#else
+	return (int32_t)a.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int64_t simde_mm_cvtss_si64(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if !defined(__PGI)
+	return _mm_cvtss_si64(a.n);
+#else
+	return _mm_cvtss_si64x(a.n);
+#endif
+#else
+	return (int64_t)a.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvtt_ps2pi(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.i32[i] = (int32_t)truncf(a.f32[i]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvtt_ss2si(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_cvtt_ss2si(a.n);
+#else
+	return (int32_t)truncf(a.f32[0]);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvttps_pi32(simde__m128 a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_cvttps_pi32(a.n);
+#else
+	r = simde_mm_cvtt_ps2pi(a);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvttss_si32(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_cvttss_si32(a.n);
+#else
+	return (int32_t)truncf(a.f32[0]);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int64_t simde_mm_cvttss_si64(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if defined(__PGI)
+	return _mm_cvttss_si64x(a.n);
+#else
+	return _mm_cvttss_si64(a.n);
+#endif
+#else
+	return (int64_t)truncf(a.f32[0]);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_div_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t recip0 = vrecpeq_f32(b.neon_f32);
+	float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32));
+	r.neon_f32 = vmulq_f32(a.neon_f32, recip1);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i] / b.f32[i];
+	}
+#endif
+
+	return r;
+}
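+
+/* Editorial note: the NEON path above divides via a reciprocal
+   estimate plus one Newton-Raphson step.  vrecpsq_f32(x, b) computes
+   (2 - x*b), so
+
+     x1 = x0 * (2 - x0*b);   // refined estimate of 1/b
+     a / b ~= a * x1;
+
+   Additional iterations of the same step tighten the result, as
+   simde_mm_rcp_ps() below does with SIMDE_MM_RCP_PS_ITERS. */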
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_div_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32_t value = vgetq_lane_f32(simde_mm_div_ps(a, b).neon_f32, 0);
+	r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
+#else
+	r.f32[0] = a.f32[0] / b.f32[0];
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
+{
+	return a.u16[imm8];
+}
+#if defined(SIMDE_SSE_NATIVE)
+#define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16((a).n, imm8)
+#endif
+#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
+
+enum {
+#if defined(SIMDE_SSE_NATIVE)
+	simde_MM_ROUND_NEAREST = _MM_ROUND_NEAREST,
+	simde_MM_ROUND_DOWN = _MM_ROUND_DOWN,
+	simde_MM_ROUND_UP = _MM_ROUND_UP,
+	simde_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
+#else
+	simde_MM_ROUND_NEAREST
+#if defined(FE_TONEAREST)
+	= FE_TONEAREST
+#endif
+	,
+
+	simde_MM_ROUND_DOWN
+#if defined(FE_DOWNWARD)
+	= FE_DOWNWARD
+#endif
+	,
+
+	simde_MM_ROUND_UP
+#if defined(FE_UPWARD)
+	= FE_UPWARD
+#endif
+	,
+
+	simde_MM_ROUND_TOWARD_ZERO
+#if defined(FE_TOWARDZERO)
+	= FE_TOWARDZERO
+#endif
+#endif
+};
+
+SIMDE__FUNCTION_ATTRIBUTES
+unsigned int simde_MM_GET_ROUNDING_MODE(void)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _MM_GET_ROUNDING_MODE();
+#else
+	return fegetround();
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_MM_SET_ROUNDING_MODE(unsigned int a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_MM_SET_ROUNDING_MODE(a);
+#else
+	fesetround((int)a);
+#endif
+}
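+
+/* Editorial usage sketch: save and restore the rounding mode around a
+   block that must truncate, mirroring common SSE practice.
+
+     unsigned int saved = simde_MM_GET_ROUNDING_MODE();
+     simde_MM_SET_ROUNDING_MODE(simde_MM_ROUND_TOWARD_ZERO);
+     // ... conversions that should truncate ...
+     simde_MM_SET_ROUNDING_MODE(saved);
+
+   On non-SSE builds this maps directly onto fegetround() and
+   fesetround(). */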
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
+{
+	simde__m64 r;
+	r.i64[0] = a.i64[0];
+	r.i16[imm8] = i;
+	return r;
+}
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+#define simde_mm_insert_pi16(a, i, imm8) \
+	SIMDE__M64_C(_mm_insert_pi16((a).n, i, imm8))
+#endif
+#define simde_m_pinsrw(a, i, imm8) simde_mm_insert_pi16((a), (i), (imm8))
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
+{
+	simde__m128 r;
+
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_load_ps(mem_addr);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vld1q_f32(mem_addr);
+#else
+	memcpy(&r, mem_addr, sizeof(r.f32));
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_load_ps1(mem_addr);
+#else
+	const simde_float32 v = *mem_addr;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.f32[i] = v;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_load_ss(mem_addr);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
+#else
+	r.f32[0] = *mem_addr;
+	r.i32[1] = 0;
+	r.i32[2] = 0;
+	r.i32[3] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_load1_ps(mem_addr);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vld1q_dup_f32(mem_addr);
+#else
+	r = simde_mm_load_ps1(mem_addr);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_loadh_pi(a.n, (__m64 *)mem_addr);
+#else
+	r.f32[0] = a.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = mem_addr->f32[0];
+	r.f32[3] = mem_addr->f32[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_loadl_pi(a.n, (__m64 *)mem_addr);
+#else
+	r.f32[0] = mem_addr->f32[0];
+	r.f32[1] = mem_addr->f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
+{
+	simde__m128 r;
+
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_loadr_ps(mem_addr);
+#else
+	r.f32[0] = mem_addr[3];
+	r.f32[1] = mem_addr[2];
+	r.f32[2] = mem_addr[1];
+	r.f32[3] = mem_addr[0];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128
+simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_loadu_ps(mem_addr);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vld1q_f32(mem_addr);
+#else
+	r.f32[0] = mem_addr[0];
+	r.f32[1] = mem_addr[1];
+	r.f32[2] = mem_addr[2];
+	r.f32[3] = mem_addr[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, char *mem_addr)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_maskmove_si64(a.n, mask.n, mem_addr);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(a.i8) / sizeof(a.i8[0])); i++)
+		if (mask.i8[i] < 0)
+			mem_addr[i] = a.i8[i];
+#endif
+}
+#define simde_m_maskmovq(a, mask, mem_addr) \
+	simde_mm_maskmove_si64(a, mask, mem_addr)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_max_pi16(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_max_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_max_pu8(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_max_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0);
+	r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
+#else
+	r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_min_pi16(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_min_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_min_pu8(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_min_ss(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0);
+	r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
+#else
+	r.f32[0] = (a.f32[0] < b.f32[0]) ? a.f32[0] : b.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_move_ss(a.n, b.n);
+#else
+	r.f32[0] = b.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_movehl_ps(a.n, b.n);
+#else
+	r.f32[0] = b.f32[2];
+	r.f32[1] = b.f32[3];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_movelh_ps(a.n, b.n);
+#else
+	r.f32[0] = a.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = b.f32[0];
+	r.f32[3] = b.f32[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_movemask_pi8(simde__m64 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_movemask_pi8(a.n);
+#else
+	int r = 0;
+	const size_t nmemb = sizeof(a.i8) / sizeof(a.i8[0]);
+
+	SIMDE__VECTORIZE_REDUCTION(| : r)
+	for (size_t i = 0; i < nmemb; i++) {
+		r |= (a.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
+	}
+
+	return r;
+#endif
+}
+#define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_movemask_ps(simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_movemask_ps(a.n);
+#elif defined(SIMDE_SSE_NEON)
+	/* TODO: check to see if NEON version is faster than the portable version */
+	static const uint32x4_t movemask = {1, 2, 4, 8};
+	static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
+					   0x80000000};
+	uint32x4_t t0 = a.neon_u32;
+	uint32x4_t t1 = vtstq_u32(t0, highbit);
+	uint32x4_t t2 = vandq_u32(t1, movemask);
+	uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
+	return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
+#else
+	int r = 0;
+
+	SIMDE__VECTORIZE_REDUCTION(| : r)
+	for (size_t i = 0; i < sizeof(a.u32) / sizeof(a.u32[0]); i++) {
+		r |= (a.u32[i] >> ((sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i;
+	}
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_mul_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i] * b.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_mul_ss(a.n, b.n);
+#else
+	r.f32[0] = a.f32[0] * b.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_mulhi_pu16(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = (uint16_t)(
+			((uint32_t)a.u16[i] * (uint32_t)b.u16[i]) >> 16);
+	}
+#endif
+
+	return r;
+}
+#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_or_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
+		r.u32[i] = a.u32[i] | b.u32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_prefetch(char const *p, int i)
+{
+	(void)p;
+	(void)i;
+}
+#if defined(SIMDE_SSE_NATIVE)
+#define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_rcp_ps(simde__m128 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_rcp_ps(a.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t recip = vrecpeq_f32(a.neon_f32);
+
+#if !defined(SIMDE_MM_RCP_PS_ITERS)
+#define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS
+#endif
+
+	for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS; ++i) {
+		recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32));
+	}
+
+	r.neon_f32 = recip;
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = 1.0f / a.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_rcp_ss(simde__m128 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_rcp_ss(a.n);
+#else
+	r.f32[0] = 1.0f / a.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_rsqrt_ps(a.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vrsqrteq_f32(a.neon_f32);
+#elif defined(__STDC_IEC_559__)
+	/* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1);
+
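+		/* Refine the magic-constant estimate with Newton-Raphson
+		 * steps: y' = y * (1.5 - 0.5 * x * y * y). */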
+#if SIMDE_ACCURACY_ITERS > 2
+		const float half = SIMDE_FLOAT32_C(0.5) * a.f32[i];
+		for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++)
+			r.f32[i] *= SIMDE_FLOAT32_C(1.5) -
+				    (half * r.f32[i] * r.f32[i]);
+#endif
+	}
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = 1.0f / sqrtf(a.f32[i]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_rsqrt_ss(a.n);
+#elif defined(__STDC_IEC_559__)
+	{
+		r.i32[0] = INT32_C(0x5f3759df) - (a.i32[0] >> 1);
+
+#if SIMDE_ACCURACY_ITERS > 2
+		float half = SIMDE_FLOAT32_C(0.5) * a.f32[0];
+		for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++)
+			r.f32[0] *= SIMDE_FLOAT32_C(1.5) -
+				    (half * r.f32[0] * r.f32[0]);
+#endif
+	}
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#else
+	r.f32[0] = 1.0f / sqrtf(a.f32[0]);
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
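+/* Sum of absolute byte differences: the 16-bit total is returned in
+ * lane 0 with the remaining lanes zeroed, matching the native psadbw. */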
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_sad_pu8(a.n, b.n);
+#else
+	uint16_t sum = 0;
+
+	SIMDE__VECTORIZE_REDUCTION(+ : sum)
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		sum += (uint8_t)abs(a.u8[i] - b.u8[i]);
+	}
+
+	r.i16[0] = sum;
+	r.i16[1] = 0;
+	r.i16[2] = 0;
+	r.i16[3] = 0;
+#endif
+
+	return r;
+}
+#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2,
+			    simde_float32 e1, simde_float32 e0)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_set_ps(e3, e2, e1, e0);
+#elif defined(SIMDE_SSE_NEON)
+	SIMDE_ALIGN(16) simde_float32 data[4] = {e0, e1, e2, e3};
+	r.neon_f32 = vld1q_f32(data);
+#else
+	r.f32[0] = e0;
+	r.f32[1] = e1;
+	r.f32[2] = e2;
+	r.f32[3] = e3;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_set_ps1(simde_float32 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_set1_ps(a);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vdupq_n_f32(a);
+#else
+	r = simde_mm_set_ps(a, a, a, a);
+#endif
+
+	return r;
+}
+#define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_set_ss(simde_float32 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_set_ss(a);
+#else
+	r = simde_mm_set_ps(0, 0, 0, a);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2,
+			     simde_float32 e1, simde_float32 e0)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_setr_ps(e3, e2, e1, e0);
+#elif defined(SIMDE_SSE_NEON)
+	SIMDE_ALIGN(16) simde_float32 data[4] = {e3, e2, e1, e0};
+	r.neon_f32 = vld1q_f32(data);
+#else
+	r = simde_mm_set_ps(e0, e1, e2, e3);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_setzero_ps(void)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_setzero_ps();
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vdupq_n_f32(0.0f);
+#else
+	r = simde_mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+#endif
+
+	return r;
+}
+
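+/* sfence only has to order stores, but there is no portable
+ * store-only barrier, so a full sequentially-consistent fence is used
+ * as a conservative stand-in. */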
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_sfence(void)
+{
+	/* TODO: Use Hedley. */
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_sfence();
+#elif defined(__GNUC__) && \
+	((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+	__atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
+	(__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
+	__atomic_thread_fence(__ATOMIC_SEQ_CST);
+#else
+	atomic_thread_fence(memory_order_seq_cst);
+#endif
+#elif defined(_MSC_VER)
+	MemoryBarrier();
+#elif HEDLEY_CLANG_HAS_FEATURE(c_atomic)
+	__c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif defined(__GNUC__) && \
+	((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
+	__sync_synchronize();
+#elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || \
+	(defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140))
+	__atomic_thread_fence(__ATOMIC_SEQ_CST);
+#elif defined(_OPENMP)
+#pragma omp critical(simde_mm_sfence_)
+	{
+	}
+#endif
+}
+
+#define SIMDE_MM_SHUFFLE(z, y, x, w) \
+	(((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
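+/* Each two-bit field of the immediate selects a source lane; for
+ * example, SIMDE_MM_SHUFFLE(3, 2, 1, 0) encodes to 0xe4, the identity
+ * shuffle. */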
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
+{
+	simde__m64 r;
+	for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
+		r.i16[i] = a.i16[(imm8 >> (i * 2)) & 3];
+	}
+	return r;
+}
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+#define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_C(_mm_shuffle_pi16(a.n, imm8))
+#elif defined(SIMDE__SHUFFLE_VECTOR)
+#define simde_mm_shuffle_pi16(a, imm8)                                         \
+	({                                                                     \
+		const simde__m64 simde__tmp_a_ = a;                            \
+		(simde__m64){.i16 = SIMDE__SHUFFLE_VECTOR(                     \
+				     16, 8, (simde__tmp_a_).i16,               \
+				     (simde__tmp_a_).i16, (((imm8)) & 3),      \
+				     (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
+				     (((imm8) >> 6) & 3))};                    \
+	})
+#endif
+
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+#define simde_m_pshufw(a, imm8) SIMDE__M64_C(_m_pshufw(a.n, imm8))
+#else
+#define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
+{
+	simde__m128 r;
+	r.f32[0] = a.f32[(imm8 >> 0) & 3];
+	r.f32[1] = a.f32[(imm8 >> 2) & 3];
+	r.f32[2] = b.f32[(imm8 >> 4) & 3];
+	r.f32[3] = b.f32[(imm8 >> 6) & 3];
+	return r;
+}
+#if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
+#define simde_mm_shuffle_ps(a, b, imm8) \
+	SIMDE__M128_C(_mm_shuffle_ps(a.n, b.n, imm8))
+#elif defined(SIMDE__SHUFFLE_VECTOR)
+#define simde_mm_shuffle_ps(a, b, imm8)                                    \
+	({                                                                 \
+		(simde__m128){.f32 = SIMDE__SHUFFLE_VECTOR(                \
+				      32, 16, (a).f32, (b).f32,            \
+				      (((imm8)) & 3), (((imm8) >> 2) & 3), \
+				      (((imm8) >> 4) & 3) + 4,             \
+				      (((imm8) >> 6) & 3) + 4)};           \
+	})
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_sqrt_ps(simde__m128 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_sqrt_ps(a.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x4_t recipsq = vrsqrteq_f32(a.neon_f32);
+	float32x4_t sq = vrecpeq_f32(recipsq);
+	/* ??? use step versions of both sqrt and recip for better accuracy? */
+	r.neon_f32 = sq;
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < sizeof(r.f32) / sizeof(r.f32[0]); i++) {
+		r.f32[i] = sqrtf(a.f32[i]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_sqrt_ss(simde__m128 a)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_sqrt_ss(a.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32_t value = vgetq_lane_f32(simde_mm_sqrt_ps(a).neon_f32, 0);
+	r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
+#else
+	r.f32[0] = sqrtf(a.f32[0]);
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_store_ps(mem_addr, a.n);
+#elif defined(SIMDE_SSE_NEON)
+	vst1q_f32(mem_addr, a.neon_f32);
+#else
+	SIMDE__VECTORIZE_ALIGNED(mem_addr : 16)
+	for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
+		mem_addr[i] = a.f32[i];
+	}
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store_ps1(simde_float32 mem_addr[4], simde__m128 a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_store_ps1(mem_addr, a.n);
+#else
+	SIMDE__VECTORIZE_ALIGNED(mem_addr : 16)
+	for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
+		mem_addr[i] = a.f32[0];
+	}
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_store_ss(mem_addr, a.n);
+#elif defined(SIMDE_SSE_NEON)
+	vst1q_lane_f32(mem_addr, a.neon_f32, 0);
+#else
+	*mem_addr = a.f32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_store1_ps(mem_addr, a.n);
+#else
+	simde_mm_store_ps1(mem_addr, a);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_storeh_pi(&(mem_addr->n), a.n);
+#else
+	mem_addr->f32[0] = a.f32[2];
+	mem_addr->f32[1] = a.f32[3];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_storel_pi(&(mem_addr->n), a.n);
+#else
+	mem_addr->f32[0] = a.f32[0];
+	mem_addr->f32[1] = a.f32[1];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_storer_ps(mem_addr, a.n);
+#else
+	SIMDE__VECTORIZE_ALIGNED(mem_addr : 16)
+	for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
+		mem_addr[i] =
+			a.f32[((sizeof(a.f32) / sizeof(a.f32[0])) - 1) - i];
+	}
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_storeu_ps(mem_addr, a.n);
+#elif defined(SIMDE_SSE_NEON)
+	vst1q_f32(mem_addr, a.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
+		mem_addr[i] = a.f32[i];
+	}
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_sub_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = a.f32[i] - b.f32[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_sub_ss(a.n, b.n);
+#else
+	r.f32[0] = a.f32[0] - b.f32[0];
+	r.f32[1] = a.f32[1];
+	r.f32[2] = a.f32[2];
+	r.f32[3] = a.f32[3];
+#endif
+
+	return r;
+}
+
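+/* The ucomi* comparisons must not raise floating-point exceptions on
+ * quiet NaNs, so the fallbacks compare inside a floating-point
+ * environment saved with feholdexcept() and restored afterwards. */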
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_ucomieq_ss(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f32[0] == b.f32[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_ucomige_ss(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f32[0] >= b.f32[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_ucomigt_ss(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f32[0] > b.f32[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_ucomile_ss(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f32[0] <= b.f32[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_ucomilt_ss(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f32[0] < b.f32[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_ucomineq_ss(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f32[0] != b.f32[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+#if defined(SIMDE_SSE_NATIVE)
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_ia32_undef128)
+#define SIMDE__HAVE_UNDEFINED128
+#endif
+#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793)
+#define SIMDE__HAVE_UNDEFINED128
+#endif
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_undefined_ps(void)
+{
+	simde__m128 r;
+
+#if defined(SIMDE__HAVE_UNDEFINED128)
+	r.n = _mm_undefined_ps();
+#else
+	r = simde_mm_setzero_ps();
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_unpackhi_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x2_t a1 = vget_high_f32(a.neon_f32);
+	float32x2_t b1 = vget_high_f32(b.neon_f32);
+	float32x2x2_t result = vzip_f32(a1, b1);
+	r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
+#else
+	r.f32[0] = a.f32[2];
+	r.f32[1] = b.f32[2];
+	r.f32[2] = a.f32[3];
+	r.f32[3] = b.f32[3];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_unpacklo_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	float32x2_t a1 = vget_low_f32(a.neon_f32);
+	float32x2_t b1 = vget_low_f32(b.neon_f32);
+	float32x2x2_t result = vzip_f32(a1, b1);
+	r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
+#else
+	r.f32[0] = a.f32[0];
+	r.f32[1] = b.f32[0];
+	r.f32[2] = a.f32[1];
+	r.f32[3] = b.f32[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
+{
+	simde__m128 r;
+
+#if defined(SIMDE_SSE_NATIVE)
+	r.n = _mm_xor_ps(a.n, b.n);
+#elif defined(SIMDE_SSE_NEON)
+	r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
+		r.u32[i] = a.u32[i] ^ b.u32[i];
+	}
+#endif
+
+	return r;
+}
+
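+/* The non-temporal (streaming) stores below have no portable
+ * equivalent, so the fallbacks degrade to plain, cache-polluting
+ * copies. */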
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_stream_pi(&(mem_addr->n), a.n);
+#else
+	mem_addr->i64[0] = a.i64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_stream_ps(mem_addr, a.n);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+uint32_t simde_mm_getcsr(void)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	return _mm_getcsr();
+#else
+	uint32_t r = 0;
+	int rounding_mode = fegetround();
+
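+	/* MXCSR bits 13:14 hold the rounding mode: 00 = nearest,
+	 * 01 = down, 10 = up, 11 = toward zero. */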
+	switch (rounding_mode) {
+	case FE_TONEAREST:
+		break;
+	case FE_UPWARD:
+		r |= 2 << 13;
+		break;
+	case FE_DOWNWARD:
+		r |= 1 << 13;
+		break;
+	case FE_TOWARDZERO:
+		r |= 3 << 13;
+		break;
+	}
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_setcsr(uint32_t a)
+{
+#if defined(SIMDE_SSE_NATIVE)
+	_mm_setcsr(a);
+#else
+	switch ((a >> 13) & 3) {
+	case 0:
+		fesetround(FE_TONEAREST);
+		break;
+	case 1:
+		fesetround(FE_DOWNWARD);
+		break;
+	case 2:
+		fesetround(FE_UPWARD);
+		break;
+	case 3:
+		fesetround(FE_TOWARDZERO);
+		break;
+	}
+#endif
+}
+
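+/* 4x4 transpose: the unpacks interleave row pairs, then
+ * movelh/movehl reassemble the interleaved halves into columns. */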
+#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)       \
+	do {                                                 \
+		simde__m128 tmp3, tmp2, tmp1, tmp0;          \
+		tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
+		tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
+		tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
+		tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
+		row0 = simde_mm_movelh_ps(tmp0, tmp2);       \
+		row1 = simde_mm_movehl_ps(tmp2, tmp0);       \
+		row2 = simde_mm_movelh_ps(tmp1, tmp3);       \
+		row3 = simde_mm_movehl_ps(tmp3, tmp1);       \
+	} while (0)
+
+SIMDE__END_DECLS
+
+#endif /* !defined(SIMDE__SSE_H) */

+ 4197 - 0
libobs/util/aarch/sse2.h

@@ -0,0 +1,4197 @@
+/* Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ *   2017      Evan Nemerson <[email protected]>
+ *   2015-2017 John W. Ratcliff <[email protected]>
+ *   2015      Brandon Rowlett <[email protected]>
+ *   2015      Ken Fast <[email protected]>
+ *   2017      Hasindu Gamaarachchi <[email protected]>
+ *   2018      Jeff Daily <[email protected]>
+ */
+
+#if !defined(SIMDE__SSE2_H)
+#if !defined(SIMDE__SSE2_H)
+#define SIMDE__SSE2_H
+#endif
+#include "sse.h"
+
+#if defined(SIMDE_SSE2_NATIVE)
+#undef SIMDE_SSE2_NATIVE
+#endif
+#if defined(SIMDE_SSE2_FORCE_NATIVE)
+#define SIMDE_SSE2_NATIVE
+#elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#define SIMDE_SSE2_NATIVE
+#elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \
+	!defined(SIMDE_NO_NEON)
+#define SIMDE_SSE2_NEON
+#endif
+
+#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE)
+#if defined(SIMDE_SSE2_FORCE_NATIVE)
+#error Native SSE2 support requires native SSE support
+#else
+#warning Native SSE2 support requires native SSE support, disabling
+#undef SIMDE_SSE2_NATIVE
+#endif
+#elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON)
+#warning SSE2 NEON support requires SSE NEON support, disabling
+#undef SIMDE_SSE2_NEON
+#endif
+
+#if defined(SIMDE_SSE2_NATIVE)
+#include <emmintrin.h>
+#else
+#if defined(SIMDE_SSE2_NEON)
+#include <arm_neon.h>
+#endif
+#endif
+
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+
+#define vreinterpretq_m128i_s32(v) \
+	(simde__m128i) { .neon_i32 = v }
+#define vreinterpretq_m128i_u64(v) \
+	(simde__m128i) { .neon_u64 = v }
+
+#define vreinterpretq_s32_m128i(a) a.neon_i32
+#define vreinterpretq_u64_m128i(a) a.neon_u64
+
+SIMDE__BEGIN_DECLS
+
+typedef SIMDE_ALIGN(16) union {
+#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
+	int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
+	int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
+	int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
+	int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
+	uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
+	uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
+	uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
+	uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
+#if defined(SIMDE__HAVE_INT128)
+	simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
+	simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
+#endif
+	simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
+	simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
+#else
+	int8_t i8[16];
+	int16_t i16[8];
+	int32_t i32[4];
+	int64_t i64[2];
+	uint8_t u8[16];
+	uint16_t u16[8];
+	uint32_t u32[4];
+	uint64_t u64[2];
+#if defined(SIMDE__HAVE_INT128)
+	simde_int128 i128[1];
+	simde_uint128 u128[1];
+#endif
+	simde_float32 f32[4];
+	simde_float64 f64[2];
+#endif
+
+#if defined(SIMDE_SSE2_NATIVE)
+	__m128i n;
+#elif defined(SIMDE_SSE2_NEON)
+	int8x16_t neon_i8;
+	int16x8_t neon_i16;
+	int32x4_t neon_i32;
+	int64x2_t neon_i64;
+	uint8x16_t neon_u8;
+	uint16x8_t neon_u16;
+	uint32x4_t neon_u32;
+	uint64x2_t neon_u64;
+	float32x4_t neon_f32;
+#if defined(SIMDE_ARCH_AMD64)
+	float64x2_t neon_f64;
+#endif
+#endif
+} simde__m128i;
+
+typedef SIMDE_ALIGN(16) union {
+#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
+	int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
+	int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
+	int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
+	int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
+	uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
+	uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
+	uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
+	uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
+	simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
+	simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
+#else
+	int8_t i8[16];
+	int16_t i16[8];
+	int32_t i32[4];
+	int64_t i64[2];
+	uint8_t u8[16];
+	uint16_t u16[8];
+	uint32_t u32[4];
+	uint64_t u64[2];
+	simde_float32 f32[4];
+	simde_float64 f64[2];
+#endif
+
+#if defined(SIMDE_SSE2_NATIVE)
+	__m128d n;
+#elif defined(SIMDE_SSE2_NEON)
+	int8x16_t neon_i8;
+	int16x8_t neon_i16;
+	int32x4_t neon_i32;
+	int64x2_t neon_i64;
+	uint8x16_t neon_u8;
+	uint16x8_t neon_u16;
+	uint32x4_t neon_u32;
+	uint64x2_t neon_u64;
+	float32x4_t neon_f32;
+#if defined(SIMDE_ARCH_AMD64)
+	float64x2_t neon_f64;
+#endif
+#endif
+} simde__m128d;
+
+#if defined(SIMDE_SSE2_NATIVE)
+HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i),
+		     "__m128i size doesn't match simde__m128i size");
+HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d),
+		     "__m128d size doesn't match simde__m128d size");
+SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v)
+{
+	simde__m128i r;
+	r.n = v;
+	return r;
+}
+SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v)
+{
+	simde__m128d r;
+	r.n = v;
+	return r;
+}
+#elif defined(SIMDE_SSE2_NEON)
+#define SIMDE__M128I_NEON_C(T, expr) \
+	(simde__m128i) { .neon_##T = expr }
+#define SIMDE__M128D_NEON_C(T, expr) \
+	(simde__m128d) { .neon_##T = expr }
+#endif
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
+		r.i8[i] = a.i8[i] + b.i8[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = a.i16[i] + b.i16[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i] + b.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] + b.i64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64)
+	return SIMDE__M128D_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = a.f64[i] + b.f64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.f64[0] = a.f64[0] + b.f64[0];
+	r.f64[1] = a.f64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
+#else
+	simde__m64 r;
+	r.i64[0] = a.i64[0] + b.i64[0];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
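+		/* Saturate instead of wrapping: clamp to INT8_MAX or
+		 * INT8_MIN when adding b would overflow either bound. */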
+		if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
+			r.i8[i] = INT8_MAX;
+		} else if ((((b.i8[i]) < 0) &&
+			    ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
+			r.i8[i] = INT8_MIN;
+		} else {
+			r.i8[i] = (a.i8[i]) + (b.i8[i]);
+		}
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		if ((((b.i16[i]) > 0) &&
+		     ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
+			r.i16[i] = INT16_MAX;
+		} else if ((((b.i16[i]) < 0) &&
+			    ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
+			r.i16[i] = INT16_MIN;
+		} else {
+			r.i16[i] = (a.i16[i]) + (b.i16[i]);
+		}
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
+				  ? (a.u8[i] + b.u8[i])
+				  : UINT8_MAX;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
+				   ? (a.u16[i] + b.u16[i])
+				   : UINT16_MAX;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
+		r.u64[i] = a.u64[i] & b.u64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] & b.i64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
+		r.u64[i] = ~a.u64[i] & b.u64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = ~(a.i64[i]) & b.i64[i];
+	}
+	return r;
+#endif
+}
+
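+/* Rounded averages: (a + b + 1) >> 1, the same rounding as the native
+ * pavgb/pavgw. */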
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
+	}
+	return r;
+#endif
+}
+
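+/* Shifts the full 128-bit value left by imm8 *bytes* (not bits);
+ * counts greater than 15 clear the register, as on hardware. */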
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+
+	if (HEDLEY_UNLIKELY(imm8 > 15)) {
+		r.u64[0] = 0;
+		r.u64[1] = 0;
+		return r;
+	}
+
+	/* A zero (or negative) count is an identity; returning early also
+	 * avoids the undefined 64-bit shift below when s == 0. */
+	if (HEDLEY_UNLIKELY(imm8 <= 0)) {
+		return a;
+	}
+
+	const int s = imm8 * 8;
+
+#if defined(SIMDE__HAVE_INT128)
+	r.u128[0] = a.u128[0] << s;
+#else
+	if (s < 64) {
+		r.u64[0] = (a.u64[0] << s);
+		r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
+	} else {
+		r.u64[0] = 0;
+		r.u64[1] = a.u64[0] << (s - 64);
+	}
+#endif
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+#define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_bslli_si128(a, imm8)                                      \
+	SIMDE__M128I_NEON_C(                                               \
+		i8,                                                        \
+		(((imm8) <= 0) ? ((a).neon_i8)                             \
+			       : (((imm8) > 15) ? (vdupq_n_s8(0))          \
+						: (vextq_s8(vdupq_n_s8(0), \
+							    (a).neon_i8,   \
+							    16 - (imm8))))))
+#endif
+#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+
+	if (HEDLEY_UNLIKELY(imm8 > 15)) {
+		r.u64[0] = 0;
+		r.u64[1] = 0;
+		return r;
+	}
+
+	/* A zero (or negative) count is an identity; returning early also
+	 * avoids the undefined 64-bit shift below when s == 0. */
+	if (HEDLEY_UNLIKELY(imm8 <= 0)) {
+		return a;
+	}
+
+	const int s = imm8 * 8;
+
+#if defined(SIMDE__HAVE_INT128)
+	r.u128[0] = a.u128[0] >> s;
+#else
+	if (s < 64) {
+		r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
+		r.u64[1] = (a.u64[1] >> s);
+	} else {
+		r.u64[0] = a.u64[1] >> (s - 64);
+		r.u64[1] = 0;
+	}
+#endif
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+#define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_bsrli_si128(a, imm8)                             \
+	SIMDE__M128I_NEON_C(                                      \
+		i8,                                               \
+		((imm8) <= 0)                                     \
+			? ((a).neon_i8)                           \
+			: (((imm8) > 15) ? (vdupq_n_s8(0))        \
+					 : (vextq_s8((a).neon_i8, \
+						     vdupq_n_s8(0), (imm8)))))
+#endif
+#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8)
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_clflush(void const *p)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_clflush(p);
+#else
+	(void)p;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_comieq_sd(a.n, b.n);
+#else
+	return a.f64[0] == b.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_comige_sd(a.n, b.n);
+#else
+	return a.f64[0] >= b.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_comigt_sd(a.n, b.n);
+#else
+	return a.f64[0] > b.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_comile_sd(a.n, b.n);
+#else
+	return a.f64[0] <= b.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_comilt_sd(a.n, b.n);
+#else
+	return a.f64[0] < b.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_comineq_sd(a.n, b.n);
+#else
+	return a.f64[0] != b.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_castpd_ps(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128_C(_mm_castpd_ps(a.n));
+#else
+	union {
+		simde__m128d pd;
+		simde__m128 ps;
+	} r;
+	r.pd = a;
+	return r.ps;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_castpd_si128(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_castpd_si128(a.n));
+#else
+	union {
+		simde__m128d pd;
+		simde__m128i si128;
+	} r;
+	r.pd = a;
+	return r.si128;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_castps_pd(simde__m128 a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_castps_pd(a.n));
+#else
+	union {
+		simde__m128 ps;
+		simde__m128d pd;
+	} r;
+	r.ps = a;
+	return r.pd;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_castps_si128(simde__m128 a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_castps_si128(a.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, a.neon_i32);
+#else
+	union {
+		simde__m128 ps;
+		simde__m128i si128;
+	} r;
+	r.ps = a;
+	return r.si128;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_castsi128_pd(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
+#else
+	union {
+		simde__m128i si128;
+		simde__m128d pd;
+	} r;
+	r.si128 = a;
+	return r.pd;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_castsi128_ps(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128_C(_mm_castsi128_ps(a.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128_NEON_C(f32, a.neon_f32);
+#else
+	union {
+		simde__m128i si128;
+		simde__m128 ps;
+	} r;
+	r.si128 = a;
+	return r.ps;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
+		r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128D_NEON_C(
+		i32, vreinterpretq_s32_u32(
+			     vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
+				       vreinterpretq_s32_f32(a.neon_f32))));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128D_NEON_C(f32,
+				   vreinterpretq_f32_u16(vmvnq_u16(
+					   vceqq_s16(b.neon_i16, a.neon_i16))));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
+		r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
+		r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(
+		i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+	return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+	return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
+#else
+	return simde_mm_cmplt_pd(a, b);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+	return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
+#else
+	return simde_mm_cmplt_sd(a, b);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
+#else
+	return simde_mm_cmpge_pd(a, b);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
+#else
+	return simde_mm_cmpge_sd(a, b);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
+#else
+	return simde_mm_cmpgt_pd(a, b);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
+#else
+	return simde_mm_cmpgt_sd(a, b);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
+								  : UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
+							  : UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
+								: UINT64_C(0);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
+							: UINT64_C(0);
+	r.u64[1] = a.u64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = (simde_float64)a.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
+#else
+	simde__m128 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
+		r.f32[i] = (simde_float32)a.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
+		r.i32[i] = (int32_t)a.f64[i];
+	}
+	/* The native instruction zeroes the upper two lanes. */
+	r.i32[2] = 0;
+	r.i32[3] = 0;
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
+#else
+	simde__m64 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (int32_t)a.f64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
+#else
+	simde__m128 r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
+		r.f32[i] = (simde_float32)a.f64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = (simde_float64)a.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
+#elif defined(SIMDE_SSE2_NEON)
+	/* The default rounding mode on SSE is 'round to even', which ARMv7
+	 * does not support; it is supported on ARMv8, however. */
+#if defined(SIMDE_ARCH_AARCH64)
+	return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
+#else
+	uint32x4_t signmask = vdupq_n_u32(0x80000000);
+	float32x4_t half = vbslq_f32(signmask, a.neon_f32,
+				     vdupq_n_f32(0.5f)); /* +/- 0.5 */
+	int32x4_t r_normal = vcvtq_s32_f32(
+		vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/
+	int32x4_t r_trunc =
+		vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */
+	int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
+	int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+				     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+	float32x4_t delta = vsubq_f32(
+		a.neon_f32,
+		vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+	uint32x4_t is_delta_half =
+		vceqq_f32(delta, half); /* delta == +/- 0.5 */
+	return SIMDE__M128I_NEON_C(i32,
+				   vbslq_s32(is_delta_half, r_even, r_normal));
+#endif
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (int32_t)a.f32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cvtps_pd(simde__m128 a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = a.f32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+double simde_mm_cvtsd_f64(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+	return _mm_cvtsd_f64(a.n);
+#else
+	return a.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvtsd_si32(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_cvtsd_si32(a.n);
+#else
+	return (int32_t)a.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int64_t simde_mm_cvtsd_si64(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if defined(__PGI)
+	return _mm_cvtsd_si64x(a.n);
+#else
+	return _mm_cvtsd_si64(a.n);
+#endif
+#else
+	return (int64_t)a.f64[0];
+#endif
+}
+#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
+#else
+	simde__m128 r;
+
+	r.f32[0] = (simde_float32)b.f64[0];
+
+	SIMDE__VECTORIZE
+	for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i];
+	}
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvtsi128_si32(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_cvtsi128_si32(a.n);
+#elif defined(SIMDE_SSE2_NEON)
+	return vgetq_lane_s32(a.neon_i32, 0);
+#else
+	return a.i32[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int64_t simde_mm_cvtsi128_si64(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if defined(__PGI)
+	return _mm_cvtsi128_si64x(a.n);
+#else
+	return _mm_cvtsi128_si64(a.n);
+#endif
+#else
+	return a.i64[0];
+#endif
+}
+#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
+#else
+	simde__m128d r;
+
+	r.f64[0] = (simde_float64)b;
+	r.i64[1] = a.i64[1];
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cvtsi32_si128(int32_t a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_cvtsi32_si128(a);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
+#else
+	r.i32[0] = a;
+	r.i32[1] = 0;
+	r.i32[2] = 0;
+	r.i32[3] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if !defined(__PGI)
+	r.n = _mm_cvtsi64_sd(a.n, b);
+#else
+	r.n = _mm_cvtsi64x_sd(a.n, b);
+#endif
+#else
+	r.f64[0] = (simde_float64)b;
+	r.f64[1] = a.f64[1];
+#endif
+
+	return r;
+}
+#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cvtsi64_si128(int64_t a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if !defined(__PGI)
+	r.n = _mm_cvtsi64_si128(a);
+#else
+	r.n = _mm_cvtsi64x_si128(a);
+#endif
+#else
+	r.i64[0] = a;
+	r.i64[1] = 0;
+#endif
+
+	return r;
+}
+#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_cvtss_sd(a.n, b.n);
+#else
+	r.f64[0] = b.f32[0];
+	r.i64[1] = a.i64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_cvttpd_epi32(a.n);
+#else
+	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
+		r.i32[i] = (int32_t)trunc(a.f64[i]);
+	}
+	r.i64[1] = 0; /* the native instruction zeroes the upper two lanes */
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_cvttpd_pi32(a.n);
+#else
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (int32_t)trunc(a.f64[i]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_cvttps_epi32(a.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
+#else
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = (int32_t)truncf(a.f32[i]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_cvttsd_si32(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_cvttsd_si32(a.n);
+#else
+	return (int32_t)trunc(a.f64[0]);
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int64_t simde_mm_cvttsd_si64(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if !defined(__PGI)
+	return _mm_cvttsd_si64(a.n);
+#else
+	return _mm_cvttsd_si64x(a.n);
+#endif
+#else
+	return (int64_t)trunc(a.f64[0]);
+#endif
+}
+#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_div_pd(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = a.f64[i] / b.f64[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_div_sd(a.n, b.n);
+#else
+	r.f64[0] = a.f64[0] / b.f64[0];
+	r.f64[1] = a.f64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
+{
+	return a.u16[imm8 & 7];
+}
+#if defined(SIMDE_SSE2_NATIVE) && \
+	(!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
+#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8)
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_extract_epi16(a, imm8) \
+	(vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff)))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
+{
+	a.u16[imm8 & 7] = (int16_t)i;
+	return a;
+}
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+#define simde_mm_insert_epi16(a, i, imm8) \
+	SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8)))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_insert_epi16(a, i, imm8) \
+	SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8)))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
+{
+	simde__m128d r;
+
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_load_pd(mem_addr);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(&r, mem_addr, sizeof(r));
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_load_pd1(mem_addr);
+#else
+	r.f64[0] = *mem_addr;
+	r.f64[1] = *mem_addr;
+#endif
+
+	return r;
+}
+#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_load_sd(mem_addr);
+#else
+	memcpy(&r, mem_addr, sizeof(simde_float64));
+	r.u64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
+{
+	simde__m128i r;
+
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_load_si128(&(mem_addr->n));
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(&r, mem_addr, sizeof(r));
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_loadh_pd(a.n, mem_addr);
+#else
+	simde_float64 t;
+	memcpy(&t, mem_addr, sizeof(t));
+	r.f64[0] = a.f64[0];
+	r.f64[1] = t;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_loadl_epi64(&mem_addr->n);
+#elif defined(SIMDE_SSE2_NEON)
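+	/* Load the low 64 bits and zero the upper half of the vector. */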
+	r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
+				  vcreate_s32(0));
+#else
+	r.u64[0] = mem_addr->u64[0];
+	r.u64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_loadl_pd(a.n, mem_addr);
+#else
+	memcpy(&r, mem_addr, sizeof(simde_float64));
+	r.u64[1] = a.u64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
+{
+	simde__m128d r;
+
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_loadr_pd(mem_addr);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	r.f64[0] = mem_addr[1];
+	r.f64[1] = mem_addr[0];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d
+simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_loadu_pd(mem_addr);
+#else
+	simde_float64 l, h;
+	memcpy(&l, &mem_addr[0], sizeof(l));
+	memcpy(&h, &mem_addr[1], sizeof(h));
+	r.f64[0] = l;
+	r.f64[1] = h;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_loadu_si128(&((*mem_addr).n));
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
+#else
+	memcpy(&r, mem_addr, sizeof(r));
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_madd_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
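+	/* Widening multiplies of the low and high halves, then pairwise
+	 * adds to sum adjacent 32-bit products. */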
+	int32x4_t pl =
+		vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
+	int32x4_t ph =
+		vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
+	int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
+	int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
+	r.neon_i32 = vcombine_s32(rl, rh);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) {
+		r.i32[i / 2] =
+			(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
+			      int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr);
+#else
+	for (size_t i = 0; i < 16; i++) {
+		if (mask.u8[i] & 0x80) {
+			mem_addr[i] = a.i8[i];
+		}
+	}
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_movemask_epi8(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_movemask_epi8(a.n);
+#elif defined(SIMDE_SSE2_NEON)
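+	/* Isolate each byte's sign bit, shift it down to the bit position
+	 * matching its lane index (negative vshl counts shift right), then
+	 * fold each half into a single mask byte with pairwise adds. */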
+	uint8x16_t input = a.neon_u8;
+	SIMDE_ALIGN(16)
+	static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
+	uint8x8_t mask_and = vdup_n_u8(0x80);
+	int8x8_t mask_shift = vld1_s8(xr);
+
+	uint8x8_t lo = vget_low_u8(input);
+	uint8x8_t hi = vget_high_u8(input);
+
+	lo = vand_u8(lo, mask_and);
+	lo = vshl_u8(lo, mask_shift);
+
+	hi = vand_u8(hi, mask_and);
+	hi = vshl_u8(hi, mask_shift);
+
+	lo = vpadd_u8(lo, lo);
+	lo = vpadd_u8(lo, lo);
+	lo = vpadd_u8(lo, lo);
+
+	hi = vpadd_u8(hi, hi);
+	hi = vpadd_u8(hi, hi);
+	hi = vpadd_u8(hi, hi);
+
+	return ((hi[0] << 8) | (lo[0] & 0xFF));
+#else
+	int32_t r = 0;
+	SIMDE__VECTORIZE_REDUCTION(| : r)
+	for (size_t i = 0; i < 16; i++) {
+		r |= (a.u8[15 - i] >> 7) << (15 - i);
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int32_t simde_mm_movemask_pd(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_movemask_pd(a.n);
+#else
+	int32_t r = 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) {
+		r |= (a.u64[i] >> 63) << i;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_movepi64_pi64(a.n);
+#else
+	r.i64[0] = a.i64[0];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_movpi64_epi64(a.n);
+#else
+	r.i64[0] = a.i64[0];
+	r.i64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_min_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_min_epu8(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_min_pd(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_min_sd(a.n, b.n);
+#else
+	r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
+	r.f64[1] = a.f64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_max_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_max_epu8(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
+		r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_max_pd(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_max_sd(a.n, b.n);
+#else
+	r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
+	r.f64[1] = a.f64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_move_epi64(simde__m128i a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_move_epi64(a.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
+#else
+	r.i64[0] = a.i64[0];
+	r.i64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_move_sd(a.n, b.n);
+#else
+	r.f64[0] = b.f64[0];
+	r.f64[1] = a.f64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_mul_epu32(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
+		r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] * b.i64[i];
+	}
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] % b.i64[i];
+	}
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_mul_pd(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = a.f64[i] * b.f64[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_mul_sd(a.n, b.n);
+#else
+	r.f64[0] = a.f64[0] * b.f64[0];
+	r.f64[1] = a.f64[1];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
+{
+	simde__m64 r;
+
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+	r.n = _mm_mul_su32(a.n, b.n);
+#else
+	r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]);
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_mulhi_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	int16x4_t a3210 = vget_low_s16(a.neon_i16);
+	int16x4_t b3210 = vget_low_s16(b.neon_i16);
+	int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+	int16x4_t a7654 = vget_high_s16(a.neon_i16);
+	int16x4_t b7654 = vget_high_s16(b.neon_i16);
+	int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
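+	/* De-interleave the 16-bit halves of the products; on little-endian
+	 * targets val[1] gathers the high halves, i.e. the mulhi result. */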
+	uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
+				    vreinterpretq_u16_s32(ab7654));
+	r.neon_u16 = rv.val[1];
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
+						  ((int32_t)b.i16[i]))) >>
+				      16);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+	r.n = _mm_mulhi_epu16(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = (uint16_t)(
+			(((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_mullo_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
+						  ((int32_t)b.i16[i]))) &
+				      0xffff);
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_or_pd(a.n, b.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] | b.i64[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_or_si128(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] | b.i64[i];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_packs_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i8[i] = (a.i16[i] > INT8_MAX)
+				  ? INT8_MAX
+				  : ((a.i16[i] < INT8_MIN)
+					     ? INT8_MIN
+					     : ((int8_t)a.i16[i]));
+		r.i8[i + 8] = (b.i16[i] > INT8_MAX)
+				      ? INT8_MAX
+				      : ((b.i16[i] < INT8_MIN)
+						 ? INT8_MIN
+						 : ((int8_t)b.i16[i]));
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_packs_epi32(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i16 =
+		vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i16[i] = (a.i32[i] > INT16_MAX)
+				   ? INT16_MAX
+				   : ((a.i32[i] < INT16_MIN)
+					      ? INT16_MIN
+					      : ((int16_t)a.i32[i]));
+		r.i16[i + 4] = (b.i32[i] > INT16_MAX)
+				       ? INT16_MAX
+				       : ((b.i32[i] < INT16_MIN)
+						  ? INT16_MIN
+						  : ((int16_t)b.i32[i]));
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_packus_epi16(a.n, b.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_u8 =
+		vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.u8[i] = (a.i16[i] > UINT8_MAX)
+				  ? UINT8_MAX
+				  : ((a.i16[i] < 0) ? 0 : ((uint8_t)a.i16[i]));
+		r.u8[i + 8] =
+			(b.i16[i] > UINT8_MAX)
+				? UINT8_MAX
+				: ((b.i16[i] < 0) ? 0 : ((uint8_t)b.i16[i]));
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_pause(void)
+{
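+	/* On x86 this emits a spin-wait hint; elsewhere it is a no-op. */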
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_pause();
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_sad_epu8(a.n, b.n);
+#else
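+	/* Sum of absolute differences: each 64-bit lane accumulates the
+	 * differences of its own eight-byte half. */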
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		uint16_t tmp = 0;
+		SIMDE__VECTORIZE_REDUCTION(+ : tmp)
+		for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2);
+		     j++) {
+			const size_t e = j + (i * 8);
+			tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
+						   : (b.u8[e] - a.u8[e]);
+		}
+		r.i64[i] = tmp;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
+			       int8_t e11, int8_t e10, int8_t e9, int8_t e8,
+			       int8_t e7, int8_t e6, int8_t e5, int8_t e4,
+			       int8_t e3, int8_t e2, int8_t e1, int8_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
+			   e3, e2, e1, e0);
+#else
+	r.i8[0] = e0;
+	r.i8[1] = e1;
+	r.i8[2] = e2;
+	r.i8[3] = e3;
+	r.i8[4] = e4;
+	r.i8[5] = e5;
+	r.i8[6] = e6;
+	r.i8[7] = e7;
+	r.i8[8] = e8;
+	r.i8[9] = e9;
+	r.i8[10] = e10;
+	r.i8[11] = e11;
+	r.i8[12] = e12;
+	r.i8[13] = e13;
+	r.i8[14] = e14;
+	r.i8[15] = e15;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
+				int16_t e3, int16_t e2, int16_t e1, int16_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
+#elif defined(SIMDE_SSE2_NEON)
+	SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
+	r.neon_i16 = vld1q_s16(data);
+#else
+	r.i16[0] = e0;
+	r.i16[1] = e1;
+	r.i16[2] = e2;
+	r.i16[3] = e3;
+	r.i16[4] = e4;
+	r.i16[5] = e5;
+	r.i16[6] = e6;
+	r.i16[7] = e7;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_epi32(e3, e2, e1, e0);
+#elif defined(SIMDE_SSE2_NEON)
+	SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
+	r.neon_i32 = vld1q_s32(data);
+#else
+	r.i32[0] = e0;
+	r.i32[1] = e1;
+	r.i32[2] = e2;
+	r.i32[3] = e3;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_epi64(e1.n, e0.n);
+#else
+	r.i64[0] = e0.i64[0];
+	r.i64[1] = e1.i64[0];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_epi64x(e1, e0);
+#elif defined(SIMDE_SSE2_NEON)
+	r = SIMDE__M128I_NEON_C(i64,
+				vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
+#else
+	r.i64[0] = e0;
+	r.i64[1] = e1;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
+				 uint8_t e12, uint8_t e11, uint8_t e10,
+				 uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
+				 uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
+				 uint8_t e1, uint8_t e0)
+{
+	simde__m128i r;
+
+	r.u8[0] = e0;
+	r.u8[1] = e1;
+	r.u8[2] = e2;
+	r.u8[3] = e3;
+	r.u8[4] = e4;
+	r.u8[5] = e5;
+	r.u8[6] = e6;
+	r.u8[7] = e7;
+	r.u8[8] = e8;
+	r.u8[9] = e9;
+	r.u8[10] = e10;
+	r.u8[11] = e11;
+	r.u8[12] = e12;
+	r.u8[13] = e13;
+	r.u8[14] = e14;
+	r.u8[15] = e15;
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
+				  uint16_t e4, uint16_t e3, uint16_t e2,
+				  uint16_t e1, uint16_t e0)
+{
+	simde__m128i r;
+
+	r.u16[0] = e0;
+	r.u16[1] = e1;
+	r.u16[2] = e2;
+	r.u16[3] = e3;
+	r.u16[4] = e4;
+	r.u16[5] = e5;
+	r.u16[6] = e6;
+	r.u16[7] = e7;
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
+				  uint32_t e0)
+{
+	simde__m128i r;
+
+	r.u32[0] = e0;
+	r.u32[1] = e1;
+	r.u32[2] = e2;
+	r.u32[3] = e3;
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
+{
+	simde__m128i r;
+
+	r.u64[0] = e0;
+	r.u64[1] = e1;
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_pd(e1, e0);
+#else
+	r.f64[0] = e0;
+	r.f64[1] = e1;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_set_pd1(simde_float64 a)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_pd(a);
+#else
+	r.f64[0] = a;
+	r.f64[1] = a;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_set_sd(simde_float64 a)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set_sd(a);
+#else
+	r.f64[0] = a;
+	r.u64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set1_epi8(int8_t a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_epi8(a);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i8 = vdupq_n_s8(a);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
+		r.i8[i] = a;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set1_epi16(int16_t a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_epi16(a);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i16 = vdupq_n_s16(a);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = a;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set1_epi32(int32_t a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_epi32(a);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vdupq_n_s32(a);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set1_epi64x(int64_t a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_epi64x(a);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i64 = vmovq_n_s64(a);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_set1_epi64(simde__m64 a)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_epi64(a.n);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[0];
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_set1_pd(simde_float64 a)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_set1_pd(a);
+#else
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.f64[i] = a;
+	}
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
+				int8_t e11, int8_t e10, int8_t e9, int8_t e8,
+				int8_t e7, int8_t e6, int8_t e5, int8_t e4,
+				int8_t e3, int8_t e2, int8_t e1, int8_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
+			    e4, e3, e2, e1, e0);
+#elif defined(SIMDE_SSE2_NEON)
+	int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
+		      e7,  e6,  e5,  e4,  e3,  e2,  e1, e0};
+	r.neon_i8 = vld1q_s8(t);
+#else
+	r.i8[0] = e15;
+	r.i8[1] = e14;
+	r.i8[2] = e13;
+	r.i8[3] = e12;
+	r.i8[4] = e11;
+	r.i8[5] = e10;
+	r.i8[6] = e9;
+	r.i8[7] = e8;
+	r.i8[8] = e7;
+	r.i8[9] = e6;
+	r.i8[10] = e5;
+	r.i8[11] = e4;
+	r.i8[12] = e3;
+	r.i8[13] = e2;
+	r.i8[14] = e1;
+	r.i8[15] = e0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
+				 int16_t e3, int16_t e2, int16_t e1, int16_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
+#elif defined(SIMDE_SSE2_NEON)
+	int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
+	r.neon_i16 = vld1q_s16(t);
+#else
+	r.i16[0] = e7;
+	r.i16[1] = e6;
+	r.i16[2] = e5;
+	r.i16[3] = e4;
+	r.i16[4] = e3;
+	r.i16[5] = e2;
+	r.i16[6] = e1;
+	r.i16[7] = e0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setr_epi32(e3, e2, e1, e0);
+#elif defined(SIMDE_SSE2_NEON)
+	int32_t t[] = {e3, e2, e1, e0};
+	r.neon_i32 = vld1q_s32(t);
+#else
+	r.i32[0] = e3;
+	r.i32[1] = e2;
+	r.i32[2] = e1;
+	r.i32[3] = e0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setr_epi64(e1.n, e0.n);
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
+#else
+	r.i64[0] = e1.i64[0];
+	r.i64[1] = e0.i64[0];
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setr_pd(e1, e0);
+#else
+	r.f64[0] = e1;
+	r.f64[1] = e0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_setzero_pd(void)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setzero_pd();
+#else
+	r.u64[0] = 0;
+	r.u64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_setzero_si128(void)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE)
+	r.n = _mm_setzero_si128();
+#elif defined(SIMDE_SSE2_NEON)
+	r.neon_i32 = vdupq_n_s32(0);
+#else
+	r.u64[0] = 0;
+	r.u64[1] = 0;
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+
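+	/* Each two-bit field of imm8 selects one of a's four lanes. */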
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
+	}
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_shuffle_epi32(a, imm8) \
+	SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8)))
+#elif defined(SIMDE__SHUFFLE_VECTOR)
+#define simde_mm_shuffle_epi32(a, imm8)                                      \
+	({                                                                   \
+		const simde__m128i simde__tmp_a_ = a;                        \
+		(simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR(                 \
+				       32, 16, (simde__tmp_a_).i32,          \
+				       (simde__tmp_a_).i32, ((imm8)) & 3,    \
+				       ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
+				       ((imm8) >> 6) & 3)};                  \
+	})
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
+{
+	simde__m128d r;
+
+	r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
+	r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
+#define simde_mm_shuffle_pd(a, b, imm8) \
+	SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8)))
+#elif defined(SIMDE__SHUFFLE_VECTOR)
+#define simde_mm_shuffle_pd(a, b, imm8)                           \
+	({                                                        \
+		(simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR(      \
+				       64, 16, (a).f64, (b).f64,  \
+				       (((imm8)) & 1),            \
+				       (((imm8) >> 1) & 1) + 2)}; \
+	})
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+
+	r.i64[0] = a.i64[0];
+	for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
+	}
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_shufflehi_epi16(a, imm8) \
+	SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8)))
+#elif defined(SIMDE__SHUFFLE_VECTOR)
+#define simde_mm_shufflehi_epi16(a, imm8)                               \
+	({                                                              \
+		const simde__m128i simde__tmp_a_ = a;                   \
+		(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR(            \
+				       16, 16, (simde__tmp_a_).i16,     \
+				       (simde__tmp_a_).i16, 0, 1, 2, 3, \
+				       (((imm8)) & 3) + 4,              \
+				       (((imm8) >> 2) & 3) + 4,         \
+				       (((imm8) >> 4) & 3) + 4,         \
+				       (((imm8) >> 6) & 3) + 4)};       \
+	})
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+
+	for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) {
+		r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
+	}
+	r.i64[1] = a.i64[1];
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_shufflelo_epi16(a, imm8) \
+	SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8)))
+#elif defined(SIMDE__SHUFFLE_VECTOR)
+#define simde_mm_shufflelo_epi16(a, imm8)                                   \
+	({                                                                  \
+		const simde__m128i simde__tmp_a_ = a;                       \
+		(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR(                \
+				       16, 16, (simde__tmp_a_).i16,         \
+				       (simde__tmp_a_).i16, (((imm8)) & 3), \
+				       (((imm8) >> 2) & 3),                 \
+				       (((imm8) >> 4) & 3),                 \
+				       (((imm8) >> 6) & 3), 4, 5, 6, 7)};   \
+	})
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
+#else
+	simde__m128i r;
+
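+	/* The register-count shifts honour the full low 64 bits of `count`;
+	 * anything wider than the lane zeroes the result. */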
+	if (count.u64[0] > 15)
+		return simde_mm_setzero_si128();
+	const int s = (int)(count.u64[0]);
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = a.u16[i] << s;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
+#else
+	simde__m128i r;
+
+	if (count.u64[0] > 31)
+		return simde_mm_setzero_si128();
+	const int s = (int)(count.u64[0]);
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i] << s;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
+#else
+	simde__m128i r;
+
+	if (count.u64[0] > 63)
+		return simde_mm_setzero_si128();
+	const int s = (int)(count.u64[0]);
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] << s;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_sqrt_pd(simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
+#else
+	simde__m128d r;
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = sqrt(a.f64[i]);
+	}
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.f64[0] = sqrt(b.f64[0]);
+	r.f64[1] = a.f64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
+#else
+	simde__m128i r;
+
+	if (count.u64[0] > 15)
+		return simde_mm_setzero_si128();
+	const int s = (int)(count.u64[0]);
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
+		r.u16[i] = a.u16[i] >> s;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
+#else
+	simde__m128i r;
+
+	if (count.u64[0] > 31)
+		return simde_mm_setzero_si128();
+	const int s = (int)(count.u64[0]);
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
+		r.u32[i] = a.u32[i] >> s;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
+#else
+	simde__m128i r;
+
+	if (count.u64[0] > 63)
+		return simde_mm_setzero_si128();
+	const int s = (int)(count.u64[0]);
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
+		r.u64[i] = a.u64[i] >> s;
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
+{
+	simde__m128i r;
+
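+	/* Arithmetic shift built from unsigned ops: `m` holds the
+	 * sign-extension bits and is OR'd in only for negative lanes. */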
+	const uint16_t m =
+		(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8));
+
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) {
+		const uint16_t is_neg = ((uint16_t)(
+			((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
+		r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg);
+	}
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_srai_epi16(a, imm8) \
+	SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8)))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
+{
+	simde__m128i r;
+
+	const uint32_t m = (imm8 > 0)
+		? (uint32_t)((~0U) << ((sizeof(int32_t) * CHAR_BIT) - imm8))
+		: 0;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) {
+		uint32_t is_neg = ((uint32_t)(
+			((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
+		r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg);
+	}
+
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_srai_epi32(a, imm8) \
+	SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8)))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_srai_epi32(a, imm8)                                           \
+	SIMDE__M128I_NEON_C(                                                   \
+		i32,                                                           \
+		((imm8) <= 0)                                                  \
+			? (a.neon_i32)                                         \
+			: (((imm8) > 31)                                       \
+				   ? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \
+						  16))                         \
+				   : (vshrq_n_s32(a.neon_i32, (imm8)))))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
+#else
+	simde__m128i r;
+	const uint64_t cnt = count.u64[0];
+
+	if (cnt > 15) {
+		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
+		     i++) {
+			r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
+		}
+	} else {
+		const uint16_t m = (uint16_t)(
+			(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
+		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
+		     i++) {
+			const uint16_t is_neg = a.i16[i] < 0;
+			r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
+		}
+	}
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
+{
+#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
+	return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
+#else
+	simde__m128i r;
+	const uint64_t cnt = count.u64[0];
+
+	if (cnt > 31) {
+		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
+		     i++) {
+			r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
+		}
+	} else if (cnt == 0) {
+		memcpy(&r, &a, sizeof(r));
+	} else {
+		const uint32_t m = (uint32_t)(
+			(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
+		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
+		     i++) {
+			const uint32_t is_neg = a.i32[i] < 0;
+			r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
+		}
+	}
+
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+	if (imm8 > 15) /* the native instruction zeroes out-of-range counts */
+		return simde_mm_setzero_si128();
+	const int s = imm8;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = a.i16[i] << s;
+	}
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_slli_epi16(a, imm8)                                       \
+	SIMDE__M128I_NEON_C(                                               \
+		i16, ((imm8) <= 0)                                         \
+			     ? ((a).neon_i16)                              \
+			     : (((imm8) > 15) ? (vdupq_n_s16(0))           \
+					      : (vshlq_n_s16((a).neon_i16, \
+							     (imm8)))))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+	if (imm8 > 31)
+		return simde_mm_setzero_si128();
+	const int s = imm8;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i] << s;
+	}
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_slli_epi32(a, imm8)                                       \
+	SIMDE__M128I_NEON_C(                                               \
+		i32, ((imm8) <= 0)                                         \
+			     ? ((a).neon_i32)                              \
+			     : (((imm8) > 31) ? (vdupq_n_s32(0))           \
+					      : (vshlq_n_s32((a).neon_i32, \
+							     (imm8)))))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+	if (imm8 > 63)
+		return simde_mm_setzero_si128();
+	const int s = imm8;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] << s;
+	}
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+	if (imm8 > 15)
+		return simde_mm_setzero_si128();
+	const int s = imm8;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.u16[i] = a.u16[i] >> s;
+	}
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_srli_epi16(a, imm8)                                       \
+	SIMDE__M128I_NEON_C(                                               \
+		u16, ((imm8) <= 0)                                         \
+			     ? ((a).neon_u16)                              \
+			     : (((imm8) > 15) ? (vdupq_n_u16(0))           \
+					      : (vshrq_n_u16((a).neon_u16, \
+							     (imm8)))))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+	if (imm8 > 31)
+		return simde_mm_setzero_si128();
+	const int s = imm8;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.u32[i] = a.u32[i] >> s;
+	}
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_srli_epi32(a, imm8)                                       \
+	SIMDE__M128I_NEON_C(                                               \
+		u32, ((imm8) <= 0)                                         \
+			     ? ((a).neon_u32)                              \
+			     : (((imm8) > 31) ? (vdupq_n_u32(0))           \
+					      : (vshrq_n_u32((a).neon_u32, \
+							     (imm8)))))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
+{
+	simde__m128i r;
+	const unsigned char s = imm8 & 255;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		if (s > 63) {
+			r.u64[i] = 0;
+		} else {
+			r.u64[i] = a.u64[i] >> s;
+		}
+	}
+	return r;
+}
+#if defined(SIMDE_SSE2_NATIVE)
+#define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8))
+#elif defined(SIMDE_SSE2_NEON)
+#define simde_mm_srli_epi64(a, imm8)                    \
+	SIMDE__M128I_NEON_C(                            \
+		u64,                                    \
+		(((imm8)&255) < 0 || ((imm8)&255) > 63) \
+			? (vdupq_n_u64(0))              \
+			: ((((imm8)&255) == 0)          \
+				   ? (a.neon_u64)       \
+				   : (vshrq_n_u64((a).neon_u64, (imm8)&255))))
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
+		       simde__m128d a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_store_pd(mem_addr, a.n);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
+			simde__m128d a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_store1_pd(mem_addr, a.n);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	mem_addr[0] = a.f64[0];
+	mem_addr[1] = a.f64[0];
+#endif
+}
+#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_store_sd(mem_addr, a.n);
+#else
+	memcpy(mem_addr, &a, sizeof(a.f64[0]));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_store_si128(&mem_addr->n, a.n);
+#elif defined(SIMDE_SSE2_NEON)
+	vst1q_s32((int32_t *)mem_addr, a.neon_i32);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_storeh_pd(mem_addr, a.n);
+#else
+	*mem_addr = a.f64[1];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_storel_epi64(&(mem_addr->n), a.n);
+#elif defined(SIMDE_SSE2_NEON)
+	mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
+#else
+	mem_addr->i64[0] = a.i64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_storel_pd(mem_addr, a.n);
+#else
+	*mem_addr = a.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
+{
+	simde_assert_aligned(16, mem_addr);
+
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_storer_pd(mem_addr, a.n);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	mem_addr[0] = a.f64[1];
+	mem_addr[1] = a.f64[0];
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_storeu_pd(mem_addr, a.n);
+#else
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_storeu_si128(&mem_addr->n, a.n);
+#elif defined(SIMDE_SSE2_NEON)
+	int32_t v[4];
+	vst1q_s32(v, a.neon_i32);
+	memcpy(mem_addr, v, sizeof(v));
+#else
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
+			simde__m128d a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_stream_pd(mem_addr, a.n);
+#else
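+	/* The non-temporal hint is dropped here; this is a plain store. */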
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_stream_si128(&mem_addr->n, a.n);
+#else
+	SIMDE__ASSUME_ALIGNED(mem_addr, 16);
+	memcpy(mem_addr, &a, sizeof(a));
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_stream_si32(mem_addr, a);
+#else
+	*mem_addr = a;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
+{
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
+#if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
+	*mem_addr = a;
+#elif defined(__GNUC__)
+	_mm_stream_si64((long long *)mem_addr, a);
+#else
+	_mm_stream_si64(mem_addr, a);
+#endif
+#else
+	*mem_addr = a;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
+		r.i8[i] = a.i8[i] - b.i8[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
+		r.i16[i] = a.i16[i] - b.i16[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i] - b.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] - b.i64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
+		r.f64[i] = a.f64[i] - b.f64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
+#else
+	simde__m128d r;
+	r.f64[0] = a.f64[0] - b.f64[0];
+	r.f64[1] = a.f64[1];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
+#else
+	simde__m64 r;
+	r.i64[0] = a.i64[0] - b.i64[0];
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
+		if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
+			r.i8[i] = INT8_MIN;
+		} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
+			r.i8[i] = INT8_MAX;
+		} else {
+			r.i8[i] = (a.i8[i]) - (b.i8[i]);
+		}
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
+		if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
+			r.i16[i] = INT16_MIN;
+		} else if ((b.i16[i]) < 0 &&
+			   (a.i16[i]) > INT16_MAX + (b.i16[i])) {
+			r.i16[i] = INT16_MAX;
+		} else {
+			r.i16[i] = (a.i16[i]) - (b.i16[i]);
+		}
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
+		const int32_t x = a.u8[i] - b.u8[i];
+		if (x < 0) {
+			r.u8[i] = 0;
+		} else if (x > UINT8_MAX) {
+			r.u8[i] = UINT8_MAX;
+		} else {
+			r.u8[i] = (uint8_t)x;
+		}
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
+		const int32_t x = a.u16[i] - b.u16[i];
+		if (x < 0) {
+			r.u16[i] = 0;
+		} else if (x > UINT16_MAX) {
+			r.u16[i] = UINT16_MAX;
+		} else {
+			r.u16[i] = (uint16_t)x;
+		}
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_ucomieq_sd(a.n, b.n);
+#else
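+	/* Unordered compares must not raise floating-point exceptions, so
+	 * the FP environment is saved and restored around the compare. */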
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f64[0] == b.f64[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_ucomige_sd(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f64[0] >= b.f64[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_ucomigt_sd(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f64[0] > b.f64[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_ucomile_sd(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f64[0] <= b.f64[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_ucomilt_sd(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f64[0] < b.f64[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return _mm_ucomineq_sd(a.n, b.n);
+#else
+	fenv_t envp;
+	int x = feholdexcept(&envp);
+	int r = a.f64[0] != b.f64[0];
+	if (HEDLEY_LIKELY(x == 0))
+		fesetenv(&envp);
+	return r;
+#endif
+}
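+
+/* Illustrative note (not part of the upstream simde sources): the ucomi*
+ * fallbacks wrap the comparison in feholdexcept()/fesetenv() so that an
+ * unordered (NaN) operand does not leave FE_INVALID raised in the caller's
+ * floating-point environment, approximating the "quiet" behaviour of the
+ * native unordered compares. */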
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_undefined_pd(void)
+{
+	simde__m128d r;
+
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
+	r.n = _mm_undefined_pd();
+#else
+	r = simde_mm_setzero_pd();
+#endif
+
+	return r;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_undefined_si128(void)
+{
+	simde__m128i r;
+
+#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
+	r.n = _mm_undefined_si128();
+#else
+	r = simde_mm_setzero_si128();
+#endif
+
+	return r;
+}
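+
+/* Illustrative note (not part of the upstream simde sources): where a native
+ * _mm_undefined_* is unavailable, the fallback returns zeros. Callers must
+ * not rely on the contents of an "undefined" vector anyway, so zeroing is a
+ * valid (if slightly slower) substitute. */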
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_lfence(void)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_lfence();
+#else
+	simde_mm_sfence();
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+void simde_mm_mfence(void)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	_mm_mfence();
+#else
+	simde_mm_sfence();
+#endif
+}
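+
+/* Illustrative note (not part of the upstream simde sources): without native
+ * lfence/mfence instructions, both fall back to simde_mm_sfence(), which on
+ * the portable path is typically a full sequentially-consistent barrier and
+ * therefore a safe (stronger) substitute for the load-only and combined
+ * fences. */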
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
+	int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
+	int8x8x2_t result = vzip_s8(a1, b1);
+	return SIMDE__M128I_NEON_C(i8,
+				   vcombine_s8(result.val[0], result.val[1]));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
+		r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
+		r.i8[(i * 2) + 1] =
+			b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	int16x4_t a1 = vget_high_s16(a.neon_i16);
+	int16x4_t b1 = vget_high_s16(b.neon_i16);
+	int16x4x2_t result = vzip_s16(a1, b1);
+	return SIMDE__M128I_NEON_C(i16,
+				   vcombine_s16(result.val[0], result.val[1]));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
+		r.i16[(i * 2)] =
+			a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
+		r.i16[(i * 2) + 1] =
+			b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	int32x2_t a1 = vget_high_s32(a.neon_i32);
+	int32x2_t b1 = vget_high_s32(b.neon_i32);
+	int32x2x2_t result = vzip_s32(a1, b1);
+	return SIMDE__M128I_NEON_C(i32,
+				   vcombine_s32(result.val[0], result.val[1]));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
+		r.i32[(i * 2)] =
+			a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
+		r.i32[(i * 2) + 1] =
+			b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
+		r.i64[(i * 2)] =
+			a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
+		r.i64[(i * 2) + 1] =
+			b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
+		r.f64[(i * 2)] =
+			a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
+		r.f64[(i * 2) + 1] =
+			b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
+	}
+	return r;
+#endif
+}
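+
+/* Illustrative note (not part of the upstream simde sources): unpackhi_*
+ * interleaves the upper halves of the two inputs, e.g. for 32-bit lanes:
+ *
+ *   a = { a0, a1, a2, a3 }, b = { b0, b1, b2, b3 }
+ *   simde_mm_unpackhi_epi32(a, b) == { a2, b2, a3, b3 }
+ */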
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
+	int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
+	int8x8x2_t result = vzip_s8(a1, b1);
+	return SIMDE__M128I_NEON_C(i8,
+				   vcombine_s8(result.val[0], result.val[1]));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
+		r.i8[(i * 2)] = a.i8[i];
+		r.i8[(i * 2) + 1] = b.i8[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	int16x4_t a1 = vget_low_s16(a.neon_i16);
+	int16x4_t b1 = vget_low_s16(b.neon_i16);
+	int16x4x2_t result = vzip_s16(a1, b1);
+	return SIMDE__M128I_NEON_C(i16,
+				   vcombine_s16(result.val[0], result.val[1]));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
+		r.i16[(i * 2)] = a.i16[i];
+		r.i16[(i * 2) + 1] = b.i16[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	int32x2_t a1 = vget_low_s32(a.neon_i32);
+	int32x2_t b1 = vget_low_s32(b.neon_i32);
+	int32x2x2_t result = vzip_s32(a1, b1);
+	return SIMDE__M128I_NEON_C(i32,
+				   vcombine_s32(result.val[0], result.val[1]));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
+		r.i32[(i * 2)] = a.i32[i];
+		r.i32[(i * 2) + 1] = b.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
+		r.i64[(i * 2)] = a.i64[i];
+		r.i64[(i * 2) + 1] = b.i64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
+		r.f64[(i * 2)] = a.f64[i];
+		r.f64[(i * 2) + 1] = b.f64[i];
+	}
+	return r;
+#endif
+}
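+
+/* Illustrative note (not part of the upstream simde sources): unpacklo_* is
+ * the mirror image, interleaving the lower halves of the two inputs:
+ *
+ *   a = { a0, a1, a2, a3 }, b = { b0, b1, b2, b3 }
+ *   simde_mm_unpacklo_epi32(a, b) == { a0, b0, a1, b1 }
+ */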
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
+#else
+	simde__m128d r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
+		r.i64[i] = a.i64[i] ^ b.i64[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
+{
+#if defined(SIMDE_SSE2_NATIVE)
+	return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
+#elif defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = a.i32[i] ^ b.i32[i];
+	}
+	return r;
+#endif
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+simde__m128i simde_x_mm_not_si128(simde__m128i a)
+{
+#if defined(SIMDE_SSE2_NEON)
+	return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
+#else
+	simde__m128i r;
+	SIMDE__VECTORIZE
+	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
+		r.i32[i] = ~(a.i32[i]);
+	}
+	return r;
+#endif
+}
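+
+/* Illustrative note (not part of the upstream simde sources): simde_x_*
+ * functions are simde extensions with no one-to-one x86 intrinsic;
+ * simde_x_mm_not_si128(a) computes the bitwise complement, equivalent to
+ * simde_mm_xor_si128(a, simde_mm_set1_epi32(-1)). */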
+
+SIMDE__END_DECLS
+
+#endif /* !defined(SIMDE__SSE2_H) */

+ 66 - 0
libobs/util/sse-intrin.h

@@ -0,0 +1,66 @@
+/******************************************************************************
+    Copyright (C) 2019 by Peter Geis <[email protected]>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+******************************************************************************/
+
+#pragma once
+
+#ifdef __aarch64__
+
+#include "aarch/sse2.h"
+
+#define __m128 simde__m128
+#define _mm_setzero_ps simde_mm_setzero_ps
+#define _mm_set_ps simde_mm_set_ps
+#define _mm_add_ps simde_mm_add_ps
+#define _mm_sub_ps simde_mm_sub_ps
+#define _mm_mul_ps simde_mm_mul_ps
+#define _mm_div_ps simde_mm_div_ps
+#define _mm_set1_ps simde_mm_set1_ps
+#define _mm_movehl_ps simde_mm_movehl_ps
+#define _mm_shuffle_ps simde_mm_shuffle_ps
+#define _mm_min_ps simde_mm_min_ps
+#define _mm_max_ps simde_mm_max_ps
+#define _mm_movelh_ps simde_mm_movelh_ps
+#define _mm_unpacklo_ps simde_mm_unpacklo_ps
+#define _mm_unpackhi_ps simde_mm_unpackhi_ps
+#define _mm_load_ps simde_mm_load_ps
+#define _mm_andnot_ps simde_mm_andnot_ps
+#define _mm_storeu_ps simde_mm_storeu_ps
+#define _mm_loadu_ps simde_mm_loadu_ps
+
+#define __m128i simde__m128i
+#define _mm_set1_epi32 simde_mm_set1_epi32
+#define _mm_set1_epi16 simde_mm_set1_epi16
+#define _mm_load_si128 simde_mm_load_si128
+#define _mm_packs_epi32 simde_mm_packs_epi32
+#define _mm_srli_si128 simde_mm_srli_si128
+#define _mm_and_si128 simde_mm_and_si128
+#define _mm_packus_epi16 simde_mm_packus_epi16
+#define _mm_add_epi64 simde_mm_add_epi64
+#define _mm_shuffle_epi32 simde_mm_shuffle_epi32
+#define _mm_srai_epi16 simde_mm_srai_epi16
+#define _mm_shufflelo_epi16 simde_mm_shufflelo_epi16
+#define _mm_storeu_si128 simde_mm_storeu_si128
+
+#define _MM_SHUFFLE SIMDE_MM_SHUFFLE
+#define _MM_TRANSPOSE4_PS SIMDE_MM_TRANSPOSE4_PS
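+
+/* Illustrative note (not part of the original patch): with these mappings in
+ * place, existing x86 SSE/SSE2 call sites compile unchanged on aarch64; the
+ * include path below is an example and depends on the caller's location:
+ *
+ *   #include "util/sse-intrin.h"
+ *
+ *   __m128 v = _mm_set1_ps(2.0f);          // expands to simde_mm_set1_ps
+ *   v = _mm_add_ps(v, _mm_set1_ps(1.0f));  // expands to simde_mm_add_ps
+ */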
+
+#else
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#endif