libobs: Update to SIMDe 0.7.1

https://github.com/simd-everywhere/simde/commit/c3d7abfaba6729a8b11d09a314b34a4db628911d

Simplify usage of the SIMDe header

This obviates the need for sse2neon as well and fixes compilation of all
plugins that referenced sse-intrin.h on all architectures, not just
arm*.
Michael R. Crusoe · 4 years ago · commit 1e96573328
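
For illustration only (the new sse-intrin.h itself is not part of this excerpt): the simplification described above comes down to the fact that SIMDe's x86 headers can be included unconditionally. With SIMDE_ENABLE_NATIVE_ALIASES defined, simde/x86/sse2.h exposes the usual __m128/__m128i types and _mm_* intrinsics on every architecture — native SSE/SSE2 on x86, NEON/AltiVec/scalar fallbacks elsewhere — so neither a NEEDS_SIMDE switch nor the bundled sse2neon.h is needed. A minimal sketch of a header written this way:

/* Sketch of a SIMDe-based sse-intrin.h-style header; illustrative only,
 * not the literal file shipped in this commit. */
#pragma once

#define SIMDE_ENABLE_NATIVE_ALIASES
#include "simde/x86/sse2.h" /* also pulls in simde/x86/sse.h and x86/mmx.h */

/* _mm_set1_ps(), _mm_add_epi32(), ... are now usable on any target. */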

+ 0 - 4
CMakeLists.txt

@@ -123,18 +123,14 @@ else ()
 endif ()
 
 if(LOWERCASE_CMAKE_SYSTEM_PROCESSOR MATCHES "(i[3-6]86|x86|x64|x86_64|amd64|e2k)")
-	set(NEEDS_SIMDE "0")
 	if(NOT MSVC)
 		set(ARCH_SIMD_FLAGS "-mmmx" "-msse" "-msse2")
 	endif()
 elseif(LOWERCASE_CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64(le)?")
-	set(NEEDS_SIMDE "0")
 	set(ARCH_SIMD_DEFINES "-DNO_WARN_X86_INTRINSICS")
 	set(ARCH_SIMD_FLAGS "-mvsx")
 	add_compile_definitions(NO_WARN_X86_INTRINSICS)
 else()
-	set(NEEDS_SIMDE "1")
-	add_definitions(-DNEEDS_SIMDE=1)
 	if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_ENABLE_OPENMP")
 		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_ENABLE_OPENMP")

+ 16 - 15
libobs/CMakeLists.txt

@@ -188,20 +188,8 @@ elseif(UNIX)
 		util/pipe-posix.c
 		util/platform-nix.c)
 
-	if(NEEDS_SIMDE)
-		set(libobs_PLATFORM_HEADERS
-			util/simde/check.h
-			util/simde/hedley.h
-			util/simde/mmx.h
-			util/simde/simde-arch.h
-			util/simde/simde-common.h
-			util/simde/sse.h
-			util/simde/sse2.h
-			util/threading-posix.h)
-	else()
-		set(libobs_PLATFORM_HEADERS
-			util/threading-posix.h)
-	endif()
+	set(libobs_PLATFORM_HEADERS
+		util/threading-posix.h)
 
 	if(HAVE_PULSEAUDIO)
 		set(libobs_audio_monitoring_HEADERS
@@ -369,7 +357,6 @@ set(libobs_util_SOURCES
 set(libobs_util_HEADERS
 	util/curl/curl-helper.h
 	util/sse-intrin.h
-	util/sse2neon.h
 	util/array-serializer.h
 	util/file-serializer.h
 	util/utf8.h
@@ -419,6 +406,20 @@ set(libobs_libobs_SOURCES
 	obs-video-gpu-encode.c
 	obs-video.c)
 set(libobs_libobs_HEADERS
+	util/simde/check.h
+	util/simde/debug-trap.h
+	util/simde/hedley.h
+	util/simde/simde-align.h
+	util/simde/simde-arch.h
+	util/simde/simde-common.h
+	util/simde/simde-constify.h
+	util/simde/simde-detect-clang.h
+	util/simde/simde-diagnostic.h
+	util/simde/simde-features.h
+	util/simde/simde-math.h
+	util/simde/x86/mmx.h
+	util/simde/x86/sse2.h
+	util/simde/x86/sse.h
 	${libobs_PLATFORM_HEADERS}
 	obs-audio-controls.h
 	obs-defs.h

+ 0 - 1
libobs/obsconfig.h.in

@@ -18,7 +18,6 @@
 #define HAVE_DBUS @HAVE_DBUS@
 #define HAVE_PULSEAUDIO @HAVE_PULSEAUDIO@
 #define USE_XINPUT @USE_XINPUT@
-#define NEEDS_SIMDE @NEEDS_SIMDE@
 #define LIBOBS_IMAGEMAGICK_DIR_STYLE_6L 6
 #define LIBOBS_IMAGEMAGICK_DIR_STYLE_7GE 7
 #define LIBOBS_IMAGEMAGICK_DIR_STYLE @LIBOBS_IMAGEMAGICK_DIR_STYLE@

+ 3 - 3
libobs/util/simde/README.libobs

@@ -1,5 +1,5 @@
-This is a slightly modified version of https://github.com/nemequ/simde/commit/cafec4b952fa5a31a51a10326f97c2e7c9067771
-sse{,2}.h and mmx.h was moved down from the original "x86" subdirectory,
-subsequently the '#include "../simde-common.h"' line in mmx.h was changed to '#include "simde-common.h"'
+This is a slightly modified version of the simde directory in
+https://github.com/simd-everywhere/simde/commit/c3d7abfaba6729a8b11d09a314b34a4db628911d
+Unused files have been removed.
 
 Then the code was reformatted using the "formatcode.sh" script in the root of this repository.

+ 1 - 0
libobs/util/simde/check.h

@@ -18,6 +18,7 @@
 #endif
 
 #include "hedley.h"
+#include "simde-diagnostic.h"
 #include <stdint.h>
 
 #if !defined(_WIN32)

+ 191 - 116
libobs/util/simde/hedley.h

@@ -10,11 +10,11 @@
  * SPDX-License-Identifier: CC0-1.0
  */
 
-#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 12)
+#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 14)
 #if defined(HEDLEY_VERSION)
 #undef HEDLEY_VERSION
 #endif
-#define HEDLEY_VERSION 12
+#define HEDLEY_VERSION 14
 
 #if defined(HEDLEY_STRINGIFY_EX)
 #undef HEDLEY_STRINGIFY_EX
@@ -36,6 +36,16 @@
 #endif
 #define HEDLEY_CONCAT(a, b) HEDLEY_CONCAT_EX(a, b)
 
+#if defined(HEDLEY_CONCAT3_EX)
+#undef HEDLEY_CONCAT3_EX
+#endif
+#define HEDLEY_CONCAT3_EX(a, b, c) a##b##c
+
+#if defined(HEDLEY_CONCAT3)
+#undef HEDLEY_CONCAT3
+#endif
+#define HEDLEY_CONCAT3(a, b, c) HEDLEY_CONCAT3_EX(a, b, c)
+
 #if defined(HEDLEY_VERSION_ENCODE)
 #undef HEDLEY_VERSION_ENCODE
 #endif
@@ -80,17 +90,17 @@
 #if defined(HEDLEY_MSVC_VERSION)
 #undef HEDLEY_MSVC_VERSION
 #endif
-#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000)
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL)
 #define HEDLEY_MSVC_VERSION                                        \
 	HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000,            \
 			      (_MSC_FULL_VER % 10000000) / 100000, \
 			      (_MSC_FULL_VER % 100000) / 100)
-#elif defined(_MSC_FULL_VER)
+#elif defined(_MSC_FULL_VER) && !defined(__ICL)
 #define HEDLEY_MSVC_VERSION                                      \
 	HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000,           \
 			      (_MSC_FULL_VER % 1000000) / 10000, \
 			      (_MSC_FULL_VER % 10000) / 10)
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && !defined(__ICL)
 #define HEDLEY_MSVC_VERSION \
 	HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
 #endif
@@ -98,7 +108,7 @@
 #if defined(HEDLEY_MSVC_VERSION_CHECK)
 #undef HEDLEY_MSVC_VERSION_CHECK
 #endif
-#if !defined(_MSC_VER)
+#if !defined(HEDLEY_MSVC_VERSION)
 #define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) (0)
 #elif defined(_MSC_VER) && (_MSC_VER >= 1400)
 #define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \
@@ -114,11 +124,12 @@
 #if defined(HEDLEY_INTEL_VERSION)
 #undef HEDLEY_INTEL_VERSION
 #endif
-#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE)
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && \
+	!defined(__ICL)
 #define HEDLEY_INTEL_VERSION                                                  \
 	HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, \
 			      __INTEL_COMPILER_UPDATE)
-#elif defined(__INTEL_COMPILER)
+#elif defined(__INTEL_COMPILER) && !defined(__ICL)
 #define HEDLEY_INTEL_VERSION \
 	HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
 #endif
@@ -133,6 +144,25 @@
 #define HEDLEY_INTEL_VERSION_CHECK(major, minor, patch) (0)
 #endif
 
+#if defined(HEDLEY_INTEL_CL_VERSION)
+#undef HEDLEY_INTEL_CL_VERSION
+#endif
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && \
+	defined(__ICL)
+#define HEDLEY_INTEL_CL_VERSION \
+	HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0)
+#endif
+
+#if defined(HEDLEY_INTEL_CL_VERSION_CHECK)
+#undef HEDLEY_INTEL_CL_VERSION_CHECK
+#endif
+#if defined(HEDLEY_INTEL_CL_VERSION)
+#define HEDLEY_INTEL_CL_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define HEDLEY_INTEL_CL_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
 #if defined(HEDLEY_PGI_VERSION)
 #undef HEDLEY_PGI_VERSION
 #endif
@@ -788,6 +818,68 @@
 	HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
 #endif
 
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+	defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||  \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                     \
+	HEDLEY_IAR_VERSION_CHECK(8, 0, 0) ||                        \
+	HEDLEY_PGI_VERSION_CHECK(18, 4, 0) ||                       \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                        \
+	HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                       \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(4, 7, 0) ||                   \
+	HEDLEY_TI_CL430_VERSION_CHECK(2, 0, 1) ||                   \
+	HEDLEY_TI_CL2000_VERSION_CHECK(6, 1, 0) ||                  \
+	HEDLEY_TI_CL6X_VERSION_CHECK(7, 0, 0) ||                    \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                    \
+	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) ||                   \
+	HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) ||                       \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) ||                     \
+	HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) ||                     \
+	(HEDLEY_IBM_VERSION_CHECK(10, 1, 0) && defined(__C99_PRAGMA_OPERATOR))
+#define HEDLEY_PRAGMA(value) _Pragma(#value)
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#define HEDLEY_PRAGMA(value) __pragma(value)
+#else
+#define HEDLEY_PRAGMA(value)
+#endif
+
+#if defined(HEDLEY_DIAGNOSTIC_PUSH)
+#undef HEDLEY_DIAGNOSTIC_PUSH
+#endif
+#if defined(HEDLEY_DIAGNOSTIC_POP)
+#undef HEDLEY_DIAGNOSTIC_POP
+#endif
+#if defined(__clang__)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
+#define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
+#elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
+#elif HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||       \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \
+	HEDLEY_TI_CL430_VERSION_CHECK(4, 4, 0) || \
+	HEDLEY_TI_CL6X_VERSION_CHECK(8, 1, 0) ||  \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||  \
+	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
+#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0)
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#else
+#define HEDLEY_DIAGNOSTIC_PUSH
+#define HEDLEY_DIAGNOSTIC_POP
+#endif
+
 /* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for
    HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
@@ -796,11 +888,20 @@
 #if defined(__cplusplus)
 #if HEDLEY_HAS_WARNING("-Wc++98-compat")
 #if HEDLEY_HAS_WARNING("-Wc++17-extensions")
+#if HEDLEY_HAS_WARNING("-Wc++1z-extensions")
+#define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr)                  \
+	HEDLEY_DIAGNOSTIC_PUSH                                             \
+	_Pragma("clang diagnostic ignored \"-Wc++98-compat\"") _Pragma(    \
+		"clang diagnostic ignored \"-Wc++17-extensions\"")         \
+		_Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \
+			xpr HEDLEY_DIAGNOSTIC_POP
+#else
 #define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr)                  \
 	HEDLEY_DIAGNOSTIC_PUSH                                             \
 	_Pragma("clang diagnostic ignored \"-Wc++98-compat\"")             \
 		_Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
 			xpr HEDLEY_DIAGNOSTIC_POP
+#endif
 #else
 #define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr)      \
 	HEDLEY_DIAGNOSTIC_PUSH                                 \
@@ -861,74 +962,14 @@
 #elif HEDLEY_IAR_VERSION_CHECK(8, 3, 0)
 #define HEDLEY_CPP_CAST(T, expr) \
 	HEDLEY_DIAGNOSTIC_PUSH   \
-	_Pragma("diag_suppress=Pe137") HEDLEY_DIAGNOSTIC_POP #else
+	_Pragma("diag_suppress=Pe137") HEDLEY_DIAGNOSTIC_POP
+#else
 #define HEDLEY_CPP_CAST(T, expr) ((T)(expr))
 #endif
 #else
 #define HEDLEY_CPP_CAST(T, expr) (expr)
 #endif
 
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
-	defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||  \
-	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                     \
-	HEDLEY_IAR_VERSION_CHECK(8, 0, 0) ||                        \
-	HEDLEY_PGI_VERSION_CHECK(18, 4, 0) ||                       \
-	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                        \
-	HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                       \
-	HEDLEY_TI_ARMCL_VERSION_CHECK(4, 7, 0) ||                   \
-	HEDLEY_TI_CL430_VERSION_CHECK(2, 0, 1) ||                   \
-	HEDLEY_TI_CL2000_VERSION_CHECK(6, 1, 0) ||                  \
-	HEDLEY_TI_CL6X_VERSION_CHECK(7, 0, 0) ||                    \
-	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                    \
-	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) ||                   \
-	HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) ||                       \
-	HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) ||                     \
-	HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) ||                     \
-	(HEDLEY_IBM_VERSION_CHECK(10, 1, 0) && defined(__C99_PRAGMA_OPERATOR))
-#define HEDLEY_PRAGMA(value) _Pragma(#value)
-#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
-#define HEDLEY_PRAGMA(value) __pragma(value)
-#else
-#define HEDLEY_PRAGMA(value)
-#endif
-
-#if defined(HEDLEY_DIAGNOSTIC_PUSH)
-#undef HEDLEY_DIAGNOSTIC_PUSH
-#endif
-#if defined(HEDLEY_DIAGNOSTIC_POP)
-#undef HEDLEY_DIAGNOSTIC_POP
-#endif
-#if defined(__clang__)
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
-#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
-#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
-#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
-#define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
-#define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
-#elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0)
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
-#elif HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||       \
-	HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \
-	HEDLEY_TI_CL430_VERSION_CHECK(4, 4, 0) || \
-	HEDLEY_TI_CL6X_VERSION_CHECK(8, 1, 0) ||  \
-	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||  \
-	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
-#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0)
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
-#else
-#define HEDLEY_DIAGNOSTIC_PUSH
-#define HEDLEY_DIAGNOSTIC_POP
-#endif
-
 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
 #undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
 #endif
@@ -938,6 +979,12 @@
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
 	_Pragma("warning(disable:1478 1786)")
+#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	__pragma(warning(disable : 1478 1786))
+#elif HEDLEY_PGI_VERSION_CHECK(20, 7, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
+	_Pragma("diag_suppress 1215,1216,1444,1445")
 #elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
 #elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
@@ -985,6 +1032,9 @@
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
 	_Pragma("warning(disable:161)")
+#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+	__pragma(warning(disable : 161))
 #elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
 #elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
@@ -1018,9 +1068,15 @@
 #elif HEDLEY_INTEL_VERSION_CHECK(17, 0, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
 	_Pragma("warning(disable:1292)")
+#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
+	__pragma(warning(disable : 1292))
 #elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
 	__pragma(warning(disable : 5030))
+#elif HEDLEY_PGI_VERSION_CHECK(20, 7, 0)
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
+	_Pragma("diag_suppress 1097,1098")
 #elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
 	_Pragma("diag_suppress 1097")
@@ -1061,13 +1117,11 @@
 #if defined(HEDLEY_DEPRECATED_FOR)
 #undef HEDLEY_DEPRECATED_FOR
 #endif
-#if defined(__cplusplus) && (__cplusplus >= 201402L)
-#define HEDLEY_DEPRECATED(since)                      \
-	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
-		[[deprecated("Since " #since)]])
-#define HEDLEY_DEPRECATED_FOR(since, replacement)     \
-	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
-		[[deprecated("Since " #since "; use " #replacement)]])
+#if HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
+#define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " #since))
+#define HEDLEY_DEPRECATED_FOR(since, replacement) \
+	__declspec(deprecated("Since " #since "; use " #replacement))
 #elif HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \
 	HEDLEY_GCC_VERSION_CHECK(4, 5, 0) ||                     \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                  \
@@ -1083,6 +1137,13 @@
 	__attribute__((__deprecated__("Since " #since)))
 #define HEDLEY_DEPRECATED_FOR(since, replacement) \
 	__attribute__((__deprecated__("Since " #since "; use " #replacement)))
+#elif defined(__cplusplus) && (__cplusplus >= 201402L)
+#define HEDLEY_DEPRECATED(since)                      \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
+		[[deprecated("Since " #since)]])
+#define HEDLEY_DEPRECATED_FOR(since, replacement)     \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
+		[[deprecated("Since " #since "; use " #replacement)]])
 #elif HEDLEY_HAS_ATTRIBUTE(deprecated) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \
 	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                   \
 	HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                                  \
@@ -1103,12 +1164,9 @@
 #define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
 #define HEDLEY_DEPRECATED_FOR(since, replacement) \
 	__attribute__((__deprecated__))
-#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
-#define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " #since))
-#define HEDLEY_DEPRECATED_FOR(since, replacement) \
-	__declspec(deprecated("Since " #since "; use " #replacement))
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
-	HEDLEY_PELLES_VERSION_CHECK(6, 50, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) ||    \
+	HEDLEY_PELLES_VERSION_CHECK(6, 50, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_DEPRECATED(since) __declspec(deprecated)
 #define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
 #elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
@@ -1136,17 +1194,7 @@
 #if defined(HEDLEY_WARN_UNUSED_RESULT_MSG)
 #undef HEDLEY_WARN_UNUSED_RESULT_MSG
 #endif
-#if (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
-#define HEDLEY_WARN_UNUSED_RESULT \
-	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
-#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
-	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
-#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
-#define HEDLEY_WARN_UNUSED_RESULT \
-	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
-#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
-	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
-#elif HEDLEY_HAS_ATTRIBUTE(warn_unused_result) ||                          \
+#if HEDLEY_HAS_ATTRIBUTE(warn_unused_result) ||                            \
 	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                               \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
 	HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                              \
@@ -1169,6 +1217,16 @@
 #define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
 #define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
 	__attribute__((__warn_unused_result__))
+#elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
+#define HEDLEY_WARN_UNUSED_RESULT \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
+#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
+#define HEDLEY_WARN_UNUSED_RESULT \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
 #elif defined(_Check_return_) /* SAL */
 #define HEDLEY_WARN_UNUSED_RESULT _Check_return_
 #define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
@@ -1222,7 +1280,8 @@
 #define HEDLEY_NO_RETURN __attribute__((__noreturn__))
 #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
 #define HEDLEY_NO_RETURN _Pragma("does_not_return")
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_NO_RETURN __declspec(noreturn)
 #elif HEDLEY_TI_CL6X_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
 #define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
@@ -1252,7 +1311,9 @@
 #if defined(HEDLEY_ASSUME)
 #undef HEDLEY_ASSUME
 #endif
-#if HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#if HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) ||     \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_ASSUME(expr) __assume(expr)
 #elif HEDLEY_HAS_BUILTIN(__builtin_assume)
 #define HEDLEY_ASSUME(expr) __builtin_assume(expr)
@@ -1389,7 +1450,8 @@ HEDLEY_DIAGNOSTIC_POP
 #if HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
 #define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr))
 #endif
-#if HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \
+#if (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && \
+     !defined(HEDLEY_PGI_VERSION)) ||                         \
 	HEDLEY_GCC_VERSION_CHECK(9, 0, 0)
 #define HEDLEY_PREDICT(expr, value, probability) \
 	__builtin_expect_with_probability((expr), (value), (probability))
@@ -1399,7 +1461,8 @@ HEDLEY_DIAGNOSTIC_POP
 	__builtin_expect_with_probability(!!(expr), 0, (probability))
 #define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1)
 #define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
-#elif HEDLEY_HAS_BUILTIN(__builtin_expect) ||                              \
+#elif (HEDLEY_HAS_BUILTIN(__builtin_expect) &&                             \
+       !defined(HEDLEY_INTEL_CL_VERSION)) ||                               \
 	HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||                               \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
 	(HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \
@@ -1476,7 +1539,8 @@ HEDLEY_DIAGNOSTIC_POP
 #define HEDLEY_MALLOC __attribute__((__malloc__))
 #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
 #define HEDLEY_MALLOC _Pragma("returns_new_memory")
-#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_MALLOC __declspec(restrict)
 #else
 #define HEDLEY_MALLOC
@@ -1557,6 +1621,7 @@ HEDLEY_DIAGNOSTIC_POP
 #elif HEDLEY_GCC_VERSION_CHECK(3, 1, 0) ||                                 \
 	HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) ||                             \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0) ||                       \
 	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
 	HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                              \
 	HEDLEY_PGI_VERSION_CHECK(17, 10, 0) ||                             \
@@ -1581,13 +1646,14 @@ HEDLEY_DIAGNOSTIC_POP
 #define HEDLEY_INLINE inline
 #elif defined(HEDLEY_GCC_VERSION) || HEDLEY_ARM_VERSION_CHECK(6, 2, 0)
 #define HEDLEY_INLINE __inline__
-#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) ||       \
-	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||       \
-	HEDLEY_TI_ARMCL_VERSION_CHECK(5, 1, 0) ||  \
-	HEDLEY_TI_CL430_VERSION_CHECK(3, 1, 0) ||  \
-	HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) || \
-	HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||   \
-	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||   \
+#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) ||         \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0) || \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||         \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(5, 1, 0) ||    \
+	HEDLEY_TI_CL430_VERSION_CHECK(3, 1, 0) ||    \
+	HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) ||   \
+	HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||     \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||     \
 	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
 #define HEDLEY_INLINE __inline
 #else
@@ -1619,7 +1685,8 @@ HEDLEY_DIAGNOSTIC_POP
 	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||    \
 	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
 #define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE
-#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_ALWAYS_INLINE __forceinline
 #elif defined(__cplusplus) && (HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) ||  \
 			       HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) ||  \
@@ -1658,7 +1725,8 @@ HEDLEY_DIAGNOSTIC_POP
 	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                           \
 	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
 #define HEDLEY_NEVER_INLINE __attribute__((__noinline__))
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_NEVER_INLINE __declspec(noinline)
 #elif HEDLEY_PGI_VERSION_CHECK(10, 2, 0)
 #define HEDLEY_NEVER_INLINE _Pragma("noinline")
@@ -1711,7 +1779,9 @@ HEDLEY_DIAGNOSTIC_POP
 #if HEDLEY_HAS_ATTRIBUTE(nothrow) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
 #define HEDLEY_NO_THROW __attribute__((__nothrow__))
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 1, 0) || HEDLEY_ARM_VERSION_CHECK(4, 1, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 1, 0) ||         \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0) || \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0)
 #define HEDLEY_NO_THROW __declspec(nothrow)
 #else
 #define HEDLEY_NO_THROW
@@ -1720,8 +1790,7 @@ HEDLEY_DIAGNOSTIC_POP
 #if defined(HEDLEY_FALL_THROUGH)
 #undef HEDLEY_FALL_THROUGH
 #endif
-#if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough, 7, 0, 0) && \
-	!defined(HEDLEY_PGI_VERSION)
+#if HEDLEY_HAS_ATTRIBUTE(fallthrough) || HEDLEY_GCC_VERSION_CHECK(7, 0, 0)
 #define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
 #elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang, fallthrough)
 #define HEDLEY_FALL_THROUGH \
@@ -1866,12 +1935,14 @@ HEDLEY_DIAGNOSTIC_POP
 #endif
 #if !defined(__cplusplus) &&                                             \
 	((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
-	 HEDLEY_HAS_FEATURE(c_static_assert) ||                          \
+	 (HEDLEY_HAS_FEATURE(c_static_assert) &&                         \
+	  !defined(HEDLEY_INTEL_CL_VERSION)) ||                          \
 	 HEDLEY_GCC_VERSION_CHECK(6, 0, 0) ||                            \
 	 HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert))
 #define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
 #elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
-	HEDLEY_MSVC_VERSION_CHECK(16, 0, 0)
+	HEDLEY_MSVC_VERSION_CHECK(16, 0, 0) ||              \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_STATIC_ASSERT(expr, message)           \
 	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
 		static_assert(expr, message))
@@ -1930,7 +2001,8 @@ HEDLEY_DIAGNOSTIC_POP
 	HEDLEY_PGI_VERSION_CHECK(18, 4, 0) || \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
 #define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg)
-#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) || \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg))
 #else
 #define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg)
@@ -1970,6 +2042,8 @@ HEDLEY_DIAGNOSTIC_POP
 #endif
 #if HEDLEY_HAS_ATTRIBUTE(flag_enum)
 #define HEDLEY_FLAGS __attribute__((__flag_enum__))
+#else
+#define HEDLEY_FLAGS
 #endif
 
 #if defined(HEDLEY_FLAGS_CAST)
@@ -1989,8 +2063,9 @@ HEDLEY_DIAGNOSTIC_POP
 #if defined(HEDLEY_EMPTY_BASES)
 #undef HEDLEY_EMPTY_BASES
 #endif
-#if HEDLEY_MSVC_VERSION_CHECK(19, 0, 23918) && \
-	!HEDLEY_MSVC_VERSION_CHECK(20, 0, 0)
+#if (HEDLEY_MSVC_VERSION_CHECK(19, 0, 23918) && \
+     !HEDLEY_MSVC_VERSION_CHECK(20, 0, 0)) ||   \
+	HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
 #define HEDLEY_EMPTY_BASES __declspec(empty_bases)
 #else
 #define HEDLEY_EMPTY_BASES

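The Hedley changes above track the upstream Hedley 12 → 14 update bundled with SIMDe 0.7.1: the HEDLEY_PRAGMA and HEDLEY_DIAGNOSTIC_PUSH/POP definitions move ahead of their first users, and many feature macros gain a branch for Intel's MSVC-compatible driver (icl, detected via __ICL and HEDLEY_INTEL_CL_VERSION_CHECK). As a hedged illustration of how consumers use these macros (example code, not taken from libobs):

#include "util/simde/hedley.h"

HEDLEY_DEPRECATED(2.0) void old_api(void); /* hypothetical deprecated function */

void caller(void)
{
	/* each macro expands to the right push/disable/pop pragma for the
	 * detected compiler (clang, GCC, MSVC, icc/icl, TI, ...) or to nothing */
	HEDLEY_DIAGNOSTIC_PUSH
	HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
	old_api();
	HEDLEY_DIAGNOSTIC_POP
}
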
+ 481 - 0
libobs/util/simde/simde-align.h

@@ -0,0 +1,481 @@
+/* Alignment
+ * Created by Evan Nemerson <[email protected]>
+ *
+ *   To the extent possible under law, the authors have waived all
+ *   copyright and related or neighboring rights to this code.  For
+ *   details, see the Creative Commons Zero 1.0 Universal license at
+ *   <https://creativecommons.org/publicdomain/zero/1.0/>
+ *
+ * SPDX-License-Identifier: CC0-1.0
+ *
+ **********************************************************************
+ *
+ * This is portability layer which should help iron out some
+ * differences across various compilers, as well as various verisons of
+ * C and C++.
+ *
+ * It was originally developed for SIMD Everywhere
+ * (<https://github.com/simd-everywhere/simde>), but since its only
+ * dependency is Hedley (<https://nemequ.github.io/hedley>, also CC0)
+ * it can easily be used in other projects, so please feel free to do
+ * so.
+ *
+ * If you do use this in your project, please keep a link to SIMDe in
+ * your code to remind you where to report any bugs and/or check for
+ * updated versions.
+ *
+ * # API Overview
+ *
+ * The API has several parts, and most macros have a few variations.
+ * There are APIs for declaring aligned fields/variables, optimization
+ * hints, and run-time alignment checks.
+ *
+ * Briefly, macros ending with "_TO" take numeric values and are great
+ * when you know the value you would like to use.  Macros ending with
+ * "_LIKE", on the other hand, accept a type and are used when you want
+ * to use the alignment of a type instead of hardcoding a value.
+ *
+ * Documentation for each section of the API is inline.
+ *
+ * True to form, MSVC is the main problem and imposes several
+ * limitations on the effectiveness of the APIs.  Detailed descriptions
+ * of the limitations of each macro are inline, but in general:
+ *
+ *  * On C11+ or C++11+ code written using this API will work.  The
+ *    ASSUME macros may or may not generate a hint to the compiler, but
+ *    that is only an optimization issue and will not actually cause
+ *    failures.
+ *  * If you're using pretty much any compiler other than MSVC,
+ *    everything should basically work as well as in C11/C++11.
+ */
+
+#if !defined(SIMDE_ALIGN_H)
+#define SIMDE_ALIGN_H
+
+#include "hedley.h"
+
+/* I know this seems a little silly, but some non-hosted compilers
+ * don't have stddef.h, so we try to accomodate them. */
+#if !defined(SIMDE_ALIGN_SIZE_T_)
+#if defined(__SIZE_TYPE__)
+#define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__
+#elif defined(__SIZE_T_TYPE__)
+#define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__
+#elif defined(__cplusplus)
+#include <cstddef>
+#define SIMDE_ALIGN_SIZE_T_ size_t
+#else
+#include <stddef.h>
+#define SIMDE_ALIGN_SIZE_T_ size_t
+#endif
+#endif
+
+#if !defined(SIMDE_ALIGN_INTPTR_T_)
+#if defined(__INTPTR_TYPE__)
+#define SIMDE_ALIGN_INTPTR_T_ __INTPTR_TYPE__
+#elif defined(__PTRDIFF_TYPE__)
+#define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_TYPE__
+#elif defined(__PTRDIFF_T_TYPE__)
+#define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_T_TYPE__
+#elif defined(__cplusplus)
+#include <cstddef>
+#define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t
+#else
+#include <stddef.h>
+#define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t
+#endif
+#endif
+
+#if defined(SIMDE_ALIGN_DEBUG)
+#if defined(__cplusplus)
+#include <cstdio>
+#else
+#include <stdio.h>
+#endif
+#endif
+
+/* SIMDE_ALIGN_OF(Type)
+ *
+ * The SIMDE_ALIGN_OF macro works like alignof, or _Alignof, or
+ * __alignof, or __alignof__, or __ALIGNOF__, depending on the compiler.
+ * It isn't defined everywhere (only when the compiler has some alignof-
+ * like feature we can use to implement it), but it should work in most
+ * modern compilers, as well as C11 and C++11.
+ *
+ * If we can't find an implementation for SIMDE_ALIGN_OF then the macro
+ * will not be defined, so if you can handle that situation sensibly
+ * you may need to sprinkle some ifdefs into your code.
+ */
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
+	(0 && HEDLEY_HAS_FEATURE(c_alignof))
+#define SIMDE_ALIGN_OF(Type) _Alignof(Type)
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
+	(0 && HEDLEY_HAS_FEATURE(cxx_alignof))
+#define SIMDE_ALIGN_OF(Type) alignof(Type)
+#elif HEDLEY_GCC_VERSION_CHECK(2, 95, 0) ||                                    \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                   \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                                \
+	HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) ||                               \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                                \
+	HEDLEY_PGI_VERSION_CHECK(19, 10, 0) ||                                 \
+	HEDLEY_CRAY_VERSION_CHECK(10, 0, 0) ||                                 \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(16, 9, 0) ||                             \
+	HEDLEY_TI_CL2000_VERSION_CHECK(16, 9, 0) ||                            \
+	HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||                               \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                               \
+	HEDLEY_TI_CL430_VERSION_CHECK(16, 9, 0) ||                             \
+	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 2) || defined(__IBM__ALIGNOF__) || \
+	defined(__clang__)
+#define SIMDE_ALIGN_OF(Type) __alignof__(Type)
+#elif HEDLEY_IAR_VERSION_CHECK(8, 40, 0)
+#define SIMDE_ALIGN_OF(Type) __ALIGNOF__(Type)
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)
+/* Probably goes back much further, but MS takes down their old docs.
+   * If you can verify that this works in earlier versions please let
+   * me know! */
+#define SIMDE_ALIGN_OF(Type) __alignof(Type)
+#endif
+
+/* SIMDE_ALIGN_MAXIMUM:
+ *
+ * This is the maximum alignment that the compiler supports.  You can
+ * define the value prior to including SIMDe if necessary, but in that
+ * case *please* submit an issue so we can add the platform to the
+ * detection code.
+ *
+ * Most compilers are okay with types which are aligned beyond what
+ * they think is the maximum, as long as the alignment is a power
+ * of two.  MSVC is the exception (of course), so we need to cap the
+ * alignment requests at values that the implementation supports.
+ *
+ * XL C/C++ will accept values larger than 16 (which is the alignment
+ * of an AltiVec vector), but will not reliably align to the larger
+ * value, so so we cap the value at 16 there.
+ *
+ * If the compiler accepts any power-of-two value within reason then
+ * this macro should be left undefined, and the SIMDE_ALIGN_CAP
+ * macro will just return the value passed to it. */
+#if !defined(SIMDE_ALIGN_MAXIMUM)
+#if defined(HEDLEY_MSVC_VERSION)
+#if defined(_M_IX86) || defined(_M_AMD64)
+#if HEDLEY_MSVC_VERSION_CHECK(19, 14, 0)
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 64
+#elif HEDLEY_MSVC_VERSION_CHECK(16, 0, 0)
+/* VS 2010 is really a guess based on Wikipedia; if anyone can
+         * test with old VS versions I'd really appreciate it. */
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 32
+#else
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 16
+#endif
+#elif defined(_M_ARM) || defined(_M_ARM64)
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 8
+#endif
+#elif defined(HEDLEY_IBM_VERSION)
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 16
+#endif
+#endif
+
+/* You can mostly ignore these; they're intended for internal use.
+ * If you do need to use them please let me know; if they fulfill
+ * a common use case I'll probably drop the trailing underscore
+ * and make them part of the public API. */
+#if defined(SIMDE_ALIGN_PLATFORM_MAXIMUM)
+#if SIMDE_ALIGN_PLATFORM_MAXIMUM >= 64
+#define SIMDE_ALIGN_64_ 64
+#define SIMDE_ALIGN_32_ 32
+#define SIMDE_ALIGN_16_ 16
+#define SIMDE_ALIGN_8_ 8
+#elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 32
+#define SIMDE_ALIGN_64_ 32
+#define SIMDE_ALIGN_32_ 32
+#define SIMDE_ALIGN_16_ 16
+#define SIMDE_ALIGN_8_ 8
+#elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 16
+#define SIMDE_ALIGN_64_ 16
+#define SIMDE_ALIGN_32_ 16
+#define SIMDE_ALIGN_16_ 16
+#define SIMDE_ALIGN_8_ 8
+#elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 8
+#define SIMDE_ALIGN_64_ 8
+#define SIMDE_ALIGN_32_ 8
+#define SIMDE_ALIGN_16_ 8
+#define SIMDE_ALIGN_8_ 8
+#else
+#error Max alignment expected to be >= 8
+#endif
+#else
+#define SIMDE_ALIGN_64_ 64
+#define SIMDE_ALIGN_32_ 32
+#define SIMDE_ALIGN_16_ 16
+#define SIMDE_ALIGN_8_ 8
+#endif
+
+/**
+ * SIMDE_ALIGN_CAP(Alignment)
+ *
+ * Returns the minimum of Alignment or SIMDE_ALIGN_MAXIMUM.
+ */
+#if defined(SIMDE_ALIGN_MAXIMUM)
+#define SIMDE_ALIGN_CAP(Alignment)                      \
+	(((Alignment) < (SIMDE_ALIGN_PLATFORM_MAXIMUM)) \
+		 ? (Alignment)                          \
+		 : (SIMDE_ALIGN_PLATFORM_MAXIMUM))
+#else
+#define SIMDE_ALIGN_CAP(Alignment) (Alignment)
+#endif
+
+/* SIMDE_ALIGN_TO(Alignment)
+ *
+ * SIMDE_ALIGN_TO is used to declare types or variables.  It basically
+ * maps to the align attribute in most compilers, the align declspec
+ * in MSVC, or _Alignas/alignas in C11/C++11.
+ *
+ * Example:
+ *
+ *   struct i32x4 {
+ *     SIMDE_ALIGN_TO(16) int32_t values[4];
+ *   }
+ *
+ * Limitations:
+ *
+ * MSVC requires that the Alignment parameter be numeric; you can't do
+ * something like `SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(int))`.  This is
+ * unfortunate because that's really how the LIKE macros are
+ * implemented, and I am not aware of a way to get anything like this
+ * to work without using the C11/C++11 keywords.
+ *
+ * It also means that we can't use SIMDE_ALIGN_CAP to limit the
+ * alignment to the value specified, which MSVC also requires, so on
+ * MSVC you should use the `SIMDE_ALIGN_TO_8/16/32/64` macros instead.
+ * They work like `SIMDE_ALIGN_TO(SIMDE_ALIGN_CAP(Alignment))` would,
+ * but should be safe to use on MSVC.
+ *
+ * All this is to say that, if you want your code to work on MSVC, you
+ * should use the SIMDE_ALIGN_TO_8/16/32/64 macros below instead of
+ * SIMDE_ALIGN_TO(8/16/32/64).
+ */
+#if HEDLEY_HAS_ATTRIBUTE(aligned) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \
+	HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||                              \
+	HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||                              \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||                              \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                            \
+	HEDLEY_TI_ARMCL_VERSION_CHECK(16, 9, 0) ||                         \
+	HEDLEY_TI_CL2000_VERSION_CHECK(16, 9, 0) ||                        \
+	HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||                           \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                           \
+	HEDLEY_TI_CL430_VERSION_CHECK(16, 9, 0) ||                         \
+	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 2)
+#define SIMDE_ALIGN_TO(Alignment) \
+	__attribute__((__aligned__(SIMDE_ALIGN_CAP(Alignment))))
+#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
+#define SIMDE_ALIGN_TO(Alignment) _Alignas(SIMDE_ALIGN_CAP(Alignment))
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
+#define SIMDE_ALIGN_TO(Alignment) alignas(SIMDE_ALIGN_CAP(Alignment))
+#elif defined(HEDLEY_MSVC_VERSION)
+#define SIMDE_ALIGN_TO(Alignment) __declspec(align(Alignment))
+/* Unfortunately MSVC can't handle __declspec(align(__alignof(Type)));
+   * the alignment passed to the declspec has to be an integer. */
+#define SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE
+#endif
+#define SIMDE_ALIGN_TO_64 SIMDE_ALIGN_TO(SIMDE_ALIGN_64_)
+#define SIMDE_ALIGN_TO_32 SIMDE_ALIGN_TO(SIMDE_ALIGN_32_)
+#define SIMDE_ALIGN_TO_16 SIMDE_ALIGN_TO(SIMDE_ALIGN_16_)
+#define SIMDE_ALIGN_TO_8 SIMDE_ALIGN_TO(SIMDE_ALIGN_8_)
+
+/* SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)
+ *
+ * SIMDE_ALIGN_ASSUME_TO is semantically similar to C++20's
+ * std::assume_aligned, or __builtin_assume_aligned.  It tells the
+ * compiler to assume that the provided pointer is aligned to an
+ * `Alignment`-byte boundary.
+ *
+ * If you define SIMDE_ALIGN_DEBUG prior to including this header then
+ * SIMDE_ALIGN_ASSUME_TO will turn into a runtime check.   We don't
+ * integrate with NDEBUG in this header, but it may be a good idea to
+ * put something like this in your code:
+ *
+ *   #if !defined(NDEBUG)
+ *     #define SIMDE_ALIGN_DEBUG
+ *   #endif
+ *   #include <.../simde-align.h>
+ */
+#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
+	HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)                   \
+	HEDLEY_REINTERPRET_CAST(                                              \
+		__typeof__(Pointer),                                          \
+		__builtin_assume_aligned(                                     \
+			HEDLEY_CONST_CAST(                                    \
+				void *, HEDLEY_REINTERPRET_CAST(const void *, \
+								Pointer)),    \
+			Alignment))
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)           \
+	(__extension__({                                              \
+		__typeof__(v) simde_assume_aligned_t_ = (Pointer);    \
+		__assume_aligned(simde_assume_aligned_t_, Alignment); \
+		simde_assume_aligned_t_;                              \
+	}))
+#elif defined(__cplusplus) && (__cplusplus > 201703L)
+#include <memory>
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \
+	std::assume_aligned<Alignment>(Pointer)
+#else
+#if defined(__cplusplus)
+template<typename T>
+HEDLEY_ALWAYS_INLINE static T *
+simde_align_assume_to_unchecked(T *ptr, const size_t alignment)
+#else
+HEDLEY_ALWAYS_INLINE static void *
+simde_align_assume_to_unchecked(void *ptr, const size_t alignment)
+#endif
+{
+	HEDLEY_ASSUME((HEDLEY_REINTERPRET_CAST(size_t, (ptr)) %
+		       SIMDE_ALIGN_CAP(alignment)) == 0);
+	return ptr;
+}
+#if defined(__cplusplus)
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \
+	simde_align_assume_to_unchecked((Pointer), (Alignment))
+#else
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)                \
+	simde_align_assume_to_unchecked(                                   \
+		HEDLEY_CONST_CAST(void *, HEDLEY_REINTERPRET_CAST(         \
+						  const void *, Pointer)), \
+		(Alignment))
+#endif
+#endif
+
+#if !defined(SIMDE_ALIGN_DEBUG)
+#define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) \
+	SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)
+#else
+#include <stdio.h>
+#if defined(__cplusplus)
+template<typename T>
+static HEDLEY_ALWAYS_INLINE T *
+simde_align_assume_to_checked_uncapped(T *ptr, const size_t alignment,
+				       const char *file, int line,
+				       const char *ptrname)
+#else
+static HEDLEY_ALWAYS_INLINE void *
+simde_align_assume_to_checked_uncapped(void *ptr, const size_t alignment,
+				       const char *file, int line,
+				       const char *ptrname)
+#endif
+{
+	if (HEDLEY_UNLIKELY(
+		    (HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) %
+		     HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_,
+					SIMDE_ALIGN_CAP(alignment))) != 0)) {
+		fprintf(stderr,
+			"%s:%d: alignment check failed for `%s' (%p %% %u == %u)\n",
+			file, line, ptrname,
+			HEDLEY_REINTERPRET_CAST(const void *, ptr),
+			HEDLEY_STATIC_CAST(unsigned int,
+					   SIMDE_ALIGN_CAP(alignment)),
+			HEDLEY_STATIC_CAST(
+				unsigned int,
+				HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_,
+							(ptr)) %
+					HEDLEY_STATIC_CAST(
+						SIMDE_ALIGN_INTPTR_T_,
+						SIMDE_ALIGN_CAP(alignment))));
+	}
+
+	return ptr;
+}
+
+#if defined(__cplusplus)
+#define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)                      \
+	simde_align_assume_to_checked_uncapped((Pointer), (Alignment), \
+					       __FILE__, __LINE__, #Pointer)
+#else
+#define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)                          \
+	simde_align_assume_to_checked_uncapped(                            \
+		HEDLEY_CONST_CAST(void *, HEDLEY_REINTERPRET_CAST(         \
+						  const void *, Pointer)), \
+		(Alignment), __FILE__, __LINE__, #Pointer)
+#endif
+#endif
+
+/* SIMDE_ALIGN_LIKE(Type)
+ * SIMDE_ALIGN_LIKE_#(Type)
+ *
+ * The SIMDE_ALIGN_LIKE macros are similar to the SIMDE_ALIGN_TO macros
+ * except instead of an integer they take a type; basically, it's just
+ * a more convenient way to do something like:
+ *
+ *   SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type))
+ *
+ * The versions with a numeric suffix will fall back on using a numeric
+ * value in the event we can't use SIMDE_ALIGN_OF(Type).  This is
+ * mainly for MSVC, where __declspec(align()) can't handle anything
+ * other than hard-coded numeric values.
+ */
+#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_TO) && \
+	!defined(SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE)
+#define SIMDE_ALIGN_LIKE(Type) SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type))
+#define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_LIKE(Type)
+#define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_LIKE(Type)
+#define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_LIKE(Type)
+#define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_LIKE(Type)
+#else
+#define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_TO_64
+#define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_TO_32
+#define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_TO_16
+#define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_TO_8
+#endif
+
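As a quick illustration (an editor's sketch, not part of the upstream header), SIMDE_ALIGN_TO_16 and SIMDE_ALIGN_LIKE_16 might be combined like this, so the second struct picks up whatever alignment the first one actually has while MSVC still sees a literal 16:

/* Illustrative only; `example_vec128` and `example_vec128_storage` are
 * hypothetical names, not SIMDe types. */
typedef struct {
	SIMDE_ALIGN_TO_16 float f32[4];
} example_vec128;

typedef struct {
	SIMDE_ALIGN_LIKE_16(example_vec128) unsigned char raw[16];
} example_vec128_storage;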
+/* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type)
+ *
+ * This is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a
+ * type instead of a numeric value. */
+#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO)
+#define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) \
+	SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type))
+#endif
+
+/* SIMDE_ALIGN_CAST(Type, Pointer)
+ *
+ * SIMDE_ALIGN_CAST is like C++'s reinterpret_cast, but it will try
+ * to silence warnings that some compilers may produce if you try
+ * to assign to a type with increased alignment requirements.
+ *
+ * Note that it does *not* actually attempt to tell the compiler that
+ * the pointer is aligned like the destination should be; that's the
+ * job of the next macro.  This macro is necessary for stupid APIs
+ * like _mm_loadu_si128 where the input is a __m128i* but the function
+ * is specifically for data which isn't necessarily aligned to
+ * _Alignof(__m128i).
+ */
+#if HEDLEY_HAS_WARNING("-Wcast-align") || defined(__clang__) || \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0)
+#define SIMDE_ALIGN_CAST(Type, Pointer)                                 \
+	(__extension__({                                                \
+		HEDLEY_DIAGNOSTIC_PUSH                                  \
+		_Pragma("GCC diagnostic ignored \"-Wcast-align\"")      \
+			Type simde_r_ =                                 \
+				HEDLEY_REINTERPRET_CAST(Type, Pointer); \
+		HEDLEY_DIAGNOSTIC_POP                                   \
+		simde_r_;                                               \
+	}))
+#else
+#define SIMDE_ALIGN_CAST(Type, Pointer) HEDLEY_REINTERPRET_CAST(Type, Pointer)
+#endif
+
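A hedged sketch of the intended use (hypothetical helper, not upstream code): cast a byte buffer to the pointer type an API wants without tripping -Wcast-align, while still treating the data as potentially unaligned:

#include <stdint.h>
#include <string.h>

static int32_t example_load_i32u(const unsigned char *buf)
{
	/* The cast alone would normally warn under -Wcast-align. */
	const int32_t *p = SIMDE_ALIGN_CAST(const int32_t *, buf);
	int32_t value;
	memcpy(&value, p, sizeof(value)); /* no alignment assumed */
	return value;
}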
+/* SIMDE_ALIGN_ASSUME_CAST(Type, Pointer)
+ *
+ * This is sort of like a combination of a reinterpret_cast and a
+ * SIMDE_ALIGN_ASSUME_LIKE.  It uses SIMDE_ALIGN_ASSUME_LIKE to tell
+ * the compiler that the pointer is aligned like the specified type
+ * and casts the pointer to the specified type while suppressing any
+ * warnings from the compiler about casting to a type with greater
+ * alignment requirements.
+ */
+#define SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) \
+	SIMDE_ALIGN_ASSUME_LIKE(SIMDE_ALIGN_CAST(Type, Pointer), Type)
+
+#endif /* !defined(SIMDE_ALIGN_H) */
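To round out the header, here is an editor's sketch of SIMDE_ALIGN_ASSUME_LIKE in use (hypothetical names; the caller is assumed to really provide a 16-byte-aligned source):

#include <string.h>

typedef struct {
	SIMDE_ALIGN_TO_16 float f32[4];
} example_aligned4f;

static void example_copy_block(example_aligned4f *dst, const void *src16)
{
	/* `src16` is promised to be 16-byte aligned; let the compiler assume it. */
	memcpy(dst, SIMDE_ALIGN_ASSUME_LIKE(src16, example_aligned4f),
	       sizeof(*dst));
}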

+ 22 - 3
libobs/util/simde/simde-arch.h

@@ -27,14 +27,14 @@
  * an undefined macro being used (e.g., GCC with -Wundef).
  * an undefined macro being used (e.g., GCC with -Wundef).
  *
  *
  * This was originally created for SIMDe
  * This was originally created for SIMDe
- * <https://github.com/nemequ/simde> (hence the prefix), but this
+ * <https://github.com/simd-everywhere/simde> (hence the prefix), but this
  * header has no dependencies and may be used anywhere.  It is
  * header has no dependencies and may be used anywhere.  It is
  * originally based on information from
  * originally based on information from
  * <https://sourceforge.net/p/predef/wiki/Architectures/>, though it
  * <https://sourceforge.net/p/predef/wiki/Architectures/>, though it
  * has been enhanced with additional information.
  * has been enhanced with additional information.
  *
  *
  * If you improve this file, or find a bug, please file the issue at
  * If you improve this file, or find a bug, please file the issue at
- * <https://github.com/nemequ/simde/issues>.  If you copy this into
+ * <https://github.com/simd-everywhere/simde/issues>.  If you copy this into
  * your project, even if you change the prefix, please keep the links
  * your project, even if you change the prefix, please keep the links
  * to SIMDe intact so others know where to report issues, submit
  * to SIMDe intact so others know where to report issues, submit
  * enhancements, and find the latest version. */
  * enhancements, and find the latest version. */
@@ -70,7 +70,7 @@
 /* AMD64 / x86_64
 /* AMD64 / x86_64
    <https://en.wikipedia.org/wiki/X86-64> */
    <https://en.wikipedia.org/wiki/X86-64> */
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \
-	defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64)
+	defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
 #define SIMDE_ARCH_AMD64 1000
 #define SIMDE_ARCH_AMD64 1000
 #endif
 #endif
 
 
@@ -125,6 +125,9 @@
 #define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM
 #define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM
 #endif
 #endif
 #endif
 #endif
+#if defined(__ARM_FEATURE_SVE)
+#define SIMDE_ARCH_ARM_SVE
+#endif
 
 
 /* Blackfin
 /* Blackfin
    <https://en.wikipedia.org/wiki/Blackfin> */
    <https://en.wikipedia.org/wiki/Blackfin> */
@@ -276,6 +279,12 @@
 #define SIMDE_ARCH_X86_AVX 1
 #define SIMDE_ARCH_X86_AVX 1
 #endif
 #endif
 #endif
 #endif
+#if defined(__AVX512VP2INTERSECT__)
+#define SIMDE_ARCH_X86_AVX512VP2INTERSECT 1
+#endif
+#if defined(__AVX512VBMI__)
+#define SIMDE_ARCH_X86_AVX512VBMI 1
+#endif
 #if defined(__AVX512BW__)
 #if defined(__AVX512BW__)
 #define SIMDE_ARCH_X86_AVX512BW 1
 #define SIMDE_ARCH_X86_AVX512BW 1
 #endif
 #endif
@@ -294,6 +303,12 @@
 #if defined(__GFNI__)
 #if defined(__GFNI__)
 #define SIMDE_ARCH_X86_GFNI 1
 #define SIMDE_ARCH_X86_GFNI 1
 #endif
 #endif
+#if defined(__PCLMUL__)
+#define SIMDE_ARCH_X86_PCLMUL 1
+#endif
+#if defined(__VPCLMULQDQ__)
+#define SIMDE_ARCH_X86_VPCLMULQDQ 1
+#endif
 #endif
 #endif
 
 
 /* Itanium
 /* Itanium
@@ -363,6 +378,10 @@
 #define SIMDE_ARCH_MIPS_CHECK(version) (0)
 #define SIMDE_ARCH_MIPS_CHECK(version) (0)
 #endif
 #endif
 
 
+#if defined(__mips_loongson_mmi)
+#define SIMDE_ARCH_MIPS_LOONGSON_MMI 1
+#endif
+
 /* Matsushita MN10300
 /* Matsushita MN10300
    <https://en.wikipedia.org/wiki/MN103> */
    <https://en.wikipedia.org/wiki/MN103> */
 #if defined(__MN10300__) || defined(__mn10300__)
 #if defined(__MN10300__) || defined(__mn10300__)

+ 329 - 162
libobs/util/simde/simde-common.h

@@ -30,63 +30,104 @@
 #include "hedley.h"
 #include "hedley.h"
 
 
 #define SIMDE_VERSION_MAJOR 0
 #define SIMDE_VERSION_MAJOR 0
-#define SIMDE_VERSION_MINOR 5
-#define SIMDE_VERSION_MICRO 0
+#define SIMDE_VERSION_MINOR 7
+#define SIMDE_VERSION_MICRO 1
 #define SIMDE_VERSION                                                   \
 #define SIMDE_VERSION                                                   \
 	HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, \
 	HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, \
 			      SIMDE_VERSION_MICRO)
 			      SIMDE_VERSION_MICRO)
 
 
-#include "simde-arch.h"
-#include "simde-features.h"
-#include "simde-diagnostic.h"
-
 #include <stddef.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdint.h>
 
 
-#if HEDLEY_HAS_ATTRIBUTE(aligned) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \
-	HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||                              \
-	HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||                              \
-	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
-	HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||                              \
-	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
-	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                            \
-	HEDLEY_TI_VERSION_CHECK(8, 1, 0)
-#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
-#elif defined(_MSC_VER) && !(defined(_M_ARM) && !defined(_M_ARM64))
-#define SIMDE_ALIGN(alignment) __declspec(align(alignment))
-#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
-#define SIMDE_ALIGN(alignment) _Alignas(alignment)
-#elif defined(__cplusplus) && (__cplusplus >= 201103L)
-#define SIMDE_ALIGN(alignment) alignas(alignment)
-#else
-#define SIMDE_ALIGN(alignment)
-#endif
-
-#if HEDLEY_GNUC_VERSION_CHECK(2, 95, 0) ||   \
-	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \
-	HEDLEY_IBM_VERSION_CHECK(11, 1, 0)
-#define SIMDE_ALIGN_OF(T) (__alignof__(T))
-#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
-	HEDLEY_HAS_FEATURE(c11_alignof)
-#define SIMDE_ALIGN_OF(T) (_Alignof(T))
-#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
-	HEDLEY_HAS_FEATURE(cxx_alignof)
-#define SIMDE_ALIGN_OF(T) (alignof(T))
-#endif
-
-#if defined(SIMDE_ALIGN_OF)
-#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(SIMDE_ALIGN_OF(T))
-#else
-#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(N)
+#include "simde-detect-clang.h"
+#include "simde-arch.h"
+#include "simde-features.h"
+#include "simde-diagnostic.h"
+#include "simde-math.h"
+#include "simde-constify.h"
+#include "simde-align.h"
+
+/* In some situations, SIMDe has to make large performance sacrifices
+ * for small increases in how faithfully it reproduces an API, but
+ * only a relatively small number of users will actually need the API
+ * to be completely accurate.  The SIMDE_FAST_* options can be used to
+ * disable these trade-offs.
+ *
+ * They can be enabled by passing -DSIMDE_FAST_MATH to the compiler, or
+ * the individual defines (e.g., -DSIMDE_FAST_NANS) if you only want to
+ * enable some optimizations.  Using -ffast-math and/or
+ * -ffinite-math-only will also enable the relevant options.  If you
+ * don't want that you can pass -DSIMDE_NO_FAST_* to disable them. */
+
+/* Most programs avoid NaNs by never passing values which can result in
+ * a NaN; for example, if you only pass non-negative values to the sqrt
+ * functions, it won't generate a NaN.  On some platforms, similar
+ * functions handle NaNs differently; for example, the _mm_min_ps SSE
+ * function will return 0.0 if you pass it (0.0, NaN), but the NEON
+ * vminq_f32 function will return NaN.  Making them behave like one
+ * another is expensive; it requires generating a mask of all lanes
+ * with NaNs, then performing the operation (e.g., vminq_f32), then
+ * blending together the result with another vector using the mask.
+ *
+ * If you don't want SIMDe to worry about the differences between how
+ * NaNs are handled on the two platforms, define this (or pass
+ * -ffinite-math-only) */
+#if !defined(SIMDE_FAST_MATH) && !defined(SIMDE_NO_FAST_MATH) && \
+	defined(__FAST_MATH__)
+#define SIMDE_FAST_MATH
+#endif
+
+#if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_NO_FAST_NANS)
+#if defined(SIMDE_FAST_MATH)
+#define SIMDE_FAST_NANS
+#elif defined(__FINITE_MATH_ONLY__)
+#if __FINITE_MATH_ONLY__
+#define SIMDE_FAST_NANS
+#endif
+#endif
+#endif
+
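A scalar sketch (editor's illustration, not SIMDe's implementation of any function) of the check-then-blend fix-up the comment above describes, versus the cheap path SIMDE_FAST_NANS allows:

#include <math.h>

static float example_min_faithful(float a, float b)
{
#if defined(SIMDE_FAST_NANS)
	return fminf(a, b); /* NaNs assumed absent, any native min will do */
#else
	/* Fix-up step: detect NaN operands first, then pick whichever operand
	 * the emulated API would have returned (shown here as `b`, purely
	 * for illustration). */
	if (isnan(a) || isnan(b))
		return b;
	return fminf(a, b);
#endif
}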
+/* Many functions are defined as using the current rounding mode
+ * (i.e., the SIMD version of fegetround()) when converting to
+ * an integer.  For example, _mm_cvtpd_epi32.  Unfortunately,
+ * on some platforms (such as ARMv8+ where round-to-nearest is
+ * always used, regardless of the FPSCR register) this means we
+ * have to first query the current rounding mode, then choose
+ * the proper function (round, ceil, floor, etc.) */
+#if !defined(SIMDE_FAST_ROUND_MODE) && !defined(SIMDE_NO_FAST_ROUND_MODE) && \
+	defined(SIMDE_FAST_MATH)
+#define SIMDE_FAST_ROUND_MODE
+#endif
+
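For a concrete (editor-supplied, scalar) picture of that query-then-dispatch cost, the pattern looks roughly like this; SIMDE_FAST_ROUND_MODE simply skips the fegetround() detour:

#include <fenv.h>
#include <math.h>

static double example_round_current_mode(double v)
{
#if defined(SIMDE_FAST_ROUND_MODE)
	return nearbyint(v); /* assume the default round-to-nearest */
#else
	switch (fegetround()) { /* the FE_* macros are assumed available */
	case FE_DOWNWARD:
		return floor(v);
	case FE_UPWARD:
		return ceil(v);
	case FE_TOWARDZERO:
		return trunc(v);
	default:
		return nearbyint(v); /* FE_TONEAREST or unknown */
	}
#endif
}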
+/* This controls how ties are rounded.  For example, does 10.5 round to
+ * 10 or 11?  IEEE 754 specifies round-towards-even, but ARMv7 (for
+ * example) doesn't support it and it must be emulated (which is rather
+ * slow).  If you're okay with just using the default for whatever arch
+ * you're on, you should definitely define this.
+ *
+ * Note that we don't use this macro to avoid correct implementations
+ * in functions which are explicitly about rounding (such as vrnd* on
+ * NEON, _mm_round_* on x86, etc.); it is only used for code where
+ * rounding is a component in another function, and even then it isn't
+ * usually a problem since such functions will use the current rounding
+ * mode. */
+#if !defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_NO_FAST_ROUND_TIES) && \
+	defined(SIMDE_FAST_MATH)
+#define SIMDE_FAST_ROUND_TIES
+#endif
+
+/* For functions which convert from one type to another (mostly from
+ * floating point to integer types), sometimes we need to do a range
+ * check and potentially return a different result if the value
+ * falls outside that range.  Skipping this check can provide a
+ * performance boost, at the expense of faithfulness to the API we're
+ * emulating. */
+#if !defined(SIMDE_FAST_CONVERSION_RANGE) && \
+	!defined(SIMDE_NO_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_MATH)
+#define SIMDE_FAST_CONVERSION_RANGE
 #endif
 #endif
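An editor's sketch of what that range check costs for a single float-to-int32_t conversion (out-of-range float-to-integer conversion is undefined behaviour in C, and hardware disagrees on the result):

#include <math.h>
#include <stdint.h>

static int32_t example_cvt_f32_i32(float v)
{
#if defined(SIMDE_FAST_CONVERSION_RANGE)
	return (int32_t)v; /* trust the caller to stay in range */
#else
	if (isnan(v) || v < -2147483648.0f || v >= 2147483648.0f)
		return INT32_MIN; /* mimic x86's "integer indefinite" value */
	return (int32_t)v;
#endif
}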
 
 
-#define simde_assert_aligned(alignment, val)                                \
-	simde_assert_int(HEDLEY_REINTERPRET_CAST(                           \
-				 uintptr_t, HEDLEY_REINTERPRET_CAST(        \
-						    const void *, (val))) % \
-				 (alignment),                               \
-			 ==, 0)
-
 #if HEDLEY_HAS_BUILTIN(__builtin_constant_p) ||                             \
 #if HEDLEY_HAS_BUILTIN(__builtin_constant_p) ||                             \
 	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                                \
 	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                                \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                             \
 	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                             \
@@ -102,15 +143,21 @@
 #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated())
 #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated())
 #endif
 #endif
 
 
-/* diagnose_if + __builtin_constant_p was broken until clang 9,
- * which is when __FILE_NAME__ was added. */
-#if defined(SIMDE_CHECK_CONSTANT_) && defined(__FILE_NAME__)
+#if !defined(SIMDE_NO_CHECK_IMMEDIATE_CONSTANT)
+#if defined(SIMDE_CHECK_CONSTANT_) &&                \
+	SIMDE_DETECT_CLANG_VERSION_CHECK(9, 0, 0) && \
+	(!defined(__apple_build_version__) ||        \
+	 ((__apple_build_version__ < 11000000) ||    \
+	  (__apple_build_version__ >= 12000000)))
 #define SIMDE_REQUIRE_CONSTANT(arg)                    \
 #define SIMDE_REQUIRE_CONSTANT(arg)                    \
 	HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), \
 	HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), \
 			   "`" #arg "' must be constant")
 			   "`" #arg "' must be constant")
 #else
 #else
 #define SIMDE_REQUIRE_CONSTANT(arg)
 #define SIMDE_REQUIRE_CONSTANT(arg)
 #endif
 #endif
+#else
+#define SIMDE_REQUIRE_CONSTANT(arg)
+#endif
 
 
 #define SIMDE_REQUIRE_RANGE(arg, min, max)                         \
 #define SIMDE_REQUIRE_RANGE(arg, min, max)                         \
 	HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), \
 	HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), \
@@ -120,39 +167,20 @@
 	SIMDE_REQUIRE_CONSTANT(arg)                 \
 	SIMDE_REQUIRE_CONSTANT(arg)                 \
 	SIMDE_REQUIRE_RANGE(arg, min, max)
 	SIMDE_REQUIRE_RANGE(arg, min, max)
 
 
-/* SIMDE_ASSUME_ALIGNED allows you to (try to) tell the compiler
- * that a pointer is aligned to an `alignment`-byte boundary. */
-#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
-	HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
-#define SIMDE_ASSUME_ALIGNED(alignment, v)     \
-	HEDLEY_REINTERPRET_CAST(__typeof__(v), \
-				__builtin_assume_aligned(v, alignment))
-#elif defined(__cplusplus) && (__cplusplus > 201703L)
-#define SIMDE_ASSUME_ALIGNED(alignment, v) std::assume_aligned<alignment>(v)
-#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
-#define SIMDE_ASSUME_ALIGNED(alignment, v)                            \
-	(__extension__({                                              \
-		__typeof__(v) simde_assume_aligned_t_ = (v);          \
-		__assume_aligned(simde_assume_aligned_t_, alignment); \
-		simde_assume_aligned_t_;                              \
-	}))
-#else
-#define SIMDE_ASSUME_ALIGNED(alignment, v) (v)
-#endif
-
-/* SIMDE_ALIGN_CAST allows you to convert to a type with greater
- * aligment requirements without triggering a warning. */
-#if HEDLEY_HAS_WARNING("-Wcast-align")
-#define SIMDE_ALIGN_CAST(T, v)                                       \
-	(__extension__({                                             \
-		HEDLEY_DIAGNOSTIC_PUSH                               \
-		_Pragma("clang diagnostic ignored \"-Wcast-align\"") \
-			T simde_r_ = HEDLEY_REINTERPRET_CAST(T, v);  \
-		HEDLEY_DIAGNOSTIC_POP                                \
-		simde_r_;                                            \
-	}))
-#else
-#define SIMDE_ALIGN_CAST(T, v) HEDLEY_REINTERPRET_CAST(T, v)
+/* A copy of HEDLEY_STATIC_ASSERT, except we don't define an empty
+ * fallback if we can't find an implementation; instead we have to
+ * check if SIMDE_STATIC_ASSERT is defined before using it. */
+#if !defined(__cplusplus) &&                                             \
+	((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
+	 HEDLEY_HAS_FEATURE(c_static_assert) ||                          \
+	 HEDLEY_GCC_VERSION_CHECK(6, 0, 0) ||                            \
+	 HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert))
+#define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
+	HEDLEY_MSVC_VERSION_CHECK(16, 0, 0)
+#define SIMDE_STATIC_ASSERT(expr, message)            \
+	HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
+		static_assert(expr, message))
 #endif
 #endif
 
 
 #if (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \
 #if (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \
@@ -170,6 +198,7 @@
 
 
     * SIMDE_VECTOR - Declaring a vector.
     * SIMDE_VECTOR - Declaring a vector.
     * SIMDE_VECTOR_OPS - basic operations (binary and unary).
     * SIMDE_VECTOR_OPS - basic operations (binary and unary).
+    * SIMDE_VECTOR_NEGATE - negating a vector
     * SIMDE_VECTOR_SCALAR - For binary operators, the second argument
     * SIMDE_VECTOR_SCALAR - For binary operators, the second argument
         can be a scalar, in which case the result is as if that scalar
         can be a scalar, in which case the result is as if that scalar
         had been broadcast to all lanes of a vector.
         had been broadcast to all lanes of a vector.
@@ -182,11 +211,13 @@
 #if HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
 #if HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
 #define SIMDE_VECTOR_OPS
 #define SIMDE_VECTOR_OPS
+#define SIMDE_VECTOR_NEGATE
 #define SIMDE_VECTOR_SCALAR
 #define SIMDE_VECTOR_SCALAR
 #define SIMDE_VECTOR_SUBSCRIPT
 #define SIMDE_VECTOR_SUBSCRIPT
 #elif HEDLEY_INTEL_VERSION_CHECK(16, 0, 0)
 #elif HEDLEY_INTEL_VERSION_CHECK(16, 0, 0)
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
 #define SIMDE_VECTOR_OPS
 #define SIMDE_VECTOR_OPS
+#define SIMDE_VECTOR_NEGATE
 /* ICC only supports SIMDE_VECTOR_SCALAR for constants */
 /* ICC only supports SIMDE_VECTOR_SCALAR for constants */
 #define SIMDE_VECTOR_SUBSCRIPT
 #define SIMDE_VECTOR_SUBSCRIPT
 #elif HEDLEY_GCC_VERSION_CHECK(4, 1, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
 #elif HEDLEY_GCC_VERSION_CHECK(4, 1, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
@@ -197,8 +228,9 @@
 #elif HEDLEY_HAS_ATTRIBUTE(vector_size)
 #elif HEDLEY_HAS_ATTRIBUTE(vector_size)
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
 #define SIMDE_VECTOR_OPS
 #define SIMDE_VECTOR_OPS
+#define SIMDE_VECTOR_NEGATE
 #define SIMDE_VECTOR_SUBSCRIPT
 #define SIMDE_VECTOR_SUBSCRIPT
-#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) /* clang 4.0 */
+#if SIMDE_DETECT_CLANG_VERSION_CHECK(5, 0, 0)
 #define SIMDE_VECTOR_SCALAR
 #define SIMDE_VECTOR_SCALAR
 #endif
 #endif
 #endif
 #endif
@@ -281,27 +313,34 @@ HEDLEY_DIAGNOSTIC_POP
 #endif
 #endif
 
 
 #if defined(SIMDE_ENABLE_OPENMP)
 #if defined(SIMDE_ENABLE_OPENMP)
-#define SIMDE_VECTORIZE _Pragma("omp simd")
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(omp simd)
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
+#if defined(__clang__)
+#define SIMDE_VECTORIZE_REDUCTION(r)                              \
+	HEDLEY_DIAGNOSTIC_PUSH                                    \
+	_Pragma("clang diagnostic ignored \"-Wsign-conversion\"") \
+		HEDLEY_PRAGMA(omp simd reduction(r)) HEDLEY_DIAGNOSTIC_POP
+#else
 #define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
 #define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
+#endif
 #define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
 #define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
 #elif defined(SIMDE_ENABLE_CILKPLUS)
 #elif defined(SIMDE_ENABLE_CILKPLUS)
-#define SIMDE_VECTORIZE _Pragma("simd")
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(simd)
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
 #define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
 #define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
 #define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
 #define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
 #elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION)
 #elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION)
-#define SIMDE_VECTORIZE _Pragma("clang loop vectorize(enable)")
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(clang loop vectorize(enable))
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_ALIGNED(a)
 #define SIMDE_VECTORIZE_ALIGNED(a)
 #elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
 #elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
-#define SIMDE_VECTORIZE _Pragma("GCC ivdep")
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep)
 #define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_ALIGNED(a)
 #define SIMDE_VECTORIZE_ALIGNED(a)
 #elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
 #elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
-#define SIMDE_VECTORIZE _Pragma("_CRI ivdep")
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(_CRI ivdep)
 #define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
 #define SIMDE_VECTORIZE_ALIGNED(a)
 #define SIMDE_VECTORIZE_ALIGNED(a)
@@ -350,20 +389,10 @@ HEDLEY_DIAGNOSTIC_POP
 	HEDLEY_DIAGNOSTIC_POP
 	HEDLEY_DIAGNOSTIC_POP
 #endif
 #endif
 
 
-#if HEDLEY_HAS_WARNING("-Wpedantic")
-#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \
-	_Pragma("clang diagnostic ignored \"-Wpedantic\"")
-#elif defined(HEDLEY_GCC_VERSION)
-#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \
-	_Pragma("GCC diagnostic ignored \"-Wpedantic\"")
-#else
-#define SIMDE_DIAGNOSTIC_DISABLE_INT128
-#endif
-
 #if defined(__SIZEOF_INT128__)
 #if defined(__SIZEOF_INT128__)
 #define SIMDE_HAVE_INT128_
 #define SIMDE_HAVE_INT128_
 HEDLEY_DIAGNOSTIC_PUSH
 HEDLEY_DIAGNOSTIC_PUSH
-SIMDE_DIAGNOSTIC_DISABLE_INT128
+SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_
 typedef __int128 simde_int128;
 typedef __int128 simde_int128;
 typedef unsigned __int128 simde_uint128;
 typedef unsigned __int128 simde_uint128;
 HEDLEY_DIAGNOSTIC_POP
 HEDLEY_DIAGNOSTIC_POP
@@ -488,39 +517,6 @@ typedef SIMDE_FLOAT32_TYPE simde_float32;
 #endif
 #endif
 typedef SIMDE_FLOAT64_TYPE simde_float64;
 typedef SIMDE_FLOAT64_TYPE simde_float64;
 
 
-/* Whether to assume that the compiler can auto-vectorize reasonably
-   well.  This will cause SIMDe to attempt to compose vector
-   operations using more simple vector operations instead of minimize
-   serial work.
-
-   As an example, consider the _mm_add_ss(a, b) function from SSE,
-   which returns { a0 + b0, a1, a2, a3 }.  This pattern is repeated
-   for other operations (sub, mul, etc.).
-
-   The naïve implementation would result in loading a0 and b0, adding
-   them into a temporary variable, then splicing that value into a new
-   vector with the remaining elements from a.
-
-   On platforms which support vectorization, it's generally faster to
-   simply perform the operation on the entire vector to avoid having
-   to move data between SIMD registers and non-SIMD registers.
-   Basically, instead of the temporary variable being (a0 + b0) it
-   would be a vector of (a + b), which is then combined with a to form
-   the result.
-
-   By default, SIMDe will prefer the pure-vector versions if we detect
-   a vector ISA extension, but this can be overridden by defining
-   SIMDE_NO_ASSUME_VECTORIZATION.  You can also define
-   SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the
-   vectorized version. */
-#if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \
-	!defined(SIMDE_ASSUME_VECTORIZATION)
-#if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \
-	defined(__ALTIVEC__) || defined(__wasm_simd128__)
-#define SIMDE_ASSUME_VECTORIZATION
-#endif
-#endif
-
 #if HEDLEY_HAS_WARNING("-Wbad-function-cast")
 #if HEDLEY_HAS_WARNING("-Wbad-function-cast")
 #define SIMDE_CONVERT_FTOI(T, v)                                    \
 #define SIMDE_CONVERT_FTOI(T, v)                                    \
 	HEDLEY_DIAGNOSTIC_PUSH                                      \
 	HEDLEY_DIAGNOSTIC_PUSH                                      \
@@ -530,11 +526,18 @@ typedef SIMDE_FLOAT64_TYPE simde_float64;
 #define SIMDE_CONVERT_FTOI(T, v) ((T)(v))
 #define SIMDE_CONVERT_FTOI(T, v) ((T)(v))
 #endif
 #endif
 
 
+/* TODO: detect compilers which support this outside of C11 mode */
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
-	(_Generic((value), to : (value), from : ((to)(value))))
+	_Generic((value), to                            \
+		 : (value), default                     \
+		 : (_Generic((value), from              \
+			     : ((to)(value)))))
 #define SIMDE_CHECKED_STATIC_CAST(to, from, value) \
 #define SIMDE_CHECKED_STATIC_CAST(to, from, value) \
-	(_Generic((value), to : (value), from : ((to)(value))))
+	_Generic((value), to                       \
+		 : (value), default                \
+		 : (_Generic((value), from         \
+			     : ((to)(value)))))
 #else
 #else
 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
 	HEDLEY_REINTERPRET_CAST(to, value)
 	HEDLEY_REINTERPRET_CAST(to, value)
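As an editor's sketch of what the C11 _Generic form above buys: the macro only compiles when the value really has the stated source (or destination) type, so a type change elsewhere surfaces as a compile error instead of a silent conversion:

#include <stdint.h>

static int64_t example_widen(int32_t v)
{
	/* Fine: `v` matches the declared `from` type. */
	return SIMDE_CHECKED_STATIC_CAST(int64_t, int32_t, v);
	/* Under the _Generic form, SIMDE_CHECKED_STATIC_CAST(int64_t,
	 * int32_t, 1.0f) would fail to compile, since no association
	 * matches float. */
}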
@@ -564,7 +567,7 @@ typedef SIMDE_FLOAT64_TYPE simde_float64;
 #if defined(__STDC_HOSTED__)
 #if defined(__STDC_HOSTED__)
 #define SIMDE_STDC_HOSTED __STDC_HOSTED__
 #define SIMDE_STDC_HOSTED __STDC_HOSTED__
 #else
 #else
-#if defined(HEDLEY_PGI_VERSION_CHECK) || defined(HEDLEY_MSVC_VERSION_CHECK)
+#if defined(HEDLEY_PGI_VERSION) || defined(HEDLEY_MSVC_VERSION)
 #define SIMDE_STDC_HOSTED 1
 #define SIMDE_STDC_HOSTED 1
 #else
 #else
 #define SIMDE_STDC_HOSTED 0
 #define SIMDE_STDC_HOSTED 0
@@ -572,23 +575,34 @@ typedef SIMDE_FLOAT64_TYPE simde_float64;
 #endif
 #endif
 
 
 /* Try to deal with environments without a standard library. */
 /* Try to deal with environments without a standard library. */
-#if !defined(simde_memcpy) || !defined(simde_memset)
-#if !defined(SIMDE_NO_STRING_H) && defined(__has_include)
-#if __has_include(<string.h>)
-#include <string.h>
 #if !defined(simde_memcpy)
 #if !defined(simde_memcpy)
-#define simde_memcpy(dest, src, n) memcpy(dest, src, n)
+#if HEDLEY_HAS_BUILTIN(__builtin_memcpy)
+#define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n)
+#endif
 #endif
 #endif
 #if !defined(simde_memset)
 #if !defined(simde_memset)
-#define simde_memset(s, c, n) memset(s, c, n)
+#if HEDLEY_HAS_BUILTIN(__builtin_memset)
+#define simde_memset(s, c, n) __builtin_memset(s, c, n)
 #endif
 #endif
-#else
+#endif
+#if !defined(simde_memcmp)
+#if HEDLEY_HAS_BUILTIN(__builtin_memcmp)
+#define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n)
+#endif
+#endif
+
+#if !defined(simde_memcpy) || !defined(simde_memset) || !defined(simde_memcmp)
+#if !defined(SIMDE_NO_STRING_H)
+#if defined(__has_include)
+#if !__has_include(<string.h>)
 #define SIMDE_NO_STRING_H
 #define SIMDE_NO_STRING_H
 #endif
 #endif
+#elif (SIMDE_STDC_HOSTED == 0)
+#define SIMDE_NO_STRING_H
 #endif
 #endif
 #endif
 #endif
-#if !defined(simde_memcpy) || !defined(simde_memset)
-#if !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1)
+
+#if !defined(SIMDE_NO_STRING_H)
 #include <string.h>
 #include <string.h>
 #if !defined(simde_memcpy)
 #if !defined(simde_memcpy)
 #define simde_memcpy(dest, src, n) memcpy(dest, src, n)
 #define simde_memcpy(dest, src, n) memcpy(dest, src, n)
@@ -596,14 +610,8 @@ typedef SIMDE_FLOAT64_TYPE simde_float64;
 #if !defined(simde_memset)
 #if !defined(simde_memset)
 #define simde_memset(s, c, n) memset(s, c, n)
 #define simde_memset(s, c, n) memset(s, c, n)
 #endif
 #endif
-#elif (HEDLEY_HAS_BUILTIN(__builtin_memcpy) &&  \
-       HEDLEY_HAS_BUILTIN(__builtin_memset)) || \
-	HEDLEY_GCC_VERSION_CHECK(4, 2, 0)
-#if !defined(simde_memcpy)
-#define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n)
-#endif
-#if !defined(simde_memset)
-#define simde_memset(s, c, n) __builtin_memset(s, c, n)
+#if !defined(simde_memcmp)
+#define simde_memcmp(s1, s2, n) memcmp(s1, s2, n)
 #endif
 #endif
 #else
 #else
 /* These are meant to be portable, not fast.  If you're hitting them you
 /* These are meant to be portable, not fast.  If you're hitting them you
@@ -637,10 +645,24 @@ void simde_memset_(void *s, int c, size_t len)
 }
 }
 #define simde_memset(s, c, n) simde_memset_(s, c, n)
 #define simde_memset(s, c, n) simde_memset_(s, c, n)
 #endif
 #endif
-#endif /* !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1) */
-#endif /* !defined(simde_memcpy) || !defined(simde_memset) */
 
 
-#include "simde-math.h"
+#if !defined(simde_memcmp)
+SIMDE_FUNCTION_ATTRIBUTES
+int simde_memcmp_(const void *s1, const void *s2, size_t n)
+{
+	const unsigned char *s1_ =
+		HEDLEY_STATIC_CAST(const unsigned char *, s1);
+	const unsigned char *s2_ =
+		HEDLEY_STATIC_CAST(const unsigned char *, s2);
+	for (size_t i = 0; i < n; i++) {
+		if (s1_[i] != s2_[i]) {
+			return (int)(s1_[i] - s2_[i]);
+		}
+	}
+	return 0;
+}
+#define simde_memcmp(s1, s2, n) simde_memcmp_(s1, s2, n)
+#endif
+#endif
+#endif
 
 
 #if defined(FE_ALL_EXCEPT)
 #if defined(FE_ALL_EXCEPT)
 #define SIMDE_HAVE_FENV_H
 #define SIMDE_HAVE_FENV_H
@@ -682,6 +704,105 @@ void simde_memset_(void *s, int c, size_t len)
 
 
 #include "check.h"
 #include "check.h"
 
 
+/* GCC/clang have a bunch of functionality in builtins which we would
+ * like to access, but the suffixes indicate whether they operate on
+ * int, long, or long long, not fixed width types (e.g., int32_t).
+ * We use these macros to attempt to map from fixed-width to the
+ * names GCC uses.  Note that you should still cast the input(s) and
+ * return values (to/from SIMDE_BUILTIN_TYPE_*_) since often even if
+ * types are the same size they may not be compatible according to the
+ * compiler.  For example, on x86 long and long long are generally
+ * both 64 bits, but platforms vary on whether an int64_t is mapped
+ * to a long or long long. */
+
+#include <limits.h>
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
+
+#if (INT8_MAX == INT_MAX) && (INT8_MIN == INT_MIN)
+#define SIMDE_BUILTIN_SUFFIX_8_
+#define SIMDE_BUILTIN_TYPE_8_ int
+#elif (INT8_MAX == LONG_MAX) && (INT8_MIN == LONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_8_ l
+#define SIMDE_BUILTIN_TYPE_8_ long
+#elif (INT8_MAX == LLONG_MAX) && (INT8_MIN == LLONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_8_ ll
+#define SIMDE_BUILTIN_TYPE_8_ long long
+#endif
+
+#if (INT16_MAX == INT_MAX) && (INT16_MIN == INT_MIN)
+#define SIMDE_BUILTIN_SUFFIX_16_
+#define SIMDE_BUILTIN_TYPE_16_ int
+#elif (INT16_MAX == LONG_MAX) && (INT16_MIN == LONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_16_ l
+#define SIMDE_BUILTIN_TYPE_16_ long
+#elif (INT16_MAX == LLONG_MAX) && (INT16_MIN == LLONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_16_ ll
+#define SIMDE_BUILTIN_TYPE_16_ long long
+#endif
+
+#if (INT32_MAX == INT_MAX) && (INT32_MIN == INT_MIN)
+#define SIMDE_BUILTIN_SUFFIX_32_
+#define SIMDE_BUILTIN_TYPE_32_ int
+#elif (INT32_MAX == LONG_MAX) && (INT32_MIN == LONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_32_ l
+#define SIMDE_BUILTIN_TYPE_32_ long
+#elif (INT32_MAX == LLONG_MAX) && (INT32_MIN == LLONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_32_ ll
+#define SIMDE_BUILTIN_TYPE_32_ long long
+#endif
+
+#if (INT64_MAX == INT_MAX) && (INT64_MIN == INT_MIN)
+#define SIMDE_BUILTIN_SUFFIX_64_
+#define SIMDE_BUILTIN_TYPE_64_ int
+#elif (INT64_MAX == LONG_MAX) && (INT64_MIN == LONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_64_ l
+#define SIMDE_BUILTIN_TYPE_64_ long
+#elif (INT64_MAX == LLONG_MAX) && (INT64_MIN == LLONG_MIN)
+#define SIMDE_BUILTIN_SUFFIX_64_ ll
+#define SIMDE_BUILTIN_TYPE_64_ long long
+#endif
+
+#if defined(SIMDE_BUILTIN_SUFFIX_8_)
+#define SIMDE_BUILTIN_8_(name) \
+	HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)
+#define SIMDE_BUILTIN_HAS_8_(name) \
+	HEDLEY_HAS_BUILTIN(        \
+		HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_))
+#else
+#define SIMDE_BUILTIN_HAS_8_(name) 0
+#endif
+#if defined(SIMDE_BUILTIN_SUFFIX_16_)
+#define SIMDE_BUILTIN_16_(name) \
+	HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_)
+#define SIMDE_BUILTIN_HAS_16_(name) \
+	HEDLEY_HAS_BUILTIN(         \
+		HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_))
+#else
+#define SIMDE_BUILTIN_HAS_16_(name) 0
+#endif
+#if defined(SIMDE_BUILTIN_SUFFIX_32_)
+#define SIMDE_BUILTIN_32_(name) \
+	HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_)
+#define SIMDE_BUILTIN_HAS_32_(name) \
+	HEDLEY_HAS_BUILTIN(         \
+		HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_))
+#else
+#define SIMDE_BUILTIN_HAS_32_(name) 0
+#endif
+#if defined(SIMDE_BUILTIN_SUFFIX_64_)
+#define SIMDE_BUILTIN_64_(name) \
+	HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_)
+#define SIMDE_BUILTIN_HAS_64_(name) \
+	HEDLEY_HAS_BUILTIN(         \
+		HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_))
+#else
+#define SIMDE_BUILTIN_HAS_64_(name) 0
+#endif
+
+HEDLEY_DIAGNOSTIC_POP
+
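A hedged usage sketch (hypothetical helper name): pick the __builtin_ffs* variant whose plain C type matches int64_t on the current platform, casting through SIMDE_BUILTIN_TYPE_64_ as the comment above advises, with a portable loop as the fallback:

#include <stdint.h>

static int example_ffs_i64(int64_t v)
{
#if SIMDE_BUILTIN_HAS_64_(ffs)
	return SIMDE_BUILTIN_64_(ffs)(
		HEDLEY_STATIC_CAST(SIMDE_BUILTIN_TYPE_64_, v));
#else
	/* 1-based index of the lowest set bit, 0 when no bit is set. */
	int idx = 1;
	if (v == 0)
		return 0;
	while (!(v & 1)) {
		v >>= 1;
		idx++;
	}
	return idx;
#endif
}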
 /* Sometimes we run into problems with specific versions of compilers
 /* Sometimes we run into problems with specific versions of compilers
    which make the native versions unusable for us.  Often this is due
    which make the native versions unusable for us.  Often this is due
    to missing functions, sometimes buggy implementations, etc.  These
    to missing functions, sometimes buggy implementations, etc.  These
@@ -712,29 +833,75 @@ void simde_memset_(void *s, int c, size_t len)
 #if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
 #if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
 #define SIMDE_BUG_GCC_94482
 #define SIMDE_BUG_GCC_94482
 #endif
 #endif
+#if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || \
+	defined(SIMDE_ARCH_SYSTEMZ)
+#define SIMDE_BUG_GCC_53784
+#endif
+#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
+#if HEDLEY_GCC_VERSION_CHECK(4, 3, 0) /* -Wsign-conversion */
+#define SIMDE_BUG_GCC_95144
+#endif
+#endif
 #if !HEDLEY_GCC_VERSION_CHECK(9, 4, 0) && defined(SIMDE_ARCH_AARCH64)
 #if !HEDLEY_GCC_VERSION_CHECK(9, 4, 0) && defined(SIMDE_ARCH_AARCH64)
 #define SIMDE_BUG_GCC_94488
 #define SIMDE_BUG_GCC_94488
 #endif
 #endif
-#if defined(SIMDE_ARCH_POWER)
+#if defined(SIMDE_ARCH_ARM)
+#define SIMDE_BUG_GCC_95399
+#define SIMDE_BUG_GCC_95471
+#elif defined(SIMDE_ARCH_POWER)
 #define SIMDE_BUG_GCC_95227
 #define SIMDE_BUG_GCC_95227
+#define SIMDE_BUG_GCC_95782
+#elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
+#if !HEDLEY_GCC_VERSION_CHECK(10, 2, 0) && !defined(__OPTIMIZE__)
+#define SIMDE_BUG_GCC_96174
+#endif
 #endif
 #endif
 #define SIMDE_BUG_GCC_95399
 #define SIMDE_BUG_GCC_95399
 #elif defined(__clang__)
 #elif defined(__clang__)
 #if defined(SIMDE_ARCH_AARCH64)
 #if defined(SIMDE_ARCH_AARCH64)
 #define SIMDE_BUG_CLANG_45541
 #define SIMDE_BUG_CLANG_45541
+#define SIMDE_BUG_CLANG_46844
+#define SIMDE_BUG_CLANG_48257
+#if SIMDE_DETECT_CLANG_VERSION_CHECK(10, 0, 0) && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)
+#define SIMDE_BUG_CLANG_BAD_VI64_OPS
+#endif
+#endif
+#if defined(SIMDE_ARCH_POWER)
+#define SIMDE_BUG_CLANG_46770
+#endif
+#if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0) && \
+	!defined(__OPTIMIZE__)
+#define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT
+#endif
+#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
+#if HEDLEY_HAS_WARNING("-Wsign-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)
+#define SIMDE_BUG_CLANG_45931
+#endif
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)
+#define SIMDE_BUG_CLANG_44589
+#endif
 #endif
 #endif
+#define SIMDE_BUG_CLANG_45959
+#elif defined(HEDLEY_MSVC_VERSION)
+#if defined(SIMDE_ARCH_X86)
+#define SIMDE_BUG_MSVC_ROUND_EXTRACT
 #endif
 #endif
-#if defined(HEDLEY_EMSCRIPTEN_VERSION)
-#define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
-#define SIMDE_BUG_EMSCRIPTEN_5242
+#elif defined(HEDLEY_INTEL_VERSION)
+#define SIMDE_BUG_INTEL_857088
 #endif
 #endif
 #endif
 #endif
 
 
 /* GCC and Clang both have the same issue:
 /* GCC and Clang both have the same issue:
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144
  * https://bugs.llvm.org/show_bug.cgi?id=45931
  * https://bugs.llvm.org/show_bug.cgi?id=45931
+ * This is just an easy way to work around it.
  */
  */
-#if HEDLEY_HAS_WARNING("-Wsign-conversion") || HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
+#if (HEDLEY_HAS_WARNING("-Wsign-conversion") &&   \
+     SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)) || \
+	HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
 #define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr)                                      \
 #define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr)                                      \
 	(__extension__({                                                            \
 	(__extension__({                                                            \
 		HEDLEY_DIAGNOSTIC_PUSH                                              \
 		HEDLEY_DIAGNOSTIC_PUSH                                              \

+ 925 - 0
libobs/util/simde/simde-constify.h

@@ -0,0 +1,925 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ *   2020      Evan Nemerson <[email protected]>
+ */
+
+/* Constify macros.  For internal use only.
+ *
+ * These are used to make it possible to call a function which takes
+ * an Integer Constant Expression (ICE) using a compile time constant.
+ * Technically it would also be possible to use a value not trivially
+ * known by the compiler, but there would be a significant performance
+ * hit (a switch statement is used).
+ *
+ * The basic idea is pretty simple; we just emit a do while loop which
+ * contains a switch with a case for every possible value of the
+ * constant.
+ *
+ * As long as the value you pass to the function is constant, pretty
+ * much any compiler shouldn't have a problem generating exactly the
+ * same code as if you had used an ICE.
+ *
+ * This is intended to be used in the SIMDe implementations of
+ * functions whose arguments compilers require to be an ICE, but the
+ * other benefit is that if we also disable the warnings from
+ * SIMDE_REQUIRE_CONSTANT_RANGE we can actually just allow the tests
+ * to use non-ICE parameters.
+ */
+
+#if !defined(SIMDE_CONSTIFY_H)
+#define SIMDE_CONSTIFY_H
+
+#include "simde-diagnostic.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
+SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
+
+#define SIMDE_CONSTIFY_2_(func_name, result, default_case, imm, ...) \
+	do {                                                         \
+		switch (imm) {                                       \
+		case 0:                                              \
+			result = func_name(__VA_ARGS__, 0);          \
+			break;                                       \
+		case 1:                                              \
+			result = func_name(__VA_ARGS__, 1);          \
+			break;                                       \
+		default:                                             \
+			result = default_case;                       \
+			break;                                       \
+		}                                                    \
+	} while (0)
+
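To make the pattern concrete, here is an editor's sketch with hypothetical function names: example_get_lane() stands in for a function whose last argument would normally have to be an integer constant expression, and the constify macro lets an ordinary runtime variable drive it:

static int example_get_lane(const int *v, int lane)
{
	return v[lane]; /* pretend `lane` had to be an ICE */
}

static int example_get_lane_dyn(const int *v, int lane)
{
	int r;
	SIMDE_CONSTIFY_2_(example_get_lane, r, (v[0]), lane, v);
	return r;
}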
+#define SIMDE_CONSTIFY_4_(func_name, result, default_case, imm, ...) \
+	do {                                                         \
+		switch (imm) {                                       \
+		case 0:                                              \
+			result = func_name(__VA_ARGS__, 0);          \
+			break;                                       \
+		case 1:                                              \
+			result = func_name(__VA_ARGS__, 1);          \
+			break;                                       \
+		case 2:                                              \
+			result = func_name(__VA_ARGS__, 2);          \
+			break;                                       \
+		case 3:                                              \
+			result = func_name(__VA_ARGS__, 3);          \
+			break;                                       \
+		default:                                             \
+			result = default_case;                       \
+			break;                                       \
+		}                                                    \
+	} while (0)
+
+#define SIMDE_CONSTIFY_8_(func_name, result, default_case, imm, ...) \
+	do {                                                         \
+		switch (imm) {                                       \
+		case 0:                                              \
+			result = func_name(__VA_ARGS__, 0);          \
+			break;                                       \
+		case 1:                                              \
+			result = func_name(__VA_ARGS__, 1);          \
+			break;                                       \
+		case 2:                                              \
+			result = func_name(__VA_ARGS__, 2);          \
+			break;                                       \
+		case 3:                                              \
+			result = func_name(__VA_ARGS__, 3);          \
+			break;                                       \
+		case 4:                                              \
+			result = func_name(__VA_ARGS__, 4);          \
+			break;                                       \
+		case 5:                                              \
+			result = func_name(__VA_ARGS__, 5);          \
+			break;                                       \
+		case 6:                                              \
+			result = func_name(__VA_ARGS__, 6);          \
+			break;                                       \
+		case 7:                                              \
+			result = func_name(__VA_ARGS__, 7);          \
+			break;                                       \
+		default:                                             \
+			result = default_case;                       \
+			break;                                       \
+		}                                                    \
+	} while (0)
+
+#define SIMDE_CONSTIFY_16_(func_name, result, default_case, imm, ...) \
+	do {                                                          \
+		switch (imm) {                                        \
+		case 0:                                               \
+			result = func_name(__VA_ARGS__, 0);           \
+			break;                                        \
+		case 1:                                               \
+			result = func_name(__VA_ARGS__, 1);           \
+			break;                                        \
+		case 2:                                               \
+			result = func_name(__VA_ARGS__, 2);           \
+			break;                                        \
+		case 3:                                               \
+			result = func_name(__VA_ARGS__, 3);           \
+			break;                                        \
+		case 4:                                               \
+			result = func_name(__VA_ARGS__, 4);           \
+			break;                                        \
+		case 5:                                               \
+			result = func_name(__VA_ARGS__, 5);           \
+			break;                                        \
+		case 6:                                               \
+			result = func_name(__VA_ARGS__, 6);           \
+			break;                                        \
+		case 7:                                               \
+			result = func_name(__VA_ARGS__, 7);           \
+			break;                                        \
+		case 8:                                               \
+			result = func_name(__VA_ARGS__, 8);           \
+			break;                                        \
+		case 9:                                               \
+			result = func_name(__VA_ARGS__, 9);           \
+			break;                                        \
+		case 10:                                              \
+			result = func_name(__VA_ARGS__, 10);          \
+			break;                                        \
+		case 11:                                              \
+			result = func_name(__VA_ARGS__, 11);          \
+			break;                                        \
+		case 12:                                              \
+			result = func_name(__VA_ARGS__, 12);          \
+			break;                                        \
+		case 13:                                              \
+			result = func_name(__VA_ARGS__, 13);          \
+			break;                                        \
+		case 14:                                              \
+			result = func_name(__VA_ARGS__, 14);          \
+			break;                                        \
+		case 15:                                              \
+			result = func_name(__VA_ARGS__, 15);          \
+			break;                                        \
+		default:                                              \
+			result = default_case;                        \
+			break;                                        \
+		}                                                     \
+	} while (0)
+
+#define SIMDE_CONSTIFY_32_(func_name, result, default_case, imm, ...) \
+	do {                                                          \
+		switch (imm) {                                        \
+		case 0:                                               \
+			result = func_name(__VA_ARGS__, 0);           \
+			break;                                        \
+		case 1:                                               \
+			result = func_name(__VA_ARGS__, 1);           \
+			break;                                        \
+		case 2:                                               \
+			result = func_name(__VA_ARGS__, 2);           \
+			break;                                        \
+		case 3:                                               \
+			result = func_name(__VA_ARGS__, 3);           \
+			break;                                        \
+		case 4:                                               \
+			result = func_name(__VA_ARGS__, 4);           \
+			break;                                        \
+		case 5:                                               \
+			result = func_name(__VA_ARGS__, 5);           \
+			break;                                        \
+		case 6:                                               \
+			result = func_name(__VA_ARGS__, 6);           \
+			break;                                        \
+		case 7:                                               \
+			result = func_name(__VA_ARGS__, 7);           \
+			break;                                        \
+		case 8:                                               \
+			result = func_name(__VA_ARGS__, 8);           \
+			break;                                        \
+		case 9:                                               \
+			result = func_name(__VA_ARGS__, 9);           \
+			break;                                        \
+		case 10:                                              \
+			result = func_name(__VA_ARGS__, 10);          \
+			break;                                        \
+		case 11:                                              \
+			result = func_name(__VA_ARGS__, 11);          \
+			break;                                        \
+		case 12:                                              \
+			result = func_name(__VA_ARGS__, 12);          \
+			break;                                        \
+		case 13:                                              \
+			result = func_name(__VA_ARGS__, 13);          \
+			break;                                        \
+		case 14:                                              \
+			result = func_name(__VA_ARGS__, 14);          \
+			break;                                        \
+		case 15:                                              \
+			result = func_name(__VA_ARGS__, 15);          \
+			break;                                        \
+		case 16:                                              \
+			result = func_name(__VA_ARGS__, 16);          \
+			break;                                        \
+		case 17:                                              \
+			result = func_name(__VA_ARGS__, 17);          \
+			break;                                        \
+		case 18:                                              \
+			result = func_name(__VA_ARGS__, 18);          \
+			break;                                        \
+		case 19:                                              \
+			result = func_name(__VA_ARGS__, 19);          \
+			break;                                        \
+		case 20:                                              \
+			result = func_name(__VA_ARGS__, 20);          \
+			break;                                        \
+		case 21:                                              \
+			result = func_name(__VA_ARGS__, 21);          \
+			break;                                        \
+		case 22:                                              \
+			result = func_name(__VA_ARGS__, 22);          \
+			break;                                        \
+		case 23:                                              \
+			result = func_name(__VA_ARGS__, 23);          \
+			break;                                        \
+		case 24:                                              \
+			result = func_name(__VA_ARGS__, 24);          \
+			break;                                        \
+		case 25:                                              \
+			result = func_name(__VA_ARGS__, 25);          \
+			break;                                        \
+		case 26:                                              \
+			result = func_name(__VA_ARGS__, 26);          \
+			break;                                        \
+		case 27:                                              \
+			result = func_name(__VA_ARGS__, 27);          \
+			break;                                        \
+		case 28:                                              \
+			result = func_name(__VA_ARGS__, 28);          \
+			break;                                        \
+		case 29:                                              \
+			result = func_name(__VA_ARGS__, 29);          \
+			break;                                        \
+		case 30:                                              \
+			result = func_name(__VA_ARGS__, 30);          \
+			break;                                        \
+		case 31:                                              \
+			result = func_name(__VA_ARGS__, 31);          \
+			break;                                        \
+		default:                                              \
+			result = default_case;                        \
+			break;                                        \
+		}                                                     \
+	} while (0)
+
+#define SIMDE_CONSTIFY_64_(func_name, result, default_case, imm, ...) \
+	do {                                                          \
+		switch (imm) {                                        \
+		case 0:                                               \
+			result = func_name(__VA_ARGS__, 0);           \
+			break;                                        \
+		case 1:                                               \
+			result = func_name(__VA_ARGS__, 1);           \
+			break;                                        \
+		case 2:                                               \
+			result = func_name(__VA_ARGS__, 2);           \
+			break;                                        \
+		case 3:                                               \
+			result = func_name(__VA_ARGS__, 3);           \
+			break;                                        \
+		case 4:                                               \
+			result = func_name(__VA_ARGS__, 4);           \
+			break;                                        \
+		case 5:                                               \
+			result = func_name(__VA_ARGS__, 5);           \
+			break;                                        \
+		case 6:                                               \
+			result = func_name(__VA_ARGS__, 6);           \
+			break;                                        \
+		case 7:                                               \
+			result = func_name(__VA_ARGS__, 7);           \
+			break;                                        \
+		case 8:                                               \
+			result = func_name(__VA_ARGS__, 8);           \
+			break;                                        \
+		case 9:                                               \
+			result = func_name(__VA_ARGS__, 9);           \
+			break;                                        \
+		case 10:                                              \
+			result = func_name(__VA_ARGS__, 10);          \
+			break;                                        \
+		case 11:                                              \
+			result = func_name(__VA_ARGS__, 11);          \
+			break;                                        \
+		case 12:                                              \
+			result = func_name(__VA_ARGS__, 12);          \
+			break;                                        \
+		case 13:                                              \
+			result = func_name(__VA_ARGS__, 13);          \
+			break;                                        \
+		case 14:                                              \
+			result = func_name(__VA_ARGS__, 14);          \
+			break;                                        \
+		case 15:                                              \
+			result = func_name(__VA_ARGS__, 15);          \
+			break;                                        \
+		case 16:                                              \
+			result = func_name(__VA_ARGS__, 16);          \
+			break;                                        \
+		case 17:                                              \
+			result = func_name(__VA_ARGS__, 17);          \
+			break;                                        \
+		case 18:                                              \
+			result = func_name(__VA_ARGS__, 18);          \
+			break;                                        \
+		case 19:                                              \
+			result = func_name(__VA_ARGS__, 19);          \
+			break;                                        \
+		case 20:                                              \
+			result = func_name(__VA_ARGS__, 20);          \
+			break;                                        \
+		case 21:                                              \
+			result = func_name(__VA_ARGS__, 21);          \
+			break;                                        \
+		case 22:                                              \
+			result = func_name(__VA_ARGS__, 22);          \
+			break;                                        \
+		case 23:                                              \
+			result = func_name(__VA_ARGS__, 23);          \
+			break;                                        \
+		case 24:                                              \
+			result = func_name(__VA_ARGS__, 24);          \
+			break;                                        \
+		case 25:                                              \
+			result = func_name(__VA_ARGS__, 25);          \
+			break;                                        \
+		case 26:                                              \
+			result = func_name(__VA_ARGS__, 26);          \
+			break;                                        \
+		case 27:                                              \
+			result = func_name(__VA_ARGS__, 27);          \
+			break;                                        \
+		case 28:                                              \
+			result = func_name(__VA_ARGS__, 28);          \
+			break;                                        \
+		case 29:                                              \
+			result = func_name(__VA_ARGS__, 29);          \
+			break;                                        \
+		case 30:                                              \
+			result = func_name(__VA_ARGS__, 30);          \
+			break;                                        \
+		case 31:                                              \
+			result = func_name(__VA_ARGS__, 31);          \
+			break;                                        \
+		case 32:                                              \
+			result = func_name(__VA_ARGS__, 32);          \
+			break;                                        \
+		case 33:                                              \
+			result = func_name(__VA_ARGS__, 33);          \
+			break;                                        \
+		case 34:                                              \
+			result = func_name(__VA_ARGS__, 34);          \
+			break;                                        \
+		case 35:                                              \
+			result = func_name(__VA_ARGS__, 35);          \
+			break;                                        \
+		case 36:                                              \
+			result = func_name(__VA_ARGS__, 36);          \
+			break;                                        \
+		case 37:                                              \
+			result = func_name(__VA_ARGS__, 37);          \
+			break;                                        \
+		case 38:                                              \
+			result = func_name(__VA_ARGS__, 38);          \
+			break;                                        \
+		case 39:                                              \
+			result = func_name(__VA_ARGS__, 39);          \
+			break;                                        \
+		case 40:                                              \
+			result = func_name(__VA_ARGS__, 40);          \
+			break;                                        \
+		case 41:                                              \
+			result = func_name(__VA_ARGS__, 41);          \
+			break;                                        \
+		case 42:                                              \
+			result = func_name(__VA_ARGS__, 42);          \
+			break;                                        \
+		case 43:                                              \
+			result = func_name(__VA_ARGS__, 43);          \
+			break;                                        \
+		case 44:                                              \
+			result = func_name(__VA_ARGS__, 44);          \
+			break;                                        \
+		case 45:                                              \
+			result = func_name(__VA_ARGS__, 45);          \
+			break;                                        \
+		case 46:                                              \
+			result = func_name(__VA_ARGS__, 46);          \
+			break;                                        \
+		case 47:                                              \
+			result = func_name(__VA_ARGS__, 47);          \
+			break;                                        \
+		case 48:                                              \
+			result = func_name(__VA_ARGS__, 48);          \
+			break;                                        \
+		case 49:                                              \
+			result = func_name(__VA_ARGS__, 49);          \
+			break;                                        \
+		case 50:                                              \
+			result = func_name(__VA_ARGS__, 50);          \
+			break;                                        \
+		case 51:                                              \
+			result = func_name(__VA_ARGS__, 51);          \
+			break;                                        \
+		case 52:                                              \
+			result = func_name(__VA_ARGS__, 52);          \
+			break;                                        \
+		case 53:                                              \
+			result = func_name(__VA_ARGS__, 53);          \
+			break;                                        \
+		case 54:                                              \
+			result = func_name(__VA_ARGS__, 54);          \
+			break;                                        \
+		case 55:                                              \
+			result = func_name(__VA_ARGS__, 55);          \
+			break;                                        \
+		case 56:                                              \
+			result = func_name(__VA_ARGS__, 56);          \
+			break;                                        \
+		case 57:                                              \
+			result = func_name(__VA_ARGS__, 57);          \
+			break;                                        \
+		case 58:                                              \
+			result = func_name(__VA_ARGS__, 58);          \
+			break;                                        \
+		case 59:                                              \
+			result = func_name(__VA_ARGS__, 59);          \
+			break;                                        \
+		case 60:                                              \
+			result = func_name(__VA_ARGS__, 60);          \
+			break;                                        \
+		case 61:                                              \
+			result = func_name(__VA_ARGS__, 61);          \
+			break;                                        \
+		case 62:                                              \
+			result = func_name(__VA_ARGS__, 62);          \
+			break;                                        \
+		case 63:                                              \
+			result = func_name(__VA_ARGS__, 63);          \
+			break;                                        \
+		default:                                              \
+			result = default_case;                        \
+			break;                                        \
+		}                                                     \
+	} while (0)
+
+#define SIMDE_CONSTIFY_2_NO_RESULT_(func_name, default_case, imm, ...) \
+	do {                                                           \
+		switch (imm) {                                         \
+		case 0:                                                \
+			func_name(__VA_ARGS__, 0);                     \
+			break;                                         \
+		case 1:                                                \
+			func_name(__VA_ARGS__, 1);                     \
+			break;                                         \
+		default:                                               \
+			default_case;                                  \
+			break;                                         \
+		}                                                      \
+	} while (0)
+
+#define SIMDE_CONSTIFY_4_NO_RESULT_(func_name, default_case, imm, ...) \
+	do {                                                           \
+		switch (imm) {                                         \
+		case 0:                                                \
+			func_name(__VA_ARGS__, 0);                     \
+			break;                                         \
+		case 1:                                                \
+			func_name(__VA_ARGS__, 1);                     \
+			break;                                         \
+		case 2:                                                \
+			func_name(__VA_ARGS__, 2);                     \
+			break;                                         \
+		case 3:                                                \
+			func_name(__VA_ARGS__, 3);                     \
+			break;                                         \
+		default:                                               \
+			default_case;                                  \
+			break;                                         \
+		}                                                      \
+	} while (0)
+
+#define SIMDE_CONSTIFY_8_NO_RESULT_(func_name, default_case, imm, ...) \
+	do {                                                           \
+		switch (imm) {                                         \
+		case 0:                                                \
+			func_name(__VA_ARGS__, 0);                     \
+			break;                                         \
+		case 1:                                                \
+			func_name(__VA_ARGS__, 1);                     \
+			break;                                         \
+		case 2:                                                \
+			func_name(__VA_ARGS__, 2);                     \
+			break;                                         \
+		case 3:                                                \
+			func_name(__VA_ARGS__, 3);                     \
+			break;                                         \
+		case 4:                                                \
+			func_name(__VA_ARGS__, 4);                     \
+			break;                                         \
+		case 5:                                                \
+			func_name(__VA_ARGS__, 5);                     \
+			break;                                         \
+		case 6:                                                \
+			func_name(__VA_ARGS__, 6);                     \
+			break;                                         \
+		case 7:                                                \
+			func_name(__VA_ARGS__, 7);                     \
+			break;                                         \
+		default:                                               \
+			default_case;                                  \
+			break;                                         \
+		}                                                      \
+	} while (0)
+
+#define SIMDE_CONSTIFY_16_NO_RESULT_(func_name, default_case, imm, ...) \
+	do {                                                            \
+		switch (imm) {                                          \
+		case 0:                                                 \
+			func_name(__VA_ARGS__, 0);                      \
+			break;                                          \
+		case 1:                                                 \
+			func_name(__VA_ARGS__, 1);                      \
+			break;                                          \
+		case 2:                                                 \
+			func_name(__VA_ARGS__, 2);                      \
+			break;                                          \
+		case 3:                                                 \
+			func_name(__VA_ARGS__, 3);                      \
+			break;                                          \
+		case 4:                                                 \
+			func_name(__VA_ARGS__, 4);                      \
+			break;                                          \
+		case 5:                                                 \
+			func_name(__VA_ARGS__, 5);                      \
+			break;                                          \
+		case 6:                                                 \
+			func_name(__VA_ARGS__, 6);                      \
+			break;                                          \
+		case 7:                                                 \
+			func_name(__VA_ARGS__, 7);                      \
+			break;                                          \
+		case 8:                                                 \
+			func_name(__VA_ARGS__, 8);                      \
+			break;                                          \
+		case 9:                                                 \
+			func_name(__VA_ARGS__, 9);                      \
+			break;                                          \
+		case 10:                                                \
+			func_name(__VA_ARGS__, 10);                     \
+			break;                                          \
+		case 11:                                                \
+			func_name(__VA_ARGS__, 11);                     \
+			break;                                          \
+		case 12:                                                \
+			func_name(__VA_ARGS__, 12);                     \
+			break;                                          \
+		case 13:                                                \
+			func_name(__VA_ARGS__, 13);                     \
+			break;                                          \
+		case 14:                                                \
+			func_name(__VA_ARGS__, 14);                     \
+			break;                                          \
+		case 15:                                                \
+			func_name(__VA_ARGS__, 15);                     \
+			break;                                          \
+		default:                                                \
+			default_case;                                   \
+			break;                                          \
+		}                                                       \
+	} while (0)
+
+#define SIMDE_CONSTIFY_32_NO_RESULT_(func_name, default_case, imm, ...) \
+	do {                                                            \
+		switch (imm) {                                          \
+		case 0:                                                 \
+			func_name(__VA_ARGS__, 0);                      \
+			break;                                          \
+		case 1:                                                 \
+			func_name(__VA_ARGS__, 1);                      \
+			break;                                          \
+		case 2:                                                 \
+			func_name(__VA_ARGS__, 2);                      \
+			break;                                          \
+		case 3:                                                 \
+			func_name(__VA_ARGS__, 3);                      \
+			break;                                          \
+		case 4:                                                 \
+			func_name(__VA_ARGS__, 4);                      \
+			break;                                          \
+		case 5:                                                 \
+			func_name(__VA_ARGS__, 5);                      \
+			break;                                          \
+		case 6:                                                 \
+			func_name(__VA_ARGS__, 6);                      \
+			break;                                          \
+		case 7:                                                 \
+			func_name(__VA_ARGS__, 7);                      \
+			break;                                          \
+		case 8:                                                 \
+			func_name(__VA_ARGS__, 8);                      \
+			break;                                          \
+		case 9:                                                 \
+			func_name(__VA_ARGS__, 9);                      \
+			break;                                          \
+		case 10:                                                \
+			func_name(__VA_ARGS__, 10);                     \
+			break;                                          \
+		case 11:                                                \
+			func_name(__VA_ARGS__, 11);                     \
+			break;                                          \
+		case 12:                                                \
+			func_name(__VA_ARGS__, 12);                     \
+			break;                                          \
+		case 13:                                                \
+			func_name(__VA_ARGS__, 13);                     \
+			break;                                          \
+		case 14:                                                \
+			func_name(__VA_ARGS__, 14);                     \
+			break;                                          \
+		case 15:                                                \
+			func_name(__VA_ARGS__, 15);                     \
+			break;                                          \
+		case 16:                                                \
+			func_name(__VA_ARGS__, 16);                     \
+			break;                                          \
+		case 17:                                                \
+			func_name(__VA_ARGS__, 17);                     \
+			break;                                          \
+		case 18:                                                \
+			func_name(__VA_ARGS__, 18);                     \
+			break;                                          \
+		case 19:                                                \
+			func_name(__VA_ARGS__, 19);                     \
+			break;                                          \
+		case 20:                                                \
+			func_name(__VA_ARGS__, 20);                     \
+			break;                                          \
+		case 21:                                                \
+			func_name(__VA_ARGS__, 21);                     \
+			break;                                          \
+		case 22:                                                \
+			func_name(__VA_ARGS__, 22);                     \
+			break;                                          \
+		case 23:                                                \
+			func_name(__VA_ARGS__, 23);                     \
+			break;                                          \
+		case 24:                                                \
+			func_name(__VA_ARGS__, 24);                     \
+			break;                                          \
+		case 25:                                                \
+			func_name(__VA_ARGS__, 25);                     \
+			break;                                          \
+		case 26:                                                \
+			func_name(__VA_ARGS__, 26);                     \
+			break;                                          \
+		case 27:                                                \
+			func_name(__VA_ARGS__, 27);                     \
+			break;                                          \
+		case 28:                                                \
+			func_name(__VA_ARGS__, 28);                     \
+			break;                                          \
+		case 29:                                                \
+			func_name(__VA_ARGS__, 29);                     \
+			break;                                          \
+		case 30:                                                \
+			func_name(__VA_ARGS__, 30);                     \
+			break;                                          \
+		case 31:                                                \
+			func_name(__VA_ARGS__, 31);                     \
+			break;                                          \
+		default:                                                \
+			default_case;                                   \
+			break;                                          \
+		}                                                       \
+	} while (0)
+
+#define SIMDE_CONSTIFY_64_NO_RESULT_(func_name, default_case, imm, ...) \
+	do {                                                            \
+		switch (imm) {                                          \
+		case 0:                                                 \
+			func_name(__VA_ARGS__, 0);                      \
+			break;                                          \
+		case 1:                                                 \
+			func_name(__VA_ARGS__, 1);                      \
+			break;                                          \
+		case 2:                                                 \
+			func_name(__VA_ARGS__, 2);                      \
+			break;                                          \
+		case 3:                                                 \
+			func_name(__VA_ARGS__, 3);                      \
+			break;                                          \
+		case 4:                                                 \
+			func_name(__VA_ARGS__, 4);                      \
+			break;                                          \
+		case 5:                                                 \
+			func_name(__VA_ARGS__, 5);                      \
+			break;                                          \
+		case 6:                                                 \
+			func_name(__VA_ARGS__, 6);                      \
+			break;                                          \
+		case 7:                                                 \
+			func_name(__VA_ARGS__, 7);                      \
+			break;                                          \
+		case 8:                                                 \
+			func_name(__VA_ARGS__, 8);                      \
+			break;                                          \
+		case 9:                                                 \
+			func_name(__VA_ARGS__, 9);                      \
+			break;                                          \
+		case 10:                                                \
+			func_name(__VA_ARGS__, 10);                     \
+			break;                                          \
+		case 11:                                                \
+			func_name(__VA_ARGS__, 11);                     \
+			break;                                          \
+		case 12:                                                \
+			func_name(__VA_ARGS__, 12);                     \
+			break;                                          \
+		case 13:                                                \
+			func_name(__VA_ARGS__, 13);                     \
+			break;                                          \
+		case 14:                                                \
+			func_name(__VA_ARGS__, 14);                     \
+			break;                                          \
+		case 15:                                                \
+			func_name(__VA_ARGS__, 15);                     \
+			break;                                          \
+		case 16:                                                \
+			func_name(__VA_ARGS__, 16);                     \
+			break;                                          \
+		case 17:                                                \
+			func_name(__VA_ARGS__, 17);                     \
+			break;                                          \
+		case 18:                                                \
+			func_name(__VA_ARGS__, 18);                     \
+			break;                                          \
+		case 19:                                                \
+			func_name(__VA_ARGS__, 19);                     \
+			break;                                          \
+		case 20:                                                \
+			func_name(__VA_ARGS__, 20);                     \
+			break;                                          \
+		case 21:                                                \
+			func_name(__VA_ARGS__, 21);                     \
+			break;                                          \
+		case 22:                                                \
+			func_name(__VA_ARGS__, 22);                     \
+			break;                                          \
+		case 23:                                                \
+			func_name(__VA_ARGS__, 23);                     \
+			break;                                          \
+		case 24:                                                \
+			func_name(__VA_ARGS__, 24);                     \
+			break;                                          \
+		case 25:                                                \
+			func_name(__VA_ARGS__, 25);                     \
+			break;                                          \
+		case 26:                                                \
+			func_name(__VA_ARGS__, 26);                     \
+			break;                                          \
+		case 27:                                                \
+			func_name(__VA_ARGS__, 27);                     \
+			break;                                          \
+		case 28:                                                \
+			func_name(__VA_ARGS__, 28);                     \
+			break;                                          \
+		case 29:                                                \
+			func_name(__VA_ARGS__, 29);                     \
+			break;                                          \
+		case 30:                                                \
+			func_name(__VA_ARGS__, 30);                     \
+			break;                                          \
+		case 31:                                                \
+			func_name(__VA_ARGS__, 31);                     \
+			break;                                          \
+		case 32:                                                \
+			func_name(__VA_ARGS__, 32);                     \
+			break;                                          \
+		case 33:                                                \
+			func_name(__VA_ARGS__, 33);                     \
+			break;                                          \
+		case 34:                                                \
+			func_name(__VA_ARGS__, 34);                     \
+			break;                                          \
+		case 35:                                                \
+			func_name(__VA_ARGS__, 35);                     \
+			break;                                          \
+		case 36:                                                \
+			func_name(__VA_ARGS__, 36);                     \
+			break;                                          \
+		case 37:                                                \
+			func_name(__VA_ARGS__, 37);                     \
+			break;                                          \
+		case 38:                                                \
+			func_name(__VA_ARGS__, 38);                     \
+			break;                                          \
+		case 39:                                                \
+			func_name(__VA_ARGS__, 39);                     \
+			break;                                          \
+		case 40:                                                \
+			func_name(__VA_ARGS__, 40);                     \
+			break;                                          \
+		case 41:                                                \
+			func_name(__VA_ARGS__, 41);                     \
+			break;                                          \
+		case 42:                                                \
+			func_name(__VA_ARGS__, 42);                     \
+			break;                                          \
+		case 43:                                                \
+			func_name(__VA_ARGS__, 43);                     \
+			break;                                          \
+		case 44:                                                \
+			func_name(__VA_ARGS__, 44);                     \
+			break;                                          \
+		case 45:                                                \
+			func_name(__VA_ARGS__, 45);                     \
+			break;                                          \
+		case 46:                                                \
+			func_name(__VA_ARGS__, 46);                     \
+			break;                                          \
+		case 47:                                                \
+			func_name(__VA_ARGS__, 47);                     \
+			break;                                          \
+		case 48:                                                \
+			func_name(__VA_ARGS__, 48);                     \
+			break;                                          \
+		case 49:                                                \
+			func_name(__VA_ARGS__, 49);                     \
+			break;                                          \
+		case 50:                                                \
+			func_name(__VA_ARGS__, 50);                     \
+			break;                                          \
+		case 51:                                                \
+			func_name(__VA_ARGS__, 51);                     \
+			break;                                          \
+		case 52:                                                \
+			func_name(__VA_ARGS__, 52);                     \
+			break;                                          \
+		case 53:                                                \
+			func_name(__VA_ARGS__, 53);                     \
+			break;                                          \
+		case 54:                                                \
+			func_name(__VA_ARGS__, 54);                     \
+			break;                                          \
+		case 55:                                                \
+			func_name(__VA_ARGS__, 55);                     \
+			break;                                          \
+		case 56:                                                \
+			func_name(__VA_ARGS__, 56);                     \
+			break;                                          \
+		case 57:                                                \
+			func_name(__VA_ARGS__, 57);                     \
+			break;                                          \
+		case 58:                                                \
+			func_name(__VA_ARGS__, 58);                     \
+			break;                                          \
+		case 59:                                                \
+			func_name(__VA_ARGS__, 59);                     \
+			break;                                          \
+		case 60:                                                \
+			func_name(__VA_ARGS__, 60);                     \
+			break;                                          \
+		case 61:                                                \
+			func_name(__VA_ARGS__, 61);                     \
+			break;                                          \
+		case 62:                                                \
+			func_name(__VA_ARGS__, 62);                     \
+			break;                                          \
+		case 63:                                                \
+			func_name(__VA_ARGS__, 63);                     \
+			break;                                          \
+		default:                                                \
+			default_case;                                   \
+			break;                                          \
+		}                                                       \
+	} while (0)
+
+HEDLEY_DIAGNOSTIC_POP
+
+#endif
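
The SIMDE_CONSTIFY_* helpers let a wrapper forward a run-time value to a macro or intrinsic whose last argument must be an integer constant expression, by switching over every admissible value. A minimal sketch of such a wrapper, not part of this commit (my_vec and shift_right_imm are hypothetical stand-ins):

#include "simde-constify.h" /* adjust the path to wherever the header lives */

typedef struct { unsigned long long u64[2]; } my_vec;

/* Pretend this macro only accepts a literal shift count in [0, 63]. */
#define shift_right_imm(v, imm) \
	((my_vec){{(v).u64[0] >> (imm), (v).u64[1] >> (imm)}})

/* Accepts a run-time count and dispatches to the constant-only form;
 * out-of-range counts fall through to the zero-vector default. */
static my_vec shift_right(my_vec v, int imm)
{
	my_vec r;
	SIMDE_CONSTIFY_64_(shift_right_imm, r, ((my_vec){{0, 0}}), imm, v);
	return r;
}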

+ 114 - 0
libobs/util/simde/simde-detect-clang.h

@@ -0,0 +1,114 @@
+/* Detect Clang Version
+ * Created by Evan Nemerson <[email protected]>
+ *
+ * To the extent possible under law, the author(s) have dedicated all
+ * copyright and related and neighboring rights to this software to
+ * the public domain worldwide. This software is distributed without
+ * any warranty.
+ *
+ * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ * SPDX-License-Identifier: CC0-1.0
+ */
+
+/* This file was originally part of SIMDe
+ * (<https://github.com/simd-everywhere/simde>).  You're free to do with it as
+ * you please, but I do have a few small requests:
+ *
+ *  * If you make improvements, please submit them back to SIMDe
+ *    (at <https://github.com/simd-everywhere/simde/issues>) so others can
+ *    benefit from them.
+ *  * Please keep a link to SIMDe intact so people know where to submit
+ *    improvements.
+ *  * If you expose it publicly, please change the SIMDE_ prefix to
+ *    something specific to your project.
+ *
+ * The version numbers clang exposes (in the __clang_major__,
+ * __clang_minor__, and __clang_patchlevel__ macros) are unreliable.
+ * Vendors such as Apple will define these values to their version
+ * numbers; for example, "Apple Clang 4.0" is really clang 3.1, but
+ * __clang_major__ and __clang_minor__ are defined to 4 and 0
+ * respectively, instead of 3 and 1.
+ *
+ * The solution is *usually* to use clang's feature detection macros
+ * (<https://clang.llvm.org/docs/LanguageExtensions.html#feature-checking-macros>)
+ * to determine if the feature you're interested in is available.  This
+ * generally works well, and it should probably be the first thing you
+ * try.  Unfortunately, it's not possible to check for everything.  In
+ * particular, compiler bugs.
+ *
+ * This file just uses the feature checking macros to detect features
+ * added in specific versions of clang to identify which version of
+ * clang the compiler is based on.
+ *
+ * Right now it only goes back to 3.6, but I'm happy to accept patches
+ * to go back further.  And, of course, newer versions are welcome if
+ * they're not already present, and if you find a way to detect a point
+ * release that would be great, too!
+ */
+
+#if !defined(SIMDE_DETECT_CLANG_H)
+#define SIMDE_DETECT_CLANG_H 1
+
+/* Attempt to detect the upstream clang version number.  I usually only
+ * worry about major version numbers (at least for 4.0+), but if you
+ * need more resolution I'm happy to accept patches that are able to
+ * detect minor versions as well.  That said, you'll probably have a
+ * hard time with detection since AFAIK most minor releases don't add
+ * anything we can detect. */
+
+#if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION)
+#if __has_warning("-Wformat-insufficient-args")
+#define SIMDE_DETECT_CLANG_VERSION 120000
+#elif __has_warning("-Wimplicit-const-int-float-conversion")
+#define SIMDE_DETECT_CLANG_VERSION 110000
+#elif __has_warning("-Wmisleading-indentation")
+#define SIMDE_DETECT_CLANG_VERSION 100000
+#elif defined(__FILE_NAME__)
+#define SIMDE_DETECT_CLANG_VERSION 90000
+#elif __has_warning("-Wextra-semi-stmt") || \
+	__has_builtin(__builtin_rotateleft32)
+#define SIMDE_DETECT_CLANG_VERSION 80000
+#elif __has_warning("-Wc++98-compat-extra-semi")
+#define SIMDE_DETECT_CLANG_VERSION 70000
+#elif __has_warning("-Wpragma-pack")
+#define SIMDE_DETECT_CLANG_VERSION 60000
+#elif __has_warning("-Wbitfield-enum-conversion")
+#define SIMDE_DETECT_CLANG_VERSION 50000
+#elif __has_attribute(diagnose_if)
+#define SIMDE_DETECT_CLANG_VERSION 40000
+#elif __has_warning("-Wcast-calling-convention")
+#define SIMDE_DETECT_CLANG_VERSION 30900
+#elif __has_warning("-WCL4")
+#define SIMDE_DETECT_CLANG_VERSION 30800
+#elif __has_warning("-WIndependentClass-attribute")
+#define SIMDE_DETECT_CLANG_VERSION 30700
+#elif __has_warning("-Wambiguous-ellipsis")
+#define SIMDE_DETECT_CLANG_VERSION 30600
+#else
+#define SIMDE_DETECT_CLANG_VERSION 1
+#endif
+#endif /* defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) */
+
+/* The SIMDE_DETECT_CLANG_VERSION_CHECK macro is pretty
+ * straightforward; it returns true if the compiler is a derivative
+ * of clang >= the specified version.
+ *
+ * Since this file is often (primarily?) useful for working around bugs
+ * it is also helpful to have a macro which returns true only if the
+ * compiler is a version of clang *older* than the specified version to
+ * make it a bit easier to ifdef regions to add code for older versions,
+ * such as pragmas to disable a specific warning. */
+
+#if defined(SIMDE_DETECT_CLANG_VERSION)
+#define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) \
+	(SIMDE_DETECT_CLANG_VERSION >=                           \
+	 ((major * 10000) + (minor * 1000) + (revision)))
+#define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) \
+	(SIMDE_DETECT_CLANG_VERSION <                          \
+	 ((major * 10000) + (minor * 1000) + (revision)))
+#else
+#define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (0)
+#define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (1)
+#endif
+
+#endif /* !defined(SIMDE_DETECT_CLANG_H) */
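
Downstream code is expected to consume the detected version through these check macros rather than __clang_major__. A minimal sketch, not part of this commit (MY_HAVE_FILE_NAME_MACRO is a hypothetical project macro):

#include "simde-detect-clang.h"

/* True for upstream clang >= 9, regardless of the version numbers a vendor
 * such as Apple reports in __clang_major__/__clang_minor__. */
#if SIMDE_DETECT_CLANG_VERSION_CHECK(9, 0, 0)
#define MY_HAVE_FILE_NAME_MACRO 1
#else
#define MY_HAVE_FILE_NAME_MACRO 0
#endif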

+ 187 - 10
libobs/util/simde/simde-diagnostic.h

@@ -45,8 +45,10 @@
  */
  */
 
 
 #if !defined(SIMDE_DIAGNOSTIC_H)
 #if !defined(SIMDE_DIAGNOSTIC_H)
+#define SIMDE_DIAGNOSTIC_H
 
 
 #include "hedley.h"
 #include "hedley.h"
+#include "simde-detect-clang.h"
 
 
 /* This is only to help us implement functions like _mm_undefined_ps. */
 /* This is only to help us implement functions like _mm_undefined_ps. */
 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
@@ -119,6 +121,9 @@
 #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_
 #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_
 #endif
 #endif
 
 
+/* MSVC emits a diagnostic when we call a function (like
+ * simde_mm_set_epi32) while initializing a struct.  We currently do
+ * this a *lot* in the tests. */
 #if defined(HEDLEY_MSVC_VERSION)
 #if defined(HEDLEY_MSVC_VERSION)
 #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \
 #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \
 	__pragma(warning(disable : 4204))
 	__pragma(warning(disable : 4204))
@@ -183,6 +188,32 @@
 #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
 #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
 #endif
 #endif
 
 
+/* emscripten requires us to use a __wasm_unimplemented_simd128__ macro
+ * before we can access certain SIMD intrinsics, but this diagnostic
+ * warns about it being a reserved name.  It is a reserved name, but
+ * it's reserved for the compiler and we are using it to convey
+ * information to the compiler.
+ *
+ * This is also used when enabling native aliases since we don't get to
+ * choose the macro names. */
+#if HEDLEY_HAS_WARNING("-Wdouble-promotion")
+#define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ \
+	_Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
+#endif
+
+/* clang 3.8 warns about the packed attribute being unnecessary when
+ * used in the _mm_loadu_* functions.  That *may* be true for version
+ * 3.8, but for later versions it is crucial in order to make unaligned
+ * access safe. */
+#if HEDLEY_HAS_WARNING("-Wpacked")
+#define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ \
+	_Pragma("clang diagnostic ignored \"-Wpacked\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_PACKED_
+#endif
+
 /* Triggered when assigning a float to a double implicitly.  We use
 /* Triggered when assigning a float to a double implicitly.  We use
  * explicit casts in SIMDe, this is only used in the test suite. */
  * explicit casts in SIMDe, this is only used in the test suite. */
 #if HEDLEY_HAS_WARNING("-Wdouble-promotion")
 #if HEDLEY_HAS_WARNING("-Wdouble-promotion")
@@ -194,7 +225,7 @@
 
 
 /* Several compilers treat conformant array parameters as VLAs.  We
 /* Several compilers treat conformant array parameters as VLAs.  We
  * test to make sure we're in C mode (C++ doesn't support CAPs), and
  * test to make sure we're in C mode (C++ doesn't support CAPs), and
- * that the version of the standard supports CAPs.  We also blacklist
+ * that the version of the standard supports CAPs.  We also reject
  * some buggy compilers like MSVC (the logic is in Hedley if you want
  * some buggy compilers like MSVC (the logic is in Hedley if you want
  * to take a look), but with certain warnings enabled some compilers
  * to take a look), but with certain warnings enabled some compilers
  * still like to emit a diagnostic. */
  * still like to emit a diagnostic. */
@@ -221,6 +252,9 @@
 #elif HEDLEY_GCC_VERSION_CHECK(3, 4, 0)
 #elif HEDLEY_GCC_VERSION_CHECK(3, 4, 0)
 #define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
 #define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
 	_Pragma("GCC diagnostic ignored \"-Wunused-function\"")
 	_Pragma("GCC diagnostic ignored \"-Wunused-function\"")
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) /* Likely goes back further */
+#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
+	__pragma(warning(disable : 4505))
 #else
 #else
 #define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_
 #define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_
 #endif
 #endif
@@ -232,13 +266,63 @@
 #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_
 #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_
 #endif
 #endif
 
 
-/* https://github.com/nemequ/simde/issues/277 */
+#if HEDLEY_HAS_WARNING("-Wpadded")
+#define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ \
+	_Pragma("clang diagnostic ignored \"-Wpadded\"")
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) /* Likely goes back further */
+#define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ __pragma(warning(disable : 4324))
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_PADDED_
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wzero-as-null-pointer-constant")
+#define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ \
+	_Pragma("clang diagnostic ignored \"-Wzero-as-null-pointer-constant\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wold-style-cast")
+#define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ \
+	_Pragma("clang diagnostic ignored \"-Wold-style-cast\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wcast-function-type") || \
+	HEDLEY_GCC_VERSION_CHECK(8, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ \
+	_Pragma("GCC diagnostic ignored \"-Wcast-function-type\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_
+#endif
+
+/* clang will emit this warning when we use C99 extensions when not in
+ * C99 mode, even though it does support this.  In such cases we check
+ * the compiler and version first, so we know it's not a problem. */
+#if HEDLEY_HAS_WARNING("-Wc99-extensions")
+#define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ \
+	_Pragma("clang diagnostic ignored \"-Wc99-extensions\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_
+#endif
+
+/* https://github.com/simd-everywhere/simde/issues/277 */
 #if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4, 6, 0) && \
 #if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4, 6, 0) && \
-	!HEDLEY_GCC_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
-#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE \
+	!HEDLEY_GCC_VERSION_CHECK(6, 4, 0) && defined(__cplusplus)
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ \
 	_Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"")
 	_Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"")
 #else
 #else
-#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_
+#endif
+
+/* This is the warning that you normally define _CRT_SECURE_NO_WARNINGS
+ * to silence, but you have to do that before including anything and
+ * that would require reordering includes. */
+#if defined(_MSC_VER)
+#define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ __pragma(warning(disable : 4996))
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_
 #endif
 #endif
 
 
 /* Some compilers, such as clang, may use `long long` for 64-bit
 /* Some compilers, such as clang, may use `long long` for 64-bit
@@ -246,13 +330,104 @@
  * -Wc++98-compat-pedantic which says 'long long' is incompatible with
  * -Wc++98-compat-pedantic which says 'long long' is incompatible with
  * C++98. */
  * C++98. */
 #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
 #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
-#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC \
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \
 	_Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
 	_Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
 #else
 #else
-#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
+#endif
+
+/* Same problem as above */
+#if HEDLEY_HAS_WARNING("-Wc++11-long-long")
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ \
+	_Pragma("clang diagnostic ignored \"-Wc++11-long-long\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_
+#endif
+
+/* emscripten emits this whenever stdin/stdout/stderr is used in a
+ * macro. */
+#if HEDLEY_HAS_WARNING("-Wdisabled-macro-expansion")
+#define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ \
+	_Pragma("clang diagnostic ignored \"-Wdisabled-macro-expansion\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_
+#endif
+
+/* Clang uses C11 generic selections to implement some AltiVec
+ * functions, which triggers this diagnostic when not compiling
+ * in C11 mode */
+#if HEDLEY_HAS_WARNING("-Wc11-extensions")
+#define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ \
+	_Pragma("clang diagnostic ignored \"-Wc11-extensions\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
+#endif
+
+/* Clang sometimes triggers this warning in macros in the AltiVec and
+ * NEON headers, or due to missing functions. */
+#if HEDLEY_HAS_WARNING("-Wvector-conversion")
+#define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ \
+	_Pragma("clang diagnostic ignored \"-Wvector-conversion\"")
+/* For NEON, the situation with -Wvector-conversion in clang < 10 is
+   * bad enough that we just disable the warning altogether. */
+#if defined(SIMDE_ARCH_ARM) && SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ \
+	SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
+#endif
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
+#endif
+#if !defined(SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_)
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_
+#endif
+
+/* SLEEF triggers this a *lot* in their headers */
+#if HEDLEY_HAS_WARNING("-Wignored-qualifiers")
+#define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ \
+	_Pragma("clang diagnostic ignored \"-Wignored-qualifiers\"")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ \
+	_Pragma("GCC diagnostic ignored \"-Wignored-qualifiers\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_
+#endif
+
+/* GCC emits this under some circumstances when using __int128 */
+#if HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ \
+	_Pragma("GCC diagnostic ignored \"-Wpedantic\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_
+#endif
+
+/* MSVC doesn't like (__assume(0), code) and will warn about code being
+ * unreachable, but we want it there because not all compilers
+ * understand the unreachable macro and will complain if it is missing.
+ * I'm planning on adding a new macro to Hedley to handle this a bit
+ * more elegantly, but until then... */
+#if defined(HEDLEY_MSVC_VERSION)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable : 4702))
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_
+#endif
+
+/* This is a false positive from GCC in a few places. */
+#if HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ \
+	_Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
+
+#if defined(SIMDE_ENABLE_NATIVE_ALIASES)
+#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \
+	SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
+#else
+#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_
 #endif
 #endif
 
 
 #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS                           \
 #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS                           \
+	SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_           \
 	SIMDE_DIAGNOSTIC_DISABLE_PSABI_                              \
 	SIMDE_DIAGNOSTIC_DISABLE_PSABI_                              \
 	SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_                \
 	SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_                \
 	SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_             \
 	SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_             \
@@ -264,7 +439,9 @@
 	SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_             \
 	SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_             \
 	SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_                    \
 	SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_                    \
 	SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_                        \
 	SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_                        \
-	SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC               \
-	SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE
+	SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_              \
+	SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_                    \
+	SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_      \
+	SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_
 
 
-#endif
+#endif /* !defined(SIMDE_DIAGNOSTIC_H) */
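
These suppression macros are designed to sit between Hedley's push/pop so that they stay local to the noisy region instead of leaking into the including translation unit. A minimal sketch, not part of this commit (my_pair is a hypothetical struct):

#include "hedley.h"
#include "simde-diagnostic.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_PADDED_ /* silence -Wpadded for this struct only */
struct my_pair {
	char tag;
	double value; /* the padding inserted after 'tag' would otherwise warn */
};
HEDLEY_DIAGNOSTIC_POP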

+ 207 - 14
libobs/util/simde/simde-features.h

@@ -32,6 +32,7 @@
 #define SIMDE_FEATURES_H
 #define SIMDE_FEATURES_H
 
 
 #include "simde-arch.h"
 #include "simde-arch.h"
+#include "simde-diagnostic.h"
 
 
 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
 	!defined(SIMDE_NO_NATIVE)
 	!defined(SIMDE_NO_NATIVE)
@@ -43,6 +44,28 @@
 #define SIMDE_X86_AVX512F_NATIVE
 #define SIMDE_X86_AVX512F_NATIVE
 #endif
 #endif
 
 
+#if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) &&        \
+	!defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT)
+#define SIMDE_X86_AVX512VP2INTERSECT_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && \
+	!defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX512VBMI_NATIVE) && \
+	!defined(SIMDE_X86_AVX512VBMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512VBMI)
+#define SIMDE_X86_AVX512VBMI_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
 #if !defined(SIMDE_X86_AVX512CD_NATIVE) && \
 #if !defined(SIMDE_X86_AVX512CD_NATIVE) && \
 	!defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
 	!defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
 #if defined(SIMDE_ARCH_X86_AVX512CD)
 #if defined(SIMDE_ARCH_X86_AVX512CD)
@@ -194,6 +217,20 @@
 #endif
 #endif
 #endif
 #endif
 
 
+#if !defined(SIMDE_X86_PCLMUL_NATIVE) && \
+	!defined(SIMDE_X86_PCLMUL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_PCLMUL)
+#define SIMDE_X86_PCLMUL_NATIVE
+#endif
+#endif
+
+#if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && \
+	!defined(SIMDE_X86_VPCLMULQDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_VPCLMULQDQ)
+#define SIMDE_X86_VPCLMULQDQ_NATIVE
+#endif
+#endif
+
 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
 	!defined(SIMDE_NO_NATIVE)
 	!defined(SIMDE_NO_NATIVE)
 #if defined(__INTEL_COMPILER)
 #if defined(__INTEL_COMPILER)
@@ -206,8 +243,7 @@
 #pragma warning(disable : 4799)
 #pragma warning(disable : 4799)
 #endif
 #endif
 
 
-#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || \
-	defined(SIMDE_X86_SVML_NATIVE)
+#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE)
 #include <immintrin.h>
 #include <immintrin.h>
 #elif defined(SIMDE_X86_SSE4_2_NATIVE)
 #elif defined(SIMDE_X86_SSE4_2_NATIVE)
 #include <nmmintrin.h>
 #include <nmmintrin.h>
@@ -243,7 +279,8 @@
 
 
 #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
 #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
 	!defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
 	!defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
-#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80)
+#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80) && \
+	(__ARM_NEON_FP & 0x02)
 #define SIMDE_ARM_NEON_A32V8_NATIVE
 #define SIMDE_ARM_NEON_A32V8_NATIVE
 #endif
 #endif
 #endif
 #endif
@@ -262,6 +299,14 @@
 #include <arm_neon.h>
 #include <arm_neon.h>
 #endif
 #endif
 
 
+#if !defined(SIMDE_ARM_SVE_NATIVE) && !defined(SIMDE_ARM_SVE_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_ARM_SVE)
+#define SIMDE_ARM_SVE_NATIVE
+#include <arm_sve.h>
+#endif
+#endif
+
 #if !defined(SIMDE_WASM_SIMD128_NATIVE) && \
 #if !defined(SIMDE_WASM_SIMD128_NATIVE) && \
 	!defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
 	!defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
 #if defined(SIMDE_ARCH_WASM_SIMD128)
 #if defined(SIMDE_ARCH_WASM_SIMD128)
@@ -270,7 +315,10 @@
 #endif
 #endif
 #if defined(SIMDE_WASM_SIMD128_NATIVE)
 #if defined(SIMDE_WASM_SIMD128_NATIVE)
 #if !defined(__wasm_unimplemented_simd128__)
 #if !defined(__wasm_unimplemented_simd128__)
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
 #define __wasm_unimplemented_simd128__
 #define __wasm_unimplemented_simd128__
+HEDLEY_DIAGNOSTIC_POP
 #endif
 #endif
 #include <wasm_simd128.h>
 #include <wasm_simd128.h>
 #endif
 #endif
@@ -326,15 +374,28 @@
 #define SIMDE_POWER_ALTIVEC_P5_NATIVE
 #define SIMDE_POWER_ALTIVEC_P5_NATIVE
 #endif
 #endif
 #endif
 #endif
-#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
-/* stdbool.h conflicts with the bool in altivec.h */
-#if defined(bool) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF_BOOL_)
+
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
+/* AltiVec conflicts with lots of stuff.  The bool keyword conflicts
+   * with the bool keyword in C++ and the bool macro in C99+ (defined
+   * in stdbool.h).  The vector keyword conflicts with std::vector in
+   * C++ if you are `using namespace std;`.
+   *
+   * Luckily AltiVec allows you to use `__vector`/`__bool`/`__pixel`
+   * instead, but altivec.h will unconditionally define
+   * `vector`/`bool`/`pixel` so we need to work around that.
+   *
+   * Unfortunately this means that if your code uses AltiVec directly
+   * it may break.  If this is the case you'll want to define
+   * `SIMDE_POWER_ALTIVEC_NO_UNDEF` before including SIMDe.  Or, even
+   * better, port your code to use the double-underscore versions. */
+#if defined(bool)
 #undef bool
 #undef bool
 #endif
 #endif
+
 #include <altivec.h>
 #include <altivec.h>
-/* GCC allows you to undefine these macros to prevent conflicts with
-   * standard types as they become context-sensitive keywords. */
-#if defined(__cplusplus)
+
+#if !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF)
 #if defined(vector)
 #if defined(vector)
 #undef vector
 #undef vector
 #endif
 #endif
@@ -344,14 +405,146 @@
 #if defined(bool)
 #if defined(bool)
 #undef bool
 #undef bool
 #endif
 #endif
-#define SIMDE_POWER_ALTIVEC_VECTOR(T) vector T
-#define SIMDE_POWER_ALTIVEC_PIXEL pixel
-#define SIMDE_POWER_ALTIVEC_BOOL bool
-#else
+#endif /* !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) */
+
+/* Use these instead of vector/pixel/bool in SIMDe. */
 #define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T
 #define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T
 #define SIMDE_POWER_ALTIVEC_PIXEL __pixel
 #define SIMDE_POWER_ALTIVEC_PIXEL __pixel
 #define SIMDE_POWER_ALTIVEC_BOOL __bool
 #define SIMDE_POWER_ALTIVEC_BOOL __bool
-#endif /* defined(__cplusplus) */
+
+/* Re-define bool if we're using stdbool.h */
+#if !defined(__cplusplus) && defined(__bool_true_false_are_defined) && \
+	!defined(SIMDE_POWER_ALTIVEC_NO_UNDEF)
+#define bool _Bool
+#endif
+#endif
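
A rough illustration of the double-underscore wrappers in use, not part of this commit (add4 is a hypothetical helper):

#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
/* The wrappers expand to __vector, so this compiles even after the
 * vector/bool/pixel keywords have been #undef'd above. */
static SIMDE_POWER_ALTIVEC_VECTOR(signed int)
add4(SIMDE_POWER_ALTIVEC_VECTOR(signed int) a,
     SIMDE_POWER_ALTIVEC_VECTOR(signed int) b)
{
	return vec_add(a, b); /* vec_add comes from altivec.h */
}
#endif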
+
+#if !defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) &&        \
+	!defined(SIMDE_MIPS_LOONGSON_MMI_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_MIPS_LOONGSON_MMI)
+#define SIMDE_MIPS_LOONGSON_MMI_NATIVE 1
+#endif
+#endif
+#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+#include <loongson-mmiintrin.h>
+#endif
+
+/* This is used to determine whether or not to fall back on a vector
+ * function in an earlier ISA extension, as well as whether
+ * we expect any attempts at vectorization to be fruitful or if we
+ * expect to always be running serial code. */
+
+#if !defined(SIMDE_NATURAL_VECTOR_SIZE)
+#if defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_NATURAL_VECTOR_SIZE (512)
+#elif defined(SIMDE_X86_AVX_NATIVE)
+#define SIMDE_NATURAL_VECTOR_SIZE (256)
+#elif defined(SIMDE_X86_SSE_NATIVE) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \
+	defined(SIMDE_WASM_SIMD128_NATIVE) ||                                  \
+	defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
+#define SIMDE_NATURAL_VECTOR_SIZE (128)
+#endif
+
+#if !defined(SIMDE_NATURAL_VECTOR_SIZE)
+#define SIMDE_NATURAL_VECTOR_SIZE (0)
+#endif
+#endif
+
+#define SIMDE_NATURAL_VECTOR_SIZE_LE(x) \
+	((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE <= (x)))
+#define SIMDE_NATURAL_VECTOR_SIZE_GE(x) \
+	((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE >= (x)))
+
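As a hedged sketch of how these helpers are meant to be consumed (the function below is illustrative, not from SIMDe), a caller can gate a wide code path on the natural vector size and otherwise prefer the narrow or scalar path:

static int example_has_wide_vectors(void)
{
#if SIMDE_NATURAL_VECTOR_SIZE_GE(256)
	return 1; /* at least 256-bit vectors are native here */
#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128)
	return 0; /* 128-bit or smaller native vectors; prefer the narrow path */
#else
	return 0; /* SIMDE_NATURAL_VECTOR_SIZE is 0: expect serial code */
#endif
}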
+/* Native aliases */
+#if defined(SIMDE_ENABLE_NATIVE_ALIASES)
+#if !defined(SIMDE_X86_MMX_NATIVE)
+#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_SSE_NATIVE)
+#define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_SSE2_NATIVE)
+#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_SSE3_NATIVE)
+#define SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_SSSE3_NATIVE)
+#define SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_SSE4_1_NATIVE)
+#define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_SSE4_2_NATIVE)
+#define SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX_NATIVE)
+#define SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX2_NATIVE)
+#define SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_FMA_NATIVE)
+#define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX512VL_NATIVE)
+#define SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX512BW_NATIVE)
+#define SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX512DQ_NATIVE)
+#define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_AVX512CD_NATIVE)
+#define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_GFNI_NATIVE)
+#define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_PCLMUL_NATIVE)
+#define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE)
+#define SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES
+#endif
+
+#if !defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+#define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+#define SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES
+#endif
+#if !defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+#define SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES
+#endif
+#endif
+
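For illustration (a hypothetical consumer, not part of the patch): defining SIMDE_ENABLE_NATIVE_ALIASES before including a SIMDe header lets existing intrinsics-style code keep its `_mm_*` spellings on targets without the real instruction set, where the names resolve to the portable `simde_mm_*` implementations. The include path below is whatever a project uses to reach the SIMDe headers.

#define SIMDE_ENABLE_NATIVE_ALIASES
#include "x86/sse2.h" /* path is illustrative */

static simde__m128i example_add_epi32(simde__m128i a, simde__m128i b)
{
	/* On a target without SSE2 this alias expands to simde_mm_add_epi32(). */
	return _mm_add_epi32(a, b);
}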
+/* Are floating point values stored using IEEE 754?  Knowing
+ * this during preprocessing is a bit tricky, mostly because what
+ * we're curious about is how values are stored and not whether the
+ * implementation is fully conformant in terms of rounding, NaN
+ * handling, etc.
+ *
+ * For example, if you use -ffast-math or -Ofast on
+ * GCC or clang, IEEE 754 isn't strictly followed, so IEEE 754
+ * support is not advertised (by defining __STDC_IEC_559__).
+ *
+ * However, what we care about is whether it is safe to assume that
+ * floating point values are stored in IEEE 754 format, in which case
+ * we can provide faster implementations of some functions.
+ *
+ * Luckily every vaguely modern architecture I'm aware of uses IEEE 754,
+ * so we just assume IEEE 754 for now.  There is a test which verifies
+ * this; if that test fails somewhere please let us know and we'll add
+ * an exception for that platform.  Meanwhile, you can define
+ * SIMDE_NO_IEEE754_STORAGE. */
+#if !defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_NO_IEE754_STORAGE)
+#define SIMDE_IEEE754_STORAGE
 #endif
 #endif
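A small sketch of the kind of optimization this enables (illustrative only, not from the patch): when SIMDE_IEEE754_STORAGE is defined it is safe to inspect a float's bit pattern directly, for example to read the sign bit without a floating-point comparison.

#include <stdint.h>
#include <string.h>

static int example_signbit32(float v)
{
	uint32_t bits;
	memcpy(&bits, &v, sizeof(bits)); /* well-defined type pun */
	return (int)(bits >> 31);        /* IEEE 754 binary32: bit 31 is the sign */
}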
 
 
 #endif /* !defined(SIMDE_FEATURES_H) */
 #endif /* !defined(SIMDE_FEATURES_H) */

+ 493 - 49
libobs/util/simde/simde-math.h

@@ -34,6 +34,58 @@
 #include "hedley.h"
 #include "hedley.h"
 #include "simde-features.h"
 #include "simde-features.h"
 
 
+#include <stdint.h>
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+#include <arm_neon.h>
+#endif
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+
+/* SLEEF support
+ * https://sleef.org/
+ *
+ * If you include <sleef.h> prior to including SIMDe, SIMDe will use
+ * SLEEF.  You can also define SIMDE_MATH_SLEEF_ENABLE prior to
+ * including SIMDe to force the issue.
+ *
+ * Note that SLEEF does require linking to libsleef.
+ *
+ * By default, SIMDe will use the 1 ULP functions, but if you use
+ * SIMDE_ACCURACY_PREFERENCE of 0 we will use up to 4 ULP.  This is
+ * only the case for the simde_math_* functions; for code in other
+ * SIMDe headers which calls SLEEF directly we may use functions with
+ * greater error if the API we're implementing is less precise (for
+ * example, SVML guarantees 4 ULP, so we will generally use the 3.5
+ * ULP functions from SLEEF). */
+#if !defined(SIMDE_MATH_SLEEF_DISABLE)
+#if defined(__SLEEF_H__)
+#define SIMDE_MATH_SLEEF_ENABLE
+#endif
+#endif
+
+#if defined(SIMDE_MATH_SLEEF_ENABLE) && !defined(__SLEEF_H__)
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_
+#include <sleef.h>
+HEDLEY_DIAGNOSTIC_POP
+#endif
+
+#if defined(SIMDE_MATH_SLEEF_ENABLE) && defined(__SLEEF_H__)
+#if defined(SLEEF_VERSION_MAJOR)
+#define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch)              \
+	(HEDLEY_VERSION_ENCODE(SLEEF_VERSION_MAJOR, SLEEF_VERSION_MINOR, \
+			       SLEEF_VERSION_PATCHLEVEL) >=              \
+	 HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+#define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) \
+	(HEDLEY_VERSION_ENCODE(3, 0, 0) >=                  \
+	 HEDLEY_VERSION_ENCODE(major, minor, patch))
+#endif
+#else
+#define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (0)
+#endif
+
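A hedged usage sketch (not part of the patch): a consumer opts in by including <sleef.h> first or defining SIMDE_MATH_SLEEF_ENABLE, links against libsleef, and the simde_math_* wrappers then dispatch to SLEEF; the version-check macro can guard code that needs a newer SLEEF. The include path is illustrative.

#define SIMDE_MATH_SLEEF_ENABLE
#include "simde-math.h" /* pulls in <sleef.h> itself when enabled */

static float example_cosf(float x)
{
	/* Dispatches to Sleef_cosf_u10 or Sleef_cosf_u35 depending on
	 * SIMDE_ACCURACY_PREFERENCE. */
	return simde_math_cosf(x);
}

#if SIMDE_MATH_SLEEF_VERSION_CHECK(3, 0, 0)
/* Code that depends on SLEEF >= 3.0 could be guarded like this. */
#endif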
 #if defined(__has_builtin)
 #if defined(__has_builtin)
 #define SIMDE_MATH_BUILTIN_LIBM(func) __has_builtin(__builtin_##func)
 #define SIMDE_MATH_BUILTIN_LIBM(func) __has_builtin(__builtin_##func)
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
@@ -82,11 +134,35 @@ HEDLEY_DIAGNOSTIC_POP
 #endif
 #endif
 #endif
 #endif
 
 
-#if !defined(__cplusplus)
-/* If this is a problem we *might* be able to avoid including
-   * <complex.h> on some compilers (gcc, clang, and others which
-   * implement builtins like __builtin_cexpf).  If you don't have
-   * a <complex.h> please file an issue and we'll take a look. */
+/* Try to avoid including <complex> since it pulls in a *lot* of code. */
+#if HEDLEY_HAS_BUILTIN(__builtin_creal) ||   \
+	HEDLEY_GCC_VERSION_CHECK(4, 7, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_
+typedef __complex__ float simde_cfloat32;
+typedef __complex__ double simde_cfloat64;
+HEDLEY_DIAGNOSTIC_POP
+#define SIMDE_MATH_CMPLX(x, y)           \
+	(HEDLEY_STATIC_CAST(double, x) + \
+	 HEDLEY_STATIC_CAST(double, y) * (__extension__ 1.0j))
+#define SIMDE_MATH_CMPLXF(x, y)         \
+	(HEDLEY_STATIC_CAST(float, x) + \
+	 HEDLEY_STATIC_CAST(float, y) * (__extension__ 1.0fj))
+
+#if !defined(simde_math_creal)
+#define simde_math_crealf(z) __builtin_crealf(z)
+#endif
+#if !defined(simde_math_crealf)
+#define simde_math_creal(z) __builtin_creal(z)
+#endif
+#if !defined(simde_math_cimag)
+#define simde_math_cimagf(z) __builtin_cimagf(z)
+#endif
+#if !defined(simde_math_cimagf)
+#define simde_math_cimag(z) __builtin_cimag(z)
+#endif
+#elif !defined(__cplusplus)
 #include <complex.h>
 #include <complex.h>
 
 
 #if !defined(HEDLEY_MSVC_VERSION)
 #if !defined(HEDLEY_MSVC_VERSION)
@@ -96,20 +172,14 @@ typedef double _Complex simde_cfloat64;
 typedef _Fcomplex simde_cfloat32;
 typedef _Fcomplex simde_cfloat32;
 typedef _Dcomplex simde_cfloat64;
 typedef _Dcomplex simde_cfloat64;
 #endif
 #endif
-#if HEDLEY_HAS_BUILTIN(__builtin_complex) || \
-	HEDLEY_GCC_VERSION_CHECK(4, 7, 0) || \
-	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
-#define SIMDE_MATH_CMPLX(x, y) __builtin_complex((double)(x), (double)(y))
-#define SIMDE_MATH_CMPLXF(x, y) __builtin_complex((float)(x), (float)(y))
-#elif defined(HEDLEY_MSVC_VERSION)
+
+#if defined(HEDLEY_MSVC_VERSION)
 #define SIMDE_MATH_CMPLX(x, y) ((simde_cfloat64){(x), (y)})
 #define SIMDE_MATH_CMPLX(x, y) ((simde_cfloat64){(x), (y)})
 #define SIMDE_MATH_CMPLXF(x, y) ((simde_cfloat32){(x), (y)})
 #define SIMDE_MATH_CMPLXF(x, y) ((simde_cfloat32){(x), (y)})
 #elif defined(CMPLX) && defined(CMPLXF)
 #elif defined(CMPLX) && defined(CMPLXF)
 #define SIMDE_MATH_CMPLX(x, y) CMPLX(x, y)
 #define SIMDE_MATH_CMPLX(x, y) CMPLX(x, y)
 #define SIMDE_MATH_CMPLXF(x, y) CMPLXF(x, y)
 #define SIMDE_MATH_CMPLXF(x, y) CMPLXF(x, y)
 #else
 #else
-/* CMPLX / CMPLXF are in C99, but these seem to be necessary in
-     * some compilers that aren't even MSVC. */
 #define SIMDE_MATH_CMPLX(x, y) \
 #define SIMDE_MATH_CMPLX(x, y) \
 	(HEDLEY_STATIC_CAST(double, x) + HEDLEY_STATIC_CAST(double, y) * I)
 	(HEDLEY_STATIC_CAST(double, x) + HEDLEY_STATIC_CAST(double, y) * I)
 #define SIMDE_MATH_CMPLXF(x, y) \
 #define SIMDE_MATH_CMPLXF(x, y) \
@@ -117,38 +187,18 @@ typedef _Dcomplex simde_cfloat64;
 #endif
 #endif
 
 
 #if !defined(simde_math_creal)
 #if !defined(simde_math_creal)
-#if SIMDE_MATH_BUILTIN_LIBM(creal)
-#define simde_math_creal(z) __builtin_creal(z)
-#else
 #define simde_math_creal(z) creal(z)
 #define simde_math_creal(z) creal(z)
 #endif
 #endif
-#endif
-
 #if !defined(simde_math_crealf)
 #if !defined(simde_math_crealf)
-#if SIMDE_MATH_BUILTIN_LIBM(crealf)
-#define simde_math_crealf(z) __builtin_crealf(z)
-#else
 #define simde_math_crealf(z) crealf(z)
 #define simde_math_crealf(z) crealf(z)
 #endif
 #endif
-#endif
-
 #if !defined(simde_math_cimag)
 #if !defined(simde_math_cimag)
-#if SIMDE_MATH_BUILTIN_LIBM(cimag)
-#define simde_math_cimag(z) __builtin_cimag(z)
-#else
 #define simde_math_cimag(z) cimag(z)
 #define simde_math_cimag(z) cimag(z)
 #endif
 #endif
-#endif
-
 #if !defined(simde_math_cimagf)
 #if !defined(simde_math_cimagf)
-#if SIMDE_MATH_BUILTIN_LIBM(cimagf)
-#define simde_math_cimagf(z) __builtin_cimagf(z)
-#else
 #define simde_math_cimagf(z) cimagf(z)
 #define simde_math_cimagf(z) cimagf(z)
 #endif
 #endif
-#endif
 #else
 #else
-
 HEDLEY_DIAGNOSTIC_PUSH
 HEDLEY_DIAGNOSTIC_PUSH
 #if defined(HEDLEY_MSVC_VERSION)
 #if defined(HEDLEY_MSVC_VERSION)
 #pragma warning(disable : 4530)
 #pragma warning(disable : 4530)
@@ -240,6 +290,26 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 #endif
 #endif
 
 
+#if !defined(SIMDE_MATH_PI_OVER_180)
+#define SIMDE_MATH_PI_OVER_180 \
+	0.0174532925199432957692369076848861271344287188854172545609719144
+#endif
+
+#if !defined(SIMDE_MATH_PI_OVER_180F)
+#define SIMDE_MATH_PI_OVER_180F \
+	0.0174532925199432957692369076848861271344287188854172545609719144f
+#endif
+
+#if !defined(SIMDE_MATH_180_OVER_PI)
+#define SIMDE_MATH_180_OVER_PI \
+	57.295779513082320876798154814105170332405472466564321549160243861
+#endif
+
+#if !defined(SIMDE_MATH_180_OVER_PIF)
+#define SIMDE_MATH_180_OVER_PIF \
+	57.295779513082320876798154814105170332405472466564321549160243861f
+#endif
+
 #if !defined(SIMDE_MATH_FLT_MIN)
 #if !defined(SIMDE_MATH_FLT_MIN)
 #if defined(FLT_MIN)
 #if defined(FLT_MIN)
 #define SIMDE_MATH_FLT_MIN FLT_MIN
 #define SIMDE_MATH_FLT_MIN FLT_MIN
@@ -341,6 +411,36 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 #endif
 #endif
 
 
+/*** Manipulation functions ***/
+
+#if !defined(simde_math_nextafter)
+#if (HEDLEY_HAS_BUILTIN(__builtin_nextafter) && \
+     !defined(HEDLEY_IBM_VERSION)) ||           \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||    \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||    \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define simde_math_nextafter(x, y) __builtin_nextafter(x, y)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_nextafter(x, y) std::nextafter(x, y)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_nextafter(x, y) nextafter(x, y)
+#endif
+#endif
+
+#if !defined(simde_math_nextafterf)
+#if (HEDLEY_HAS_BUILTIN(__builtin_nextafterf) && \
+     !defined(HEDLEY_IBM_VERSION)) ||            \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||     \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||     \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define simde_math_nextafterf(x, y) __builtin_nextafterf(x, y)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_nextafterf(x, y) std::nextafter(x, y)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_nextafterf(x, y) nextafterf(x, y)
+#endif
+#endif
+
 /*** Functions from C99 ***/
 /*** Functions from C99 ***/
 
 
 #if !defined(simde_math_abs)
 #if !defined(simde_math_abs)
@@ -353,13 +453,13 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 #endif
 #endif
 
 
-#if !defined(simde_math_absf)
-#if SIMDE_MATH_BUILTIN_LIBM(absf)
-#define simde_math_absf(v) __builtin_absf(v)
+#if !defined(simde_math_fabsf)
+#if SIMDE_MATH_BUILTIN_LIBM(fabsf)
+#define simde_math_fabsf(v) __builtin_fabsf(v)
 #elif defined(SIMDE_MATH_HAVE_CMATH)
 #elif defined(SIMDE_MATH_HAVE_CMATH)
-#define simde_math_absf(v) std::abs(v)
+#define simde_math_fabsf(v) std::abs(v)
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
-#define simde_math_absf(v) absf(v)
+#define simde_math_fabsf(v) fabsf(v)
 #endif
 #endif
 #endif
 #endif
 
 
@@ -574,7 +674,13 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 
 
 #if !defined(simde_math_cosf)
 #if !defined(simde_math_cosf)
-#if SIMDE_MATH_BUILTIN_LIBM(cosf)
+#if defined(SIMDE_MATH_SLEEF_ENABLE)
+#if SIMDE_ACCURACY_PREFERENCE < 1
+#define simde_math_cosf(v) Sleef_cosf_u35(v)
+#else
+#define simde_math_cosf(v) Sleef_cosf_u10(v)
+#endif
+#elif SIMDE_MATH_BUILTIN_LIBM(cosf)
 #define simde_math_cosf(v) __builtin_cosf(v)
 #define simde_math_cosf(v) __builtin_cosf(v)
 #elif defined(SIMDE_MATH_HAVE_CMATH)
 #elif defined(SIMDE_MATH_HAVE_CMATH)
 #define simde_math_cosf(v) std::cos(v)
 #define simde_math_cosf(v) std::cos(v)
@@ -755,6 +861,46 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 #endif
 #endif
 
 
+#if !defined(simde_math_fma)
+#if SIMDE_MATH_BUILTIN_LIBM(fma)
+#define simde_math_fma(x, y, z) __builtin_fma(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_fma(x, y, z) std::fma(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_fma(x, y, z) fma(x, y, z)
+#endif
+#endif
+
+#if !defined(simde_math_fmaf)
+#if SIMDE_MATH_BUILTIN_LIBM(fmaf)
+#define simde_math_fmaf(x, y, z) __builtin_fmaf(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_fmaf(x, y, z) std::fma(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_fmaf(x, y, z) fmaf(x, y, z)
+#endif
+#endif
+
+#if !defined(simde_math_fmax)
+#if SIMDE_MATH_BUILTIN_LIBM(fmax)
+#define simde_math_fmax(x, y, z) __builtin_fmax(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_fmax(x, y, z) std::fmax(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_fmax(x, y, z) fmax(x, y, z)
+#endif
+#endif
+
+#if !defined(simde_math_fmaxf)
+#if SIMDE_MATH_BUILTIN_LIBM(fmaxf)
+#define simde_math_fmaxf(x, y, z) __builtin_fmaxf(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_fmaxf(x, y, z) std::fmax(x, y, z)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_fmaxf(x, y, z) fmaxf(x, y, z)
+#endif
+#endif
+
 #if !defined(simde_math_hypot)
 #if !defined(simde_math_hypot)
 #if SIMDE_MATH_BUILTIN_LIBM(hypot)
 #if SIMDE_MATH_BUILTIN_LIBM(hypot)
 #define simde_math_hypot(y, x) __builtin_hypot(y, x)
 #define simde_math_hypot(y, x) __builtin_hypot(y, x)
@@ -875,6 +1021,26 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 #endif
 #endif
 
 
+#if !defined(simde_math_modf)
+#if SIMDE_MATH_BUILTIN_LIBM(modf)
+#define simde_math_modf(x, iptr) __builtin_modf(x, iptr)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_modf(x, iptr) std::modf(x, iptr)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_modf(x, iptr) modf(x, iptr)
+#endif
+#endif
+
+#if !defined(simde_math_modff)
+#if SIMDE_MATH_BUILTIN_LIBM(modff)
+#define simde_math_modff(x, iptr) __builtin_modff(x, iptr)
+#elif defined(SIMDE_MATH_HAVE_CMATH)
+#define simde_math_modff(x, iptr) std::modf(x, iptr)
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
+#define simde_math_modff(x, iptr) modff(x, iptr)
+#endif
+#endif
+
 #if !defined(simde_math_nearbyint)
 #if !defined(simde_math_nearbyint)
 #if SIMDE_MATH_BUILTIN_LIBM(nearbyint)
 #if SIMDE_MATH_BUILTIN_LIBM(nearbyint)
 #define simde_math_nearbyint(v) __builtin_nearbyint(v)
 #define simde_math_nearbyint(v) __builtin_nearbyint(v)
@@ -955,6 +1121,44 @@ typedef std::complex<double> simde_cfloat64;
 #endif
 #endif
 #endif
 #endif
 
 
+#if !defined(simde_math_roundeven)
+#if HEDLEY_HAS_BUILTIN(__builtin_roundeven) || \
+	HEDLEY_GCC_VERSION_CHECK(10, 0, 0)
+#define simde_math_roundeven(v) __builtin_roundeven(v)
+#elif defined(simde_math_round) && defined(simde_math_fabs)
+static HEDLEY_INLINE double simde_math_roundeven(double v)
+{
+	double rounded = simde_math_round(v);
+	double diff = rounded - v;
+	if (HEDLEY_UNLIKELY(simde_math_fabs(diff) == 0.5) &&
+	    (HEDLEY_STATIC_CAST(int64_t, rounded) & 1)) {
+		rounded = v - diff;
+	}
+	return rounded;
+}
+#define simde_math_roundeven simde_math_roundeven
+#endif
+#endif
+
+#if !defined(simde_math_roundevenf)
+#if HEDLEY_HAS_BUILTIN(__builtin_roundevenf) || \
+	HEDLEY_GCC_VERSION_CHECK(10, 0, 0)
+#define simde_math_roundevenf(v) __builtin_roundevenf(v)
+#elif defined(simde_math_roundf) && defined(simde_math_fabsf)
+static HEDLEY_INLINE float simde_math_roundevenf(float v)
+{
+	float rounded = simde_math_roundf(v);
+	float diff = rounded - v;
+	if (HEDLEY_UNLIKELY(simde_math_fabsf(diff) == 0.5f) &&
+	    (HEDLEY_STATIC_CAST(int32_t, rounded) & 1)) {
+		rounded = v - diff;
+	}
+	return rounded;
+}
+#define simde_math_roundevenf simde_math_roundevenf
+#endif
+#endif
+
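To make the fallback's behaviour concrete, an illustrative check (hypothetical helper, not from the patch): halfway cases round toward the even neighbour rather than away from zero.

#include <assert.h>

static void example_roundeven_check(void)
{
	assert(simde_math_roundeven(2.5) == 2.0);    /* ties go to the even value */
	assert(simde_math_roundeven(3.5) == 4.0);
	assert(simde_math_roundevenf(-1.5f) == -2.0f);
}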
 #if !defined(simde_math_sin)
 #if !defined(simde_math_sin)
 #if SIMDE_MATH_BUILTIN_LIBM(sin)
 #if SIMDE_MATH_BUILTIN_LIBM(sin)
 #define simde_math_sin(v) __builtin_sin(v)
 #define simde_math_sin(v) __builtin_sin(v)
@@ -1078,20 +1282,20 @@ typedef std::complex<double> simde_cfloat64;
 /***  Complex functions ***/
 /***  Complex functions ***/
 
 
 #if !defined(simde_math_cexp)
 #if !defined(simde_math_cexp)
-#if defined(__cplusplus)
-#define simde_math_cexp(v) std::cexp(v)
-#elif SIMDE_MATH_BUILTIN_LIBM(cexp)
+#if SIMDE_MATH_BUILTIN_LIBM(cexp)
 #define simde_math_cexp(v) __builtin_cexp(v)
 #define simde_math_cexp(v) __builtin_cexp(v)
+#elif defined(__cplusplus)
+#define simde_math_cexp(v) std::cexp(v)
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
 #define simde_math_cexp(v) cexp(v)
 #define simde_math_cexp(v) cexp(v)
 #endif
 #endif
 #endif
 #endif
 
 
 #if !defined(simde_math_cexpf)
 #if !defined(simde_math_cexpf)
-#if defined(__cplusplus)
-#define simde_math_cexpf(v) std::exp(v)
-#elif SIMDE_MATH_BUILTIN_LIBM(cexpf)
+#if SIMDE_MATH_BUILTIN_LIBM(cexpf)
 #define simde_math_cexpf(v) __builtin_cexpf(v)
 #define simde_math_cexpf(v) __builtin_cexpf(v)
+#elif defined(__cplusplus)
+#define simde_math_cexpf(v) std::exp(v)
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
 #define simde_math_cexpf(v) cexpf(v)
 #define simde_math_cexpf(v) cexpf(v)
 #endif
 #endif
@@ -1393,22 +1597,262 @@ HEDLEY_DIAGNOSTIC_POP
 
 
 static HEDLEY_INLINE double simde_math_rad2deg(double radians)
 static HEDLEY_INLINE double simde_math_rad2deg(double radians)
 {
 {
-	return radians * (180.0 / SIMDE_MATH_PI);
+	return radians * SIMDE_MATH_180_OVER_PI;
 }
 }
 
 
 static HEDLEY_INLINE float simde_math_rad2degf(float radians)
 static HEDLEY_INLINE float simde_math_rad2degf(float radians)
 {
 {
-	return radians * (180.0f / SIMDE_MATH_PIF);
+	return radians * SIMDE_MATH_180_OVER_PIF;
 }
 }
 
 
 static HEDLEY_INLINE double simde_math_deg2rad(double degrees)
 static HEDLEY_INLINE double simde_math_deg2rad(double degrees)
 {
 {
-	return degrees * (SIMDE_MATH_PI / 180.0);
+	return degrees * SIMDE_MATH_PI_OVER_180;
 }
 }
 
 
 static HEDLEY_INLINE float simde_math_deg2radf(float degrees)
 static HEDLEY_INLINE float simde_math_deg2radf(float degrees)
 {
 {
-	return degrees * (SIMDE_MATH_PIF / 180.0f);
+	return degrees * (SIMDE_MATH_PI_OVER_180F);
 }
 }
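An illustrative sanity check (hypothetical, not from the patch): the precomputed constants give the same results as the old on-the-fly divisions, to within rounding.

#include <assert.h>
#include <math.h>

static void example_angle_check(void)
{
	assert(fabs(simde_math_deg2rad(180.0) - SIMDE_MATH_PI) < 1e-12);
	assert(fabs(simde_math_rad2deg(SIMDE_MATH_PI) - 180.0) < 1e-9);
}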
 
 
+/***  Saturated arithmetic ***/
+
+static HEDLEY_INLINE int8_t simde_math_adds_i8(int8_t a, int8_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqaddb_s8(a, b);
+#else
+	uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a);
+	uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b);
+	uint8_t r_ = a_ + b_;
+
+	a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT8_MAX;
+	if (HEDLEY_STATIC_CAST(int8_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int8_t, r_);
+#endif
+}
+
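The portable branch above uses a branch-light overflow trick: `(a_ >> 7) + INT8_MAX` becomes 127 when `a` is non-negative and 128 (the bit pattern of INT8_MIN) when it is negative, so on overflow the result saturates toward `a`'s sign; the test `(a_ ^ b_) | ~(b_ ^ r_)` is non-negative as a signed byte exactly when `a` and `b` share a sign that the wrapped sum `r_` does not, i.e. when the addition overflowed. An illustrative check (hypothetical helper, not from the patch):

#include <assert.h>
#include <stdint.h>

static void example_adds_i8_check(void)
{
	assert(simde_math_adds_i8(100, 100) == INT8_MAX);   /* saturates high */
	assert(simde_math_adds_i8(-100, -100) == INT8_MIN); /* saturates low */
	assert(simde_math_adds_i8(5, -3) == 2);             /* no overflow */
}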
+static HEDLEY_INLINE int16_t simde_math_adds_i16(int16_t a, int16_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqaddh_s16(a, b);
+#else
+	uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a);
+	uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b);
+	uint16_t r_ = a_ + b_;
+
+	a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT16_MAX;
+	if (HEDLEY_STATIC_CAST(int16_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int16_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE int32_t simde_math_adds_i32(int32_t a, int32_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqadds_s32(a, b);
+#else
+	uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a);
+	uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b);
+	uint32_t r_ = a_ + b_;
+
+	a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT32_MAX;
+	if (HEDLEY_STATIC_CAST(int32_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int32_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE int64_t simde_math_adds_i64(int64_t a, int64_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqaddd_s64(a, b);
+#else
+	uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a);
+	uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b);
+	uint64_t r_ = a_ + b_;
+
+	a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT64_MAX;
+	if (HEDLEY_STATIC_CAST(int64_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int64_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE uint8_t simde_math_adds_u8(uint8_t a, uint8_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqaddb_u8(a, b);
+#else
+	uint8_t r = a + b;
+	r |= -(r < a);
+	return r;
+#endif
+}
+
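The unsigned variants rely on a different trick: unsigned addition wraps, so the sum is smaller than an operand exactly when it overflowed, and `r |= -(r < a)` then ORs in an all-ones mask to pin the result at the type's maximum. An illustrative check (hypothetical helper, not from the patch):

#include <assert.h>
#include <stdint.h>

static void example_adds_u8_check(void)
{
	assert(simde_math_adds_u8(200, 100) == UINT8_MAX); /* wrapped, so saturated */
	assert(simde_math_adds_u8(10, 20) == 30);          /* plain addition */
}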
+static HEDLEY_INLINE uint16_t simde_math_adds_u16(uint16_t a, uint16_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqaddh_u16(a, b);
+#else
+	uint16_t r = a + b;
+	r |= -(r < a);
+	return r;
+#endif
+}
+
+static HEDLEY_INLINE uint32_t simde_math_adds_u32(uint32_t a, uint32_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqadds_u32(a, b);
+#else
+	uint32_t r = a + b;
+	r |= -(r < a);
+	return r;
+#endif
+}
+
+static HEDLEY_INLINE uint64_t simde_math_adds_u64(uint64_t a, uint64_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqaddd_u64(a, b);
+#else
+	uint64_t r = a + b;
+	r |= -(r < a);
+	return r;
+#endif
+}
+
+static HEDLEY_INLINE int8_t simde_math_subs_i8(int8_t a, int8_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubb_s8(a, b);
+#else
+	uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a);
+	uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b);
+	uint8_t r_ = a_ - b_;
+
+	a_ = (a_ >> 7) + INT8_MAX;
+
+	if (HEDLEY_STATIC_CAST(int8_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int8_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE int16_t simde_math_subs_i16(int16_t a, int16_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubh_s16(a, b);
+#else
+	uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a);
+	uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b);
+	uint16_t r_ = a_ - b_;
+
+	a_ = (a_ >> 15) + INT16_MAX;
+
+	if (HEDLEY_STATIC_CAST(int16_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int16_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE int32_t simde_math_subs_i32(int32_t a, int32_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubs_s32(a, b);
+#else
+	uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a);
+	uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b);
+	uint32_t r_ = a_ - b_;
+
+	a_ = (a_ >> 31) + INT32_MAX;
+
+	if (HEDLEY_STATIC_CAST(int32_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int32_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE int64_t simde_math_subs_i64(int64_t a, int64_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubd_s64(a, b);
+#else
+	uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a);
+	uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b);
+	uint64_t r_ = a_ - b_;
+
+	a_ = (a_ >> 63) + INT64_MAX;
+
+	if (HEDLEY_STATIC_CAST(int64_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
+		r_ = a_;
+	}
+
+	return HEDLEY_STATIC_CAST(int64_t, r_);
+#endif
+}
+
+static HEDLEY_INLINE uint8_t simde_math_subs_u8(uint8_t a, uint8_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubb_u8(a, b);
+#else
+	uint8_t res = a - b;
+	res &= -(res <= a);
+	return res;
+#endif
+}
+
+static HEDLEY_INLINE uint16_t simde_math_subs_u16(uint16_t a, uint16_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubh_u16(a, b);
+#else
+	uint16_t res = a - b;
+	res &= -(res <= a);
+	return res;
+#endif
+}
+
+static HEDLEY_INLINE uint32_t simde_math_subs_u32(uint32_t a, uint32_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubs_u32(a, b);
+#else
+	uint32_t res = a - b;
+	res &= -(res <= a);
+	return res;
+#endif
+}
+
+static HEDLEY_INLINE uint64_t simde_math_subs_u64(uint64_t a, uint64_t b)
+{
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	return vqsubd_u64(a, b);
+#else
+	uint64_t res = a - b;
+	res &= -(res <= a);
+	return res;
+#endif
+}
+
+HEDLEY_DIAGNOSTIC_POP
+
 #endif /* !defined(SIMDE_MATH_H) */
 #endif /* !defined(SIMDE_MATH_H) */

+ 236 - 50
libobs/util/simde/mmx.h → libobs/util/simde/x86/mmx.h

@@ -27,11 +27,7 @@
 #if !defined(SIMDE_X86_MMX_H)
 #if !defined(SIMDE_X86_MMX_H)
 #define SIMDE_X86_MMX_H
 #define SIMDE_X86_MMX_H
 
 
-#include "simde-common.h"
-
-#if !defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
-#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
-#endif
+#include "../simde-common.h"
 
 
 HEDLEY_DIAGNOSTIC_PUSH
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -46,6 +42,8 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 #include <mmintrin.h>
 #include <mmintrin.h>
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #include <arm_neon.h>
 #include <arm_neon.h>
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+#include <loongson-mmiintrin.h>
 #endif
 #endif
 
 
 #include <stdint.h>
 #include <stdint.h>
@@ -55,29 +53,29 @@ SIMDE_BEGIN_DECLS_
 
 
 typedef union {
 typedef union {
 #if defined(SIMDE_VECTOR_SUBSCRIPT)
 #if defined(SIMDE_VECTOR_SUBSCRIPT)
-	SIMDE_ALIGN(8) int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-	SIMDE_ALIGN(8) uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
-#else
-	SIMDE_ALIGN(8) int8_t i8[8];
-	SIMDE_ALIGN(8) int16_t i16[4];
-	SIMDE_ALIGN(8) int32_t i32[2];
-	SIMDE_ALIGN(8) int64_t i64[1];
-	SIMDE_ALIGN(8) uint8_t u8[8];
-	SIMDE_ALIGN(8) uint16_t u16[4];
-	SIMDE_ALIGN(8) uint32_t u32[2];
-	SIMDE_ALIGN(8) uint64_t u64[1];
-	SIMDE_ALIGN(8) simde_float32 f32[2];
-	SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)];
-	SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
+	SIMDE_ALIGN_TO_8 int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN_TO_8 uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+#else
+	SIMDE_ALIGN_TO_8 int8_t i8[8];
+	SIMDE_ALIGN_TO_8 int16_t i16[4];
+	SIMDE_ALIGN_TO_8 int32_t i32[2];
+	SIMDE_ALIGN_TO_8 int64_t i64[1];
+	SIMDE_ALIGN_TO_8 uint8_t u8[8];
+	SIMDE_ALIGN_TO_8 uint16_t u16[4];
+	SIMDE_ALIGN_TO_8 uint32_t u32[2];
+	SIMDE_ALIGN_TO_8 uint64_t u64[1];
+	SIMDE_ALIGN_TO_8 simde_float32 f32[2];
+	SIMDE_ALIGN_TO_8 int_fast32_t i32f[8 / sizeof(int_fast32_t)];
+	SIMDE_ALIGN_TO_8 uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
 #endif
 #endif
 
 
 #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
 #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
@@ -94,14 +92,26 @@ typedef union {
 	uint64x1_t neon_u64;
 	uint64x1_t neon_u64;
 	float32x2_t neon_f32;
 	float32x2_t neon_f32;
 #endif
 #endif
+#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	int8x8_t mmi_i8;
+	int16x4_t mmi_i16;
+	int32x2_t mmi_i32;
+	int64_t mmi_i64;
+	uint8x8_t mmi_u8;
+	uint16x4_t mmi_u16;
+	uint32x2_t mmi_u32;
+	uint64_t mmi_u64;
+#endif
 } simde__m64_private;
 } simde__m64_private;
 
 
 #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
 #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
 typedef __m64 simde__m64;
 typedef __m64 simde__m64;
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 typedef int32x2_t simde__m64;
 typedef int32x2_t simde__m64;
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+typedef int32x2_t simde__m64;
 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT)
-typedef int32_t simde__m64 SIMDE_ALIGN(8) SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+typedef int32_t simde__m64 SIMDE_ALIGN_TO_8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
 #else
 #else
 typedef simde__m64_private simde__m64;
 typedef simde__m64_private simde__m64;
 #endif
 #endif
@@ -169,6 +179,17 @@ SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
 SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
 #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
 
 
+#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, mmi, i8)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, mmi, i16)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, mmi, i32)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64_t, mmi, i64)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, mmi, u8)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, mmi, u16)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, mmi, u32)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64_t, mmi, u64)
+#endif /* defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) */
+
 SIMDE_FUNCTION_ATTRIBUTES
 SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
 {
 {
@@ -181,6 +202,8 @@ simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
 	r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = paddb_s(a_.mmi_i8, b_.mmi_i8);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i8 = a_.i8 + b_.i8;
 	r_.i8 = a_.i8 + b_.i8;
 #else
 #else
@@ -211,6 +234,8 @@ simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
 	r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = paddh_s(a_.mmi_i16, b_.mmi_i16);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i16 = a_.i16 + b_.i16;
 	r_.i16 = a_.i16 + b_.i16;
 #else
 #else
@@ -226,7 +251,7 @@ simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
 #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
 #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
 #define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
-#define _m_add_paddw(a, b) simde_mm_add_pi16(a, b)
+#define _m_paddw(a, b) simde_mm_add_pi16(a, b)
 #endif
 #endif
 
 
 SIMDE_FUNCTION_ATTRIBUTES
 SIMDE_FUNCTION_ATTRIBUTES
@@ -241,6 +266,8 @@ simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
 	r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = paddw_s(a_.mmi_i32, b_.mmi_i32);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i32 = a_.i32 + b_.i32;
 	r_.i32 = a_.i32 + b_.i32;
 #else
 #else
@@ -256,7 +283,7 @@ simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
 #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
 #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
 #define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
-#define _m_add_paddd(a, b) simde_mm_add_pi32(a, b)
+#define _m_paddd(a, b) simde_mm_add_pi32(a, b)
 #endif
 #endif
 
 
 SIMDE_FUNCTION_ATTRIBUTES
 SIMDE_FUNCTION_ATTRIBUTES
@@ -270,6 +297,8 @@ simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
 	r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = paddsb(a_.mmi_i8, b_.mmi_i8);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
@@ -291,7 +320,7 @@ simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
 #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
 #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
 #define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
-#define _m_add_paddsb(a, b) simde_mm_adds_pi8(a, b)
+#define _m_paddsb(a, b) simde_mm_adds_pi8(a, b)
 #endif
 #endif
 
 
 SIMDE_FUNCTION_ATTRIBUTES
 SIMDE_FUNCTION_ATTRIBUTES
@@ -306,6 +335,8 @@ simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
 	r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_u8 = paddusb(a_.mmi_u8, b_.mmi_u8);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
@@ -340,6 +371,8 @@ simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
 	r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = paddsh(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -376,6 +409,8 @@ simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
 	r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_u16 = paddush(a_.mmi_u16, b_.mmi_u16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -435,6 +470,8 @@ simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
 	r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = pandn_sw(a_.mmi_i32, b_.mmi_i32);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i32f = ~a_.i32f & b_.i32f;
 	r_.i32f = ~a_.i32f & b_.i32f;
 #else
 #else
@@ -461,7 +498,9 @@ simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
 	simde__m64_private b_ = simde__m64_to_private(b);
 	simde__m64_private b_ = simde__m64_to_private(b);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i8 = vreinterpret_s8_u8(vceq_s8(a_.neon_i8, b_.neon_i8));
+	r_.neon_u8 = vceq_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = pcmpeqb_s(a_.mmi_i8, b_.mmi_i8);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
@@ -489,7 +528,9 @@ simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
 	simde__m64_private b_ = simde__m64_to_private(b);
 	simde__m64_private b_ = simde__m64_to_private(b);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i16 = vreinterpret_s16_u16(vceq_s16(a_.neon_i16, b_.neon_i16));
+	r_.neon_u16 = vceq_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = pcmpeqh_s(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -517,7 +558,9 @@ simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
 	simde__m64_private b_ = simde__m64_to_private(b);
 	simde__m64_private b_ = simde__m64_to_private(b);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i32 = vreinterpret_s32_u32(vceq_s32(a_.neon_i32, b_.neon_i32));
+	r_.neon_u32 = vceq_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = pcmpeqw_s(a_.mmi_i32, b_.mmi_i32);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
@@ -545,7 +588,9 @@ simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
 	simde__m64_private b_ = simde__m64_to_private(b);
 	simde__m64_private b_ = simde__m64_to_private(b);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a_.neon_i8, b_.neon_i8));
+	r_.neon_u8 = vcgt_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = pcmpgtb_s(a_.mmi_i8, b_.mmi_i8);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
@@ -573,7 +618,9 @@ simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
 	simde__m64_private b_ = simde__m64_to_private(b);
 	simde__m64_private b_ = simde__m64_to_private(b);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a_.neon_i16, b_.neon_i16));
+	r_.neon_u16 = vcgt_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = pcmpgth_s(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -601,7 +648,9 @@ simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
 	simde__m64_private b_ = simde__m64_to_private(b);
 	simde__m64_private b_ = simde__m64_to_private(b);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a_.neon_i32, b_.neon_i32));
+	r_.neon_u32 = vcgt_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = pcmpgtw_s(a_.mmi_i32, b_.mmi_i32);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
@@ -628,7 +677,13 @@ int64_t simde_mm_cvtm64_si64(simde__m64 a)
 	simde__m64_private a_ = simde__m64_to_private(a);
 	simde__m64_private a_ = simde__m64_to_private(a);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	HEDLEY_DIAGNOSTIC_PUSH
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
+#pragma clang diagnostic ignored "-Wvector-conversion"
+#endif
 	return vget_lane_s64(a_.neon_i64, 0);
 	return vget_lane_s64(a_.neon_i64, 0);
+	HEDLEY_DIAGNOSTIC_POP
 #else
 #else
 	return a_.i64[0];
 	return a_.i64[0];
 #endif
 #endif
@@ -698,7 +753,13 @@ int32_t simde_mm_cvtsi64_si32(simde__m64 a)
 	simde__m64_private a_ = simde__m64_to_private(a);
 	simde__m64_private a_ = simde__m64_to_private(a);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	HEDLEY_DIAGNOSTIC_PUSH
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
+#pragma clang diagnostic ignored "-Wvector-conversion"
+#endif
 	return vget_lane_s32(a_.neon_i32, 0);
 	return vget_lane_s32(a_.neon_i32, 0);
+	HEDLEY_DIAGNOSTIC_POP
 #else
 #else
 	return a_.i32[0];
 	return a_.i32[0];
 #endif
 #endif
@@ -714,6 +775,7 @@ void simde_mm_empty(void)
 #if defined(SIMDE_X86_MMX_NATIVE)
 #if defined(SIMDE_X86_MMX_NATIVE)
 	_mm_empty();
 	_mm_empty();
 #else
 #else
+	/* noop */
 #endif
 #endif
 }
 }
 #define simde_m_empty() simde_mm_empty()
 #define simde_m_empty() simde_mm_empty()
@@ -735,6 +797,8 @@ simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
 	int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
 	r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
 	r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = pmaddhw(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
@@ -766,7 +830,9 @@ simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
 	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
 	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
 	const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
 	const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
 	const uint16x4_t t3 = vmovn_u32(t2);
 	const uint16x4_t t3 = vmovn_u32(t2);
-	r_.neon_i16 = vreinterpret_s16_u16(t3);
+	r_.neon_u16 = t3;
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = pmulhh(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -797,7 +863,9 @@ simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
 	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
 	const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
 	const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
-	r_.neon_i16 = vreinterpret_s16_u16(t2);
+	r_.neon_u16 = t2;
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = pmullh(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -854,6 +922,8 @@ simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
 	r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = packsshb(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -884,7 +954,7 @@ simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
 #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
 #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
 #define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
-#define _m_packsswb(a, b) mm_packs_pi16(a, b)
+#define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
 #endif
 #endif
 
 
 SIMDE_FUNCTION_ATTRIBUTES
 SIMDE_FUNCTION_ATTRIBUTES
@@ -899,6 +969,8 @@ simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
 	r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = packsswh(a_.mmi_i32, b_.mmi_i32);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
 	for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
@@ -950,7 +1022,8 @@ simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
 		vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
 		vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
 
 
 	/* Vector with all s16 elements set to UINT8_MAX */
 	/* Vector with all s16 elements set to UINT8_MAX */
-	const int16x8_t vmax = vmovq_n_s16((int16_t)UINT8_MAX);
+	const int16x8_t vmax =
+		vmovq_n_s16(HEDLEY_STATIC_CAST(int16_t, UINT8_MAX));
 
 
 	/* Elements which are within the acceptable range */
 	/* Elements which are within the acceptable range */
 	const int16x8_t le_max =
 	const int16x8_t le_max =
@@ -962,6 +1035,8 @@ simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
 	const int16x8_t values = vorrq_s16(le_max, gt_max);
 	const int16x8_t values = vorrq_s16(le_max, gt_max);
 
 
 	r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
 	r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_u8 = packushb(a_.mmi_u16, b_.mmi_u16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -1074,6 +1149,7 @@ simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
 	r_.i16[2] = e2;
 	r_.i16[2] = e2;
 	r_.i16[3] = e3;
 	r_.i16[3] = e3;
 #endif
 #endif
+
 	return simde__m64_from_private(r_);
 	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
@@ -1285,6 +1361,36 @@ simde__m64 simde_mm_setzero_si64(void)
 #define _mm_setzero_si64() simde_mm_setzero_si64()
 #define _mm_setzero_si64() simde_mm_setzero_si64()
 #endif
 #endif
 
 
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_load_si64(const void *mem_addr)
+{
+	simde__m64 r;
+	simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64),
+		     sizeof(r));
+	return r;
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_loadu_si64(const void *mem_addr)
+{
+	simde__m64 r;
+	simde_memcpy(&r, mem_addr, sizeof(r));
+	return r;
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+void simde_x_mm_store_si64(void *mem_addr, simde__m64 value)
+{
+	simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), &value,
+		     sizeof(value));
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+void simde_x_mm_storeu_si64(void *mem_addr, simde__m64 value)
+{
+	simde_memcpy(mem_addr, &value, sizeof(value));
+}
+
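These helpers are plain memcpy-based moves, so a round trip through memory preserves the value whatever the backing type of simde__m64 happens to be on the target. An illustrative sketch (hypothetical helper, not from the patch):

static int64_t example_roundtrip_si64(simde__m64 v)
{
	int64_t buf; /* suitably sized and aligned scratch storage */
	simde_x_mm_storeu_si64(&buf, v);
	return simde_mm_cvtm64_si64(simde_x_mm_loadu_si64(&buf));
}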
 SIMDE_FUNCTION_ATTRIBUTES
 SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_x_mm_setone_si64(void)
 simde__m64 simde_x_mm_setone_si64(void)
 {
 {
@@ -1302,8 +1408,22 @@ simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
 	simde__m64_private count_ = simde__m64_to_private(count);
 	simde__m64_private count_ = simde__m64_to_private(count);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)vget_lane_u64(
-						    count_.neon_u64, 0)));
+	HEDLEY_DIAGNOSTIC_PUSH
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
+#pragma clang diagnostic ignored "-Wvector-conversion"
+#endif
+	r_.neon_i16 =
+		vshl_s16(a_.neon_i16,
+			 vmov_n_s16(HEDLEY_STATIC_CAST(
+				 int16_t, vget_lane_u64(count_.neon_u64, 0))));
+	HEDLEY_DIAGNOSTIC_POP
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
+	defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
+		return simde_mm_setzero_si64();
+
+	r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 	r_.i16 = a_.i16 << count_.u64[0];
 	r_.i16 = a_.i16 << count_.u64[0];
 #else
 #else
@@ -1339,8 +1459,16 @@ simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
 	simde__m64_private count_ = simde__m64_to_private(count);
 	simde__m64_private count_ = simde__m64_to_private(count);
 
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)vget_lane_u64(
-						    count_.neon_u64, 0)));
+	HEDLEY_DIAGNOSTIC_PUSH
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
+#pragma clang diagnostic ignored "-Wvector-conversion"
+#endif
+	r_.neon_i32 =
+		vshl_s32(a_.neon_i32,
+			 vmov_n_s32(HEDLEY_STATIC_CAST(
+				 int32_t, vget_lane_u64(count_.neon_u64, 0))));
+	HEDLEY_DIAGNOSTIC_POP
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 	r_.i32 = a_.i32 << count_.u64[0];
 	r_.i32 = a_.i32 << count_.u64[0];
 #else
 #else
@@ -1373,10 +1501,19 @@ simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
 	simde__m64_private r_;
 	simde__m64_private r_;
 	simde__m64_private a_ = simde__m64_to_private(a);
 	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
+	defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
+	if (HEDLEY_UNLIKELY(count > 15))
+		return simde_mm_setzero_si64();
+
+	r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 	r_.i16 = a_.i16 << count;
 	r_.i16 = a_.i16 << count;
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
 	r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = psllh_s(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
@@ -1406,6 +1543,8 @@ simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
 	r_.i32 = a_.i32 << count;
 	r_.i32 = a_.i32 << count;
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
 	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = psllw_s(a_.mmi_i32, b_.mmi_i32);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
@@ -1490,7 +1629,13 @@ simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
 	simde__m64_private a_ = simde__m64_to_private(a);
 	simde__m64_private a_ = simde__m64_to_private(a);
 	simde__m64_private count_ = simde__m64_to_private(count);
 	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
+	defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
+		return simde_mm_setzero_si64();
+
+	r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, count_.u64[0]);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 	r_.u16 = a_.u16 >> count_.u64[0];
 	r_.u16 = a_.u16 >> count_.u64[0];
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u16 = vshl_u16(
 	r_.neon_u16 = vshl_u16(
@@ -1567,6 +1712,8 @@ simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
 	r_.u16 = a_.u16 >> count;
 	r_.u16 = a_.u16 >> count;
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
 	r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = psrlh_s(a_.mmi_i16, b_.mmi_i16);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
@@ -1596,6 +1743,8 @@ simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
 	r_.u32 = a_.u32 >> count;
 	r_.u32 = a_.u32 >> count;
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
 	r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = psrlw_s(a_.mmi_i32, b_.mmi_i32);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
@@ -1682,7 +1831,10 @@ simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
 	r_.i16 = a_.i16 >> (count & 0xff);
 	r_.i16 = a_.i16 >> (count & 0xff);
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
-  r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count));
+	r_.neon_i16 = vshl_s16(a_.neon_i16,
+			       vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = psrah_s(a_.mmi_i16, count);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -1713,6 +1865,8 @@ simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i32 = vshl_s32(a_.neon_i32,
 	r_.neon_i32 = vshl_s32(a_.neon_i32,
 			       vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
 			       vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = psraw_s(a_.mmi_i32, count);
 #else
 #else
 	SIMDE_VECTORIZE
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
 	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
@@ -1726,7 +1880,7 @@ simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
 #define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
 #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
 #define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
-#define _m_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
+#define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
 #endif
 
 SIMDE_FUNCTION_ATTRIBUTES
@@ -1813,6 +1967,8 @@ simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = psubb_s(a_.mmi_i8, b_.mmi_i8);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i8 = a_.i8 - b_.i8;
 #else
@@ -1843,6 +1999,8 @@ simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = psubh_s(a_.mmi_i16, b_.mmi_i16);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i16 = a_.i16 - b_.i16;
 #else
@@ -1873,6 +2031,8 @@ simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = psubw_s(a_.mmi_i32, b_.mmi_i32);
 #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
 	r_.i32 = a_.i32 - b_.i32;
 #else
@@ -1903,6 +2063,8 @@ simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = psubsb(a_.mmi_i8, b_.mmi_i8);
 #else
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
@@ -1938,6 +2100,8 @@ simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_u8 = psubusb(a_.mmi_u8, b_.mmi_u8);
 #else
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
@@ -1973,6 +2137,8 @@ simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = psubsh(a_.mmi_i16, b_.mmi_i16);
 #else
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
@@ -2008,6 +2174,8 @@ simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_u16 = psubush(a_.mmi_u16, b_.mmi_u16);
 #else
 	SIMDE_VECTORIZE
 	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
@@ -2046,6 +2214,8 @@ simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
 #elif defined(SIMDE_SHUFFLE_VECTOR_)
 	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14,
 				      7, 15);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8);
 #else
 	r_.i8[0] = a_.i8[4];
 	r_.i8[1] = b_.i8[4];
@@ -2078,6 +2248,8 @@ simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
 	r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = punpckhhw_s(a_.mmi_i16, b_.mmi_i16);
 #elif defined(SIMDE_SHUFFLE_VECTOR_)
 	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
 #else
@@ -2108,6 +2280,8 @@ simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
 	r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = punpckhwd_s(a_.mmi_i32, b_.mmi_i32);
 #elif defined(SIMDE_SHUFFLE_VECTOR_)
 	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
 #else
@@ -2136,6 +2310,8 @@ simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
 	r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i8 = punpcklbh_s(a_.mmi_i8, b_.mmi_i8);
 #elif defined(SIMDE_SHUFFLE_VECTOR_)
 	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3,
 				      11);
@@ -2171,6 +2347,8 @@ simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
 	r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i16 = punpcklhw_s(a_.mmi_i16, b_.mmi_i16);
 #elif defined(SIMDE_SHUFFLE_VECTOR_)
 	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
 #else
@@ -2201,6 +2379,8 @@ simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
 
 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
 	r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
+	r_.mmi_i32 = punpcklwd_s(a_.mmi_i32, b_.mmi_i32);
 #elif defined(SIMDE_SHUFFLE_VECTOR_)
 	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
 #else
@@ -2253,7 +2433,13 @@ int32_t simde_m_to_int(simde__m64 a)
 	simde__m64_private a_ = simde__m64_to_private(a);
 
 #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	HEDLEY_DIAGNOSTIC_PUSH
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
+	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
+#pragma clang diagnostic ignored "-Wvector-conversion"
+#endif
 	return vget_lane_s32(a_.neon_i32, 0);
+	HEDLEY_DIAGNOSTIC_POP
 #else
 	return a_.i32[0];
 #endif
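
Note on the shift hunks at the top of this file: the new SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT path bails out to simde_mm_setzero_si64() for counts above 15 because the MMX 16-bit right shift defines oversized counts as producing an all-zero result, while a per-lane C shift by 16 or more cannot be relied on to do that. A minimal scalar sketch of that semantic (illustration only; the helper name is hypothetical and not SIMDe API):

    #include <stdint.h>

    /* Per-lane logical right shift with MMX psrlw semantics:
     * counts greater than 15 clear every 16-bit lane. */
    static void srl_pi16_scalar(uint16_t r[4], const uint16_t a[4],
                                uint64_t count)
    {
        for (int i = 0; i < 4; i++)
            r[i] = (count > 15) ? 0 : (uint16_t)(a[i] >> count);
    }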

File diff suppressed because it is too large
+ 706 - 102
libobs/util/simde/x86/sse.h


File diff suppressed because it is too large
+ 542 - 178
libobs/util/simde/x86/sse2.h


+ 4 - 50
libobs/util/sse-intrin.h

@@ -17,55 +17,9 @@
 
 #pragma once
 
-#if NEEDS_SIMDE
-
-#include "simde/sse2.h"
-
-#define __m128 simde__m128
-#define _mm_setzero_ps simde_mm_setzero_ps
-#define _mm_set_ps simde_mm_set_ps
-#define _mm_add_ps simde_mm_add_ps
-#define _mm_sub_ps simde_mm_sub_ps
-#define _mm_mul_ps simde_mm_mul_ps
-#define _mm_div_ps simde_mm_div_ps
-#define _mm_set1_ps simde_mm_set1_ps
-#define _mm_movehl_ps simde_mm_movehl_ps
-#define _mm_shuffle_ps simde_mm_shuffle_ps
-#define _mm_min_ps simde_mm_min_ps
-#define _mm_max_ps simde_mm_max_ps
-#define _mm_movelh_ps simde_mm_movelh_ps
-#define _mm_unpacklo_ps simde_mm_unpacklo_ps
-#define _mm_unpackhi_ps simde_mm_unpackhi_ps
-#define _mm_load_ps simde_mm_load_ps
-#define _mm_andnot_ps simde_mm_andnot_ps
-#define _mm_storeu_ps simde_mm_storeu_ps
-#define _mm_loadu_ps simde_mm_loadu_ps
-
-#define __m128i simde__m128i
-#define _mm_set1_epi32 simde_mm_set1_epi32
-#define _mm_set1_epi16 simde_mm_set1_epi16
-#define _mm_load_si128 simde_mm_load_si128
-#define _mm_packs_epi32 simde_mm_packs_epi32
-#define _mm_srli_si128 simde_mm_srli_si128
-#define _mm_and_si128 simde_mm_and_si128
-#define _mm_packus_epi16 simde_mm_packus_epi16
-#define _mm_add_epi64 simde_mm_add_epi64
-#define _mm_shuffle_epi32 simde_mm_shuffle_epi32
-#define _mm_srai_epi16 simde_mm_srai_epi16
-#define _mm_shufflelo_epi16 simde_mm_shufflelo_epi16
-#define _mm_storeu_si128 simde_mm_storeu_si128
-
-#define _MM_SHUFFLE SIMDE_MM_SHUFFLE
-#define _MM_TRANSPOSE4_PS SIMDE_MM_TRANSPOSE4_PS
-
-#else
-
-#if defined(__aarch64__) || defined(__arm__)
-#include <arm_neon.h>
-#include "sse2neon.h"
-#else
-#include <xmmintrin.h>
+#if defined(_MSC_VER)
 #include <emmintrin.h>
-#endif
-
+#else
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse2.h"
 #endif
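
With the simplified header above, any file that includes util/sse-intrin.h keeps using the plain _mm_* names on every architecture: MSVC builds get the native <emmintrin.h>, and every other compiler gets SIMDe with SIMDE_ENABLE_NATIVE_ALIASES, which exposes the simde_mm_* implementations under their original Intel names. A hypothetical consumer, shown only to illustrate the usage pattern (not a file in this tree):

    #include "util/sse-intrin.h"

    /* Plain SSE spelling; on non-x86 targets these calls resolve to the
     * portable SIMDe implementations through the native-alias macros. */
    static void add4(float *dst, const float *a, const float *b)
    {
        __m128 va = _mm_loadu_ps(a);
        __m128 vb = _mm_loadu_ps(b);
        _mm_storeu_ps(dst, _mm_add_ps(va, vb));
    }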

+ 0 - 4207
libobs/util/sse2neon.h

@@ -1,4207 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
-//
-// This header file does not yet translate all of the SSE intrinsics.
-//
-// Contributors to this work are:
-//   John W. Ratcliff <[email protected]>
-//   Brandon Rowlett <[email protected]>
-//   Ken Fast <[email protected]>
-//   Eric van Beurden <[email protected]>
-//   Alexander Potylitsin <[email protected]>
-//   Hasindu Gamaarachchi <[email protected]>
-//   Jim Huang <[email protected]>
-//   Mark Cheng <[email protected]>
-//   Malcolm James MacLeod <[email protected]>
-//   Devin Hussey (easyaspi314) <[email protected]>
-//   Sebastian Pop <[email protected]>
-//   Developer Ecosystem Engineering <[email protected]>
-//   Danila Kutenin <[email protected]>
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#else
-#error "Macro name collisions may happen with unsupported compiler."
-#ifdef FORCE_INLINE
-#undef FORCE_INLINE
-#endif
-#define FORCE_INLINE static inline
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#include <arm_neon.h>
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if __GNUC__ <= 9
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
-	(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef float32x2_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On ARM 32-bit architecture, the float64x2_t is not supported.
-// The data type __m128d should be represented in a different way for related
-// intrinsic conversion.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x1_t __m64i;
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64i_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64i_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64i_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64i_s64(x) (x)
-
-#define vreinterpret_m64i_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64i_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64i_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64i_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_u8_m64i(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64i(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64i(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64i(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64i(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64i(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64i(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64i(x) (x)
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an _m128 struct
-// directly.  It is important to note that accessing the __m128 struct directly
-// is bad coding practice by Microsoft: @see:
-// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it.  Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// union intended to allow direct access to an __m128 variable using the names
-// that the MSVC compiler provides.  This union should really only be used when
-// trying to access the members of the vector as integer values.  GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause
-// a performance hit.  If it really is needed however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components.  The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
-	float m128_f32[4];    // as floats - DON'T USE. Added for convenience.
-	int8_t m128_i8[16];   // as signed 8-bit integers.
-	int16_t m128_i16[8];  // as signed 16-bit integers.
-	int32_t m128_i32[4];  // as signed 32-bit integers.
-	int64_t m128_i64[2];  // as signed 64-bit integers.
-	uint8_t m128_u8[16];  // as unsigned 8-bit integers.
-	uint16_t m128_u16[8]; // as unsigned 16-bit integers.
-	uint32_t m128_u32[4]; // as unsigned 32-bit integers.
-	uint64_t m128_u64[2]; // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *)&x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *)&x)->m128_u32[n])
-
-/* Backwards compatibility for compilers with lack of specific type support */
-
-// Older gcc does not define vld1q_u8_x4 type
-#if defined(__GNUC__) && !defined(__clang__)
-#if __GNUC__ <= 9
-FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
-{
-	uint8x16x4_t ret;
-	ret.val[0] = vld1q_u8(p + 0);
-	ret.val[1] = vld1q_u8(p + 16);
-	ret.val[2] = vld1q_u8(p + 32);
-	ret.val[3] = vld1q_u8(p + 48);
-	return ret;
-}
-#endif
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors cantain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- *                            unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- *                      than the type of the returned vector
- *
- * For example, _mm_setzero_ps. The _mm implies that the function returns
- * a 128-bit vector. The _ps at the end implies that the argument vectors
- * contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- *   // Set packed 8-bit integers
- *   // 128 bits, 16 chars, per 8 bits
- *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
- *                                  4, 5, 12, 13, 6, 7, 14, 15);
- *   // Shuffle packed 8-bit integers
- *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- *
- * Data (Number, Binary, Byte Index):
-    +------+------+-------------+------+------+-------------+
-    |      1      |      2      |      3      |      4      | Number
-    +------+------+------+------+------+------+------+------+
-    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
-    +------+------+------+------+------+------+------+------+
-    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
-    +------+------+------+------+------+------+------+------+
-
-    +------+------+------+------+------+------+------+------+
-    |      5      |      6      |      7      |      8      | Number
-    +------+------+------+------+------+------+------+------+
-    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
-    +------+------+------+------+------+------+------+------+
-    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
-    +------+------+------+------+------+------+------+------+
- * Index (Byte Index):
-    +------+------+------+------+------+------+------+------+
-    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
-    +------+------+------+------+------+------+------+------+
-
-    +------+------+------+------+------+------+------+------+
-    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
-    +------+------+------+------+------+------+------+------+
- * Result:
-    +------+------+------+------+------+------+------+------+
-    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
-    +------+------+------+------+------+------+------+------+
-    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
-    +------+------+------+------+------+------+------+------+
-    |     256     |      2      |      5      |      6      | Number
-    +------+------+------+------+------+------+------+------+
-
-    +------+------+------+------+------+------+------+------+
-    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
-    +------+------+------+------+------+------+------+------+
-    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
-    +------+------+------+------+------+------+------+------+
-    |      3      |      7      |      4      |      8      | Number
-    +------+------+------+------+------+------+-------------+
- */
-
-/* Set/get methods */
-
-/* Constants for use with _mm_prefetch.  */
-enum _mm_hint {
-	_MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
-	_MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
-	_MM_HINT_T1 = 2,   /* load data to L2 cache only */
-	_MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
-	_MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
-	_MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
-	_MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
-	_MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
-};
-
-// Loads one cache line of data from address p to a location closer to the
-// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
-FORCE_INLINE void _mm_prefetch(const void *p, int i)
-{
-	(void)i;
-	__builtin_prefetch(p);
-}
-
-// extracts the lower order floating point value from the parameter :
-// https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
-	return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Sets the 128-bit value to zero
-// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
-	return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Clears the four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
-	return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Sets the four single-precision, floating-point values to w.
-//
-//   r0 := r1 := r2 := r3 := w
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
-	return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to w.
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
-	return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs.
-// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
-	float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
-	return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss&expand=4901,4895,4901
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
-	float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
-	return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in
-// reverse order.
-// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
-	float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
-	return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Sets the 8 signed 16-bit integer values in reverse order.
-//
-// Return Value
-//   r0 := w0
-//   r1 := w1
-//   ...
-//   r7 := w7
-FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3,
-				    short w4, short w5, short w6, short w7)
-{
-	int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
-	return vreinterpretq_m128i_s16(vld1q_s16((int16_t *)data));
-}
-
-// Sets the 4 signed 32-bit integer values in reverse order
-// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
-	int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
-	return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Sets the 16 signed 8-bit integer values to b.
-//
-//   r0 := b
-//   r1 := b
-//   ...
-//   r15 := b
-//
-// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
-	return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Sets the 8 signed 16-bit integer values to w.
-//
-//   r0 := w
-//   r1 := w
-//   ...
-//   r7 := w
-//
-// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
-	return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Sets the 16 signed 8-bit integer values.
-// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-FORCE_INLINE __m128i
-_mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12,
-	     signed char b11, signed char b10, signed char b9, signed char b8,
-	     signed char b7, signed char b6, signed char b5, signed char b4,
-	     signed char b3, signed char b2, signed char b1, signed char b0)
-{
-	int8_t ALIGN_STRUCT(16)
-		data[16] = {(int8_t)b0,  (int8_t)b1,  (int8_t)b2,  (int8_t)b3,
-			    (int8_t)b4,  (int8_t)b5,  (int8_t)b6,  (int8_t)b7,
-			    (int8_t)b8,  (int8_t)b9,  (int8_t)b10, (int8_t)b11,
-			    (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
-	return (__m128i)vld1q_s8(data);
-}
-
-// Sets the 8 signed 16-bit integer values.
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4,
-				   short i3, short i2, short i1, short i0)
-{
-	int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
-	return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Sets the 16 signed 8-bit integer values in reverse order.
-// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi8(
-	signed char b0, signed char b1, signed char b2, signed char b3,
-	signed char b4, signed char b5, signed char b6, signed char b7,
-	signed char b8, signed char b9, signed char b10, signed char b11,
-	signed char b12, signed char b13, signed char b14, signed char b15)
-{
-	int8_t ALIGN_STRUCT(16)
-		data[16] = {(int8_t)b0,  (int8_t)b1,  (int8_t)b2,  (int8_t)b3,
-			    (int8_t)b4,  (int8_t)b5,  (int8_t)b6,  (int8_t)b7,
-			    (int8_t)b8,  (int8_t)b9,  (int8_t)b10, (int8_t)b11,
-			    (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
-	return (__m128i)vld1q_s8(data);
-}
-
-// Sets the 4 signed 32-bit integer values to i.
-//
-//   r0 := i
-//   r1 := i
-//   r2 := i
-//   r3 := I
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
-	return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Sets the 2 signed 64-bit integer values to i.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-FORCE_INLINE __m128i _mm_set1_epi64(int64_t _i)
-{
-	return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Sets the 2 signed 64-bit integer values to i.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x&expand=4961
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
-	return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Sets the 4 signed 32-bit integer values.
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
-	int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
-	return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
-	int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
-	return vreinterpretq_m128i_s64(vld1q_s64(data));
-}
-
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
-	vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
-	vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four 32-bit integer values as (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
-	vst1q_s32((int32_t *)p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores four 32-bit integer values as (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
-	vst1q_s32((int32_t *)p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores the lower single - precision, floating - point value.
-// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
-	vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
-// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
-	uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
-	uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
-	*a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
-}
-
-// Stores the lower two single-precision floating point values of a to the
-// address p.
-//
-//   *p0 := a0
-//   *p1 := a1
-//
-// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
-	*p = vget_low_f32(a);
-}
-
-// Stores the upper two single-precision, floating-point values of a to the
-// address p.
-//
-//   *p0 := a2
-//   *p1 := a3
-//
-// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
-	*p = vget_high_f32(a);
-}
-
-// Loads a single single-precision, floating-point value, copying it into all
-// four words
-// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
-	return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-#define _mm_load_ps1 _mm_load1_ps
-
-// Sets the lower two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the upper two values are passed
-// through from a.
-//
-// Return Value
-//   r0 := *p0
-//   r1 := *p1
-//   r2 := a2
-//   r3 := a3
-//
-// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
-	return vreinterpretq_m128_f32(
-		vcombine_f32(vld1_f32((const float32_t *)p), vget_high_f32(a)));
-}
-
-// Sets the upper two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the lower two values are passed
-// through from a.
-//
-//   r0 := a0
-//   r1 := a1
-//   r2 := *p0
-//   r3 := *p1
-//
-// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
-	return vreinterpretq_m128_f32(
-		vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *)p)));
-}
-
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
-	return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
-	// for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
-	// equivalent for neon
-	return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads a double-precision, floating-point value.
-// The upper double-precision, floating-point is set to zero. The address p does
-// not need to be 16-byte aligned.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/574w9fdd(v%3dvs.100)
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
-	return vsetq_lane_f64(*p, vdupq_n_f64(0), 0);
-#else
-	const float *fp = (const float *)p;
-	float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
-	return vld1q_f32(data);
-#endif
-}
-
-// Loads an single - precision, floating - point value into the low word and
-// clears the upper three words.
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
-	return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
-	/* Load the lower 64 bits of the value pointed to by p into the
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
-     */
-	return vreinterpretq_m128i_s32(
-		vcombine_s32(vld1_s32((int32_t const *)p), vcreate_s32(0)));
-}
-
-/* Logic/Binary operations */
-
-// Compares for inequality.
-// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_u32(vmvnq_u32(vceqq_f32(
-		vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point
-// values of a and b.
-//
-//   r0 := ~a0 & b0
-//   r1 := ~a1 & b1
-//   r2 := ~a2 & b2
-//   r3 := ~a3 & b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_s32(
-		vbicq_s32(vreinterpretq_s32_m128(b),
-			  vreinterpretq_s32_m128(a))); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
-// 128-bit value in a.
-//
-//   r := (~a) & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(
-		vbicq_s32(vreinterpretq_s32_m128i(b),
-			  vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
-// b.
-//
-//   r := a & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vandq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the bitwise AND of the four single-precision, floating-point values
-// of a and b.
-//
-//   r0 := a0 & b0
-//   r1 := a1 & b1
-//   r2 := a2 & b2
-//   r3 := a3 & b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_s32(vandq_s32(vreinterpretq_s32_m128(a),
-						vreinterpretq_s32_m128(b)));
-}
-
-// Computes the bitwise OR of the four single-precision, floating-point values
-// of a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_s32(vorrq_s32(vreinterpretq_s32_m128(a),
-						vreinterpretq_s32_m128(b)));
-}
-
-// Computes bitwise EXOR (exclusive-or) of the four single-precision,
-// floating-point values of a and b.
-// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_s32(veorq_s32(vreinterpretq_s32_m128(a),
-						vreinterpretq_s32_m128(b)));
-}
-
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-//
-//   r := a | b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vorrq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
-// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(veorq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Moves the upper two values of B into the lower two values of A.
-//
-//   r3 := a3
-//   r2 := a2
-//   r1 := b3
-//   r0 := b2
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
-{
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
-	return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-}
-
-// Moves the lower two values of B into the upper two values of A.
-//
-//   r3 := b1
-//   r2 := b0
-//   r1 := a1
-//   r0 := a0
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
-	return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
-	return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
-	return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in high
-// end of result takes the higher two 32 bit values from b and swaps them and
-// places in low end of result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
-	float32x2_t a21 = vget_high_f32(vextq_f32(
-		vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-	float32x2_t b03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(b),
-						 vreinterpretq_f32_m128(b), 3));
-	return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
-	float32x2_t a03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(a),
-						 vreinterpretq_f32_m128(a), 3));
-	float32x2_t b21 = vget_high_f32(vextq_f32(
-		vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-	return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
-// high
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
-	float32x2_t a11 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-	float32x2_t b00 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
-	float32x2_t a22 =
-		vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-	float32x2_t b00 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
-	float32x2_t a00 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-	float32x2_t b22 =
-		vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
-	float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-	float32x2_t a22 =
-		vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-	float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
-	float32x2_t a33 =
-		vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-	float32x2_t b11 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-	float32x2_t b00 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32_t b2 = vgetq_lane_f32(b, 2);
-	float32x2_t b00 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32_t b2 = vgetq_lane_f32(b, 2);
-	float32x2_t b00 =
-		vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// NEON does not support a general purpose permute intrinsic
-// Selects four specific single-precision, floating-point values from a and b,
-// based on the mask i.
-// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#if 0 /* C version */
-FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a,
-                                           __m128 b,
-                                           __constrange(0, 255) int imm)
-{
-    __m128 ret;
-    ret[0] = a[imm & 0x3];
-    ret[1] = a[(imm >> 2) & 0x3];
-    ret[2] = b[(imm >> 4) & 0x03];
-    ret[3] = b[(imm >> 6) & 0x03];
-    return ret;
-}
-#endif
-#define _mm_shuffle_ps_default(a, b, imm)                                      \
-	__extension__({                                                        \
-		float32x4_t ret;                                               \
-		ret = vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a),    \
-						 (imm) & (0x3)));              \
-		ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), \
-						    ((imm) >> 2) & 0x3),       \
-				     ret, 1);                                  \
-		ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), \
-						    ((imm) >> 4) & 0x3),       \
-				     ret, 2);                                  \
-		ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), \
-						    ((imm) >> 6) & 0x3),       \
-				     ret, 3);                                  \
-		vreinterpretq_m128_f32(ret);                                   \
-	})
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_ps(a, b, imm)                                            \
-	__extension__({                                                      \
-		float32x4_t _input1 = vreinterpretq_f32_m128(a);             \
-		float32x4_t _input2 = vreinterpretq_f32_m128(b);             \
-		float32x4_t _shuf = __builtin_shufflevector(                 \
-			_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-			(((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
-		vreinterpretq_m128_f32(_shuf);                               \
-	})
-#else // generic
-#define _mm_shuffle_ps(a, b, imm)                                      \
-	__extension__({                                                \
-		__m128 ret;                                            \
-		switch (imm) {                                         \
-		case _MM_SHUFFLE(1, 0, 3, 2):                          \
-			ret = _mm_shuffle_ps_1032((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(2, 3, 0, 1):                          \
-			ret = _mm_shuffle_ps_2301((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(0, 3, 2, 1):                          \
-			ret = _mm_shuffle_ps_0321((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(2, 1, 0, 3):                          \
-			ret = _mm_shuffle_ps_2103((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(1, 0, 1, 0):                          \
-			ret = _mm_movelh_ps((a), (b));                 \
-			break;                                         \
-		case _MM_SHUFFLE(1, 0, 0, 1):                          \
-			ret = _mm_shuffle_ps_1001((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(0, 1, 0, 1):                          \
-			ret = _mm_shuffle_ps_0101((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(3, 2, 1, 0):                          \
-			ret = _mm_shuffle_ps_3210((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(0, 0, 1, 1):                          \
-			ret = _mm_shuffle_ps_0011((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(0, 0, 2, 2):                          \
-			ret = _mm_shuffle_ps_0022((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(2, 2, 0, 0):                          \
-			ret = _mm_shuffle_ps_2200((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(3, 2, 0, 2):                          \
-			ret = _mm_shuffle_ps_3202((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(3, 2, 3, 2):                          \
-			ret = _mm_movehl_ps((b), (a));                 \
-			break;                                         \
-		case _MM_SHUFFLE(1, 1, 3, 3):                          \
-			ret = _mm_shuffle_ps_1133((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(2, 0, 1, 0):                          \
-			ret = _mm_shuffle_ps_2010((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(2, 0, 0, 1):                          \
-			ret = _mm_shuffle_ps_2001((a), (b));           \
-			break;                                         \
-		case _MM_SHUFFLE(2, 0, 3, 2):                          \
-			ret = _mm_shuffle_ps_2032((a), (b));           \
-			break;                                         \
-		default:                                               \
-			ret = _mm_shuffle_ps_default((a), (b), (imm)); \
-			break;                                         \
-		}                                                      \
-		ret;                                                   \
-	})
-#endif
-
-// Takes the upper 64 bits of a and places it in the low end of the result
-// Takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
-	int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// takes the lower two 32-bit values from a and swaps them and places in low end
-// of result takes the higher two 32 bit values from a and swaps them and places
-// in high end of result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-	return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// rotates the least significant 32 bits into the most signficant 32 bits, and
-// shifts the rest down
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
-	return vreinterpretq_m128i_s32(vextq_s32(
-		vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// rotates the most significant 32 bits into the least signficant 32 bits, and
-// shifts the rest up
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
-	return vreinterpretq_m128i_s32(vextq_s32(
-		vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// gets the lower 64 bits of a, and places it in the upper 64 bits
-// gets the lower 64 bits of a and places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
-	int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
-// places it in the lower 64 bits
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
-	int32x2_t a11 =
-		vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-	int32x2_t a22 =
-		vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-	return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
-	int32x2_t a22 =
-		vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
-	int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t a33 =
-		vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-	return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8&expand=5146
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
-	int8x16_t tbl = vreinterpretq_s8_m128i(a);  // input a
-	uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
-	uint8x16_t idx_masked =
-		vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
-	int8x16_t ret;
-	// %e and %f represent the even and odd D registers
-	// respectively.
-	__asm__ __volatile__("vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
-			     "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
-			     : [ret] "=&w"(ret)
-			     : [tbl] "w"(tbl), [idx] "w"(idx_masked));
-	return vreinterpretq_m128i_s8(ret);
-#else
-	// Generic ARMv7 fallback: split the table into two 64-bit halves and
-	// use a two-table lookup (vtbl2).
-	int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
-	return vreinterpretq_m128i_s8(
-		vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
-			    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
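For reference, the byte-shuffle behaviour that the table lookups above emulate can be sketched in plain scalar C. This is an illustrative sketch only; the pshufb_ref helper is hypothetical and not part of this header.

#include <stdint.h>

/* Scalar sketch of the byte shuffle: a control byte with its high bit set
 * zeroes the destination byte, otherwise its low 4 bits index into a. */
static void pshufb_ref(const uint8_t a[16], const uint8_t b[16], uint8_t r[16])
{
	for (int i = 0; i < 16; i++)
		r[i] = (b[i] & 0x80) ? 0 : a[b[i] & 0x0F];
}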
-
-#if 0 /* C version */
-FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a,
-                                               __constrange(0, 255) int imm)
-{
-    __m128i ret;
-    ret[0] = a[imm & 0x3];
-    ret[1] = a[(imm >> 2) & 0x3];
-    ret[2] = a[(imm >> 4) & 0x03];
-    ret[3] = a[(imm >> 6) & 0x03];
-    return ret;
-}
-#endif
-#define _mm_shuffle_epi32_default(a, imm)                                    \
-	__extension__({                                                      \
-		int32x4_t ret;                                               \
-		ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
-						 (imm) & (0x3)));            \
-		ret = vsetq_lane_s32(                                        \
-			vgetq_lane_s32(vreinterpretq_s32_m128i(a),           \
-				       ((imm) >> 2) & 0x3),                  \
-			ret, 1);                                             \
-		ret = vsetq_lane_s32(                                        \
-			vgetq_lane_s32(vreinterpretq_s32_m128i(a),           \
-				       ((imm) >> 4) & 0x3),                  \
-			ret, 2);                                             \
-		ret = vsetq_lane_s32(                                        \
-			vgetq_lane_s32(vreinterpretq_s32_m128i(a),           \
-				       ((imm) >> 6) & 0x3),                  \
-			ret, 3);                                             \
-		vreinterpretq_m128i_s32(ret);                                \
-	})
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-	__extension__({                                                      \
-		vreinterpretq_m128i_s32(                                     \
-			vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-	})
-#else
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
-	__extension__({                                                      \
-		vreinterpretq_m128i_s32(vdupq_n_s32(                         \
-			vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-	})
-#endif
-
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-//                                        __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_epi32(a, imm)                                          \
-	__extension__({                                                    \
-		int32x4_t _input = vreinterpretq_s32_m128i(a);             \
-		int32x4_t _shuf = __builtin_shufflevector(                 \
-			_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
-			((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
-		vreinterpretq_m128i_s32(_shuf);                            \
-	})
-#else // generic
-#define _mm_shuffle_epi32(a, imm)                                    \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		switch (imm) {                                       \
-		case _MM_SHUFFLE(1, 0, 3, 2):                        \
-			ret = _mm_shuffle_epi_1032((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(2, 3, 0, 1):                        \
-			ret = _mm_shuffle_epi_2301((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(0, 3, 2, 1):                        \
-			ret = _mm_shuffle_epi_0321((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(2, 1, 0, 3):                        \
-			ret = _mm_shuffle_epi_2103((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(1, 0, 1, 0):                        \
-			ret = _mm_shuffle_epi_1010((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(1, 0, 0, 1):                        \
-			ret = _mm_shuffle_epi_1001((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(0, 1, 0, 1):                        \
-			ret = _mm_shuffle_epi_0101((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(2, 2, 1, 1):                        \
-			ret = _mm_shuffle_epi_2211((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(0, 1, 2, 2):                        \
-			ret = _mm_shuffle_epi_0122((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(3, 3, 3, 2):                        \
-			ret = _mm_shuffle_epi_3332((a));             \
-			break;                                       \
-		case _MM_SHUFFLE(0, 0, 0, 0):                        \
-			ret = _mm_shuffle_epi32_splat((a), 0);       \
-			break;                                       \
-		case _MM_SHUFFLE(1, 1, 1, 1):                        \
-			ret = _mm_shuffle_epi32_splat((a), 1);       \
-			break;                                       \
-		case _MM_SHUFFLE(2, 2, 2, 2):                        \
-			ret = _mm_shuffle_epi32_splat((a), 2);       \
-			break;                                       \
-		case _MM_SHUFFLE(3, 3, 3, 3):                        \
-			ret = _mm_shuffle_epi32_splat((a), 3);       \
-			break;                                       \
-		default:                                             \
-			ret = _mm_shuffle_epi32_default((a), (imm)); \
-			break;                                       \
-		}                                                    \
-		ret;                                                 \
-	})
-#endif
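The imm argument above is normally built with _MM_SHUFFLE(z, y, x, w), which packs four 2-bit lane selectors with the lowest selector in the lowest bits. A minimal scalar sketch of the selection follows, assuming nothing beyond standard C; the shuffle_epi32_ref helper is illustrative only.

#include <stdint.h>

/* _MM_SHUFFLE(z, y, x, w) == (z << 6) | (y << 4) | (x << 2) | w, so element
 * i of the result is a[(imm >> (2 * i)) & 0x3], matching the default macro. */
static void shuffle_epi32_ref(const int32_t a[4], int imm, int32_t r[4])
{
	for (int i = 0; i < 4; i++)
		r[i] = a[(imm >> (2 * i)) & 0x3];
}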
-
-// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
-// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
-//                                                   __constrange(0,255) int
-//                                                   imm)
-#define _mm_shufflelo_epi16_function(a, imm)                                 \
-	__extension__({                                                      \
-		int16x8_t ret = vreinterpretq_s16_m128i(a);                  \
-		int16x4_t lowBits = vget_low_s16(ret);                       \
-		ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)),  \
-				     ret, 0);                                \
-		ret = vsetq_lane_s16(                                        \
-			vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, 1); \
-		ret = vsetq_lane_s16(                                        \
-			vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, 2); \
-		ret = vsetq_lane_s16(                                        \
-			vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, 3); \
-		vreinterpretq_m128i_s16(ret);                                \
-	})
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflelo_epi16(a, imm)                                            \
-	__extension__({                                                        \
-		int16x8_t _input = vreinterpretq_s16_m128i(a);                 \
-		int16x8_t _shuf = __builtin_shufflevector(                     \
-			_input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
-			(((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6,   \
-			7);                                                    \
-		vreinterpretq_m128i_s16(_shuf);                                \
-	})
-#else // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
-//                                                   __constrange(0,255) int
-//                                                   imm)
-#define _mm_shufflehi_epi16_function(a, imm)                                  \
-	__extension__({                                                       \
-		int16x8_t ret = vreinterpretq_s16_m128i(a);                   \
-		int16x4_t highBits = vget_high_s16(ret);                      \
-		ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)),  \
-				     ret, 4);                                 \
-		ret = vsetq_lane_s16(                                         \
-			vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \
-		ret = vsetq_lane_s16(                                         \
-			vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \
-		ret = vsetq_lane_s16(                                         \
-			vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \
-		vreinterpretq_m128i_s16(ret);                                 \
-	})
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-//                                          __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflehi_epi16(a, imm)                                         \
-	__extension__({                                                     \
-		int16x8_t _input = vreinterpretq_s16_m128i(a);              \
-		int16x8_t _shuf = __builtin_shufflevector(                  \
-			_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
-			(((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
-			(((imm) >> 6) & 0x3) + 4);                          \
-		vreinterpretq_m128i_s16(_shuf);                             \
-	})
-#else // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-//
-//   FOR j := 0 to 7
-//       i := j*16
-//       IF imm8[j]
-//           dst[i+15:i] := b[i+15:i]
-//       ELSE
-//           dst[i+15:i] := a[i+15:i]
-//       FI
-//   ENDFOR
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-//                                      __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm)                                     \
-	__extension__({                                                \
-		const uint16_t _mask[8] = {                            \
-			((imm) & (1 << 0)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 1)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 2)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 3)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 4)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 5)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 6)) ? 0xFFFF : 0x0000,          \
-			((imm) & (1 << 7)) ? 0xFFFF : 0x0000};         \
-		uint16x8_t _mask_vec = vld1q_u16(_mask);               \
-		uint16x8_t _a = vreinterpretq_u16_m128i(a);            \
-		uint16x8_t _b = vreinterpretq_u16_m128i(b);            \
-		vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
-	})
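As a cross-check of the mask table above, the immediate blend can be written as a scalar loop over the control bits. Illustrative sketch only; blend_epi16_ref is not part of the header.

#include <stdint.h>

/* Bit j of imm8 selects word j from b, otherwise word j comes from a. */
static void blend_epi16_ref(const int16_t a[8], const int16_t b[8], int imm8,
                            int16_t r[8])
{
	for (int j = 0; j < 8; j++)
		r[j] = (imm8 & (1 << j)) ? b[j] : a[j];
}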
-
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-//
-//   FOR j := 0 to 15
-//       i := j*8
-//       IF mask[i+7]
-//           dst[i+7:i] := b[i+7:i]
-//       ELSE
-//           dst[i+7:i] := a[i+7:i]
-//       FI
-//   ENDFOR
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
-	// Use a signed shift right to create a mask with the sign bit
-	uint8x16_t mask = vreinterpretq_u8_s8(
-		vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
-	uint8x16_t a = vreinterpretq_u8_m128i(_a);
-	uint8x16_t b = vreinterpretq_u8_m128i(_b);
-	return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
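The variable blend differs from the immediate form only in where the selector lives: the sign bit of each mask byte picks the source. A scalar sketch; blendv_epi8_ref is illustrative and not part of the header.

#include <stdint.h>

/* The sign bit of each mask byte selects the byte from b, otherwise a. */
static void blendv_epi8_ref(const uint8_t a[16], const uint8_t b[16],
                            const uint8_t mask[16], uint8_t r[16])
{
	for (int i = 0; i < 16; i++)
		r[i] = (mask[i] & 0x80) ? b[i] : a[i];
}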
-
-/* Shifts */
-
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-//   r0 := a0 >> count
-//   r1 := a1 >> count
-//   r2 := a2 >> count
-//   r3 := a3 >> count
-FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, int count)
-{
-	return (__m128i)vshlq_s32((int32x4_t)a, vdupq_n_s32(-count));
-}
-
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-//   r0 := a0 >> count
-//   r1 := a1 >> count
-//   ...
-//   r7 := a7 >> count
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int count)
-{
-	return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(-count));
-}
-
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
-//
-//   r0 := a0 << count
-//   r1 := a1 << count
-//   ...
-//   r7 := a7 << count
-//
-// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm)                                       \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		if ((imm) <= 0) {                                    \
-			ret = a;                                     \
		} else if ((imm) > 15) {                             \
-			ret = _mm_setzero_si128();                   \
-		} else {                                             \
-			ret = vreinterpretq_m128i_s16(vshlq_n_s16(   \
-				vreinterpretq_s16_m128i(a), (imm))); \
-		}                                                    \
-		ret;                                                 \
-	})
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
-// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_epi32(a, imm)                                       \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		if ((imm) <= 0) {                                    \
-			ret = a;                                     \
-		} else if ((imm) > 31) {                             \
-			ret = _mm_setzero_si128();                   \
-		} else {                                             \
-			ret = vreinterpretq_m128i_s32(vshlq_n_s32(   \
-				vreinterpretq_s32_m128i(a), (imm))); \
-		}                                                    \
-		ret;                                                 \
-	})
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-#define _mm_slli_epi64(a, imm)                                       \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		if ((imm) <= 0) {                                    \
-			ret = a;                                     \
-		} else if ((imm) > 63) {                             \
-			ret = _mm_setzero_si128();                   \
-		} else {                                             \
-			ret = vreinterpretq_m128i_s64(vshlq_n_s64(   \
-				vreinterpretq_s64_m128i(a), (imm))); \
-		}                                                    \
-		ret;                                                 \
-	})
-
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
-// while shifting in zeros.
-//
-//   r0 := srl(a0, count)
-//   r1 := srl(a1, count)
-//   ...
-//   r7 := srl(a7, count)
-//
-// https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
-#define _mm_srli_epi16(a, imm)                                       \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		if ((imm) <= 0) {                                    \
-			ret = a;                                     \
		} else if ((imm) > 15) {                             \
-			ret = _mm_setzero_si128();                   \
-		} else {                                             \
-			ret = vreinterpretq_m128i_u16(vshrq_n_u16(   \
-				vreinterpretq_u16_m128i(a), (imm))); \
-		}                                                    \
-		ret;                                                 \
-	})
-
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
-// while shifting in zeros.
-// https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm)                                       \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		if ((imm) <= 0) {                                    \
-			ret = a;                                     \
-		} else if ((imm) > 31) {                             \
-			ret = _mm_setzero_si128();                   \
-		} else {                                             \
-			ret = vreinterpretq_m128i_u32(vshrq_n_u32(   \
-				vreinterpretq_u32_m128i(a), (imm))); \
-		}                                                    \
-		ret;                                                 \
-	})
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-#define _mm_srli_epi64(a, imm)                                       \
-	__extension__({                                              \
-		__m128i ret;                                         \
-		if ((imm) <= 0) {                                    \
-			ret = a;                                     \
-		} else if ((imm) > 63) {                             \
-			ret = _mm_setzero_si128();                   \
-		} else {                                             \
-			ret = vreinterpretq_m128i_u64(vshrq_n_u64(   \
-				vreinterpretq_u64_m128i(a), (imm))); \
-		}                                                    \
-		ret;                                                 \
-	})
-
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-// https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm)                                                \
-	__extension__({                                                       \
-		__m128i ret;                                                  \
-		if ((imm) <= 0) {                                             \
-			ret = a;                                              \
-		} else if ((imm) > 31) {                                      \
-			ret = vreinterpretq_m128i_s32(                        \
-				vshrq_n_s32(vreinterpretq_s32_m128i(a), 16)); \
-			ret = vreinterpretq_m128i_s32(vshrq_n_s32(            \
-				vreinterpretq_s32_m128i(ret), 16));           \
-		} else {                                                      \
-			ret = vreinterpretq_m128i_s32(vshrq_n_s32(            \
-				vreinterpretq_s32_m128i(a), (imm)));          \
-		}                                                             \
-		ret;                                                          \
-	})
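The imm > 31 branch above shifts twice by 16 rather than once because NEON's immediate-shift range is limited, and an arithmetic right shift by 32 or more must still fill every lane with copies of the sign bit. A scalar sketch of that boundary behaviour; srai32_ref is illustrative only and assumes the usual two's-complement arithmetic right shift for signed operands.

#include <stdint.h>

/* Arithmetic right shift with SSE semantics: counts above 31 leave only
 * the sign bit, i.e. 0 for non-negative inputs and -1 for negative ones. */
static int32_t srai32_ref(int32_t x, int count)
{
	if (count > 31)
		return x < 0 ? -1 : 0;
	return x >> count; /* arithmetic shift on typical compilers */
}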
-
-// Shifts the 128-bit value in a right by imm bytes while shifting in
-// zeros. imm must be an immediate.
-//
-//   r := srl(a, imm*8)
-//
-// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm)                                      \
-	__extension__({                                             \
-		__m128i ret;                                        \
-		if ((imm) <= 0) {                                   \
-			ret = a;                                    \
-		} else if ((imm) > 15) {                            \
-			ret = _mm_setzero_si128();                  \
-		} else {                                            \
-			ret = vreinterpretq_m128i_s8(               \
-				vextq_s8(vreinterpretq_s8_m128i(a), \
-					 vdupq_n_s8(0), (imm)));    \
-		}                                                   \
-		ret;                                                \
-	})
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
-// must be an immediate.
-//
-//   r := a << (imm * 8)
-//
-// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm)                                            \
-	__extension__({                                                   \
-		__m128i ret;                                              \
-		if ((imm) <= 0) {                                         \
-			ret = a;                                          \
-		} else if ((imm) > 15) {                                  \
-			ret = _mm_setzero_si128();                        \
-		} else {                                                  \
-			ret = vreinterpretq_m128i_s8(vextq_s8(            \
-				vdupq_n_s8(0), vreinterpretq_s8_m128i(a), \
-				16 - (imm)));                             \
-		}                                                         \
-		ret;                                                      \
-	})
-
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
-//
-//   r0 := a0 << count
-//   r1 := a1 << count
-//   ...
-//   r7 := a7 << count
-//
-// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
-	uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-	if (c > 15)
-		return _mm_setzero_si128();
-
-	int16x8_t vc = vdupq_n_s16((int16_t)c);
-	return vreinterpretq_m128i_s16(
-		vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// r2 := a2 << count
-// r3 := a3 << count
-//
-// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
-	uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-	if (c > 31)
-		return _mm_setzero_si128();
-
-	int32x4_t vc = vdupq_n_s32((int32_t)c);
-	return vreinterpretq_m128i_s32(
-		vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-//
-// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
-	uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-	if (c > 63)
-		return _mm_setzero_si128();
-
-	int64x2_t vc = vdupq_n_s64((int64_t)c);
-	return vreinterpretq_m128i_s64(
-		vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// ...
-// r7 := srl(a7, count)
-//
-// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
-	uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-	if (c > 15)
-		return _mm_setzero_si128();
-
-	int16x8_t vc = vdupq_n_s16(-(int16_t)c);
-	return vreinterpretq_m128i_u16(
-		vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// r2 := srl(a2, count)
-// r3 := srl(a3, count)
-//
-// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
-	uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-	if (c > 31)
-		return _mm_setzero_si128();
-
-	int32x4_t vc = vdupq_n_s32(-(int32_t)c);
-	return vreinterpretq_m128i_u32(
-		vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-//
-// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
-	uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
-	if (c > 63)
-		return _mm_setzero_si128();
-
-	int64x2_t vc = vdupq_n_s64(-(int64_t)c);
-	return vreinterpretq_m128i_u64(
-		vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// NEON does not provide a version of this function.
-// Creates a 16-bit mask from the most significant bits of the 16 signed or
-// unsigned 8-bit integers in a and zero extends the upper bits.
-// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-#if defined(__aarch64__)
-	uint8x16_t input = vreinterpretq_u8_m128i(a);
-	const int8_t ALIGN_STRUCT(16) xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0,
-						-7, -6, -5, -4, -3, -2, -1, 0};
-	const uint8x16_t mask_and = vdupq_n_u8(0x80);
-	const int8x16_t mask_shift = vld1q_s8(xr);
-	const uint8x16_t mask_result =
-		vshlq_u8(vandq_u8(input, mask_and), mask_shift);
-	uint8x8_t lo = vget_low_u8(mask_result);
-	uint8x8_t hi = vget_high_u8(mask_result);
-
-	return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
-#else
-	// Use increasingly wide shifts+adds to collect the sign bits
-	// together.
-	// Since the widening shifts would be rather confusing to follow in little
-	// endian, everything will be illustrated in big endian order instead. This
-	// has a different result - the bits would actually be reversed on a big
-	// endian machine.
-
-	// Starting input (only half the elements are shown):
-	// 89 ff 1d c0 00 10 99 33
-	uint8x16_t input = vreinterpretq_u8_m128i(a);
-
-	// Shift out everything but the sign bits with an unsigned shift right.
-	//
-	// Bytes of the vector:
-	// 89 ff 1d c0 00 10 99 33
-	// \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
-	//  |  |  |  |  |  |  |  |
-	// 01 01 00 01 00 00 01 00
-	//
-	// Bits of first important lane(s):
-	// 10001001 (89)
-	// \______
-	//        |
-	// 00000001 (01)
-	uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
-	// Merge the even lanes together with a 16-bit unsigned shift right + add.
-	// 'xx' represents garbage data which will be ignored in the final result.
-	// In the important bytes, the add functions like a binary OR.
-	//
-	// 01 01 00 01 00 00 01 00
-	//  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
-	//    \|    \|    \|    \|
-	// xx 03 xx 01 xx 00 xx 02
-	//
-	// 00000001 00000001 (01 01)
-	//        \_______ |
-	//                \|
-	// xxxxxxxx xxxxxx11 (xx 03)
-	uint32x4_t paired16 =
-		vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
-	// Repeat with a wider 32-bit shift + add.
-	// xx 03 xx 01 xx 00 xx 02
-	//     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
-	//     14))
-	//          \|          \|
-	// xx xx xx 0d xx xx xx 02
-	//
-	// 00000011 00000001 (03 01)
-	//        \\_____ ||
-	//         '----.\||
-	// xxxxxxxx xxxx1101 (xx 0d)
-	uint64x2_t paired32 =
-		vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-	// Last, an even wider 64-bit shift + add to get our result in the low 8 bit
-	// lanes. xx xx xx 0d xx xx xx 02
-	//            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
-	//            28))
-	//                      \|
-	// xx xx xx xx xx xx xx d2
-	//
-	// 00001101 00000010 (0d 02)
-	//     \   \___ |  |
-	//      '---.  \|  |
-	// xxxxxxxx 11010010 (xx d2)
-	uint8x16_t paired64 =
-		vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
-	// Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
-	// xx xx xx xx xx xx xx d2
-	//                      ||  return paired64[0]
-	//                      d2
-	// Note: Little endian would return the correct value 4b (01001011) instead.
-	return vgetq_lane_u8(paired64, 0) |
-	       ((int)vgetq_lane_u8(paired64, 8) << 8);
-#endif
-}
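What the widening shift-and-add sequence above computes is simply the 16 sign bits packed into one integer. A scalar sketch of the end result; movemask_epi8_ref is illustrative only.

#include <stdint.h>

/* Bit i of the result is the sign (top) bit of byte i. */
static int movemask_epi8_ref(const uint8_t bytes[16])
{
	int mask = 0;
	for (int i = 0; i < 16; i++)
		mask |= (bytes[i] >> 7) << i;
	return mask;
}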
-
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four
-// single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
-	uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
-	static const int32x4_t shift = {-31, -30, -29, -28};
-	static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
-					   0x80000000};
-	return vaddvq_u32(vshlq_u32(vandq_u32(input, highbit), shift));
-#else
-	// Uses the exact same method as _mm_movemask_epi8, see that for details.
-	// Shift out everything but the sign bits with a 32-bit unsigned shift
-	// right.
-	uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
-	// Merge the two pairs together with a 64-bit unsigned shift right + add.
-	uint8x16_t paired =
-		vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
-	// Extract the result.
-	return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros&expand=5871
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
-	int64x2_t a_and_mask = vandq_s64(vreinterpretq_s64_m128i(a),
-					 vreinterpretq_s64_m128i(mask));
-	return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1))
-		       ? 0
-		       : 1;
-}
-
-/* Math operations */
-
-// Subtracts the four single-precision, floating-point values of a and b.
-//
-//   r0 := a0 - b0
-//   r1 := a1 - b1
-//   r2 := a2 - b2
-//   r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_f32(vsubq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
-// and store the results in dst.
-//    r0 := a0 - b0
-//    r1 := a1 - b1
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s64(vsubq_s64(vreinterpretq_s64_m128i(a),
-						 vreinterpretq_s64_m128i(b)));
-}
-
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
-// unsigned 32-bit integers of a.
-//
-//   r0 := a0 - b0
-//   r1 := a1 - b1
-//   r2 := a2 - b2
-//   r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vsubq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vsubq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s8(
-		vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
-// integers of a and saturates.
-// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u16(vqsubq_u16(vreinterpretq_u16_m128i(a),
-						  vreinterpretq_u16_m128i(b)));
-}
-
-// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
-// integers of a and saturates.
-//
-//   r0 := UnsignedSaturate(a0 - b0)
-//   r1 := UnsignedSaturate(a1 - b1)
-//   ...
-//   r15 := UnsignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(vqsubq_u8(vreinterpretq_u8_m128i(a),
-						vreinterpretq_u8_m128i(b)));
-}
-
-// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
-// of a and saturates.
-//
-//   r0 := SignedSaturate(a0 - b0)
-//   r1 := SignedSaturate(a1 - b1)
-//   ...
-//   r15 := SignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s8(vqsubq_s8(vreinterpretq_s8_m128i(a),
-						vreinterpretq_s8_m128i(b)));
-}
-
-// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
-// of a and saturates.
-//
-//   r0 := SignedSaturate(a0 - b0)
-//   r1 := SignedSaturate(a1 - b1)
-//   ...
-//   r7 := SignedSaturate(a7 - b7)
-//
-// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vqsubq_s16(vreinterpretq_s16_m128i(a),
-						  vreinterpretq_s16_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u16(vqaddq_u16(vreinterpretq_u16_m128i(a),
-						  vreinterpretq_u16_m128i(b)));
-}
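The vqsubq_*/vqaddq_* calls in this block all share the same saturating semantics: the exact result is clamped to the element's range instead of wrapping. Two scalar sketches of that clamping follow; the *_ref helpers are illustrative only.

#include <stdint.h>

/* Unsigned saturating add: clamp to [0, 0xFFFF]. */
static uint16_t adds_epu16_ref(uint16_t a, uint16_t b)
{
	uint32_t sum = (uint32_t)a + b;
	return sum > 0xFFFF ? 0xFFFF : (uint16_t)sum;
}

/* Signed saturating subtract: clamp to [INT16_MIN, INT16_MAX]. */
static int16_t subs_epi16_ref(int16_t a, int16_t b)
{
	int32_t diff = (int32_t)a - (int32_t)b;
	if (diff > INT16_MAX)
		return INT16_MAX;
	if (diff < INT16_MIN)
		return INT16_MIN;
	return (int16_t)diff;
}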
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-//   for i in 0..15
-//     if b[i] < 0
-//       r[i] := -a[i]
-//     else if b[i] == 0
-//       r[i] := 0
-//     else
-//       r[i] := a[i]
-//     fi
-//   done
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
-	int8x16_t a = vreinterpretq_s8_m128i(_a);
-	int8x16_t b = vreinterpretq_s8_m128i(_b);
-
-	int8x16_t zero = vdupq_n_s8(0);
-	// signed shift right: faster than vclt
-	// (b < 0) ? 0xFF : 0
-	uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-	// (b == 0) ? 0xFF : 0
-	int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, zero));
-	// -a
-	int8x16_t neg = vnegq_s8(a);
-	// bitwise select either a or neg based on ltMask
-	int8x16_t masked = vbslq_s8(ltMask, a, neg);
-	// res = masked & (~zeroMask)
-	int8x16_t res = vbicq_s8(masked, zeroMask);
-	return vreinterpretq_m128i_s8(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-//   for i in 0..7
-//     if b[i] < 0
-//       r[i] := -a[i]
-//     else if b[i] == 0
-//       r[i] := 0
-//     else
-//       r[i] := a[i]
-//     fi
-//   done
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
-	int16x8_t a = vreinterpretq_s16_m128i(_a);
-	int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-	int16x8_t zero = vdupq_n_s16(0);
-	// signed shift right: faster than vclt
-	// (b < 0) ? 0xFFFF : 0
-	uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
-	// (b == 0) ? 0xFFFF : 0
-	int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, zero));
-	// -a
-	int16x8_t neg = vnegq_s16(a);
-	// bitwise select either a or neg based on ltMask
-	int16x8_t masked = vbslq_s16(ltMask, a, neg);
-	// res = masked & (~zeroMask)
-	int16x8_t res = vbicq_s16(masked, zeroMask);
-	return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Element in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-//   for i in 0..3
-//     if b[i] < 0
-//       r[i] := -a[i]
-//     else if b[i] == 0
-//       r[i] := 0
-//     else
-//       r[i] := a[i]
-//     fi
-//   done
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
-	int32x4_t a = vreinterpretq_s32_m128i(_a);
-	int32x4_t b = vreinterpretq_s32_m128i(_b);
-
-	int32x4_t zero = vdupq_n_s32(0);
-	// signed shift right: faster than vclt
-	// (b < 0) ? 0xFFFFFFFF : 0
-	uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-	// (b == 0) ? 0xFFFFFFFF : 0
-	int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, zero));
-	// neg = -a
-	int32x4_t neg = vnegq_s32(a);
-	// bitwise select either a or neg based on ltMask
-	int32x4_t masked = vbslq_s32(ltMask, a, neg);
-	// res = masked & (~zeroMask)
-	int32x4_t res = vbicq_s32(masked, zeroMask);
-	return vreinterpretq_m128i_s32(res);
-}
-
-// Computes the average of the 16 unsigned 8-bit integers in a and the 16
-// unsigned 8-bit integers in b and rounds.
-//
-//   r0 := (a0 + b0) / 2
-//   r1 := (a1 + b1) / 2
-//   ...
-//   r15 := (a15 + b15) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(vrhaddq_u8(vreinterpretq_u8_m128i(a),
-						 vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the average of the 8 unsigned 16-bit integers in a and the 8
-// unsigned 16-bit integers in b and rounds.
-//
-//   r0 := (a0 + b0) / 2
-//   r1 := (a1 + b1) / 2
-//   ...
-//   r7 := (a7 + b7) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
-	return (__m128i)vrhaddq_u16(vreinterpretq_u16_m128i(a),
-				    vreinterpretq_u16_m128i(b));
-}
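Both averages use NEON's rounding halving add, which matches the SSE behaviour: the +1 term rounds halves upward. A one-line scalar sketch; avg_epu8_ref is illustrative only.

#include <stdint.h>

/* Rounding average: (a + b + 1) >> 1, computed in a wider type to avoid
 * overflow. */
static uint8_t avg_epu8_ref(uint8_t a, uint8_t b)
{
	return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}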
-
-// Adds the four single-precision, floating-point values of a and b.
-//
-//   r0 := a0 + b0
-//   r1 := a1 + b1
-//   r2 := a2 + b2
-//   r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// adds the scalar single-precision floating point values of a and b.
-// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
-	float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-	float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-	// the upper values in the result must be the remnants of <a>.
-	return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
-// unsigned 64-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s64(vaddq_s64(vreinterpretq_s64_m128i(a),
-						 vreinterpretq_s64_m128i(b)));
-}
-
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
-// unsigned 32-bit integers in b.
-//
-//   r0 := a0 + b0
-//   r1 := a1 + b1
-//   r2 := a2 + b2
-//   r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
-// unsigned 16-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
-// unsigned 8-bit integers in b.
-// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s8(
-		vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
-// and saturates.
-//
-//   r0 := SignedSaturate(a0 + b0)
-//   r1 := SignedSaturate(a1 + b1)
-//   ...
-//   r7 := SignedSaturate(a7 + b7)
-//
-// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vqaddq_s16(vreinterpretq_s16_m128i(a),
-						  vreinterpretq_s16_m128i(b)));
-}
-
-// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
-// b and saturates.
-// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(vqaddq_u8(vreinterpretq_u8_m128i(a),
-						vreinterpretq_u8_m128i(b)));
-}
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
-// unsigned 16-bit integers from b.
-//
-//   r0 := (a0 * b0)[15:0]
-//   r1 := (a1 * b1)[15:0]
-//   ...
-//   r7 := (a7 * b7)[15:0]
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vmulq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
-// unsigned 32-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vmulq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Multiplies the four single-precision, floating-point values of a and b.
-//
-//   r0 := a0 * b0
-//   r1 := a1 * b1
-//   r2 := a2 * b2
-//   r3 := a3 * b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-//
-//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
-//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
-	// vmull_u32 upcasts instead of masking, so we downcast.
-	uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
-	uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
-	return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-//
-//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
-//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
-	// vmull_s32 upcasts instead of masking, so we downcast.
-	int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
-	int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
-	return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
-//
-//   r0 := (a0 * b0) + (a1 * b1)
-//   r1 := (a2 * b2) + (a3 * b3)
-//   r2 := (a4 * b4) + (a5 * b5)
-//   r3 := (a6 * b6) + (a7 * b7)
-// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
-	int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-				  vget_low_s16(vreinterpretq_s16_m128i(b)));
-	int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-				   vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-	int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
-	int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
-	return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-}
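The two vmull/vpadd stages above implement the usual multiply-and-horizontally-add pattern; as a scalar sketch (madd_epi16_ref is illustrative only):

#include <stdint.h>

/* Each 32-bit output is the sum of one adjacent pair of 16-bit products. */
static void madd_epi16_ref(const int16_t a[8], const int16_t b[8], int32_t r[4])
{
	for (int i = 0; i < 4; i++)
		r[i] = (int32_t)a[2 * i] * b[2 * i] +
		       (int32_t)a[2 * i + 1] * b[2 * i + 1];
}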
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-//
-//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
-//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
-//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
-//   ...
-//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
-	// Has issues due to saturation
-	// return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
-	// Multiply
-	int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
-				     vget_low_s16(vreinterpretq_s16_m128i(b)));
-	int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
-				     vget_high_s16(vreinterpretq_s16_m128i(b)));
-
-	// Rounding narrowing shift right
-	// narrow = (int16_t)((mul + 16384) >> 15);
-	int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
-	int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
-	// Join together
-	return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
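The comment above already gives the rounding formula; spelled out as a scalar sketch (mulhrs_epi16_ref is illustrative only), each result is the widened product plus 0x4000, shifted right by 15 and truncated to 16 bits.

#include <stdint.h>

/* Rounding high multiply: ((a * b) + (1 << 14)) >> 15, truncated to 16 bits. */
static int16_t mulhrs_epi16_ref(int16_t a, int16_t b)
{
	return (int16_t)(((int32_t)a * (int32_t)b + 0x4000) >> 15);
}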
-
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-//
-//   FOR j := 0 to 7
-//      i := j*16
-//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
-//      a[i+7:i]*b[i+7:i] )
-//   ENDFOR
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-	// This would be much simpler if x86 would choose to zero extend OR sign
-	// extend, not both. This could probably be optimized better.
-	uint16x8_t a = vreinterpretq_u16_m128i(_a);
-	int16x8_t b = vreinterpretq_s16_m128i(_b);
-
-	// Zero extend a
-	int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
-	int16x8_t a_even =
-		vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
-	// Sign extend by shifting left then shifting right.
-	int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
-	int16x8_t b_odd = vshrq_n_s16(b, 8);
-
-	// multiply
-	int16x8_t prod1 = vmulq_s16(a_even, b_even);
-	int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
-	// saturated add
-	return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-}
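A scalar sketch of the mixed-sign multiply-add being emulated: bytes from a are treated as unsigned, bytes from b as signed, and each adjacent pair of 16-bit products is summed with signed saturation. The saturate16 and maddubs_epi16_ref helpers are illustrative only.

#include <stdint.h>

static int16_t saturate16(int32_t v)
{
	if (v > INT16_MAX)
		return INT16_MAX;
	if (v < INT16_MIN)
		return INT16_MIN;
	return (int16_t)v;
}

/* Unsigned-by-signed byte products, summed pairwise with saturation. */
static void maddubs_epi16_ref(const uint8_t a[16], const int8_t b[16],
                              int16_t r[8])
{
	for (int j = 0; j < 8; j++)
		r[j] = saturate16((int32_t)a[2 * j] * b[2 * j] +
				  (int32_t)a[2 * j + 1] * b[2 * j + 1]);
}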
-
-// Computes the absolute difference of the 16 unsigned 8-bit integers from a
-// and the 16 unsigned 8-bit integers from b.
-//
-// Return Value
-// Sums the upper 8 differences and lower 8 differences and packs the
-// resulting 2 unsigned 16-bit integers into the upper and lower 64-bit
-// elements.
-//
-//   r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7)
-//   r1 := 0x0
-//   r2 := 0x0
-//   r3 := 0x0
-//   r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15)
-//   r5 := 0x0
-//   r6 := 0x0
-//   r7 := 0x0
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
-	uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t)a, (uint8x16_t)b));
-	uint16_t r0 = t[0] + t[1] + t[2] + t[3];
-	uint16_t r4 = t[4] + t[5] + t[6] + t[7];
-	uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
-	return (__m128i)vsetq_lane_u16(r4, r, 4);
-}
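The vpaddlq_u8(vabdq_u8(...)) pair above produces the per-half sums of absolute differences; a scalar sketch of the same reduction follows (sad_epu8_ref is illustrative only).

#include <stdint.h>

/* One 16-bit sum of absolute differences per 8-byte half, stored in the
 * low 16 bits of the corresponding 64-bit result lane. */
static void sad_epu8_ref(const uint8_t a[16], const uint8_t b[16],
                         uint16_t sums[2])
{
	sums[0] = sums[1] = 0;
	for (int i = 0; i < 16; i++) {
		int d = (int)a[i] - (int)b[i];
		sums[i / 8] += (uint16_t)(d < 0 ? -d : d);
	}
}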
-
-// Divides the four single-precision, floating-point values of a and b.
-//
-//   r0 := a0 / b0
-//   r1 := a1 / b1
-//   r2 := a2 / b2
-//   r3 := a3 / b3
-//
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-	float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
-	float32x4_t recip1 = vmulq_f32(
-		recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
-	return vreinterpretq_m128_f32(
-		vmulq_f32(vreinterpretq_f32_m128(a), recip1));
-}
-
-// Divides the scalar single-precision floating point value of a by b.
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
-	float32_t value =
-		vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-	return vreinterpretq_m128_f32(
-		vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the approximations of reciprocals of the four single-precision,
-// floating-point values of a.
-// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-	float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-	recip = vmulq_f32(recip,
-			  vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-	return vreinterpretq_m128_f32(recip);
-}
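vrecpeq_f32 only gives a rough estimate of the reciprocal, so both _mm_div_ps and _mm_rcp_ps above refine it once with vrecpsq_f32, which computes 2 - a*b. That is one Newton-Raphson step; a scalar sketch (recip_refine is illustrative only):

/* One Newton-Raphson step for 1/b: x1 = x0 * (2 - b * x0).  Each step
 * roughly doubles the number of correct bits in the estimate x0. */
static float recip_refine(float b, float x0)
{
	return x0 * (2.0f - b * x0);
}

A second refinement step would bring the estimate close to full single-precision accuracy at the cost of another multiply pair.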
-
-// Computes the approximations of square roots of the four single-precision,
-// floating-point values of a. First computes reciprocal square roots and then
-// reciprocals of the four values.
-//
-//   r0 := sqrt(a0)
-//   r1 := sqrt(a1)
-//   r2 := sqrt(a2)
-//   r3 := sqrt(a3)
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
-	float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-	float32x4_t sq = vrecpeq_f32(recipsq);
-	// TODO: use the step versions of both rsqrt and recip for better accuracy.
-	return vreinterpretq_m128_f32(sq);
-#endif
-}
-
-// Computes the approximation of the square root of the scalar single-precision
-// floating point value of in.
-// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
-	float32_t value =
-		vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-	return vreinterpretq_m128_f32(
-		vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Computes the approximations of the reciprocal square roots of the four
-// single-precision floating point values of in.
-// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
-	return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
-	return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Computes the maximums of the four single-precision, floating-point values of
-// a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_f32(vmaxq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Computes the minima of the four single-precision, floating-point values of a
-// and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_f32(vminq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Computes the maximum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
-	float32_t value = vgetq_lane_f32(vmaxq_f32(vreinterpretq_f32_m128(a),
-						   vreinterpretq_f32_m128(b)),
-					 0);
-	return vreinterpretq_m128_f32(
-		vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the minimum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
-	float32_t value = vgetq_lane_f32(vminq_f32(vreinterpretq_f32_m128(a),
-						   vreinterpretq_f32_m128(b)),
-					 0);
-	return vreinterpretq_m128_f32(
-		vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vminq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(vmaxq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// epi versions of min/max
-// Computes the pairwise maximums of the four signed 32-bit integer values of a
-// and b.
-//
-// A 128-bit parameter that can be defined with the following equations:
-//   r0 := (a0 > b0) ? a0 : b0
-//   r1 := (a1 > b1) ? a1 : b1
-//   r2 := (a2 > b2) ? a2 : b2
-//   r3 := (a3 > b3) ? a3 : b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vmaxq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the pairwise minima of the four signed 32-bit integer values of a
-// and b.
-//
-// A 128-bit parameter that can be defined with the following equations:
-//   r0 := (a0 < b0) ? a0 : b0
-//   r1 := (a1 < b1) ? a1 : b1
-//   r2 := (a2 < b2) ? a2 : b2
-//   r3 := (a3 < b3) ? a3 : b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s32(vminq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
-//
-//   r0 := (a0 * b0)[31:16]
-//   r1 := (a1 * b1)[31:16]
-//   ...
-//   r7 := (a7 * b7)[31:16]
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
-	/* FIXME: issue with large values because of result saturation */
-	// int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
-	// vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
-	// vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-	int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-	int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-	int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-	int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-	uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
-				   vreinterpretq_u16_s32(ab7654));
-	return vreinterpretq_m128i_u16(r.val[1]);
-}
-
-// Computes the pairwise addition of the single-precision, floating-point
-// values in a and b.
-// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a),
-						 vreinterpretq_f32_m128(b)));
-#else
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(
-		vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
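
/* Editor's note: scalar sketch, not part of the original sse2neon.h, of the
 * lane layout _mm_hadd_ps produces: {a0+a1, a2+a3, b0+b1, b2+b3}. */
static inline void hadd_ps_scalar(const float a[4], const float b[4],
				  float r[4])
{
	r[0] = a[0] + a[1];
	r[1] = a[2] + a[3];
	r[2] = b[0] + b[1];
	r[3] = b[2] + b[3];
}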
-
-// Computes the pairwise addition of the 16-bit signed or unsigned integer
-// values in a and b.
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
-	int16x8_t a = vreinterpretq_s16_m128i(_a);
-	int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
-	return vreinterpretq_m128i_s16(
-		vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
-			     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Computes the pairwise difference of the 16-bit signed or unsigned integer
-// values in a and b.
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
-	int32x4_t a = vreinterpretq_s32_m128i(_a);
-	int32x4_t b = vreinterpretq_s32_m128i(_b);
-	// Interleave using vshrn/vmovn
-	// [a0|a2|a4|a6|b0|b2|b4|b6]
-	// [a1|a3|a5|a7|b1|b3|b5|b7]
-	int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-	int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-	// Subtract
-	return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
-}
-
-// Computes the saturated pairwise addition of the 16-bit signed integer
-// values in a and b.
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-	int32x4_t a = vreinterpretq_s32_m128i(_a);
-	int32x4_t b = vreinterpretq_s32_m128i(_b);
-	// Interleave using vshrn/vmovn
-	// [a0|a2|a4|a6|b0|b2|b4|b6]
-	// [a1|a3|a5|a7|b1|b3|b5|b7]
-	int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-	int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-	// Saturated add
-	return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-}
-
-// Computes the saturated pairwise difference of the 16-bit signed integer
-// values in a and b.
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-	int32x4_t a = vreinterpretq_s32_m128i(_a);
-	int32x4_t b = vreinterpretq_s32_m128i(_b);
-	// Interleave using vshrn/vmovn
-	// [a0|a2|a4|a6|b0|b2|b4|b6]
-	// [a1|a3|a5|a7|b1|b3|b5|b7]
-	int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
-	int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
-	// Saturated subtract
-	return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
-}
-
-// Computes the pairwise addition of the 32-bit signed or unsigned integer
-// values in a and b.
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
-	int32x4_t a = vreinterpretq_s32_m128i(_a);
-	int32x4_t b = vreinterpretq_s32_m128i(_b);
-	return vreinterpretq_m128i_s32(
-		vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
-			     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-}
-
-// Computes the pairwise difference of the 32-bit signed or unsigned integer
-// values in a and b.
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
-	int64x2_t a = vreinterpretq_s64_m128i(_a);
-	int64x2_t b = vreinterpretq_s64_m128i(_b);
-	// Interleave using vshrn/vmovn
-	// [a0|a2|b0|b2]
-	// [a1|a3|b1|b3]
-	int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
-	int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
-	// Subtract
-	return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
-}
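
/* Editor's note: scalar sketch, not part of the original sse2neon.h, of the
 * result _mm_hsub_epi32 above produces: {a0-a1, a2-a3, b0-b1, b2-b3}. The
 * other horizontal add/subtract helpers pair lanes the same way. */
static inline void hsub_epi32_scalar(const int32_t a[4], const int32_t b[4],
				     int32_t r[4])
{
	r[0] = a[0] - a[1];
	r[1] = a[2] - a[3];
	r[2] = b[0] - b[1];
	r[3] = b[2] - b[3];
}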
-
-/* Compare operations */
-
-// Compares for less than
-// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_u32(vcltq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than.
-//
-//   r0 := (a0 > b0) ? 0xffffffff : 0x0
-//   r1 := (a1 > b1) ? 0xffffffff : 0x0
-//   r2 := (a2 > b2) ? 0xffffffff : 0x0
-//   r3 := (a3 > b3) ? 0xffffffff : 0x0
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than or equal.
-// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than or equal.
-//
-//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
-//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
-//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
-//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_u32(vcleq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Compares for equality.
-// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
-	return vreinterpretq_m128_u32(vceqq_f32(vreinterpretq_f32_m128(a),
-						vreinterpretq_f32_m128(b)));
-}
-
-// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
-// unsigned 8-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
-// unsigned 16-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u16(vceqq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u32(vceqq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_u64(vceqq_u64(vreinterpretq_u64_m128i(a),
-						 vreinterpretq_u64_m128i(b)));
-#else
-	// ARMv7 lacks vceqq_u64
-	// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
-	uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a),
-				   vreinterpretq_u32_m128i(b));
-	uint32x4_t swapped = vrev64q_u32(cmp);
-	return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for greater than.
-//
-//   r0 := (a0 > b0) ? 0xff : 0x0
-//   r1 := (a1 > b1) ? 0xff : 0x0
-//   ...
-//   r15 := (a15 > b15) ? 0xff : 0x0
-//
-// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for less than.
-//
-//   r0 := (a0 < b0) ? 0xffff : 0x0
-//   r1 := (a1 < b1) ? 0xffff : 0x0
-//   ...
-//   r7 := (a7 < b7) ? 0xffff : 0x0
-//
-// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u16(vcltq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for greater than.
-//
-//   r0 := (a0 > b0) ? 0xffff : 0x0
-//   r1 := (a1 > b1) ? 0xffff : 0x0
-//   ...
-//   r7 := (a7 > b7) ? 0xffff : 0x0
-//
-// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u16(vcgtq_s16(vreinterpretq_s16_m128i(a),
-						 vreinterpretq_s16_m128i(b)));
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u32(vcltq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for greater than.
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a),
-						 vreinterpretq_s32_m128i(b)));
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_u64(vcgtq_s64(vreinterpretq_s64_m128i(a),
-						 vreinterpretq_s64_m128i(b)));
-#else
-	// ARMv7 lacks vcgtq_s64.
-	// This is based off of Clang's SSE2 polyfill:
-	// (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
-
-	// Mask the sign bit out since we need a signed AND an unsigned comparison
-	// and it is ugly to try and split them.
-	int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
-	int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
-	int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
-	// Check if a > b
-	int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
-	// Copy upper mask to lower mask
-	// a_hi > b_hi
-	int64x2_t gt_hi = vshrq_n_s64(greater, 63);
-	// Copy lower mask to upper mask
-	// a_lo > b_lo
-	int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
-	// Compare for equality
-	int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
-	// Copy upper mask to lower mask
-	// a_hi == b_hi
-	int64x2_t eq_hi = vshrq_n_s64(equal, 63);
-	// a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
-	int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
-	return vreinterpretq_m128i_s64(ret);
-#endif
-}
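
/* Editor's note: scalar sketch, not part of the original sse2neon.h, of the
 * decomposition used by the ARMv7 path above: compare the high 32 bits as
 * signed and the low 32 bits as unsigned. */
static inline int64_t cmpgt_s64_scalar(int64_t a, int64_t b)
{
	int32_t a_hi = (int32_t)(a >> 32), b_hi = (int32_t)(b >> 32);
	uint32_t a_lo = (uint32_t)a, b_lo = (uint32_t)b;
	int gt = (a_hi > b_hi) || (a_hi == b_hi && a_lo > b_lo);
	return gt ? -1 : 0; /* all-ones mask when a > b, else zero */
}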
-
-// Compares the four 32-bit floats in a and b to check if any values are NaN.
-// Ordered compare between each value returns true for "orderable" and false for
-// "not orderable" (NaN).
-// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
-// also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
-	// Note: NEON does not have ordered compare builtin
-	// Need to compare a eq a and b eq b to check for NaN
-	// Do AND of results to get final
-	uint32x4_t ceqaa =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t ceqbb =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compares the lower single-precision floating-point scalar values of a and b
-// using a less-than operation.
-// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
-// Important note: the MSDN documentation is incorrect. If either value is a
-// NaN, the docs say the result is one, but this actually returns zero.
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
-	uint32x4_t a_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_lt_b =
-		vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1
-									    : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater-than operation.
-// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
-	// return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
-	// vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_gt_b =
-		vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1
-									    : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less-than-or-equal operation.
-// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
-	// return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
-	// vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_le_b =
-		vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1
-									    : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater-than-or-equal operation.
-// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
-	// return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
-	// vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_ge_b =
-		vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1
-									    : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using an equality operation.
-// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
-	// return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
-	// vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_eq_b =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1
-									    : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using an inequality operation.
-// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
-	// return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
-	// vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan =
-		vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
-	uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(vreinterpretq_f32_m128(a),
-						 vreinterpretq_f32_m128(b)));
-	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
-}
-
-// According to the documentation, these intrinsics behave the same as the
-// non-'u' versions.  We'll just alias them here.
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
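
/* Editor's note: scalar sketch, not part of the original sse2neon.h, of the
 * NaN handling in the ordered _mm_comi compares above: they return 1 only
 * when neither operand is NaN and the relation holds. */
static inline int comilt_ss_scalar(float a, float b)
{
	int a_not_nan = (a == a); /* NaN compares unequal to itself */
	int b_not_nan = (b == b);
	return (a_not_nan && b_not_nan && a < b) ? 1 : 0;
}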
-
-/* Conversions */
-
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values using truncate.
-// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
-	return vreinterpretq_m128i_s32(
-		vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Converts the four signed 32-bit integer values of a to single-precision,
-// floating-point values
-// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
-	return vreinterpretq_m128_f32(
-		vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
-// unsigned 16-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
-	uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
-	uint16x8_t u16x8 =
-		vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-	return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
-// unsigned 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
-	uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
-	uint16x8_t u16x8 =
-		vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-	uint32x4_t u32x4 =
-		vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-	return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Converts the two unsigned 8-bit integers in the lower 16 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
-	uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
-	uint16x8_t u16x8 =
-		vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
-	uint32x4_t u32x4 =
-		vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-	uint64x2_t u64x2 =
-		vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-	return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Converts the eight signed 8-bit integers in the lower 64 bits to eight
-// signed 16-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
-	int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
-	int16x8_t s16x8 =
-		vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-	return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Converts the four signed 8-bit integers in the lower 32 bits to four
-// signed 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
-	int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
-	int16x8_t s16x8 =
-		vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
-	int32x4_t s32x4 =
-		vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
-	return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Converts the two signed 8-bit integers in the lower 16 bits to two
-// signed 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
-	int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
-	int16x8_t s16x8 =
-		vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
-	int32x4_t s32x4 =
-		vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-	int64x2_t s64x2 =
-		vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-	return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Converts the four signed 16-bit integers in the lower 64 bits to four signed
-// 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
-	return vreinterpretq_m128i_s32(
-		vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Converts the two signed 16-bit integers in the lower 32 bits to two
-// signed 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
-	int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
-	int32x4_t s32x4 =
-		vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
-	int64x2_t s64x2 =
-		vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
-	return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Converts the four unsigned 16-bit integers in the lower 64 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
-	return vreinterpretq_m128i_u32(
-		vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Converts the two unsigned 16-bit integers in the lower 32 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
-	uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
-	uint32x4_t u32x4 =
-		vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
-	uint64x2_t u64x2 =
-		vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
-	return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Converts the two unsigned 32-bit integers in the lower 64 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
-	return vreinterpretq_m128i_u64(
-		vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Converts the two signed 32-bit integers in the lower 64 bits to two signed
-// 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
-	return vreinterpretq_m128i_s64(
-		vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values.
-//
-//   r0 := (int) a0
-//   r1 := (int) a1
-//   r2 := (int) a2
-//   r3 := (int) a3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-#else
-	uint32x4_t signmask = vdupq_n_u32(0x80000000);
-	float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
-				     vdupq_n_f32(0.5f)); /* +/- 0.5 */
-	int32x4_t r_normal =
-		vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a),
-					half)); /* round to integer: [a + 0.5]*/
-	int32x4_t r_trunc = vcvtq_s32_f32(
-		vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-	int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
-		vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-	int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
-				     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-	float32x4_t delta = vsubq_f32(
-		vreinterpretq_f32_m128(a),
-		vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
-	uint32x4_t is_delta_half =
-		vceqq_f32(delta, half); /* delta == +/- 0.5 */
-	return vreinterpretq_m128i_s32(
-		vbslq_s32(is_delta_half, r_even, r_normal));
-#endif
-}
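
/* Editor's note: scalar sketch, not part of the original sse2neon.h, of the
 * round-half-to-even behaviour selected above (e.g. 0.5 -> 0, 1.5 -> 2,
 * 2.5 -> 2, -1.5 -> -2). Assumes truncf() from <math.h> is available. */
static inline int32_t round_half_to_even_scalar(float x)
{
	float t = truncf(x); /* [x] */
	float delta = x - t; /* x - [x] */
	if (delta == 0.5f || delta == -0.5f) {
		/* halfway case: pick the even neighbour */
		int32_t r = (int32_t)t;
		return (r % 2 == 0) ? r : r + (x > 0.0f ? 1 : -1);
	}
	/* not halfway: ordinary round-to-nearest */
	return (int32_t)(x + (x >= 0.0f ? 0.5f : -0.5f));
}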
-
-// Moves the least significant 32 bits of a to a 32-bit integer.
-// https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
-	return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Extracts the low order 64-bit integer from the parameter.
-// https://msdn.microsoft.com/en-us/library/bb531384(v=vs.120).aspx
-FORCE_INLINE uint64_t _mm_cvtsi128_si64(__m128i a)
-{
-	return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
-// zero extending the upper bits.
-//
-//   r0 := a
-//   r1 := 0x0
-//   r2 := 0x0
-//   r3 := 0x0
-//
-// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
-	return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
-// zero extending the upper bits.
-//
-//   r0 := a
-//   r1 := 0x0
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
-	return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Applies a type cast to reinterpret four 32-bit floating point values passed
-// in as a 128-bit parameter as packed 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
-	return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Applies a type cast to reinterpret four 32-bit integers passed in as a
-// 128-bit parameter as packed 32-bit floating point values.
-// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
-	return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Loads a 128-bit value.
-// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
-	return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
-}
-
-// Loads a 128-bit value (no alignment requirement).
-// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
-	return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
-}
-
-// _mm_lddqu_si128 functions the same as _mm_loadu_si128.
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-/* Miscellaneous Operations */
-
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-//   r0 := a0 >> count
-//   r1 := a1 >> count
-//   ...
-//   r7 := a7 >> count
-//
-// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
-	int64_t c = (int64_t)vget_low_s64((int64x2_t)count);
-	if (c > 15)
-		return _mm_cmplt_epi16(a, _mm_setzero_si128());
-	return vreinterpretq_m128i_s16(
-		vshlq_s16((int16x8_t)a, vdupq_n_s16(-c)));
-}
-
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-//   r0 := a0 >> count
-//   r1 := a1 >> count
-//   r2 := a2 >> count
-//   r3 := a3 >> count
-//
-// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
-	int64_t c = (int64_t)vget_low_s64((int64x2_t)count);
-	if (c > 31)
-		return _mm_cmplt_epi32(a, _mm_setzero_si128());
-	return vreinterpretq_m128i_s32(
-		vshlq_s32((int32x4_t)a, vdupq_n_s32(-c)));
-}
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
-// saturates.
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s8(
-		vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
-			    vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
-// integers and saturates.
-//
-//   r0 := UnsignedSaturate(a0)
-//   r1 := UnsignedSaturate(a1)
-//   ...
-//   r7 := UnsignedSaturate(a7)
-//   r8 := UnsignedSaturate(b0)
-//   r9 := UnsignedSaturate(b1)
-//   ...
-//   r15 := UnsignedSaturate(b7)
-//
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
-			    vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
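
/* Editor's note: scalar sketch, not part of the original sse2neon.h, of the
 * unsigned saturation each lane of _mm_packus_epi16 applies. */
static inline uint8_t unsigned_saturate_s16_scalar(int16_t x)
{
	if (x < 0)
		return 0;   /* negative inputs clamp to 0 */
	if (x > 255)
		return 255; /* large inputs clamp to UINT8_MAX */
	return (uint8_t)x;
}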
-
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
-// and saturates.
-//
-//   r0 := SignedSaturate(a0)
-//   r1 := SignedSaturate(a1)
-//   r2 := SignedSaturate(a2)
-//   r3 := SignedSaturate(a3)
-//   r4 := SignedSaturate(b0)
-//   r5 := SignedSaturate(b1)
-//   r6 := SignedSaturate(b2)
-//   r7 := SignedSaturate(b3)
-//
-// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_s16(
-		vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
-			     vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
-// integers and saturates.
-//
-//   r0 := UnsignedSaturate(a0)
-//   r1 := UnsignedSaturate(a1)
-//   r2 := UnsignedSaturate(a2)
-//   r3 := UnsignedSaturate(a3)
-//   r4 := UnsignedSaturate(b0)
-//   r5 := UnsignedSaturate(b1)
-//   r6 := UnsignedSaturate(b2)
-//   r7 := UnsignedSaturate(b3)
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u16(
-		vcombine_u16(vqmovn_u32(vreinterpretq_u32_m128i(a)),
-			     vqmovn_u32(vreinterpretq_u32_m128i(b))));
-}
-
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
-// 8 signed or unsigned 8-bit integers in b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//   ...
-//   r14 := a7
-//   r15 := b7
-//
-// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s8(vzip1q_s8(vreinterpretq_s8_m128i(a),
-						vreinterpretq_s8_m128i(b)));
-#else
-	int8x8_t a1 =
-		vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-	int8x8_t b1 =
-		vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-	int8x8x2_t result = vzip_s8(a1, b1);
-	return vreinterpretq_m128i_s8(
-		vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
-// lower 4 signed or unsigned 16-bit integers in b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//   r4 := a2
-//   r5 := b2
-//   r6 := a3
-//   r7 := b3
-//
-// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s16(vzip1q_s16(vreinterpretq_s16_m128i(a),
-						  vreinterpretq_s16_m128i(b)));
-#else
-	int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-	int16x4x2_t result = vzip_s16(a1, b1);
-	return vreinterpretq_m128i_s16(
-		vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
-// lower 2 signed or unsigned 32-bit integers in b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s32(vzip1q_s32(vreinterpretq_s32_m128i(a),
-						  vreinterpretq_s32_m128i(b)));
-#else
-	int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-	int32x2x2_t result = vzip_s32(a1, b1);
-	return vreinterpretq_m128i_s32(
-		vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
-	int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
-	int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
-	return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-}
-
-// Selects and interleaves the lower two single-precision, floating-point values
-// from a and b.
-//
-//   r0 := a0
-//   r1 := b0
-//   r2 := a1
-//   r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128_f32(vzip1q_f32(vreinterpretq_f32_m128(a),
-						 vreinterpretq_f32_m128(b)));
-#else
-	float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-	float32x2x2_t result = vzip_f32(a1, b1);
-	return vreinterpretq_m128_f32(
-		vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Selects and interleaves the upper two single-precision, floating-point values
-// from a and b.
-//
-//   r0 := a2
-//   r1 := b2
-//   r2 := a3
-//   r3 := b3
-//
-// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128_f32(vzip2q_f32(vreinterpretq_f32_m128(a),
-						 vreinterpretq_f32_m128(b)));
-#else
-	float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-	float32x2x2_t result = vzip_f32(a1, b1);
-	return vreinterpretq_m128_f32(
-		vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
-// 8 signed or unsigned 8-bit integers in b.
-//
-//   r0 := a8
-//   r1 := b8
-//   r2 := a9
-//   r3 := b9
-//   ...
-//   r14 := a15
-//   r15 := b15
-//
-// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s8(vzip2q_s8(vreinterpretq_s8_m128i(a),
-						vreinterpretq_s8_m128i(b)));
-#else
-	int8x8_t a1 =
-		vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-	int8x8_t b1 =
-		vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-	int8x8x2_t result = vzip_s8(a1, b1);
-	return vreinterpretq_m128i_s8(
-		vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
-// upper 4 signed or unsigned 16-bit integers in b.
-//
-//   r0 := a4
-//   r1 := b4
-//   r2 := a5
-//   r3 := b5
-//   r4 := a6
-//   r5 := b6
-//   r6 := a7
-//   r7 := b7
-//
-// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s16(vzip2q_s16(vreinterpretq_s16_m128i(a),
-						  vreinterpretq_s16_m128i(b)));
-#else
-	int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-	int16x4x2_t result = vzip_s16(a1, b1);
-	return vreinterpretq_m128i_s16(
-		vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
-// upper 2 signed or unsigned 32-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
-	return vreinterpretq_m128i_s32(vzip2q_s32(vreinterpretq_s32_m128i(a),
-						  vreinterpretq_s32_m128i(b)));
-#else
-	int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-	int32x2x2_t result = vzip_s32(a1, b1);
-	return vreinterpretq_m128i_s32(
-		vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper signed or unsigned 64-bit integer in a with the
-// upper signed or unsigned 64-bit integer in b.
-//
-//   r0 := a1
-//   r1 := b1
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
-	int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
-	int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
-	return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-//
-//   index[2:0] := 0
-//   min[15:0] := a[15:0]
-//   FOR j := 0 to 7
-//       i := j*16
-//       IF a[i+15:i] < min[15:0]
-//           index[2:0] := j
-//           min[15:0] := a[i+15:i]
-//       FI
-//   ENDFOR
-//   dst[15:0] := min[15:0]
-//   dst[18:16] := index[2:0]
-//   dst[127:19] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16&expand=3789
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
-	__m128i dst;
-	uint16_t min, idx = 0;
-	// Find the minimum value
-#if defined(__aarch64__)
-	min = vminvq_u16(vreinterpretq_u16_m128i(a));
-#else
-	__m64i tmp;
-	tmp = vreinterpret_m64i_u16(
-		vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
-			 vget_high_u16(vreinterpretq_u16_m128i(a))));
-	tmp = vreinterpret_m64i_u16(vpmin_u16(vreinterpret_u16_m64i(tmp),
-					      vreinterpret_u16_m64i(tmp)));
-	tmp = vreinterpret_m64i_u16(vpmin_u16(vreinterpret_u16_m64i(tmp),
-					      vreinterpret_u16_m64i(tmp)));
-	min = vget_lane_u16(vreinterpret_u16_m64i(tmp), 0);
-#endif
-	// Get the index of the minimum value
-	int i;
-	for (i = 0; i < 8; i++) {
-		if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
-			idx = (uint16_t)i;
-			break;
-		}
-		a = _mm_srli_si128(a, 2);
-	}
-	// Generate result
-	dst = _mm_setzero_si128();
-	dst = vreinterpretq_m128i_u16(
-		vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
-	dst = vreinterpretq_m128i_u16(
-		vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
-	return dst;
-}
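
/* Editor's note: scalar reference, not part of the original sse2neon.h,
 * matching the pseudo-code above: the minimum lands in bits [15:0] and the
 * index of its first occurrence in bits [18:16]. */
static inline uint32_t minpos_epu16_scalar(const uint16_t a[8])
{
	uint16_t min = a[0];
	uint32_t idx = 0;
	for (uint32_t j = 1; j < 8; j++) {
		if (a[j] < min) {
			min = a[j];
			idx = j;
		}
	}
	return (uint32_t)min | (idx << 16);
}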
-
-// Concatenates a (high) and b (low), shifts the 32-byte composite right by c
-// bytes, and returns the low 16 bytes.
-// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
-// http://blog.csdn.net/hemmingway/article/details/44828303
-// Clang requires a macro here, as it is extremely picky about c being a
-// literal.
-#define _mm_alignr_epi8(a, b, c) \
-	((__m128i)vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
-
-// Extracts the selected signed or unsigned 8-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Inserts the least significant 8 bits of b into the selected 8-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-//                                      __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm)                                             \
-	__extension__({                                                        \
-		vreinterpretq_m128i_s8(                                        \
-			vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
-	})
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero
-// extends.
-// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-	vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Inserts the least significant 16 bits of b into the selected 16-bit integer
-// of a.
-// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-//                                       __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm)                               \
-	__extension__({                                           \
-		vreinterpretq_m128i_s16(vsetq_lane_s16(           \
-			(b), vreinterpretq_s16_m128i(a), (imm))); \
-	})
-
-// Extracts the selected signed or unsigned 32-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
-	vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point value from a.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Inserts the least significant 32 bits of b into the selected 32-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-//                                       __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm)                               \
-	__extension__({                                           \
-		vreinterpretq_m128i_s32(vsetq_lane_s32(           \
-			(b), vreinterpretq_s32_m128i(a), (imm))); \
-	})
-
-// Extracts the selected signed or unsigned 64-bit integer from a and zero
-// extends.
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
-	vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Inserts the least significant 64 bits of b into the selected 64-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-//                                       __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm)                               \
-	__extension__({                                           \
-		vreinterpretq_m128i_s64(vsetq_lane_s64(           \
-			(b), vreinterpretq_s64_m128i(a), (imm))); \
-	})
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
-	return __builtin_popcount(a);
-#else
-	return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a)));
-#endif
-#else
-	uint32_t count = 0;
-	uint8x8_t input_val, count8x8_val;
-	uint16x4_t count16x4_val;
-	uint32x2_t count32x2_val;
-
-	input_val = vcreate_u8((uint64_t)a); /* zero-extend; avoids reading past &a */
-	count8x8_val = vcnt_u8(input_val);
-	count16x4_val = vpaddl_u8(count8x8_val);
-	count32x2_val = vpaddl_u16(count16x4_val);
-
-	count = vget_lane_u32(count32x2_val, 0); /* lane 0 holds the popcount of a */
-	return count;
-#endif
-}
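
/* Editor's note: minimal scalar popcount, not part of the original
 * sse2neon.h, equivalent to the vcnt/vpaddl reduction above. */
static inline int popcount_u32_scalar(uint32_t a)
{
	int count = 0;
	while (a) {
		a &= a - 1; /* clear the lowest set bit */
		count++;
	}
	return count;
}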
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
-	return __builtin_popcountll(a);
-#else
-	return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
-	uint64_t count = 0;
-	uint8x8_t input_val, count8x8_val;
-	uint16x4_t count16x4_val;
-	uint32x2_t count32x2_val;
-	uint64x1_t count64x1_val;
-
-	input_val = vld1_u8((uint8_t *)&a);
-	count8x8_val = vcnt_u8(input_val);
-	count16x4_val = vpaddl_u8(count8x8_val);
-	count32x2_val = vpaddl_u16(count16x4_val);
-	count64x1_val = vpaddl_u32(count32x2_val);
-	vst1_u64(&count, count64x1_val);
-	return count;
-#endif
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS&expand=5949
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)   \
-	do {                                        \
-		__m128 tmp0, tmp1, tmp2, tmp3;      \
-		tmp0 = _mm_unpacklo_ps(row0, row1); \
-		tmp2 = _mm_unpacklo_ps(row2, row3); \
-		tmp1 = _mm_unpackhi_ps(row0, row1); \
-		tmp3 = _mm_unpackhi_ps(row2, row3); \
-		row0 = _mm_movelh_ps(tmp0, tmp2);   \
-		row1 = _mm_movehl_ps(tmp2, tmp0);   \
-		row2 = _mm_movelh_ps(tmp1, tmp3);   \
-		row3 = _mm_movehl_ps(tmp3, tmp1);   \
-	} while (0)
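
/* Editor's note: scalar 4x4 transpose, not part of the original sse2neon.h,
 * doing in plain C what _MM_TRANSPOSE4_PS does to its four row vectors. */
static inline void transpose4x4_scalar(float m[4][4])
{
	for (int i = 0; i < 4; i++) {
		for (int j = i + 1; j < 4; j++) {
			float tmp = m[i][j];
			m[i][j] = m[j][i];
			m[j][i] = tmp;
		}
	}
}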
-
-/* Crypto Extensions */
-
-#if defined(__ARM_FEATURE_CRYPTO)
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-	poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
-	poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
-	return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
-	poly8x8_t a = vreinterpret_p8_u64(_a);
-	poly8x8_t b = vreinterpret_p8_u64(_b);
-
-	// Masks
-	uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
-					vcreate_u8(0x00000000ffffffff));
-	uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
-					vcreate_u8(0x0000000000000000));
-
-	// Do the multiplies, rotating with vext to get all combinations
-	uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
-	uint8x16_t e = vreinterpretq_u8_p16(
-		vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
-	uint8x16_t f = vreinterpretq_u8_p16(
-		vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
-	uint8x16_t g = vreinterpretq_u8_p16(
-		vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
-	uint8x16_t h = vreinterpretq_u8_p16(
-		vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
-	uint8x16_t i = vreinterpretq_u8_p16(
-		vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
-	uint8x16_t j = vreinterpretq_u8_p16(
-		vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
-	uint8x16_t k = vreinterpretq_u8_p16(
-		vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
-
-	// Add cross products
-	uint8x16_t l = veorq_u8(e, f); // L = E + F
-	uint8x16_t m = veorq_u8(g, h); // M = G + H
-	uint8x16_t n = veorq_u8(i, j); // N = I + J
-
-	// Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
-	// instructions.
-#if defined(__aarch64__)
-	uint8x16_t lm_p0 = vreinterpretq_u8_u64(
-		vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-	uint8x16_t lm_p1 = vreinterpretq_u8_u64(
-		vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
-	uint8x16_t nk_p0 = vreinterpretq_u8_u64(
-		vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-	uint8x16_t nk_p1 = vreinterpretq_u8_u64(
-		vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
-	uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
-	uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
-	uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
-	uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
-	// t0 = (L) (P0 + P1) << 8
-	// t1 = (M) (P2 + P3) << 16
-	uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
-	uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
-	uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
-	// t2 = (N) (P4 + P5) << 24
-	// t3 = (K) (P6 + P7) << 32
-	uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
-	uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
-	uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
-	// De-interleave
-#if defined(__aarch64__)
-	uint8x16_t t0 = vreinterpretq_u8_u64(vuzp1q_u64(
-		vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-	uint8x16_t t1 = vreinterpretq_u8_u64(vuzp2q_u64(
-		vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
-	uint8x16_t t2 = vreinterpretq_u8_u64(vuzp1q_u64(
-		vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-	uint8x16_t t3 = vreinterpretq_u8_u64(vuzp2q_u64(
-		vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
-	uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
-	uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
-	uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
-	uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
-	// Shift the cross products
-	uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
-	uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
-	uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
-	uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
-
-	// Accumulate the products
-	uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
-	uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
-	uint8x16_t mix = veorq_u8(d, cross1);
-	uint8x16_t r = veorq_u8(mix, cross2);
-	return vreinterpretq_u64_u8(r);
-}
-#endif // ARMv7 polyfill
-
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
-	uint64x2_t a = vreinterpretq_u64_m128i(_a);
-	uint64x2_t b = vreinterpretq_u64_m128i(_b);
-	switch (imm & 0x11) {
-	case 0x00:
-		return vreinterpretq_m128i_u64(
-			_sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
-	case 0x01:
-		return vreinterpretq_m128i_u64(
-			_sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
-	case 0x10:
-		return vreinterpretq_m128i_u64(
-			_sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
-	case 0x11:
-		return vreinterpretq_m128i_u64(_sse2neon_vmull_p64(
-			vget_high_u64(a), vget_high_u64(b)));
-	default:
-		abort();
-	}
-}
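
/* Editor's note: bitwise scalar reference, not part of the original
 * sse2neon.h, for the 64x64 -> 128-bit carry-less multiply that each case of
 * _mm_clmulepi64_si128 above dispatches to. r[0] receives the low 64 bits of
 * the product and r[1] the high 64 bits. */
static inline void clmul_u64_scalar(uint64_t a, uint64_t b, uint64_t r[2])
{
	r[0] = 0;
	r[1] = 0;
	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			r[0] ^= a << i; /* XOR instead of add: no carries */
			if (i != 0)
				r[1] ^= a >> (64 - i);
		}
	}
}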
-
-#if !defined(__ARM_FEATURE_CRYPTO) && defined(__aarch64__)
-// In the absence of crypto extensions, implement aesenc using regular neon
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
-// for more information. Reproduced with permission of the author.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
-{
-	static const uint8_t crypto_aes_sbox[256] = {
-		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01,
-		0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d,
-		0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
-		0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
-		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7,
-		0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
-		0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e,
-		0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
-		0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb,
-		0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c,
-		0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
-		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c,
-		0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d,
-		0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
-		0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3,
-		0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
-		0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a,
-		0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
-		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
-		0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9,
-		0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9,
-		0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99,
-		0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
-	static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
-					     0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
-					     0xc, 0x1, 0x6, 0xb};
-	static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6,
-					   0x7, 0x4, 0x9, 0xa, 0xb, 0x8,
-					   0xd, 0xe, 0xf, 0xc};
-
-	uint8x16_t v;
-	uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
-
-	// shift rows
-	w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
-	// sub bytes
-	v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
-	v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
-	v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
-	v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
-
-	// mix columns
-	w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
-	w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
-	w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
-	//  add round key
-	return vreinterpretq_m128i_u8(w) ^ RoundKey;
-}
-#elif defined(__ARM_FEATURE_CRYPTO)
-// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
-// and AESMC, then manually applying the real key as an XOR operation. This
-// unfortunately means an additional XOR op; the compiler should be able to
-// optimise this away for repeated calls. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-inline __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
-	return vreinterpretq_m128i_u8(
-		vaesmcq_u8(
-			vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
-		vreinterpretq_u8_m128i(b));
-}
-#endif
-
-/* Streaming Extensions */
-
-// Guarantees that every preceding store is globally visible before any
-// subsequent store.
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
-	__sync_synchronize();
-}
-
-// Stores the data in a to the address p without polluting the caches.  If the
-// cache line containing address p is already in the cache, the cache will be
-// updated. Address p must be 16-byte aligned.
-// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
-	__builtin_nontemporal_store(a, p);
-#else
-	vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the
-// coherency domain.
-// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-FORCE_INLINE void _mm_clflush(void const *p)
-{
-	(void)p;
-	// No NEON counterpart here; this is a no-op.
-}
-
-// Allocate aligned blocks of memory.
-// https://software.intel.com/en-us/
-//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
-	void *ptr;
-	if (align == 1)
-		return malloc(size);
-	if (align == 2 || (sizeof(void *) == 8 && align == 4))
-		align = sizeof(void *);
-	if (!posix_memalign(&ptr, align, size))
-		return ptr;
-	return NULL;
-}
-
-FORCE_INLINE void _mm_free(void *addr)
-{
-	free(addr);
-}
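
/* Editor's note: hypothetical usage sketch, not part of the original
 * sse2neon.h, for the aligned-allocation wrappers above. The 16-byte
 * alignment matches what the aligned 128-bit load/store intrinsics expect. */
static inline float *alloc_simd_floats(size_t count)
{
	return (float *)_mm_malloc(count * sizeof(float), 16);
	/* release later with _mm_free() */
}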
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-	__asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
-			     : [c] "+r"(crc)
-			     : [v] "r"(v));
-#else
-	crc ^= v;
-	for (int bit = 0; bit < 8; bit++) {
-		if (crc & 1)
-			crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
-		else
-			crc = (crc >> 1);
-	}
-#endif
-	return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-	__asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
-			     : [c] "+r"(crc)
-			     : [v] "r"(v));
-#else
-	crc = _mm_crc32_u8(crc, v & 0xff);
-	crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
-	return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-	__asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
-			     : [c] "+r"(crc)
-			     : [v] "r"(v));
-#else
-	crc = _mm_crc32_u16(crc, v & 0xffff);
-	crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
-	return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
-	__asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
-			     : [c] "+r"(crc)
-			     : [v] "r"(v));
-#else
-	crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
-	crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
-#endif
-	return crc;
-}
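
/* Editor's note: hypothetical helper, not part of the original sse2neon.h,
 * showing how the CRC32-C accumulators above are typically chained over a
 * byte buffer. */
static inline uint32_t crc32c_buffer(uint32_t crc, const uint8_t *data,
				     size_t len)
{
	for (size_t i = 0; i < len; i++)
		crc = _mm_crc32_u8(crc, data[i]);
	return crc;
}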
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#endif

Some files were not shown because too many files changed in this diff