Merge pull request #3006 from mr-c/simde_20200529

libobs: Freshen SIMDe code copy
Jim 5 years ago
parent
commit
0d7b4e1ced

+ 4 - 0
CMakeLists.txt

@@ -132,6 +132,10 @@ elseif(LOWERCASE_CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64(le)?")
 else()
 	set(NEEDS_SIMDE "1")
 	add_definitions(-DNEEDS_SIMDE=1)
+	if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp-simd -DSIMDE_ENABLE_OPENMP")
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp-simd -DSIMDE_ENABLE_OPENMP")
+	endif()
 	set(ARCH_SIMD_FLAGS "")
 	message(STATUS "No Native SSE2 SIMD Support - Using SIMDE")
 endif()

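A note on the flags added above: SIMDe's SIMDE_VECTORIZE macro expands to "#pragma omp simd" when SIMDE_ENABLE_OPENMP is defined, and GCC/Clang's -fopenmp-simd switch honors that pragma alone, without linking the OpenMP runtime. A minimal sketch of the effect (the function and names here are illustrative, not part of this PR):

/* Built with -fopenmp-simd: the pragma requests vectorization of the
 * loop, but no libgomp dependency is created. */
#include <stddef.h>

void add_u8(unsigned char *dst, const unsigned char *a,
	    const unsigned char *b, size_t n)
{
#pragma omp simd
	for (size_t i = 0; i < n; i++)
		dst[i] = a[i] + b[i];
}
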
+ 5 - 0
libobs/util/simde/README.libobs

@@ -0,0 +1,5 @@
+This is a slightly modified version of https://github.com/nemequ/simde/commit/cafec4b952fa5a31a51a10326f97c2e7c9067771
+sse{,2}.h and mmx.h were moved down from the original "x86" subdirectory;
+subsequently, the '#include "../simde-common.h"' line in mmx.h was changed to '#include "simde-common.h"'.
+
+Then the code was reformatted using the "formatcode.sh" script in the root of this repository.

+ 50 - 24
libobs/util/simde/check.h

@@ -6,6 +6,8 @@
  *   copyright and related or neighboring rights to this code.  For
  *   details, see the Creative Commons Zero 1.0 Universal license at
  *   https://creativecommons.org/publicdomain/zero/1.0/
+ *
+ * SPDX-License-Identifier: CC0-1.0
  */

 #if !defined(SIMDE_CHECK_H)
@@ -15,6 +17,7 @@
 #define SIMDE_NDEBUG 1
 #endif

+#include "hedley.h"
 #include <stdint.h>

 #if !defined(_WIN32)
@@ -32,24 +35,47 @@
 #endif

 #if defined(_MSC_VER) && (_MSC_VER >= 1500)
-#define SIMDE__PUSH_DISABLE_MSVC_C4127 \
+#define SIMDE_PUSH_DISABLE_MSVC_C4127_ \
 	__pragma(warning(push)) __pragma(warning(disable : 4127))
-#define SIMDE__POP_DISABLE_MSVC_C4127 __pragma(warning(pop))
+#define SIMDE_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop))
 #else
-#define SIMDE__PUSH_DISABLE_MSVC_C4127
-#define SIMDE__POP_DISABLE_MSVC_C4127
+#define SIMDE_PUSH_DISABLE_MSVC_C4127_
+#define SIMDE_POP_DISABLE_MSVC_C4127_
 #endif

 #if !defined(simde_errorf)
+#if defined(__has_include)
+#if __has_include(<stdio.h>)
+#include <stdio.h>
+#endif
+#elif defined(SIMDE_STDC_HOSTED)
+#if SIMDE_STDC_HOSTED == 1
 #include <stdio.h>
-#include <stdlib.h>
+#endif
+#elif defined(__STDC_HOSTED__)
+#if __STDC_HOSTED__ == 1
+#include <stdio.h>
+#endif
+#endif
+
+#include "debug-trap.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
+#if defined(EOF)
 #define simde_errorf(format, ...) \
 	(fprintf(stderr, format, __VA_ARGS__), abort())
+#else
+#define simde_errorf(format, ...) (simde_trap())
+#endif
+HEDLEY_DIAGNOSTIC_POP
 #endif

 #define simde_error(msg) simde_errorf("%s", msg)

-#if defined(SIMDE_NDEBUG)
+#if defined(SIMDE_NDEBUG) ||                                 \
+	(defined(__cplusplus) && (__cplusplus < 201103L)) || \
+	(defined(__STDC__) && (__STDC__ < 199901L))
 #if defined(SIMDE_CHECK_FAIL_DEFINED)
 #define simde_assert(expr)
 #else
@@ -78,8 +104,8 @@
 		if (!HEDLEY_LIKELY(expr)) {                           \
 			simde_error("assertion failed: " #expr "\n"); \
 		}                                                     \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                        \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                        \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_true(expr)                                \
 	do {                                                   \
@@ -87,8 +113,8 @@
 			simde_error("assertion failed: " #expr \
 				    " is not true\n");         \
 		}                                              \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                 \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_false(expr)                               \
 	do {                                                   \
@@ -96,8 +122,8 @@
 			simde_error("assertion failed: " #expr \
 				    " is not false\n");        \
 		}                                              \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                 \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                 \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b)           \
 	do {                                                               \
@@ -110,8 +136,8 @@
 				     #a, #op, #b, simde_tmp_a_, #op,       \
 				     simde_tmp_b_);                        \
 		}                                                          \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                             \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                             \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_double_equal(a, b, precision)                           \
 	do {                                                                 \
@@ -127,8 +153,8 @@
 				"g == %0." #precision "g)\n",                \
 				#a, #b, simde_tmp_a_, simde_tmp_b_);         \
 		}                                                            \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                               \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                               \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #include <string.h>
 #define simde_assert_string_equal(a, b)                                                   \
@@ -141,8 +167,8 @@
 				"assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \
 				#a, #b, simde_tmp_a_, simde_tmp_b_);                      \
 		}                                                                         \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                                            \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                                            \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_string_not_equal(a, b)                                               \
 	do {                                                                              \
@@ -154,8 +180,8 @@
 				"assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \
 				#a, #b, simde_tmp_a_, simde_tmp_b_);                      \
 		}                                                                         \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                                            \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                                            \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_memory_equal(size, a, b)                                                                        \
 	do {                                                                                                         \
@@ -180,8 +206,8 @@
 				}                                                                                    \
 			}                                                                                            \
 		}                                                                                                    \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                                                                       \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                                                                       \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_

 #define simde_assert_memory_not_equal(size, a, b)                                          \
 	do {                                                                               \
@@ -197,8 +223,8 @@
 				"u bytes)\n",                                              \
 				#a, #b, simde_tmp_size_);                                  \
 		}                                                                          \
-		SIMDE__PUSH_DISABLE_MSVC_C4127                                             \
-	} while (0) SIMDE__POP_DISABLE_MSVC_C4127
+		SIMDE_PUSH_DISABLE_MSVC_C4127_                                             \
+	} while (0) SIMDE_POP_DISABLE_MSVC_C4127_
 #endif

 #define simde_assert_type(T, fmt, a, op, b) \

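For orientation: check.h provides the simde_assert* macros used by SIMDe's self-checks. With the new guard above they also compile away on pre-C99 and pre-C++11 compilers, and simde_errorf() now falls back to simde_trap() when no hosted stdio is detected. A hypothetical caller, assuming a hosted build with assertions enabled:

#include "check.h"

static void validate(int count)
{
	simde_assert(count >= 0);
	/* simde_assert_type(T, fmt, a, op, b): compare two values of type T,
	 * printing them with the printf conversion given in fmt. */
	simde_assert_type(int, "d", count, <, 256);
}
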
+ 117 - 0
libobs/util/simde/debug-trap.h

@@ -0,0 +1,117 @@
+/* Debugging assertions and traps
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson <[email protected]>
+ *
+ *   To the extent possible under law, the authors have waived all
+ *   copyright and related or neighboring rights to this code.  For
+ *   details, see the Creative Commons Zero 1.0 Universal license at
+ *   https://creativecommons.org/publicdomain/zero/1.0/
+ *
+ * SPDX-License-Identifier: CC0-1.0
+ */
+
+#if !defined(SIMDE_DEBUG_TRAP_H)
+#define SIMDE_DEBUG_TRAP_H
+
+#if !defined(SIMDE_NDEBUG) && defined(NDEBUG) && !defined(SIMDE_DEBUG)
+#define SIMDE_NDEBUG 1
+#endif
+
+#if defined(__has_builtin) && !defined(__ibmxl__)
+#if __has_builtin(__builtin_debugtrap)
+#define simde_trap() __builtin_debugtrap()
+#elif __has_builtin(__debugbreak)
+#define simde_trap() __debugbreak()
+#endif
+#endif
+#if !defined(simde_trap)
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#define simde_trap() __debugbreak()
+#elif defined(__ARMCC_VERSION)
+#define simde_trap() __breakpoint(42)
+#elif defined(__ibmxl__) || defined(__xlC__)
+#include <builtins.h>
+#define simde_trap() __trap(42)
+#elif defined(__DMC__) && defined(_M_IX86)
+static inline void simde_trap(void)
+{
+	__asm int 3h;
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("int $03");
+}
+#elif defined(__thumb__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(".inst 0xde01");
+}
+#elif defined(__aarch64__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(".inst 0xd4200000");
+}
+#elif defined(__arm__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(".inst 0xe7f001f0");
+}
+#elif defined(__alpha__) && !defined(__osf__)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("bpt");
+}
+#elif defined(_54_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("ESTOP");
+}
+#elif defined(_55_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__(
+		";\n .if (.MNEMONIC)\n ESTOP_1\n .else\n ESTOP_1()\n .endif\n NOP");
+}
+#elif defined(_64P_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("SWBP 0");
+}
+#elif defined(_6x_)
+static inline void simde_trap(void)
+{
+	__asm__ __volatile__("NOP\n .word 0x10000000");
+}
+#elif defined(__STDC_HOSTED__) && (__STDC_HOSTED__ == 0) && defined(__GNUC__)
+#define simde_trap() __builtin_trap()
+#else
+#include <signal.h>
+#if defined(SIGTRAP)
+#define simde_trap() raise(SIGTRAP)
+#else
+#define simde_trap() raise(SIGABRT)
+#endif
+#endif
+#endif
+
+#if defined(HEDLEY_LIKELY)
+#define SIMDE_DBG_LIKELY(expr) HEDLEY_LIKELY(expr)
+#elif defined(__GNUC__) && (__GNUC__ >= 3)
+#define SIMDE_DBG_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#else
+#define SIMDE_DBG_LIKELY(expr) (!!(expr))
+#endif
+
+#if !defined(SIMDE_NDEBUG) || (SIMDE_NDEBUG == 0)
+#define simde_dbg_assert(expr)                 \
+	do {                                   \
+		if (!SIMDE_DBG_LIKELY(expr)) { \
+			simde_trap();          \
+		}                              \
+	} while (0)
+#else
+#define simde_dbg_assert(expr)
+#endif
+
+#endif /* !defined(SIMDE_DEBUG_TRAP_H) */

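The new debug-trap.h chooses the cheapest breakpoint mechanism available for the detected compiler and architecture, falling back to raise(SIGTRAP) (or SIGABRT) on hosted systems. A small usage sketch (hypothetical code, not from this PR):

#include "debug-trap.h"

static int checked_div(int num, int den)
{
	/* Breaks into an attached debugger if den == 0; the whole check
	 * is compiled out when SIMDE_NDEBUG is defined. */
	simde_dbg_assert(den != 0);
	return num / den;
}
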
The file diff has been suppressed because it is too large
+ 583 - 135
libobs/util/simde/hedley.h


+ 1613 - 699
libobs/util/simde/mmx.h

@@ -1,4 +1,4 @@
-/* Copyright (c) 2017-2018 Evan Nemerson <[email protected]>
+/* SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
@@ -19,64 +19,71 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
+ *
+ * Copyright:
+ *   2017-2020 Evan Nemerson <[email protected]>
  */

-#if !defined(SIMDE__MMX_H)
-#define SIMDE__MMX_H
-#endif
+#if !defined(SIMDE_X86_MMX_H)
+#define SIMDE_X86_MMX_H
+
 #include "simde-common.h"

-#if defined(SIMDE_MMX_FORCE_NATIVE)
-#define SIMDE_MMX_NATIVE
-#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \
-	!defined(SIMDE_NO_NATIVE)
-#define SIMDE_MMX_NATIVE
-#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \
-	!defined(SIMDE_NO_NEON)
-#define SIMDE_MMX_NEON
+#if !defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
+#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
+#endif
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+
+#if defined(SIMDE_X86_MMX_NATIVE)
+#define SIMDE_X86_MMX_USE_NATIVE_TYPE
+#elif defined(SIMDE_X86_SSE_NATIVE)
+#define SIMDE_X86_MMX_USE_NATIVE_TYPE
 #endif

-#if defined(SIMDE_MMX_NATIVE)
+#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
 #include <mmintrin.h>
-#else
-#if defined(SIMDE_MMX_NEON)
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 #include <arm_neon.h>
 #endif
-#endif
+
 #include <stdint.h>
 #include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-SIMDE__BEGIN_DECLS
+SIMDE_BEGIN_DECLS_

 typedef union {
-#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
-	int8_t i8 __attribute__((__vector_size__(8), __may_alias__));
-	int16_t i16 __attribute__((__vector_size__(8), __may_alias__));
-	int32_t i32 __attribute__((__vector_size__(8), __may_alias__));
-	int64_t i64 __attribute__((__vector_size__(8), __may_alias__));
-	uint8_t u8 __attribute__((__vector_size__(8), __may_alias__));
-	uint16_t u16 __attribute__((__vector_size__(8), __may_alias__));
-	uint32_t u32 __attribute__((__vector_size__(8), __may_alias__));
-	uint64_t u64 __attribute__((__vector_size__(8), __may_alias__));
-	simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__));
-#else
-	int8_t i8[8];
-	int16_t i16[4];
-	int32_t i32[2];
-	int64_t i64[1];
-	uint8_t u8[8];
-	uint16_t u16[4];
-	uint32_t u32[2];
-	uint64_t u64[1];
-	simde_float32 f32[2];
-#endif
-
-#if defined(SIMDE_MMX_NATIVE)
+#if defined(SIMDE_VECTOR_SUBSCRIPT)
+	SIMDE_ALIGN(8) int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+	SIMDE_ALIGN(8) uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+#else
+	SIMDE_ALIGN(8) int8_t i8[8];
+	SIMDE_ALIGN(8) int16_t i16[4];
+	SIMDE_ALIGN(8) int32_t i32[2];
+	SIMDE_ALIGN(8) int64_t i64[1];
+	SIMDE_ALIGN(8) uint8_t u8[8];
+	SIMDE_ALIGN(8) uint16_t u16[4];
+	SIMDE_ALIGN(8) uint32_t u32[2];
+	SIMDE_ALIGN(8) uint64_t u64[1];
+	SIMDE_ALIGN(8) simde_float32 f32[2];
+	SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)];
+	SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
+#endif
+
+#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
 	__m64 n;
-#elif defined(SIMDE_MMX_NEON)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
 	int8x8_t neon_i8;
 	int16x4_t neon_i16;
 	int32x2_t neon_i32;
@@ -87,1270 +94,2177 @@ typedef union {
 	uint64x1_t neon_u64;
 	float32x2_t neon_f32;
 #endif
-} simde__m64;
+} simde__m64_private;
+
+#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
+typedef __m64 simde__m64;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+typedef int32x2_t simde__m64;
+#elif defined(SIMDE_VECTOR_SUBSCRIPT)
+typedef int32_t simde__m64 SIMDE_ALIGN(8) SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
+#else
+typedef simde__m64_private simde__m64;
+#endif
+
+#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && \
+	defined(SIMDE_ENABLE_NATIVE_ALIASES)
+#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
+typedef simde__m64 __m64;
+#endif
-#if defined(SIMDE_MMX_NATIVE)
-HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64),
-		     "__m64 size doesn't match simde__m64 size");
-SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_C(__m64 v)
+HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
+HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
+#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8,
+		     "simde__m64 is not 8-byte aligned");
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8,
+		     "simde__m64_private is not 8-byte aligned");
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64 simde__m64_from_private(simde__m64_private v)
 {
 	simde__m64 r;
-	r.n = v;
+	simde_memcpy(&r, &v, sizeof(r));
 	return r;
 }
-#elif defined(SIMDE_MMX_NEON)
-#define SIMDE__M64_NEON_C(T, expr) \
-	(simde__m64) { .neon_##T = (expr) }
-#endif
-HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64_private simde__m64_to_private(simde__m64 v)
+{
+	simde__m64_private r;
+	simde_memcpy(&r, &v, sizeof(r));
+	return r;
+}
+
+#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, \
+					       fragment)                      \
+	SIMDE_FUNCTION_ATTRIBUTES                                             \
+	simde__##simde_type simde__##simde_type##_from_##isax##_##fragment(   \
+		source_type value)                                            \
+	{                                                                     \
+		simde__##simde_type##_private r_;                             \
+		r_.isax##_##fragment = value;                                 \
+		return simde__##simde_type##_from_private(r_);                \
+	}                                                                     \
+                                                                              \
+	SIMDE_FUNCTION_ATTRIBUTES                                             \
+	source_type simde__##simde_type##_to_##isax##_##fragment(             \
+		simde__##simde_type value)                                    \
+	{                                                                     \
+		simde__##simde_type##_private r_ =                            \
+			simde__##simde_type##_to_private(value);              \
+		return r_.isax##_##fragment;                                  \
+	}
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
+#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
+
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_add_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_add_pi8(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < 8; i++) {
-		r.i8[i] = a.i8[i] + b.i8[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i8 = a_.i8 + b_.i8;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
+		r_.i8[i] = a_.i8[i] + b_.i8[i];
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
+#define _m_paddb(a, b) simde_m_paddb(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_add_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_add_pi16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		r.i16[i] = a.i16[i] + b.i16[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i16 = a_.i16 + b_.i16;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = a_.i16[i] + b_.i16[i];
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
+#define _m_add_paddw(a, b) simde_mm_add_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_add_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_add_pi32(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
-		r.i32[i] = a.i32[i] + b.i32[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32 = a_.i32 + b_.i32;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
+		r_.i32[i] = a_.i32[i] + b_.i32[i];
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
+#define _m_add_paddd(a, b) simde_mm_add_pi32(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_adds_pi8(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 8; i++) {
-		if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
-			r.i8[i] = INT8_MAX;
-		} else if ((((b.i8[i]) < 0) &&
-			    ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
-			r.i8[i] = INT8_MIN;
+	simde__m64_private r_, a_ = simde__m64_to_private(a),
+			       b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
+		if ((((b_.i8[i]) > 0) &&
+		     ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
+			r_.i8[i] = INT8_MAX;
+		} else if ((((b_.i8[i]) < 0) &&
+			    ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
+			r_.i8[i] = INT8_MIN;
 		} else {
-			r.i8[i] = (a.i8[i]) + (b.i8[i]);
+			r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
 		}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
+#define _m_add_paddsb(a, b) simde_mm_adds_pi8(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_adds_pu8(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < 8; i++) {
-		const int32_t x = a.u8[i] + b.u8[i];
-		if (x < 0)
-			r.u8[i] = 0;
-		else if (x > UINT8_MAX)
-			r.u8[i] = UINT8_MAX;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
+		const uint_fast16_t x =
+			HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) +
+			HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
+		if (x > UINT8_MAX)
+			r_.u8[i] = UINT8_MAX;
 		else
-			r.u8[i] = (uint8_t)x;
+			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
+#define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_adds_pi16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 4; i++) {
-		if ((((b.i16[i]) > 0) &&
-		     ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
-			r.i16[i] = INT16_MAX;
-		} else if ((((b.i16[i]) < 0) &&
-			    ((a.i16[i]) < (SHRT_MIN - (b.i16[i]))))) {
-			r.i16[i] = SHRT_MIN;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		if ((((b_.i16[i]) > 0) &&
+		     ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
+			r_.i16[i] = INT16_MAX;
+		} else if ((((b_.i16[i]) < 0) &&
+			    ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) {
+			r_.i16[i] = SHRT_MIN;
 		} else {
-			r.i16[i] = (a.i16[i]) + (b.i16[i]);
+			r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
 		}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
+#define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_adds_pu16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		const uint32_t x = a.u16[i] + b.u16[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		const uint32_t x = a_.u16[i] + b_.u16[i];
 		if (x > UINT16_MAX)
-			r.u16[i] = UINT16_MAX;
+			r_.u16[i] = UINT16_MAX;
 		else
-			r.u16[i] = (uint16_t)x;
+			r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
+#define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_and_si64(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_and_si64(a, b);
 #else
-	simde__m64 r;
-	r.i64[0] = a.i64[0] & b.i64[0];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i64 = a_.i64 & b_.i64;
+#else
+	r_.i64[0] = a_.i64[0] & b_.i64[0];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pand(a, b) simde_mm_and_si64(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
+#define _m_pand(a, b) simde_mm_and_si64(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_andnot_si64(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_andnot_si64(a, b);
 #else
-	simde__m64 r;
-	r.i64[0] = ~(a.i64[0]) & b.i64[0];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32f = ~a_.i32f & b_.i32f;
+#else
+	r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
+#define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cmpeq_pi8(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 8; i++) {
-		r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vreinterpret_s8_u8(vceq_s8(a_.neon_i8, b_.neon_i8));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
+		r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
+#define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cmpeq_pi16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 4; i++) {
-		r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vreinterpret_s16_u16(vceq_s16(a_.neon_i16, b_.neon_i16));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
+#define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cmpeq_pi32(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 2; i++) {
-		r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vreinterpret_s32_u32(vceq_s32(a_.neon_i32, b_.neon_i32));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
+		r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
+#define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cmpgt_pi8(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 8; i++) {
-		r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a_.neon_i8, b_.neon_i8));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
+		r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
+#define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cmpgt_pi16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 4; i++) {
-		r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a_.neon_i16, b_.neon_i16));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
+#define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cmpgt_pi32(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 2; i++) {
-		r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a_.neon_i32, b_.neon_i32));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
+		r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
+#define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int64_t simde_mm_cvtm64_si64(simde__m64 a)
 {
-#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
-	return _mm_cvtm64_si64(a.n);
+#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
+	!defined(__PGI)
+	return _mm_cvtm64_si64(a);
+#else
+	simde__m64_private a_ = simde__m64_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	return vget_lane_s64(a_.neon_i64, 0);
 #else
-	return a.i64[0];
+	return a_.i64[0];
+#endif
 #endif
 }
 #define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
+#define _m_to_int64(a) simde_mm_cvtm64_si64(a)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cvtsi32_si64(int32_t a)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_cvtsi32_si64(a));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cvtsi32_si64(a);
 #else
-	simde__m64 r;
-	r.i32[0] = a;
-	r.i32[1] = 0;
-	return r;
+	simde__m64_private r_;
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = {a, 0};
+	r_.neon_i32 = vld1_s32(av);
+#else
+	r_.i32[0] = a;
+	r_.i32[1] = 0;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
+#define _m_from_int(a) simde_mm_cvtsi32_si64(a)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_cvtsi64_m64(int64_t a)
 {
-#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_cvtsi64_m64(a));
+#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
+	!defined(__PGI)
+	return _mm_cvtsi64_m64(a);
 #else
-	simde__m64 r;
-	r.i64[0] = a;
-	return r;
+	simde__m64_private r_;
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i64 = vld1_s64(&a);
+#else
+	r_.i64[0] = a;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
+#define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int32_t simde_mm_cvtsi64_si32(simde__m64 a)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return _mm_cvtsi64_si32(a.n);
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_cvtsi64_si32(a);
+#else
+	simde__m64_private a_ = simde__m64_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	return vget_lane_s32(a_.neon_i32, 0);
 #else
 #else
+	return a_.i32[0];
+#endif
 #endif
 #endif
 }
+#define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
+#endif
 
+SIMDE_FUNCTION_ATTRIBUTES
 void simde_mm_empty(void)
 void simde_mm_empty(void)
 {
+#if defined(SIMDE_X86_MMX_NATIVE)
 	_mm_empty();
 	_mm_empty();
 #else
 #endif
 }
 #define simde_m_empty() simde_mm_empty()
+#define _mm_empty() simde_mm_empty()
+#define _m_empty() simde_mm_empty()
+#endif
 
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
 {
-	return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_madd_pi16(a, b);
 #else
 #else
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 4; i += 2) {
-		r.i32[i / 2] =
-			(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
+	r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
+		r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
+				(a_.i16[i + 1] * b_.i16[i + 1]);
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
+#define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_mulhi_pi16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 4; i++) {
-		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16);
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
+	const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
+	const uint16x4_t t3 = vmovn_u32(t2);
+	r_.neon_i16 = vreinterpret_s16_u16(t3);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = HEDLEY_STATIC_CAST(int16_t,
+					       ((a_.i16[i] * b_.i16[i]) >> 16));
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
+#define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_mullo_pi16(a, b);
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (int i = 0; i < 4; i++) {
-		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff);
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
+	const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
+	r_.neon_i16 = vreinterpret_s16_u16(t2);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = HEDLEY_STATIC_CAST(
+			int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
+#define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_or_si64(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_or_si64(a, b);
 #else
-	simde__m64 r;
-	r.i64[0] = a.i64[0] | b.i64[0];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i64 = a_.i64 | b_.i64;
+#else
+	r_.i64[0] = a_.i64[0] | b_.i64[0];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_por(a, b) simde_mm_or_si64(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
+#define _m_por(a, b) simde_mm_or_si64(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_packs_pi16(a, b);
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		if (a.i16[i] < INT8_MIN) {
-			r.i8[i] = INT8_MIN;
-		} else if (a.i16[i] > INT8_MAX) {
-			r.i8[i] = INT8_MAX;
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		if (a_.i16[i] < INT8_MIN) {
+			r_.i8[i] = INT8_MIN;
+		} else if (a_.i16[i] > INT8_MAX) {
+			r_.i8[i] = INT8_MAX;
 		} else {
-			r.i8[i] = (int8_t)a.i16[i];
+			r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
 		}
 	}
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		if (b.i16[i] < INT8_MIN) {
-			r.i8[i + 4] = INT8_MIN;
-		} else if (b.i16[i] > INT8_MAX) {
-			r.i8[i + 4] = INT8_MAX;
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		if (b_.i16[i] < INT8_MIN) {
+			r_.i8[i + 4] = INT8_MIN;
+		} else if (b_.i16[i] > INT8_MAX) {
+			r_.i8[i + 4] = INT8_MAX;
 		} else {
-			r.i8[i + 4] = (int8_t)b.i16[i];
+			r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
 		}
 	}
+#endif
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
+#define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_packs_pi32(a, b);
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) {
-		if (a.i32[i] < SHRT_MIN) {
-			r.i16[i] = SHRT_MIN;
-		} else if (a.i32[i] > INT16_MAX) {
-			r.i16[i] = INT16_MAX;
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
+		if (a_.i32[i] < SHRT_MIN) {
+			r_.i16[i] = SHRT_MIN;
+		} else if (a_.i32[i] > INT16_MAX) {
+			r_.i16[i] = INT16_MAX;
 		} else {
-			r.i16[i] = (int16_t)a.i32[i];
+			r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
 		}
 	}
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) {
-		if (b.i32[i] < SHRT_MIN) {
-			r.i16[i + 2] = SHRT_MIN;
-		} else if (b.i32[i] > INT16_MAX) {
-			r.i16[i + 2] = INT16_MAX;
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (8 / sizeof(b_.i32[0])); i++) {
+		if (b_.i32[i] < SHRT_MIN) {
+			r_.i16[i + 2] = SHRT_MIN;
+		} else if (b_.i32[i] > INT16_MAX) {
+			r_.i16[i + 2] = INT16_MAX;
 		} else {
-			r.i16[i + 2] = (int16_t)b.i32[i];
+			r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
 		}
 	}
+#endif
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 }
 #define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
+#define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
+#endif
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_packs_pu16(a, b);
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		if (a.i16[i] > UINT8_MAX) {
-			r.u8[i] = UINT8_MAX;
-		} else if (a.i16[i] < 0) {
-			r.u8[i] = 0;
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
+
+	/* Set elements which are < 0 to 0 */
+	const int16x8_t t2 =
+		vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
+
+	/* Vector with all s16 elements set to UINT8_MAX */
+	const int16x8_t vmax = vmovq_n_s16((int16_t)UINT8_MAX);
+
+	/* Elements which are within the acceptable range */
+	const int16x8_t le_max =
+		vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
+	const int16x8_t gt_max =
+		vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
+
+	/* Final values as 16-bit integers */
+	const int16x8_t values = vorrq_s16(le_max, gt_max);
+
+	r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		if (a_.i16[i] > UINT8_MAX) {
+			r_.u8[i] = UINT8_MAX;
+		} else if (a_.i16[i] < 0) {
+			r_.u8[i] = 0;
 		} else {
 		} else {
-			r.u8[i] = (int8_t)a.i16[i];
+			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
 		}
 		}
 	}
 	}
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		if (b.i16[i] > UINT8_MAX) {
-			r.u8[i + 4] = UINT8_MAX;
-		} else if (b.i16[i] < 0) {
-			r.u8[i + 4] = 0;
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		if (b_.i16[i] > UINT8_MAX) {
+			r_.u8[i + 4] = UINT8_MAX;
+		} else if (b_.i16[i] < 0) {
+			r_.u8[i + 4] = 0;
 		} else {
 		} else {
-			r.u8[i + 4] = (int8_t)b.i16[i];
+			r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
 		}
 		}
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
 #define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
+#define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
+#endif
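
The A64 path builds the unsigned clamp from compares and masks before the final narrow; the scalar loop spells out the same saturation. A hedged sketch of the behavior:

	simde__m64 a = simde_mm_set_pi16(300, -7, 128, 0);
	simde__m64 r = simde_mm_packs_pu16(a, simde_mm_setzero_si64());
	/* lanes 0..3 of the result: 0, 128, 0, 255 -- negatives clamp to 0,
	 * values above 255 clamp to UINT8_MAX; lanes 4..7 come from the
	 * zero vector and stay 0 */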
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
 simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
 			    int8_t e3, int8_t e2, int8_t e1, int8_t e0)
 			    int8_t e3, int8_t e2, int8_t e1, int8_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
 #else
 #else
-	simde__m64 r;
-	r.i8[0] = e0;
-	r.i8[1] = e1;
-	r.i8[2] = e2;
-	r.i8[3] = e3;
-	r.i8[4] = e4;
-	r.i8[5] = e5;
-	r.i8[6] = e6;
-	r.i8[7] = e7;
-	return r;
+	simde__m64_private r_;
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = {e0, e1, e2, e3,
+							    e4, e5, e6, e7};
+	r_.neon_i8 = vld1_s8(v);
+#else
+	r_.i8[0] = e0;
+	r_.i8[1] = e1;
+	r_.i8[2] = e2;
+	r_.i8[3] = e3;
+	r_.i8[4] = e4;
+	r_.i8[5] = e5;
+	r_.i8[6] = e6;
+	r_.i8[7] = e7;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
+	simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
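
Note the argument order: simde_mm_set_pi8 takes the most-significant lane first, so e0 lands in byte 0; simde_mm_setr_pi8 further down is the memory-order variant. A quick sketch:

	simde__m64 x = simde_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);  /* lane k holds k */
	simde__m64 y = simde_mm_setr_pi8(7, 6, 5, 4, 3, 2, 1, 0); /* lane k holds 7 - k */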
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
 simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
 			      uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
 			      uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5,
-					(int8_t)e4, (int8_t)e3, (int8_t)e2,
-					(int8_t)e1, (int8_t)e0));
+	simde__m64_private r_;
+
+#if defined(SIMDE_X86_MMX_NATIVE)
+	r_.n = _mm_set_pi8(
+		HEDLEY_STATIC_CAST(int8_t, e7), HEDLEY_STATIC_CAST(int8_t, e6),
+		HEDLEY_STATIC_CAST(int8_t, e5), HEDLEY_STATIC_CAST(int8_t, e4),
+		HEDLEY_STATIC_CAST(int8_t, e3), HEDLEY_STATIC_CAST(int8_t, e2),
+		HEDLEY_STATIC_CAST(int8_t, e1), HEDLEY_STATIC_CAST(int8_t, e0));
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = {e0, e1, e2, e3,
+							     e4, e5, e6, e7};
+	r_.neon_u8 = vld1_u8(v);
 #else
 #else
-	simde__m64 r;
-	r.u8[0] = e0;
-	r.u8[1] = e1;
-	r.u8[2] = e2;
-	r.u8[3] = e3;
-	r.u8[4] = e4;
-	r.u8[5] = e5;
-	r.u8[6] = e6;
-	r.u8[7] = e7;
-	return r;
+	r_.u8[0] = e0;
+	r_.u8[1] = e1;
+	r_.u8[2] = e2;
+	r_.u8[3] = e3;
+	r_.u8[4] = e4;
+	r_.u8[5] = e5;
+	r_.u8[6] = e6;
+	r_.u8[7] = e7;
 #endif
 #endif
+
+	return simde__m64_from_private(r_);
 }
 }
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
 simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_set_pi16(e3, e2, e1, e0);
 #else
 #else
-	simde__m64 r;
-	r.i16[0] = e0;
-	r.i16[1] = e1;
-	r.i16[2] = e2;
-	r.i16[3] = e3;
-	return r;
+	simde__m64_private r_;
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = {e0, e1, e2, e3};
+	r_.neon_i16 = vld1_s16(v);
+#else
+	r_.i16[0] = e0;
+	r_.i16[1] = e1;
+	r_.i16[2] = e2;
+	r_.i16[3] = e3;
+#endif
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
 simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
 			       uint16_t e0)
 			       uint16_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1,
-					 (int16_t)e0));
+	simde__m64_private r_;
+
+#if defined(SIMDE_X86_MMX_NATIVE)
+	r_.n = _mm_set_pi16(HEDLEY_STATIC_CAST(int16_t, e3),
+			    HEDLEY_STATIC_CAST(int16_t, e2),
+			    HEDLEY_STATIC_CAST(int16_t, e1),
+			    HEDLEY_STATIC_CAST(int16_t, e0));
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = {e0, e1, e2, e3};
+	r_.neon_u16 = vld1_u16(v);
 #else
 #else
-	simde__m64 r;
-	r.u16[0] = e0;
-	r.u16[1] = e1;
-	r.u16[2] = e2;
-	r.u16[3] = e3;
-	return r;
+	r_.u16[0] = e0;
+	r_.u16[1] = e1;
+	r_.u16[2] = e2;
+	r_.u16[3] = e3;
 #endif
 #endif
+
+	return simde__m64_from_private(r_);
 }
 }
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
 simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0));
+	simde__m64_private r_;
+
+#if defined(SIMDE_X86_MMX_NATIVE)
+	r_.n = _mm_set_pi32(HEDLEY_STATIC_CAST(int32_t, e1),
+			    HEDLEY_STATIC_CAST(int32_t, e0));
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = {e0, e1};
+	r_.neon_u32 = vld1_u32(v);
 #else
 #else
-	simde__m64 r;
-	r.u32[0] = e0;
-	r.u32[1] = e1;
-	return r;
+	r_.u32[0] = e0;
+	r_.u32[1] = e1;
 #endif
 #endif
+
+	return simde__m64_from_private(r_);
 }
 }
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
 simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set_pi32(e1, e0));
+	simde__m64_private r_;
+
+#if defined(SIMDE_X86_MMX_NATIVE)
+	r_.n = _mm_set_pi32(e1, e0);
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = {e0, e1};
+	r_.neon_i32 = vld1_s32(v);
 #else
 #else
-	simde__m64 r;
-	r.i32[0] = e0;
-	r.i32[1] = e1;
-	return r;
+	r_.i32[0] = e0;
+	r_.i32[1] = e1;
 #endif
 #endif
+
+	return simde__m64_from_private(r_);
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_set_pi64(int64_t e0)
+{
+	simde__m64_private r_;
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = {e0};
+	r_.neon_i64 = vld1_s64(v);
+#else
+	r_.i64[0] = e0;
+#endif
+
+	return simde__m64_from_private(r_);
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_set_f32x2(simde_float32 e1, simde_float32 e0)
+{
+	simde__m64_private r_;
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = {e0, e1};
+	r_.neon_f32 = vld1_f32(v);
+#else
+	r_.f32[0] = e0;
+	r_.f32[1] = e1;
+#endif
+
+	return simde__m64_from_private(r_);
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_set1_pi8(int8_t a)
 simde__m64 simde_mm_set1_pi8(int8_t a)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set1_pi8(a));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_set1_pi8(a);
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	simde__m64_private r_;
+	r_.neon_i8 = vmov_n_s8(a);
+	return simde__m64_from_private(r_);
 #else
 #else
 	return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
 	return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_set1_pi16(int16_t a)
 simde__m64 simde_mm_set1_pi16(int16_t a)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set1_pi16(a));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_set1_pi16(a);
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	simde__m64_private r_;
+	r_.neon_i16 = vmov_n_s16(a);
+	return simde__m64_from_private(r_);
 #else
 #else
 	return simde_mm_set_pi16(a, a, a, a);
 	return simde_mm_set_pi16(a, a, a, a);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_set1_pi32(int32_t a)
 simde__m64 simde_mm_set1_pi32(int32_t a)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_set1_pi32(a));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_set1_pi32(a);
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	simde__m64_private r_;
+	r_.neon_i32 = vmov_n_s32(a);
+	return simde__m64_from_private(r_);
 #else
 #else
 	return simde_mm_set_pi32(a, a);
 	return simde_mm_set_pi32(a, a);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
 simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
 			     int8_t e3, int8_t e2, int8_t e1, int8_t e0)
 			     int8_t e3, int8_t e2, int8_t e1, int8_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
 #else
 #else
 	return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
 	return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
+	simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
 simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_setr_pi16(e3, e2, e1, e0);
 #else
 #else
 	return simde_mm_set_pi16(e0, e1, e2, e3);
 	return simde_mm_set_pi16(e0, e1, e2, e3);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
 simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_setr_pi32(e1, e0));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_setr_pi32(e1, e0);
 #else
 #else
 	return simde_mm_set_pi32(e0, e1);
 	return simde_mm_set_pi32(e0, e1);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_setzero_si64(void)
 simde__m64 simde_mm_setzero_si64(void)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_setzero_si64());
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_setzero_si64();
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	simde__m64_private r_;
+	r_.neon_u32 = vmov_n_u32(0);
+	return simde__m64_from_private(r_);
 #else
 #else
 	return simde_mm_set_pi32(0, 0);
 	return simde_mm_set_pi32(0, 0);
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_setzero_si64() simde_mm_setzero_si64()
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
+simde__m64 simde_x_mm_setone_si64(void)
+{
+	return simde_mm_set1_pi32(~INT32_C(0));
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sll_pi16(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-	if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
-		memset(&r, 0, sizeof(r));
-		return r;
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)vget_lane_u64(
+						    count_.neon_u64, 0)));
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i16 = a_.i16 << count_.u64[0];
+#else
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
+		simde_memset(&r_, 0, sizeof(r_));
+		return simde__m64_from_private(r_);
 	}
 	}
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
-		r.u16[i] = a.u16[i] << count.u64[0];
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
+		r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
+					       a_.u16[i] << count_.u64[0]);
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
 #define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
+#define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
+#endif
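
On the guarded scalar path a shift count above 15 zeroes every lane, matching hardware PSLLW semantics. A minimal sketch using the setters from this header:

	simde__m64 v = simde_mm_set1_pi16(0x0101);
	simde__m64 r = simde_mm_sll_pi16(v, simde_x_mm_set_pi64(1));  /* lanes: 0x0202 */
	simde__m64 z = simde_mm_sll_pi16(v, simde_x_mm_set_pi64(16)); /* lanes: 0 */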
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sll_pi32(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-	if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
-		memset(&r, 0, sizeof(r));
-		return r;
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)vget_lane_u64(
+						    count_.neon_u64, 0)));
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i32 = a_.i32 << count_.u64[0];
+#else
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
+		simde_memset(&r_, 0, sizeof(r_));
+		return simde__m64_from_private(r_);
 	}
 	}
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
-		r.u32[i] = a.u32[i] << count.u64[0];
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
+		r_.u32[i] = a_.u32[i] << count_.u64[0];
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
 #define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
+#define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
 simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_slli_pi16(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_slli_pi16(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
-		r.u16[i] = a.u16[i] << count;
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i16 = a_.i16 << count;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
+		r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count);
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
 #define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
+#define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
 simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_slli_pi32(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_slli_pi32(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
-		r.u32[i] = a.u32[i] << count;
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i32 = a_.i32 << count;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
+		r_.u32[i] = a_.u32[i] << count;
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
 #define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
+#define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
 simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_slli_si64(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_slli_si64(a, count);
 #else
 #else
-	simde__m64 r;
-	r.u64[0] = a.u64[0] << count;
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i64 = a_.i64 << count;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t)count));
+#else
+	r_.u64[0] = a_.u64[0] << count;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
 #define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
+#define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sll_si64(a.n, count.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sll_si64(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-	if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
-		memset(&r, 0, sizeof(r));
-		return r;
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i64 = a_.i64 << count_.i64;
+#else
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
+		simde_memset(&r_, 0, sizeof(r_));
+		return simde__m64_from_private(r_);
 	}
 	}
 
 
-	r.u64[0] = a.u64[0] << count.u64[0];
+	r_.u64[0] = a_.u64[0] << count_.u64[0];
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
 #define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
+#define _m_psllq(a, count) simde_mm_sll_si64(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_srl_pi16(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-	if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
-		memset(&r, 0, sizeof(r));
-		return r;
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.u16 = a_.u16 >> count_.u64[0];
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u16 = vshl_u16(
+		a_.neon_u16,
+		vmov_n_s16(-((int16_t)vget_lane_u64(count_.neon_u64, 0))));
+#else
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
+		simde_memset(&r_, 0, sizeof(r_));
+		return simde__m64_from_private(r_);
 	}
 	}
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
-		r.u16[i] = a.u16[i] >> count.u64[0];
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < sizeof(r_.u16) / sizeof(r_.u16[0]); i++) {
+		r_.u16[i] = a_.u16[i] >> count_.u64[0];
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
 #define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
+#define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_srl_pi32(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-	if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
-		memset(&r, 0, sizeof(r));
-		return r;
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.u32 = a_.u32 >> count_.u64[0];
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u32 = vshl_u32(
+		a_.neon_u32,
+		vmov_n_s32(-((int32_t)vget_lane_u64(count_.neon_u64, 0))));
+#else
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
+		simde_memset(&r_, 0, sizeof(r_));
+		return simde__m64_from_private(r_);
 	}
 	}
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) {
-		r.u32[i] = a.u32[i] >> count.u64[0];
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < sizeof(r_.u32) / sizeof(r_.u32[0]); i++) {
+		r_.u32[i] = a_.u32[i] >> count_.u64[0];
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
 #define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
+#define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
 simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_srli_pi16(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_srli_pi16(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
-		r.u16[i] = a.u16[i] >> count;
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.u16 = a_.u16 >> count;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
+		r_.u16[i] = a_.u16[i] >> count;
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
 #define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
+#define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
 simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_srli_pi32(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_srli_pi32(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
-		r.u32[i] = a.u32[i] >> count;
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.u32 = a_.u32 >> count;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
+		r_.u32[i] = a_.u32[i] >> count;
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
 #define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
+#define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
 simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_srli_si64(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_srli_si64(a, count);
 #else
 #else
-	simde__m64 r;
-	r.u64[0] = a.u64[0] >> count;
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.u64 = a_.u64 >> count;
+#else
+	r_.u64[0] = a_.u64[0] >> count;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
 #define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
+#define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_srl_si64(a.n, count.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_srl_si64(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
 
 
-	if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
-		memset(&r, 0, sizeof(r));
-		return r;
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.u64 = a_.u64 >> count_.u64;
+#else
+	if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
+		simde_memset(&r_, 0, sizeof(r_));
+		return simde__m64_from_private(r_);
 	}
 	}
 
 
-	r.u64[0] = a.u64[0] >> count.u64[0];
-	return r;
+	r_.u64[0] = a_.u64[0] >> count_.u64[0];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
 #define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
+#define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
 simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_srai_pi16(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_srai_pi16(a, count);
 #else
 #else
-	simde__m64 r;
-
-	const uint16_t m =
-		(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count));
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		const uint16_t is_neg = ((uint16_t)(
-			((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
-		r.u16[i] = (a.u16[i] >> count) | (m * is_neg);
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i16 = a_.i16 >> (count & 0xff);
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vshl_s16(a_.neon_i16,
+			       vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = a_.i16[i] >> (count & 0xff);
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
 #define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
+#define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
+#endif
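
simde_mm_srai_pi16 shifts arithmetically, replicating the sign bit, where simde_mm_srli_pi16 above zero-fills. Sketch:

	simde__m64 v = simde_mm_set1_pi16(-16);      /* each lane 0xFFF0 */
	simde__m64 sra = simde_mm_srai_pi16(v, 2);   /* lanes: -4 */
	simde__m64 srl = simde_mm_srli_pi16(v, 2);   /* lanes: 0x3FFC */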
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
 simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
-	return SIMDE__M64_C(_mm_srai_pi32(a.n, count));
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
+	return _mm_srai_pi32(a, count);
 #else
 #else
-	simde__m64 r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
 
 
-	const uint32_t m =
-		(uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - count));
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
-		const uint32_t is_neg = ((uint32_t)(
-			((a.u32[i]) >> ((sizeof(int) * CHAR_BIT) - 1))));
-		r.u32[i] = (a.u32[i] >> count) | (m * is_neg);
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i32 = a_.i32 >> (count & 0xff);
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vshl_s32(a_.neon_i32,
+			       vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
+		r_.i32[i] = a_.i32[i] >> (count & 0xff);
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
-#define simde_m_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
+#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
+#define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n));
-#else
-	simde__m64 r;
-	int cnt = (int)count.i64[0];
-
-	if (cnt > 15 || cnt < 0) {
-		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
-		     i++) {
-			r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
-		}
-	} else {
-		const uint16_t m = (uint16_t)(
-			(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
-		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
-		     i++) {
-			const uint16_t is_neg = a.i16[i] < 0;
-			r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
-		}
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sra_pi16(a, count);
+#else
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
+	const int cnt = HEDLEY_STATIC_CAST(
+		int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
+
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i16 = a_.i16 >> cnt;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 =
+		vshl_s16(a_.neon_i16,
+			 vmov_n_s16(-HEDLEY_STATIC_CAST(
+				 int16_t, vget_lane_u64(count_.neon_u64, 0))));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = a_.i16[i] >> cnt;
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
 #define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
+#define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
+#endif
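
Clamping cnt to 15 is sufficient: an arithmetic shift by 15 already fills a 16-bit lane with copies of its sign bit, which is what the hardware does for any count of 16 or more. Sketch of the portable path:

	simde__m64 v = simde_mm_set1_pi16(-2);
	simde__m64 r = simde_mm_sra_pi16(v, simde_x_mm_set_pi64(100));
	/* every lane is -1: nothing but sign bits remains */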
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
 simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n));
-#else
-	simde__m64 r;
-	const uint64_t cnt = count.u64[0];
-
-	if (cnt > 31) {
-		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
-		     i++) {
-			r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
-		}
-	} else if (cnt == 0) {
-		memcpy(&r, &a, sizeof(r));
-	} else {
-		const uint32_t m = (uint32_t)(
-			(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
-		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
-		     i++) {
-			const uint32_t is_neg = a.i32[i] < 0;
-			r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
-		}
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sra_pi32(a, count);
+#else
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private count_ = simde__m64_to_private(count);
+	const int32_t cnt =
+		(count_.u64[0] > 31)
+			? 31
+			: HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
+
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
+	r_.i32 = a_.i32 >> cnt;
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 =
+		vshl_s32(a_.neon_i32,
+			 vmov_n_s32(-HEDLEY_STATIC_CAST(
+				 int32_t, vget_lane_u64(count_.neon_u64, 0))));
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
+		r_.i32[i] = a_.i32[i] >> cnt;
 	}
 	}
+#endif
 
 
-	return r;
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
 #define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
+#define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sub_pi8(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < 8; i++) {
-		r.i8[i] = a.i8[i] - b.i8[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i8 = a_.i8 - b_.i8;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
+		r_.i8[i] = a_.i8[i] - b_.i8[i];
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
 #define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
+#define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sub_pi16(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		r.i16[i] = a.i16[i] - b.i16[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i16 = a_.i16 - b_.i16;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		r_.i16[i] = a_.i16[i] - b_.i16[i];
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
 #define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
+#define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_sub_pi32(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
-		r.i32[i] = a.i32[i] - b.i32[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32 = a_.i32 - b_.i32;
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
+		r_.i32[i] = a_.i32[i] - b_.i32[i];
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
 #define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
+#define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_subs_pi8(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8); i++) {
-		if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
-			r.i8[i] = INT8_MIN;
-		} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
-			r.i8[i] = INT8_MAX;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
+		if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
+			r_.i8[i] = INT8_MIN;
+		} else if ((b_.i8[i]) < 0 &&
+			   (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
+			r_.i8[i] = INT8_MAX;
 		} else {
 		} else {
-			r.i8[i] = (a.i8[i]) - (b.i8[i]);
+			r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
 		}
 		}
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
 #define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
+#define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_subs_pu8(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8); i++) {
-		const int32_t x = a.u8[i] - b.u8[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
+		const int32_t x = a_.u8[i] - b_.u8[i];
 		if (x < 0) {
 		if (x < 0) {
-			r.u8[i] = 0;
+			r_.u8[i] = 0;
 		} else if (x > UINT8_MAX) {
 		} else if (x > UINT8_MAX) {
-			r.u8[i] = UINT8_MAX;
+			r_.u8[i] = UINT8_MAX;
 		} else {
 		} else {
-			r.u8[i] = (uint8_t)x;
+			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
 		}
 		}
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
 #define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
+#define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
+#endif
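
The int32_t intermediate in the fallback makes the unsigned clamp explicit; NEON's vqsub_u8 performs the same saturating subtract in one instruction. Sketch:

	simde__m64 a = simde_mm_set1_pi8(10);
	simde__m64 b = simde_mm_set1_pi8(20);
	simde__m64 r = simde_mm_subs_pu8(a, b); /* 10 - 20 saturates to 0, not 246 */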
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_subs_pi16(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
-		if (((b.i16[i]) > 0 && (a.i16[i]) < SHRT_MIN + (b.i16[i]))) {
-			r.i16[i] = SHRT_MIN;
-		} else if ((b.i16[i]) < 0 &&
-			   (a.i16[i]) > INT16_MAX + (b.i16[i])) {
-			r.i16[i] = INT16_MAX;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
+		if (((b_.i16[i]) > 0 && (a_.i16[i]) < SHRT_MIN + (b_.i16[i]))) {
+			r_.i16[i] = SHRT_MIN;
+		} else if ((b_.i16[i]) < 0 &&
+			   (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
+			r_.i16[i] = INT16_MAX;
 		} else {
 		} else {
-			r.i16[i] = (a.i16[i]) - (b.i16[i]);
+			r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
 		}
 		}
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
 #define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
+#define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_subs_pu16(a, b);
 #else
 #else
-	simde__m64 r;
-	SIMDE__VECTORIZE
-	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
-		const int x = a.u16[i] - b.u16[i];
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
+#else
+	SIMDE_VECTORIZE
+	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
+		const int x = a_.u16[i] - b_.u16[i];
 		if (x < 0) {
 		if (x < 0) {
-			r.u16[i] = 0;
+			r_.u16[i] = 0;
 		} else if (x > UINT16_MAX) {
 		} else if (x > UINT16_MAX) {
-			r.u16[i] = UINT16_MAX;
+			r_.u16[i] = UINT16_MAX;
 		} else {
 		} else {
-			r.u16[i] = (uint16_t)x;
+			r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
 		}
 		}
 	}
 	}
-	return r;
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
 #define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
+#define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_unpackhi_pi8(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i8[0] = a.i8[4];
-	r.i8[1] = b.i8[4];
-	r.i8[2] = a.i8[5];
-	r.i8[3] = b.i8[5];
-	r.i8[4] = a.i8[6];
-	r.i8[5] = b.i8[6];
-	r.i8[6] = a.i8[7];
-	r.i8[7] = b.i8[7];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14,
+				      7, 15);
+#else
+	r_.i8[0] = a_.i8[4];
+	r_.i8[1] = b_.i8[4];
+	r_.i8[2] = a_.i8[5];
+	r_.i8[3] = b_.i8[5];
+	r_.i8[4] = a_.i8[6];
+	r_.i8[5] = b_.i8[6];
+	r_.i8[6] = a_.i8[7];
+	r_.i8[7] = b_.i8[7];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
 #define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
+#define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
+#endif
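
The unpack family interleaves lanes from the upper (or lower) halves of both inputs, as the scalar assignments spell out. Sketch:

	simde__m64 a = simde_mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7);
	simde__m64 b = simde_mm_setr_pi8(10, 11, 12, 13, 14, 15, 16, 17);
	simde__m64 r = simde_mm_unpackhi_pi8(a, b);
	/* lanes 0..7 of the result: 4, 14, 5, 15, 6, 16, 7, 17 */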
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_unpackhi_pi16(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i16[0] = a.i16[2];
-	r.i16[1] = b.i16[2];
-	r.i16[2] = a.i16[3];
-	r.i16[3] = b.i16[3];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
+#else
+	r_.i16[0] = a_.i16[2];
+	r_.i16[1] = b_.i16[2];
+	r_.i16[2] = a_.i16[3];
+	r_.i16[3] = b_.i16[3];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
 #define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
+#define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_unpackhi_pi32(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i32[0] = a.i32[1];
-	r.i32[1] = b.i32[1];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
+#else
+	r_.i32[0] = a_.i32[1];
+	r_.i32[1] = b_.i32[1];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
 #define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
+#define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_unpacklo_pi8(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i8[0] = a.i8[0];
-	r.i8[1] = b.i8[0];
-	r.i8[2] = a.i8[1];
-	r.i8[3] = b.i8[1];
-	r.i8[4] = a.i8[2];
-	r.i8[5] = b.i8[2];
-	r.i8[6] = a.i8[3];
-	r.i8[7] = b.i8[3];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3,
+				      11);
+#else
+	r_.i8[0] = a_.i8[0];
+	r_.i8[1] = b_.i8[0];
+	r_.i8[2] = a_.i8[1];
+	r_.i8[3] = b_.i8[1];
+	r_.i8[4] = a_.i8[2];
+	r_.i8[5] = b_.i8[2];
+	r_.i8[6] = a_.i8[3];
+	r_.i8[7] = b_.i8[3];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
 #define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
+#define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_unpacklo_pi16(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i16[0] = a.i16[0];
-	r.i16[1] = b.i16[0];
-	r.i16[2] = a.i16[1];
-	r.i16[3] = b.i16[1];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
+#else
+	r_.i16[0] = a_.i16[0];
+	r_.i16[1] = b_.i16[0];
+	r_.i16[2] = a_.i16[1];
+	r_.i16[3] = b_.i16[1];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
 #define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
+#define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_unpacklo_pi32(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i32[0] = a.i32[0];
-	r.i32[1] = b.i32[0];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+	r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
+	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
+#else
+	r_.i32[0] = a_.i32[0];
+	r_.i32[1] = b_.i32[0];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
 #define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
+#define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
 simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return SIMDE__M64_C(_mm_xor_si64(a.n, b.n));
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _mm_xor_si64(a, b);
 #else
 #else
-	simde__m64 r;
-	r.i64[0] = a.i64[0] ^ b.i64[0];
-	return r;
+	simde__m64_private r_;
+	simde__m64_private a_ = simde__m64_to_private(a);
+	simde__m64_private b_ = simde__m64_to_private(b);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+	r_.i32f = a_.i32f ^ b_.i32f;
+#else
+	r_.u64[0] = a_.u64[0] ^ b_.u64[0];
+#endif
+
+	return simde__m64_from_private(r_);
 #endif
 #endif
 }
 }
 #define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
 #define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
+#define _m_pxor(a, b) simde_mm_xor_si64(a, b)
+#endif
 
 
-SIMDE__FUNCTION_ATTRIBUTES
+SIMDE_FUNCTION_ATTRIBUTES
 int32_t simde_m_to_int(simde__m64 a)
 int32_t simde_m_to_int(simde__m64 a)
 {
 {
-#if defined(SIMDE_MMX_NATIVE)
-	return _m_to_int(a.n);
+#if defined(SIMDE_X86_MMX_NATIVE)
+	return _m_to_int(a);
+#else
+	simde__m64_private a_ = simde__m64_to_private(a);
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+	return vget_lane_s32(a_.neon_i32, 0);
 #else
 #else
-	return a.i32[0];
+	return a_.i32[0];
+#endif
 #endif
 #endif
 }
 }
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
+#define _m_to_int(a) simde_m_to_int(a)
+#endif
+
+SIMDE_END_DECLS_
 
 
-SIMDE__END_DECLS
+HEDLEY_DIAGNOSTIC_POP
 
 
-#endif /* !defined(SIMDE__MMX_H) */
+#endif /* !defined(SIMDE_X86_MMX_H) */

+ 165 - 2
libobs/util/simde/simde-arch.h

@@ -6,6 +6,8 @@
  *   details, see the Creative Commons Zero 1.0 Universal license at
  *   details, see the Creative Commons Zero 1.0 Universal license at
  *   <https://creativecommons.org/publicdomain/zero/1.0/>
  *   <https://creativecommons.org/publicdomain/zero/1.0/>
  *
  *
+ * SPDX-License-Identifier: CC0-1.0
+ *
  * Different compilers define different preprocessor macros for the
  * Different compilers define different preprocessor macros for the
  * same architecture.  This is an attempt to provide a single
  * same architecture.  This is an attempt to provide a single
  * interface which is usable on any compiler.
  * interface which is usable on any compiler.
@@ -53,6 +55,11 @@
 #define SIMDE_ARCH_ALPHA 1
 #define SIMDE_ARCH_ALPHA 1
 #endif
 #endif
 #endif
 #endif
+#if defined(SIMDE_ARCH_ALPHA)
+#define SIMDE_ARCH_ALPHA_CHECK(version) ((version) <= SIMDE_ARCH_ALPHA)
+#else
+#define SIMDE_ARCH_ALPHA_CHECK(version) (0)
+#endif
 
 
 /* Atmel AVR
 /* Atmel AVR
    <https://en.wikipedia.org/wiki/Atmel_AVR> */
    <https://en.wikipedia.org/wiki/Atmel_AVR> */
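
Each architecture in this header now gains a matching SIMDE_ARCH_*_CHECK(version) macro on the pattern above; because the macro collapses to (0) when the architecture macro is undefined, a version test stays well-formed on every platform. A sketch of the intended use (version numbers are illustrative only):

#if SIMDE_ARCH_X86_CHECK(6)
/* only reached when targeting a 686-class x86 or newer */
#elif SIMDE_ARCH_ARM_CHECK(1)
/* any ARM */
#else
/* portable fallback */
#endif
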
@@ -64,7 +71,7 @@
    <https://en.wikipedia.org/wiki/X86-64> */
    <https://en.wikipedia.org/wiki/X86-64> */
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \
 	defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64)
 	defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64)
-#define SIMDE_ARCH_AMD64 1
+#define SIMDE_ARCH_AMD64 1000
 #endif
 #endif
 
 
 /* ARM
 /* ARM
@@ -93,11 +100,30 @@
 	defined(_ARM) || defined(_M_ARM) || defined(_M_ARM)
 	defined(_ARM) || defined(_M_ARM) || defined(_M_ARM)
 #define SIMDE_ARCH_ARM 1
 #define SIMDE_ARCH_ARM 1
 #endif
 #endif
+#if defined(SIMDE_ARCH_ARM)
+#define SIMDE_ARCH_ARM_CHECK(version) ((version) <= SIMDE_ARCH_ARM)
+#else
+#define SIMDE_ARCH_ARM_CHECK(version) (0)
+#endif
 
 
 /* AArch64
 /* AArch64
    <https://en.wikipedia.org/wiki/ARM_architecture> */
    <https://en.wikipedia.org/wiki/ARM_architecture> */
 #if defined(__aarch64__) || defined(_M_ARM64)
 #if defined(__aarch64__) || defined(_M_ARM64)
-#define SIMDE_ARCH_AARCH64 10
+#define SIMDE_ARCH_AARCH64 1000
+#endif
+#if defined(SIMDE_ARCH_AARCH64)
+#define SIMDE_ARCH_AARCH64_CHECK(version) ((version) <= SIMDE_ARCH_AARCH64)
+#else
+#define SIMDE_ARCH_AARCH64_CHECK(version) (0)
+#endif
+
+/* ARM SIMD ISA extensions */
+#if defined(__ARM_NEON)
+#if defined(SIMDE_ARCH_AARCH64)
+#define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_AARCH64
+#elif defined(SIMDE_ARCH_ARM)
+#define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM
+#endif
 #endif
 #endif
 
 
 /* Blackfin
 /* Blackfin
@@ -128,6 +154,11 @@
 #elif defined(__convex__)
 #elif defined(__convex__)
 #define SIMDE_ARCH_CONVEX 1
 #define SIMDE_ARCH_CONVEX 1
 #endif
 #endif
+#if defined(SIMDE_ARCH_CONVEX)
+#define SIMDE_ARCH_CONVEX_CHECK(version) ((version) <= SIMDE_ARCH_CONVEX)
+#else
+#define SIMDE_ARCH_CONVEX_CHECK(version) (0)
+#endif
 
 
 /* Adapteva Epiphany
 /* Adapteva Epiphany
    <https://en.wikipedia.org/wiki/Adapteva_Epiphany> */
    <https://en.wikipedia.org/wiki/Adapteva_Epiphany> */
@@ -159,6 +190,11 @@
 #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
 #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa)
 #define SIMDE_ARCH_HPPA 1
 #define SIMDE_ARCH_HPPA 1
 #endif
 #endif
+#if defined(SIMDE_ARCH_HPPA)
+#define SIMDE_ARCH_HPPA_CHECK(version) ((version) <= SIMDE_ARCH_HPPA)
+#else
+#define SIMDE_ARCH_HPPA_CHECK(version) (0)
+#endif
 
 
 /* x86
 /* x86
    <https://en.wikipedia.org/wiki/X86> */
    <https://en.wikipedia.org/wiki/X86> */
@@ -177,6 +213,88 @@
 #elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__)
 #elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__)
 #define SIMDE_ARCH_X86 3
 #define SIMDE_ARCH_X86 3
 #endif
 #endif
+#if defined(SIMDE_ARCH_X86)
+#define SIMDE_ARCH_X86_CHECK(version) ((version) <= SIMDE_ARCH_X86)
+#else
+#define SIMDE_ARCH_X86_CHECK(version) (0)
+#endif
+
+/* SIMD ISA extensions for x86/x86_64 */
+#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
+#if defined(_M_IX86_FP)
+#define SIMDE_ARCH_X86_MMX
+#if (_M_IX86_FP >= 1)
+#define SIMDE_ARCH_X86_SSE 1
+#endif
+#if (_M_IX86_FP >= 2)
+#define SIMDE_ARCH_X86_SSE2 1
+#endif
+#elif defined(_M_X64)
+#define SIMDE_ARCH_X86_SSE 1
+#define SIMDE_ARCH_X86_SSE2 1
+#else
+#if defined(__MMX__)
+#define SIMDE_ARCH_X86_MMX 1
+#endif
+#if defined(__SSE__)
+#define SIMDE_ARCH_X86_SSE 1
+#endif
+#if defined(__SSE2__)
+#define SIMDE_ARCH_X86_SSE2 1
+#endif
+#endif
+#if defined(__SSE3__)
+#define SIMDE_ARCH_X86_SSE3 1
+#endif
+#if defined(__SSSE3__)
+#define SIMDE_ARCH_X86_SSSE3 1
+#endif
+#if defined(__SSE4_1__)
+#define SIMDE_ARCH_X86_SSE4_1 1
+#endif
+#if defined(__SSE4_2__)
+#define SIMDE_ARCH_X86_SSE4_2 1
+#endif
+#if defined(__AVX__)
+#define SIMDE_ARCH_X86_AVX 1
+#if !defined(SIMDE_ARCH_X86_SSE3)
+#define SIMDE_ARCH_X86_SSE3 1
+#endif
+#if !defined(SIMDE_ARCH_X86_SSE4_1)
+#define SIMDE_ARCH_X86_SSE4_1 1
+#endif
+#if !defined(SIMDE_ARCH_X86_SSE4_2)
+#define SIMDE_ARCH_X86_SSE4_2 1
+#endif
+#endif
+#if defined(__AVX2__)
+#define SIMDE_ARCH_X86_AVX2 1
+#endif
+#if defined(__FMA__)
+#define SIMDE_ARCH_X86_FMA 1
+#if !defined(SIMDE_ARCH_X86_AVX)
+#define SIMDE_ARCH_X86_AVX 1
+#endif
+#endif
+#if defined(__AVX512BW__)
+#define SIMDE_ARCH_X86_AVX512BW 1
+#endif
+#if defined(__AVX512CD__)
+#define SIMDE_ARCH_X86_AVX512CD 1
+#endif
+#if defined(__AVX512DQ__)
+#define SIMDE_ARCH_X86_AVX512DQ 1
+#endif
+#if defined(__AVX512F__)
+#define SIMDE_ARCH_X86_AVX512F 1
+#endif
+#if defined(__AVX512VL__)
+#define SIMDE_ARCH_X86_AVX512VL 1
+#endif
+#if defined(__GFNI__)
+#define SIMDE_ARCH_X86_GFNI 1
+#endif
+#endif
 
 
 /* Itanium
 /* Itanium
    <https://en.wikipedia.org/wiki/Itanium> */
    <https://en.wikipedia.org/wiki/Itanium> */
@@ -206,6 +324,11 @@
 #elif defined(__mc68000__) || defined(__MC68000__)
 #elif defined(__mc68000__) || defined(__MC68000__)
 #define SIMDE_ARCH_M68K 68000
 #define SIMDE_ARCH_M68K 68000
 #endif
 #endif
+#if defined(SIMDE_ARCH_M68K)
+#define SIMDE_ARCH_M68K_CHECK(version) ((version) <= SIMDE_ARCH_M68K)
+#else
+#define SIMDE_ARCH_M68K_CHECK(version) (0)
+#endif
 
 
 /* Xilinx MicroBlaze
 /* Xilinx MicroBlaze
    <https://en.wikipedia.org/wiki/MicroBlaze> */
    <https://en.wikipedia.org/wiki/MicroBlaze> */
@@ -234,6 +357,11 @@
 #elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__)
 #elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__)
 #define SIMDE_ARCH_MIPS 1
 #define SIMDE_ARCH_MIPS 1
 #endif
 #endif
+#if defined(SIMDE_ARCH_MIPS)
+#define SIMDE_ARCH_MIPS_CHECK(version) ((version) <= SIMDE_ARCH_MIPS)
+#else
+#define SIMDE_ARCH_MIPS_CHECK(version) (0)
+#endif
 
 
 /* Matsushita MN10300
 /* Matsushita MN10300
    <https://en.wikipedia.org/wiki/MN103> */
    <https://en.wikipedia.org/wiki/MN103> */
@@ -245,6 +373,8 @@
    <https://en.wikipedia.org/wiki/IBM_POWER_Instruction_Set_Architecture> */
    <https://en.wikipedia.org/wiki/IBM_POWER_Instruction_Set_Architecture> */
 #if defined(_M_PPC)
 #if defined(_M_PPC)
 #define SIMDE_ARCH_POWER _M_PPC
 #define SIMDE_ARCH_POWER _M_PPC
+#elif defined(_ARCH_PWR9)
+#define SIMDE_ARCH_POWER 900
 #elif defined(_ARCH_PWR8)
 #elif defined(_ARCH_PWR8)
 #define SIMDE_ARCH_POWER 800
 #define SIMDE_ARCH_POWER 800
 #elif defined(_ARCH_PWR7)
 #elif defined(_ARCH_PWR7)
@@ -274,6 +404,20 @@
 	defined(__ppc)
 	defined(__ppc)
 #define SIMDE_ARCH_POWER 1
 #define SIMDE_ARCH_POWER 1
 #endif
 #endif
+#if defined(SIMDE_ARCH_POWER)
+#define SIMDE_ARCH_POWER_CHECK(version) ((version) <= SIMDE_ARCH_POWER)
+#else
+#define SIMDE_ARCH_POWER_CHECK(version) (0)
+#endif
+
+#if defined(__ALTIVEC__)
+#define SIMDE_ARCH_POWER_ALTIVEC SIMDE_ARCH_POWER
+#endif
+#if defined(SIMDE_ARCH_POWER)
+#define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) ((version) <= SIMDE_ARCH_POWER)
+#else
+#define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0)
+#endif
 
 
 /* SPARC
 /* SPARC
    <https://en.wikipedia.org/wiki/SPARC> */
    <https://en.wikipedia.org/wiki/SPARC> */
@@ -298,6 +442,11 @@
 #elif defined(__sparc__) || defined(__sparc)
 #elif defined(__sparc__) || defined(__sparc)
 #define SIMDE_ARCH_SPARC 1
 #define SIMDE_ARCH_SPARC 1
 #endif
 #endif
+#if defined(SIMDE_ARCH_SPARC)
+#define SIMDE_ARCH_SPARC_CHECK(version) ((version) <= SIMDE_ARCH_SPARC)
+#else
+#define SIMDE_ARCH_SPARC_CHECK(version) (0)
+#endif
 
 
 /* SuperH
 /* SuperH
    <https://en.wikipedia.org/wiki/SuperH> */
    <https://en.wikipedia.org/wiki/SuperH> */
@@ -345,6 +494,20 @@
 #elif defined(_TMS320C28X) || defined(__TMS320C28X__)
 #elif defined(_TMS320C28X) || defined(__TMS320C28X__)
 #define SIMDE_ARCH_TMS320 280
 #define SIMDE_ARCH_TMS320 280
 #endif
 #endif
+#if defined(SIMDE_ARCH_TMS320)
+#define SIMDE_ARCH_TMS320_CHECK(version) ((version) <= SIMDE_ARCH_TMS320)
+#else
+#define SIMDE_ARCH_TMS320_CHECK(version) (0)
+#endif
+
+/* WebAssembly */
+#if defined(__wasm__)
+#define SIMDE_ARCH_WASM 1
+#endif
+
+#if defined(SIMDE_ARCH_WASM) && defined(__wasm_simd128__)
+#define SIMDE_ARCH_WASM_SIMD128
+#endif
 
 
 /* Xtensa
 /* Xtensa
    <https://en.wikipedia.org/wiki/> */
    <https://en.wikipedia.org/wiki/> */

+ 590 - 117
libobs/util/simde/simde-common.h

@@ -1,4 +1,4 @@
-/* Copyright (c) 2017-2019 Evan Nemerson <[email protected]>
+/* SPDX-License-Identifier: MIT
  *
  *
  * Permission is hereby granted, free of charge, to any person
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
  * obtaining a copy of this software and associated documentation
@@ -19,39 +19,254 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  * SOFTWARE.
+ *
+ * Copyright:
+ *   2017-2020 Evan Nemerson <[email protected]>
  */
  */
 
 
 #if !defined(SIMDE_COMMON_H)
 #if !defined(SIMDE_COMMON_H)
 #define SIMDE_COMMON_H
 #define SIMDE_COMMON_H
 
 
 #include "hedley.h"
 #include "hedley.h"
-#include "check.h"
+
+#define SIMDE_VERSION_MAJOR 0
+#define SIMDE_VERSION_MINOR 5
+#define SIMDE_VERSION_MICRO 0
+#define SIMDE_VERSION                                                   \
+	HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, \
+			      SIMDE_VERSION_MICRO)
+
 #include "simde-arch.h"
 #include "simde-arch.h"
+#include "simde-features.h"
+#include "simde-diagnostic.h"
 
 
-#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
-#define SIMDE_ALIGN(alignment) _Alignas(alignment)
-#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
-#define SIMDE_ALIGN(alignment) alignas(alignment)
-#elif HEDLEY_GCC_VERSION_CHECK(2, 95, 0) ||     \
-	HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||   \
-	HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||   \
-	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
-	HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||   \
-	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||    \
-	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) || \
+#include <stddef.h>
+#include <stdint.h>
+
+#if HEDLEY_HAS_ATTRIBUTE(aligned) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \
+	HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||                              \
+	HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||                              \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
+	HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||                              \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                            \
 	HEDLEY_TI_VERSION_CHECK(8, 1, 0)
 	HEDLEY_TI_VERSION_CHECK(8, 1, 0)
 #define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
 #define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
-#elif defined(_MSC_VER) && (!defined(_M_IX86) || defined(_M_AMD64))
+#elif defined(_MSC_VER) && !(defined(_M_ARM) && !defined(_M_ARM64))
 #define SIMDE_ALIGN(alignment) __declspec(align(alignment))
 #define SIMDE_ALIGN(alignment) __declspec(align(alignment))
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#define SIMDE_ALIGN(alignment) _Alignas(alignment)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L)
+#define SIMDE_ALIGN(alignment) alignas(alignment)
 #else
 #else
 #define SIMDE_ALIGN(alignment)
 #define SIMDE_ALIGN(alignment)
 #endif
 #endif
 
 
-#define simde_assert_aligned(alignment, val) \
-	simde_assert_int(((uintptr_t)(val)) % (alignment), ==, 0)
+#if HEDLEY_GNUC_VERSION_CHECK(2, 95, 0) ||   \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \
+	HEDLEY_IBM_VERSION_CHECK(11, 1, 0)
+#define SIMDE_ALIGN_OF(T) (__alignof__(T))
+#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
+	HEDLEY_HAS_FEATURE(c11_alignof)
+#define SIMDE_ALIGN_OF(T) (_Alignof(T))
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
+	HEDLEY_HAS_FEATURE(cxx_alignof)
+#define SIMDE_ALIGN_OF(T) (alignof(T))
+#endif
+
+#if defined(SIMDE_ALIGN_OF)
+#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(SIMDE_ALIGN_OF(T))
+#else
+#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(N)
+#endif
 
 
-#if HEDLEY_GCC_HAS_ATTRIBUTE(vector_size, 4, 6, 0)
-#define SIMDE__ENABLE_GCC_VEC_EXT
+#define simde_assert_aligned(alignment, val)                                \
+	simde_assert_int(HEDLEY_REINTERPRET_CAST(                           \
+				 uintptr_t, HEDLEY_REINTERPRET_CAST(        \
+						    const void *, (val))) % \
+				 (alignment),                               \
+			 ==, 0)
+
+#if HEDLEY_HAS_BUILTIN(__builtin_constant_p) ||                             \
+	HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                                \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                             \
+	HEDLEY_TINYC_VERSION_CHECK(0, 9, 19) ||                             \
+	HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 0) ||                               \
+	HEDLEY_TI_CL6X_VERSION_CHECK(6, 1, 0) ||                            \
+	(HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0) && !defined(__cplusplus)) || \
+	HEDLEY_CRAY_VERSION_CHECK(8, 1, 0)
+#define SIMDE_CHECK_CONSTANT_(expr) (__builtin_constant_p(expr))
+#elif defined(__cplusplus) && (__cplusplus > 201703L)
+#include <type_traits>
+#define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated())
+#endif
+
+/* diagnose_if + __builtin_constant_p was broken until clang 9,
+ * which is when __FILE_NAME__ was added. */
+#if defined(SIMDE_CHECK_CONSTANT_) && defined(__FILE_NAME__)
+#define SIMDE_REQUIRE_CONSTANT(arg)                    \
+	HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), \
+			   "`" #arg "' must be constant")
+#else
+#define SIMDE_REQUIRE_CONSTANT(arg)
+#endif
+
+#define SIMDE_REQUIRE_RANGE(arg, min, max)                         \
+	HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), \
+			   "'" #arg "' must be in [" #min ", " #max "]")
+
+#define SIMDE_REQUIRE_CONSTANT_RANGE(arg, min, max) \
+	SIMDE_REQUIRE_CONSTANT(arg)                 \
+	SIMDE_REQUIRE_RANGE(arg, min, max)
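
SIMDE_REQUIRE_CONSTANT_RANGE stacks the two checks so a declaration can demand a compile-time constant within bounds, mirroring Intel's immediate-operand intrinsics. A sketch under the assumption that it is attached between the parameter list and the body, as elsewhere in SIMDe (the function itself is invented for illustration):

SIMDE_FUNCTION_ATTRIBUTES
int8_t example_extract_pi8(simde__m64 a, const int imm8)
	SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)
{
	/* diagnose_if-capable compilers reject a non-constant or
	 * out-of-range imm8 at the call site */
	return simde__m64_to_private(a).i8[imm8];
}
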
+
+/* SIMDE_ASSUME_ALIGNED allows you to (try to) tell the compiler
+ * that a pointer is aligned to an `alignment`-byte boundary. */
+#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
+	HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
+#define SIMDE_ASSUME_ALIGNED(alignment, v)     \
+	HEDLEY_REINTERPRET_CAST(__typeof__(v), \
+				__builtin_assume_aligned(v, alignment))
+#elif defined(__cplusplus) && (__cplusplus > 201703L)
+#define SIMDE_ASSUME_ALIGNED(alignment, v) std::assume_aligned<alignment>(v)
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define SIMDE_ASSUME_ALIGNED(alignment, v)                            \
+	(__extension__({                                              \
+		__typeof__(v) simde_assume_aligned_t_ = (v);          \
+		__assume_aligned(simde_assume_aligned_t_, alignment); \
+		simde_assume_aligned_t_;                              \
+	}))
+#else
+#define SIMDE_ASSUME_ALIGNED(alignment, v) (v)
+#endif
+
+/* SIMDE_ALIGN_CAST allows you to convert to a type with greater
+ * alignment requirements without triggering a warning. */
+#if HEDLEY_HAS_WARNING("-Wcast-align")
+#define SIMDE_ALIGN_CAST(T, v)                                       \
+	(__extension__({                                             \
+		HEDLEY_DIAGNOSTIC_PUSH                               \
+		_Pragma("clang diagnostic ignored \"-Wcast-align\"") \
+			T simde_r_ = HEDLEY_REINTERPRET_CAST(T, v);  \
+		HEDLEY_DIAGNOSTIC_POP                                \
+		simde_r_;                                            \
+	}))
+#else
+#define SIMDE_ALIGN_CAST(T, v) HEDLEY_REINTERPRET_CAST(T, v)
+#endif
+
+#if (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \
+	HEDLEY_GCC_VERSION_CHECK(3, 3, 0) ||                                \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                             \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 0)
+#define SIMDE_MAY_ALIAS __attribute__((__may_alias__))
+#else
+#define SIMDE_MAY_ALIAS
+#endif
+
+/*  Lots of compilers support GCC-style vector extensions, but many
+    don't support all the features.  Define different macros depending
+    on support for
+
+    * SIMDE_VECTOR - Declaring a vector.
+    * SIMDE_VECTOR_OPS - basic operations (binary and unary).
+    * SIMDE_VECTOR_SCALAR - For binary operators, the second argument
+        can be a scalar, in which case the result is as if that scalar
+        had been broadcast to all lanes of a vector.
+    * SIMDE_VECTOR_SUBSCRIPT - Supports array subscript notation for
+        extracting/inserting a single element.
+
+    SIMDE_VECTOR can be assumed if any others are defined; the
+    others are independent. */
+#if !defined(SIMDE_NO_VECTOR)
+#if HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
+#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
+#define SIMDE_VECTOR_OPS
+#define SIMDE_VECTOR_SCALAR
+#define SIMDE_VECTOR_SUBSCRIPT
+#elif HEDLEY_INTEL_VERSION_CHECK(16, 0, 0)
+#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
+#define SIMDE_VECTOR_OPS
+/* ICC only supports SIMDE_VECTOR_SCALAR for constants */
+#define SIMDE_VECTOR_SUBSCRIPT
+#elif HEDLEY_GCC_VERSION_CHECK(4, 1, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
+#define SIMDE_VECTOR_OPS
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 12, 0)
+#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
+#elif HEDLEY_HAS_ATTRIBUTE(vector_size)
+#define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
+#define SIMDE_VECTOR_OPS
+#define SIMDE_VECTOR_SUBSCRIPT
+#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) /* clang 4.0 */
+#define SIMDE_VECTOR_SCALAR
+#endif
+#endif
+
+/* GCC and clang have built-in functions to handle shuffling and
+   converting of vectors, but the implementations are slightly
+   different.  This macro is just an abstraction over them.  Note that
+   elem_size is in bits but vec_size is in bytes. */
+#if !defined(SIMDE_NO_SHUFFLE_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT)
+HEDLEY_DIAGNOSTIC_PUSH
+/* We don't care about -Wvariadic-macros; all compilers that support
+   shufflevector/shuffle support them. */
+#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#endif
+#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0)
+#pragma GCC diagnostic ignored "-Wvariadic-macros"
+#endif
+
+#if HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
+#define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) \
+	__builtin_shufflevector(a, b, __VA_ARGS__)
+#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle, 4, 7, 0) && \
+	!defined(__INTEL_COMPILER)
+#define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) \
+	(__extension__({                                      \
+		int##elem_size##_t SIMDE_VECTOR(vec_size)     \
+			simde_shuffle_ = {__VA_ARGS__};       \
+		__builtin_shuffle(a, b, simde_shuffle_);      \
+	}))
+#endif
+HEDLEY_DIAGNOSTIC_POP
+#endif
+
+/* TODO: this actually works on XL C/C++ without SIMDE_VECTOR_SUBSCRIPT
+   but the code needs to be refactored a bit to take advantage. */
+#if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT)
+#if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || \
+	HEDLEY_GCC_VERSION_CHECK(9, 0, 0)
+#if HEDLEY_GCC_VERSION_CHECK(9, 0, 0) && !HEDLEY_GCC_VERSION_CHECK(9, 3, 0)
+/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */
+#define SIMDE_CONVERT_VECTOR_(to, from)                          \
+	((to) = (__extension__({                                 \
+		 __typeof__(from) from_ = (from);                \
+		 ((void)from_);                                  \
+		 __builtin_convertvector(from_, __typeof__(to)); \
+	 })))
+#else
+#define SIMDE_CONVERT_VECTOR_(to, from) \
+	((to) = __builtin_convertvector((from), __typeof__(to)))
+#endif
+#endif
+#endif
+#endif
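
Taken together, these macros let portable code declare GCC-style vectors, shuffle them, and convert element types without caring whose builtin is underneath. A self-contained sketch, guarded the same way SIMDe guards its own uses (types and function are invented for illustration):

#include <stdint.h>

#if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_)
typedef int32_t example_i32x4 SIMDE_VECTOR(16);
typedef float example_f32x4 SIMDE_VECTOR(16);

static example_f32x4 example_reverse_to_float(example_i32x4 a)
{
	example_f32x4 r;
	/* indices select lanes from the concatenation of both arguments */
	a = SIMDE_SHUFFLE_VECTOR_(32, 16, a, a, 3, 2, 1, 0);
	SIMDE_CONVERT_VECTOR_(r, a); /* int32x4 -> float32x4, lane by lane */
	return r;
}
#endif
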
+
+/* Since we currently require SUBSCRIPT before using a vector in a
+   union, we define these as dependencies of SUBSCRIPT.  They are
+   likely to disappear in the future, once SIMDe learns how to make
+   use of vectors without using the union members.  Do not use them
+   in your code unless you're okay with it breaking when SIMDe
+   changes. */
+#if defined(SIMDE_VECTOR_SUBSCRIPT)
+#if defined(SIMDE_VECTOR_OPS)
+#define SIMDE_VECTOR_SUBSCRIPT_OPS
+#endif
+#if defined(SIMDE_VECTOR_SCALAR)
+#define SIMDE_VECTOR_SUBSCRIPT_SCALAR
+#endif
 #endif
 #endif
 
 
 #if !defined(SIMDE_ENABLE_OPENMP) &&                   \
 #if !defined(SIMDE_ENABLE_OPENMP) &&                   \
@@ -60,81 +275,197 @@
 #define SIMDE_ENABLE_OPENMP
 #define SIMDE_ENABLE_OPENMP
 #endif
 #endif
 
 
-#if !defined(SIMDE_ENABLE_CILKPLUS) && defined(__cilk)
+#if !defined(SIMDE_ENABLE_CILKPLUS) && \
+	(defined(__cilk) || defined(HEDLEY_INTEL_VERSION))
 #define SIMDE_ENABLE_CILKPLUS
 #define SIMDE_ENABLE_CILKPLUS
 #endif
 #endif
 
 
 #if defined(SIMDE_ENABLE_OPENMP)
 #if defined(SIMDE_ENABLE_OPENMP)
-#define SIMDE__VECTORIZE _Pragma("omp simd")
-#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
-#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
-#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
+#define SIMDE_VECTORIZE _Pragma("omp simd")
+#define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
+#define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
+#define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
 #elif defined(SIMDE_ENABLE_CILKPLUS)
 #elif defined(SIMDE_ENABLE_CILKPLUS)
-#define SIMDE__VECTORIZE _Pragma("simd")
-#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
-#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
-#define SIMDE__VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
-#elif defined(__INTEL_COMPILER)
-#define SIMDE__VECTORIZE _Pragma("simd")
-#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
-#define SIMDE__VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
-#define SIMDE__VECTORIZE_ALIGNED(a)
-#elif defined(__clang__)
-#define SIMDE__VECTORIZE _Pragma("clang loop vectorize(enable)")
-#define SIMDE__VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
-#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
-#define SIMDE__VECTORIZE_ALIGNED(a)
+#define SIMDE_VECTORIZE _Pragma("simd")
+#define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
+#define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
+#define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
+#elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION)
+#define SIMDE_VECTORIZE _Pragma("clang loop vectorize(enable)")
+#define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
+#define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
+#define SIMDE_VECTORIZE_ALIGNED(a)
 #elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
 #elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
-#define SIMDE__VECTORIZE _Pragma("GCC ivdep")
-#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE
-#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
-#define SIMDE__VECTORIZE_ALIGNED(a)
+#define SIMDE_VECTORIZE _Pragma("GCC ivdep")
+#define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
+#define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
+#define SIMDE_VECTORIZE_ALIGNED(a)
 #elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
 #elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
-#define SIMDE__VECTORIZE _Pragma("_CRI ivdep")
-#define SIMDE__VECTORIZE_SAFELEN(l) SIMDE__VECTORIZE
-#define SIMDE__VECTORIZE_REDUCTION(r) SIMDE__VECTORIZE
-#define SIMDE__VECTORIZE_ALIGNED(a)
+#define SIMDE_VECTORIZE _Pragma("_CRI ivdep")
+#define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
+#define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
+#define SIMDE_VECTORIZE_ALIGNED(a)
 #else
 #else
-#define SIMDE__VECTORIZE
-#define SIMDE__VECTORIZE_SAFELEN(l)
-#define SIMDE__VECTORIZE_REDUCTION(r)
-#define SIMDE__VECTORIZE_ALIGNED(a)
+#define SIMDE_VECTORIZE
+#define SIMDE_VECTORIZE_SAFELEN(l)
+#define SIMDE_VECTORIZE_REDUCTION(r)
+#define SIMDE_VECTORIZE_ALIGNED(a)
 #endif
 #endif
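
The renamed SIMDE_VECTORIZE family attaches whichever loop-vectorization pragma the compiler understands, or nothing at all, so a loop needs the hint written only once. A sketch with an invented reduction loop:

#include <stddef.h>

static float example_sum(const float *a, size_t n)
{
	float sum = 0.0f;
	SIMDE_VECTORIZE_REDUCTION(+ : sum)
	for (size_t i = 0; i < n; i++)
		sum += a[i];
	return sum;
}
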
 
 
-#if HEDLEY_GCC_HAS_ATTRIBUTE(unused, 3, 1, 0)
-#define SIMDE__UNUSED __attribute__((__unused__))
+#define SIMDE_MASK_NZ_(v, mask) (((v) & (mask)) | !((v) & (mask)))
+
+/* Intended for checking coverage, you should never use this in
+   production. */
+#if defined(SIMDE_NO_INLINE)
+#define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static
 #else
 #else
-#define SIMDE__UNUSED
+#define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static
 #endif
 #endif
 
 
-#if HEDLEY_GCC_HAS_ATTRIBUTE(artificial, 4, 3, 0)
-#define SIMDE__ARTIFICIAL __attribute__((__artificial__))
+#if HEDLEY_HAS_ATTRIBUTE(unused) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0)
+#define SIMDE_FUNCTION_POSSIBLY_UNUSED_ __attribute__((__unused__))
 #else
 #else
-#define SIMDE__ARTIFICIAL
+#define SIMDE_FUNCTION_POSSIBLY_UNUSED_
 #endif
 #endif
 
 
-/* Intended for checking coverage, you should never use this in
-   production. */
-#if defined(SIMDE_NO_INLINE)
-#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE SIMDE__UNUSED static
+#if HEDLEY_HAS_WARNING("-Wused-but-marked-unused")
+#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED \
+	_Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"")
 #else
 #else
-#define SIMDE__FUNCTION_ATTRIBUTES HEDLEY_INLINE SIMDE__ARTIFICIAL static
+#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED
 #endif
 #endif
 
 
 #if defined(_MSC_VER)
 #if defined(_MSC_VER)
-#define SIMDE__BEGIN_DECLS                                            \
+#define SIMDE_BEGIN_DECLS_                                            \
 	HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable : 4996 4204)) \
 	HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable : 4996 4204)) \
 		HEDLEY_BEGIN_C_DECLS
 		HEDLEY_BEGIN_C_DECLS
-#define SIMDE__END_DECLS HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS
+#define SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS
+#else
+#define SIMDE_BEGIN_DECLS_                              \
+	HEDLEY_DIAGNOSTIC_PUSH                          \
+	SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED \
+	HEDLEY_BEGIN_C_DECLS
+#define SIMDE_END_DECLS_   \
+	HEDLEY_END_C_DECLS \
+	HEDLEY_DIAGNOSTIC_POP
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wpedantic")
+#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \
+	_Pragma("clang diagnostic ignored \"-Wpedantic\"")
+#elif defined(HEDLEY_GCC_VERSION)
+#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \
+	_Pragma("GCC diagnostic ignored \"-Wpedantic\"")
 #else
 #else
-#define SIMDE__BEGIN_DECLS HEDLEY_BEGIN_C_DECLS
-#define SIMDE__END_DECLS HEDLEY_END_C_DECLS
+#define SIMDE_DIAGNOSTIC_DISABLE_INT128
 #endif
 #endif
 
 
 #if defined(__SIZEOF_INT128__)
 #if defined(__SIZEOF_INT128__)
-#define SIMDE__HAVE_INT128
+#define SIMDE_HAVE_INT128_
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DIAGNOSTIC_DISABLE_INT128
 typedef __int128 simde_int128;
 typedef __int128 simde_int128;
 typedef unsigned __int128 simde_uint128;
 typedef unsigned __int128 simde_uint128;
+HEDLEY_DIAGNOSTIC_POP
+#endif
+
+#if !defined(SIMDE_ENDIAN_LITTLE)
+#define SIMDE_ENDIAN_LITTLE 1234
+#endif
+#if !defined(SIMDE_ENDIAN_BIG)
+#define SIMDE_ENDIAN_BIG 4321
+#endif
+
+#if !defined(SIMDE_ENDIAN_ORDER)
+/* GCC (and compilers masquerading as GCC) define  __BYTE_ORDER__. */
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+	(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+	(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+/* TI defines _BIG_ENDIAN or _LITTLE_ENDIAN */
+#elif defined(_BIG_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+#elif defined(_LITTLE_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+/* We know the endianness of some common architectures.  Common
+ * architectures not listed here (ARM, POWER, MIPS, etc.) are
+ * bi-endian. */
+#elif defined(__amd64) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(__s390x__) || defined(__zarch__)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+/* Looks like we'll have to rely on the platform.  If we're missing a
+ * platform, please let us know. */
+#elif defined(_WIN32)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(sun) || defined(__sun) /* Solaris */
+#include <sys/byteorder.h>
+#if defined(_LITTLE_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(_BIG_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+#endif
+#elif defined(__APPLE__)
+#include <libkern/OSByteOrder.h>
+#if defined(__LITTLE_ENDIAN__)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(__BIG_ENDIAN__)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+#endif
+#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+	defined(__bsdi__) || defined(__DragonFly__) || defined(BSD)
+#include <machine/endian.h>
+#if defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+#endif
+#elif defined(__linux__) || defined(__linux) || defined(__gnu_linux__)
+#include <endian.h>
+#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
+	(__BYTE_ORDER == __LITTLE_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE
+#elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
+	(__BYTE_ORDER == __BIG_ENDIAN)
+#define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG
+#endif
+#endif
+#endif
+
+#if HEDLEY_HAS_BUILTIN(__builtin_bswap64) ||  \
+	HEDLEY_GCC_VERSION_CHECK(4, 3, 0) ||  \
+	HEDLEY_IBM_VERSION_CHECK(13, 1, 0) || \
+	HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define simde_bswap64(v) __builtin_bswap64(v)
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
+#define simde_bswap64(v) _byteswap_uint64(v)
+#else
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t simde_bswap64(uint64_t v)
+{
+	return ((v & (((uint64_t)0xff) << 56)) >> 56) |
+	       ((v & (((uint64_t)0xff) << 48)) >> 40) |
+	       ((v & (((uint64_t)0xff) << 40)) >> 24) |
+	       ((v & (((uint64_t)0xff) << 32)) >> 8) |
+	       ((v & (((uint64_t)0xff) << 24)) << 8) |
+	       ((v & (((uint64_t)0xff) << 16)) << 24) |
+	       ((v & (((uint64_t)0xff) << 8)) << 40) |
+	       ((v & (((uint64_t)0xff))) << 56);
+}
+#endif
+
+#if !defined(SIMDE_ENDIAN_ORDER)
+#error Unknown byte order; please file a bug
+#else
+#if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE
+#define simde_endian_bswap64_be(value) simde_bswap64(value)
+#define simde_endian_bswap64_le(value) (value)
+#elif SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG
+#define simde_endian_bswap64_be(value) (value)
+#define simde_endian_bswap64_le(value) simde_bswap64(value)
+#endif
 #endif
 #endif
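
simde_bswap64 plus the SIMDE_ENDIAN_ORDER detection yields the two helpers above: simde_endian_bswap64_be swaps only on little-endian hosts, simde_endian_bswap64_le only on big-endian ones. A sketch of serializing a value as big-endian, independent of host order:

#include <stdint.h>
#include <string.h>

static void example_store_be64(uint8_t dst[8], uint64_t v)
{
	v = simde_endian_bswap64_be(v); /* no-op on big-endian hosts */
	memcpy(dst, &v, sizeof(v));
}
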
 
 
 /* TODO: we should at least make an attempt to detect the correct
 /* TODO: we should at least make an attempt to detect the correct
@@ -148,8 +479,6 @@ typedef unsigned __int128 simde_uint128;
 #define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE)value)
 #define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE)value)
 #endif
 #endif
 typedef SIMDE_FLOAT32_TYPE simde_float32;
 typedef SIMDE_FLOAT32_TYPE simde_float32;
-HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4,
-		     "Unable to find 32-bit floating-point type.");
 
 
 #if !defined(SIMDE_FLOAT64_TYPE)
 #if !defined(SIMDE_FLOAT64_TYPE)
 #define SIMDE_FLOAT64_TYPE double
 #define SIMDE_FLOAT64_TYPE double
@@ -158,8 +487,6 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float32) == 4,
 #define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT64_TYPE)value)
 #define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT64_TYPE)value)
 #endif
 #endif
 typedef SIMDE_FLOAT64_TYPE simde_float64;
 typedef SIMDE_FLOAT64_TYPE simde_float64;
-HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8,
-		     "Unable to find 64-bit floating-point type.");
 
 
 /* Whether to assume that the compiler can auto-vectorize reasonably
 /* Whether to assume that the compiler can auto-vectorize reasonably
    well.  This will cause SIMDe to attempt to compose vector
    well.  This will cause SIMDe to attempt to compose vector
@@ -189,67 +516,171 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8,
 #if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \
 #if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \
 	!defined(SIMDE_ASSUME_VECTORIZATION)
 	!defined(SIMDE_ASSUME_VECTORIZATION)
 #if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \
 #if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \
-	defined(__ALTIVEC__)
+	defined(__ALTIVEC__) || defined(__wasm_simd128__)
 #define SIMDE_ASSUME_VECTORIZATION
 #define SIMDE_ASSUME_VECTORIZATION
 #endif
 #endif
 #endif
 #endif
 
 
-/* GCC and clang have built-in functions to handle shuffling of
-   vectors, but the implementations are slightly different.  This
-   macro is just an abstraction over them.  Note that elem_size is in
-   bits but vec_size is in bytes. */
-#if HEDLEY_CLANG_HAS_BUILTIN(__builtin_shufflevector)
-#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \
-	__builtin_shufflevector(a, b, __VA_ARGS__)
-#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle, 4, 7, 0) && \
-	!defined(__INTEL_COMPILER)
-#define SIMDE__SHUFFLE_VECTOR(elem_size, vec_size, a, b, ...) \
-	__builtin_shuffle(a, b,                               \
-			  (int##elem_size##_t __attribute__(  \
-				  (__vector_size__(vec_size)))){__VA_ARGS__})
+#if HEDLEY_HAS_WARNING("-Wbad-function-cast")
+#define SIMDE_CONVERT_FTOI(T, v)                                    \
+	HEDLEY_DIAGNOSTIC_PUSH                                      \
+	_Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \
+		HEDLEY_STATIC_CAST(T, (v)) HEDLEY_DIAGNOSTIC_POP
+#else
+#define SIMDE_CONVERT_FTOI(T, v) ((T)(v))
+#endif
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+#define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
+	(_Generic((value), to : (value), from : ((to)(value))))
+#define SIMDE_CHECKED_STATIC_CAST(to, from, value) \
+	(_Generic((value), to : (value), from : ((to)(value))))
+#else
+#define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
+	HEDLEY_REINTERPRET_CAST(to, value)
+#define SIMDE_CHECKED_STATIC_CAST(to, from, value) HEDLEY_STATIC_CAST(to, value)
 #endif
 #endif
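
SIMDE_CHECKED_*_CAST uses C11 _Generic to verify at compile time that the value really has the type the caller claims before casting, while SIMDE_CONVERT_FTOI silences clang's -Wbad-function-cast on float-to-integer conversions. Illustrative only:

#include <stdint.h>

static uint32_t example_casts(int32_t i, float f)
{
	/* a compile error under C11 if `i` is not an int32_t */
	uint32_t u = SIMDE_CHECKED_STATIC_CAST(uint32_t, int32_t, i);
	return u + SIMDE_CONVERT_FTOI(uint32_t, f); /* truncates toward zero */
}
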
 
 
-/* Some algorithms are iterative, and fewer iterations means less
-   accuracy.  Lower values here will result in faster, but less
-   accurate, calculations for some functions. */
-#if !defined(SIMDE_ACCURACY_ITERS)
-#define SIMDE_ACCURACY_ITERS 2
+#if HEDLEY_HAS_WARNING("-Wfloat-equal")
+#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL \
+	_Pragma("clang diagnostic ignored \"-Wfloat-equal\"")
+#elif HEDLEY_GCC_VERSION_CHECK(3, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL \
+	_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL
 #endif
 #endif
 
 
-/* This will probably move into Hedley at some point, but I'd like to
-   more thoroughly check for other compilers which define __GNUC__
-   first. */
-#if defined(SIMDE__REALLY_GCC)
-#undef SIMDE__REALLY_GCC
+/* Some functions can trade accuracy for speed.  For those functions
+   you can control the trade-off using this macro.  Possible values:
+
+   0: prefer speed
+   1: reasonable trade-offs
+   2: prefer accuracy */
+#if !defined(SIMDE_ACCURACY_PREFERENCE)
+#define SIMDE_ACCURACY_PREFERENCE 1
 #endif
 #endif
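
SIMDE_ACCURACY_PREFERENCE replaces the old iteration-count knob with a three-level policy that individual functions consult. A consumer overrides it before including any SIMDe header; the include path below assumes the flat libobs layout:

#define SIMDE_ACCURACY_PREFERENCE 0 /* trade accuracy for speed */
#include "sse2.h"
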
-#if !defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
-#define SIMDE__REALLY_GCC 0
+
+#if defined(__STDC_HOSTED__)
+#define SIMDE_STDC_HOSTED __STDC_HOSTED__
+#else
+#if defined(HEDLEY_PGI_VERSION_CHECK) || defined(HEDLEY_MSVC_VERSION_CHECK)
+#define SIMDE_STDC_HOSTED 1
 #else
 #else
-#define SIMDE__REALLY_GCC 1
+#define SIMDE_STDC_HOSTED 0
+#endif
 #endif
 #endif
 
 
-#if defined(SIMDE__ASSUME_ALIGNED)
-#undef SIMDE__ASSUME_ALIGNED
+/* Try to deal with environments without a standard library. */
+#if !defined(simde_memcpy) || !defined(simde_memset)
+#if !defined(SIMDE_NO_STRING_H) && defined(__has_include)
+#if __has_include(<string.h>)
+#include <string.h>
+#if !defined(simde_memcpy)
+#define simde_memcpy(dest, src, n) memcpy(dest, src, n)
+#endif
+#if !defined(simde_memset)
+#define simde_memset(s, c, n) memset(s, c, n)
 #endif
 #endif
-#if HEDLEY_INTEL_VERSION_CHECK(9, 0, 0)
-#define SIMDE__ASSUME_ALIGNED(ptr, align) __assume_aligned(ptr, align)
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
-#define SIMDE__ASSUME_ALIGNED(ptr, align) \
-	__assume((((char *)ptr) - ((char *)0)) % (align) == 0)
-#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_assume_aligned, 4, 7, 0)
-#define SIMDE__ASSUME_ALIGNED(ptr, align) \
-	(ptr = (__typeof__(ptr))__builtin_assume_aligned((ptr), align))
-#elif HEDLEY_CLANG_HAS_BUILTIN(__builtin_assume)
-#define SIMDE__ASSUME_ALIGNED(ptr, align) \
-	__builtin_assume((((char *)ptr) - ((char *)0)) % (align) == 0)
-#elif HEDLEY_GCC_HAS_BUILTIN(__builtin_unreachable, 4, 5, 0)
-#define SIMDE__ASSUME_ALIGNED(ptr, align)              \
-	((((char *)ptr) - ((char *)0)) % (align) == 0) \
-		? (1)                                  \
-		: (__builtin_unreachable(), 0)
 #else
 #else
-#define SIMDE__ASSUME_ALIGNED(ptr, align)
+#define SIMDE_NO_STRING_H
+#endif
+#endif
+#endif
+#if !defined(simde_memcpy) || !defined(simde_memset)
+#if !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1)
+#include <string.h>
+#if !defined(simde_memcpy)
+#define simde_memcpy(dest, src, n) memcpy(dest, src, n)
 #endif
 #endif
+#if !defined(simde_memset)
+#define simde_memset(s, c, n) memset(s, c, n)
+#endif
+#elif (HEDLEY_HAS_BUILTIN(__builtin_memcpy) &&  \
+       HEDLEY_HAS_BUILTIN(__builtin_memset)) || \
+	HEDLEY_GCC_VERSION_CHECK(4, 2, 0)
+#if !defined(simde_memcpy)
+#define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n)
+#endif
+#if !defined(simde_memset)
+#define simde_memset(s, c, n) __builtin_memset(s, c, n)
+#endif
+#else
+/* These are meant to be portable, not fast.  If you're hitting them you
+ * should think about providing your own (by defining the simde_memcpy
+ * macro prior to including any SIMDe files) or submitting a patch to
+ * SIMDe so we can detect your system-provided memcpy/memset, like by
+ * adding your compiler to the checks for __builtin_memcpy and/or
+ * __builtin_memset. */
+#if !defined(simde_memcpy)
+SIMDE_FUNCTION_ATTRIBUTES
+void simde_memcpy_(void *dest, const void *src, size_t len)
+{
+	char *dest_ = HEDLEY_STATIC_CAST(char *, dest);
+	const char *src_ = HEDLEY_STATIC_CAST(const char *, src);
+	for (size_t i = 0; i < len; i++) {
+		dest_[i] = src_[i];
+	}
+}
+#define simde_memcpy(dest, src, n) simde_memcpy_(dest, src, n)
+#endif
+
+#if !defined(simde_memset)
+SIMDE_FUNCTION_ATTRIBUTES
+void simde_memset_(void *s, int c, size_t len)
+{
+	char *s_ = HEDLEY_STATIC_CAST(char *, s);
+	char c_ = HEDLEY_STATIC_CAST(char, c);
+	for (size_t i = 0; i < len; i++) {
+		s_[i] = c_;
+	}
+}
+#define simde_memset(s, c, n) simde_memset_(s, c, n)
+#endif
+#endif /* !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1) */
+#endif /* !defined(simde_memcpy) || !defined(simde_memset) */
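
In short: hosted builds get <string.h>, builtin-capable compilers get __builtin_memcpy/__builtin_memset, and everything else falls back to the byte loops above. A freestanding consumer can also pre-empt the whole cascade by defining the macros first (names hypothetical):

#define simde_memcpy(dest, src, n) my_rtos_memcpy(dest, src, n)
#define simde_memset(s, c, n) my_rtos_memset(s, c, n)
/* ...then include SIMDe headers as usual */
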
+
+#include "simde-math.h"
+
+#if defined(FE_ALL_EXCEPT)
+#define SIMDE_HAVE_FENV_H
+#elif defined(__has_include)
+#if __has_include(<fenv.h>)
+#include <fenv.h>
+#define SIMDE_HAVE_FENV_H
+#endif
+#elif SIMDE_STDC_HOSTED == 1
+#include <fenv.h>
+#define SIMDE_HAVE_FENV_H
+#endif
+
+#if defined(EXIT_FAILURE)
+#define SIMDE_HAVE_STDLIB_H
+#elif defined(__has_include)
+#if __has_include(<stdlib.h>)
+#include <stdlib.h>
+#define SIMDE_HAVE_STDLIB_H
+#endif
+#elif SIMDE_STDC_HOSTED == 1
+#include <stdlib.h>
+#define SIMDE_HAVE_STDLIB_H
+#endif
+
+#if defined(__has_include)
+#if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include(<cfenv>)
+#include <cfenv>
+#elif __has_include(<fenv.h>)
+#include <fenv.h>
+#endif
+#if __has_include(<stdlib.h>)
+#include <stdlib.h>
+#endif
+#elif SIMDE_STDC_HOSTED == 1
+#include <stdlib.h>
+#include <fenv.h>
+#endif
+
+#include "check.h"
 
 
 /* Sometimes we run into problems with specific versions of compilers
 /* Sometimes we run into problems with specific versions of compilers
    which make the native versions unusable for us.  Often this is due
    which make the native versions unusable for us.  Often this is due
@@ -258,7 +689,7 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8,
    start only defining them for problematic compiler versions. */
    start only defining them for problematic compiler versions. */
 
 
 #if !defined(SIMDE_IGNORE_COMPILER_BUGS)
 #if !defined(SIMDE_IGNORE_COMPILER_BUGS)
-#if SIMDE__REALLY_GCC
+#if defined(HEDLEY_GCC_VERSION)
 #if !HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
 #if !HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
 #define SIMDE_BUG_GCC_REV_208793
 #define SIMDE_BUG_GCC_REV_208793
 #endif
 #endif
@@ -268,11 +699,53 @@ HEDLEY_STATIC_ASSERT(sizeof(simde_float64) == 8,
 #if !HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
 #if !HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
 #define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */
 #define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */
 #endif
 #endif
+#if !HEDLEY_GCC_VERSION_CHECK(8, 0, 0)
+#define SIMDE_BUG_GCC_REV_247851
+#endif
+#if !HEDLEY_GCC_VERSION_CHECK(10, 0, 0)
+#define SIMDE_BUG_GCC_REV_274313
+#define SIMDE_BUG_GCC_91341
+#endif
+#if !HEDLEY_GCC_VERSION_CHECK(9, 0, 0) && defined(SIMDE_ARCH_AARCH64)
+#define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR
+#endif
+#if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
+#define SIMDE_BUG_GCC_94482
+#endif
+#if !HEDLEY_GCC_VERSION_CHECK(9, 4, 0) && defined(SIMDE_ARCH_AARCH64)
+#define SIMDE_BUG_GCC_94488
 #endif
 #endif
-#if defined(__EMSCRIPTEN__)
+#if defined(SIMDE_ARCH_POWER)
+#define SIMDE_BUG_GCC_95227
+#endif
+#define SIMDE_BUG_GCC_95399
+#elif defined(__clang__)
+#if defined(SIMDE_ARCH_AARCH64)
+#define SIMDE_BUG_CLANG_45541
+#endif
+#endif
+#if defined(HEDLEY_EMSCRIPTEN_VERSION)
 #define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
 #define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
 #define SIMDE_BUG_EMSCRIPTEN_5242
 #define SIMDE_BUG_EMSCRIPTEN_5242
 #endif
 #endif
 #endif
 #endif
 
 
+/* GCC and Clang both have the same issue:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144
+ * https://bugs.llvm.org/show_bug.cgi?id=45931
+ */
+#if HEDLEY_HAS_WARNING("-Wsign-conversion") || HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
+#define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr)                                      \
+	(__extension__({                                                            \
+		HEDLEY_DIAGNOSTIC_PUSH                                              \
+		_Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") __typeof__( \
+			expr) simde_bug_ignore_sign_conversion_v_ = (expr);         \
+		HEDLEY_DIAGNOSTIC_POP                                               \
+		simde_bug_ignore_sign_conversion_v_;                                \
+	}))
+#else
+#define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (expr)
+#endif
+
 #endif /* !defined(SIMDE_COMMON_H) */
 #endif /* !defined(SIMDE_COMMON_H) */

+ 270 - 0
libobs/util/simde/simde-diagnostic.h

@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ *   2017-2020 Evan Nemerson <[email protected]>
+ */
+
+/* SIMDe targets a very wide range of standards and compilers, and our
+ * goal is to compile cleanly even with extremely aggressive warnings
+ * (i.e., -Weverything in clang, -Wextra in GCC, /W4 for MSVC, etc.)
+ * treated as errors.
+ *
+ * While our preference is to resolve the underlying issue a given
+ * diagnostic is warning us about, sometimes that's not possible.
+ * Fixing a warning in one compiler may cause problems in another.
+ * Sometimes a warning doesn't really apply to us (false positives),
+ * and sometimes adhering to a warning would mean dropping a feature
+ * we *know* the compiler supports since we have tested specifically
+ * for the compiler or feature.
+ *
+ * When practical, warnings are only disabled for specific code.  For
+ * a list of warnings which are enabled by default in all SIMDe code,
+ * see SIMDE_DISABLE_UNWANTED_DIAGNOSTICS.  Note that we restore the
+ * warning stack when SIMDe is done parsing, so code which includes
+ * SIMDe is not deprived of these warnings.
+ */
+
+#if !defined(SIMDE_DIAGNOSTIC_H)
+
+#include "hedley.h"
+
+/* This is only to help us implement functions like _mm_undefined_ps. */
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
+#undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+#endif
+#if HEDLEY_HAS_WARNING("-Wuninitialized")
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \
+	_Pragma("clang diagnostic ignored \"-Wuninitialized\"")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 2, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \
+	_Pragma("GCC diagnostic ignored \"-Wuninitialized\"")
+#elif HEDLEY_PGI_VERSION_CHECK(19, 10, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549")
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0) && defined(__cplusplus)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \
+	_Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)")
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 14, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \
+	_Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)")
+#elif HEDLEY_SUNPRO_VERSION_CHECK(5, 12, 0) && defined(__cplusplus)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \
+	_Pragma("error_messages(off,unassigned)")
+#elif HEDLEY_TI_VERSION_CHECK(16, 9, 9) ||       \
+	HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) || \
+	HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) || \
+	HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 2)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551")
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)")
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) && !defined(__MSVC_RUNTIME_CHECKS)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ \
+	__pragma(warning(disable : 4700))
+#endif
+
+/* GCC emits a lot of "notes" about the ABI being different for things
+ * in newer versions of GCC.  We don't really care because all our
+ * functions are inlined and don't generate ABI. */
+#if HEDLEY_GCC_VERSION_CHECK(7, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ \
+	_Pragma("GCC diagnostic ignored \"-Wpsabi\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_PSABI_
+#endif
+
+/* Since MMX uses x87 FP registers, you're supposed to call _mm_empty()
+ * after each MMX function before any floating point instructions.
+ * Some compilers warn about functions which use MMX functions but
+ * don't call _mm_empty().  However, since SIMDe is implementing the
+ * MMX API we shouldn't be calling _mm_empty(); we leave it to the
+ * caller to invoke simde_mm_empty(). */
+#if HEDLEY_INTEL_VERSION_CHECK(19, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \
+	_Pragma("warning(disable:13200 13203)")
+#elif defined(HEDLEY_MSVC_VERSION)
+#define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \
+	__pragma(warning(disable : 4799))
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_
+#endif
+
+/* Intel is pushing people to use OpenMP SIMD instead of Cilk+, so they
+ * emit a diagnostic if you use #pragma simd instead of
+ * #pragma omp simd.  SIMDe supports OpenMP SIMD, you just need to
+ * compile with -qopenmp or -qopenmp-simd and define
+ * SIMDE_ENABLE_OPENMP.  Cilk+ is just a fallback. */
+#if HEDLEY_INTEL_VERSION_CHECK(18, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \
+	_Pragma("warning(disable:3948)")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_
+#endif
+
+#if defined(HEDLEY_MSVC_VERSION)
+#define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \
+	__pragma(warning(disable : 4204))
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_
+#endif
+
+/* This warning needs a lot of work.  It is triggered if all you do is
+ * pass the value to memcpy/__builtin_memcpy, or if you initialize a
+ * member of the union, even if that member takes up the entire union.
+ * Last tested with clang-10, hopefully things will improve in the
+ * future; if clang fixes this I'd love to enable it. */
+#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
+#define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \
+	_Pragma("clang diagnostic ignored \"-Wconditional-uninitialized\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_
+#endif
+
+/* This warning is meant to catch things like `0.3 + 0.4 == 0.7`, which
+ * will evaluate to false.  However, SIMDe uses these operations exclusively
+ * for things like _mm_cmpeq_ps, for which we really do want to check
+ * for equality (or inequality).
+ *
+ * If someone wants to put together a SIMDE_FLOAT_EQUAL(a, op, b) macro
+ * which just wraps a check in some code to disable this diagnostic I'd
+ * be happy to accept it. */
+#if HEDLEY_HAS_WARNING("-Wfloat-equal") || HEDLEY_GCC_VERSION_CHECK(3, 0, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \
+	_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_
+#endif
+
+/* This is because we use HEDLEY_STATIC_ASSERT for static assertions.
+ * If Hedley can't find an implementation it will preprocess to
+ * nothing, which means there will be a trailing semi-colon. */
+#if HEDLEY_HAS_WARNING("-Wextra-semi")
+#define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \
+	_Pragma("clang diagnostic ignored \"-Wextra-semi\"")
+#elif HEDLEY_GCC_VERSION_CHECK(8, 1, 0) && defined(__cplusplus)
+#define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \
+	_Pragma("GCC diagnostic ignored \"-Wextra-semi\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_
+#endif
+
+/* We do use a few variadic macros, which technically aren't available
+ * until C99 and C++11, but every compiler I'm aware of has supported
+ * them for much longer.  That said, usage is isolated to the test
+ * suite and compilers known to support them. */
+#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4, 0, 0)
+#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
+#define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_                          \
+	_Pragma("clang diagnostic ignored \"-Wvariadic-macros\"") _Pragma( \
+		"clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ \
+	_Pragma("GCC diagnostic ignored \"-Wvariadic-macros\"")
+#endif
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
+#endif
+
+/* Triggered when assigning a float to a double implicitly.  We use
+ * explicit casts in SIMDe, this is only used in the test suite. */
+#if HEDLEY_HAS_WARNING("-Wdouble-promotion")
+#define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ \
+	_Pragma("clang diagnostic ignored \"-Wdouble-promotion\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_
+#endif
+
+/* Several compilers treat conformant array parameters as VLAs.  We
+ * test to make sure we're in C mode (C++ doesn't support CAPs), and
+ * that the version of the standard supports CAPs.  We also blacklist
+ * some buggy compilers like MSVC (the logic is in Hedley if you want
+ * to take a look), but with certain warnings enabled some compilers
+ * still like to emit a diagnostic. */
+#if HEDLEY_HAS_WARNING("-Wvla")
+#define SIMDE_DIAGNOSTIC_DISABLE_VLA_ \
+	_Pragma("clang diagnostic ignored \"-Wvla\"")
+#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_VLA_ \
+	_Pragma("GCC diagnostic ignored \"-Wvla\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_VLA_
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wused-but-marked-unused")
+#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \
+	_Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wunused-function")
+#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
+	_Pragma("clang diagnostic ignored \"-Wunused-function\"")
+#elif HEDLEY_GCC_VERSION_CHECK(3, 4, 0)
+#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
+	_Pragma("GCC diagnostic ignored \"-Wunused-function\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_
+#endif
+
+#if HEDLEY_HAS_WARNING("-Wpass-failed")
+#define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ \
+	_Pragma("clang diagnostic ignored \"-Wpass-failed\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_
+#endif
+
+/* https://github.com/nemequ/simde/issues/277 */
+#if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4, 6, 0) && \
+	!HEDLEY_GCC_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE \
+	_Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE
+#endif
+
+/* Some compilers, such as clang, may use `long long` for 64-bit
+ * integers, but `long long` triggers a diagnostic with
+ * -Wc++98-compat-pedantic which says 'long long' is incompatible with
+ * C++98. */
+#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC \
+	_Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
+#else
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC
+#endif
+
+#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS                           \
+	SIMDE_DIAGNOSTIC_DISABLE_PSABI_                              \
+	SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_                \
+	SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_             \
+	SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_          \
+	SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_                        \
+	SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \
+	SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_                         \
+	SIMDE_DIAGNOSTIC_DISABLE_VLA_                                \
+	SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_             \
+	SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_                    \
+	SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_                        \
+	SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC               \
+	SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE
+
+#endif

+ 357 - 0
libobs/util/simde/simde-features.h

@@ -0,0 +1,357 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ *   2020      Evan Nemerson <[email protected]>
+ */
+
+/* simde-arch.h is used to determine which features are available according
+   to the compiler.  However, we want to make it possible to forcibly enable
+   or disable APIs. */
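+
+/* For example (illustrative): a consumer can force the portable
+ * fallbacks by defining SIMDE_NO_NATIVE before including any SIMDe
+ * header, or opt out of a single ISA extension with the matching
+ * *_NO_NATIVE macro:
+ *
+ *   #define SIMDE_X86_SSE4_2_NO_NATIVE
+ *   #include "sse2.h"
+ *
+ * Conversely, pre-defining a *_NATIVE macro force-enables that ISA. */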
+
+#if !defined(SIMDE_FEATURES_H)
+#define SIMDE_FEATURES_H
+
+#include "simde-arch.h"
+
+#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SVML)
+#define SIMDE_X86_SVML_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX512CD_NATIVE) && \
+	!defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512CD)
+#define SIMDE_X86_AVX512CD_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX512DQ_NATIVE) && \
+	!defined(SIMDE_X86_AVX512DQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512DQ)
+#define SIMDE_X86_AVX512DQ_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX512VL_NATIVE) && \
+	!defined(SIMDE_X86_AVX512VL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512VL)
+#define SIMDE_X86_AVX512VL_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX512BW_NATIVE) && \
+	!defined(SIMDE_X86_AVX512BW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512BW)
+#define SIMDE_X86_AVX512BW_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX512F_NATIVE) && \
+	!defined(SIMDE_X86_AVX512F_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX512F)
+#define SIMDE_X86_AVX512F_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX2_NATIVE)
+#define SIMDE_X86_AVX2_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_FMA_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_FMA)
+#define SIMDE_X86_FMA_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE)
+#define SIMDE_X86_AVX_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX2_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX2)
+#define SIMDE_X86_AVX2_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE)
+#define SIMDE_X86_AVX_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_AVX)
+#define SIMDE_X86_AVX_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE)
+#define SIMDE_X86_SSE4_2_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_SSE4_2_NATIVE) && \
+	!defined(SIMDE_X86_SSE4_2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SSE4_2)
+#define SIMDE_X86_SSE4_2_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE)
+#define SIMDE_X86_SSE4_1_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_SSE4_1_NATIVE) && \
+	!defined(SIMDE_X86_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SSE4_1)
+#define SIMDE_X86_SSE4_1_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSSE3_NATIVE)
+#define SIMDE_X86_SSSE3_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSSE3_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SSSE3)
+#define SIMDE_X86_SSSE3_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NATIVE)
+#define SIMDE_X86_SSE3_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SSE3)
+#define SIMDE_X86_SSE3_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE)
+#define SIMDE_X86_SSE2_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SSE2)
+#define SIMDE_X86_SSE2_NATIVE
+#endif
+#endif
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE_NATIVE)
+#define SIMDE_X86_SSE_NATIVE
+#endif
+
+#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_SSE_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_SSE)
+#define SIMDE_X86_SSE_NATIVE
+#endif
+#endif
+
+#if !defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_X86_MMX_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_MMX)
+#define SIMDE_X86_MMX_NATIVE
+#endif
+#endif
+
+#if !defined(SIMDE_X86_GFNI_NATIVE) && !defined(SIMDE_X86_GFNI_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_X86_GFNI)
+#define SIMDE_X86_GFNI_NATIVE
+#endif
+#endif
+
+#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if defined(__INTEL_COMPILER)
+#define SIMDE_X86_SVML_NATIVE
+#endif
+#endif
+
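+/* The blocks above form an implication chain: each ISA extension
+ * force-enables the older extensions it builds on, so e.g. (a sketch):
+ *
+ *   SIMDE_ARCH_X86_AVX2 defined by simde-arch.h
+ *     => SIMDE_X86_AVX2_NATIVE => SIMDE_X86_AVX_NATIVE
+ *     => SIMDE_X86_SSE4_2_NATIVE => ... => SIMDE_X86_SSE_NATIVE
+ */
+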
+#if defined(HEDLEY_MSVC_VERSION)
+#pragma warning(push)
+#pragma warning(disable : 4799)
+#endif
+
+#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || \
+	defined(SIMDE_X86_SVML_NATIVE)
+#include <immintrin.h>
+#elif defined(SIMDE_X86_SSE4_2_NATIVE)
+#include <nmmintrin.h>
+#elif defined(SIMDE_X86_SSE4_1_NATIVE)
+#include <smmintrin.h>
+#elif defined(SIMDE_X86_SSSE3_NATIVE)
+#include <tmmintrin.h>
+#elif defined(SIMDE_X86_SSE3_NATIVE)
+#include <pmmintrin.h>
+#elif defined(SIMDE_X86_SSE2_NATIVE)
+#include <emmintrin.h>
+#elif defined(SIMDE_X86_SSE_NATIVE)
+#include <xmmintrin.h>
+#elif defined(SIMDE_X86_MMX_NATIVE)
+#include <mmintrin.h>
+#endif
+
+#if defined(HEDLEY_MSVC_VERSION)
+#pragma warning(pop)
+#endif
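+
+/* Illustrative example: building with `gcc -msse4.1` leads simde-arch.h
+ * to define SIMDE_ARCH_X86_SSE4_1, the chain above turns that into
+ * SIMDE_X86_SSE4_1_NATIVE (plus the older SSE levels), and
+ * <smmintrin.h> is the header selected here. */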
+
+#if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+	!defined(SIMDE_ARM_NEON_A64V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_ARM_NEON) && defined(SIMDE_ARCH_AARCH64) && \
+	SIMDE_ARCH_ARM_CHECK(80)
+#define SIMDE_ARM_NEON_A64V8_NATIVE
+#endif
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+	!defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+#define SIMDE_ARM_NEON_A32V8_NATIVE
+#endif
+
+#if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+	!defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80)
+#define SIMDE_ARM_NEON_A32V8_NATIVE
+#endif
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+	!defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+#define SIMDE_ARM_NEON_A32V7_NATIVE
+#endif
+
+#if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+	!defined(SIMDE_ARM_NEON_A32V7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(70)
+#define SIMDE_ARM_NEON_A32V7_NATIVE
+#endif
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+#include <arm_neon.h>
+#endif
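+
+/* For example (a sketch): compiling for AArch64 with NEON available
+ * defines SIMDE_ARCH_AARCH64 and SIMDE_ARCH_ARM_NEON, which enables
+ * SIMDE_ARM_NEON_A64V8_NATIVE and, through the chain above, the A32
+ * levels as well, so <arm_neon.h> is included. */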
+
+#if !defined(SIMDE_WASM_SIMD128_NATIVE) && \
+	!defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
+#if defined(SIMDE_ARCH_WASM_SIMD128)
+#define SIMDE_WASM_SIMD128_NATIVE
+#endif
+#endif
+#if defined(SIMDE_WASM_SIMD128_NATIVE)
+#if !defined(__wasm_unimplemented_simd128__)
+#define __wasm_unimplemented_simd128__
+#endif
+#include <wasm_simd128.h>
+#endif
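+
+/* Note (illustrative): __wasm_unimplemented_simd128__ is pre-defined
+ * above so that <wasm_simd128.h> also exposes the intrinsics the
+ * toolchain still guards as "unimplemented"; compiling with
+ * `emcc -msimd128` defines __wasm_simd128__, which simde-arch.h maps
+ * to SIMDE_ARCH_WASM_SIMD128. */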
+
+#if !defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) &&        \
+	!defined(SIMDE_POWER_ALTIVEC_P9_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(900)
+#define SIMDE_POWER_ALTIVEC_P9_NATIVE
+#endif
+#endif
+#if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && \
+	!defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
+#define SIMDE_POWER_ALTIVEC_P8_NATIVE
+#endif
+
+#if !defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) &&        \
+	!defined(SIMDE_POWER_ALTIVEC_P8_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(800)
+#define SIMDE_POWER_ALTIVEC_P8_NATIVE
+#endif
+#endif
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
+	!defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
+#define SIMDE_POWER_ALTIVEC_P7_NATIVE
+#endif
+
+#if !defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) &&        \
+	!defined(SIMDE_POWER_ALTIVEC_P7_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(700)
+#define SIMDE_POWER_ALTIVEC_P7_NATIVE
+#endif
+#endif
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && \
+	!defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
+#define SIMDE_POWER_ALTIVEC_P6_NATIVE
+#endif
+
+#if !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) &&        \
+	!defined(SIMDE_POWER_ALTIVEC_P6_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(600)
+#define SIMDE_POWER_ALTIVEC_P6_NATIVE
+#endif
+#endif
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && \
+	!defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
+#define SIMDE_POWER_ALTIVEC_P5_NATIVE
+#endif
+
+#if !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) &&        \
+	!defined(SIMDE_POWER_ALTIVEC_P5_NO_NATIVE) && \
+	!defined(SIMDE_NO_NATIVE)
+#if SIMDE_ARCH_POWER_ALTIVEC_CHECK(500)
+#define SIMDE_POWER_ALTIVEC_P5_NATIVE
+#endif
+#endif
+#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
+/* stdbool.h conflicts with the bool in altivec.h */
+#if defined(bool) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF_BOOL_)
+#undef bool
+#endif
+#include <altivec.h>
+/* GCC allows you to undefine these macros to prevent conflicts with
+ * standard types as they become context-sensitive keywords. */
+#if defined(__cplusplus)
+#if defined(vector)
+#undef vector
+#endif
+#if defined(pixel)
+#undef pixel
+#endif
+#if defined(bool)
+#undef bool
+#endif
+#define SIMDE_POWER_ALTIVEC_VECTOR(T) vector T
+#define SIMDE_POWER_ALTIVEC_PIXEL pixel
+#define SIMDE_POWER_ALTIVEC_BOOL bool
+#else
+#define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T
+#define SIMDE_POWER_ALTIVEC_PIXEL __pixel
+#define SIMDE_POWER_ALTIVEC_BOOL __bool
+#endif /* defined(__cplusplus) */
+#endif
+
+#endif /* !defined(SIMDE_FEATURES_H) */

The file diff has been suppressed because it is too large
+ 1995 - 1236
libobs/util/simde/sse.h


The file diff has been suppressed because it is too large
+ 2246 - 1108
libobs/util/simde/sse2.h


Some files were not shown because too many files have changed in this diff