12 年之前 · 603b262d4c
--- a/libobs/media-io/format-conversion.c
+++ b/libobs/media-io/format-conversion.c
@@ -0,0 +1,269 @@
 
				+/******************************************************************************
			
 
				+    Copyright (C) 2013 by Hugh Bailey <[email protected]>
			
 
				+
			
 
				+    This program is free software: you can redistribute it and/or modify
			
 
				+    it under the terms of the GNU General Public License as published by
			
 
				+    the Free Software Foundation, either version 3 of the License, or
			
 
				+    (at your option) any later version.
			
 
				+
			
 
				+    This program is distributed in the hope that it will be useful,
			
 
				+    but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+    GNU General Public License for more details.
			
 
				+
			
 
				+    You should have received a copy of the GNU General Public License
			
 
				+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
			
 
				+******************************************************************************/
			
 
				+
			
 
				#include "format-conversion.h"

				#include <string.h>

				#include <xmmintrin.h>
				#include <emmintrin.h>
			
 
				+
			
 
				/*
				 * Returns 32-bit lane 0 (bits 0..31) of a 128-bit integer vector.
				 *
				 * Uses the SSE2 register-to-integer move rather than reading the vector
				 * through a casted uint32_t pointer, which discarded the const qualifier
				 * and relied on pointer type punning.
				 */
				static inline uint32_t get_m128_32_0(const __m128i val)
				{
					return (uint32_t)_mm_cvtsi128_si32(val);
				}
			
 
				+
			
 
				/*
				 * Returns 32-bit lane 1 (bits 32..63) of a 128-bit integer vector.
				 *
				 * The vector is shifted right by four bytes so that lane 1 lands in lane 0,
				 * then moved to an integer register; this replaces the former read through
				 * a casted uint32_t pointer.
				 */
				static inline uint32_t get_m128_32_1(const __m128i val)
				{
					return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(val, 4));
				}
			
 
				+
			
 
				/*
				 * Packs one masked byte per 32-bit pixel from two rows of four pixels and
				 * writes four bytes for each row into lum_plane.
				 *
				 *   lum_plane  destination byte buffer
				 *   lum_pos0   byte offset in lum_plane for the four bytes taken from line1
				 *   lum_pos1   byte offset in lum_plane for the four bytes taken from line2
				 *   line1/2    four 32-bit pixels each
				 *   lum_mask   selects the byte to keep in every pixel; the callers in this
				 *              file pass 0x0000FF00 per pixel (byte 1)
				 *
				 * The results are stored with memcpy, so the destination offsets need no
				 * particular alignment and no byte pointer is dereferenced as uint32_t.
				 */
				static inline void pack_lum(uint8_t *lum_plane,
						uint32_t lum_pos0, uint32_t lum_pos1,
						const __m128i line1, const __m128i line2,
						const __m128i lum_mask)
				{
					uint32_t row0, row1;

					/* Shift the whole register down one byte so the kept byte sits in
					 * bits 0..7 of its lane.  With the 0x0000FF00 mask the byte that
					 * crosses into the neighbouring lane has already been zeroed. */
					const __m128i lum1 =
						_mm_srli_si128(_mm_and_si128(line1, lum_mask), 1);
					const __m128i lum2 =
						_mm_srli_si128(_mm_and_si128(line2, lum_mask), 1);

					/* Narrow 32 -> 16 -> 8 bits: lane 0 then holds the four bytes of
					 * line1 and lane 1 the four bytes of line2. */
					__m128i pack_val = _mm_packs_epi32(lum1, lum2);
					pack_val = _mm_packus_epi16(pack_val, pack_val);

					row0 = (uint32_t)_mm_cvtsi128_si32(pack_val);
					row1 = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(pack_val, 4));

					memcpy(lum_plane + lum_pos0, &row0, sizeof(row0));
					memcpy(lum_plane + lum_pos1, &row1, sizeof(row1));
				}
			
 
				+
			
 
				/*
				 * Averages the masked 16-bit fields of a 2x2 pixel neighbourhood and writes
				 * the four resulting bytes, interleaved, to a single chroma plane.
				 *
				 *   uv_plane    destination byte buffer
				 *   chroma_pos  byte offset in uv_plane that receives the four bytes
				 *   line1/2     two vertically adjacent rows of four 32-bit pixels
				 *   uv_mask     keeps the fields to average; the caller in this file
				 *               passes 0x00FF per 16 bits (bytes 0 and 2 of every pixel)
				 *
				 * Output byte order is: byte-0 average and byte-2 average of pixels 0-1,
				 * then byte-0 average and byte-2 average of pixels 2-3.
				 *
				 * The result is stored with memcpy, so chroma_pos needs no particular
				 * alignment and no byte pointer is dereferenced as uint32_t.
				 */
				static inline void pack_chroma_1plane(uint8_t *uv_plane,
						uint32_t chroma_pos,
						const __m128i line1, const __m128i line2,
						const __m128i uv_mask)
				{
					uint32_t packed_uv;

					/* Vertical sum.  With the 0x00FF mask every 16-bit field is at most
					 * 255, so the 64-bit adds below never carry between fields. */
					const __m128i add_val = _mm_add_epi64(
							_mm_and_si128(line1, uv_mask),
							_mm_and_si128(line2, uv_mask));

					/* Horizontal sum: add each pixel to its neighbour (lanes 0<->1 and
					 * 2<->3 swapped), then divide the four-sample sum by four. */
					__m128i avg_val = _mm_add_epi64(
							add_val,
							_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));
					avg_val = _mm_srai_epi16(avg_val, 2);

					/* Bring the pixel-pair results (lanes 0 and 2) next to each other
					 * and narrow them to bytes. */
					avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
					avg_val = _mm_packus_epi16(avg_val, avg_val);

					packed_uv = (uint32_t)_mm_cvtsi128_si32(avg_val);
					memcpy(uv_plane + chroma_pos, &packed_uv, sizeof(packed_uv));
				}
			
 
				+
			
 
				/*
				 * Averages the masked 16-bit fields of a 2x2 pixel neighbourhood and writes
				 * two bytes to each of two separate chroma planes.
				 *
				 *   u_plane     receives the two averages built from byte 0 of the pixels
				 *   v_plane     receives the two averages built from byte 2 of the pixels
				 *   chroma_pos  byte offset used in both planes
				 *   line1/2     two vertically adjacent rows of four 32-bit pixels
				 *   uv_mask     keeps the fields to average; the caller in this file
				 *               passes 0x00FF per 16 bits (bytes 0 and 2 of every pixel)
				 *
				 * In each plane the first byte covers pixels 0-1 and the second byte
				 * covers pixels 2-3.
				 *
				 * The results are stored with memcpy, so chroma_pos needs no particular
				 * alignment and no byte pointer is dereferenced as uint16_t.
				 */
				static inline void pack_chroma_2plane(uint8_t *u_plane, uint8_t *v_plane,
						uint32_t chroma_pos,
						const __m128i line1, const __m128i line2,
						const __m128i uv_mask)
				{
					uint32_t packed_vals;
					uint16_t u_pair, v_pair;

					/* Vertical sum.  With the 0x00FF mask every 16-bit field is at most
					 * 255, so the 64-bit adds below never carry between fields. */
					const __m128i add_val = _mm_add_epi64(
							_mm_and_si128(line1, uv_mask),
							_mm_and_si128(line2, uv_mask));

					/* Horizontal sum with the neighbouring pixel, then divide by four. */
					__m128i avg_val = _mm_add_epi64(
							add_val,
							_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));
					avg_val = _mm_srai_epi16(avg_val, 2);

					/* Gather the two pixel-pair results in the low 64 bits, then
					 * de-interleave them so both byte-0 averages precede both byte-2
					 * averages. */
					avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
					avg_val = _mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
					avg_val = _mm_packus_epi16(avg_val, avg_val);

					packed_vals = (uint32_t)_mm_cvtsi128_si32(avg_val);
					u_pair = (uint16_t)(packed_vals);
					v_pair = (uint16_t)(packed_vals >> 16);

					memcpy(u_plane + chroma_pos, &u_pair, sizeof(u_pair));
					memcpy(v_plane + chroma_pos, &v_pair, sizeof(v_pair));
				}
			
 
				+
			
 
				/*
				 * Converts rows [start_y, end_y) of a packed 4-byte-per-pixel image into
				 * three planes, two source rows and four source pixels at a time.
				 *
				 *   input_v    source image; byte 1 of each pixel goes to the luma plane,
				 *              bytes 0 and 2 are averaged over 2x2 pixels for the two
				 *              chroma planes
				 *   width      pixels per row; also the luma plane stride, and width / 2
				 *              is the stride of both chroma planes
				 *   height     not used by this function
				 *   row_bytes  byte stride between source rows
				 *   start_y    first source row to convert
				 *   end_y      one past the last source row; rows are consumed in pairs
				 *   output     output[0] = luma plane, output[1] = plane fed from byte 0,
				 *              output[2] = plane fed from byte 2
				 *
				 * Source rows are read with aligned 128-bit loads, so every 16-byte group
				 * that is read must be 16-byte aligned.
				 */
				void compress_uyvx_to_i420(const void *input_v, uint32_t width, uint32_t height,
						uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
						void **output)
				{
					const uint8_t *const src = input_v;
					uint8_t *const y_out = output[0];
					uint8_t *const u_out = output[1];
					uint8_t *const v_out = output[2];
					const uint32_t chroma_stride = width / 2;

					const __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
					const __m128i uv_mask  = _mm_set1_epi16(0x00FF);

					uint32_t row = start_y;

					while (row < end_y) {
						const uint32_t src_row_off    = row * row_bytes;
						const uint32_t lum_row_off    = row * width;
						const uint32_t chroma_row_off = (row / 2) * chroma_stride;
						uint32_t col;

						for (col = 0; col < width; col += 4) {
							const uint8_t *top    = src + src_row_off + col * 4;
							const uint8_t *bottom = top + row_bytes;
							const uint32_t lum_top = lum_row_off + col;

							const __m128i line1 =
								_mm_load_si128((const __m128i*)top);
							const __m128i line2 =
								_mm_load_si128((const __m128i*)bottom);

							pack_lum(y_out, lum_top, lum_top + width,
									line1, line2, lum_mask);
							pack_chroma_2plane(u_out, v_out,
									chroma_row_off + col / 2,
									line1, line2, uv_mask);
						}

						row += 2;
					}
				}
			
 
				+
			
 
				/*
				 * Shared worker for the two NV12 entry points: converts rows
				 * [start_y, end_y) of a packed 4-byte-per-pixel image into a luma plane
				 * plus one interleaved chroma plane, two source rows and four source
				 * pixels at a time.
				 *
				 *   input          source image; byte 1 of each pixel goes to the luma
				 *                  plane, bytes 0 and 2 are averaged over 2x2 pixels
				 *   width          pixels per source row
				 *   height         not used by this function
				 *   pitch          byte stride between source rows
				 *   start_y/end_y  source row range; rows are consumed in pairs
				 *   row_bytes_out  byte stride of both output planes
				 *   output         output[0] = luma plane, output[1] = chroma plane
				 *
				 * Source rows are read with aligned 128-bit loads, so every 16-byte group
				 * that is read must be 16-byte aligned.
				 */
				static inline void _compress_uyvx_to_nv12(const uint8_t *input,
						uint32_t width, uint32_t height, uint32_t pitch,
						uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,
						void **output)
				{
					uint8_t *const y_out  = output[0];
					uint8_t *const uv_out = output[1];

					const __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
					const __m128i uv_mask  = _mm_set1_epi16(0x00FF);

					uint32_t row = start_y;

					while (row < end_y) {
						const uint32_t src_row_off = row * pitch;
						const uint32_t lum_row_off = row * row_bytes_out;
						const uint32_t uv_row_off  = (row / 2) * row_bytes_out;
						uint32_t col;

						for (col = 0; col < width; col += 4) {
							const uint8_t *top    = input + src_row_off + col * 4;
							const uint8_t *bottom = top + pitch;
							const uint32_t lum_top = lum_row_off + col;

							const __m128i line1 =
								_mm_load_si128((const __m128i*)top);
							const __m128i line2 =
								_mm_load_si128((const __m128i*)bottom);

							pack_lum(y_out, lum_top, lum_top + row_bytes_out,
									line1, line2, lum_mask);
							/* four source pixels yield four interleaved
							 * chroma bytes, hence the offset advances by col */
							pack_chroma_1plane(uv_out, uv_row_off + col,
									line1, line2, uv_mask);
						}

						row += 2;
					}
				}
			
 
				+
			
 
				/*
				 * NV12 conversion whose output planes are tightly packed: forwards to
				 * _compress_uyvx_to_nv12 with the output stride set to width.
				 *
				 * row_bytes is the byte stride of the source rows; all other arguments
				 * are passed through unchanged.
				 */
				void compress_uyvx_to_nv12(const void *input, uint32_t width, uint32_t height,
						uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
						void **output)
				{
					const uint32_t row_bytes_out = width;

					_compress_uyvx_to_nv12(input, width, height, row_bytes,
							start_y, end_y, row_bytes_out, output);
				}
			
 
				+
			
 
				/*
				 * NV12 conversion with a caller-chosen output stride: forwards every
				 * argument to _compress_uyvx_to_nv12, using row_bytes_out as the byte
				 * stride of both output planes and row_bytes as the source stride.
				 */
				void compress_uyvx_to_nv12_aligned(const void *input,
						uint32_t width, uint32_t height, uint32_t row_bytes,
						uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,
						void **output)
				{
					_compress_uyvx_to_nv12(input,
							width, height,
							row_bytes,
							start_y, end_y,
							row_bytes_out,
							output);
				}
			
 
				+
			
 
				/*
				 * Expands a three-plane 4:2:0 image into one 32-bit word per pixel.
				 *
				 *   input_v    luma plane (width * height bytes), immediately followed by
				 *              a first chroma plane (width * height / 4 bytes) and then a
				 *              second chroma plane
				 *   width      pixels per row; also the luma stride, and width / 2 is the
				 *              chroma stride
				 *   height     total rows; used only to locate the chroma planes
				 *   row_bytes  byte stride between output rows
				 *   start_y    first row to convert (rounded down to an even row)
				 *   end_y      one past the last row (rounded down to an even row)
				 *   output_v   destination, written as uint32_t values
				 *
				 * Every output word is  luma | first_chroma << 8 | second_chroma << 16,
				 * with one chroma sample pair shared by a 2x2 block of pixels.
				 */
				void decompress_i420(const void *input_v, uint32_t width, uint32_t height,
						uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
						void *output_v)
				{
					uint8_t       *output = output_v;
					const uint8_t *input  = input_v;
					const uint8_t *input2 = input + width * height;
					const uint8_t *input3 = input2 + width * height / 4;

					uint32_t start_y_d2 = start_y/2;
					uint32_t width_d2   = width/2;
					uint32_t height_d2  = end_y/2;
					uint32_t y;

					/* y counts chroma rows; each one produces two output rows */
					for (y = start_y_d2; y < height_d2; y++) {
						const uint8_t *chroma0 = input2 + y * width_d2;
						const uint8_t *chroma1 = input3 + y * width_d2;
						const uint8_t *lum0, *lum1;
						uint32_t *output0, *output1;
						uint32_t x;

						lum0 = input + y * 2*width;
						lum1 = lum0 + width;
						output0 = (uint32_t*)(output + y * 2*row_bytes);
						output1 = (uint32_t*)((uint8_t*)output0 + row_bytes);

						for (x = 0; x < width_d2; x++) {
							uint32_t out;
							out = (*(chroma0++) << 8) | (*(chroma1++) << 16);

							*(output0++) = *(lum0++) | out;
							*(output0++) = *(lum0++) | out;

							*(output1++) = *(lum1++) | out;
							*(output1++) = *(lum1++) | out;
						}
					}
				}

				/*
				 * Same conversion under the name that format-conversion.h exports.
				 *
				 * The header declares decompress_420 while the implementation is called
				 * decompress_i420, so without this definition the declared symbol would
				 * have no definition to link against.
				 */
				void decompress_420(const void *input, uint32_t width, uint32_t height,
						uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
						void *output)
				{
					decompress_i420(input, width, height, row_bytes,
							start_y, end_y, output);
				}
			
 
				+
			
 
				/*
				 * Expands a packed 4:2:2 image (one 32-bit word per two pixels) into one
				 * 32-bit word per pixel.
				 *
				 *   input_v      source image, read as uint32_t; each row is width * 2
				 *                bytes
				 *   width        pixels per row
				 *   height       total rows in the image; end_y is clamped to it
				 *   row_bytes    byte stride between output rows
				 *   start_y      first row to convert
				 *   end_y        one past the last row to convert
				 *   output_v     destination, written as uint32_t values
				 *   leading_lum  true when byte 0 of each source word is the first
				 *                pixel's luma, false when it is byte 1
				 *
				 * For each source word the first output word is an unmodified copy.  The
				 * second output word is the same value with the first pixel's luma byte
				 * replaced by the second pixel's luma byte (source byte 2 when
				 * leading_lum, source byte 3 otherwise).
				 *
				 * Earlier revisions ignored start_y and end_y and always converted all
				 * height rows; only the requested rows are converted now.
				 */
				void decompress_i422(const void *input_v, uint32_t width, uint32_t height,
						uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
						void *output_v, bool leading_lum)
				{
					const uint8_t *input  = input_v;
					uint8_t       *output = output_v;

					const uint32_t width_d2  = width>>1;
					const uint32_t line_size = width*2;

					/* bits of the source word kept as they are in the second pixel */
					const uint32_t keep_mask = leading_lum ? 0xFFFFFF00u : 0xFFFF00FFu;
					/* destination position of the second pixel's luma after >> 16 */
					const uint32_t lum_mask  = leading_lum ? 0x000000FFu : 0x0000FF00u;

					uint32_t y;

					/* never touch rows beyond the image, whatever range was asked for */
					if (end_y > height) {
						end_y = height;
					}

					for (y = start_y; y < end_y; y++) {
						const uint32_t *input32 =
							(const uint32_t*)(input + y*line_size);
						const uint32_t *input32_end = input32 + width_d2;
						uint32_t *output32 = (uint32_t*)(output + y*row_bytes);

						while (input32 < input32_end) {
							const uint32_t dw = *(input32++);

							output32[0] = dw;
							output32[1] = (dw & keep_mask) |
								((dw >> 16) & lum_mask);

							output32 += 2;
						}
					}
				}

				/*
				 * Same conversion under the name that format-conversion.h exports.
				 *
				 * The header declares decompress_422 while the implementation is called
				 * decompress_i422, so without this definition the declared symbol would
				 * have no definition to link against.
				 */
				void decompress_422(const void *input, uint32_t width, uint32_t height,
						uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
						void *output, bool leading_lum)
				{
					decompress_i422(input, width, height, row_bytes,
							start_y, end_y, output, leading_lum);
				}
			
--- a/libobs/media-io/format-conversion.h
+++ b/libobs/media-io/format-conversion.h
@@ -0,0 +1,51 @@
 
				+/******************************************************************************
			
 
				+    Copyright (C) 2013 by Hugh Bailey <[email protected]>
			
 
				+
			
 
				+    This program is free software: you can redistribute it and/or modify
			
 
				+    it under the terms of the GNU General Public License as published by
			
 
				+    the Free Software Foundation, either version 3 of the License, or
			
 
				+    (at your option) any later version.
			
 
				+
			
 
				+    This program is distributed in the hope that it will be useful,
			
 
				+    but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+    GNU General Public License for more details.
			
 
				+
			
 
				+    You should have received a copy of the GNU General Public License
			
 
				+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
			
 
				+******************************************************************************/
			
 
				+
			
 
				+#pragma once
			
 
				+
			
 
				+#include "../util/c99defs.h"
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+EXPORT void compress_uyvx_to_i420(const void *input,
			
 
				+		uint32_t width, uint32_t height, uint32_t row_bytes,
			
 
				+		uint32_t start_y, uint32_t end_y, void **output);
			
 
				+
			
 
				+EXPORT void compress_uyvx_to_nv12(const void *input,
			
 
				+		uint32_t width, uint32_t height, uint32_t row_bytes,
			
 
				+		uint32_t start_y, uint32_t end_y, void **output);
			
 
				+
			
 
				+EXPORT void decompress_420(const void *input,
			
 
				+		uint32_t width, uint32_t height, uint32_t row_bytes,
			
 
				+		uint32_t start_y, uint32_t end_y, void *output);
			
 
				+
			
 
				+EXPORT void decompress_422(const void *input,
			
 
				+		uint32_t width, uint32_t height, uint32_t row_bytes,
			
 
				+		uint32_t start_y, uint32_t end_y, void *output,
			
 
				+		bool leading_lum);
			
 
				+
			
 
				+/* special case for quicksync */
			
 
				+EXPORT void compress_uyvx_to_nv12_aligned(const void *input,
			
 
				+		uint32_t width, uint32_t height, uint32_t row_bytes,
			
 
				+		uint32_t start_y, uint32_t end_y,
			
 
				+		uint32_t row_bytes_out, void **output);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif