Browse Source

win-capture: Modify 16bit to 32bit color conversion to use SSE

Bl00drav3n 10 years ago
parent
commit
ba4ac47ee3
1 changed file with 136 additions and 38 deletions
  1. 136 38
      plugins/win-capture/game-capture.c

+ 136 - 38
plugins/win-capture/game-capture.c

@@ -3,6 +3,7 @@
 #include <util/platform.h>
 #include <windows.h>
 #include <dxgi.h>
+#include <emmintrin.h>
 #include <ipc-util/pipe.h>
 #include "obfuscate.h"
 #include "graphics-hook-info.h"
@@ -865,24 +866,67 @@ static void copy_b5g6r5_tex(struct game_capture *gc, int cur_texture,
 	uint32_t gc_pitch = gc->pitch;
 
 	for (uint32_t y = 0; y < gc_cy; y++) {
-		register uint8_t *in  = input + (gc_pitch * y);
-		register uint8_t *end = in + (gc_cx * PIXEL_16BIT_SIZE);
-		register uint8_t *out = data  + (pitch * y);
-
-		while (in < end) {
-			register uint16_t in_pix = *(uint16_t*)in;
-			register uint32_t out_pix = 0xFF000000;
-
-			out_pix |= convert_5_to_8bit(in_pix);
-			in_pix >>= 5;
-			out_pix |= convert_6_to_8bit(in_pix) << 8;
-			in_pix >>= 6;
-			out_pix |= convert_5_to_8bit(in_pix) << 16;
-
-			*(uint32_t*)out = out_pix;
-
-			in  += PIXEL_16BIT_SIZE;
-			out += PIXEL_32BIT_SIZE;
+		uint8_t *row = input + (gc_pitch * y);
+		uint8_t *out = data + (pitch * y);
+
+		for (uint32_t x = 0; x < gc_cx; x += 8) {
+			__m128i pixels_blue, pixels_green, pixels_red;
+			__m128i pixels_result;
+			__m128i *pixels_dest;
+
+			__m128i *pixels_src = (__m128i*)(row + x * sizeof(uint16_t));
+			__m128i pixels = _mm_load_si128(pixels_src);
+
+			__m128i zero = _mm_setzero_si128();
+			__m128i pixels_low = _mm_unpacklo_epi16(pixels, zero);
+			__m128i pixels_high = _mm_unpackhi_epi16(pixels, zero);
+
+			__m128i blue_channel_mask = _mm_set1_epi32(0x0000001F);
+			__m128i blue_offset = _mm_set1_epi32(0x00000003);
+			__m128i green_channel_mask = _mm_set1_epi32(0x000007E0);
+			__m128i green_offset = _mm_set1_epi32(0x00000008);
+			__m128i red_channel_mask = _mm_set1_epi32(0x0000F800);
+			__m128i red_offset = _mm_set1_epi32(0x00000300);
+
+			pixels_blue = _mm_and_si128(pixels_low, blue_channel_mask);
+			pixels_blue = _mm_slli_epi32(pixels_blue, 3);
+			pixels_blue = _mm_add_epi32(pixels_blue, blue_offset);
+
+			pixels_green = _mm_and_si128(pixels_low, green_channel_mask);
+			pixels_green = _mm_add_epi32(pixels_green, green_offset);
+			pixels_green = _mm_slli_epi32(pixels_green, 5);
+
+			pixels_red = _mm_and_si128(pixels_low, red_channel_mask);
+			pixels_red = _mm_add_epi32(pixels_red, red_offset);
+			pixels_red = _mm_slli_epi32(pixels_red, 8);
+
+			pixels_result = _mm_set1_epi32(0xFF000000);
+			pixels_result = _mm_or_si128(pixels_result, pixels_blue);
+			pixels_result = _mm_or_si128(pixels_result, pixels_green);
+			pixels_result = _mm_or_si128(pixels_result, pixels_red);
+
+			pixels_dest = (__m128i*)(out + x * sizeof(uint32_t));
+			_mm_store_si128(pixels_dest, pixels_result);
+
+			pixels_blue = _mm_and_si128(pixels_high, blue_channel_mask);
+			pixels_blue = _mm_slli_epi32(pixels_blue, 3);
+			pixels_blue = _mm_add_epi32(pixels_blue, blue_offset);
+
+			pixels_green = _mm_and_si128(pixels_high, green_channel_mask);
+			pixels_green = _mm_add_epi32(pixels_green, green_offset);
+			pixels_green = _mm_slli_epi32(pixels_green, 5);
+
+			pixels_red = _mm_and_si128(pixels_high, red_channel_mask);
+			pixels_red = _mm_add_epi32(pixels_red, red_offset);
+			pixels_red = _mm_slli_epi32(pixels_red, 8);
+
+			pixels_result = _mm_set1_epi32(0xFF000000);
+			pixels_result = _mm_or_si128(pixels_result, pixels_blue);
+			pixels_result = _mm_or_si128(pixels_result, pixels_green);
+			pixels_result = _mm_or_si128(pixels_result, pixels_red);
+
+			pixels_dest = (__m128i*)(out + (x + 4) * sizeof(uint32_t));
+			_mm_store_si128(pixels_dest, pixels_result);
 		}
 	}
 }
@@ -896,26 +940,80 @@ static void copy_b5g5r5a1_tex(struct game_capture *gc, int cur_texture,
 	uint32_t gc_pitch = gc->pitch;
 
 	for (uint32_t y = 0; y < gc_cy; y++) {
-		register uint8_t *in  = input + (gc_pitch * y);
-		register uint8_t *end = in + (gc_cx * PIXEL_16BIT_SIZE);
-		register uint8_t *out = data  + (pitch * y);
-
-		while (in < end) {
-			register uint16_t in_pix = *(uint16_t*)in;
-			register uint32_t out_pix = 0;
-
-			out_pix |= convert_5_to_8bit(in_pix);
-			in_pix >>= 5;
-			out_pix |= convert_5_to_8bit(in_pix) << 8;
-			in_pix >>= 5;
-			out_pix |= convert_5_to_8bit(in_pix) << 16;
-			in_pix >>= 5;
-			out_pix |= (in_pix * 255) << 24;
-
-			*(uint32_t*)out = out_pix;
-
-			in  += PIXEL_16BIT_SIZE;
-			out += PIXEL_32BIT_SIZE;
+		uint8_t *row = input + (gc_pitch * y);
+		uint8_t *out = data + (pitch * y);
+
+		for (uint32_t x = 0; x < gc_cx; x += 8) {
+			__m128i pixels_blue, pixels_green, pixels_red, pixels_alpha;
+			__m128i pixels_result;
+			__m128i *pixels_dest;
+
+			__m128i *pixels_src = (__m128i*)(row + x * sizeof(uint16_t));
+			__m128i pixels = _mm_load_si128(pixels_src);
+
+			__m128i zero = _mm_setzero_si128();
+			__m128i pixels_low = _mm_unpacklo_epi16(pixels, zero);
+			__m128i pixels_high = _mm_unpackhi_epi16(pixels, zero);
+
+			__m128i blue_channel_mask = _mm_set1_epi32(0x0000001F);
+			__m128i blue_offset = _mm_set1_epi32(0x00000003);
+			__m128i green_channel_mask = _mm_set1_epi32(0x000003E0);
+			__m128i green_offset = _mm_set1_epi32(0x000000C);
+			__m128i red_channel_mask = _mm_set1_epi32(0x00007C00);
+			__m128i red_offset = _mm_set1_epi32(0x00000180);
+			__m128i alpha_channel_mask = _mm_set1_epi32(0x00008000);
+			__m128i alpha_offset = _mm_set1_epi32(0x00000001);
+			__m128i alpha_mask32 = _mm_set1_epi32(0xFF000000);
+
+			pixels_blue = _mm_and_si128(pixels_low, blue_channel_mask);
+			pixels_blue = _mm_slli_epi32(pixels_blue, 3);
+			pixels_blue = _mm_add_epi32(pixels_blue, blue_offset);
+
+			pixels_green = _mm_and_si128(pixels_low, green_channel_mask);
+			pixels_green = _mm_add_epi32(pixels_green, green_offset);
+			pixels_green = _mm_slli_epi32(pixels_green, 6);
+
+			pixels_red = _mm_and_si128(pixels_low, red_channel_mask);
+			pixels_red = _mm_add_epi32(pixels_red, red_offset);
+			pixels_red = _mm_slli_epi32(pixels_red, 9);
+
+			pixels_alpha = _mm_and_si128(pixels_low, alpha_channel_mask);
+			pixels_alpha = _mm_srli_epi32(pixels_alpha, 15);
+			pixels_alpha = _mm_sub_epi32(pixels_alpha, alpha_offset);
+			pixels_alpha = _mm_andnot_si128(pixels_alpha, alpha_mask32);
+
+			pixels_result = pixels_red;
+			pixels_result = _mm_or_si128(pixels_result, pixels_alpha);
+			pixels_result = _mm_or_si128(pixels_result, pixels_blue);
+			pixels_result = _mm_or_si128(pixels_result, pixels_green);
+
+			pixels_dest = (__m128i*)(out + x * sizeof(uint32_t));
+			_mm_store_si128(pixels_dest, pixels_result);
+
+			pixels_blue = _mm_and_si128(pixels_high, blue_channel_mask);
+			pixels_blue = _mm_slli_epi32(pixels_blue, 3);
+			pixels_blue = _mm_add_epi32(pixels_blue, blue_offset);
+
+			pixels_green = _mm_and_si128(pixels_high, green_channel_mask);
+			pixels_green = _mm_add_epi32(pixels_green, green_offset);
+			pixels_green = _mm_slli_epi32(pixels_green, 6);
+
+			pixels_red = _mm_and_si128(pixels_high, red_channel_mask);
+			pixels_red = _mm_add_epi32(pixels_red, red_offset);
+			pixels_red = _mm_slli_epi32(pixels_red, 9);
+
+			pixels_alpha = _mm_and_si128(pixels_high, alpha_channel_mask);
+			pixels_alpha = _mm_srli_epi32(pixels_alpha, 15);
+			pixels_alpha = _mm_sub_epi32(pixels_alpha, alpha_offset);
+			pixels_alpha = _mm_andnot_si128(pixels_alpha, alpha_mask32);
+
+			pixels_result = pixels_red;
+			pixels_result = _mm_or_si128(pixels_result, pixels_alpha);
+			pixels_result = _mm_or_si128(pixels_result, pixels_blue);
+			pixels_result = _mm_or_si128(pixels_result, pixels_green);
+
+			pixels_dest = (__m128i*)(out + (x + 4) * sizeof(uint32_t));
+			_mm_store_si128(pixels_dest, pixels_result);
 		}
 	}
 }