format-conversion.c 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
/******************************************************************************
    Copyright (C) 2013 by Hugh Bailey <[email protected]>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
  14. #include "format-conversion.h"
  15. #include <xmmintrin.h>
  16. #include <emmintrin.h>
  17. static inline uint32_t get_m128_32_0(const __m128i val)
  18. {
  19. return *(uint32_t* const)&val;
  20. }
  21. static inline uint32_t get_m128_32_1(const __m128i val)
  22. {
  23. return *(((uint32_t* const)&val)+1);
  24. }
  25. static inline void pack_lum(uint8_t *lum_plane,
  26. uint32_t lum_pos0, uint32_t lum_pos1,
  27. const __m128i line1, const __m128i line2,
  28. const __m128i lum_mask)
  29. {
  30. __m128i pack_val = _mm_packs_epi32(
  31. _mm_srli_si128(_mm_and_si128(line1, lum_mask), 1),
  32. _mm_srli_si128(_mm_and_si128(line2, lum_mask), 1));
  33. pack_val = _mm_packus_epi16(pack_val, pack_val);
  34. *(uint32_t*)(lum_plane+lum_pos0) = get_m128_32_0(pack_val);
  35. *(uint32_t*)(lum_plane+lum_pos1) = get_m128_32_1(pack_val);
  36. }
  37. static inline void pack_chroma_1plane(uint8_t *uv_plane,
  38. uint32_t chroma_pos,
  39. const __m128i line1, const __m128i line2,
  40. const __m128i uv_mask)
  41. {
  42. __m128i add_val = _mm_add_epi64(
  43. _mm_and_si128(line1, uv_mask),
  44. _mm_and_si128(line2, uv_mask));
  45. __m128i avg_val = _mm_add_epi64(
  46. add_val,
  47. _mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));
  48. avg_val = _mm_srai_epi16(avg_val, 2);
  49. avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
  50. avg_val = _mm_packus_epi16(avg_val, avg_val);
  51. *(uint32_t*)(uv_plane+chroma_pos) = get_m128_32_0(avg_val);
  52. }
  53. static inline void pack_chroma_2plane(uint8_t *u_plane, uint8_t *v_plane,
  54. uint32_t chroma_pos,
  55. const __m128i line1, const __m128i line2,
  56. const __m128i uv_mask)
  57. {
  58. uint32_t packed_vals;
  59. __m128i add_val = _mm_add_epi64(
  60. _mm_and_si128(line1, uv_mask),
  61. _mm_and_si128(line2, uv_mask));
  62. __m128i avg_val = _mm_add_epi64(
  63. add_val,
  64. _mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));
  65. avg_val = _mm_srai_epi16(avg_val, 2);
  66. avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
  67. avg_val = _mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
  68. avg_val = _mm_packus_epi16(avg_val, avg_val);
  69. packed_vals = get_m128_32_0(avg_val);
  70. *(uint16_t*)(u_plane+chroma_pos) = (uint16_t)(packed_vals);
  71. *(uint16_t*)(v_plane+chroma_pos) = (uint16_t)(packed_vals>>16);
  72. }
  73. void compress_uyvx_to_i420(
  74. const uint8_t *input, uint32_t in_row_bytes,
  75. uint32_t width, uint32_t height,
  76. uint32_t start_y, uint32_t end_y,
  77. uint8_t *output[], const uint32_t out_row_bytes[])
  78. {
  79. uint8_t *lum_plane = output[0];
  80. uint8_t *u_plane = output[1];
  81. uint8_t *v_plane = output[2];
  82. uint32_t y;
  83. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  84. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  85. for (y = start_y; y < end_y; y += 2) {
  86. uint32_t y_pos = y * in_row_bytes;
  87. uint32_t chroma_y_pos = (y>>1) * out_row_bytes[1];
  88. uint32_t lum_y_pos = y * out_row_bytes[0];
  89. uint32_t x;
  90. for (x = 0; x < width; x += 4) {
  91. const uint8_t *img = input + y_pos + x*4;
  92. uint32_t lum_pos0 = lum_y_pos + x;
  93. uint32_t lum_pos1 = lum_pos0 + out_row_bytes[0];
  94. __m128i line1 = _mm_load_si128((const __m128i*)img);
  95. __m128i line2 = _mm_load_si128(
  96. (const __m128i*)(img + in_row_bytes));
  97. pack_lum(lum_plane, lum_pos0, lum_pos1,
  98. line1, line2, lum_mask);
  99. pack_chroma_2plane(u_plane, v_plane,
  100. chroma_y_pos + (x>>1),
  101. line1, line2, uv_mask);
  102. }
  103. }
  104. }
  105. void compress_uyvx_to_nv12(
  106. const uint8_t *input, uint32_t in_row_bytes,
  107. uint32_t width, uint32_t height,
  108. uint32_t start_y, uint32_t end_y,
  109. uint8_t *output[], const uint32_t out_row_bytes[])
  110. {
  111. uint8_t *lum_plane = output[0];
  112. uint8_t *chroma_plane = output[1];
  113. uint32_t y;
  114. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  115. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  116. for (y = start_y; y < end_y; y += 2) {
  117. uint32_t y_pos = y * in_row_bytes;
  118. uint32_t chroma_y_pos = (y>>1) * out_row_bytes[1];
  119. uint32_t lum_y_pos = y * out_row_bytes[0];
  120. uint32_t x;
  121. for (x = 0; x < width; x += 4) {
  122. const uint8_t *img = input + y_pos + x*4;
  123. uint32_t lum_pos0 = lum_y_pos + x;
  124. uint32_t lum_pos1 = lum_pos0 + out_row_bytes[0];
  125. __m128i line1 = _mm_load_si128((const __m128i*)img);
  126. __m128i line2 = _mm_load_si128(
  127. (const __m128i*)(img + in_row_bytes));
  128. pack_lum(lum_plane, lum_pos0, lum_pos1,
  129. line1, line2, lum_mask);
  130. pack_chroma_1plane(chroma_plane, chroma_y_pos + x,
  131. line1, line2, uv_mask);
  132. }
  133. }
  134. }
  135. void decompress_420(
  136. const uint8_t *const input[], const uint32_t in_row_bytes[],
  137. uint32_t width, uint32_t height,
  138. uint32_t start_y, uint32_t end_y,
  139. uint8_t *output, uint32_t out_row_bytes)
  140. {
  141. uint32_t start_y_d2 = start_y/2;
  142. uint32_t width_d2 = width/2;
  143. uint32_t height_d2 = end_y/2;
  144. uint32_t y;
  145. for (y = start_y_d2; y < height_d2; y++) {
  146. const uint8_t *chroma0 = input[1] + y * in_row_bytes[1];
  147. const uint8_t *chroma1 = input[2] + y * in_row_bytes[2];
  148. register const uint8_t *lum0, *lum1;
  149. register uint32_t *output0, *output1;
  150. uint32_t x;
  151. lum0 = input[0] + y * 2*width;
  152. lum1 = lum0 + width;
  153. output0 = (uint32_t*)(output + y * 2 * in_row_bytes[0]);
  154. output1 = (uint32_t*)((uint8_t*)output0 + in_row_bytes[0]);
  155. for (x = 0; x < width_d2; x++) {
  156. uint32_t out;
  157. out = (*(chroma0++) << 8) | (*(chroma1++) << 16);
  158. *(output0++) = *(lum0++) | out;
  159. *(output0++) = *(lum0++) | out;
  160. *(output1++) = *(lum1++) | out;
  161. *(output1++) = *(lum1++) | out;
  162. }
  163. }
  164. }
  165. void decompress_nv12(
  166. const uint8_t *const input[], const uint32_t in_row_bytes[],
  167. uint32_t width, uint32_t height,
  168. uint32_t start_y, uint32_t end_y,
  169. uint8_t *output, uint32_t out_row_bytes)
  170. {
  171. uint32_t start_y_d2 = start_y/2;
  172. uint32_t width_d2 = width/2;
  173. uint32_t height_d2 = end_y/2;
  174. uint32_t y;
  175. for (y = start_y_d2; y < height_d2; y++) {
  176. const uint16_t *chroma;
  177. register const uint8_t *lum0, *lum1;
  178. register uint32_t *output0, *output1;
  179. uint32_t x;
  180. chroma = (const uint16_t*)(input[1] + y * in_row_bytes[1]);
  181. lum0 = input[0] + y*2 * in_row_bytes[0];
  182. lum1 = lum0 + in_row_bytes[0];
  183. output0 = (uint32_t*)(output + y*2 * out_row_bytes);
  184. output1 = (uint32_t*)((uint8_t*)output0 + out_row_bytes);
  185. for (x = 0; x < width_d2; x++) {
  186. uint32_t out = *(chroma++) << 8;
  187. *(output0++) = *(lum0++) | out;
  188. *(output0++) = *(lum0++) | out;
  189. *(output1++) = *(lum1++) | out;
  190. *(output1++) = *(lum1++) | out;
  191. }
  192. }
  193. }
  194. void decompress_422(
  195. const uint8_t *input, uint32_t in_row_bytes,
  196. uint32_t width, uint32_t height,
  197. uint32_t start_y, uint32_t end_y,
  198. uint8_t *output, uint32_t out_row_bytes,
  199. bool leading_lum)
  200. {
  201. uint32_t width_d2 = width >> 1;
  202. uint32_t y;
  203. register const uint32_t *input32;
  204. register const uint32_t *input32_end;
  205. register uint32_t *output32;
  206. if (leading_lum) {
  207. for (y = start_y; y < end_y; y++) {
  208. input32 = (const uint32_t*)(input + y*in_row_bytes);
  209. input32_end = input32 + width_d2;
  210. output32 = (uint32_t*)(output + y*out_row_bytes);
  211. while(input32 < input32_end) {
  212. register uint32_t dw = *input32;
  213. output32[0] = dw;
  214. dw &= 0xFFFFFF00;
  215. dw |= (uint8_t)(dw>>16);
  216. output32[1] = dw;
  217. output32 += 2;
  218. input32++;
  219. }
  220. }
  221. } else {
  222. for (y = start_y; y < end_y; y++) {
  223. input32 = (const uint32_t*)(input + y*in_row_bytes);
  224. input32_end = input32 + width_d2;
  225. output32 = (uint32_t*)(output + y*out_row_bytes);
  226. while (input32 < input32_end) {
  227. register uint32_t dw = *input32;
  228. output32[0] = dw;
  229. dw &= 0xFFFF00FF;
  230. dw |= (dw>>16) & 0xFF00;
  231. output32[1] = dw;
  232. output32 += 2;
  233. input32++;
  234. }
  235. }
  236. }
  237. }