/* format-conversion.c */
/******************************************************************************
    Copyright (C) 2013 by Hugh Bailey <[email protected]>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
  14. #include "format-conversion.h"
  15. #include <xmmintrin.h>
  16. #include <emmintrin.h>
  /*
   * 32-bit lane accessors for __m128i values; both yield a uint32_t.
   *
   * They are macros rather than functions on purpose: when inlining was not
   * forced, CPU usage was observed to rise by a tremendous amount in debug
   * builds.
   *
   *   get_m128_32_0(val)  ->  bits  0..31 of val
   *   get_m128_32_1(val)  ->  bits 32..63 of val
   *
   * The lanes are read with SSE2 intrinsics rather than by casting the address
   * of val to uint32_t*, so val may be any __m128i expression (not only an
   * lvalue), is evaluated exactly once, and no pointer type-punning is needed.
   */
  #define get_m128_32_0(val) ((uint32_t)_mm_cvtsi128_si32(val))
  #define get_m128_32_1(val) \
          ((uint32_t)_mm_cvtsi128_si32(_mm_srli_si128((val), 4)))
  /*
   * pack_shift: pull one byte out of every 32-bit pixel of two 4-pixel rows
   * and store the result as 4 consecutive bytes per row.
   *
   *   lum_plane  base pointer of the destination plane (byte pointer)
   *   lum_pos0   byte offset in lum_plane that receives the 4 bytes of line1
   *   lum_pos1   byte offset in lum_plane that receives the 4 bytes of line2
   *   line1/2    __m128i registers, four 32-bit pixels each
   *   mask       __m128i mask applied to both rows before shifting
   *   sh         right shift in BYTES applied to the whole 128-bit register;
   *              must be a compile-time constant (intrinsic immediate)
   *
   * After masking and shifting, each 32-bit lane is narrowed to 16 bits with
   * signed saturation and then to 8 bits with unsigned saturation, so the
   * selected value must end up in the low byte of its lane for the result to
   * be exact (the callers in this file use mask 0x0000FF00 with sh == 1).
   *
   * Every parameter is parenthesised so that arguments containing operators
   * expand correctly, and the temporary carries a trailing underscore so it
   * cannot shadow a caller variable passed in as an argument.
   */
  #define pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, mask, sh) \
  do { \
          __m128i pack_val_ = _mm_packs_epi32( \
                  _mm_srli_si128(_mm_and_si128((line1), (mask)), (sh)), \
                  _mm_srli_si128(_mm_and_si128((line2), (mask)), (sh))); \
          pack_val_ = _mm_packus_epi16(pack_val_, pack_val_); \
          \
          *(uint32_t*)((lum_plane) + (lum_pos0)) = get_m128_32_0(pack_val_); \
          *(uint32_t*)((lum_plane) + (lum_pos1)) = get_m128_32_1(pack_val_); \
  } while (0)
  /*
   * pack_ch_1plane: average the masked 16-bit components of a 2x2 pixel
   * neighbourhood and store them interleaved into a single plane.
   *
   *   uv_plane    base pointer of the destination plane (byte pointer)
   *   chroma_pos  byte offset in uv_plane; 4 bytes are written there
   *   line1/2     __m128i registers holding four 32-bit pixels of two
   *               vertically adjacent rows
   *   uv_mask     __m128i mask selecting the components to average; the
   *               callers in this file keep the low byte of every 16-bit lane
   *
   * Steps: add the two rows, add horizontally adjacent pixels (pixel 0 with 1,
   * pixel 2 with 3), divide by 4 with a truncating shift, then narrow to
   * bytes.  The 4 bytes written are, in order: low-half average of pixels 0-1,
   * high-half average of pixels 0-1, low-half average of pixels 2-3,
   * high-half average of pixels 2-3.
   *
   * The additions are done per 16-bit lane: each masked component is at most
   * 255, so the sum of four never exceeds 1020 and cannot carry into a
   * neighbouring lane.
   *
   * Every parameter is parenthesised so that arguments containing operators
   * expand correctly; temporaries carry a trailing underscore so they cannot
   * shadow a caller variable passed in as an argument.
   */
  #define pack_ch_1plane(uv_plane, chroma_pos, line1, line2, uv_mask) \
  do { \
          __m128i sum_ = _mm_add_epi16( \
                  _mm_and_si128((line1), (uv_mask)), \
                  _mm_and_si128((line2), (uv_mask))); \
          __m128i avg_ = _mm_add_epi16( \
                  sum_, \
                  _mm_shuffle_epi32(sum_, _MM_SHUFFLE(2, 3, 0, 1))); \
          avg_ = _mm_srai_epi16(avg_, 2); \
          avg_ = _mm_shuffle_epi32(avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
          avg_ = _mm_packus_epi16(avg_, avg_); \
          \
          *(uint32_t*)((uv_plane) + (chroma_pos)) = get_m128_32_0(avg_); \
  } while (0)
  /*
   * pack_ch_2plane: average the masked 16-bit components of a 2x2 pixel
   * neighbourhood and store them de-interleaved into two planes.
   *
   *   u_plane     base pointer of the first destination plane (byte pointer)
   *   v_plane     base pointer of the second destination plane (byte pointer)
   *   chroma_pos  byte offset used in BOTH planes; 2 bytes are written to each
   *               (the expression is evaluated twice)
   *   line1/2     __m128i registers holding four 32-bit pixels of two
   *               vertically adjacent rows
   *   uv_mask     __m128i mask selecting the components to average; the
   *               callers in this file keep the low byte of every 16-bit lane
   *
   * Steps: add the two rows, add horizontally adjacent pixels (pixel 0 with 1,
   * pixel 2 with 3), divide by 4 with a truncating shift, regroup so the two
   * low-half averages come first and the two high-half averages second, then
   * narrow to bytes.  u_plane receives the low-half averages of pixels 0-1 and
   * 2-3; v_plane receives the high-half averages of pixels 0-1 and 2-3.
   *
   * The additions are done per 16-bit lane: each masked component is at most
   * 255, so the sum of four never exceeds 1020 and cannot carry into a
   * neighbouring lane.
   *
   * Every parameter is parenthesised so that arguments containing operators
   * expand correctly; temporaries carry a trailing underscore so they cannot
   * shadow a caller variable passed in as an argument.
   */
  #define pack_ch_2plane(u_plane, v_plane, chroma_pos, line1, line2, uv_mask) \
  do { \
          uint32_t packed_; \
          \
          __m128i sum_ = _mm_add_epi16( \
                  _mm_and_si128((line1), (uv_mask)), \
                  _mm_and_si128((line2), (uv_mask))); \
          __m128i avg_ = _mm_add_epi16( \
                  sum_, \
                  _mm_shuffle_epi32(sum_, _MM_SHUFFLE(2, 3, 0, 1))); \
          avg_ = _mm_srai_epi16(avg_, 2); \
          avg_ = _mm_shuffle_epi32(avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
          avg_ = _mm_shufflelo_epi16(avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
          avg_ = _mm_packus_epi16(avg_, avg_); \
          \
          packed_ = get_m128_32_0(avg_); \
          \
          *(uint16_t*)((u_plane) + (chroma_pos)) = (uint16_t)(packed_); \
          *(uint16_t*)((v_plane) + (chroma_pos)) = (uint16_t)(packed_ >> 16); \
  } while (0)
  65. static FORCE_INLINE uint32_t min_uint32(uint32_t a, uint32_t b)
  66. {
  67. return a < b ? a : b;
  68. }
  /*
   * Converts rows [start_y, end_y) of a packed 4-bytes-per-pixel image into
   * three separate planes.
   *
   *   input         packed source image; rows are read with aligned 16-byte
   *                 loads, four pixels at a time
   *   in_linesize   byte stride between source rows
   *   start_y/end_y row range; rows are consumed two at a time, so each
   *                 iteration also reads the row below the current one
   *   output        output[0] receives one byte per pixel (bits 8..15 of each
   *                 32-bit pixel); output[1] and output[2] each receive one
   *                 byte per 2x2 pixel block (averaged by pack_ch_2plane)
   *   out_linesize  byte strides of the planes; [0] is used for output[0],
   *                 [1] for both half-resolution planes
   *
   * The number of pixels handled per row is
   * min(in_linesize, out_linesize[0]), walked in steps of four.
   */
  void compress_uyvx_to_i420(
                  const uint8_t *input, uint32_t in_linesize,
                  uint32_t start_y, uint32_t end_y,
                  uint8_t *output[], const uint32_t out_linesize[])
  {
          uint8_t *luma_dst = output[0];
          uint8_t *u_dst = output[1];
          uint8_t *v_dst = output[2];
          const uint32_t span = min_uint32(in_linesize, out_linesize[0]);
          const __m128i luma_byte_mask = _mm_set1_epi32(0x0000FF00);
          const __m128i chroma_byte_mask = _mm_set1_epi16(0x00FF);
          uint32_t row;

          for (row = start_y; row < end_y; row += 2) {
                  /* Offsets of the current row pair in source and planes. */
                  const uint32_t src_row_off = row * in_linesize;
                  const uint32_t chroma_row_off =
                          (row >> 1) * out_linesize[1];
                  const uint32_t luma_row_off = row * out_linesize[0];
                  uint32_t col;

                  for (col = 0; col < span; col += 4) {
                          const uint8_t *src = input + src_row_off + col * 4;
                          const uint32_t luma_top = luma_row_off + col;
                          const uint32_t luma_bottom =
                                  luma_top + out_linesize[0];
                          const uint32_t chroma_off =
                                  chroma_row_off + (col >> 1);
                          __m128i top = _mm_load_si128((const __m128i*)src);
                          __m128i bottom = _mm_load_si128(
                                  (const __m128i*)(src + in_linesize));

                          /* 4 + 4 full-resolution bytes for the two rows. */
                          pack_shift(luma_dst, luma_top, luma_bottom,
                                          top, bottom, luma_byte_mask, 1);
                          /* 2 averaged bytes into each half-size plane. */
                          pack_ch_2plane(u_dst, v_dst, chroma_off,
                                          top, bottom, chroma_byte_mask);
                  }
          }
  }
  /*
   * Converts rows [start_y, end_y) of a packed 4-bytes-per-pixel image into
   * two planes.
   *
   *   input         packed source image; rows are read with aligned 16-byte
   *                 loads, four pixels at a time
   *   in_linesize   byte stride between source rows
   *   start_y/end_y row range; rows are consumed two at a time, so each
   *                 iteration also reads the row below the current one
   *   output        output[0] receives one byte per pixel (bits 8..15 of each
   *                 32-bit pixel); output[1] receives two interleaved bytes
   *                 per 2x2 pixel block (averaged by pack_ch_1plane)
   *   out_linesize  byte strides of output[0] and output[1]
   *
   * The number of pixels handled per row is
   * min(in_linesize, out_linesize[0]), walked in steps of four.
   */
  void compress_uyvx_to_nv12(
                  const uint8_t *input, uint32_t in_linesize,
                  uint32_t start_y, uint32_t end_y,
                  uint8_t *output[], const uint32_t out_linesize[])
  {
          uint8_t *luma_dst = output[0];
          uint8_t *chroma_dst = output[1];
          const uint32_t span = min_uint32(in_linesize, out_linesize[0]);
          const __m128i luma_byte_mask = _mm_set1_epi32(0x0000FF00);
          const __m128i chroma_byte_mask = _mm_set1_epi16(0x00FF);
          uint32_t row;

          for (row = start_y; row < end_y; row += 2) {
                  /* Offsets of the current row pair in source and planes. */
                  const uint32_t src_row_off = row * in_linesize;
                  const uint32_t chroma_row_off =
                          (row >> 1) * out_linesize[1];
                  const uint32_t luma_row_off = row * out_linesize[0];
                  uint32_t col;

                  for (col = 0; col < span; col += 4) {
                          const uint8_t *src = input + src_row_off + col * 4;
                          const uint32_t luma_top = luma_row_off + col;
                          const uint32_t luma_bottom =
                                  luma_top + out_linesize[0];
                          const uint32_t chroma_off = chroma_row_off + col;
                          __m128i top = _mm_load_si128((const __m128i*)src);
                          __m128i bottom = _mm_load_si128(
                                  (const __m128i*)(src + in_linesize));

                          /* 4 + 4 full-resolution bytes for the two rows. */
                          pack_shift(luma_dst, luma_top, luma_bottom,
                                          top, bottom, luma_byte_mask, 1);
                          /* 4 interleaved averaged bytes for the pair. */
                          pack_ch_1plane(chroma_dst, chroma_off,
                                          top, bottom, chroma_byte_mask);
                  }
          }
  }
  /*
   * Expands a three-plane image with half-resolution planes 1 and 2 into a
   * packed image with one 32-bit value per pixel.
   *
   *   input         input[0] is the full-resolution plane; input[1] and
   *                 input[2] hold one sample per 2x2 pixel block
   *   in_linesize   byte strides of the three input planes
   *   start_y/end_y output row range; rows are produced in pairs, pair index
   *                 running from start_y/2 up to (not including) end_y/2
   *   output        destination; each pixel is written as the 32-bit value
   *                 input[0] sample | (input[1] sample << 8)
   *                                 | (input[2] sample << 16)
   *   out_linesize  byte stride between output rows
   *
   * Output rows are addressed with out_linesize (the previous code used
   * in_linesize[0] there, which placed rows at the wrong offsets whenever the
   * two strides differ).  The per-row pixel count is limited both by the
   * input row (in_linesize[0] samples) and by the output row
   * (out_linesize / 4 pixels), so a narrow output row is never overrun.
   */
  void decompress_420(
                  const uint8_t *const input[], const uint32_t in_linesize[],
                  uint32_t start_y, uint32_t end_y,
                  uint8_t *output, uint32_t out_linesize)
  {
          const uint32_t start_y_d2 = start_y / 2;
          const uint32_t height_d2 = end_y / 2;
          uint32_t pixels = in_linesize[0];
          uint32_t width_d2;
          uint32_t y;

          /* An output pixel takes 4 bytes; clamp to what fits in one row. */
          if (out_linesize / 4 < pixels)
                  pixels = out_linesize / 4;
          width_d2 = pixels / 2;

          for (y = start_y_d2; y < height_d2; y++) {
                  const uint8_t *chroma0 = input[1] + y * in_linesize[1];
                  const uint8_t *chroma1 = input[2] + y * in_linesize[2];
                  const uint8_t *lum0 = input[0] + y * 2 * in_linesize[0];
                  const uint8_t *lum1 = lum0 + in_linesize[0];
                  uint32_t *output0 =
                          (uint32_t*)(output + y * 2 * out_linesize);
                  uint32_t *output1 =
                          (uint32_t*)((uint8_t*)output0 + out_linesize);
                  uint32_t x;

                  for (x = 0; x < width_d2; x++) {
                          /* One chroma pair is shared by a 2x2 block. */
                          const uint32_t out =
                                  ((uint32_t)*(chroma0++) << 8) |
                                  ((uint32_t)*(chroma1++) << 16);

                          *(output0++) = *(lum0++) | out;
                          *(output0++) = *(lum0++) | out;
                          *(output1++) = *(lum1++) | out;
                          *(output1++) = *(lum1++) | out;
                  }
          }
  }
  /*
   * Expands a two-plane image (full-resolution plane 0, half-resolution
   * plane 1 with two bytes per 2x2 pixel block) into a packed image with one
   * 32-bit value per pixel.
   *
   *   input         input[0] holds one byte per pixel; input[1] is read as
   *                 16-bit units, one per 2x2 block (rows of input[1] must
   *                 therefore be suitably aligned for uint16_t access)
   *   in_linesize   byte strides of the two input planes
   *   start_y/end_y output row range; rows are produced in pairs, pair index
   *                 running from start_y/2 up to (not including) end_y/2
   *   output        destination; each pixel is written as the 32-bit value
   *                 input[0] sample | (16-bit input[1] unit << 8)
   *   out_linesize  byte stride between output rows
   *
   * The per-row pixel count is limited both by the input row
   * (in_linesize[0] samples) and by the output row (out_linesize / 4 pixels).
   * The previous clamp compared in_linesize[0] against out_linesize directly,
   * ignoring the 4 bytes each output pixel takes, so it could not keep a
   * narrow output row from being overrun.
   */
  void decompress_nv12(
                  const uint8_t *const input[], const uint32_t in_linesize[],
                  uint32_t start_y, uint32_t end_y,
                  uint8_t *output, uint32_t out_linesize)
  {
          const uint32_t start_y_d2 = start_y / 2;
          const uint32_t height_d2 = end_y / 2;
          uint32_t pixels = in_linesize[0];
          uint32_t width_d2;
          uint32_t y;

          /* An output pixel takes 4 bytes; clamp to what fits in one row. */
          if (out_linesize / 4 < pixels)
                  pixels = out_linesize / 4;
          width_d2 = pixels / 2;

          for (y = start_y_d2; y < height_d2; y++) {
                  const uint16_t *chroma =
                          (const uint16_t*)(input[1] + y * in_linesize[1]);
                  const uint8_t *lum0 = input[0] + y * 2 * in_linesize[0];
                  const uint8_t *lum1 = lum0 + in_linesize[0];
                  uint32_t *output0 =
                          (uint32_t*)(output + y * 2 * out_linesize);
                  uint32_t *output1 =
                          (uint32_t*)((uint8_t*)output0 + out_linesize);
                  uint32_t x;

                  for (x = 0; x < width_d2; x++) {
                          /* One 16-bit chroma unit is shared by a 2x2 block. */
                          const uint32_t out = (uint32_t)*(chroma++) << 8;

                          *(output0++) = *(lum0++) | out;
                          *(output0++) = *(lum0++) | out;
                          *(output1++) = *(lum1++) | out;
                          *(output1++) = *(lum1++) | out;
                  }
          }
  }
  /*
   * Expands a packed image in which every 32-bit word describes two
   * horizontally adjacent pixels into a packed image with one 32-bit word
   * per pixel.
   *
   *   input         source image, read as 32-bit words
   *   in_linesize   byte stride between source rows
   *   start_y/end_y row range [start_y, end_y)
   *   output        destination, written as 32-bit words
   *   out_linesize  byte stride between output rows
   *   leading_lum   selects which bytes of the source word are the per-pixel
   *                 samples:
   *                   true  - bits 0..7 belong to the first pixel and bits
   *                           16..23 to the second
   *                   false - bits 8..15 belong to the first pixel and bits
   *                           24..31 to the second
   *
   * For every source word two output words are produced: the first is the
   * source word unchanged, the second is the source word with the first
   * pixel's sample byte replaced by the second pixel's sample byte.
   *
   * A source row holds in_linesize / 4 words and an output row has room for
   * out_linesize / 8 word pairs; the smaller of the two is processed per row.
   * (The previous count, min(in_linesize, out_linesize) / 2, was taken in
   * bytes but used as a word count, so every row read and wrote more than one
   * row's worth of data.)
   */
  void decompress_422(
                  const uint8_t *input, uint32_t in_linesize,
                  uint32_t start_y, uint32_t end_y,
                  uint8_t *output, uint32_t out_linesize,
                  bool leading_lum)
  {
          uint32_t words_per_row = in_linesize / 4;
          uint32_t y;

          if (out_linesize / 8 < words_per_row)
                  words_per_row = out_linesize / 8;

          /* The mode test is kept outside the loops on purpose. */
          if (leading_lum) {
                  for (y = start_y; y < end_y; y++) {
                          const uint32_t *input32 =
                                  (const uint32_t*)(input + y * in_linesize);
                          const uint32_t *input32_end =
                                  input32 + words_per_row;
                          uint32_t *output32 =
                                  (uint32_t*)(output + y * out_linesize);

                          while (input32 < input32_end) {
                                  uint32_t dw = *input32;

                                  output32[0] = dw;

                                  /* Copy bits 16..23 over bits 0..7. */
                                  dw &= 0xFFFFFF00;
                                  dw |= (uint8_t)(dw >> 16);
                                  output32[1] = dw;

                                  output32 += 2;
                                  input32++;
                          }
                  }
          } else {
                  for (y = start_y; y < end_y; y++) {
                          const uint32_t *input32 =
                                  (const uint32_t*)(input + y * in_linesize);
                          const uint32_t *input32_end =
                                  input32 + words_per_row;
                          uint32_t *output32 =
                                  (uint32_t*)(output + y * out_linesize);

                          while (input32 < input32_end) {
                                  uint32_t dw = *input32;

                                  output32[0] = dw;

                                  /* Copy bits 24..31 over bits 8..15. */
                                  dw &= 0xFFFF00FF;
                                  dw |= (dw >> 16) & 0xFF00;
                                  output32[1] = dw;

                                  output32 += 2;
                                  input32++;
                          }
                  }
          }
  }