format-conversion.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. /******************************************************************************
  2. Copyright (C) 2013 by Hugh Bailey <[email protected]>
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ******************************************************************************/
  14. #include "format-conversion.h"
  15. #include <xmmintrin.h>
  16. #include <emmintrin.h>
  17. /* ...surprisingly, if I don't use a macro to force inlining, it causes the
  18. * CPU usage to boost by a tremendous amount in debug builds. */
  19. #define get_m128_32_0(val) (*((uint32_t *)&val))
  20. #define get_m128_32_1(val) (*(((uint32_t *)&val) + 1))
  21. #define pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, mask, sh) \
  22. do { \
  23. __m128i pack_val = _mm_packs_epi32( \
  24. _mm_srli_si128(_mm_and_si128(line1, mask), sh), \
  25. _mm_srli_si128(_mm_and_si128(line2, mask), sh)); \
  26. pack_val = _mm_packus_epi16(pack_val, pack_val); \
  27. \
  28. *(uint32_t *)(lum_plane + lum_pos0) = get_m128_32_0(pack_val); \
  29. *(uint32_t *)(lum_plane + lum_pos1) = get_m128_32_1(pack_val); \
  30. } while (false)
  31. #define pack_val(lum_plane, lum_pos0, lum_pos1, line1, line2, mask) \
  32. do { \
  33. __m128i pack_val = \
  34. _mm_packs_epi32(_mm_and_si128(line1, mask), \
  35. _mm_and_si128(line2, mask)); \
  36. pack_val = _mm_packus_epi16(pack_val, pack_val); \
  37. \
  38. *(uint32_t *)(lum_plane + lum_pos0) = get_m128_32_0(pack_val); \
  39. *(uint32_t *)(lum_plane + lum_pos1) = get_m128_32_1(pack_val); \
  40. } while (false)
  41. #define pack_ch_1plane(uv_plane, chroma_pos, line1, line2, uv_mask) \
  42. do { \
  43. __m128i add_val = \
  44. _mm_add_epi64(_mm_and_si128(line1, uv_mask), \
  45. _mm_and_si128(line2, uv_mask)); \
  46. __m128i avg_val = _mm_add_epi64( \
  47. add_val, \
  48. _mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1))); \
  49. avg_val = _mm_srai_epi16(avg_val, 2); \
  50. avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
  51. avg_val = _mm_packus_epi16(avg_val, avg_val); \
  52. \
  53. *(uint32_t *)(uv_plane + chroma_pos) = get_m128_32_0(avg_val); \
  54. } while (false)
  55. #define pack_ch_2plane(u_plane, v_plane, chroma_pos, line1, line2, uv_mask) \
  56. do { \
  57. uint32_t packed_vals; \
  58. \
  59. __m128i add_val = \
  60. _mm_add_epi64(_mm_and_si128(line1, uv_mask), \
  61. _mm_and_si128(line2, uv_mask)); \
  62. __m128i avg_val = _mm_add_epi64( \
  63. add_val, \
  64. _mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1))); \
  65. avg_val = _mm_srai_epi16(avg_val, 2); \
  66. avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
  67. avg_val = \
  68. _mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
  69. avg_val = _mm_packus_epi16(avg_val, avg_val); \
  70. \
  71. packed_vals = get_m128_32_0(avg_val); \
  72. \
  73. *(uint16_t *)(u_plane + chroma_pos) = (uint16_t)(packed_vals); \
  74. *(uint16_t *)(v_plane + chroma_pos) = \
  75. (uint16_t)(packed_vals >> 16); \
  76. } while (false)
  77. static FORCE_INLINE uint32_t min_uint32(uint32_t a, uint32_t b)
  78. {
  79. return a < b ? a : b;
  80. }
  81. void compress_uyvx_to_i420(const uint8_t *input, uint32_t in_linesize,
  82. uint32_t start_y, uint32_t end_y, uint8_t *output[],
  83. const uint32_t out_linesize[])
  84. {
  85. uint8_t *lum_plane = output[0];
  86. uint8_t *u_plane = output[1];
  87. uint8_t *v_plane = output[2];
  88. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  89. uint32_t y;
  90. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  91. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  92. for (y = start_y; y < end_y; y += 2) {
  93. uint32_t y_pos = y * in_linesize;
  94. uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
  95. uint32_t lum_y_pos = y * out_linesize[0];
  96. uint32_t x;
  97. for (x = 0; x < width; x += 4) {
  98. const uint8_t *img = input + y_pos + x * 4;
  99. uint32_t lum_pos0 = lum_y_pos + x;
  100. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  101. __m128i line1 = _mm_load_si128((const __m128i *)img);
  102. __m128i line2 = _mm_load_si128(
  103. (const __m128i *)(img + in_linesize));
  104. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2,
  105. lum_mask, 1);
  106. pack_ch_2plane(u_plane, v_plane,
  107. chroma_y_pos + (x >> 1), line1, line2,
  108. uv_mask);
  109. }
  110. }
  111. }
  112. void compress_uyvx_to_nv12(const uint8_t *input, uint32_t in_linesize,
  113. uint32_t start_y, uint32_t end_y, uint8_t *output[],
  114. const uint32_t out_linesize[])
  115. {
  116. uint8_t *lum_plane = output[0];
  117. uint8_t *chroma_plane = output[1];
  118. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  119. uint32_t y;
  120. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  121. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  122. for (y = start_y; y < end_y; y += 2) {
  123. uint32_t y_pos = y * in_linesize;
  124. uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
  125. uint32_t lum_y_pos = y * out_linesize[0];
  126. uint32_t x;
  127. for (x = 0; x < width; x += 4) {
  128. const uint8_t *img = input + y_pos + x * 4;
  129. uint32_t lum_pos0 = lum_y_pos + x;
  130. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  131. __m128i line1 = _mm_load_si128((const __m128i *)img);
  132. __m128i line2 = _mm_load_si128(
  133. (const __m128i *)(img + in_linesize));
  134. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2,
  135. lum_mask, 1);
  136. pack_ch_1plane(chroma_plane, chroma_y_pos + x, line1,
  137. line2, uv_mask);
  138. }
  139. }
  140. }
  141. void convert_uyvx_to_i444(const uint8_t *input, uint32_t in_linesize,
  142. uint32_t start_y, uint32_t end_y, uint8_t *output[],
  143. const uint32_t out_linesize[])
  144. {
  145. uint8_t *lum_plane = output[0];
  146. uint8_t *u_plane = output[1];
  147. uint8_t *v_plane = output[2];
  148. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  149. uint32_t y;
  150. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  151. __m128i u_mask = _mm_set1_epi32(0x000000FF);
  152. __m128i v_mask = _mm_set1_epi32(0x00FF0000);
  153. for (y = start_y; y < end_y; y += 2) {
  154. uint32_t y_pos = y * in_linesize;
  155. uint32_t lum_y_pos = y * out_linesize[0];
  156. uint32_t x;
  157. for (x = 0; x < width; x += 4) {
  158. const uint8_t *img = input + y_pos + x * 4;
  159. uint32_t lum_pos0 = lum_y_pos + x;
  160. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  161. __m128i line1 = _mm_load_si128((const __m128i *)img);
  162. __m128i line2 = _mm_load_si128(
  163. (const __m128i *)(img + in_linesize));
  164. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2,
  165. lum_mask, 1);
  166. pack_val(u_plane, lum_pos0, lum_pos1, line1, line2,
  167. u_mask);
  168. pack_shift(v_plane, lum_pos0, lum_pos1, line1, line2,
  169. v_mask, 2);
  170. }
  171. }
  172. }
  173. void decompress_420(const uint8_t *const input[], const uint32_t in_linesize[],
  174. uint32_t start_y, uint32_t end_y, uint8_t *output,
  175. uint32_t out_linesize)
  176. {
  177. uint32_t start_y_d2 = start_y / 2;
  178. uint32_t width_d2 = in_linesize[0] / 2;
  179. uint32_t height_d2 = end_y / 2;
  180. uint32_t y;
  181. for (y = start_y_d2; y < height_d2; y++) {
  182. const uint8_t *chroma0 = input[1] + y * in_linesize[1];
  183. const uint8_t *chroma1 = input[2] + y * in_linesize[2];
  184. register const uint8_t *lum0, *lum1;
  185. register uint32_t *output0, *output1;
  186. uint32_t x;
  187. lum0 = input[0] + y * 2 * in_linesize[0];
  188. lum1 = lum0 + in_linesize[0];
  189. output0 = (uint32_t *)(output + y * 2 * out_linesize);
  190. output1 = (uint32_t *)((uint8_t *)output0 + out_linesize);
  191. for (x = 0; x < width_d2; x++) {
  192. uint32_t out;
  193. out = (*(chroma0++) << 8) | *(chroma1++);
  194. *(output0++) = (*(lum0++) << 16) | out;
  195. *(output0++) = (*(lum0++) << 16) | out;
  196. *(output1++) = (*(lum1++) << 16) | out;
  197. *(output1++) = (*(lum1++) << 16) | out;
  198. }
  199. }
  200. }
  201. void decompress_nv12(const uint8_t *const input[], const uint32_t in_linesize[],
  202. uint32_t start_y, uint32_t end_y, uint8_t *output,
  203. uint32_t out_linesize)
  204. {
  205. uint32_t start_y_d2 = start_y / 2;
  206. uint32_t width_d2 = min_uint32(in_linesize[0], out_linesize) / 2;
  207. uint32_t height_d2 = end_y / 2;
  208. uint32_t y;
  209. for (y = start_y_d2; y < height_d2; y++) {
  210. const uint16_t *chroma;
  211. register const uint8_t *lum0, *lum1;
  212. register uint32_t *output0, *output1;
  213. uint32_t x;
  214. chroma = (const uint16_t *)(input[1] + y * in_linesize[1]);
  215. lum0 = input[0] + y * 2 * in_linesize[0];
  216. lum1 = lum0 + in_linesize[0];
  217. output0 = (uint32_t *)(output + y * 2 * out_linesize);
  218. output1 = (uint32_t *)((uint8_t *)output0 + out_linesize);
  219. for (x = 0; x < width_d2; x++) {
  220. uint32_t out = *(chroma++) << 8;
  221. *(output0++) = *(lum0++) | out;
  222. *(output0++) = *(lum0++) | out;
  223. *(output1++) = *(lum1++) | out;
  224. *(output1++) = *(lum1++) | out;
  225. }
  226. }
  227. }
  228. void decompress_422(const uint8_t *input, uint32_t in_linesize,
  229. uint32_t start_y, uint32_t end_y, uint8_t *output,
  230. uint32_t out_linesize, bool leading_lum)
  231. {
  232. uint32_t width_d2 = min_uint32(in_linesize, out_linesize) / 2;
  233. uint32_t y;
  234. register const uint32_t *input32;
  235. register const uint32_t *input32_end;
  236. register uint32_t *output32;
  237. if (leading_lum) {
  238. for (y = start_y; y < end_y; y++) {
  239. input32 = (const uint32_t *)(input + y * in_linesize);
  240. input32_end = input32 + width_d2;
  241. output32 = (uint32_t *)(output + y * out_linesize);
  242. while (input32 < input32_end) {
  243. register uint32_t dw = *input32;
  244. output32[0] = dw;
  245. dw &= 0xFFFFFF00;
  246. dw |= (uint8_t)(dw >> 16);
  247. output32[1] = dw;
  248. output32 += 2;
  249. input32++;
  250. }
  251. }
  252. } else {
  253. for (y = start_y; y < end_y; y++) {
  254. input32 = (const uint32_t *)(input + y * in_linesize);
  255. input32_end = input32 + width_d2;
  256. output32 = (uint32_t *)(output + y * out_linesize);
  257. while (input32 < input32_end) {
  258. register uint32_t dw = *input32;
  259. output32[0] = dw;
  260. dw &= 0xFFFF00FF;
  261. dw |= (dw >> 16) & 0xFF00;
  262. output32[1] = dw;
  263. output32 += 2;
  264. input32++;
  265. }
  266. }
  267. }
  268. }