format-conversion.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. /******************************************************************************
  2. Copyright (C) 2013 by Hugh Bailey <[email protected]>
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ******************************************************************************/
  14. #include "format-conversion.h"
  15. #include "../util/sse-intrin.h"
  16. /* ...surprisingly, if I don't use a macro to force inlining, it causes the
  17. * CPU usage to boost by a tremendous amount in debug builds. */
  18. #define get_m128_32_0(val) (*((uint32_t *)&val))
  19. #define get_m128_32_1(val) (*(((uint32_t *)&val) + 1))
/*
 * pack_shift - pull one byte out of every 32-bit element of two registers and
 * store the results as two packed groups of four bytes.
 *
 * line1 and line2 are each ANDed with mask, then the whole 128-bit value is
 * shifted right by sh bytes.  The two results are narrowed together, first
 * with _mm_packs_epi32 (32 -> 16 bit) and then with _mm_packus_epi16
 * (16 -> 8 bit), so the low 32 bits of the packed value hold the four bytes
 * taken from line1 and the following 32 bits hold the four bytes taken from
 * line2.  Those two words are written to lum_plane + lum_pos0 and
 * lum_plane + lum_pos1 respectively.
 *
 * The callers in this file pair a mask that keeps a single byte per element
 * with the matching byte shift (0x0000FF00 with 1, 0x00FF0000 with 2), which
 * moves the selected byte to the bottom of its element before packing.
 *
 * NOTE(review): the stores go through a uint32_t pointer into a byte plane;
 * this presumably relies on the target tolerating unaligned 32-bit stores --
 * confirm.
 */
#define pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, mask, sh) \
	do { \
		__m128i pack_val = _mm_packs_epi32( \
			_mm_srli_si128(_mm_and_si128(line1, mask), sh), \
			_mm_srli_si128(_mm_and_si128(line2, mask), sh)); \
		pack_val = _mm_packus_epi16(pack_val, pack_val); \
		\
		*(uint32_t *)(lum_plane + lum_pos0) = get_m128_32_0(pack_val); \
		*(uint32_t *)(lum_plane + lum_pos1) = get_m128_32_1(pack_val); \
	} while (false)
/*
 * pack_val - same as pack_shift but without the byte shift: the masked value
 * of every 32-bit element is packed as it stands.
 *
 * line1 and line2 are ANDed with mask and narrowed together with
 * _mm_packs_epi32 (32 -> 16 bit) followed by _mm_packus_epi16 (16 -> 8 bit).
 * The low 32 bits of the result (four bytes from line1) are written to
 * lum_plane + lum_pos0 and the next 32 bits (four bytes from line2) to
 * lum_plane + lum_pos1.
 *
 * The only caller in this file passes the mask 0x000000FF per element, i.e.
 * the byte that is already at the bottom of each element.
 */
#define pack_val(lum_plane, lum_pos0, lum_pos1, line1, line2, mask) \
	do { \
		__m128i pack_val = \
			_mm_packs_epi32(_mm_and_si128(line1, mask), \
					_mm_and_si128(line2, mask)); \
		pack_val = _mm_packus_epi16(pack_val, pack_val); \
		\
		*(uint32_t *)(lum_plane + lum_pos0) = get_m128_32_0(pack_val); \
		*(uint32_t *)(lum_plane + lum_pos1) = get_m128_32_1(pack_val); \
	} while (false)
/*
 * pack_ch_1plane - average the masked samples of a 4x2 pixel tile over 2x2
 * squares and store the four resulting bytes into a single plane.
 *
 * Steps, in the order written:
 *  1. AND line1 and line2 with uv_mask and add them (vertical sum).
 *  2. Add that sum to a copy of itself in which each adjacent pair of 32-bit
 *     elements is swapped (_MM_SHUFFLE(2, 3, 0, 1)); every element now holds
 *     the sum of elements 0+1 or 2+3.
 *  3. Shift every 16-bit lane right by 2, i.e. divide the four-sample sum
 *     by 4.
 *  4. Reorder the 32-bit elements to 0, 2, 1, 3 so the two distinct averages
 *     are adjacent in the low 64 bits, then narrow 16 -> 8 bit with
 *     _mm_packus_epi16.
 *  5. Write the low 32 bits to uv_plane + chroma_pos.
 *
 * The additions are 64-bit wide; with the 0x00FF-per-16-bit mask the caller
 * in this file passes, each 16-bit lane holds an 8-bit sample and the sum of
 * four samples cannot carry into the neighbouring lane.  With that mask each
 * 32-bit element carries two samples, so the four stored bytes are two
 * interleaved pairs of averages.
 */
#define pack_ch_1plane(uv_plane, chroma_pos, line1, line2, uv_mask) \
	do { \
		__m128i add_val = \
			_mm_add_epi64(_mm_and_si128(line1, uv_mask), \
				      _mm_and_si128(line2, uv_mask)); \
		__m128i avg_val = _mm_add_epi64( \
			add_val, \
			_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1))); \
		avg_val = _mm_srai_epi16(avg_val, 2); \
		avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
		avg_val = _mm_packus_epi16(avg_val, avg_val); \
		\
		*(uint32_t *)(uv_plane + chroma_pos) = get_m128_32_0(avg_val); \
	} while (false)
/*
 * pack_ch_2plane - average the masked samples of a 4x2 pixel tile over 2x2
 * squares and split the result between two planes.
 *
 * The averaging is the same sequence used by pack_ch_1plane: mask and add
 * the two lines, add the pair-swapped copy (_MM_SHUFFLE(2, 3, 0, 1)), shift
 * each 16-bit lane right by 2, and reorder the 32-bit elements to 0, 2, 1, 3.
 *
 * One extra step follows: _mm_shufflelo_epi16 with _MM_SHUFFLE(3, 1, 2, 0)
 * reorders the four low 16-bit lanes to 0, 2, 1, 3.  With the
 * 0x00FF-per-16-bit mask the caller in this file passes, this groups the two
 * averages of the first sample of each pixel ahead of the two averages of
 * the second sample.
 *
 * After narrowing 16 -> 8 bit with _mm_packus_epi16, the low 32 bits are read
 * into packed_vals; its low 16 bits are written to u_plane + chroma_pos and
 * its high 16 bits to v_plane + chroma_pos.
 */
#define pack_ch_2plane(u_plane, v_plane, chroma_pos, line1, line2, uv_mask) \
	do { \
		uint32_t packed_vals; \
		\
		__m128i add_val = \
			_mm_add_epi64(_mm_and_si128(line1, uv_mask), \
				      _mm_and_si128(line2, uv_mask)); \
		__m128i avg_val = _mm_add_epi64( \
			add_val, \
			_mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1))); \
		avg_val = _mm_srai_epi16(avg_val, 2); \
		avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
		avg_val = \
			_mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0)); \
		avg_val = _mm_packus_epi16(avg_val, avg_val); \
		\
		packed_vals = get_m128_32_0(avg_val); \
		\
		*(uint16_t *)(u_plane + chroma_pos) = (uint16_t)(packed_vals); \
		*(uint16_t *)(v_plane + chroma_pos) = \
			(uint16_t)(packed_vals >> 16); \
	} while (false)
  76. static FORCE_INLINE uint32_t min_uint32(uint32_t a, uint32_t b)
  77. {
  78. return a < b ? a : b;
  79. }
  80. void compress_uyvx_to_i420(const uint8_t *input, uint32_t in_linesize,
  81. uint32_t start_y, uint32_t end_y, uint8_t *output[],
  82. const uint32_t out_linesize[])
  83. {
  84. uint8_t *lum_plane = output[0];
  85. uint8_t *u_plane = output[1];
  86. uint8_t *v_plane = output[2];
  87. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  88. uint32_t y;
  89. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  90. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  91. for (y = start_y; y < end_y; y += 2) {
  92. uint32_t y_pos = y * in_linesize;
  93. uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
  94. uint32_t lum_y_pos = y * out_linesize[0];
  95. uint32_t x;
  96. for (x = 0; x < width; x += 4) {
  97. const uint8_t *img = input + y_pos + x * 4;
  98. uint32_t lum_pos0 = lum_y_pos + x;
  99. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  100. __m128i line1 = _mm_load_si128((const __m128i *)img);
  101. __m128i line2 = _mm_load_si128(
  102. (const __m128i *)(img + in_linesize));
  103. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2,
  104. lum_mask, 1);
  105. pack_ch_2plane(u_plane, v_plane,
  106. chroma_y_pos + (x >> 1), line1, line2,
  107. uv_mask);
  108. }
  109. }
  110. }
  111. void compress_uyvx_to_nv12(const uint8_t *input, uint32_t in_linesize,
  112. uint32_t start_y, uint32_t end_y, uint8_t *output[],
  113. const uint32_t out_linesize[])
  114. {
  115. uint8_t *lum_plane = output[0];
  116. uint8_t *chroma_plane = output[1];
  117. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  118. uint32_t y;
  119. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  120. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  121. for (y = start_y; y < end_y; y += 2) {
  122. uint32_t y_pos = y * in_linesize;
  123. uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
  124. uint32_t lum_y_pos = y * out_linesize[0];
  125. uint32_t x;
  126. for (x = 0; x < width; x += 4) {
  127. const uint8_t *img = input + y_pos + x * 4;
  128. uint32_t lum_pos0 = lum_y_pos + x;
  129. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  130. __m128i line1 = _mm_load_si128((const __m128i *)img);
  131. __m128i line2 = _mm_load_si128(
  132. (const __m128i *)(img + in_linesize));
  133. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2,
  134. lum_mask, 1);
  135. pack_ch_1plane(chroma_plane, chroma_y_pos + x, line1,
  136. line2, uv_mask);
  137. }
  138. }
  139. }
  140. void convert_uyvx_to_i444(const uint8_t *input, uint32_t in_linesize,
  141. uint32_t start_y, uint32_t end_y, uint8_t *output[],
  142. const uint32_t out_linesize[])
  143. {
  144. uint8_t *lum_plane = output[0];
  145. uint8_t *u_plane = output[1];
  146. uint8_t *v_plane = output[2];
  147. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  148. uint32_t y;
  149. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  150. __m128i u_mask = _mm_set1_epi32(0x000000FF);
  151. __m128i v_mask = _mm_set1_epi32(0x00FF0000);
  152. for (y = start_y; y < end_y; y += 2) {
  153. uint32_t y_pos = y * in_linesize;
  154. uint32_t lum_y_pos = y * out_linesize[0];
  155. uint32_t x;
  156. for (x = 0; x < width; x += 4) {
  157. const uint8_t *img = input + y_pos + x * 4;
  158. uint32_t lum_pos0 = lum_y_pos + x;
  159. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  160. __m128i line1 = _mm_load_si128((const __m128i *)img);
  161. __m128i line2 = _mm_load_si128(
  162. (const __m128i *)(img + in_linesize));
  163. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2,
  164. lum_mask, 1);
  165. pack_val(u_plane, lum_pos0, lum_pos1, line1, line2,
  166. u_mask);
  167. pack_shift(v_plane, lum_pos0, lum_pos1, line1, line2,
  168. v_mask, 2);
  169. }
  170. }
  171. }
  172. void decompress_420(const uint8_t *const input[], const uint32_t in_linesize[],
  173. uint32_t start_y, uint32_t end_y, uint8_t *output,
  174. uint32_t out_linesize)
  175. {
  176. uint32_t start_y_d2 = start_y / 2;
  177. uint32_t width_d2 = in_linesize[0] / 2;
  178. uint32_t height_d2 = end_y / 2;
  179. uint32_t y;
  180. for (y = start_y_d2; y < height_d2; y++) {
  181. const uint8_t *chroma0 = input[1] + y * in_linesize[1];
  182. const uint8_t *chroma1 = input[2] + y * in_linesize[2];
  183. register const uint8_t *lum0, *lum1;
  184. register uint32_t *output0, *output1;
  185. uint32_t x;
  186. lum0 = input[0] + y * 2 * in_linesize[0];
  187. lum1 = lum0 + in_linesize[0];
  188. output0 = (uint32_t *)(output + y * 2 * out_linesize);
  189. output1 = (uint32_t *)((uint8_t *)output0 + out_linesize);
  190. for (x = 0; x < width_d2; x++) {
  191. uint32_t out;
  192. out = (*(chroma0++) << 8) | *(chroma1++);
  193. *(output0++) = (*(lum0++) << 16) | out;
  194. *(output0++) = (*(lum0++) << 16) | out;
  195. *(output1++) = (*(lum1++) << 16) | out;
  196. *(output1++) = (*(lum1++) << 16) | out;
  197. }
  198. }
  199. }
  200. void decompress_nv12(const uint8_t *const input[], const uint32_t in_linesize[],
  201. uint32_t start_y, uint32_t end_y, uint8_t *output,
  202. uint32_t out_linesize)
  203. {
  204. uint32_t start_y_d2 = start_y / 2;
  205. uint32_t width_d2 = min_uint32(in_linesize[0], out_linesize) / 2;
  206. uint32_t height_d2 = end_y / 2;
  207. uint32_t y;
  208. for (y = start_y_d2; y < height_d2; y++) {
  209. const uint16_t *chroma;
  210. register const uint8_t *lum0, *lum1;
  211. register uint32_t *output0, *output1;
  212. uint32_t x;
  213. chroma = (const uint16_t *)(input[1] + y * in_linesize[1]);
  214. lum0 = input[0] + y * 2 * in_linesize[0];
  215. lum1 = lum0 + in_linesize[0];
  216. output0 = (uint32_t *)(output + y * 2 * out_linesize);
  217. output1 = (uint32_t *)((uint8_t *)output0 + out_linesize);
  218. for (x = 0; x < width_d2; x++) {
  219. uint32_t out = *(chroma++) << 8;
  220. *(output0++) = *(lum0++) | out;
  221. *(output0++) = *(lum0++) | out;
  222. *(output1++) = *(lum1++) | out;
  223. *(output1++) = *(lum1++) | out;
  224. }
  225. }
  226. }
  227. void decompress_422(const uint8_t *input, uint32_t in_linesize,
  228. uint32_t start_y, uint32_t end_y, uint8_t *output,
  229. uint32_t out_linesize, bool leading_lum)
  230. {
  231. uint32_t width_d2 = min_uint32(in_linesize, out_linesize) / 2;
  232. uint32_t y;
  233. register const uint32_t *input32;
  234. register const uint32_t *input32_end;
  235. register uint32_t *output32;
  236. if (leading_lum) {
  237. for (y = start_y; y < end_y; y++) {
  238. input32 = (const uint32_t *)(input + y * in_linesize);
  239. input32_end = input32 + width_d2;
  240. output32 = (uint32_t *)(output + y * out_linesize);
  241. while (input32 < input32_end) {
  242. register uint32_t dw = *input32;
  243. output32[0] = dw;
  244. dw &= 0xFFFFFF00;
  245. dw |= (uint8_t)(dw >> 16);
  246. output32[1] = dw;
  247. output32 += 2;
  248. input32++;
  249. }
  250. }
  251. } else {
  252. for (y = start_y; y < end_y; y++) {
  253. input32 = (const uint32_t *)(input + y * in_linesize);
  254. input32_end = input32 + width_d2;
  255. output32 = (uint32_t *)(output + y * out_linesize);
  256. while (input32 < input32_end) {
  257. register uint32_t dw = *input32;
  258. output32[0] = dw;
  259. dw &= 0xFFFF00FF;
  260. dw |= (dw >> 16) & 0xFF00;
  261. output32[1] = dw;
  262. output32 += 2;
  263. input32++;
  264. }
  265. }
  266. }
  267. }