format-conversion.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. /******************************************************************************
  2. Copyright (C) 2023 by Lain Bailey <[email protected]>
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ******************************************************************************/
  14. #include "format-conversion.h"
  15. #include "../util/sse-intrin.h"
  /* Extract the 32-bit lane 0 (bits 0..31) or lane 1 (bits 32..63) of an
   * __m128i value as a uint32_t.
   *
   * These are deliberately macros: ...surprisingly, without a macro to force
   * inlining, CPU usage rises by a tremendous amount in debug builds.
   *
   * The lanes are read with SSE2 intrinsics instead of casting the address of
   * the vector to uint32_t *. That avoids accessing the vector through an
   * incompatible pointer type, and `val` may be any __m128i expression rather
   * than only an addressable variable. */
  #define get_m128_32_0(val) ((uint32_t)_mm_cvtsi128_si32(val))
  #define get_m128_32_1(val) ((uint32_t)_mm_cvtsi128_si32(_mm_srli_si128((val), 4)))
  /* Masks both 128-bit lines, shifts each whole register right by `sh` bytes,
   * narrows the 32-bit lanes to bytes with saturation and stores four bytes per
   * line.
   *
   *   lum_plane           byte pointer to the destination plane
   *   lum_pos0, lum_pos1  byte offsets into lum_plane for the line1 and line2
   *                       results
   *   line1, line2        __m128i values holding four 32-bit lanes each
   *   mask                __m128i ANDed with each line before the shift
   *   sh                  byte count for _mm_srli_si128; has to be a
   *                       compile-time constant
   *
   * lum_plane is evaluated twice. Every parameter is parenthesized so that
   * arbitrary expressions can be passed, and the temporary has a macro-specific
   * name so it cannot hide a caller variable used in an argument. */
  #define pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, mask, sh) \
      do { \
          __m128i pack_shift_px_ = \
              _mm_packs_epi32(_mm_srli_si128(_mm_and_si128((line1), (mask)), (sh)), \
                              _mm_srli_si128(_mm_and_si128((line2), (mask)), (sh))); \
          pack_shift_px_ = _mm_packus_epi16(pack_shift_px_, pack_shift_px_); \
          \
          *(uint32_t *)((lum_plane) + (lum_pos0)) = get_m128_32_0(pack_shift_px_); \
          *(uint32_t *)((lum_plane) + (lum_pos1)) = get_m128_32_1(pack_shift_px_); \
      } while (false)
  /* Same as pack_shift but without the byte shift: masks both 128-bit lines,
   * narrows the 32-bit lanes to bytes with saturation and stores four bytes per
   * line. Intended for masks that already select the low byte of every lane.
   *
   *   lum_plane           byte pointer to the destination plane
   *   lum_pos0, lum_pos1  byte offsets into lum_plane for the line1 and line2
   *                       results
   *   line1, line2        __m128i values holding four 32-bit lanes each
   *   mask                __m128i ANDed with each line
   *
   * lum_plane is evaluated twice. Parameters are parenthesized so arbitrary
   * expressions can be passed; the temporary no longer shares the macro's own
   * name. */
  #define pack_val(lum_plane, lum_pos0, lum_pos1, line1, line2, mask) \
      do { \
          __m128i pack_val_px_ = \
              _mm_packs_epi32(_mm_and_si128((line1), (mask)), _mm_and_si128((line2), (mask))); \
          pack_val_px_ = _mm_packus_epi16(pack_val_px_, pack_val_px_); \
          \
          *(uint32_t *)((lum_plane) + (lum_pos0)) = get_m128_32_0(pack_val_px_); \
          *(uint32_t *)((lum_plane) + (lum_pos1)) = get_m128_32_1(pack_val_px_); \
      } while (false)
  /* Averages the masked 16-bit fields of a 2x2 pixel neighbourhood and stores
   * the results interleaved in a single plane.
   *
   * Steps: mask both lines and add them (vertical sum), add every 32-bit lane
   * to its horizontal neighbour (the 2,3,0,1 shuffle swaps adjacent lanes),
   * shift each 16-bit field right by 2 (divide the four-sample sum by 4,
   * truncating), move lanes 0 and 2 into the low 64 bits, narrow to bytes and
   * store the low 4 bytes at uv_plane + chroma_pos.
   *
   * With the 0x00FF-per-16-bit mask passed by the callers in this file the four
   * stored bytes are: pair 0 low field, pair 0 high field, pair 1 low field,
   * pair 1 high field.
   *
   *   uv_plane      byte pointer to the destination plane
   *   chroma_pos    byte offset into uv_plane
   *   line1, line2  __m128i values holding four 32-bit pixels each
   *   uv_mask       __m128i selecting the fields to average
   *
   * Parameters are parenthesized so arbitrary expressions can be passed. */
  #define pack_ch_1plane(uv_plane, chroma_pos, line1, line2, uv_mask) \
      do { \
          __m128i ch1_sum_ = _mm_add_epi64(_mm_and_si128((line1), (uv_mask)), \
                                           _mm_and_si128((line2), (uv_mask))); \
          __m128i ch1_avg_ = \
              _mm_add_epi64(ch1_sum_, _mm_shuffle_epi32(ch1_sum_, _MM_SHUFFLE(2, 3, 0, 1))); \
          ch1_avg_ = _mm_srai_epi16(ch1_avg_, 2); \
          ch1_avg_ = _mm_shuffle_epi32(ch1_avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
          ch1_avg_ = _mm_packus_epi16(ch1_avg_, ch1_avg_); \
          \
          *(uint32_t *)((uv_plane) + (chroma_pos)) = get_m128_32_0(ch1_avg_); \
      } while (false)
  /* Averages the masked 16-bit fields of a 2x2 pixel neighbourhood and stores
   * the two kinds of field in two separate planes.
   *
   * Steps: mask both lines and add them (vertical sum), add every 32-bit lane
   * to its horizontal neighbour (the 2,3,0,1 shuffle swaps adjacent lanes),
   * shift each 16-bit field right by 2 (divide the four-sample sum by 4,
   * truncating), move lanes 0 and 2 into the low 64 bits, then reorder the low
   * four 16-bit fields so both low fields come first and both high fields
   * second. After narrowing to bytes, the low two bytes go to u_plane and the
   * next two bytes go to v_plane, both at offset chroma_pos.
   *
   *   u_plane, v_plane  byte pointers to the two destination planes
   *   chroma_pos        byte offset used in both planes (evaluated twice)
   *   line1, line2      __m128i values holding four 32-bit pixels each
   *   uv_mask           __m128i selecting the fields to average
   *
   * Parameters are parenthesized so arbitrary expressions can be passed. */
  #define pack_ch_2plane(u_plane, v_plane, chroma_pos, line1, line2, uv_mask) \
      do { \
          uint32_t ch2_bytes_; \
          \
          __m128i ch2_sum_ = _mm_add_epi64(_mm_and_si128((line1), (uv_mask)), \
                                           _mm_and_si128((line2), (uv_mask))); \
          __m128i ch2_avg_ = \
              _mm_add_epi64(ch2_sum_, _mm_shuffle_epi32(ch2_sum_, _MM_SHUFFLE(2, 3, 0, 1))); \
          ch2_avg_ = _mm_srai_epi16(ch2_avg_, 2); \
          ch2_avg_ = _mm_shuffle_epi32(ch2_avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
          ch2_avg_ = _mm_shufflelo_epi16(ch2_avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
          ch2_avg_ = _mm_packus_epi16(ch2_avg_, ch2_avg_); \
          \
          ch2_bytes_ = get_m128_32_0(ch2_avg_); \
          \
          *(uint16_t *)((u_plane) + (chroma_pos)) = (uint16_t)(ch2_bytes_); \
          *(uint16_t *)((v_plane) + (chroma_pos)) = (uint16_t)(ch2_bytes_ >> 16); \
      } while (false)
  63. static FORCE_INLINE uint32_t min_uint32(uint32_t a, uint32_t b)
  64. {
  65. return a < b ? a : b;
  66. }
  67. void compress_uyvx_to_i420(const uint8_t *input, uint32_t in_linesize, uint32_t start_y, uint32_t end_y,
  68. uint8_t *output[], const uint32_t out_linesize[])
  69. {
  70. uint8_t *lum_plane = output[0];
  71. uint8_t *u_plane = output[1];
  72. uint8_t *v_plane = output[2];
  73. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  74. uint32_t y;
  75. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  76. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  77. for (y = start_y; y < end_y; y += 2) {
  78. uint32_t y_pos = y * in_linesize;
  79. uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
  80. uint32_t lum_y_pos = y * out_linesize[0];
  81. uint32_t x;
  82. for (x = 0; x < width; x += 4) {
  83. const uint8_t *img = input + y_pos + x * 4;
  84. uint32_t lum_pos0 = lum_y_pos + x;
  85. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  86. __m128i line1 = _mm_load_si128((const __m128i *)img);
  87. __m128i line2 = _mm_load_si128((const __m128i *)(img + in_linesize));
  88. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, lum_mask, 1);
  89. pack_ch_2plane(u_plane, v_plane, chroma_y_pos + (x >> 1), line1, line2, uv_mask);
  90. }
  91. }
  92. }
  93. void compress_uyvx_to_nv12(const uint8_t *input, uint32_t in_linesize, uint32_t start_y, uint32_t end_y,
  94. uint8_t *output[], const uint32_t out_linesize[])
  95. {
  96. uint8_t *lum_plane = output[0];
  97. uint8_t *chroma_plane = output[1];
  98. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  99. uint32_t y;
  100. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  101. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  102. for (y = start_y; y < end_y; y += 2) {
  103. uint32_t y_pos = y * in_linesize;
  104. uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
  105. uint32_t lum_y_pos = y * out_linesize[0];
  106. uint32_t x;
  107. for (x = 0; x < width; x += 4) {
  108. const uint8_t *img = input + y_pos + x * 4;
  109. uint32_t lum_pos0 = lum_y_pos + x;
  110. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  111. __m128i line1 = _mm_load_si128((const __m128i *)img);
  112. __m128i line2 = _mm_load_si128((const __m128i *)(img + in_linesize));
  113. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, lum_mask, 1);
  114. pack_ch_1plane(chroma_plane, chroma_y_pos + x, line1, line2, uv_mask);
  115. }
  116. }
  117. }
  118. void convert_uyvx_to_i444(const uint8_t *input, uint32_t in_linesize, uint32_t start_y, uint32_t end_y,
  119. uint8_t *output[], const uint32_t out_linesize[])
  120. {
  121. uint8_t *lum_plane = output[0];
  122. uint8_t *u_plane = output[1];
  123. uint8_t *v_plane = output[2];
  124. uint32_t width = min_uint32(in_linesize, out_linesize[0]);
  125. uint32_t y;
  126. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  127. __m128i u_mask = _mm_set1_epi32(0x000000FF);
  128. __m128i v_mask = _mm_set1_epi32(0x00FF0000);
  129. for (y = start_y; y < end_y; y += 2) {
  130. uint32_t y_pos = y * in_linesize;
  131. uint32_t lum_y_pos = y * out_linesize[0];
  132. uint32_t x;
  133. for (x = 0; x < width; x += 4) {
  134. const uint8_t *img = input + y_pos + x * 4;
  135. uint32_t lum_pos0 = lum_y_pos + x;
  136. uint32_t lum_pos1 = lum_pos0 + out_linesize[0];
  137. __m128i line1 = _mm_load_si128((const __m128i *)img);
  138. __m128i line2 = _mm_load_si128((const __m128i *)(img + in_linesize));
  139. pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, lum_mask, 1);
  140. pack_val(u_plane, lum_pos0, lum_pos1, line1, line2, u_mask);
  141. pack_shift(v_plane, lum_pos0, lum_pos1, line1, line2, v_mask, 2);
  142. }
  143. }
  144. }
  /* Expands three planes (full-resolution luma plus two chroma planes
   * subsampled 2x2) into packed 32-bit pixels.
   *
   * Every output pixel is the 32-bit value
   *     (Y << 16) | (input[1] sample << 8) | (input[2] sample)
   * and each chroma sample pair is shared by a 2x2 block of luma samples.
   *
   *   input         input[0] luma, input[1] and input[2] chroma planes
   *   in_linesize   row pitch in bytes of each input plane
   *   start_y/end_y luma row range; both are halved (rounding down) and the
   *                 rows are processed in pairs
   *   output        destination, written as 32-bit words
   *   out_linesize  destination row pitch in bytes
   *
   * The width is clamped to what fits into one output row (out_linesize / 4
   * pixels, since each pixel takes four output bytes), so an input pitch that
   * is wider than the output cannot write past the end of an output row. */
  void decompress_420(const uint8_t *const input[], const uint32_t in_linesize[], uint32_t start_y, uint32_t end_y,
                      uint8_t *output, uint32_t out_linesize)
  {
      const uint32_t first_pair = start_y / 2;
      const uint32_t end_pair = end_y / 2;
      const uint32_t in_width = in_linesize[0];
      const uint32_t out_width = out_linesize / 4;
      /* Number of horizontal pixel pairs to convert per row. */
      const uint32_t width_d2 = (in_width < out_width ? in_width : out_width) / 2;

      for (uint32_t y = first_pair; y < end_pair; y++) {
          const uint8_t *chroma0 = input[1] + y * in_linesize[1];
          const uint8_t *chroma1 = input[2] + y * in_linesize[2];
          const uint8_t *lum0 = input[0] + y * 2 * in_linesize[0];
          const uint8_t *lum1 = lum0 + in_linesize[0];
          uint32_t *output0 = (uint32_t *)(output + y * 2 * out_linesize);
          uint32_t *output1 = (uint32_t *)((uint8_t *)output0 + out_linesize);

          for (uint32_t x = 0; x < width_d2; x++) {
              /* One chroma pair serves two pixels on each of the two rows. */
              const uint32_t chroma = ((uint32_t)*(chroma0++) << 8) | *(chroma1++);

              *(output0++) = ((uint32_t)*(lum0++) << 16) | chroma;
              *(output0++) = ((uint32_t)*(lum0++) << 16) | chroma;
              *(output1++) = ((uint32_t)*(lum1++) << 16) | chroma;
              *(output1++) = ((uint32_t)*(lum1++) << 16) | chroma;
          }
      }
  }
  /* Expands a luma plane plus one plane of 16-bit chroma words (subsampled 2x2)
   * into packed 32-bit pixels.
   *
   * Every output pixel is the 32-bit value  Y | (chroma_word << 8), where
   * chroma_word is read from input[1] as a native uint16_t; each chroma word is
   * shared by a 2x2 block of luma samples.
   *
   *   input         input[0] luma, input[1] chroma words
   *   in_linesize   row pitch in bytes of each input plane
   *   start_y/end_y luma row range; both are halved (rounding down) and the
   *                 rows are processed in pairs
   *   output        destination, written as 32-bit words
   *   out_linesize  destination row pitch in bytes
   *
   * min(in_linesize[0], out_linesize) / 2 horizontal pixel pairs are converted
   * per row. */
  void decompress_nv12(const uint8_t *const input[], const uint32_t in_linesize[], uint32_t start_y, uint32_t end_y,
                       uint8_t *output, uint32_t out_linesize)
  {
      const uint32_t first_pair_row = start_y / 2;
      const uint32_t pairs_per_row = min_uint32(in_linesize[0], out_linesize) / 2;
      const uint32_t end_pair_row = end_y / 2;

      for (uint32_t pair_row = first_pair_row; pair_row < end_pair_row; pair_row++) {
          const uint16_t *const uv = (const uint16_t *)(input[1] + pair_row * in_linesize[1]);
          const uint8_t *const luma_top = input[0] + pair_row * 2 * in_linesize[0];
          const uint8_t *const luma_bottom = luma_top + in_linesize[0];
          uint32_t *const dst_top = (uint32_t *)(output + pair_row * 2 * out_linesize);
          uint32_t *const dst_bottom = (uint32_t *)((uint8_t *)dst_top + out_linesize);

          for (uint32_t i = 0; i < pairs_per_row; i++) {
              const uint32_t left = 2 * i;
              const uint32_t right = left + 1;
              /* Chroma word placed above the luma byte of the pixel. */
              const uint32_t uv_bits = uv[i] << 8;

              dst_top[left] = luma_top[left] | uv_bits;
              dst_top[right] = luma_top[right] | uv_bits;
              dst_bottom[left] = luma_bottom[left] | uv_bits;
              dst_bottom[right] = luma_bottom[right] | uv_bits;
          }
      }
  }
  /* Expands packed data in which one 32-bit word describes two pixels into two
   * 32-bit output words per input word.
   *
   * The first output word is a verbatim copy of the input word. The second is
   * the same word with one byte replaced:
   *   leading_lum == true   bits 0..7  are replaced by bits 16..23
   *   leading_lum == false  bits 8..15 are replaced by bits 24..31
   *
   *   input         source, read as 32-bit words; rows are in_linesize bytes
   *                 apart
   *   start_y/end_y row range [start_y, end_y)
   *   output        destination, written as 32-bit words; rows are
   *                 out_linesize bytes apart
   *   leading_lum   selects which byte of the word is swapped in (see above)
   *
   * min(in_linesize, out_linesize) / 2 input words are converted per row. */
  void decompress_422(const uint8_t *input, uint32_t in_linesize, uint32_t start_y, uint32_t end_y, uint8_t *output,
                      uint32_t out_linesize, bool leading_lum)
  {
      const uint32_t words_per_row = min_uint32(in_linesize, out_linesize) / 2;

      for (uint32_t row = start_y; row < end_y; row++) {
          const uint32_t *const src = (const uint32_t *)(input + row * in_linesize);
          uint32_t *const dst = (uint32_t *)(output + row * out_linesize);

          for (uint32_t i = 0; i < words_per_row; i++) {
              const uint32_t word = src[i];
              uint32_t second;

              dst[2 * i] = word;

              if (leading_lum) {
                  second = (word & 0xFFFFFF00) | ((word >> 16) & 0xFF);
              } else {
                  second = (word & 0xFFFF00FF) | ((word >> 16) & 0xFF00);
              }

              dst[2 * i + 1] = second;
          }
      }
  }