1
0

format-conversion.c 11 KB


  1. /******************************************************************************
  2. Copyright (C) 2013 by Hugh Bailey <[email protected]>
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ******************************************************************************/
  14. #include "format-conversion.h"
  15. #include <xmmintrin.h>
  16. #include <emmintrin.h>
  /* Lane accessors for __m128i values.
   *
   * These are kept as macros on purpose: the original author observed a very
   * large CPU-usage increase in debug builds when this code was not forcibly
   * inlined.
   *
   * get_m128_32_0(val) yields bits 0..31 of val and get_m128_32_1(val) yields
   * bits 32..63, both as uint32_t. The lanes are extracted with SSE2
   * intrinsics rather than by casting the address of val to uint32_t*, so val
   * may be any __m128i expression and no object is read through a pointer of
   * a different type. The results are values, not lvalues. */
  #define get_m128_32_0(val) ((uint32_t)_mm_cvtsi128_si32(val))
  #define get_m128_32_1(val) \
      ((uint32_t)_mm_cvtsi128_si32(_mm_srli_si128((val), 4)))
  /* pack_shift: extract one byte per 32-bit lane from two rows and store it.
   *
   * line1 and line2 are each ANDed with mask, then the whole 128-bit value is
   * shifted right by sh BYTES (sh must be a compile-time constant). The
   * 32-bit lanes are narrowed to 16 bits with signed saturation and then to
   * 8 bits with unsigned saturation, which leaves four bytes for line1
   * followed by four bytes for line2. The line1 bytes are written as one
   * 32-bit store at lum_plane + lum_pos0 and the line2 bytes at
   * lum_plane + lum_pos1.
   *
   * lum_plane is expanded twice; every other argument is expanded once. All
   * arguments are parenthesised so arbitrary expressions can be passed. */
  #define pack_shift(lum_plane, lum_pos0, lum_pos1, line1, line2, mask, sh) \
  do { \
      __m128i ps_packed_ = _mm_packs_epi32( \
          _mm_srli_si128(_mm_and_si128((line1), (mask)), (sh)), \
          _mm_srli_si128(_mm_and_si128((line2), (mask)), (sh))); \
      ps_packed_ = _mm_packus_epi16(ps_packed_, ps_packed_); \
      \
      *(uint32_t*)((lum_plane) + (lum_pos0)) = get_m128_32_0(ps_packed_); \
      *(uint32_t*)((lum_plane) + (lum_pos1)) = get_m128_32_1(ps_packed_); \
  } while (0)
  /* pack_val: same as pack_shift but without the byte shift.
   *
   * line1 and line2 are each ANDed with mask. The 32-bit lanes are narrowed
   * to 16 bits with signed saturation and then to 8 bits with unsigned
   * saturation, so mask is expected to keep only the low byte of each lane.
   * The four bytes from line1 are written as one 32-bit store at
   * lum_plane + lum_pos0 and the four bytes from line2 at
   * lum_plane + lum_pos1.
   *
   * lum_plane is expanded twice; every other argument is expanded once. All
   * arguments are parenthesised so arbitrary expressions can be passed. */
  #define pack_val(lum_plane, lum_pos0, lum_pos1, line1, line2, mask) \
  do { \
      __m128i pv_packed_ = _mm_packs_epi32( \
          _mm_and_si128((line1), (mask)), \
          _mm_and_si128((line2), (mask))); \
      pv_packed_ = _mm_packus_epi16(pv_packed_, pv_packed_); \
      \
      *(uint32_t*)((lum_plane) + (lum_pos0)) = get_m128_32_0(pv_packed_); \
      *(uint32_t*)((lum_plane) + (lum_pos1)) = get_m128_32_1(pv_packed_); \
  } while (0)
  /* pack_ch_1plane: average chroma over 2x2 pixel blocks into one plane.
   *
   * line1 and line2 (four 32-bit pixels each) are masked with uv_mask and
   * added together, then every 32-bit lane is added to its horizontal
   * neighbour (lanes 0<->1 and 2<->3). Each 16-bit lane is shifted right by
   * 2, i.e. the four-sample sum is divided by 4 without rounding. The low
   * and high 16-bit halves of lane 0 and of lane 2 are then narrowed to
   * bytes and written, in that order, as one 32-bit store at
   * uv_plane + chroma_pos.
   *
   * The additions are 64-bit wide; with a mask that keeps at most 8 bits per
   * 16-bit lane (as the callers in this file pass) no carry can cross a
   * 16-bit lane boundary.
   *
   * All arguments are parenthesised so arbitrary expressions can be passed. */
  #define pack_ch_1plane(uv_plane, chroma_pos, line1, line2, uv_mask) \
  do { \
      __m128i c1_sum_ = _mm_add_epi64( \
          _mm_and_si128((line1), (uv_mask)), \
          _mm_and_si128((line2), (uv_mask))); \
      __m128i c1_avg_ = _mm_add_epi64( \
          c1_sum_, \
          _mm_shuffle_epi32(c1_sum_, _MM_SHUFFLE(2, 3, 0, 1))); \
      c1_avg_ = _mm_srai_epi16(c1_avg_, 2); \
      c1_avg_ = _mm_shuffle_epi32(c1_avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
      c1_avg_ = _mm_packus_epi16(c1_avg_, c1_avg_); \
      \
      *(uint32_t*)((uv_plane) + (chroma_pos)) = get_m128_32_0(c1_avg_); \
  } while (0)
  /* pack_ch_2plane: average chroma over 2x2 pixel blocks into two planes.
   *
   * line1 and line2 (four 32-bit pixels each) are masked with uv_mask and
   * added together, then every 32-bit lane is added to its horizontal
   * neighbour (lanes 0<->1 and 2<->3). Each 16-bit lane is shifted right by
   * 2, i.e. the four-sample sum is divided by 4 without rounding. The
   * results are reordered so that the two low-half averages come first and
   * the two high-half averages second, then narrowed to bytes. The first
   * two bytes are written as one 16-bit store at u_plane + chroma_pos and
   * the next two at v_plane + chroma_pos.
   *
   * The additions are 64-bit wide; with a mask that keeps at most 8 bits per
   * 16-bit lane (as the callers in this file pass) no carry can cross a
   * 16-bit lane boundary.
   *
   * chroma_pos is expanded twice. All arguments are parenthesised so
   * arbitrary expressions can be passed. */
  #define pack_ch_2plane(u_plane, v_plane, chroma_pos, line1, line2, uv_mask) \
  do { \
      uint32_t c2_bytes_; \
      \
      __m128i c2_sum_ = _mm_add_epi64( \
          _mm_and_si128((line1), (uv_mask)), \
          _mm_and_si128((line2), (uv_mask))); \
      __m128i c2_avg_ = _mm_add_epi64( \
          c2_sum_, \
          _mm_shuffle_epi32(c2_sum_, _MM_SHUFFLE(2, 3, 0, 1))); \
      c2_avg_ = _mm_srai_epi16(c2_avg_, 2); \
      c2_avg_ = _mm_shuffle_epi32(c2_avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
      c2_avg_ = _mm_shufflelo_epi16(c2_avg_, _MM_SHUFFLE(3, 1, 2, 0)); \
      c2_avg_ = _mm_packus_epi16(c2_avg_, c2_avg_); \
      \
      c2_bytes_ = get_m128_32_0(c2_avg_); \
      \
      *(uint16_t*)((u_plane) + (chroma_pos)) = (uint16_t)(c2_bytes_); \
      *(uint16_t*)((v_plane) + (chroma_pos)) = (uint16_t)(c2_bytes_ >> 16); \
  } while (0)
  75. static FORCE_INLINE uint32_t min_uint32(uint32_t a, uint32_t b)
  76. {
  77. return a < b ? a : b;
  78. }
  /*
   * Converts packed 4-byte-per-pixel input (bytes 0, 1 and 2 of each pixel are
   * used as U, Y and V; byte 3 is ignored) into three planes: output[0]
   * receives one luma byte per pixel, output[1] and output[2] receive the U
   * and V values averaged over each 2x2 pixel block.
   *
   *   input         start of the packed image
   *   in_linesize   bytes per input row
   *   start_y/end_y rows [start_y, end_y) are processed, two rows per step
   *   output        the three destination planes
   *   out_linesize  bytes per row of each plane; [0] is used for luma and [1]
   *                 for both chroma planes
   *
   * Pixels are handled four at a time with aligned 16-byte loads, so every
   * 4-pixel group of the input must be 16-byte aligned. Row y + 1 is always
   * read together with row y.
   */
  void compress_uyvx_to_i420(
          const uint8_t *input, uint32_t in_linesize,
          uint32_t start_y, uint32_t end_y,
          uint8_t *output[], const uint32_t out_linesize[])
  {
      uint8_t *lum_plane = output[0];
      uint8_t *u_plane = output[1];
      uint8_t *v_plane = output[2];
      /* Width in pixels: the input stride is 4 bytes per pixel while the luma
       * stride is 1 byte per pixel, so convert before comparing. Comparing
       * the raw byte counts would let a padded luma stride push the loop past
       * the end of each input row. */
      uint32_t width = min_uint32(in_linesize / 4, out_linesize[0]);
      uint32_t y;

      __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
      __m128i uv_mask = _mm_set1_epi16(0x00FF);

      for (y = start_y; y < end_y; y += 2) {
          uint32_t y_pos = y * in_linesize;
          uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
          uint32_t lum_y_pos = y * out_linesize[0];
          uint32_t x;

          for (x = 0; x < width; x += 4) {
              const uint8_t *img = input + y_pos + x * 4;
              uint32_t lum_pos0 = lum_y_pos + x;
              uint32_t lum_pos1 = lum_pos0 + out_linesize[0];

              __m128i line1 = _mm_load_si128((const __m128i*)img);
              __m128i line2 = _mm_load_si128(
                      (const __m128i*)(img + in_linesize));

              /* luma is byte 1 of each pixel: mask it, shift down 1 byte */
              pack_shift(lum_plane, lum_pos0, lum_pos1,
                      line1, line2, lum_mask, 1);
              /* 4 pixels wide -> 2 chroma samples per plane */
              pack_ch_2plane(u_plane, v_plane,
                      chroma_y_pos + (x >> 1),
                      line1, line2, uv_mask);
          }
      }
  }
  /*
   * Converts packed 4-byte-per-pixel input (bytes 0, 1 and 2 of each pixel are
   * used as U, Y and V; byte 3 is ignored) into two planes: output[0]
   * receives one luma byte per pixel and output[1] receives interleaved U, V
   * byte pairs averaged over each 2x2 pixel block.
   *
   *   input         start of the packed image
   *   in_linesize   bytes per input row
   *   start_y/end_y rows [start_y, end_y) are processed, two rows per step
   *   output        the two destination planes
   *   out_linesize  bytes per row of each plane ([0] luma, [1] chroma)
   *
   * Pixels are handled four at a time with aligned 16-byte loads, so every
   * 4-pixel group of the input must be 16-byte aligned. Row y + 1 is always
   * read together with row y.
   */
  void compress_uyvx_to_nv12(
          const uint8_t *input, uint32_t in_linesize,
          uint32_t start_y, uint32_t end_y,
          uint8_t *output[], const uint32_t out_linesize[])
  {
      uint8_t *lum_plane = output[0];
      uint8_t *chroma_plane = output[1];
      /* Width in pixels: the input stride is 4 bytes per pixel while the luma
       * stride is 1 byte per pixel, so convert before comparing. Comparing
       * the raw byte counts would let a padded luma stride push the loop past
       * the end of each input row. */
      uint32_t width = min_uint32(in_linesize / 4, out_linesize[0]);
      uint32_t y;

      __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
      __m128i uv_mask = _mm_set1_epi16(0x00FF);

      for (y = start_y; y < end_y; y += 2) {
          uint32_t y_pos = y * in_linesize;
          uint32_t chroma_y_pos = (y >> 1) * out_linesize[1];
          uint32_t lum_y_pos = y * out_linesize[0];
          uint32_t x;

          for (x = 0; x < width; x += 4) {
              const uint8_t *img = input + y_pos + x * 4;
              uint32_t lum_pos0 = lum_y_pos + x;
              uint32_t lum_pos1 = lum_pos0 + out_linesize[0];

              __m128i line1 = _mm_load_si128((const __m128i*)img);
              __m128i line2 = _mm_load_si128(
                      (const __m128i*)(img + in_linesize));

              /* luma is byte 1 of each pixel: mask it, shift down 1 byte */
              pack_shift(lum_plane, lum_pos0, lum_pos1,
                      line1, line2, lum_mask, 1);
              /* 4 pixels wide -> 2 interleaved pairs = 4 chroma bytes */
              pack_ch_1plane(chroma_plane, chroma_y_pos + x,
                      line1, line2, uv_mask);
          }
      }
  }
  /*
   * Splits packed 4-byte-per-pixel input (bytes 0, 1 and 2 of each pixel are
   * used as U, Y and V; byte 3 is ignored) into three full-resolution planes:
   * output[0] luma, output[1] U, output[2] V. No averaging is performed.
   *
   *   input         start of the packed image
   *   in_linesize   bytes per input row
   *   start_y/end_y rows [start_y, end_y) are processed, two rows per step
   *   output        the three destination planes
   *   out_linesize  only out_linesize[0] is read; it is used as the row
   *                 stride of all three planes
   *
   * NOTE(review): because out_linesize[1] and out_linesize[2] are never
   * consulted, callers must give all three planes the same stride - confirm
   * this holds for every caller.
   *
   * Pixels are handled four at a time with aligned 16-byte loads, so every
   * 4-pixel group of the input must be 16-byte aligned. Row y + 1 is always
   * read together with row y.
   */
  void convert_uyvx_to_i444(
          const uint8_t *input, uint32_t in_linesize,
          uint32_t start_y, uint32_t end_y,
          uint8_t *output[], const uint32_t out_linesize[])
  {
      uint8_t *lum_plane = output[0];
      uint8_t *u_plane = output[1];
      uint8_t *v_plane = output[2];
      /* Width in pixels: the input stride is 4 bytes per pixel while the
       * plane stride is 1 byte per pixel, so convert before comparing.
       * Comparing the raw byte counts would let a padded plane stride push
       * the loop past the end of each input row. */
      uint32_t width = min_uint32(in_linesize / 4, out_linesize[0]);
      uint32_t y;

      __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
      __m128i u_mask = _mm_set1_epi32(0x000000FF);
      __m128i v_mask = _mm_set1_epi32(0x00FF0000);

      for (y = start_y; y < end_y; y += 2) {
          uint32_t y_pos = y * in_linesize;
          uint32_t lum_y_pos = y * out_linesize[0];
          uint32_t x;

          for (x = 0; x < width; x += 4) {
              const uint8_t *img = input + y_pos + x * 4;
              uint32_t lum_pos0 = lum_y_pos + x;
              uint32_t lum_pos1 = lum_pos0 + out_linesize[0];

              __m128i line1 = _mm_load_si128((const __m128i*)img);
              __m128i line2 = _mm_load_si128(
                      (const __m128i*)(img + in_linesize));

              /* byte 1 -> luma plane */
              pack_shift(lum_plane, lum_pos0, lum_pos1,
                      line1, line2, lum_mask, 1);
              /* byte 0 -> U plane (already in the low byte, no shift) */
              pack_val(u_plane, lum_pos0, lum_pos1,
                      line1, line2, u_mask);
              /* byte 2 -> V plane */
              pack_shift(v_plane, lum_pos0, lum_pos1,
                      line1, line2, v_mask, 2);
          }
      }
  }
  /*
   * Expands three-plane 4:2:0 input into packed 32-bit pixels.
   *
   * Every output dword is (luma << 16) | (input[1] sample << 8) |
   * (input[2] sample); each pair of chroma samples is shared by a 2x2 block
   * of output pixels.
   *
   *   input         input[0] luma plane, input[1] and input[2] chroma planes
   *   in_linesize   bytes per row of each input plane
   *   start_y/end_y output rows; both are halved, and row pairs
   *                 [start_y / 2, end_y / 2) are processed
   *   output        destination buffer, accessed as uint32_t
   *   out_linesize  bytes per output row
   *
   * The number of 2-pixel columns handled per row is limited by both the luma
   * stride (2 bytes consumed per step) and the output stride (8 bytes written
   * per step), so a narrow output row is never overrun.
   */
  void decompress_420(
          const uint8_t *const input[], const uint32_t in_linesize[],
          uint32_t start_y, uint32_t end_y,
          uint8_t *output, uint32_t out_linesize)
  {
      const uint32_t first_row_pair = start_y / 2;
      const uint32_t end_row_pair = end_y / 2;
      const uint32_t in_pairs = in_linesize[0] / 2;
      const uint32_t out_pairs = out_linesize / 8;
      const uint32_t pairs = in_pairs < out_pairs ? in_pairs : out_pairs;
      uint32_t y;

      for (y = first_row_pair; y < end_row_pair; y++) {
          const uint8_t *chroma0 = input[1] + y * in_linesize[1];
          const uint8_t *chroma1 = input[2] + y * in_linesize[2];
          const uint8_t *lum0 = input[0] + y * 2 * in_linesize[0];
          const uint8_t *lum1 = lum0 + in_linesize[0];
          uint32_t *output0 = (uint32_t*)(output + y * 2 * out_linesize);
          uint32_t *output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);
          uint32_t x;

          for (x = 0; x < pairs; x++) {
              /* one chroma pair covers two pixels on each of the two rows */
              uint32_t chroma = ((uint32_t)*chroma0++ << 8) | *chroma1++;

              *output0++ = ((uint32_t)*lum0++ << 16) | chroma;
              *output0++ = ((uint32_t)*lum0++ << 16) | chroma;

              *output1++ = ((uint32_t)*lum1++ << 16) | chroma;
              *output1++ = ((uint32_t)*lum1++ << 16) | chroma;
          }
      }
  }
  /*
   * Expands two-plane input (luma plane plus a plane of interleaved chroma
   * byte pairs) into packed 32-bit pixels.
   *
   * Every output dword is luma | (first chroma byte << 8) |
   * (second chroma byte << 16); each chroma byte pair is shared by a 2x2
   * block of output pixels.
   *
   *   input         input[0] luma plane, input[1] interleaved chroma plane
   *   in_linesize   bytes per row of each input plane
   *   start_y/end_y output rows; both are halved, and row pairs
   *                 [start_y / 2, end_y / 2) are processed
   *   output        destination buffer, accessed as uint32_t
   *   out_linesize  bytes per output row
   *
   * The number of 2-pixel columns handled per row is limited by both the luma
   * stride (2 bytes consumed per step) and the output stride (8 bytes written
   * per step); the two strides are in different units and cannot be compared
   * directly.
   *
   * Chroma is read one byte at a time. That produces the same value as a
   * 16-bit load on a little-endian host, without requiring the chroma rows to
   * be 2-byte aligned.
   */
  void decompress_nv12(
          const uint8_t *const input[], const uint32_t in_linesize[],
          uint32_t start_y, uint32_t end_y,
          uint8_t *output, uint32_t out_linesize)
  {
      const uint32_t first_row_pair = start_y / 2;
      const uint32_t end_row_pair = end_y / 2;
      const uint32_t in_pairs = in_linesize[0] / 2;
      const uint32_t out_pairs = out_linesize / 8;
      const uint32_t pairs = in_pairs < out_pairs ? in_pairs : out_pairs;
      uint32_t y;

      for (y = first_row_pair; y < end_row_pair; y++) {
          const uint8_t *chroma = input[1] + y * in_linesize[1];
          const uint8_t *lum0 = input[0] + y * 2 * in_linesize[0];
          const uint8_t *lum1 = lum0 + in_linesize[0];
          uint32_t *output0 = (uint32_t*)(output + y * 2 * out_linesize);
          uint32_t *output1 = (uint32_t*)((uint8_t*)output0 + out_linesize);
          uint32_t x;

          for (x = 0; x < pairs; x++) {
              /* one chroma pair covers two pixels on each of the two rows */
              uint32_t out = ((uint32_t)chroma[0] << 8) |
                      ((uint32_t)chroma[1] << 16);
              chroma += 2;

              *output0++ = *lum0++ | out;
              *output0++ = *lum0++ | out;

              *output1++ = *lum1++ | out;
              *output1++ = *lum1++ | out;
          }
      }
  }
  /*
   * Expands packed 4:2:2 input, where each 32-bit word describes two pixels,
   * into one 32-bit word per pixel.
   *
   * For every input dword two output dwords are written: the first is the
   * input dword unchanged; the second is the input dword with one byte
   * replaced by the other luma sample:
   *   leading_lum == true   bits 0..7  are replaced by bits 16..23
   *   leading_lum == false  bits 8..15 are replaced by bits 24..31
   *
   *   input         start of the packed 4:2:2 image, accessed as uint32_t
   *   in_linesize   bytes per input row
   *   start_y/end_y rows [start_y, end_y) are processed
   *   output        destination buffer, accessed as uint32_t
   *   out_linesize  bytes per output row
   *   leading_lum   selects which byte of each 16-bit half holds luma
   *
   * Each step consumes 4 input bytes and produces 8 output bytes, so the
   * per-row step count is min(in_linesize / 4, out_linesize / 8). Deriving it
   * from min(in_linesize, out_linesize) / 2 instead would process two rows'
   * worth of data per row and run past the end of both buffers on the last
   * row.
   */
  void decompress_422(
          const uint8_t *input, uint32_t in_linesize,
          uint32_t start_y, uint32_t end_y,
          uint8_t *output, uint32_t out_linesize,
          bool leading_lum)
  {
      const uint32_t in_dwords = in_linesize / 4;
      const uint32_t out_dword_pairs = out_linesize / 8;
      const uint32_t steps =
              in_dwords < out_dword_pairs ? in_dwords : out_dword_pairs;
      /* keep_mask clears the byte holding the first pixel's luma; fill_mask
       * selects, after shifting right by 16, the second pixel's luma moved
       * into that same byte position. */
      const uint32_t keep_mask = leading_lum ? 0xFFFFFF00u : 0xFFFF00FFu;
      const uint32_t fill_mask = leading_lum ? 0x000000FFu : 0x0000FF00u;
      uint32_t y;

      for (y = start_y; y < end_y; y++) {
          const uint32_t *input32 =
                  (const uint32_t*)(input + y * in_linesize);
          const uint32_t *input32_end = input32 + steps;
          uint32_t *output32 = (uint32_t*)(output + y * out_linesize);

          for (; input32 < input32_end; input32++, output32 += 2) {
              uint32_t dw = *input32;

              output32[0] = dw;
              output32[1] = (dw & keep_mask) | ((dw >> 16) & fill_mask);
          }
      }
  }