format-conversion.c 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. /******************************************************************************
  2. Copyright (C) 2013 by Hugh Bailey <[email protected]>
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ******************************************************************************/
#include "format-conversion.h"

#include <string.h>

#include <xmmintrin.h>
#include <emmintrin.h>
  17. static inline uint32_t get_m128_32_0(const __m128i val)
  18. {
  19. return *(uint32_t* const)&val;
  20. }
  21. static inline uint32_t get_m128_32_1(const __m128i val)
  22. {
  23. return *(((uint32_t* const)&val)+1);
  24. }
  25. static inline void pack_lum(uint8_t *lum_plane,
  26. uint32_t lum_pos0, uint32_t lum_pos1,
  27. const __m128i line1, const __m128i line2,
  28. const __m128i lum_mask)
  29. {
  30. __m128i pack_val = _mm_packs_epi32(
  31. _mm_srli_si128(_mm_and_si128(line1, lum_mask), 1),
  32. _mm_srli_si128(_mm_and_si128(line2, lum_mask), 1));
  33. pack_val = _mm_packus_epi16(pack_val, pack_val);
  34. *(uint32_t*)(lum_plane+lum_pos0) = get_m128_32_0(pack_val);
  35. *(uint32_t*)(lum_plane+lum_pos1) = get_m128_32_1(pack_val);
  36. }
  37. static inline void pack_chroma_1plane(uint8_t *uv_plane,
  38. uint32_t chroma_pos,
  39. const __m128i line1, const __m128i line2,
  40. const __m128i uv_mask)
  41. {
  42. __m128i add_val = _mm_add_epi64(
  43. _mm_and_si128(line1, uv_mask),
  44. _mm_and_si128(line2, uv_mask));
  45. __m128i avg_val = _mm_add_epi64(
  46. add_val,
  47. _mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));
  48. avg_val = _mm_srai_epi16(avg_val, 2);
  49. avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
  50. avg_val = _mm_packus_epi16(avg_val, avg_val);
  51. *(uint32_t*)(uv_plane+chroma_pos) = get_m128_32_0(avg_val);
  52. }
  53. static inline void pack_chroma_2plane(uint8_t *u_plane, uint8_t *v_plane,
  54. uint32_t chroma_pos,
  55. const __m128i line1, const __m128i line2,
  56. const __m128i uv_mask)
  57. {
  58. uint32_t packed_vals;
  59. __m128i add_val = _mm_add_epi64(
  60. _mm_and_si128(line1, uv_mask),
  61. _mm_and_si128(line2, uv_mask));
  62. __m128i avg_val = _mm_add_epi64(
  63. add_val,
  64. _mm_shuffle_epi32(add_val, _MM_SHUFFLE(2, 3, 0, 1)));
  65. avg_val = _mm_srai_epi16(avg_val, 2);
  66. avg_val = _mm_shuffle_epi32(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
  67. avg_val = _mm_shufflelo_epi16(avg_val, _MM_SHUFFLE(3, 1, 2, 0));
  68. avg_val = _mm_packus_epi16(avg_val, avg_val);
  69. packed_vals = get_m128_32_0(avg_val);
  70. *(uint16_t*)(u_plane+chroma_pos) = (uint16_t)(packed_vals);
  71. *(uint16_t*)(v_plane+chroma_pos) = (uint16_t)(packed_vals>>16);
  72. }
  73. void compress_uyvx_to_i420(const void *input_v, uint32_t width, uint32_t height,
  74. uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
  75. void **output)
  76. {
  77. const uint8_t *input = input_v;
  78. uint8_t *lum_plane = output[0];
  79. uint8_t *u_plane = output[1];
  80. uint8_t *v_plane = output[2];
  81. uint32_t chroma_pitch = width >> 1;
  82. uint32_t y;
  83. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  84. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  85. for (y = start_y; y < end_y; y += 2) {
  86. uint32_t y_pos = y * row_bytes;
  87. uint32_t chroma_y_pos = (y>>1) * chroma_pitch;
  88. uint32_t lum_y_pos = y * width;
  89. uint32_t x;
  90. for (x = 0; x < width; x += 4) {
  91. const uint8_t *img = input + y_pos + x*4;
  92. uint32_t lum_pos0 = lum_y_pos + x;
  93. uint32_t lum_pos1 = lum_pos0 + width;
  94. __m128i line1 = _mm_load_si128((const __m128i*)img);
  95. __m128i line2 = _mm_load_si128(
  96. (const __m128i*)(img + row_bytes));
  97. pack_lum(lum_plane, lum_pos0, lum_pos1,
  98. line1, line2, lum_mask);
  99. pack_chroma_2plane(u_plane, v_plane,
  100. chroma_y_pos + (x>>1),
  101. line1, line2, uv_mask);
  102. }
  103. }
  104. }
  105. static inline void _compress_uyvx_to_nv12(const uint8_t *input,
  106. uint32_t width, uint32_t height, uint32_t pitch,
  107. uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,
  108. void **output)
  109. {
  110. uint8_t *lum_plane = output[0];
  111. uint8_t *chroma_plane = output[1];
  112. uint32_t y;
  113. __m128i lum_mask = _mm_set1_epi32(0x0000FF00);
  114. __m128i uv_mask = _mm_set1_epi16(0x00FF);
  115. for (y = start_y; y < end_y; y += 2) {
  116. uint32_t y_pos = y * pitch;
  117. uint32_t chroma_y_pos = (y>>1) * row_bytes_out;
  118. uint32_t lum_y_pos = y * row_bytes_out;
  119. uint32_t x;
  120. for (x = 0; x < width; x += 4) {
  121. const uint8_t *img = input + y_pos + x*4;
  122. uint32_t lum_pos0 = lum_y_pos + x;
  123. uint32_t lum_pos1 = lum_pos0 + row_bytes_out;
  124. __m128i line1 = _mm_load_si128((const __m128i*)img);
  125. __m128i line2 = _mm_load_si128(
  126. (const __m128i*)(img + pitch));
  127. pack_lum(lum_plane, lum_pos0, lum_pos1,
  128. line1, line2, lum_mask);
  129. pack_chroma_1plane(chroma_plane, chroma_y_pos + x,
  130. line1, line2, uv_mask);
  131. }
  132. }
  133. }
  134. void compress_uyvx_to_nv12(const void *input, uint32_t width, uint32_t height,
  135. uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
  136. void **output)
  137. {
  138. _compress_uyvx_to_nv12(input, width, height, row_bytes,
  139. start_y, end_y, width, output);
  140. }
  141. void compress_uyvx_to_nv12_aligned(const void *input,
  142. uint32_t width, uint32_t height, uint32_t row_bytes,
  143. uint32_t start_y, uint32_t end_y, uint32_t row_bytes_out,
  144. void **output)
  145. {
  146. _compress_uyvx_to_nv12(input, width, height, row_bytes,
  147. start_y, end_y, row_bytes_out, output);
  148. }
  149. void decompress_420(const void *input_v, uint32_t width, uint32_t height,
  150. uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
  151. void *output_v)
  152. {
  153. uint8_t *output = output_v;
  154. const uint8_t *input = input_v;
  155. const uint8_t *input2 = input + width * height;
  156. const uint8_t *input3 = input2 + width * height / 4;
  157. uint32_t start_y_d2 = start_y/2;
  158. uint32_t width_d2 = width/2;
  159. uint32_t height_d2 = end_y/2;
  160. uint32_t y;
  161. for (y = start_y_d2; y < height_d2; y++) {
  162. const uint8_t *chroma0 = input2 + y * width_d2;
  163. const uint8_t *chroma1 = input3 + y * width_d2;
  164. register const uint8_t *lum0, *lum1;
  165. register uint32_t *output0, *output1;
  166. uint32_t x;
  167. lum0 = input + y * 2*width;
  168. lum1 = lum0 + width;
  169. output0 = (uint32_t*)(output + y * 2*row_bytes);
  170. output1 = (uint32_t*)((uint8_t*)output0 + row_bytes);
  171. for (x = 0; x < width_d2; x++) {
  172. uint32_t out;
  173. out = (*(chroma0++) << 8) | (*(chroma1++) << 16);
  174. *(output0++) = *(lum0++) | out;
  175. *(output0++) = *(lum0++) | out;
  176. *(output1++) = *(lum1++) | out;
  177. *(output1++) = *(lum1++) | out;
  178. }
  179. }
  180. }
  181. void decompress_nv12(const void *input_v, uint32_t width, uint32_t height,
  182. uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
  183. void *output_v)
  184. {
  185. uint8_t *output = output_v;
  186. const uint8_t *input = input_v;
  187. const uint8_t *input2 = input + width * height;
  188. uint32_t start_y_d2 = start_y/2;
  189. uint32_t width_d2 = width/2;
  190. uint32_t height_d2 = end_y/2;
  191. uint32_t y;
  192. for (y = start_y_d2; y < height_d2; y++) {
  193. const uint16_t *chroma = (uint16_t*)(input2 + y * width);
  194. register const uint8_t *lum0, *lum1;
  195. register uint32_t *output0, *output1;
  196. uint32_t x;
  197. lum0 = input + y * 2*width;
  198. lum1 = lum0 + width;
  199. output0 = (uint32_t*)(output + y * 2*row_bytes);
  200. output1 = (uint32_t*)((uint8_t*)output0 + row_bytes);
  201. for (x = 0; x < width_d2; x++) {
  202. uint32_t out = *(chroma++) << 8;
  203. *(output0++) = *(lum0++) | out;
  204. *(output0++) = *(lum0++) | out;
  205. *(output1++) = *(lum1++) | out;
  206. *(output1++) = *(lum1++) | out;
  207. }
  208. }
  209. }
  210. void decompress_422(const void *input_v, uint32_t width, uint32_t height,
  211. uint32_t row_bytes, uint32_t start_y, uint32_t end_y,
  212. void *output_v, bool leading_lum)
  213. {
  214. const uint8_t *input = input_v;
  215. uint8_t *output = output_v;
  216. uint32_t width_d2 = width >> 1;
  217. uint32_t line_size = width * 2;
  218. uint32_t y;
  219. register const uint32_t *input32;
  220. register const uint32_t *input32_end;
  221. register uint32_t *output32;
  222. if (leading_lum) {
  223. for (y = start_y; y < end_y; y++) {
  224. input32 = (uint32_t*)(input + y*line_size);
  225. input32_end = input32 + width_d2;
  226. output32 = (uint32_t*)(output + y*row_bytes);
  227. while(input32 < input32_end) {
  228. register uint32_t dw = *input32;
  229. output32[0] = dw;
  230. dw &= 0xFFFFFF00;
  231. dw |= (uint8_t)(dw>>16);
  232. output32[1] = dw;
  233. output32 += 2;
  234. input32++;
  235. }
  236. }
  237. } else {
  238. for (y = start_y; y < end_y; y++) {
  239. input32 = (uint32_t*)(input + y*line_size);
  240. input32_end = input32 + width_d2;
  241. output32 = (uint32_t*)(output + y*row_bytes);
  242. while (input32 < input32_end) {
  243. register uint32_t dw = *input32;
  244. output32[0] = dw;
  245. dw &= 0xFFFF00FF;
  246. dw |= (dw>>16) & 0xFF00;
  247. output32[1] = dw;
  248. output32 += 2;
  249. input32++;
  250. }
  251. }
  252. }
  253. }