// xbrz.cpp
  1. // ****************************************************************************
  2. // * This file is part of the xBRZ project. It is distributed under *
  3. // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
  4. // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
  5. // * *
  6. // * Additionally and as a special exception, the author gives permission *
  7. // * to link the code of this program with the following libraries *
  8. // * (or with modified versions that use the same licenses), and distribute *
  9. // * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
  10. // * You must obey the GNU General Public License in all respects for all of *
  11. // * the code used other than MAME, FreeFileSync, Snes9x, ePSXe. *
  12. // * If you modify this file, you may extend this exception to your version *
  13. // * of the file, but you are not obligated to do so. If you do not wish to *
  14. // * do so, delete this exception statement from your version. *
  15. // ****************************************************************************
  16. #include "xbrz.h"
  17. #include <cassert>
  18. #include <vector>
  19. #include <algorithm>
  20. #include <cmath> //std::sqrt
  21. #include "xbrz_tools.h"
  22. using namespace xbrz;
  23. namespace
  24. {
  25. template <unsigned int M, unsigned int N> inline
  26. uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: https://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  27. {
  28. static_assert(0 < M && M < N && N <= 1000);
  29. auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
  30. return makePixel(calcColor(getRed (pixFront), getRed (pixBack)),
  31. calcColor(getGreen(pixFront), getGreen(pixBack)),
  32. calcColor(getBlue (pixFront), getBlue (pixBack)));
  33. }
  34. template <unsigned int M, unsigned int N> inline
  35. uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  36. {
  37. static_assert(0 < M && M < N && N <= 1000);
  38. const unsigned int weightFront = getAlpha(pixFront) * M;
  39. const unsigned int weightBack = getAlpha(pixBack) * (N - M);
  40. const unsigned int weightSum = weightFront + weightBack;
  41. if (weightSum == 0)
  42. return 0;
  43. auto calcColor = [=](unsigned char colFront, unsigned char colBack)
  44. {
  45. return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
  46. };
  47. return makePixel(static_cast<unsigned char>(weightSum / N),
  48. calcColor(getRed (pixFront), getRed (pixBack)),
  49. calcColor(getGreen(pixFront), getGreen(pixBack)),
  50. calcColor(getBlue (pixFront), getBlue (pixBack)));
  51. }
  52. //inline
  53. //double fastSqrt(double n)
  54. //{
  55. // __asm //speeds up xBRZ by about 9% compared to std::sqrt which internally uses the same assembler instructions but adds some "fluff"
  56. // {
  57. // fld n
  58. // fsqrt
  59. // }
  60. //}
  61. //
  //FORCE_INLINE: the compiler-specific "always inline" spelling, plain "inline" for unknown compilers
  #if defined(_MSC_VER)
      #define FORCE_INLINE __forceinline
  #elif defined(__GNUC__)
      #define FORCE_INLINE __attribute__((always_inline)) inline
  #else
      #define FORCE_INLINE inline
  #endif
  //clock-wise rotation; consecutive enumerators have consecutive values starting at 0
  enum RotationDegree
  {
      ROT_0   = 0,
      ROT_90  = 1,
      ROT_180 = 2,
      ROT_270 = 3
  };
  76. //calculate input matrix coordinates after rotation at compile time
  77. template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
  78. struct MatrixRotation;
  79. template <size_t I, size_t J, size_t N>
  80. struct MatrixRotation<ROT_0, I, J, N>
  81. {
  82. static const size_t I_old = I;
  83. static const size_t J_old = J;
  84. };
  85. template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
  86. struct MatrixRotation
  87. {
  88. static const size_t I_old = N - 1 - MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
  89. static const size_t J_old = MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::I_old; //
  90. };
  91. template <size_t N, RotationDegree rotDeg>
  92. class OutputMatrix
  93. {
  94. public:
  95. OutputMatrix(uint32_t* out, int outWidth) : //access matrix area, top-left at position "out" for image with given width
  96. out_(out),
  97. outWidth_(outWidth) {}
  98. template <size_t I, size_t J>
  99. uint32_t& ref() const
  100. {
  101. static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
  102. static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
  103. return *(out_ + J_old + I_old * outWidth_);
  104. }
  105. private:
  106. uint32_t* out_;
  107. const int outWidth_;
  108. };
  //value multiplied by itself, converted back to T
  template <typename T> inline
  T square(T value)
  {
      const T squared = value * value;
      return squared;
  }
  #if 0 //disabled alternative: Euclidean distance in RGB space
  inline
  double distRGB(uint32_t pix1, uint32_t pix2)
  {
      const double deltaRed   = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
      const double deltaGreen = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
      const double deltaBlue  = static_cast<int>(getBlue (pix1)) - getBlue (pix2);

      const double sumOfSquares = square(deltaRed) + square(deltaGreen) + square(deltaBlue);
      return std::sqrt(sumOfSquares);
  }
  #endif
  122. inline
  123. double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
  124. {
  125. //https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
  126. //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
  127. const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication
  128. const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
  129. const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2); //substraction for int is noticeable faster than for double!
  130. //const double k_b = 0.0722; //ITU-R BT.709 conversion
  131. //const double k_r = 0.2126; //
  132. const double k_b = 0.0593; //ITU-R BT.2020 conversion
  133. const double k_r = 0.2627; //
  134. const double k_g = 1 - k_b - k_r;
  135. const double scale_b = 0.5 / (1 - k_b);
  136. const double scale_r = 0.5 / (1 - k_r);
  137. const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  138. const double c_b = scale_b * (b_diff - y);
  139. const double c_r = scale_r * (r_diff - y);
  140. //we skip division by 255 to have similar range like other distance functions
  141. return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
  142. }
  143. inline
  144. double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
  145. {
  146. //30% perf boost compared to plain distYCbCr()!
  147. //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  148. static const std::vector<float> diffToDist = []
  149. {
  150. std::vector<float> tmp;
  151. for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  152. {
  153. const int r_diff = static_cast<signed char>(getByte<2>(i)) * 2;
  154. const int g_diff = static_cast<signed char>(getByte<1>(i)) * 2;
  155. const int b_diff = static_cast<signed char>(getByte<0>(i)) * 2;
  156. const double k_b = 0.0593; //ITU-R BT.2020 conversion
  157. const double k_r = 0.2627; //
  158. const double k_g = 1 - k_b - k_r;
  159. const double scale_b = 0.5 / (1 - k_b);
  160. const double scale_r = 0.5 / (1 - k_r);
  161. const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  162. const double c_b = scale_b * (b_diff - y);
  163. const double c_r = scale_r * (r_diff - y);
  164. tmp.push_back(static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))));
  165. }
  166. return tmp;
  167. }();
  168. //if (pix1 == pix2) -> 8% perf degradation!
  169. // return 0;
  170. //if (pix1 < pix2)
  171. // std::swap(pix1, pix2); -> 30% perf degradation!!!
  172. const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2);
  173. const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
  174. const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
  175. const size_t index = (static_cast<unsigned char>(r_diff / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  176. (static_cast<unsigned char>(g_diff / 2) << 8) |
  177. (static_cast<unsigned char>(b_diff / 2));
  178. #if 0 //attention: the following calculation creates an asymmetric color distance!!! (e.g. r_diff=46 will be unpacked as 45, but r_diff=-46 unpacks to -47
  179. const size_t index = (((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  180. (((g_diff + 0xFF) / 2) << 8) |
  181. (( b_diff + 0xFF) / 2);
  182. #endif
  183. return diffToDist[index];
  184. }
  #if defined(_MSC_VER) && !defined(NDEBUG)
  //MSVC debug builds only: scaleImage() sets breakIntoDebugger while it processes source pixel (debugPixelX, debugPixelY)
  thread_local bool breakIntoDebugger = false;
  const int debugPixelX = -1;
  const int debugPixelY = 58;
  #endif
  //how strongly a corner asks for blending
  enum BlendType
  {
      BLEND_NONE     = 0,
      BLEND_NORMAL   = 1, //a normal indication to blend
      BLEND_DOMINANT = 2, //a strong indication to blend
      //attention: BlendType must fit into the value range of 2 bit!!!
  };
  197. struct BlendResult
  198. {
  199. BlendType
  200. /**/blend_f, blend_g,
  201. /**/blend_j, blend_k;
  202. };
  //3x3 pixel neighborhood, one declaration per row
  struct Kernel_3x3
  {
      uint32_t a, b, c;
      uint32_t d, e, f;
      uint32_t g, h, i;
  };
  struct Kernel_4x4 //kernel for preprocessing step
  {
      //the first nine members are declared in the order of Kernel_3x3's members
      //to support reinterpret_cast from Kernel_4x4 => Kernel_3x3
      uint32_t a, b, c;
      uint32_t e, f, g;
      uint32_t i, j, k;

      //remaining pixels of the bottom row
      uint32_t m, n, o;

      //right-most column
      uint32_t d, h, l, p;
  };
  219. /* input kernel area naming convention:
  220. -----------------
  221. | A | B | C | D |
  222. |---|---|---|---|
  223. | E | F | G | H | evaluate the four corners between F, G, J, K
  224. |---|---|---|---| input pixel is at position F
  225. | I | J | K | L |
  226. |---|---|---|---|
  227. | M | N | O | P |
  228. -----------------
  229. */
  230. template <class ColorDistance>
  231. FORCE_INLINE //detect blend direction
  232. BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg) //result: F, G, J, K corners of "GradientType"
  233. {
  234. #if defined _MSC_VER && !defined NDEBUG
  235. if (breakIntoDebugger)
  236. __debugbreak(); //__asm int 3;
  237. #endif
  238. BlendResult result = {};
  239. if ((ker.f == ker.g &&
  240. ker.j == ker.k) ||
  241. (ker.f == ker.j &&
  242. ker.g == ker.k))
  243. return result;
  244. auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
  245. double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + cfg.centerDirectionBias * dist(ker.j, ker.g);
  246. double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + cfg.centerDirectionBias * dist(ker.f, ker.k);
  247. if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  248. {
  249. const bool dominantGradient = cfg.dominantDirectionThreshold * jg < fk;
  250. if (ker.f != ker.g && ker.f != ker.j)
  251. result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  252. if (ker.k != ker.j && ker.k != ker.g)
  253. result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  254. }
  255. else if (fk < jg)
  256. {
  257. const bool dominantGradient = cfg.dominantDirectionThreshold * fk < jg;
  258. if (ker.j != ker.f && ker.j != ker.k)
  259. result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  260. if (ker.g != ker.f && ker.g != ker.k)
  261. result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  262. }
  263. return result;
  264. }
  265. #define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
  266. //we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
  267. DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
  268. DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
  269. DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
  270. #undef DEF_GETTER
  271. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
  272. DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
  273. DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
  274. DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
  275. #undef DEF_GETTER
  276. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
  277. DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
  278. DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
  279. DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
  280. #undef DEF_GETTER
  281. #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
  282. DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
  283. DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
  284. DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
  285. #undef DEF_GETTER
  286. //compress four blend types into a single byte
  287. //inline BlendType getTopL (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
  288. inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
  289. inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
  290. inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
  291. inline void clearAddTopL(unsigned char& b, BlendType bt) { b = static_cast<unsigned char>(bt); }
  292. inline void addTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); } //buffer is assumed to be initialized before preprocessing!
  293. inline void addBottomR (unsigned char& b, BlendType bt) { b |= (bt << 4); } //e.g. via clearAddTopL()
  294. inline void addBottomL (unsigned char& b, BlendType bt) { b |= (bt << 6); } //
  295. inline bool blendingNeeded(unsigned char b)
  296. {
  297. static_assert(BLEND_NONE == 0);
  298. return b != 0;
  299. }
  300. template <RotationDegree rotDeg> inline
  301. unsigned char rotateBlendInfo(unsigned char b) { return b; }
  302. template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
  303. template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
  304. template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
  305. /* input kernel area naming convention:
  306. -------------
  307. | A | B | C |
  308. |---|---|---|
  309. | D | E | F | input pixel is at position E
  310. |---|---|---|
  311. | G | H | I |
  312. -------------
  313. */
  314. template <class Scaler, class ColorDistance, RotationDegree rotDeg>
  315. FORCE_INLINE //perf: quite worth it!
  316. void blendPixel(const Kernel_3x3& ker,
  317. uint32_t* target, int trgWidth,
  318. unsigned char blendInfo, //result of preprocessing all four corners of pixel "e"
  319. const xbrz::ScalerCfg& cfg)
  320. {
  321. //#define a get_a<rotDeg>(ker)
  322. #define b get_b<rotDeg>(ker)
  323. #define c get_c<rotDeg>(ker)
  324. #define d get_d<rotDeg>(ker)
  325. #define e get_e<rotDeg>(ker)
  326. #define f get_f<rotDeg>(ker)
  327. #define g get_g<rotDeg>(ker)
  328. #define h get_h<rotDeg>(ker)
  329. #define i get_i<rotDeg>(ker)
  330. #if defined _MSC_VER && !defined NDEBUG
  331. if (breakIntoDebugger)
  332. __debugbreak(); //__asm int 3;
  333. #endif
  334. const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
  335. if (getBottomR(blend) >= BLEND_NORMAL)
  336. {
  337. auto eq = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance; };
  338. auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
  339. const bool doLineBlend = [&]() -> bool
  340. {
  341. if (getBottomR(blend) >= BLEND_DOMINANT)
  342. return true;
  343. //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  344. if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
  345. return false;
  346. if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
  347. return false;
  348. //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  349. if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
  350. return false;
  351. return true;
  352. }();
  353. const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
  354. OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
  355. if (doLineBlend)
  356. {
  357. const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  358. const double hc = dist(h, c); //
  359. const bool haveShallowLine = cfg.steepDirectionThreshold * fg <= hc && e != g && d != g;
  360. const bool haveSteepLine = cfg.steepDirectionThreshold * hc <= fg && e != c && b != c;
  361. if (haveShallowLine)
  362. {
  363. if (haveSteepLine)
  364. Scaler::blendLineSteepAndShallow(px, out);
  365. else
  366. Scaler::blendLineShallow(px, out);
  367. }
  368. else
  369. {
  370. if (haveSteepLine)
  371. Scaler::blendLineSteep(px, out);
  372. else
  373. Scaler::blendLineDiagonal(px, out);
  374. }
  375. }
  376. else
  377. Scaler::blendCorner(px, out);
  378. }
  379. //#undef a
  380. #undef b
  381. #undef c
  382. #undef d
  383. #undef e
  384. #undef f
  385. #undef g
  386. #undef h
  387. #undef i
  388. }
  389. class OobReaderTransparent
  390. {
  391. public:
  392. OobReaderTransparent(const uint32_t* src, int srcWidth, int srcHeight, int y) :
  393. s_m1(0 <= y - 1 && y - 1 < srcHeight ? src + srcWidth * (y - 1) : nullptr),
  394. s_0 (0 <= y && y < srcHeight ? src + srcWidth * y : nullptr),
  395. s_p1(0 <= y + 1 && y + 1 < srcHeight ? src + srcWidth * (y + 1) : nullptr),
  396. s_p2(0 <= y + 2 && y + 2 < srcHeight ? src + srcWidth * (y + 2) : nullptr),
  397. srcWidth_(srcWidth) {}
  398. void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
  399. {
  400. [[likely]] if (const int x_p2 = x + 2; 0 <= x_p2 && x_p2 < srcWidth_)
  401. {
  402. ker.d = s_m1 ? s_m1[x_p2] : 0;
  403. ker.h = s_0 ? s_0 [x_p2] : 0;
  404. ker.l = s_p1 ? s_p1[x_p2] : 0;
  405. ker.p = s_p2 ? s_p2[x_p2] : 0;
  406. }
  407. else
  408. {
  409. ker.d = 0;
  410. ker.h = 0;
  411. ker.l = 0;
  412. ker.p = 0;
  413. }
  414. }
  415. private:
  416. const uint32_t* const s_m1;
  417. const uint32_t* const s_0;
  418. const uint32_t* const s_p1;
  419. const uint32_t* const s_p2;
  420. const int srcWidth_;
  421. };
  422. class OobReaderDuplicate
  423. {
  424. public:
  425. OobReaderDuplicate(const uint32_t* src, int srcWidth, int srcHeight, int y) :
  426. s_m1(src + srcWidth * std::clamp(y - 1, 0, srcHeight - 1)),
  427. s_0 (src + srcWidth * std::clamp(y, 0, srcHeight - 1)),
  428. s_p1(src + srcWidth * std::clamp(y + 1, 0, srcHeight - 1)),
  429. s_p2(src + srcWidth * std::clamp(y + 2, 0, srcHeight - 1)),
  430. srcWidth_(srcWidth) {}
  431. void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
  432. {
  433. const int x_p2 = std::clamp(x + 2, 0, srcWidth_ - 1);
  434. ker.d = s_m1[x_p2];
  435. ker.h = s_0 [x_p2];
  436. ker.l = s_p1[x_p2];
  437. ker.p = s_p2[x_p2];
  438. }
  439. private:
  440. const uint32_t* const s_m1;
  441. const uint32_t* const s_0;
  442. const uint32_t* const s_p1;
  443. const uint32_t* const s_p2;
  444. const int srcWidth_;
  445. };
  446. template <class Scaler, class ColorDistance, class OobReader> //scaler policy: see "Scaler2x" reference implementation
  447. void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
  448. {
  449. yFirst = std::max(yFirst, 0);
  450. yLast = std::min(yLast, srcHeight);
  451. if (yFirst >= yLast || srcWidth <= 0)
  452. return;
  453. const int trgWidth = srcWidth * Scaler::scale;
  454. //(ab)use space of "sizeof(uint32_t) * srcWidth * Scaler::scale" at the end of the image as temporary
  455. //buffer for "on the fly preprocessing" without risk of accidental overwriting before accessing
  456. unsigned char* const preProcBuf = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - srcWidth;
  457. //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  458. //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  459. {
  460. const OobReader oobReader(src, srcWidth, srcHeight, yFirst - 1);
  461. //initialize at position x = -1
  462. Kernel_4x4 ker4 = {};
  463. oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
  464. ker4.a = ker4.d;
  465. ker4.e = ker4.h;
  466. ker4.i = ker4.l;
  467. ker4.m = ker4.p;
  468. oobReader.readDhlp(ker4, -3);
  469. ker4.b = ker4.d;
  470. ker4.f = ker4.h;
  471. ker4.j = ker4.l;
  472. ker4.n = ker4.p;
  473. oobReader.readDhlp(ker4, -2);
  474. ker4.c = ker4.d;
  475. ker4.g = ker4.h;
  476. ker4.k = ker4.l;
  477. ker4.o = ker4.p;
  478. oobReader.readDhlp(ker4, -1);
  479. {
  480. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  481. clearAddTopL(preProcBuf[0], res.blend_k); //set 1st known corner for (0, yFirst)
  482. }
  483. for (int x = 0; x < srcWidth; ++x)
  484. {
  485. ker4.a = ker4.b; //shift previous kernel to the left
  486. ker4.e = ker4.f; // -----------------
  487. ker4.i = ker4.j; // | A | B | C | D |
  488. ker4.m = ker4.n; // |---|---|---|---|
  489. /**/ // | E | F | G | H | (x, yFirst - 1) is at position F
  490. ker4.b = ker4.c; // |---|---|---|---|
  491. ker4.f = ker4.g; // | I | J | K | L |
  492. ker4.j = ker4.k; // |---|---|---|---|
  493. ker4.n = ker4.o; // | M | N | O | P |
  494. /**/ // -----------------
  495. ker4.c = ker4.d;
  496. ker4.g = ker4.h;
  497. ker4.k = ker4.l;
  498. ker4.o = ker4.p;
  499. oobReader.readDhlp(ker4, x);
  500. /* preprocessing blend result:
  501. ---------
  502. | F | G | evaluate corner between F, G, J, K
  503. |---+---| current input pixel is at position F
  504. | J | K |
  505. --------- */
  506. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  507. addTopR(preProcBuf[x], res.blend_j); //set 2nd known corner for (x, yFirst)
  508. if (x + 1 < srcWidth)
  509. clearAddTopL(preProcBuf[x + 1], res.blend_k); //set 1st known corner for (x + 1, yFirst)
  510. }
  511. }
  512. //------------------------------------------------------------------------------------
  513. for (int y = yFirst; y < yLast; ++y)
  514. {
  515. uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
  516. const OobReader oobReader(src, srcWidth, srcHeight, y);
  517. //initialize at position x = -1
  518. Kernel_4x4 ker4 = {};
  519. oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
  520. ker4.a = ker4.d;
  521. ker4.e = ker4.h;
  522. ker4.i = ker4.l;
  523. ker4.m = ker4.p;
  524. oobReader.readDhlp(ker4, -3);
  525. ker4.b = ker4.d;
  526. ker4.f = ker4.h;
  527. ker4.j = ker4.l;
  528. ker4.n = ker4.p;
  529. oobReader.readDhlp(ker4, -2);
  530. ker4.c = ker4.d;
  531. ker4.g = ker4.h;
  532. ker4.k = ker4.l;
  533. ker4.o = ker4.p;
  534. oobReader.readDhlp(ker4, -1);
  535. unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
  536. {
  537. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  538. clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (0, y + 1) and buffer for use on next column
  539. addBottomL(preProcBuf[0], res.blend_g); //set 3rd known corner for (0, y)
  540. }
  541. for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
  542. {
  543. #if defined _MSC_VER && !defined NDEBUG
  544. breakIntoDebugger = debugPixelX == x && debugPixelY == y;
  545. #endif
  546. ker4.a = ker4.b; //shift previous kernel to the left
  547. ker4.e = ker4.f; // -----------------
  548. ker4.i = ker4.j; // | A | B | C | D |
  549. ker4.m = ker4.n; // |---|---|---|---|
  550. /**/ // | E | F | G | H | (x, y) is at position F
  551. ker4.b = ker4.c; // |---|---|---|---|
  552. ker4.f = ker4.g; // | I | J | K | L |
  553. ker4.j = ker4.k; // |---|---|---|---|
  554. ker4.n = ker4.o; // | M | N | O | P |
  555. /**/ // -----------------
  556. ker4.c = ker4.d;
  557. ker4.g = ker4.h;
  558. ker4.k = ker4.l;
  559. ker4.o = ker4.p;
  560. oobReader.readDhlp(ker4, x);
  561. //evaluate the four corners on bottom-right of current pixel
  562. unsigned char blend_xy = preProcBuf[x]; //for current (x, y) position
  563. {
  564. /* preprocessing blend result:
  565. ---------
  566. | F | G | evaluate corner between F, G, J, K
  567. |---+---| current input pixel is at position F
  568. | J | K |
  569. --------- */
  570. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  571. addBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
  572. addTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
  573. preProcBuf[x] = blend_xy1; //store on current buffer position for use on next row
  574. [[likely]] if (x + 1 < srcWidth)
  575. {
  576. //blend_xy1 -> blend_x1y1
  577. clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  578. addBottomL(preProcBuf[x + 1], res.blend_g); //set 3rd known corner for (x + 1, y)
  579. }
  580. }
  581. //fill block of size scale * scale with the given color
  582. fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
  583. //place *after* preprocessing step, to not overwrite the results while processing the last pixel!
  584. //blend all four corners of current pixel
  585. if (blendingNeeded(blend_xy))
  586. {
  587. const auto& ker3 = reinterpret_cast<const Kernel_3x3&>(ker4); //"The Things We Do for Perf"
  588. blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg);
  589. blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
  590. blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
  591. blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy, cfg);
  592. }
  593. }
  594. }
  595. }
  596. //------------------------------------------------------------------------------------
  //scaler policy for factor 2: every source pixel becomes a 2x2 output block;
  //ColorGradient supplies alphaGrad<M, N>(pixBack, pixFront)
  template <class ColorGradient>
  struct Scaler2x : public ColorGradient
  {
      static const int scale = 2;

      template <unsigned int M, unsigned int N> //bring template function into scope for GCC
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront) { ColorGradient::template alphaGrad<M, N>(pixBack, pixFront); }

      template <class OutMatrix>
      static void blendLineShallow(uint32_t col, OutMatrix& out)
      {
          uint32_t& bottomLeft  = out.template ref<scale - 1, 0>();
          uint32_t& bottomRight = out.template ref<scale - 1, 1>();

          alphaGrad<1, 4>(bottomLeft,  col);
          alphaGrad<3, 4>(bottomRight, col);
      }

      template <class OutMatrix>
      static void blendLineSteep(uint32_t col, OutMatrix& out)
      {
          uint32_t& topRight    = out.template ref<0, scale - 1>();
          uint32_t& bottomRight = out.template ref<1, scale - 1>();

          alphaGrad<1, 4>(topRight,    col);
          alphaGrad<3, 4>(bottomRight, col);
      }

      template <class OutMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutMatrix& out)
      {
          uint32_t& bottomLeft  = out.template ref<1, 0>();
          uint32_t& topRight    = out.template ref<0, 1>();
          uint32_t& bottomRight = out.template ref<1, 1>();

          alphaGrad<1, 4>(bottomLeft,  col);
          alphaGrad<1, 4>(topRight,    col);
          alphaGrad<5, 6>(bottomRight, col); //[!] fixes 7/8 used in xBR
      }

      template <class OutMatrix>
      static void blendLineDiagonal(uint32_t col, OutMatrix& out)
      {
          uint32_t& bottomRight = out.template ref<1, 1>();
          alphaGrad<1, 2>(bottomRight, col);
      }

      template <class OutMatrix>
      static void blendCorner(uint32_t col, OutMatrix& out)
      {
          //model a round corner
          uint32_t& bottomRight = out.template ref<1, 1>();
          alphaGrad<21, 100>(bottomRight, col); //exact: 1 - pi/4 = 0.2146018366
      }
  };
  //Blend policy for 3x magnification: each blend*() call paints "col" into selected cells of the
  //3 x 3 output block reached through out.ref<I, J>(); plain assignment replaces a cell completely.
  //NOTE(review): alphaGrad<M, N> presumably applies col with weight M/N; the actual mixing lives in ColorGradient - confirm there.
  template <class ColorGradient>
  struct Scaler3x : public ColorGradient
  {
      static const int scale = 3;

      //forwarder: makes the base class template callable unqualified (required by GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutputMatrix>
      static void blendLineShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<2, 0>(), col);
          alphaGrad<1, 4>(out.template ref<1, 2>(), col);
          alphaGrad<3, 4>(out.template ref<2, 1>(), col);
          out.template ref<2, 2>() = col;
      }

      //steep line: same pattern as blendLineShallow() with both indices swapped
      template <class OutputMatrix>
      static void blendLineSteep(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 2>(), col);
          alphaGrad<1, 4>(out.template ref<2, 1>(), col);
          alphaGrad<3, 4>(out.template ref<1, 2>(), col);
          out.template ref<2, 2>() = col;
      }

      //steep and shallow line at once
      template <class OutputMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<2, 0>(), col);
          alphaGrad<1, 4>(out.template ref<0, 2>(), col);
          alphaGrad<3, 4>(out.template ref<2, 1>(), col);
          alphaGrad<3, 4>(out.template ref<1, 2>(), col);
          out.template ref<2, 2>() = col;
      }

      //diagonal line
      template <class OutputMatrix>
      static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
      {
          //cells <1, 2> and <2, 1>: conflict with other rotations for this odd scale
          alphaGrad<1, 8>(out.template ref<1, 2>(), col);
          alphaGrad<1, 8>(out.template ref<2, 1>(), col);
          alphaGrad<7, 8>(out.template ref<2, 2>(), col);
      }

      //model a round corner
      template <class OutputMatrix>
      static void blendCorner(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598
          //deliberately skipped: negligible + avoid conflicts with other rotations for this odd scale
          //alphaGrad<7, 256>(out.template ref<2, 1>(), col); //0.02826017254
          //alphaGrad<7, 256>(out.template ref<1, 2>(), col); //0.02826017254
      }
  };
  //Blend policy for 4x magnification: each blend*() call paints "col" into selected cells of the
  //4 x 4 output block reached through out.ref<I, J>(); plain assignment replaces a cell completely.
  //NOTE(review): alphaGrad<M, N> presumably applies col with weight M/N; the actual mixing lives in ColorGradient - confirm there.
  template <class ColorGradient>
  struct Scaler4x : public ColorGradient
  {
      static const int scale = 4;

      //forwarder: makes the base class template callable unqualified (required by GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutputMatrix>
      static void blendLineShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<3, 0>(), col);
          alphaGrad<1, 4>(out.template ref<2, 2>(), col);
          alphaGrad<3, 4>(out.template ref<3, 1>(), col);
          alphaGrad<3, 4>(out.template ref<2, 3>(), col);
          out.template ref<3, 2>() = col;
          out.template ref<3, 3>() = col;
      }

      //steep line: same pattern as blendLineShallow() with both indices swapped
      template <class OutputMatrix>
      static void blendLineSteep(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 3>(), col);
          alphaGrad<1, 4>(out.template ref<2, 2>(), col);
          alphaGrad<3, 4>(out.template ref<1, 3>(), col);
          alphaGrad<3, 4>(out.template ref<3, 2>(), col);
          out.template ref<2, 3>() = col;
          out.template ref<3, 3>() = col;
      }

      //steep and shallow line at once
      template <class OutputMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<3, 4>(out.template ref<3, 1>(), col);
          alphaGrad<3, 4>(out.template ref<1, 3>(), col);
          alphaGrad<1, 4>(out.template ref<3, 0>(), col);
          alphaGrad<1, 4>(out.template ref<0, 3>(), col);
          alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR
          out.template ref<3, 3>() = col;
          out.template ref<3, 2>() = col;
          out.template ref<2, 3>() = col;
      }

      //diagonal line
      template <class OutputMatrix>
      static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 2>(out.template ref<3, 2>(), col);
          alphaGrad<1, 2>(out.template ref<2, 3>(), col);
          out.template ref<3, 3>() = col;
      }

      //model a round corner
      template <class OutputMatrix>
      static void blendCorner(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
          alphaGrad<9, 100>(out.template ref<3, 2>(), col);  //0.08677704501
          alphaGrad<9, 100>(out.template ref<2, 3>(), col);  //0.08677704501
      }
  };
  //Blend policy for 5x magnification: each blend*() call paints "col" into selected cells of the
  //5 x 5 output block reached through out.ref<I, J>(); plain assignment replaces a cell completely.
  //NOTE(review): alphaGrad<M, N> presumably applies col with weight M/N; the actual mixing lives in ColorGradient - confirm there.
  template <class ColorGradient>
  struct Scaler5x : public ColorGradient
  {
      static const int scale = 5;

      //forwarder: makes the base class template callable unqualified (required by GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutputMatrix>
      static void blendLineShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<4, 0>(), col);
          alphaGrad<1, 4>(out.template ref<3, 2>(), col);
          alphaGrad<1, 4>(out.template ref<2, 4>(), col);
          alphaGrad<3, 4>(out.template ref<4, 1>(), col);
          alphaGrad<3, 4>(out.template ref<3, 3>(), col);
          out.template ref<4, 2>() = col;
          out.template ref<4, 3>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<3, 4>() = col;
      }

      //steep line: same pattern as blendLineShallow() with both indices swapped
      template <class OutputMatrix>
      static void blendLineSteep(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 4>(), col);
          alphaGrad<1, 4>(out.template ref<2, 3>(), col);
          alphaGrad<1, 4>(out.template ref<4, 2>(), col);
          alphaGrad<3, 4>(out.template ref<1, 4>(), col);
          alphaGrad<3, 4>(out.template ref<3, 3>(), col);
          out.template ref<2, 4>() = col;
          out.template ref<3, 4>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<4, 3>() = col;
      }

      //steep and shallow line at once
      template <class OutputMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 4>(), col);
          alphaGrad<1, 4>(out.template ref<2, 3>(), col);
          alphaGrad<3, 4>(out.template ref<1, 4>(), col);
          alphaGrad<1, 4>(out.template ref<4, 0>(), col);
          alphaGrad<1, 4>(out.template ref<3, 2>(), col);
          alphaGrad<3, 4>(out.template ref<4, 1>(), col);
          alphaGrad<2, 3>(out.template ref<3, 3>(), col);
          out.template ref<2, 4>() = col;
          out.template ref<3, 4>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<4, 2>() = col;
          out.template ref<4, 3>() = col;
      }

      //diagonal line
      template <class OutputMatrix>
      static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
      {
          //the three 1/8 cells: conflict with other rotations for this odd scale
          alphaGrad<1, 8>(out.template ref<4, 2>(), col);
          alphaGrad<1, 8>(out.template ref<3, 3>(), col);
          alphaGrad<1, 8>(out.template ref<2, 4>(), col);
          alphaGrad<7, 8>(out.template ref<4, 3>(), col);
          alphaGrad<7, 8>(out.template ref<3, 4>(), col);
          out.template ref<4, 4>() = col;
      }

      //model a round corner
      template <class OutputMatrix>
      static void blendCorner(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
          alphaGrad<23, 100>(out.template ref<4, 3>(), col); //0.2306749731
          alphaGrad<23, 100>(out.template ref<3, 4>(), col); //0.2306749731
          //deliberately skipped: negligible + avoid conflicts with other rotations for this odd scale
          //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367
          //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
      }
  };
  //Blend policy for 6x magnification: each blend*() call paints "col" into selected cells of the
  //6 x 6 output block reached through out.ref<I, J>(); plain assignment replaces a cell completely.
  //NOTE(review): alphaGrad<M, N> presumably applies col with weight M/N; the actual mixing lives in ColorGradient - confirm there.
  template <class ColorGradient>
  struct Scaler6x : public ColorGradient
  {
      static const int scale = 6;

      //forwarder: makes the base class template callable unqualified (required by GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutputMatrix>
      static void blendLineShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<5, 0>(), col);
          alphaGrad<1, 4>(out.template ref<4, 2>(), col);
          alphaGrad<1, 4>(out.template ref<3, 4>(), col);
          alphaGrad<3, 4>(out.template ref<5, 1>(), col);
          alphaGrad<3, 4>(out.template ref<4, 3>(), col);
          alphaGrad<3, 4>(out.template ref<3, 5>(), col);
          out.template ref<5, 2>() = col;
          out.template ref<5, 3>() = col;
          out.template ref<5, 4>() = col;
          out.template ref<5, 5>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<4, 5>() = col;
      }

      //steep line: same pattern as blendLineShallow() with both indices swapped
      template <class OutputMatrix>
      static void blendLineSteep(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 5>(), col);
          alphaGrad<1, 4>(out.template ref<2, 4>(), col);
          alphaGrad<1, 4>(out.template ref<4, 3>(), col);
          alphaGrad<3, 4>(out.template ref<1, 5>(), col);
          alphaGrad<3, 4>(out.template ref<3, 4>(), col);
          alphaGrad<3, 4>(out.template ref<5, 3>(), col);
          out.template ref<2, 5>() = col;
          out.template ref<3, 5>() = col;
          out.template ref<4, 5>() = col;
          out.template ref<5, 5>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<5, 4>() = col;
      }

      //steep and shallow line at once
      template <class OutputMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 5>(), col);
          alphaGrad<1, 4>(out.template ref<2, 4>(), col);
          alphaGrad<3, 4>(out.template ref<1, 5>(), col);
          alphaGrad<3, 4>(out.template ref<3, 4>(), col);
          alphaGrad<1, 4>(out.template ref<5, 0>(), col);
          alphaGrad<1, 4>(out.template ref<4, 2>(), col);
          alphaGrad<3, 4>(out.template ref<5, 1>(), col);
          alphaGrad<3, 4>(out.template ref<4, 3>(), col);
          out.template ref<2, 5>() = col;
          out.template ref<3, 5>() = col;
          out.template ref<4, 5>() = col;
          out.template ref<5, 5>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<5, 4>() = col;
          out.template ref<5, 2>() = col;
          out.template ref<5, 3>() = col;
      }

      //diagonal line
      template <class OutputMatrix>
      static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<1, 2>(out.template ref<5, 3>(), col);
          alphaGrad<1, 2>(out.template ref<4, 4>(), col);
          alphaGrad<1, 2>(out.template ref<3, 5>(), col);
          out.template ref<4, 5>() = col;
          out.template ref<5, 5>() = col;
          out.template ref<5, 4>() = col;
      }

      //model a round corner
      template <class OutputMatrix>
      static void blendCorner(uint32_t col, OutputMatrix& out)
      {
          alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910
          alphaGrad<42, 100>(out.template ref<4, 5>(), col); //0.4236372243
          alphaGrad<42, 100>(out.template ref<5, 4>(), col); //0.4236372243
          alphaGrad<6, 100>(out.template ref<5, 3>(), col);  //0.05652034508
          alphaGrad<6, 100>(out.template ref<3, 5>(), col);  //0.05652034508
      }
  };
  883. //------------------------------------------------------------------------------------
  884. struct ColorDistanceRGB
  885. {
  886. static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  887. {
  888. return distYCbCrBuffered(pix1, pix2);
  889. //if (pix1 == pix2) //about 4% perf boost
  890. // return 0;
  891. //return distYCbCr(pix1, pix2, luminanceWeight);
  892. }
  893. };
  894. struct ColorDistanceARGB
  895. {
  896. static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  897. {
  898. const double a1 = getAlpha(pix1) / 255.0 ;
  899. const double a2 = getAlpha(pix2) / 255.0 ;
  900. /*
  901. Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  902. 1. if a1 = a2, distance should be: a1 * distYCbCr()
  903. 2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
  904. 3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  905. */
  906. //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  907. //=> following code is 15% faster:
  908. const double d = distYCbCrBuffered(pix1, pix2);
  909. if (a1 < a2)
  910. return a1 * d + 255 * (a2 - a1);
  911. else
  912. return a2 * d + 255 * (a1 - a2);
  913. //alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
  914. }
  915. };
  916. struct ColorDistanceUnbufferedARGB
  917. {
  918. static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  919. {
  920. const double a1 = getAlpha(pix1) / 255.0 ;
  921. const double a2 = getAlpha(pix2) / 255.0 ;
  922. const double d = distYCbCr(pix1, pix2, luminanceWeight);
  923. if (a1 < a2)
  924. return a1 * d + 255 * (a2 - a1);
  925. else
  926. return a2 * d + 255 * (a1 - a2);
  927. }
  928. };
  929. struct ColorGradientRGB
  930. {
  931. template <unsigned int M, unsigned int N>
  932. static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
  933. {
  934. pixBack = gradientRGB<M, N>(pixFront, pixBack);
  935. }
  936. };
  937. struct ColorGradientARGB
  938. {
  939. template <unsigned int M, unsigned int N>
  940. static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
  941. {
  942. pixBack = gradientARGB<M, N>(pixFront, pixBack);
  943. }
  944. };
  945. }
  946. void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, ColorFormat colFmt, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
  947. {
  948. if (factor == 1)
  949. {
  950. std::copy(src + yFirst * srcWidth, src + yLast * srcWidth, trg);
  951. return;
  952. }
  953. static_assert(SCALE_FACTOR_MAX == 6);
  954. switch (colFmt)
  955. {
  956. case ColorFormat::RGB:
  957. switch (factor)
  958. {
  959. case 2:
  960. return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  961. case 3:
  962. return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  963. case 4:
  964. return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  965. case 5:
  966. return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  967. case 6:
  968. return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  969. }
  970. break;
  971. case ColorFormat::ARGB:
  972. switch (factor)
  973. {
  974. case 2:
  975. return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  976. case 3:
  977. return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  978. case 4:
  979. return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  980. case 5:
  981. return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  982. case 6:
  983. return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  984. }
  985. break;
  986. case ColorFormat::ARGB_UNBUFFERED:
  987. switch (factor)
  988. {
  989. case 2:
  990. return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  991. case 3:
  992. return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  993. case 4:
  994. return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  995. case 5:
  996. return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  997. case 6:
  998. return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  999. }
  1000. break;
  1001. }
  1002. assert(false);
  1003. }
  1004. bool xbrz::equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance)
  1005. {
  1006. switch (colFmt)
  1007. {
  1008. case ColorFormat::RGB:
  1009. return ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
  1010. case ColorFormat::ARGB:
  1011. return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
  1012. case ColorFormat::ARGB_UNBUFFERED:
  1013. return ColorDistanceUnbufferedARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
  1014. }
  1015. assert(false);
  1016. return false;
  1017. }
  1018. void xbrz::bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
  1019. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1020. {
  1021. bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
  1022. trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
  1023. 0, trgHeight, [](uint32_t pix) { return pix; });
  1024. }
  1025. void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
  1026. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1027. {
  1028. nearestNeighborScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
  1029. trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
  1030. 0, trgHeight, [](uint32_t pix) { return pix; });
  1031. }
  1032. #if 0
  1033. //#include <ppl.h>
  1034. void bilinearScaleCpu(const uint32_t* src, int srcWidth, int srcHeight,
  1035. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1036. {
  1037. const int TASK_GRANULARITY = 16;
  1038. concurrency::task_group tg;
  1039. for (int i = 0; i < trgHeight; i += TASK_GRANULARITY)
  1040. tg.run([=]
  1041. {
  1042. const int iLast = std::min(i + TASK_GRANULARITY, trgHeight);
  1043. xbrz::bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
  1044. trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
  1045. i, iLast, [](uint32_t pix) { return pix; });
  1046. });
  1047. tg.wait();
  1048. }
  1049. //Perf: AMP vs CPU: merely ~10% shorter runtime (scaling 1280x800 -> 1920x1080)
  1050. //#include <amp.h>
  1051. void bilinearScaleAmp(const uint32_t* src, int srcWidth, int srcHeight, //throw concurrency::runtime_exception
  1052. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1053. {
  1054. //C++ AMP reference: https://msdn.microsoft.com/en-us/library/hh289390.aspx
  1055. //introduction to C++ AMP: https://msdn.microsoft.com/en-us/magazine/hh882446.aspx
  1056. using namespace concurrency;
  1057. //TODO: pitch
  1058. if (srcHeight <= 0 || srcWidth <= 0) return;
  1059. const float scaleX = static_cast<float>(trgWidth ) / srcWidth;
  1060. const float scaleY = static_cast<float>(trgHeight) / srcHeight;
  1061. array_view<const uint32_t, 2> srcView(srcHeight, srcWidth, src);
  1062. array_view< uint32_t, 2> trgView(trgHeight, trgWidth, trg);
  1063. trgView.discard_data();
  1064. parallel_for_each(trgView.extent, [=](index<2> idx) restrict(amp) //throw ?
  1065. {
  1066. const int y = idx[0];
  1067. const int x = idx[1];
  1068. //Perf notes:
  1069. // -> float-based calculation is (almost) 2x as fas as double!
  1070. // -> no noticeable improvement via tiling: https://msdn.microsoft.com/en-us/magazine/hh882447.aspx
  1071. // -> no noticeable improvement with restrict(amp,cpu)
  1072. // -> iterating over y-axis only is significantly slower!
  1073. // -> pre-calculating x,y-dependent variables in a buffer + array_view<> is ~ 20 % slower!
  1074. const int y1 = srcHeight * y / trgHeight;
  1075. int y2 = y1 + 1;
  1076. if (y2 == srcHeight) --y2;
  1077. const float yy1 = y / scaleY - y1;
  1078. const float y2y = 1 - yy1;
  1079. //-------------------------------------
  1080. const int x1 = srcWidth * x / trgWidth;
  1081. int x2 = x1 + 1;
  1082. if (x2 == srcWidth) --x2;
  1083. const float xx1 = x / scaleX - x1;
  1084. const float x2x = 1 - xx1;
  1085. //-------------------------------------
  1086. const float x2xy2y = x2x * y2y;
  1087. const float xx1y2y = xx1 * y2y;
  1088. const float x2xyy1 = x2x * yy1;
  1089. const float xx1yy1 = xx1 * yy1;
  1090. auto interpolate = [=](int offset)
  1091. {
  1092. /*
  1093. https://en.wikipedia.org/wiki/Bilinear_interpolation
  1094. (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
  1095. (c12(x2 - x) + c22(x - x1)) * (y - y1)
  1096. */
  1097. const auto c11 = (srcView(y1, x1) >> (8 * offset)) & 0xff;
  1098. const auto c21 = (srcView(y1, x2) >> (8 * offset)) & 0xff;
  1099. const auto c12 = (srcView(y2, x1) >> (8 * offset)) & 0xff;
  1100. const auto c22 = (srcView(y2, x2) >> (8 * offset)) & 0xff;
  1101. return c11 * x2xy2y + c21 * xx1y2y +
  1102. c12 * x2xyy1 + c22 * xx1yy1;
  1103. };
  1104. const float bi = interpolate(0);
  1105. const float gi = interpolate(1);
  1106. const float ri = interpolate(2);
  1107. const float ai = interpolate(3);
  1108. const auto b = static_cast<uint32_t>(bi + 0.5f);
  1109. const auto g = static_cast<uint32_t>(gi + 0.5f);
  1110. const auto r = static_cast<uint32_t>(ri + 0.5f);
  1111. const auto a = static_cast<uint32_t>(ai + 0.5f);
  1112. trgView(y, x) = (a << 24) | (r << 16) | (g << 8) | b;
  1113. });
  1114. trgView.synchronize(); //throw ?
  1115. }
  1116. #endif